the-algorithm/pushservice/src/main/python/models/libs/get_feat_config.py

import os

from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import USER_AGE_FEATURE_NAME
from twitter.deepbird.projects.magic_recs.libs.model_utils import read_config

from twml.contrib import feature_config as contrib_feature_config

# checkstyle: noqa
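
# Defaults: the fill value for missing continuous features, the hash-space size (in bits)
# used for hashed sparse extraction, and the bundled feature-list YAML paths.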
FEAT_CONFIG_DEFAULT_VAL = -1.23456789
DEFAULT_INPUT_SIZE_BITS = 18
DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml"
FEATURE_LIST_DEFAULT_PATH = os.path.join(
  os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
)
DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH = "./feature_list_light_ranking.yaml"
FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH = os.path.join(
  os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH
)
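
# Each feature list is a sequence of (feature_name, tag) pairs read from YAML; entries
# tagged "S" are routed to the sparse_continuous extraction, everything else to the
# continuous and hashed-sparse extractions.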
FEATURE_LIST_DEFAULT = read_config(FEATURE_LIST_DEFAULT_PATH).items()
FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config(FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH).items()
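
# Label columns per task: LABELS_MTL keys the multi-task (OONC) models, LABELS_LR keys
# the light-ranking tasks.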
LABELS = ["label"]
LABELS_MTL = {"OONC": ["label"], "OONC_Engagement": ["label", "label.engagement"]}
LABELS_LR = {
  "Sent": ["label.sent"],
  "HeavyRankPosition": ["meta.ranking.is_top3"],
  "HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"],
}


def _get_new_feature_config_base(
  data_spec_path,
  labels,
  add_sparse_continuous=True,
  add_gbdt=True,
  add_user_id=False,
  add_timestamp=False,
  add_user_age=False,
  feature_list_provided=[],
  opt=None,
  run_light_ranking_group_metrics_in_bq=False,
):
"""
Getter of the feature config based on specification.
Args:
data_spec_path: A string indicating the path of the data_spec.json file, which could be
either a local path or a hdfs path.
labels: A list of strings indicating the name of the label in the data spec.
add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
add_gbdt: A bool indicating if gbdt feature needs to be included.
add_user_id: A bool indicating if user_id feature needs to be included.
add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
for sequential models and meta learning models.
add_user_age: A bool indicating if the user age feature needs to be included.
feature_list_provided: A list of features thats need to be included. If not specified, will use
FEATURE_LIST_DEFAULT by default.
opt: A namespace of arguments indicating the hyparameters.
run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
Returns:
A twml feature config object.
"""
  input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits
  feature_list = feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT
  a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"]

  builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path)

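  # All non-"S" features of CONTINUOUS type are grouped into a single dense "continuous"
  # tensor; missing values are filled with FEAT_CONFIG_DEFAULT_VAL.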
  builder = builder.extract_feature_group(
    feature_regexes=a_string_feat_list,
    group_name="continuous",
    default_value=FEAT_CONFIG_DEFAULT_VAL,
    type_filter=["CONTINUOUS"],
  )

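  # The remaining (non-continuous) types from the same list are hashed into one sparse
  # tensor, "sparse_no_continuous".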
  builder = builder.extract_features_as_hashed_sparse(
    feature_regexes=a_string_feat_list,
    output_tensor_name="sparse_no_continuous",
    hash_space_size_bits=input_size_bits,
    type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"],
  )

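  # Optionally hash the ads.* (GBDT) features into their own "gbdt_sparse" tensor.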
  if add_gbdt:
    builder = builder.extract_features_as_hashed_sparse(
      feature_regexes=[r"ads\..*"],
      output_tensor_name="gbdt_sparse",
      hash_space_size_bits=input_size_bits,
    )

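  # Features tagged "S" in the feature list are SPARSE_CONTINUOUS and are hashed into
  # their own "sparse_continuous" tensor.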
  if add_sparse_continuous:
    s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"]
    builder = builder.extract_features_as_hashed_sparse(
      feature_regexes=s_string_feat_list,
      output_tensor_name="sparse_continuous",
      hash_space_size_bits=input_size_bits,
      type_filter=["SPARSE_CONTINUOUS"],
    )

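  # Optional single-feature extracts: user id, example timestamp, and user age.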
  if add_user_id:
    builder = builder.extract_feature("meta.user_id")
  if add_timestamp:
    builder = builder.extract_feature("meta.timestamp")
  if add_user_age:
    builder = builder.extract_feature(USER_AGE_FEATURE_NAME)

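  # Keep the trace id and the heavy ranker score so light-ranking group metrics can be
  # computed in BigQuery.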
  if run_light_ranking_group_metrics_in_bq:
    builder = builder.extract_feature("meta.trace_id")
    builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score")

  builder = builder.add_labels(labels).define_weight("meta.weight")

  return builder.build()


def get_feature_config_with_sparse_continuous(
  data_spec_path,
  feature_list_provided=[],
  opt=None,
  add_user_id=False,
  add_timestamp=False,
  add_user_age=False,
):
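  """
  Feature config for the multi-task (OONC / OONC_Engagement) models, including the
  sparse_continuous group. Labels are looked up in LABELS_MTL by opt.task_name,
  defaulting to "OONC".
  """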
  task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC"
  if task_name not in LABELS_MTL:
    raise ValueError(f"Invalid task name: {task_name!r}; expected one of {sorted(LABELS_MTL)}")

  return _get_new_feature_config_base(
    data_spec_path=data_spec_path,
    labels=LABELS_MTL[task_name],
    add_sparse_continuous=True,
    add_user_id=add_user_id,
    add_timestamp=add_timestamp,
    add_user_age=add_user_age,
    feature_list_provided=feature_list_provided,
    opt=opt,
  )


def get_feature_config_light_ranking(
  data_spec_path,
  feature_list_provided=[],
  opt=None,
  add_user_id=True,
  add_timestamp=False,
  add_user_age=False,
  add_gbdt=False,
  run_light_ranking_group_metrics_in_bq=False,
):
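  """
  Feature config for the light ranking models. Labels are looked up in LABELS_LR by
  opt.task_name, defaulting to "HeavyRankPosition"; the sparse_continuous group is
  omitted, and FEATURE_LIST_LIGHT_RANKING_DEFAULT is used when no feature list is given.
  """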
  task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "HeavyRankPosition"
  if task_name not in LABELS_LR:
    raise ValueError(f"Invalid task name: {task_name!r}; expected one of {sorted(LABELS_LR)}")

  if not feature_list_provided:
    feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT

  return _get_new_feature_config_base(
    data_spec_path=data_spec_path,
    labels=LABELS_LR[task_name],
    add_sparse_continuous=False,
    add_gbdt=add_gbdt,
    add_user_id=add_user_id,
    add_timestamp=add_timestamp,
    add_user_age=add_user_age,
    feature_list_provided=feature_list_provided,
    opt=opt,
    run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq,
  )
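

# Illustrative usage sketch: the data spec path and the fields set on `opt` below are
# hypothetical placeholders; in practice `opt` is the parsed hyperparameter namespace and
# data_spec_path points at a real data_spec.json (local or HDFS).
#
#   from types import SimpleNamespace
#
#   opt = SimpleNamespace(input_size_bits=DEFAULT_INPUT_SIZE_BITS, task_name="OONC")
#   feature_config = get_feature_config_with_sparse_continuous(
#     data_spec_path="/path/to/data_spec.json",  # hypothetical path
#     opt=opt,
#   )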