the-algorithm/pushservice/src/main/python/models/libs/get_feat_config.py
twitter-team b389c3d302 Open-sourcing pushservice
Pushservice is the main recommendation service we use to surface recommendations to our users via notifications. It fetches candidates from various sources, ranks them in order of relevance, and applies filters to determine the best one to send.
2023-05-19 16:27:07 -05:00

177 lines
5.8 KiB
Python

import os
from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import USER_AGE_FEATURE_NAME
from twitter.deepbird.projects.magic_recs.libs.model_utils import read_config
from twml.contrib import feature_config as contrib_feature_config
# checkstyle: noqa
FEAT_CONFIG_DEFAULT_VAL = -1.23456789
DEFAULT_INPUT_SIZE_BITS = 18
DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml"
FEATURE_LIST_DEFAULT_PATH = os.path.join(
os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
)
DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH = "./feature_list_light_ranking.yaml"
FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH = os.path.join(
os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH
)
FEATURE_LIST_DEFAULT = read_config(FEATURE_LIST_DEFAULT_PATH).items()
FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config(FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH).items()
LABELS = ["label"]
LABELS_MTL = {"OONC": ["label"], "OONC_Engagement": ["label", "label.engagement"]}
LABELS_LR = {
"Sent": ["label.sent"],
"HeavyRankPosition": ["meta.ranking.is_top3"],
"HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"],
}
def _get_new_feature_config_base(
data_spec_path,
labels,
add_sparse_continous=True,
add_gbdt=True,
add_user_id=False,
add_timestamp=False,
add_user_age=False,
feature_list_provided=[],
opt=None,
run_light_ranking_group_metrics_in_bq=False,
):
"""
Getter of the feature config based on specification.
Args:
data_spec_path: A string indicating the path of the data_spec.json file, which could be
either a local path or a hdfs path.
labels: A list of strings indicating the name of the label in the data spec.
add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
add_gbdt: A bool indicating if gbdt feature needs to be included.
add_user_id: A bool indicating if user_id feature needs to be included.
add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
for sequential models and meta learning models.
add_user_age: A bool indicating if the user age feature needs to be included.
feature_list_provided: A list of features thats need to be included. If not specified, will use
FEATURE_LIST_DEFAULT by default.
opt: A namespace of arguments indicating the hyparameters.
run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
Returns:
A twml feature config object.
"""
input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits
feature_list = feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT
a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"]
builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path)
builder = builder.extract_feature_group(
feature_regexes=a_string_feat_list,
group_name="continuous",
default_value=FEAT_CONFIG_DEFAULT_VAL,
type_filter=["CONTINUOUS"],
)
builder = builder.extract_features_as_hashed_sparse(
feature_regexes=a_string_feat_list,
output_tensor_name="sparse_no_continuous",
hash_space_size_bits=input_size_bits,
type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"],
)
if add_gbdt:
builder = builder.extract_features_as_hashed_sparse(
feature_regexes=["ads\..*"],
output_tensor_name="gbdt_sparse",
hash_space_size_bits=input_size_bits,
)
if add_sparse_continous:
s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"]
builder = builder.extract_features_as_hashed_sparse(
feature_regexes=s_string_feat_list,
output_tensor_name="sparse_continuous",
hash_space_size_bits=input_size_bits,
type_filter=["SPARSE_CONTINUOUS"],
)
if add_user_id:
builder = builder.extract_feature("meta.user_id")
if add_timestamp:
builder = builder.extract_feature("meta.timestamp")
if add_user_age:
builder = builder.extract_feature(USER_AGE_FEATURE_NAME)
if run_light_ranking_group_metrics_in_bq:
builder = builder.extract_feature("meta.trace_id")
builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score")
builder = builder.add_labels(labels).define_weight("meta.weight")
return builder.build()
def get_feature_config_with_sparse_continuous(
data_spec_path,
feature_list_provided=[],
opt=None,
add_user_id=False,
add_timestamp=False,
add_user_age=False,
):
task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC"
if task_name not in LABELS_MTL:
raise ValueError("Invalid Task Name !")
return _get_new_feature_config_base(
data_spec_path=data_spec_path,
labels=LABELS_MTL[task_name],
add_sparse_continous=True,
add_user_id=add_user_id,
add_timestamp=add_timestamp,
add_user_age=add_user_age,
feature_list_provided=feature_list_provided,
opt=opt,
)
def get_feature_config_light_ranking(
data_spec_path,
feature_list_provided=[],
opt=None,
add_user_id=True,
add_timestamp=False,
add_user_age=False,
add_gbdt=False,
run_light_ranking_group_metrics_in_bq=False,
):
task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "HeavyRankPosition"
if task_name not in LABELS_LR:
raise ValueError("Invalid Task Name !")
if not feature_list_provided:
feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT
return _get_new_feature_config_base(
data_spec_path=data_spec_path,
labels=LABELS_LR[task_name],
add_sparse_continous=False,
add_gbdt=add_gbdt,
add_user_id=add_user_id,
add_timestamp=add_timestamp,
add_user_age=add_user_age,
feature_list_provided=feature_list_provided,
opt=opt,
run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq,
)