the-algorithm/pushservice/src/main/python/models/libs/get_feat_config.py

import os

from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import USER_AGE_FEATURE_NAME
from twitter.deepbird.projects.magic_recs.libs.model_utils import read_config

from twml.contrib import feature_config as contrib_feature_config

# checkstyle: noqa
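
# Defaults: the fill value for missing continuous features, the hash-space size (in bits)
# used for hashed sparse extraction, and the bundled feature-list YAML paths.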
FEAT_CONFIG_DEFAULT_VAL = -1.23456789
DEFAULT_INPUT_SIZE_BITS = 18
DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml"
FEATURE_LIST_DEFAULT_PATH = os.path.join(
  os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
)
DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH = "./feature_list_light_ranking.yaml"
FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH = os.path.join(
  os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH
)
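
# Each feature list is a sequence of (feature_name, tag) pairs read from YAML; entries
# tagged "S" are routed to the sparse_continuous extraction, everything else to the
# continuous and hashed-sparse extractions.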
FEATURE_LIST_DEFAULT = read_config(FEATURE_LIST_DEFAULT_PATH).items()
FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config(FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH).items()
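
# Label columns per task: LABELS_MTL keys the multi-task (OONC) models, LABELS_LR keys
# the light-ranking tasks.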
LABELS = ["label"]
LABELS_MTL = {"OONC": ["label"], "OONC_Engagement": ["label", "label.engagement"]}
LABELS_LR = {
  "Sent": ["label.sent"],
  "HeavyRankPosition": ["meta.ranking.is_top3"],
  "HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"],
}


def _get_new_feature_config_base(
  data_spec_path,
  labels,
  add_sparse_continuous=True,
  add_gbdt=True,
  add_user_id=False,
  add_timestamp=False,
  add_user_age=False,
  feature_list_provided=[],
  opt=None,
  run_light_ranking_group_metrics_in_bq=False,
):
"""
Getter of the feature config based on specification.
Args:
data_spec_path: A string indicating the path of the data_spec.json file, which could be
either a local path or a hdfs path.
labels: A list of strings indicating the name of the label in the data spec.
add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
add_gbdt: A bool indicating if gbdt feature needs to be included.
add_user_id: A bool indicating if user_id feature needs to be included.
add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
for sequential models and meta learning models.
add_user_age: A bool indicating if the user age feature needs to be included.
feature_list_provided: A list of features thats need to be included. If not specified, will use
FEATURE_LIST_DEFAULT by default.
opt: A namespace of arguments indicating the hyparameters.
run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
Returns:
A twml feature config object.
"""
  input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits
  feature_list = feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT
  a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"]

  builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path)

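  # All non-"S" features of CONTINUOUS type are grouped into a single dense "continuous"
  # tensor; missing values are filled with FEAT_CONFIG_DEFAULT_VAL.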
  builder = builder.extract_feature_group(
    feature_regexes=a_string_feat_list,
    group_name="continuous",
    default_value=FEAT_CONFIG_DEFAULT_VAL,
    type_filter=["CONTINUOUS"],
  )

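  # The remaining (non-continuous) types from the same list are hashed into one sparse
  # tensor, "sparse_no_continuous".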
  builder = builder.extract_features_as_hashed_sparse(
    feature_regexes=a_string_feat_list,
    output_tensor_name="sparse_no_continuous",
    hash_space_size_bits=input_size_bits,
    type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"],
  )

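  # Optionally hash the ads.* (GBDT) features into their own "gbdt_sparse" tensor.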
  if add_gbdt:
    builder = builder.extract_features_as_hashed_sparse(
      feature_regexes=[r"ads\..*"],
      output_tensor_name="gbdt_sparse",
      hash_space_size_bits=input_size_bits,
    )

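  # Features tagged "S" in the feature list are SPARSE_CONTINUOUS and are hashed into
  # their own "sparse_continuous" tensor.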
  if add_sparse_continuous:
    s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"]
    builder = builder.extract_features_as_hashed_sparse(
      feature_regexes=s_string_feat_list,
      output_tensor_name="sparse_continuous",
      hash_space_size_bits=input_size_bits,
      type_filter=["SPARSE_CONTINUOUS"],
    )

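  # Optional single-feature extracts: user id, example timestamp, and user age.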
  if add_user_id:
    builder = builder.extract_feature("meta.user_id")
  if add_timestamp:
    builder = builder.extract_feature("meta.timestamp")
  if add_user_age:
    builder = builder.extract_feature(USER_AGE_FEATURE_NAME)

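  # Keep the trace id and the heavy ranker score so light-ranking group metrics can be
  # computed in BigQuery.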
  if run_light_ranking_group_metrics_in_bq:
    builder = builder.extract_feature("meta.trace_id")
    builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score")

  builder = builder.add_labels(labels).define_weight("meta.weight")

  return builder.build()


def get_feature_config_with_sparse_continuous(
  data_spec_path,
  feature_list_provided=[],
  opt=None,
  add_user_id=False,
  add_timestamp=False,
  add_user_age=False,
):
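  """
  Feature config for the multi-task (OONC / OONC_Engagement) models, including the
  sparse_continuous group. Labels are looked up in LABELS_MTL by opt.task_name,
  defaulting to "OONC".
  """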
  task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC"
  if task_name not in LABELS_MTL:
    raise ValueError(f"Invalid task name: {task_name!r}; expected one of {sorted(LABELS_MTL)}")

  return _get_new_feature_config_base(
    data_spec_path=data_spec_path,
    labels=LABELS_MTL[task_name],
    add_sparse_continuous=True,
    add_user_id=add_user_id,
    add_timestamp=add_timestamp,
    add_user_age=add_user_age,
    feature_list_provided=feature_list_provided,
    opt=opt,
  )


def get_feature_config_light_ranking(
  data_spec_path,
  feature_list_provided=[],
  opt=None,
  add_user_id=True,
  add_timestamp=False,
  add_user_age=False,
  add_gbdt=False,
  run_light_ranking_group_metrics_in_bq=False,
):
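  """
  Feature config for the light ranking models. Labels are looked up in LABELS_LR by
  opt.task_name, defaulting to "HeavyRankPosition"; the sparse_continuous group is
  omitted, and FEATURE_LIST_LIGHT_RANKING_DEFAULT is used when no feature list is given.
  """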
  task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "HeavyRankPosition"
  if task_name not in LABELS_LR:
    raise ValueError(f"Invalid task name: {task_name!r}; expected one of {sorted(LABELS_LR)}")

  if not feature_list_provided:
    feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT

  return _get_new_feature_config_base(
    data_spec_path=data_spec_path,
    labels=LABELS_LR[task_name],
    add_sparse_continuous=False,
    add_gbdt=add_gbdt,
    add_user_id=add_user_id,
    add_timestamp=add_timestamp,
    add_user_age=add_user_age,
    feature_list_provided=feature_list_provided,
    opt=opt,
    run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq,
  )
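

# Illustrative usage sketch: the data spec path and the fields set on `opt` below are
# hypothetical placeholders; in practice `opt` is the parsed hyperparameter namespace and
# data_spec_path points at a real data_spec.json (local or HDFS).
#
#   from types import SimpleNamespace
#
#   opt = SimpleNamespace(input_size_bits=DEFAULT_INPUT_SIZE_BITS, task_name="OONC")
#   feature_config = get_feature_config_with_sparse_continuous(
#     data_spec_path="/path/to/data_spec.json",  # hypothetical path
#     opt=opt,
#   )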