177 lines
5.8 KiB
Python
177 lines
5.8 KiB
Python
import os
|
|
|
|
from twitter.deepbird.projects.magic_recs.libs.metric_fn_utils import USER_AGE_FEATURE_NAME
|
|
from twitter.deepbird.projects.magic_recs.libs.model_utils import read_config
|
|
from twml.contrib import feature_config as contrib_feature_config
|
|
|
|
|
|
# checkstyle: noqa
|
|
|
|
FEAT_CONFIG_DEFAULT_VAL = -1.23456789
|
|
|
|
DEFAULT_INPUT_SIZE_BITS = 18
|
|
|
|
DEFAULT_FEATURE_LIST_PATH = "./feature_list_default.yaml"
|
|
FEATURE_LIST_DEFAULT_PATH = os.path.join(
|
|
os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_PATH
|
|
)
|
|
|
|
DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH = "./feature_list_light_ranking.yaml"
|
|
FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH = os.path.join(
|
|
os.path.dirname(os.path.realpath(__file__)), DEFAULT_FEATURE_LIST_LIGHT_RANKING_PATH
|
|
)
|
|
|
|
FEATURE_LIST_DEFAULT = read_config(FEATURE_LIST_DEFAULT_PATH).items()
|
|
FEATURE_LIST_LIGHT_RANKING_DEFAULT = read_config(FEATURE_LIST_DEFAULT_LIGHT_RANKING_PATH).items()
|
|
|
|
|
|
LABELS = ["label"]
|
|
LABELS_MTL = {"OONC": ["label"], "OONC_Engagement": ["label", "label.engagement"]}
|
|
LABELS_LR = {
|
|
"Sent": ["label.sent"],
|
|
"HeavyRankPosition": ["meta.ranking.is_top3"],
|
|
"HeavyRankProbability": ["meta.ranking.weighted_oonc_model_score"],
|
|
}
|
|
|
|
|
|
def _get_new_feature_config_base(
|
|
data_spec_path,
|
|
labels,
|
|
add_sparse_continous=True,
|
|
add_gbdt=True,
|
|
add_user_id=False,
|
|
add_timestamp=False,
|
|
add_user_age=False,
|
|
feature_list_provided=[],
|
|
opt=None,
|
|
run_light_ranking_group_metrics_in_bq=False,
|
|
):
|
|
"""
|
|
Getter of the feature config based on specification.
|
|
|
|
Args:
|
|
data_spec_path: A string indicating the path of the data_spec.json file, which could be
|
|
either a local path or a hdfs path.
|
|
labels: A list of strings indicating the name of the label in the data spec.
|
|
add_sparse_continous: A bool indicating if sparse_continuous feature needs to be included.
|
|
add_gbdt: A bool indicating if gbdt feature needs to be included.
|
|
add_user_id: A bool indicating if user_id feature needs to be included.
|
|
add_timestamp: A bool indicating if timestamp feature needs to be included. This will be useful
|
|
for sequential models and meta learning models.
|
|
add_user_age: A bool indicating if the user age feature needs to be included.
|
|
feature_list_provided: A list of features thats need to be included. If not specified, will use
|
|
FEATURE_LIST_DEFAULT by default.
|
|
opt: A namespace of arguments indicating the hyparameters.
|
|
run_light_ranking_group_metrics_in_bq: A bool indicating if heavy ranker score info needs to be included to compute group metrics in BigQuery.
|
|
|
|
Returns:
|
|
A twml feature config object.
|
|
"""
|
|
|
|
input_size_bits = DEFAULT_INPUT_SIZE_BITS if opt is None else opt.input_size_bits
|
|
|
|
feature_list = feature_list_provided if feature_list_provided != [] else FEATURE_LIST_DEFAULT
|
|
a_string_feat_list = [f[0] for f in feature_list if f[1] != "S"]
|
|
|
|
builder = contrib_feature_config.FeatureConfigBuilder(data_spec_path=data_spec_path)
|
|
|
|
builder = builder.extract_feature_group(
|
|
feature_regexes=a_string_feat_list,
|
|
group_name="continuous",
|
|
default_value=FEAT_CONFIG_DEFAULT_VAL,
|
|
type_filter=["CONTINUOUS"],
|
|
)
|
|
|
|
builder = builder.extract_features_as_hashed_sparse(
|
|
feature_regexes=a_string_feat_list,
|
|
output_tensor_name="sparse_no_continuous",
|
|
hash_space_size_bits=input_size_bits,
|
|
type_filter=["BINARY", "DISCRETE", "STRING", "SPARSE_BINARY"],
|
|
)
|
|
|
|
if add_gbdt:
|
|
builder = builder.extract_features_as_hashed_sparse(
|
|
feature_regexes=["ads\..*"],
|
|
output_tensor_name="gbdt_sparse",
|
|
hash_space_size_bits=input_size_bits,
|
|
)
|
|
|
|
if add_sparse_continous:
|
|
s_string_feat_list = [f[0] for f in feature_list if f[1] == "S"]
|
|
|
|
builder = builder.extract_features_as_hashed_sparse(
|
|
feature_regexes=s_string_feat_list,
|
|
output_tensor_name="sparse_continuous",
|
|
hash_space_size_bits=input_size_bits,
|
|
type_filter=["SPARSE_CONTINUOUS"],
|
|
)
|
|
|
|
if add_user_id:
|
|
builder = builder.extract_feature("meta.user_id")
|
|
if add_timestamp:
|
|
builder = builder.extract_feature("meta.timestamp")
|
|
if add_user_age:
|
|
builder = builder.extract_feature(USER_AGE_FEATURE_NAME)
|
|
|
|
if run_light_ranking_group_metrics_in_bq:
|
|
builder = builder.extract_feature("meta.trace_id")
|
|
builder = builder.extract_feature("meta.ranking.weighted_oonc_model_score")
|
|
|
|
builder = builder.add_labels(labels).define_weight("meta.weight")
|
|
|
|
return builder.build()
|
|
|
|
|
|
def get_feature_config_with_sparse_continuous(
|
|
data_spec_path,
|
|
feature_list_provided=[],
|
|
opt=None,
|
|
add_user_id=False,
|
|
add_timestamp=False,
|
|
add_user_age=False,
|
|
):
|
|
task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "OONC"
|
|
if task_name not in LABELS_MTL:
|
|
raise ValueError("Invalid Task Name !")
|
|
|
|
return _get_new_feature_config_base(
|
|
data_spec_path=data_spec_path,
|
|
labels=LABELS_MTL[task_name],
|
|
add_sparse_continous=True,
|
|
add_user_id=add_user_id,
|
|
add_timestamp=add_timestamp,
|
|
add_user_age=add_user_age,
|
|
feature_list_provided=feature_list_provided,
|
|
opt=opt,
|
|
)
|
|
|
|
|
|
def get_feature_config_light_ranking(
|
|
data_spec_path,
|
|
feature_list_provided=[],
|
|
opt=None,
|
|
add_user_id=True,
|
|
add_timestamp=False,
|
|
add_user_age=False,
|
|
add_gbdt=False,
|
|
run_light_ranking_group_metrics_in_bq=False,
|
|
):
|
|
task_name = opt.task_name if getattr(opt, "task_name", None) is not None else "HeavyRankPosition"
|
|
if task_name not in LABELS_LR:
|
|
raise ValueError("Invalid Task Name !")
|
|
if not feature_list_provided:
|
|
feature_list_provided = FEATURE_LIST_LIGHT_RANKING_DEFAULT
|
|
|
|
return _get_new_feature_config_base(
|
|
data_spec_path=data_spec_path,
|
|
labels=LABELS_LR[task_name],
|
|
add_sparse_continous=False,
|
|
add_gbdt=add_gbdt,
|
|
add_user_id=add_user_id,
|
|
add_timestamp=add_timestamp,
|
|
add_user_age=add_user_age,
|
|
feature_list_provided=feature_list_provided,
|
|
opt=opt,
|
|
run_light_ranking_group_metrics_in_bq=run_light_ranking_group_metrics_in_bq,
|
|
)
|