the-algorithm/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

146 lines
4.5 KiB
Python

import re
from twitter.deepbird.io.util import _get_feature_id
class Parser(object):
def parse(self, line):
match = re.search(self.pattern(), line)
if match:
return self._parse_match(match)
return None
def pattern(self):
raise NotImplementedError
def _parse_match(self, match):
raise NotImplementedError
class BiasParser(Parser):
'''
Parses the bias feature available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement bias -0.935945
:return: a RegEx that extracts feature weight.
'''
return r"\t(bias)\t([^\s]+)"
def _parse_match(self, match):
return float(match.group(2))
class BinaryFeatureParser(Parser):
'''
Parses binary features available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130
:return: a RegEx that extracts feature name and weight.
'''
return r"\t([\w\.]+)\t([^\s]+)"
def _parse_match(self, match):
return (match.group(1), float(match.group(2)))
class DiscretizedFeatureParser(Parser):
'''
Parses discretized features available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004
:return: a RegEx that extracts feature name, bin boundaries and weight.
'''
return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)"
def _parse_match(self, match):
left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")]
return (
match.group(1),
left_bin_side,
right_bin_side,
float(match.group(3))
)
class LollyModelFeaturesParser(Parser):
def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()):
self._bias_parser = bias_parser
self._binary_feature_parser = binary_feature_parser
self._discretized_feature_parser = discretized_feature_parser
def parse(self, lolly_model_reader):
parsed_features = {
"bias": None,
"binary": {},
"discretized": {}
}
def process_line_fn(line):
bias_parser_result = self._bias_parser.parse(line)
if bias_parser_result:
parsed_features["bias"] = bias_parser_result
return
binary_feature_parser_result = self._binary_feature_parser.parse(line)
if binary_feature_parser_result:
name, value = binary_feature_parser_result
parsed_features["binary"][name] = value
return
discretized_feature_parser_result = self._discretized_feature_parser.parse(line)
if discretized_feature_parser_result:
name, left_bin, right_bin, weight = discretized_feature_parser_result
discretized_features = parsed_features["discretized"]
if name not in discretized_features:
discretized_features[name] = []
discretized_features[name].append((left_bin, right_bin, weight))
lolly_model_reader.read(process_line_fn)
return parsed_features
class DBv2DataExampleParser(Parser):
'''
Parses data records printed by the DBv2 train.py build_graph function.
Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
'''
def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()):
self.features = lolly_model_features_parser.parse(lolly_model_reader)
self.feature_name_by_dbv2_id = {}
for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()):
self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name
def pattern(self):
'''
:return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
'''
return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]"
def _parse_match(self, match):
feature_ids = match.group(3).split(" ")
feature_values = match.group(4).split(" ")
value_by_feature_name = {}
for index in range(len(feature_ids)):
feature_id = feature_ids[index]
if feature_id not in self.feature_name_by_dbv2_id:
print("Missing feature with id: " + str(feature_id))
continue
value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index])
return value_by_feature_name