diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD deleted file mode 100644 index a834ba69e..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -python3_library( - name = "libs_py3", - sources = ["*.py"], - dependencies = [ - "src/python/twitter/deepbird/io", - "twml:twml-nodeps", - ], -) - -python37_binary( - name = "score", - source = "score.py", - dependencies = [ - ":libs_py3", - "3rdparty/python/_closures/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly:score", - "twml", - ], -) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx new file mode 100644 index 000000000..e977474f9 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx new file mode 100644 index 000000000..0fcd18354 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx new file mode 100644 index 000000000..b614c22b8 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py deleted file mode 100644 index 723dd626c..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py +++ /dev/null @@ -1,23 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from ..constants import EB_SCORE_IDX - -# The rationale behind this logic is available at TQ-9678. -def get_lolly_logits(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly logits. - ''' - eb_lolly_scores = get_lolly_scores(labels) - inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) - lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores)) - return lolly_activations - -def get_lolly_scores(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly scores. 
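- Example (illustrative): a logged label value of 42.0 corresponds to a lolly score of 0.42 after the division by 100 below.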
- ''' - logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) - eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) - return eb_lolly_scores diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx new file mode 100644 index 000000000..3490d33aa Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py deleted file mode 100644 index cb39c67a7..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py +++ /dev/null @@ -1,145 +0,0 @@ -import re - -from twitter.deepbird.io.util import _get_feature_id - - -class Parser(object): - def parse(self, line): - match = re.search(self.pattern(), line) - if match: - return self._parse_match(match) - return None - - def pattern(self): - raise NotImplementedError - - def _parse_match(self, match): - raise NotImplementedError - - -class BiasParser(Parser): - ''' - Parses the bias feature available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement bias -0.935945 - :return: a RegEx that extracts feature weight. - ''' - return r"\t(bias)\t([^\s]+)" - - def _parse_match(self, match): - return float(match.group(2)) - - -class BinaryFeatureParser(Parser): - ''' - Parses binary features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 - :return: a RegEx that extracts feature name and weight. - ''' - return r"\t([\w\.]+)\t([^\s]+)" - - def _parse_match(self, match): - return (match.group(1), float(match.group(2))) - - -class DiscretizedFeatureParser(Parser): - ''' - Parses discretized features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 - :return: a RegEx that extracts feature name, bin boundaries and weight. 
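- Example (illustrative): once _parse_match is applied, the sample line above yields ("encoded_tweet_features.user_reputation", 1.0, 2.0, 0.031004).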
- ''' - return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" - - def _parse_match(self, match): - left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")] - return ( - match.group(1), - left_bin_side, - right_bin_side, - float(match.group(3)) - ) - - -class LollyModelFeaturesParser(Parser): - def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()): - self._bias_parser = bias_parser - self._binary_feature_parser = binary_feature_parser - self._discretized_feature_parser = discretized_feature_parser - - def parse(self, lolly_model_reader): - parsed_features = { - "bias": None, - "binary": {}, - "discretized": {} - } - def process_line_fn(line): - bias_parser_result = self._bias_parser.parse(line) - if bias_parser_result: - parsed_features["bias"] = bias_parser_result - return - - binary_feature_parser_result = self._binary_feature_parser.parse(line) - if binary_feature_parser_result: - name, value = binary_feature_parser_result - parsed_features["binary"][name] = value - return - - discretized_feature_parser_result = self._discretized_feature_parser.parse(line) - if discretized_feature_parser_result: - name, left_bin, right_bin, weight = discretized_feature_parser_result - discretized_features = parsed_features["discretized"] - if name not in discretized_features: - discretized_features[name] = [] - discretized_features[name].append((left_bin, right_bin, weight)) - - lolly_model_reader.read(process_line_fn) - - return parsed_features - - -class DBv2DataExampleParser(Parser): - ''' - Parses data records printed by the DBv2 train.py build_graph function. - Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]] - ''' - - def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()): - self.features = lolly_model_features_parser.parse(lolly_model_reader) - self.feature_name_by_dbv2_id = {} - - for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()): - self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name - - def pattern(self): - ''' - :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values. - ''' - return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. 
]+)\]\]" - - def _parse_match(self, match): - feature_ids = match.group(3).split(" ") - feature_values = match.group(4).split(" ") - - value_by_feature_name = {} - for index in range(len(feature_ids)): - feature_id = feature_ids[index] - if feature_id not in self.feature_name_by_dbv2_id: - print("Missing feature with id: " + str(feature_id)) - continue - value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index]) - - return value_by_feature_name diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx new file mode 100644 index 000000000..1fb5fc64c Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py deleted file mode 100644 index ab33ee4e7..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py +++ /dev/null @@ -1,8 +0,0 @@ -class LollyModelReader(object): - def __init__(self, lolly_model_file_path): - self._lolly_model_file_path = lolly_model_file_path - - def read(self, process_line_fn): - with open(self._lolly_model_file_path, "r") as file: - for line in file: - process_line_fn(line) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx new file mode 100644 index 000000000..4ff26fdc5 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py deleted file mode 100644 index 5692616c2..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -from .parsers import DBv2DataExampleParser -from .reader import LollyModelReader -from .scorer import LollyModelScorer - - -if __name__ == "__main__": - lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) - lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader)) - - score = lolly_model_scorer.score(data_example=sys.argv[2]) - print(score) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx new file mode 100644 index 000000000..330524c62 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py deleted file mode 100644 index 621c43388..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py +++ /dev/null @@ -1,37 +0,0 @@ -class LollyModelScorer(object): - def __init__(self, data_example_parser): - self._data_example_parser = data_example_parser - - def score(self, data_example): - 
value_by_feature_name = self._data_example_parser.parse(data_example) - features = self._data_example_parser.features - return self._score(value_by_feature_name, features) - - def _score(self, value_by_feature_name, features): - score = features["bias"] - score += self._score_binary_features(features["binary"], value_by_feature_name) - score += self._score_discretized_features(features["discretized"], value_by_feature_name) - return score - - def _score_binary_features(self, binary_features, value_by_feature_name): - score = 0.0 - for binary_feature_name, binary_feature_weight in binary_features.items(): - if binary_feature_name in value_by_feature_name: - score += binary_feature_weight - return score - - def _score_discretized_features(self, discretized_features, value_by_feature_name): - score = 0.0 - for discretized_feature_name, buckets in discretized_features.items(): - if discretized_feature_name in value_by_feature_name: - feature_value = value_by_feature_name[discretized_feature_name] - score += self._find_matching_bucket_weight(buckets, feature_value) - return score - - def _find_matching_bucket_weight(self, buckets, feature_value): - for left_side, right_side, weight in buckets: - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - if feature_value >= left_side and feature_value < right_side: - return weight - - raise LookupError("Couldn't find a matching bucket for the given feature value.") diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx new file mode 100644 index 000000000..032e3e96d Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py deleted file mode 100644 index 2d0342551..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py +++ /dev/null @@ -1,91 +0,0 @@ -from .parsers import LollyModelFeaturesParser - - -class TFModelInitializerBuilder: - - def __init__(self, model_features_parser=LollyModelFeaturesParser()): - self._model_features_parser = model_features_parser - - def build(self, lolly_model_reader): - ''' - :param lolly_model_reader: LollyModelReader instance - :return: tf_model_initializer dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights) - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... 
- "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - ''' - tf_model_initializer = { - "features": {} - } - - features = self._model_features_parser.parse(lolly_model_reader) - tf_model_initializer["features"]["bias"] = features["bias"] - self._set_discretized_features(features["discretized"], tf_model_initializer) - - self._dedup_binary_features(features["binary"], features["discretized"]) - tf_model_initializer["features"]["binary"] = features["binary"] - - return tf_model_initializer - - def _set_discretized_features(self, discretized_features, tf_model_initializer): - if len(discretized_features) == 0: - return - - num_bins = max([len(bins) for bins in discretized_features.values()]) - - bin_boundaries_and_weights = {} - for feature_name in discretized_features: - bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights( - discretized_features[feature_name], num_bins) - - tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights - - def _dedup_binary_features(self, binary_features, discretized_features): - [binary_features.pop(feature_name) for feature_name in discretized_features] - - def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins): - bin_boundary_weight_pairs = [] - - for bucket in discretized_feature_buckets: - bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) - - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. - for bin_boundary_weight_pair in bin_boundary_weight_pairs: - if bin_boundary_weight_pair[0] < float("inf"): - bin_boundary_weight_pair[0] *= -1 - - while len(bin_boundary_weight_pairs) < num_bins: - bin_boundary_weight_pairs.append([float("inf"), float(0)]) - - bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0]) - - bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) - - return { - "bin_boundaries": bin_boundaries, - "weights": weights - } diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx new file mode 100644 index 000000000..c016c5b4e Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py deleted file mode 100644 index 6919914f8..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py +++ /dev/null @@ -1,120 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from collections import OrderedDict -from .constants import EB_SCORE_IDX -from .lolly.data_helpers import get_lolly_scores - -import twml - -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - This function was copied from twml/metrics.py with the following adjustments: - - Override example weights with the ones set in graph_output. - - Tile labels in order to support per engagement metrics for both TF and Lolly scores. - - Add lolly_tf_score_MSE metric. 
- Note: All custom lines have a comment that starts with 'Added' - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) - metrics.remove('pr_curve') - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated with the batch. - weights: - weights of the samples. - """ - - # Added to support the example weights overriding. - weights = graph_output["weights"] - # Added to support per engagement metrics for both TF and Lolly scores. - labels = tf.tile(labels, [1, 2]) - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if not hard_preds: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." - - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - # be known at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric names are case-insensitive. - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - # Added to compare TF and Lolly scores. 
- eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_mse(predictions, labels): - lolly_scores = get_lolly_scores(labels) - tf_scores = predictions[:, EB_SCORE_IDX] - squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) - - value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") - update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") - - return value_op, update_op diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD deleted file mode 100644 index d8cd264ad..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD +++ /dev/null @@ -1,8 +0,0 @@ -python3_library( - name = "libs_py3", - sources = ["*.py"], - dependencies = [ - "src/python/twitter/deepbird/io", - "twml:twml-nodeps", - ], -) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx new file mode 100644 index 000000000..d79bef99e Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx new file mode 100644 index 000000000..0fcd18354 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx new file mode 100644 index 000000000..f79cd3496 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py deleted file mode 100644 index 82c31bde0..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py +++ /dev/null @@ -1,62 +0,0 @@ -from .hashing_utils import make_feature_id - -from twml.contrib.layers.hashing_discretizer import HashingDiscretizer -import numpy as np - - -class TFModelDiscretizerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :param tf_model_initializer: dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... 
- "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - :return: a HashingDiscretizer instance. - ''' - discretized_features = tf_model_initializer["features"]["discretized"] - - max_bins = 0 - - feature_ids = [] - bin_vals = [] - for feature_name in discretized_features: - bin_boundaries = discretized_features[feature_name]["bin_boundaries"] - feature_id = make_feature_id(feature_name, self.num_bits) - feature_ids.append(feature_id) - np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries] - bin_vals.append(np_bin_boundaries) - - max_bins = max(max_bins, len(np_bin_boundaries)) - - feature_ids_np = np.array(feature_ids) - bin_vals_np = np.array(bin_vals).flatten() - - return HashingDiscretizer( - feature_ids=feature_ids_np, - bin_vals=bin_vals_np, - n_bin=max_bins, - out_bits=self.num_bits - ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx new file mode 100644 index 000000000..ab43793e7 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py deleted file mode 100644 index 2c57f8d63..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from twitter.deepbird.io.util import _get_feature_id - -import numpy as np - - -def numpy_hashing_uniform(the_id, bin_idx, output_bits): - """ - integer_multiplicative_hashing - This is a reimplementation, for testing purposes, of the - c++ version found in hashing_discretizer_impl.cpp - """ - hashing_constant = 2654435761 - N = 32 - with np.errstate(over='ignore'): - the_id *= hashing_constant - the_id += bin_idx - the_id *= hashing_constant - the_id >>= N - output_bits - the_id &= (1 << output_bits) - 1 - return the_id - - -def make_feature_id(name, num_bits): - feature_id = _get_feature_id(name) - return np.int64(limit_bits(feature_id, num_bits)) - - -def limit_bits(value, num_bits): - return value & ((2 ** num_bits) - 1) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx new file mode 100644 index 000000000..95378abd9 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py deleted file mode 100644 index 63491ea38..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py +++ /dev/null @@ -1,34 +0,0 @@ -from .hashing_utils import make_feature_id, 
numpy_hashing_uniform - -import numpy as np -import tensorflow.compat.v1 as tf -import twml - - -class TFModelWeightsInitializerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :return: (bias_initializer, weight_initializer) - ''' - initial_weights = np.zeros((2 ** self.num_bits, 1)) - - features = tf_model_initializer["features"] - self._set_binary_feature_weights(initial_weights, features["binary"]) - self._set_discretized_feature_weights(initial_weights, features["discretized"]) - - return tf.constant_initializer(features["bias"]), twml.contrib.initializers.PartitionConstant(initial_weights) - - def _set_binary_feature_weights(self, initial_weights, binary_features): - for feature_name, weight in binary_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - initial_weights[feature_id][0] = weight - - def _set_discretized_feature_weights(self, initial_weights, discretized_features): - for feature_name, discretized_feature in discretized_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - for bin_idx, weight in enumerate(discretized_feature["weights"]): - final_bucket_id = numpy_hashing_uniform(feature_id, bin_idx, self.num_bits) - initial_weights[final_bucket_id][0] = weight diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx new file mode 100644 index 000000000..c507836a7 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py deleted file mode 100644 index 6ef181f5f..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py +++ /dev/null @@ -1,212 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -import tensorflow_hub as hub - -from datetime import datetime -from tensorflow.compat.v1 import logging -from twitter.deepbird.projects.timelines.configs import all_configs -from twml.trainers import DataRecordTrainer -from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph -from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export -from .metrics import get_multi_binary_class_metric_fn -from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES -from .example_weights import add_weight_arguments, make_weights_tensor -from .lolly.data_helpers import get_lolly_logits -from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder -from .lolly.reader import LollyModelReader -from .tf_model.discretizer_builder import TFModelDiscretizerBuilder -from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder - -import twml - -def get_feature_values(features_values, params): - if params.lolly_model_tsv: - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. 
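- # e.g. (illustrative, assuming adjacent Lolly bins [1.0, 2.0) and [2.0, ...)): the stored boundaries become -1.0 and -2.0, and a raw value of 1.5 is looked up as -1.5, which falls in (-2.0, -1.0], matching 1.5 in [1.0, 2.0).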
- # - # Thus, invert the feature values, so that HashingDiscretizer can find the correct bucket. - return tf.multiply(features_values, -1.0) - else: - return features_values - -def build_graph(features, label, mode, params, config=None): - weights = None - if "weights" in features: - weights = make_weights_tensor(features["weights"], label, params) - - num_bits = params.input_size_bits - - if mode == "infer": - indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) - dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits]) - sparse_tf = tf.SparseTensor( - indices=indices, - values=get_feature_values(features["input_sparse_tensor_values"], params), - dense_shape=dense_shape - ) - else: - features["values"] = get_feature_values(features["values"], params) - sparse_tf = twml.util.convert_to_sparse(features, num_bits) - - if params.lolly_model_tsv: - tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv)) - bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer) - discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) - else: - discretizer = hub.Module(params.discretizer_save_dir) - bias_initializer, weight_initializer = None, None - - input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") - - logits = twml.layers.full_sparse( - inputs=input_sparse, - output_size=1, - bias_initializer=bias_initializer, - weight_initializer=weight_initializer, - use_sparse_grads=(mode == "train"), - use_binary_values=True, - name="full_sparse_1" - ) - - loss = None - - if mode != "infer": - lolly_activations = get_lolly_logits(label) - - if opt.print_data_examples: - logits = print_data_example(logits, lolly_activations, features) - - if params.replicate_lolly: - loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) - else: - batch_size = tf.shape(label)[0] - target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1)) - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits) - loss = twml.util.weighted_average(loss, weights) - - num_labels = tf.shape(label)[1] - eb_scores = tf.tile(lolly_activations, [1, num_labels]) - logits = tf.tile(logits, [1, num_labels]) - logits = tf.concat([logits, eb_scores], axis=1) - - output = tf.nn.sigmoid(logits) - - return {"output": output, "loss": loss, "weights": weights} - -def print_data_example(logits, lolly_activations, features): - return tf.Print( - logits, - [logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))], - message="DATA EXAMPLE = ", - summarize=10000 - ) - -def earlybird_output_fn(graph_output): - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput( - {"prediction": tf.identity(graph_output["output"], name="output_scores")} - ) - } - return export_outputs - -if __name__ == "__main__": - parser = DataRecordTrainer.add_parser_arguments() - - parser = twml.contrib.calibrators.add_discretizer_arguments(parser) - - parser.add_argument("--label", type=str, help="label for the engagement") - parser.add_argument("--model.use_existing_discretizer", action="store_true", - dest="model_use_existing_discretizer", - help="Load a pre-trained calibration or train a new one") - parser.add_argument("--input_size_bits", type=int) - 
parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name") - parser.add_argument("--feature_config", type=str) - parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly", - help="Train a regression model with MSE loss and the logged Earlybird score as a label") - parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv", - help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" - "No discretizer gets trained or loaded if set.") - parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples", - help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'") - add_weight_arguments(parser) - - opt = parser.parse_args() - - feature_config_module = all_configs.select_feature_config(opt.feature_config) - - feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label) - - parse_fn = twml.parsers.get_sparse_parse_fn( - feature_config, - keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes")) - - if not opt.lolly_model_tsv: - if opt.model_use_existing_discretizer: - logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]") - logging.info(f"Using calibration at {opt.discretizer_save_dir}") - else: - logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]") - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( - opt.discretizer_num_bins, - opt.discretizer_output_size_bits - ) - calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer", - params=opt, - calibrator=calibrator, - build_graph_fn=build_percentile_discretizer_graph, - feature_config=feature_config) - - trainer = DataRecordTrainer( - name="earlybird", - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - feature_config=feature_config, - metric_fn=get_multi_binary_class_metric_fn( - metrics=["roc_auc"], - classes=PREDICTED_CLASSES - ), - warm_start_from=None - ) - - train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) - eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) - - logging.info("Training and Evaluation ...") - trainingStartTime = datetime.now() - trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn) - trainingEndTime = datetime.now() - logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime)) - - if trainer._estimator.config.is_chief: - serving_input_in_earlybird = { - "input_sparse_tensor_indices": array_ops.placeholder( - name="input_sparse_tensor_indices", - shape=[None, 2], - dtype=dtypes.int64), - "input_sparse_tensor_values": array_ops.placeholder( - name="input_sparse_tensor_values", - shape=[None], - dtype=dtypes.float32), - "input_sparse_tensor_shape": array_ops.placeholder( - name="input_sparse_tensor_shape", - shape=[2], - dtype=dtypes.int64) - } - serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird) - twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=parse_fn, - serving_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=earlybird_output_fn, - feature_spec=feature_config.get_feature_spec() - ) - logging.info("The export model path is: " + opt.export_dir) diff --git a/src/scala/com/twitter/graph/batch/BUILD.bazel 
b/src/scala/com/twitter/graph/batch/BUILD.bazel deleted file mode 100644 index 0dcfc85cf..000000000 --- a/src/scala/com/twitter/graph/batch/BUILD.bazel +++ /dev/null @@ -1,91 +0,0 @@ -JOB = ["job/**/*"] - -scala_library( - name = "batch", - sources = ["**/*.scala"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/cascading:cascading-core", - "3rdparty/jvm/cascading:cascading-hadoop", - "3rdparty/jvm/cascading:cascading-local", - "3rdparty/jvm/cascading:cascading-thrift", - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/scalding:args", - "3rdparty/src/jvm/com/twitter/scalding:commons", - "3rdparty/src/jvm/com/twitter/scalding:core", - "3rdparty/src/jvm/com/twitter/scalding:date", - "3rdparty/src/jvm/com/twitter/scalding:parquet", - "3rdparty/src/jvm/com/twitter/summingbird:batch", - "3rdparty/src/jvm/com/twitter/summingbird:client", - "graphstore/common:flock_follows-java", - "src/java/com/twitter/common_internal/util:date_util", - "src/java/com/twitter/twadoop/batch", - "src/java/com/twitter/twadoop/util/dbconfig", - "src/java/com/twitter/twadoop/util/yaml", - "src/protobuf/com/twitter/twadoop", - "src/scala/com/twitter/pluck", - "src/scala/com/twitter/pluck/source/combined_user_source", - "src/scala/com/twitter/pluck/source/jdbc", - "src/scala/com/twitter/scalding_internal/error_handling", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/multiformat", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/wtf/scalding/jobs/common:date_util", - "src/thrift/com/twitter/gizmoduck:user-thrift-java", - "src/thrift/com/twitter/twadoop/user/gen:gen-java", - "util/util-core:scala", - ], -) - -#pants.new build target for the old "dist" -hadoop_binary( - name = "graph-batch-deploy", - main = "com.twitter.scalding.Tool", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweepcred", - ], -) - -# Generated with `capesospy-v2 create_target tweepcred_job science/scalding/mesos/wtf/recos_platform_atla_proc.yaml`, config hash d63a47. 
-scalding_job( - name = "tweepcred_job", - main = "com.twitter.graph.batch.job.tweepcred.TweepcredBatchJob", - args = ["--weighted false --hadoop_config /etc/hadoop/hadoop-conf-proc-atla"], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.reducers", "1200"), - ("hadoop.submitter.disk", "200000m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "24,44,04 * * * *", - hadoop_cluster = "atla-proc", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweepcred", - ], -) diff --git a/src/scala/com/twitter/graph/batch/BUILD.docx b/src/scala/com/twitter/graph/batch/BUILD.docx new file mode 100644 index 000000000..ef24c6dc8 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/BUILD.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx new file mode 100644 index 000000000..22adbe9d1 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala deleted file mode 100644 index 568e85251..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala +++ /dev/null @@ -1,83 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource -import com.twitter.scalding._ - -/** - * Calculate tweepcred from the given pagerank file. If post_adjust is true, - * reduce pagerank for users with low followers compared to number of - * followings based on existing reputation code. 
- * Options: - * --input_pagerank: given pagerank - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * optional arguments: - * --post_adjust: whether to do post adjust, default true - * - */ -class ExtractTweepcred(args: Args) extends Job(args) { - val POST_ADJUST = args.getOrElse("post_adjust", "true").toBoolean - - val inputPagerank = getInputPagerank(args("input_pagerank")) - .map(() -> ('num_followers, 'num_followings)) { (u: Unit) => - (0, 0) - } - - val userInfo = TypedPipe - .from(MostRecentCombinedUserSnapshotSource) - .flatMap { combinedUser => - val user = Option(combinedUser.user) - val userId = user.map(_.id).getOrElse(0L) - val userExtended = Option(combinedUser.user_extended) - val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0) - val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0) - - if (userId == 0L || user.map(_.safety).exists(_.deactivated)) { - None - } else { - Some((userId, 0.0, numFollowers, numFollowings)) - } - } - .toPipe[(Long, Double, Int, Int)]('src_id, 'mass_input, 'num_followers, 'num_followings) - - val pagerankWithSuspended = (inputPagerank ++ userInfo) - .groupBy('src_id) { - _.max('mass_input) - .max('num_followers) - .max('num_followings) - } - - pagerankWithSuspended - .discard('num_followers, 'num_followings) - .write(Tsv(args("output_pagerank"))) - - val adjustedPagerank = - if (POST_ADJUST) { - pagerankWithSuspended - .map(('mass_input, 'num_followers, 'num_followings) -> 'mass_input) { - input: (Double, Int, Int) => - Reputation.adjustReputationsPostCalculation(input._1, input._2, input._3) - } - .normalize('mass_input) - } else { - pagerankWithSuspended - .discard('num_followers, 'num_followings) - } - - val tweepcred = adjustedPagerank - .map('mass_input -> 'mass_input) { input: Double => - Reputation.scaledReputation(input) - } - - tweepcred.write(Tsv(args("output_tweepcred"))) - tweepcred.write(Tsv(args("current_tweepcred"))) - tweepcred.write(Tsv(args("today_tweepcred"))) - - def getInputPagerank(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1) -> ('src_id, 'mass_input)) { input: (Long, Double) => - input - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx new file mode 100644 index 000000000..82345af6f Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala deleted file mode 100644 index 284ba45f8..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala +++ /dev/null @@ -1,275 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.data.proto.Flock -import com.twitter.scalding._ -import com.twitter.pluck.source._ -import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.service.interactions.InteractionGraph -import graphstore.common.FlockFollowsJavaDataset -import java.util.TimeZone - -/** - * Prepare the graph data for page rank calculation. Also generate the initial - * pagerank as the starting point. Afterwards, start WeightedPageRank job. 
- * - * Either read a tsv file for testing or read the following to build the graph - * flock edges Flock.Edge - * real graph input for weights InteractionGraph.Edge - * - * Options: - * --pwd: working directory, will generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * Optional arguments: - * --input: use the given tsv file instead of flock and real graph - * --weighted: do weighted pagerank, default false - * --flock_edges_only: restrict graph to flock edges, default true - * --input_pagerank: continue pagerank from this - * - * Plus the following options for WeightedPageRank and ExtractTweepcred: - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * Optional: - * --maxiterations: how many iterations to run. Default is 20 - * --jumpprob: probability of a random jump, default is 0.1 - * --threshold: total difference before finishing early, default 0.001 - * --post_adjust: whether to do post adjust, default true - */ -class PreparePageRankData(args: Args) extends Job(args) { - implicit val timeZone: TimeZone = DateOps.UTC - val PWD = args("pwd") - val WEIGHTED = args.getOrElse("weighted", "false").toBoolean - val FLOCK_EDGES_ONLY = args.getOrElse("flock_edges_only", "true").toBoolean - - val ROW_TYPE_1 = 1 - val ROW_TYPE_2 = 2 - - // graph data and user mass - val userMass = getUserMass - val nodesWithPrior = getGraphData(userMass) - val numNodes = nodesWithPrior.groupAll { _.size } - numNodes.write(Tsv(PWD + "/numnodes")) - dumpNodes(nodesWithPrior, PWD + "/nodes"); - - // initial pagerank to start computation - generateInitialPagerank(nodesWithPrior) - - // continue with the calculation - override def next = { - Some(new WeightedPageRank(args)) - } - - /** - * read flock edges - */ - def getFlockEdges = { - DAL - .readMostRecentSnapshotNoOlderThan(FlockFollowsJavaDataset, Days(7)) - .toTypedSource - .flatMapTo('src_id, 'dst_id) { edge: Flock.Edge => - if (edge.getStateId() == Flock.State.Positive.getNumber()) { - Some((edge.getSourceId(), edge.getDestinationId())) - } else { - None - } - } - } - - /** - * read real graph edges with weights - */ - def getRealGraphEdges = { - RealGraphEdgeSource() - .flatMapTo('src_id, 'dst_id, 'weight) { edge: InteractionGraph.Edge => - if (edge.getSourceId() != edge.getDestinationId()) { - val srcId = edge.getSourceId() - val dstId = edge.getDestinationId() - val weight = edge.getWeight().toFloat - Some((srcId, dstId, weight)) - } else { - None - } - } - } - - /** - * combine real graph and flock. If flock_edges_only is true, only take the - * flock edges; otherwise edges are either from flock or from real graph. 
- * edge weights default to 1, overwritten by weights from real graph - */ - def getFlockRealGraphEdges = { - val flock = getFlockEdges - - if (WEIGHTED) { - val flockWithWeight = flock - .map(() -> ('weight, 'rowtype)) { (u: Unit) => - (1.0f, ROW_TYPE_1) - } - - val realGraph = getRealGraphEdges - .map(() -> 'rowtype) { (u: Unit) => - (ROW_TYPE_2) - } - - val combined = (flockWithWeight ++ realGraph) - .groupBy('src_id, 'dst_id) { - _.min('rowtype) - .max('weight) // take whichever is bigger - } - - if (FLOCK_EDGES_ONLY) { - combined.filter('rowtype) { (rowtype: Int) => - rowtype == ROW_TYPE_1 - } - } else { - combined - } - } else { - flock.map(() -> ('weight)) { (u: Unit) => - 1.0f - } - }.project('src_id, 'dst_id, 'weight) - } - - def getCsvEdges(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1, 2) -> ('src_id, 'dst_id, 'weight)) { input: (Long, Long, Float) => - input - } - } - - /* - * Compute user mass based on combined user - */ - def getUserMass = - TypedPipe - .from(MostRecentCombinedUserSnapshotSource) - .flatMap { user => - UserMass.getUserMass(user) - } - .map { userMassInfo => - (userMassInfo.userId, userMassInfo.mass) - } - .toPipe[(Long, Double)]('src_id_input, 'mass_prior) - .normalize('mass_prior) - - /** - * Read either flock/real_graph or a given tsv file - * group by the source id, and output node data structure - * merge with the user_mass. - * return <'src_id, 'dst_ids, 'weights, 'mass_prior> - * - * make sure src_id is the same set as in user_mass, and dst_ids - * are a subset of user_mass. e.g. flock has edges like 1->2, - * where both users 1 and 2 do not exist anymore - */ - def getGraphData(userMass: RichPipe) = { - val edges: RichPipe = args.optional("input") match { - case None => getFlockRealGraphEdges - case Some(input) => getCsvEdges(input) - } - - // remove edges where dst_id is not in userMass - val filterByDst = userMass - .joinWithLarger('src_id_input -> 'dst_id, edges) - .discard('src_id_input, 'mass_prior) - - // aggregate by the source id - val nodes = filterByDst - .groupBy('src_id) { - _.mapReduceMap(('dst_id, 'weight) -> ('dst_ids, 'weights)) /* map1 */ { a: (Long, Float) => - (Vector(a._1), if (WEIGHTED) Vector(a._2) else Vector()) - } /* reduce */ { (a: (Vector[Long], Vector[Float]), b: (Vector[Long], Vector[Float])) => - { - (a._1 ++ b._1, a._2 ++ b._2) - } - } /* map2 */ { a: (Vector[Long], Vector[Float]) => - a - } - } - .mapTo( - ('src_id, 'dst_ids, 'weights) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) { - input: (Long, Vector[Long], Vector[Float]) => - { - (input._1, input._2.toArray, input._3.toArray, 0.0, ROW_TYPE_1) - } - } - - // get to the same schema - val userMassNodes = userMass - .mapTo(('src_id_input, 'mass_prior) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) { - input: (Long, Double) => - { - (input._1, Array[Long](), Array[Float](), input._2, ROW_TYPE_2) - } - } - - // make src_id the same set as in userMass - (nodes ++ userMassNodes) - .groupBy('src_id) { - _.sortBy('rowtype) - .head('dst_ids, 'weights) - .last('mass_prior, 'rowtype) - } - .filter('rowtype) { input: Int => - input == ROW_TYPE_2 - } - } - - /** - * generate the graph data output - */ - def dumpNodes(nodes: RichPipe, fileName: String) = { - mode match { - case Hdfs(_, conf) => nodes.write(SequenceFile(fileName)) - case _ => - nodes - .mapTo((0, 1, 2, 3) -> (0, 1, 2, 3)) { input: (Long, Array[Long], Array[Float], Double) => - (input._1, input._2.mkString(","), input._3.mkString(","), input._4) - } - .write(Tsv(fileName)) - } - } - - 
/* - * output prior mass or copy the given mass file (merge, normalize) - * to be used as the starting point - */ - def generateInitialPagerank(nodes: RichPipe) = { - val prior = nodes - .project('src_id, 'mass_prior) - - val combined = args.optional("input_pagerank") match { - case None => prior - case Some(fileName) => { - val massInput = Tsv(fileName).read - .mapTo((0, 1) -> ('src_id, 'mass_prior, 'rowtype)) { input: (Long, Double) => - (input._1, input._2, ROW_TYPE_2) - } - - val priorRow = prior - .map(() -> ('rowtype)) { (u: Unit) => - ROW_TYPE_1 - } - - (priorRow ++ massInput) - .groupBy('src_id) { - _.sortBy('rowtype) - .last('mass_prior) - .head('rowtype) - } - // throw away extra nodes from input file - .filter('rowtype) { (rowtype: Int) => - rowtype == ROW_TYPE_1 - } - .discard('rowtype) - .normalize('mass_prior) - } - } - - combined.write(Tsv(PWD + "/pagerank_0")) - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/README b/src/scala/com/twitter/graph/batch/job/tweepcred/README deleted file mode 100644 index 55ef3b093..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/README +++ /dev/null @@ -1,75 +0,0 @@ -Tweepcred - -Tweepcred is a social network analysis tool that calculates the influence of Twitter users based on their interactions with other users. The tool uses the PageRank algorithm to rank users based on their influence. - -PageRank Algorithm -PageRank is a graph algorithm that was originally developed by Google to determine the importance of web pages in search results. The algorithm works by assigning a numerical score to each page based on the number and quality of other pages that link to it. The more links a page has from other high-quality pages, the higher its PageRank score. - -In the Tweepcred project, the PageRank algorithm is used to determine the influence of Twitter users based on their interactions with other users. The graph is constructed by treating Twitter users as nodes, and their interactions (mentions, retweets, etc.) as edges. The PageRank score of a user represents their influence in the network. - -Tweepcred PageRank Implementation -The implementation of the PageRank algorithm in Tweepcred is based on the Hadoop MapReduce framework. The algorithm is split into two stages: preparation and iteration. - -The preparation stage involves constructing the graph of Twitter users and their interactions, and initializing each user's PageRank score to a default value. This stage is implemented in the PreparePageRankData class. - -The iteration stage involves repeatedly calculating and updating the PageRank scores of each user until convergence is reached. This stage is implemented in the WeightedPageRank class, which is run multiple times until the algorithm converges. - -The Tweepcred PageRank implementation also includes a number of optimizations to improve performance and reduce memory usage. These optimizations include block compression, lazy loading, and in-memory caching. - - -========================================== TweepcredBatchJob.scala ========================================== - - -This is a Scala class that represents a batch job for computing the "tweepcred" (Twitter credibility) score for Twitter users using a weighted or unweighted PageRank algorithm. The class extends the AnalyticsIterativeBatchJob class, which is part of the Scalding framework used for data processing on Hadoop. - -The class defines various properties and methods that are used to configure and run the batch job. 
The args parameter represents the command-line arguments that are passed to the batch job, such as the --weighted flag that determines whether to use the weighted PageRank algorithm or not. - -The run method overrides the run method of the base class and prints the batch statistics after the job has finished. The children method defines a list of child jobs that need to be executed as part of the batch job. The messageHeader method returns a string that represents the header of the batch job message. - -========================================== ExtractTweepcred.scala ========================================== - -This class is a Scalding job that calculates "tweepcred" from a given pagerank file. Tweepcred is a measure of reputation for Twitter users that takes into account the number of followers they have and the number of people they follow. If the optional argument post_adjust is set to true (default value), then the pagerank values are adjusted based on the user's follower-to-following ratio. - -The class takes several command-line arguments specifying input and output files and options, and it uses the Scalding library to perform distributed data processing on the input files. It reads in the pagerank file and a user mass file, both in TSV format, and combines them to produce a new pagerank file with the adjusted values. The adjusted pagerank is then used to calculate tweepcred values, which are written to output files. - -The code makes use of the MostRecentCombinedUserSnapshotSource class from the com.twitter.pluck.source.combined_user_source package to obtain user information (such as follower and following counts). It also uses the Reputation class to perform the tweepcred calculations and adjustments. - - -========================================== UserMass.scala ========================================== - -The UserMass class is a helper class used to calculate the "mass" of a user on Twitter, as defined by a certain algorithm. The mass score represents the user's reputation and is used in various applications, such as in determining which users should be recommended to follow or which users should have their content highlighted. - -The getUserMass method of the UserMass class takes in a CombinedUser object, which contains information about a Twitter user, and returns an optional UserMassInfo object, which contains the user's ID and calculated mass score. - -The algorithm used to calculate the mass score takes into account various factors such as the user's account age, number of followers and followings, device usage, and safety status (restricted, suspended, verified). The calculation involves adding and multiplying weight factors and adjusting the mass score based on a threshold for the number of friends and followers. - - -========================================== PreparePageRankData.scala ========================================== - -The PreparePageRankData class prepares the graph data for the page rank calculation. It generates the initial pagerank and then starts the WeightedPageRank job. It has the following functionalities: - -It reads the user mass TSV file generated by the twadoop user_mass job. -It reads the graph data, which is either a TSV file or a combination of flock edges and real graph inputs for weights. -It generates the initial pagerank as the starting point for the pagerank computation. -It writes the number of nodes to a TSV file and dumps the nodes to another TSV file. 
-It has several options like weighted, flock_edges_only, and input_pagerank to fine-tune the pagerank calculation. -It also has options for the WeightedPageRank and ExtractTweepcred jobs, like output_pagerank, output_tweepcred, maxiterations, jumpprob, threshold, and post_adjust. -The PreparePageRankData class has several helper functions like getFlockEdges, getRealGraphEdges, getFlockRealGraphEdges, and getCsvEdges that read the graph data from different sources like DAL, InteractionGraph, or CSV files. It also has the generateInitialPagerank function that generates the initial pagerank from the graph data. - -========================================== WeightedPageRank.scala ========================================== - -WeightedPageRank is a class that performs the weighted PageRank algorithm on a given graph. - -The algorithm starts from a given PageRank value and performs one iteration, then tests for convergence. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job with the updated PageRank as input. If convergence has been reached, the algorithm starts the ExtractTweepcred job instead. - -The class takes in several options, including the working directory, total number of nodes, nodes file, PageRank file, total difference, whether to perform weighted PageRank, the current iteration, maximum iterations to run, probability of a random jump, and whether to do post adjust. - -The algorithm reads a nodes file that includes the source node ID, destination node IDs, weights, and mass prior. The algorithm also reads an input PageRank file that includes the source node ID and mass input. The algorithm then performs one iteration of the PageRank algorithm and writes the output PageRank to a file. - -The algorithm tests for convergence by calculating the total difference between the input and output PageRank masses. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job. If convergence has been reached, the algorithm starts the ExtractTweepcred job. - -========================================== Reputation.scala ========================================== - -This is a helper class called Reputation that contains methods for calculating a user's reputation score. The first method called scaledReputation takes a Double parameter raw which represents the user's page rank, and returns a Byte value that represents the user's reputation on a scale of 0 to 100. This method uses a formula that involves converting the logarithm of the page rank to a number between 0 and 100. - -The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank. 
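To make the flow concrete, here is a minimal, hypothetical driver (not part of the job; the sample inputs are invented) showing how the two methods described above compose, using the Reputation object from the diff below:

```scala
import com.twitter.graph.batch.job.tweepcred.Reputation

object ReputationExample {
  def main(args: Array[String]): Unit = {
    val rawPagerank = 1.0e-15 // example mass produced by the pagerank job
    // penalize a user who follows many accounts but has few followers
    val adjusted = Reputation.adjustReputationsPostCalculation(
      mass = rawPagerank,
      numFollowers = 100,
      numFollowings = 5000
    )
    // map log(pagerank) onto the 0-100 tweepcred scale
    val tweepcred: Byte = Reputation.scaledReputation(adjusted)
    println(s"tweepcred = $tweepcred")
  }
}
```

Since numFollowings exceeds the 2500-friend threshold and the friends-to-followers ratio is high, the division factor caps at 50 here, and the resulting tweepcred clamps to the bottom of the scale.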
diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx new file mode 100644 index 000000000..f74bf915d Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx new file mode 100644 index 000000000..0375417bb Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala deleted file mode 100644 index 6c81805fd..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -/** - * helper class to calculate reputation, borrowed from repo reputations - */ -object Reputation { - - /** - * convert pagerank to tweepcred between 0 and 100, - * taken from repo reputations, util/Utils.scala - */ - def scaledReputation(raw: Double): Byte = { - if (raw == 0 || (raw < 1.0e-20)) { - 0 - } else { - // convert log(pagerank) to a number between 0 and 100 - // the two parameters are from a linear fit by converting - // max pagerank -> 95 - // min pagerank -> 15 - val e: Double = 130d + 5.21 * scala.math.log(raw) // log to the base e - val pos = scala.math.rint(e) - val v = if (pos > 100) 100.0 else if (pos < 0) 0.0 else pos - v.toByte - } - } - - // these constants are taken from repo reputations, config/production.conf - private val threshAbsNumFriendsReps = 2500 - private val constantDivisionFactorGt_threshFriendsToFollowersRatioReps = 3.0 - private val threshFriendsToFollowersRatioUMass = 0.6 - private val maxDivFactorReps = 50 - - /** - * reduce pagerank of users with low followers but high followings - */ - def adjustReputationsPostCalculation(mass: Double, numFollowers: Int, numFollowings: Int) = { - if (numFollowings > threshAbsNumFriendsReps) { - val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers) - val divFactor = - scala.math.exp( - constantDivisionFactorGt_threshFriendsToFollowersRatioReps * - (friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) * - scala.math.log(scala.math.log(numFollowings)) - ) - mass / ((divFactor min maxDivFactorReps) max 1.0) - } else { - mass - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx new file mode 100644 index 000000000..585d64dde Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala deleted file mode 100644 index 48c06027b..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.scalding._ -import com.twitter.scalding_internal.job._ -import com.twitter.scalding_internal.job.analytics_batch._ - -/** - * Register the beginning of the tweepcred job in analytic batch table - * - * Options: - * --weighted: do weighted pagerank - * --hadoop_config: /etc/hadoop/hadoop-conf-proc-atla - * - */ -class TweepcredBatchJob(args: Args) extends
AnalyticsIterativeBatchJob(args) { - - def WEIGHTED = args("weighted").toBoolean - - override def timeout = Hours(36) - override def hasFlow = false - def descriptionSuffix = " weighted=" + args("weighted") - override def batchIncrement = Hours(24) - override def firstTime = RichDate("2015-10-02") - override def batchDescription = classOf[TweepcredBatchJob].getCanonicalName + descriptionSuffix - - override def run = { - val success = super.run - println("Batch Stat: " + messageHeader + " " + jobStat.get.toString) - success - } - - def startTime = dateRange.start - def dateString = startTime.toString("yyyy/MM/dd") - - override def children = { - val BASEDIR = "/user/cassowary/tweepcred/" - val baseDir = BASEDIR + (if (WEIGHTED) "weighted" else "unweighted") + "/daily/" - val tmpDir = baseDir + "tmp" - val outputDir = baseDir + dateString - val pageRankDir = outputDir + "/finalmass" - val tweepcredDir = outputDir + "/finaltweepcred" - val yesterdayStr = (startTime - Days(1)).toString("yyyy/MM/dd") - val yestPageRankDir = baseDir + yesterdayStr + "/finalmass" - val TWEEPCRED = "/tweepcred" - val curRep = (if (WEIGHTED) baseDir else BASEDIR) + "current" - val todayRep = (if (WEIGHTED) baseDir else BASEDIR) + dateString - val newArgs = args + ("pwd", Some(tmpDir)) + - ("output_pagerank", Some(pageRankDir)) + - ("output_tweepcred", Some(tweepcredDir)) + - ("input_pagerank", Some(yestPageRankDir)) + - ("current_tweepcred", Some(curRep + TWEEPCRED)) + - ("today_tweepcred", Some(todayRep + TWEEPCRED)) - - val prJob = new PreparePageRankData(newArgs) - - List(prJob) - } - - private def messageHeader = { - val dateString = dateRange.start.toString("yyyy/MM/dd") - classOf[TweepcredBatchJob].getSimpleName + - (if (WEIGHTED) " weighted " else " unweighted ") + dateString - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx new file mode 100644 index 000000000..3c81d3f6b Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala deleted file mode 100644 index 064819bb0..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.twadoop.user.gen.CombinedUser -import com.twitter.util.Time -import com.twitter.wtf.scalding.jobs.common.DateUtil - -case class UserMassInfo(userId: Long, mass: Double) - -/** - * helper class to calculate user mass, borrowed from repo reputations - */ -object UserMass { - - private val currentTimestamp = Time.now.inMilliseconds - private val constantDivisionFactorGt_threshFriendsToFollowersRatioUMass = 5.0 - private val threshAbsNumFriendsUMass = 500 - private val threshFriendsToFollowersRatioUMass = 0.6 - private val deviceWeightAdditive = 0.5 - private val ageWeightAdditive = 0.2 - private val restrictedWeightMultiplicative = 0.1 - - def getUserMass(combinedUser: CombinedUser): Option[UserMassInfo] = { - val user = Option(combinedUser.user) - val userId = user.map(_.id).getOrElse(0L) - val userExtended = Option(combinedUser.user_extended) - val age = user.map(_.created_at_msec).map(DateUtil.diffDays(_, currentTimestamp)).getOrElse(0) - val isRestricted = user.map(_.safety).exists(_.restricted) - val isSuspended = user.map(_.safety).exists(_.suspended) - val isVerified = 
user.map(_.safety).exists(_.verified) - val hasValidDevice = user.flatMap(u => Option(u.devices)).exists(_.isSetMessaging_devices) - val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0) - val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0) - - if (userId == 0L || user.map(_.safety).exists(_.deactivated)) { - None - } else { - val mass = - if (isSuspended) - 0 - else if (isVerified) - 100 - else { - var score = deviceWeightAdditive * 0.1 + - (if (hasValidDevice) deviceWeightAdditive else 0) - val normalizedAge = if (age > 30) 1.0 else (1.0 min scala.math.log(1.0 + age / 15.0)) - score *= normalizedAge - if (score < 0.01) score = 0.01 - if (isRestricted) score *= restrictedWeightMultiplicative - score = (score min 1.0) max 0 - score *= 100 - score - } - - val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers) - val adjustedMass = - if (numFollowings > threshAbsNumFriendsUMass && - friendsToFollowersRatio > threshFriendsToFollowersRatioUMass) { - mass / scala.math.exp( - constantDivisionFactorGt_threshFriendsToFollowersRatioUMass * - (friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) - ) - } else { - mass - } - - Some(UserMassInfo(userId, adjustedMass)) - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx new file mode 100644 index 000000000..ba44df6d2 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala deleted file mode 100644 index 7e06077a1..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala +++ /dev/null @@ -1,235 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.scalding._ - -/** - * weighted page rank for the given graph, start from the given pagerank, - * perform one iteration, test for convergence, if not yet, clone itself - * and start the next page rank job with updated pagerank as input; - * if converged, start ExtractTweepcred job instead - * - * Options: - * --pwd: working directory, will read/generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file eg pagerank_0, pagerank_1 etc - * totaldiff: the current max pagerank delta - * Optional arguments: - * --weighted: do weighted pagerank, default false - * --curiteration: what is the current iteration, default 0 - * --maxiterations: how many iterations to run. 
Default is 20 - * --jumpprob: probability of a random jump, default is 0.1 - * --threshold: total difference before finishing early, default 0.001 - * - * plus the following options for ExtractTweepcred: - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * Optional: - * --post_adjust: whether to do post adjust, default true - * - */ -class WeightedPageRank(args: Args) extends Job(args) { - val ROW_TYPE_1 = 1 - val ROW_TYPE_2 = 2 - - val PWD = args("pwd") - val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble - val WEIGHTED = args.getOrElse("weighted", "false").toBoolean - val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble - val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt - val CURITERATION = args.getOrElse("curiteration", "0").toInt - - // 'size - val numNodes = getNumNodes(PWD + "/numnodes") - - // 'src_id, 'dst_ids, 'weights, 'mass_prior - val nodes = getNodes(PWD + "/nodes") - - // 'src_id_input, 'mass_input - val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION) - - // one iteration of pagerank - val outputPagerank = doPageRank(nodes, inputPagerank) - val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1) - outputPagerank - .project('src_id, 'mass_n) - .write(Tsv(outputFileName)) - - // detect convergence - val totalDiff = outputPagerank - .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) => - scala.math.abs(args._1 - args._2) - } - .groupAll { _.sum[Double]('mass_diff) } - .write(Tsv(PWD + "/totaldiff")) - - /** - * test convergence, if not yet, kick off the next iteration - */ - override def next = { - // the max diff generated above - val totalDiff = Tsv(PWD + "/totaldiff").readAtSubmitter[Double].head - - if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) { - val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString)) - Some(clone(newArgs)) - } else { - val newArgs = args + ("input_pagerank", Some(outputFileName)) - Some(new ExtractTweepcred(newArgs)) - } - } - - def getInputPagerank(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Long, Double) => - input - } - } - - /** - * read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - */ - def getNodes(fileName: String) = { - mode match { - case Hdfs(_, conf) => { - SequenceFile(fileName).read - .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input: (Long, Array[Long], Array[Float], Double) => - input - } - } - case _ => { - Tsv(fileName).read - .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input: (Long, String, String, Double) => - { - ( - input._1, - // convert string to int array - if (input._2 != null && input._2.length > 0) { - input._2.split(",").map { _.toLong } - } else { - Array[Long]() - }, - // convert string to float array - if (input._3 != null && input._3.length > 0) { - input._3.split(",").map { _.toFloat } - } else { - Array[Float]() - }, - input._4 - ) - } - } - } - } - } - - /** - * the total number of nodes, single line file - */ - def getNumNodes(fileName: String) = { - Tsv(fileName).read - .mapTo(0 -> 'size) { input: Long => - input - } - } - - /** - * one iteration of pagerank - * inputPagerank: <'src_id_input, 'mass_input> - * return <'src_id, 'mass_n, 'mass_input> - * - * Here is a highlevel view of the unweighted algorithm: - * let - * N: number of nodes - * 
inputPagerank(N_i): prob of walking to node i, - * d(N_j): N_j's out degree - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j) - * deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N - * randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) - * pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1-ALPHA) - * - * For weighted algorithm: - * let - * w(N_j, N_i): weight from N_j to N_i - * tw(N_j): N_j's total out weights - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)) - * - */ - def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = { - // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input - val nodeJoined = nodeRows - .joinWithSmaller('src_id -> 'src_id_input, inputPagerank) - .discard('src_id_input) - - // 'src_id, 'mass_n - val pagerankNext = nodeJoined - .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) { - args: (Array[Long], Array[Float], Double) => - { - if (args._1.length > 0) { - if (WEIGHTED) { - // weighted distribution - val total: Double = args._2.sum - (args._1 zip args._2).map { idWeight: (Long, Float) => - (idWeight._1, args._3 * idWeight._2 / total) - } - } else { - // equal distribution - val dist: Double = args._3 / args._1.length - args._1.map { id: Long => - (id, dist) - } - } - } else { - //Here is a node that points to no other nodes (dangling) - Nil - } - } - } - .groupBy('src_id) { - _.sum[Double]('mass_n) - } - - // 'sum_mass - val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) } - - // 'deadMass - // single row jobs - // the dead page rank equally distributed to every node - val deadPagerank = sumPagerankNext - .crossWithTiny(numNodes) - .map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Long) => - (1.0 - input._1) / input._2 - } - .discard('size, 'sum_mass) - - // 'src_id_r, 'mass_n_r - // random jump probability plus dead page rank - val randomPagerank = nodeJoined - .crossWithTiny(deadPagerank) - .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) { - ranks: (Long, Double, Double, Double) => - (ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4) - } - - // 'src_id, 'mass_n - // scale next page rank to 1-ALPHA - val pagerankNextScaled = pagerankNext - .map('mass_n -> ('mass_n, 'mass_input)) { m: Double => - ((1 - ALPHA) * m, 0.0) - } - - // 'src_id, 'mass_n, 'mass_input - // random probability + next probability - (randomPagerank ++ pagerankNextScaled) - .groupBy('src_id) { - _.sum[Double]('mass_input) // keep the input pagerank - .sum[Double]('mass_n) // take the sum - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/README.docx b/src/scala/com/twitter/interaction_graph/README.docx new file mode 100644 index 000000000..dc53d8739 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/README.md b/src/scala/com/twitter/interaction_graph/README.md deleted file mode 100644 index 31b4cf00b..000000000 --- a/src/scala/com/twitter/interaction_graph/README.md +++ /dev/null @@ -1,19 +0,0 @@ -## Real Graph (bqe) - -This project builds a machine learning model using a gradient boosting tree classifier to predict the likelihood of a Twitter user interacting with another user. - -The algorithm works by first creating a labeled dataset of user interactions from a graph of Twitter users. 
This graph is represented in a BigQuery table where each row represents a directed edge between two users, along with various features such as the number of tweets, follows, favorites, and other metrics related to user behavior. - -To create the labeled dataset, the algorithm first selects a set of candidate interactions by identifying all edges that were active during a certain time period. It then joins this candidate set with a set of labeled interactions that occurred one day after the candidate period. Positive interactions are labeled as "1" and negative interactions are labeled as "0". The resulting labeled dataset is then used to train a boosted tree classifier model. - -The model is trained using the labeled dataset and various hyperparameters, including the maximum number of iterations and the subsample rate. The algorithm splits the labeled dataset into training and testing sets based on the source user's ID, using a custom data split method. - -Once the model is trained, it can be used to generate a score estimating the probability of a user interacting with another user. - -## Real Graph (scio) - -This project aggregates the number of interactions between pairs of users on Twitter. On a daily basis, there are multiple dataflow jobs that perform this aggregation, which includes public engagements like favorites, retweets, follows, etc. as well as private engagements like profile views, tweet clicks, and whether or not a user has another user in their address book (given that the user has opted in to share their address book). - -After the daily aggregation of interactions, there is a rollup job that aggregates yesterday's aggregation with today's interactions. The rollup job outputs several results, including the daily count of interactions per interaction type between a pair of users, the daily incoming interactions made on a user per interaction type, the rollup aggregation of interactions as a decayed sum between a pair of users, and the rollup aggregation of incoming interactions made on a user. - -Finally, the rollup job outputs the ML predicted interaction score between the pair of users alongside the rollup aggregation of interactions as a decayed sum between them (a generic sketch of this decayed-sum update appears after the scoring steps below). diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx new file mode 100644 index 000000000..ed5558f1c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md deleted file mode 100644 index 0e435feb8..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Scoring - -This folder contains the sql files that we'll use for scoring the real graph edges in BQ. We have 4 steps that take place: -- check to make sure that our models are in place. the feature importance query should return 20 rows in total: 10 rows per model, 1 for each feature. -- follow graph feature generation. this is to ensure that we have features for all users regardless of whether they have had any recent activity. -- candidate generation. this query combines the candidates from the follow graph and the activity graph, and the features from both. -- scoring. this query scores with 2 of our prod models and saves the scores to a table, with an additional field that indicates whether an edge is in- or out-of-network.
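The decayed-sum rollup mentioned in the Real Graph (scio) section above has a standard shape; a generic sketch follows (the actual decay constant and update cadence are not visible in this diff, so both values are illustrative assumptions):

```scala
object DecayedSumExample {
  // Roll the previous decayed aggregate together with today's interaction count
  // for one (source, destination) edge. `lambda` is an assumed decay constant.
  def decayedSum(
    previousSum: Double, // rolled-up aggregate as of the last run
    todayCount: Double, // today's raw interaction count for the edge
    daysElapsed: Int = 1, // days since the previous rollup
    lambda: Double = 0.05 // illustrative decay parameter
  ): Double =
    todayCount + previousSum * math.exp(-lambda * daysElapsed)
}
```

Applied daily, an update of this shape lets recent interactions dominate the edge score while stale edges decay toward zero.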
- -## Instructions - -For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora. - -``` -zip -jr real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring && \ -packer add_version --cluster=atla cassowary real_graph_scoring real_graph_scoring.zip -aurora cron schedule atla/cassowary/prod/real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.aurora && \ -aurora cron start atla/cassowary/prod/real_graph_scoring -``` - -# candidates.sql - -This BigQuery (BQ) query does the following: - -1. Declares two variables, date_start and date_end, which are both of type DATE. -2. Sets the date_end variable to the maximum partition ID of the interaction_graph_labels_daily table, using the PARSE_DATE() function to convert the partition ID to a date format. -3. Sets the date_start variable to 30 days prior to the date_end variable, using the DATE_SUB() function. -4. Creates a new table called candidates in the realgraph dataset, partitioned by ds. -5. The query uses four common table expressions (T1, T2, T3, and T4) to join data from two tables (interaction_graph_labels_daily and tweeting_follows) to generate a table containing candidate information and features. -6. The table T3 is the result of a full outer join between T1 and T2, grouping by source_id and destination_id, and aggregating values such as num_tweets, label_types, and the counts of different types of labels (e.g. num_follows, num_favorites, etc.). -7. The T4 table ranks each source_id by num_days and num_tweets, and selects the top 2000 rows for each source_id. -8. Finally, the query selects all columns from the T4 table and appends the date_end variable as a new column named ds. - -Overall, the query generates a table of candidates and their associated features for a particular date range, using data from two tables in the twttr-bq-cassowary-prod and twttr-recos-ml-prod datasets. - -# follow_graph_features.sql - -This BigQuery script creates a table twttr-recos-ml-prod.realgraph.tweeting_follows that includes features for Twitter user interactions, specifically tweet counts and follows. - -First, it sets two variables date_latest_tweet and date_latest_follows to the most recent dates available in two separate tables: twttr-bq-tweetsource-pub-prod.user.public_tweets and twttr-recos-ml-prod.user_events.valid_user_follows, respectively. - -Then, it creates the tweet_count and all_follows CTEs. - -The tweet_count CTE counts the number of tweets made by each user within the last 3 days prior to date_latest_tweet. - -The all_follows CTE retrieves all the follows from the valid_user_follows table that happened on date_latest_follows and left joins it with the tweet_count CTE. It also adds a row number that partitions by the source user ID and orders by the number of tweets in descending order. The final output is filtered to keep only the top 2000 follows per user based on the row number. - -The final SELECT statement combines the all_follows CTE with the date_latest_tweet variable and inserts the results into the twttr-recos-ml-prod.realgraph.tweeting_follows table partitioned by date. - -# scoring.sql - -This BQ code performs operations on a BigQuery table called twttr-recos-ml-prod.realgraph.scores.
Here is a step-by-step breakdown of what the code does: - -Declare two variables, date_end and date_latest_follows, and set their values based on the latest partitions in the twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS and twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS tables that correspond to specific tables, respectively. The PARSE_DATE() function is used to convert the partition IDs to date format. - -Delete rows from the twttr-recos-ml-prod.realgraph.scores table where the value of the ds column is equal to date_end. - -Insert rows into the twttr-recos-ml-prod.realgraph.scores table based on a query that generates predicted scores for pairs of user IDs using two machine learning models. Specifically, the query uses the ML.PREDICT() function to apply two machine learning models (twttr-recos-ml-prod.realgraph.prod and twttr-recos-ml-prod.realgraph.prod_explicit) to the twttr-recos-ml-prod.realgraph.candidates table. The resulting predicted scores are joined with the twttr-recos-ml-prod.realgraph.tweeting_follows table, which contains information about the number of tweets made by users and their follow relationships, using a full outer join. The final result includes columns for the source ID, destination ID, predicted score (prob), explicit predicted score (prob_explicit), a binary variable indicating whether the destination ID is followed by the source ID (followed), and the value of date_end for the ds column. If there is no match in the predicted_scores table for a given pair of user IDs, the COALESCE() function is used to return the corresponding values from the tweeting_follows table, with default values of 0.0 for the predicted scores. - diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx new file mode 100644 index 000000000..5b9364ab7 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql deleted file mode 100644 index 89bd30d38..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql +++ /dev/null @@ -1,42 +0,0 @@ -DECLARE date_start, date_end DATE; -SET date_end = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily" -); -SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY); - --- all candidates and their features -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates` -PARTITION BY ds -AS -WITH T1 AS ( - SELECT source_id, destination_id, label, dateHour - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - LEFT JOIN UNNEST(labels) AS label - WHERE DATE(dateHour) BETWEEN date_start AND date_end -), T2 AS ( - SELECT source_id, destination_id, num_tweets - FROM `twttr-recos-ml-prod.realgraph.tweeting_follows` -), T3 AS ( -SELECT -COALESCE(T1.source_id, T2.source_id) AS source_id, -COALESCE(T1.destination_id, T2.destination_id) AS destination_id, -COUNT(DISTINCT(T1.dateHour)) AS num_days, -MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same -COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction, -COUNT(DISTINCT(label)) AS label_types, 
-COUNTIF(label="num_follows") AS num_follows, -COUNTIF(label="num_favorites") AS num_favorites, -COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks, -COUNTIF(label="num_profile_views") AS num_profile_views, -FROM T1 -FULL JOIN T2 -USING (source_id, destination_id) -GROUP BY 1,2 -ORDER BY 3 DESC,4 DESC -), T4 AS ( - SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, * - FROM T3 -) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000 - diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx new file mode 100644 index 000000000..4970b73d0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql deleted file mode 100644 index 6baecc2ed..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql +++ /dev/null @@ -1,5 +0,0 @@ -(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod`) -ORDER BY importance_gain DESC) -UNION ALL -(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`) -ORDER BY importance_gain DESC) diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx new file mode 100644 index 000000000..a0e1cb10c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql deleted file mode 100644 index ace7e2f36..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql +++ /dev/null @@ -1,28 +0,0 @@ -DECLARE date_latest_tweet, date_latest_follows DATE; -SET date_latest_tweet = ( - SELECT PARSE_DATE('%Y%m%d', SUBSTRING(MAX(partition_id), 1, 8)) AS partition_id - FROM `twttr-bq-tweetsource-pub-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="public_tweets"); -SET date_latest_follows = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows"); - --- tweet count candidate features -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.tweeting_follows` -PARTITION BY ds -AS -WITH tweet_count AS ( - SELECT userId, COUNT(userId) AS num_tweets - FROM `twttr-bq-tweetsource-pub-prod.user.public_tweets` - WHERE DATE(ts) BETWEEN DATE_SUB(date_latest_tweet, INTERVAL 3 DAY) AND date_latest_tweet - GROUP BY 1 -), all_follows AS ( - SELECT F.sourceId AS source_id, F.destinationId AS destination_id, COALESCE(T.num_tweets,0) AS num_tweets, - ROW_NUMBER() OVER (PARTITION BY F.sourceId ORDER BY T.num_tweets DESC) AS rn - FROM `twttr-recos-ml-prod.user_events.valid_user_follows` F - LEFT JOIN tweet_count T - ON F.destinationId=T.userId - WHERE DATE(F._PARTITIONTIME) = date_latest_follows -) SELECT *, date_latest_tweet AS ds FROM all_follows WHERE rn <= 2000 -; diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx 
b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx new file mode 100644 index 000000000..9a148c116 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql deleted file mode 100644 index 5694c0988..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql +++ /dev/null @@ -1,52 +0,0 @@ -DECLARE date_end, date_latest_follows DATE; -SET date_end = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily" -); -SET date_latest_follows = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows"); - -DELETE -FROM `twttr-recos-ml-prod.realgraph.scores` -WHERE ds = date_end; - --- score candidates (59m) -INSERT INTO `twttr-recos-ml-prod.realgraph.scores` -WITH predicted_scores AS ( - SELECT - source_id, - destination_id, - p1.prob AS prob, - p2.prob AS prob_explicit - FROM ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod`, - ( - SELECT - * - FROM - `twttr-recos-ml-prod.realgraph.candidates` ) ) S1 - CROSS JOIN UNNEST(S1.predicted_label_probs) AS p1 - JOIN ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`, - ( - SELECT - * - FROM - `twttr-recos-ml-prod.realgraph.candidates` ) ) S2 - USING (source_id, destination_id) - CROSS JOIN UNNEST(S2.predicted_label_probs) AS p2 - WHERE p1.label=1 AND p2.label=1 -) -SELECT - COALESCE(predicted_scores.source_id, tweeting_follows.source_id) AS source_id, - COALESCE(predicted_scores.destination_id, tweeting_follows.destination_id) AS destination_id, - COALESCE(prob, 0.0) AS prob, - COALESCE(prob_explicit, 0.0) AS prob_explicit, - (tweeting_follows.source_id IS NOT NULL) AND (tweeting_follows.destination_id IS NOT NULL) AS followed, - date_end AS ds -FROM - predicted_scores - FULL JOIN - `twttr-recos-ml-prod.realgraph.tweeting_follows` tweeting_follows - USING (source_id, destination_id) diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/README.docx b/src/scala/com/twitter/interaction_graph/bqe/training/README.docx new file mode 100644 index 000000000..10ba2fed0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/README.md b/src/scala/com/twitter/interaction_graph/bqe/training/README.md deleted file mode 100644 index 17e94e7f5..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# Training - -This folder contains the sql files that we'll use for training the prod real graph models: -- prod (predicts any interactions the next day) -- prod_explicit (predicts any explicit interactions the next day) - -We have 3 steps that take place: -- candidate generation + feature hydration. this query samples 1% of edges from the `twttr-recos-ml-prod.realgraph.candidates` table which is already produced daily and saves it to `twttr-recos-ml-prod.realgraph.candidates_sampled`. 
we save each day's data according to the statebird batch run date and hence require checks to make sure that the data exists to begin with. -- label candidates. we join day T's candidates with day T+1's labels while filtering out any negative interactions to get our labeled dataset. we append an additional day's worth of segments for each day. we finally generate the training dataset which uses all days' labeled data for training, performing negative downsampling to get a roughly 50-50 split of positive to negative labels. -- training. we use bqml for training our xgboost models. - -## Instructions - -For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora. - -``` -zip -jr real_graph_training src/scala/com/twitter/interaction_graph/bqe/training && \ -packer add_version --cluster=atla cassowary real_graph_training real_graph_training.zip -aurora cron schedule atla/cassowary/prod/real_graph_training src/scala/com/twitter/interaction_graph/bqe/training/training.aurora && \ -aurora cron start atla/cassowary/prod/real_graph_training -``` - -# candidates.sql - -1. Sets the value of the variable date_candidates to the date of the latest partition of the candidates_for_training table. -2. Creates a new table candidates_sampled if it does not exist already, which will contain a sample of 100 rows from the candidates_for_training table. -3. Deletes any existing rows from the candidates_sampled table where the ds column matches the date_candidates value, to avoid double-writing. -4. Inserts a sample of rows into the candidates_sampled table from the candidates_for_training table, where the modulo of the absolute value of the FARM_FINGERPRINT of the concatenation of source_id and destination_id is equal to the value of the $mod_remainder$ variable, and where the ds column matches the date_candidates value. - -# check_candidates_exist.sql - -This BigQuery script prepares a table of candidates for training a machine learning model. It does the following: - -1. Declares two variables date_start and date_end that are 30 days apart, and date_end is set to the value of the $start_time$ parameter (which is a Unix timestamp). -2. Creates a table candidates_for_training that is partitioned by ds (date) and populated with data from several other tables in the database. It joins information from tables of user interactions, tweeting, and interaction graph aggregates, filters out negative edge snapshots, calculates some statistics and aggregates them by source_id and destination_id. Then, it ranks each source_id by the number of days and tweets, selects the top 2000, and adds date_end as a new column ds. -3. Finally, it selects the ds column from candidates_for_training where ds equals date_end. - -Overall, this script prepares a table of up to 2000 candidate pairs per source user, with statistics and labels, which can be used to train a machine learning model for recommendation purposes. - -# labeled_candidates.sql - -This BQ script does the following: - -1. Defines two variables date_candidates and date_labels as dates based on the $start_time$ parameter. -2. Creates a new table twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ with default values. -3. Deletes any prior data in the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table for the current date_candidates. -4.
Joins the twttr-recos-ml-prod.realgraph.candidates_sampled table with the twttr-bq-cassowary-prod.user.interaction_graph_labels_daily table and the twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot table. It assigns a label of 1 for positive interactions and 0 for negative interactions, and selects only the rows where there is no negative interaction. -5. Inserts the joined data into the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table. -6. Calculates the positive rate by counting the number of positive labels and dividing it by the total number of labels. -7. Creates a new table twttr-recos-ml-prod.realgraph.train$table_suffix$ by sampling from the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table, with a downsampling of negative examples to balance the number of positive and negative examples, based on the positive rate calculated in step 6. - -The resulting twttr-recos-ml-prod.realgraph.train$table_suffix$ table is used as a training dataset for a machine learning model. - -# train_model.sql - -This BQ command creates or replaces a machine learning model called twttr-recos-ml-prod.realgraph.prod$table_suffix$. The model is a boosted tree classifier, which is used for binary classification problems. - -The options provided in the command configure the specific settings for the model, such as the number of parallel trees, the maximum number of iterations, and the data split method. The DATA_SPLIT_METHOD parameter is set to CUSTOM, and DATA_SPLIT_COL is set to if_eval, which means the data will be split into training and evaluation sets based on the if_eval column. The IF function is used to assign a boolean value of true or false to if_eval based on the modulo operation performed on source_id. - -The SELECT statement specifies the input data for the model. The columns selected include label (the target variable to be predicted), as well as various features such as num_days, num_tweets, and num_follows that are used to predict the target variable. 
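To make the custom split concrete, here is a small sketch of the same idea outside BigQuery (MurmurHash3 stands in for FARM_FINGERPRINT, which has no direct Scala equivalent; the 10% eval fraction mirrors the MOD(..., 10) = 0 check in the query):

```scala
import scala.util.hashing.MurmurHash3

object SplitExample {
  // Deterministic train/eval assignment keyed on source_id, mirroring
  // IF(MOD(ABS(FARM_FINGERPRINT(CAST(source_id AS STRING))), 10) = 0, true, false).
  def isEval(sourceId: Long): Boolean = {
    val hash = MurmurHash3.stringHash(sourceId.toString) & Int.MaxValue // keep non-negative
    hash % 10 == 0 // ~10% of source_ids form the eval set
  }
}
```

Because the assignment is a function of source_id alone, every candidate edge from a given user lands on the same side of the split, which prevents one user's behaviour from leaking between the train and eval sets.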
\ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx new file mode 100644 index 000000000..6b4c42926 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql deleted file mode 100644 index 8c47b8184..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql +++ /dev/null @@ -1,18 +0,0 @@ --- get latest partition of candidates with data -DECLARE date_candidates DATE; -SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); - -CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.candidates_sampled` AS -SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` LIMIT 100; - --- remove previous output snapshot (if exists) to avoid double-writing -DELETE -FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` -WHERE ds = date_candidates; - --- sample from candidates table instead of recomputing features -INSERT INTO `twttr-recos-ml-prod.realgraph.candidates_sampled` -SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` -WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(source_id, '_', destination_id))), 100) = $mod_remainder$ -AND ds = date_candidates; - diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx new file mode 100644 index 000000000..fd4412aad Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql deleted file mode 100644 index 5cb380b4f..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql +++ /dev/null @@ -1,43 +0,0 @@ -DECLARE date_start, date_end DATE; -SET date_end = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); -SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY); - -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates_for_training` -PARTITION BY ds -AS -WITH T1 AS ( - SELECT source_id, destination_id, label, dateHour - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - LEFT JOIN UNNEST(labels) AS label - WHERE DATE(dateHour) BETWEEN date_start AND date_end -), T2 AS ( - SELECT source_id, destination_id, num_tweets - FROM `twttr-recos-ml-prod.realgraph.tweeting_follows` -), T3 AS ( -SELECT -COALESCE(T1.source_id, T2.source_id) AS source_id, -COALESCE(T1.destination_id, T2.destination_id) AS destination_id, -COUNT(DISTINCT(T1.dateHour)) AS num_days, -MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same -COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction, -COUNT(DISTINCT(label)) AS label_types, -COUNTIF(label="num_follows") AS num_follows, -COUNTIF(label="num_favorites") AS num_favorites, -COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks, -COUNTIF(label="num_profile_views") AS num_profile_views, -FROM T1 -FULL JOIN T2 -USING (source_id, destination_id) -LEFT JOIN `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` N -USING (source_id, destination_id) -WHERE N.source_id 
IS NULL AND N.destination_id IS NULL -GROUP BY 1,2 -ORDER BY 3 DESC,4 DESC -), T4 AS ( - SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, * - FROM T3 -) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000; - -SELECT ds FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` -WHERE ds = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))) -LIMIT 1 diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx new file mode 100644 index 000000000..e816d3e90 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql deleted file mode 100644 index 20a372b4a..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT dateHour FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` -WHERE dateHour = (SELECT TIMESTAMP_ADD(TIMESTAMP_MILLIS($start_time$), INTERVAL 1 DAY)) -LIMIT 1 - diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx new file mode 100644 index 000000000..c636b043c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql deleted file mode 100644 index 4230ee5c5..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql +++ /dev/null @@ -1,67 +0,0 @@ --- date_labels is 1 day after date_candidates (which is the current batch run's start date) -DECLARE date_candidates, date_labels DATE; -DECLARE positive_rate FLOAT64; -SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); -SET date_labels = DATE_ADD(date_candidates, INTERVAL 1 DAY); - -CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` AS -SELECT - 0 AS source_id, - 1 AS destination_id, - 1 AS label, - 1 AS num_days, - 1 AS num_tweets, - 1 AS num_follows, - 1 AS num_favorites, - 1 AS num_tweet_clicks, - 1 AS num_profile_views, - 1 AS days_since_last_interaction, - 1 AS label_types, - DATE("2023-01-08") AS ds; - --- delete any prior data to avoid double writing -DELETE -FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WHERE ds = date_candidates; - --- join labels with candidates with 1 day attribution delay and insert new segment -INSERT INTO `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WITH label_positive AS ( - SELECT source_id, destination_id - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - WHERE DATE(dateHour)=date_labels -), label_negative AS ( - SELECT source_id, destination_id - FROM `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` -) SELECT - F.source_id, - F.destination_id, - CASE WHEN P.source_id IS NULL THEN 0 ELSE 1 END AS label, - num_days, - num_tweets, - num_follows, - num_favorites, - num_tweet_clicks, - num_profile_views, - days_since_last_interaction, - label_types, - date_candidates AS ds -FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` F -LEFT JOIN 
label_positive P USING(source_id, destination_id) -LEFT JOIN label_negative N USING(source_id, destination_id) -WHERE N.source_id IS NULL AND N.destination_id IS NULL -AND F.ds=date_candidates -; - --- get positive rate -SET positive_rate = -(SELECT SUM(label)/COUNT(label) AS pct_positive -FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -); - --- create training dataset with negative downsampling (should get ~50-50 split) --- this spans over the cumulative date range of the labeled candidates table. -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.train$table_suffix$` AS -SELECT * FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WHERE CASE WHEN label = 0 AND RAND() < positive_rate THEN true WHEN label = 1 AND RAND() < (1-positive_rate) THEN true ELSE false END -; diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx new file mode 100644 index 000000000..4da040141 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql deleted file mode 100644 index c7a5df501..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql +++ /dev/null @@ -1,27 +0,0 @@ -CREATE OR REPLACE MODEL `twttr-recos-ml-prod.realgraph.prod$table_suffix$` -OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER', - BOOSTER_TYPE = 'GBTREE', - NUM_PARALLEL_TREE = 1, - MAX_ITERATIONS = 20, - TREE_METHOD = 'HIST', - EARLY_STOP = TRUE, - SUBSAMPLE = 0.01, - INPUT_LABEL_COLS = ['label'], - DATA_SPLIT_METHOD = 'CUSTOM', - DATA_SPLIT_COL = 'if_eval') -AS SELECT - label, - source_id, - destination_id, - num_days, - num_tweets, - num_follows, - num_favorites, - num_tweet_clicks, - num_profile_views, - days_since_last_interaction, - label_types, - -- partition train/test by source_id's - IF(MOD(ABS(FARM_FINGERPRINT(CAST(source_id AS STRING))), 10) = 0, true, false) AS if_eval, -FROM `twttr-recos-ml-prod.realgraph.train$table_suffix$` -; diff --git a/src/scala/com/twitter/interaction_graph/injection/BUILD b/src/scala/com/twitter/interaction_graph/injection/BUILD deleted file mode 100644 index 3e9d55ccf..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -scala_library( - name = "user_session_inj", - sources = ["UserSessionInjection.scala"], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/bijection:scrooge", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/thrift/com/twitter/user_session_store:thrift-scala", - ], -) - -scala_library( - name = "edge_list_injection", - sources = ["EdgeListInjection.scala"], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/bijection:scrooge", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/injection/BUILD.docx b/src/scala/com/twitter/interaction_graph/injection/BUILD.docx new file mode 100644 index 000000000..3a4ab2c90 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/BUILD.docx differ diff --git 
a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx new file mode 100644 index 000000000..1970d5d0f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx differ diff --git a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala deleted file mode 100644 index c03ad097c..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.injection - -import com.twitter.interaction_graph.thriftscala.EdgeList -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift - -object EdgeListInjection { - final val injection: KeyValInjection[Long, EdgeList] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(EdgeList) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx new file mode 100644 index 000000000..9fd0a4d8d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx differ diff --git a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala deleted file mode 100644 index f6c84e184..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.injection - -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian - -object UserSessionInjection { - final val injection: KeyValInjection[Long, UserSession] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(UserSession) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/README.docx b/src/scala/com/twitter/interaction_graph/scio/README.docx new file mode 100644 index 000000000..7de6543b4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/README.md b/src/scala/com/twitter/interaction_graph/scio/README.md deleted file mode 100644 index c7ef6d713..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Interaction Graph - -This folder contains the code used in the offline pipeline for real graph v2. - -The ETL jobs are contained in folders prefaced with `agg_*`, while the jobs powering the ml pipeline are in the ml folder. - -Note that the jobs in the ml folder are mostly ETL jobs; the main training and scoring happens within BQML. 
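To give a flavour of what the `agg_*` ETL jobs compute, here is a toy Scio sketch (the event type and in-memory input are invented for illustration; the real jobs read thrift sources via DAL and write Edge/Vertex snapshot datasets):

```scala
import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection

object AggregationExample {
  // Toy interaction event; the real jobs consume thrift records, not a case class.
  final case class Interaction(sourceId: Long, destinationId: Long, kind: String)

  // Daily aggregation shape: count interactions per directed edge and type.
  def aggregateDaily(
    sc: ScioContext,
    events: Seq[Interaction]
  ): SCollection[((Long, Long, String), Long)] =
    sc.parallelize(events)
      .map(i => (i.sourceId, i.destinationId, i.kind))
      .countByValue
}
```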
diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD deleted file mode 100644 index 3f7e0491e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD +++ /dev/null @@ -1,62 +0,0 @@ -scala_library( - name = "agg_address_book", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_address_book_edge_snapshot-scala", - ":interaction_graph_agg_address_book_vertex_snapshot-scala", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "addressbook/jobs/src/main/scala/com/twitter/addressbook/jobs/simplematches:simple_user_matches-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - ], -) - -jvm_binary( - name = "interaction_graph_address_book_scio", - main = "com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAddressBookJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_address_book", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_address_book_edge_snapshot", - description = "User-user directed edges with addressbook features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_address_book_vertex_snapshot", - description = "User vertex with addressbook features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx new file mode 100644 index 000000000..94ce50873 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx new file mode 100644 index 000000000..495967902 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala deleted file mode 100644 index 0d57c4cae..000000000 --- 
a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioMetrics -import org.apache.beam.sdk.metrics.Counter - -trait InteractionGraphAddressBookCountersTrait { - val Namespace = "Interaction Graph Address Book" - - def emailFeatureInc(): Unit - - def phoneFeatureInc(): Unit - - def bothFeatureInc(): Unit -} - -/** - * SCIO counters are used to gather run time statistics - */ -case object InteractionGraphAddressBookCounters extends InteractionGraphAddressBookCountersTrait { - val emailFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Email Feature") - - val phoneFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Phone Feature") - - val bothFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Both Feature") - - override def emailFeatureInc(): Unit = emailFeatureCounter.inc() - - override def phoneFeatureInc(): Unit = phoneFeatureCounter.inc() - - override def bothFeatureInc(): Unit = bothFeatureCounter.inc() -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx new file mode 100644 index 000000000..3cc523b66 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala deleted file mode 100644 index 360b52cee..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import java.time.Instant -import org.joda.time.Interval - -object InteractionGraphAddressBookJob extends ScioBeamJob[InteractionGraphAddressBookOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAddressBookOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - implicit lazy val addressBookCounters: InteractionGraphAddressBookCountersTrait = - InteractionGraphAddressBookCounters - - val interactionGraphAddressBookSource = InteractionGraphAddressBookSource(pipelineOptions) - - val addressBook: SCollection[UserMatchesRecord] = - interactionGraphAddressBookSource.readSimpleUserMatches( - dateInterval.withStart(dateInterval.getStart.minusDays(3)) - ) - val (vertex, edges) = InteractionGraphAddressBookUtil.process(addressBook) - - val dalEnvironment: String = pipelineOptions - 
.as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.writeSnapshot[Vertex]( - InteractionGraphAggAddressBookVertexSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_vertex_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 16.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.writeSnapshot[Edge]( - InteractionGraphAggAddressBookEdgeSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_edge_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx new file mode 100644 index 000000000..f44fc3f62 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala deleted file mode 100644 index b5c34e94c..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAddressBookOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx new file mode 100644 index 000000000..3d597ff65 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala deleted file mode 100644 index 66e3903bc..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.jobs.simplematches.SimpleUserMatchesScalaDataset -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.cde.scio.dal_read.SourceUtil -import org.joda.time.Interval - -case class InteractionGraphAddressBookSource( - pipelineOptions: InteractionGraphAddressBookOption -)( - implicit sc: ScioContext, -) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readSimpleUserMatches( - dateInterval: Interval - ): SCollection[UserMatchesRecord] = { - SourceUtil.readMostRecentSnapshotDALDataset[UserMatchesRecord]( - SimpleUserMatchesScalaDataset, - dateInterval, - dalEnvironment) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx new file mode 100644 index 000000000..5e8c08be9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala deleted file mode 100644 index fc5898ce0..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex - -object InteractionGraphAddressBookUtil { - val EMAIL = "email" - val PHONE = "phone" - val BOTH = "both" - - val DefaultAge = 1 - val DefaultFeatureValue = 1.0 - - def process( - 
addressBook: SCollection[UserMatchesRecord] - )( - implicit addressBookCounters: InteractionGraphAddressBookCountersTrait - ): (SCollection[Vertex], SCollection[Edge]) = { - // First construct a dataset of ((src, dst), name) pairs, where name can be "email", "phone", or "both" - val addressBookTypes: SCollection[((Long, Long), String)] = addressBook.flatMap { record => - record.forwardMatches.toSeq.flatMap { matchDetails => - val matchedUsers = (record.userId, matchDetails.userId) - (matchDetails.matchedByEmail, matchDetails.matchedByPhone) match { - case (true, true) => - Seq((matchedUsers, EMAIL), (matchedUsers, PHONE), (matchedUsers, BOTH)) - case (true, false) => Seq((matchedUsers, EMAIL)) - case (false, true) => Seq((matchedUsers, PHONE)) - case _ => Seq.empty - } - } - } - - // Then construct the input data for feature calculation - val addressBookFeatureInput: SCollection[InteractionGraphRawInput] = addressBookTypes - .map { - case ((src, dst), name) => - if (src < dst) - ((src, dst, name), false) - else - ((dst, src, name), true) - }.groupByKey - .flatMap { - case ((src, dst, name), iterator) => - val isReversedValues = iterator.toSeq - // the match is mutual iff both (src, dst) and (dst, src) were present - val isMutualFollow = isReversedValues.size == 2 - // restore the original (srcId, dstId) order if the pair is not mutual and was reversed - val (srcId, dstId) = { - if (!isMutualFollow && isReversedValues.head) - (dst, src) - else - (src, dst) - } - // get the feature name and the mutual-match feature name - val (featureName, mfFeatureName) = name match { - case EMAIL => - addressBookCounters.emailFeatureInc() - (FeatureName.AddressBookEmail, FeatureName.AddressBookMutualEdgeEmail) - case PHONE => - addressBookCounters.phoneFeatureInc() - (FeatureName.AddressBookPhone, FeatureName.AddressBookMutualEdgePhone) - case BOTH => - addressBookCounters.bothFeatureInc() - (FeatureName.AddressBookInBoth, FeatureName.AddressBookMutualEdgeInBoth) - } - // construct the raw inputs for feature calculation - if (isMutualFollow) { - Iterator( - InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue), - InteractionGraphRawInput(dstId, srcId, featureName, DefaultAge, DefaultFeatureValue), - InteractionGraphRawInput( - srcId, - dstId, - mfFeatureName, - DefaultAge, - DefaultFeatureValue), - InteractionGraphRawInput(dstId, srcId, mfFeatureName, DefaultAge, DefaultFeatureValue) - ) - } else { - Iterator( - InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue)) - } - } - - // Calculate the features - FeatureGeneratorUtil.getFeatures(addressBookFeatureInput) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx new file mode 100644 index 000000000..9c80b4658 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md deleted file mode 100644 index 4d895c71d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAddressBook Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Compile -``` -./bazel build 
src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-address-book-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_address_book/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_address_book_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD deleted file mode 100644 index 61dc35906..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD +++ /dev/null @@ -1,175 +0,0 @@ -scala_library( - name = "agg_all", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_history_aggregated_raw_edge_daily-scala", - ":interaction_graph_history_aggregated_vertex_daily-scala", - ":interaction_graph_aggregated_edge_daily-scala", - ":interaction_graph_aggregated_vertex_daily-scala", - ":interaction_graph_history_aggregated_edge_snapshot-scala", - ":interaction_graph_history_aggregated_vertex_snapshot-scala", - ":real_graph_features-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_edge_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_vertex_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_vertex_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_vertex_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_edge_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_vertex_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_in_scores-scala", - "src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_oon_scores-scala", - "src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala", - "src/thrift/com/twitter/wtf/candidate:wtf-candidate-scala", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - ], -) - 
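-# Each create_datasets target below generates a Scala dataset binding named -# after its CamelCased base_name plus "ScalaDataset" (e.g. -# interaction_graph_aggregated_edge_daily becomes -# InteractionGraphAggregatedEdgeDailyScalaDataset; names inferred from usage in -# the job code in this diff), which the Scio jobs read and write through DAL. 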
-create_datasets( - base_name = "interaction_graph_history_aggregated_raw_edge_daily", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_vertex_daily", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -jvm_binary( - name = "interaction_graph_aggregation_job_scio", - main = "com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_all", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_edge_snapshot", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_vertex_snapshot", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_aggregated_edge_daily", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_aggregated_vertex_daily", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = 
[ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "real_graph_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.user_session_store.thriftscala.UserSession", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:user_session_inj", - ], -) - -create_datasets( - base_name = "home_light_ranker_top_k_real_graph_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.EdgeListInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.interaction_graph.thriftscala.EdgeList", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:edge_list_injection", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx new file mode 100644 index 000000000..b94077877 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx new file mode 100644 index 000000000..121702e9f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala deleted file mode 100644 index 2f9b0da57..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -object InteractionGraphScoringConfig { - - /** - * This is alpha for a variant of the Exponentially weighted moving average, computed as: - * ewma_{t+1} = x_{t+1} + (1-alpha) * ewma_t (ewma_1 = x_1, t > 0) - * We choose alpha such that the half life of weights is 7 days. - * Note that we don't down-weight x_{t+1} (unlike in EWMA) as we only want to decay actions - * as they grow old, not compute the average value. 
- */ - val ALPHA = 1.0 - val ONE_MINUS_ALPHA = 0.955 -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx new file mode 100644 index 000000000..6bde8f011 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala deleted file mode 100644 index 06942205d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala +++ /dev/null @@ -1,314 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.google.cloud.bigquery.BigQueryOptions -import com.google.cloud.bigquery.QueryJobConfiguration -import com.spotify.scio.ScioContext -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.io.exception.DataNotFoundException -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationTransform._ -import com.twitter.interaction_graph.scio.common.DateUtil -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.util.Duration -import com.twitter.wtf.candidate.thriftscala.ScoredEdge -import java.time.Instant -import org.apache.avro.generic.GenericRecord -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead -import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord -import org.apache.beam.sdk.transforms.SerializableFunction -import org.joda.time.Interval -import scala.collection.JavaConverters._ - -object InteractionGraphAggregationJob extends ScioBeamJob[InteractionGraphAggregationOption] { - - // to parse latest date from the BQ table we're reading from - val parseDateRow = new SerializableFunction[SchemaAndRecord, String] { - override def apply(input: SchemaAndRecord): String = { - val genericRecord: GenericRecord = input.getRecord() - genericRecord.get("ds").toString - } - } - - // note that we're using the prob_explicit for real_graph_features (for Home) - val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] { - override def apply(record: SchemaAndRecord): ScoredEdge = { - val genericRecord: GenericRecord = record.getRecord() - ScoredEdge( - genericRecord.get("source_id").asInstanceOf[Long], - genericRecord.get("destination_id").asInstanceOf[Long], - genericRecord.get("prob_explicit").asInstanceOf[Double], - genericRecord.get("followed").asInstanceOf[Boolean], - ) - } - } - - override def runPipeline( - sc: ScioContext, - opts: InteractionGraphAggregationOption - ): Unit = { - - val dateStr: String = 
opts.getDate().value.getStart.toString("yyyyMMdd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - if (opts.getDALWriteEnvironment.toLowerCase == "prod") { - val bqClient = - BigQueryOptions.newBuilder.setProjectId(project).build.getService - val query = - s""" - |SELECT total_rows - |FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS` - |WHERE partition_id ="$dateStr" AND - |table_name="$bqTableName" AND total_rows > 0 - |""".stripMargin - val queryConfig = QueryJobConfiguration.of(query) - val results = bqClient.query(queryConfig).getValues.asScala.toSeq - if (results.isEmpty || results.head.get(0).getLongValue == 0) { - throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.") - } - } - sc.run() - } - - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAggregationOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - val yesterday = DateUtil.subtract(dateInterval, Duration.fromDays(1)) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - val dateStr: String = pipelineOptions.getDate().value.getStart.toString("yyyy-MM-dd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - val scoreExport: SCollection[ScoredEdge] = - sc.customInput( - s"Read from BQ table $fullBqTableName", - BigQueryIO - .read(parseRow) - .fromQuery(s"""SELECT source_id, destination_id, prob_explicit, followed - |FROM `$project.$datasetName.$bqTableName` - |WHERE ds = '$dateStr'""".stripMargin) - .usingStandardSql() - .withMethod(TypedRead.Method.DEFAULT) - ) - - val source = InteractionGraphAggregationSource(pipelineOptions) - - val (addressEdgeFeatures, addressVertexFeatures) = source.readAddressBookFeatures() - - val (clientEventLogsEdgeFeatures, clientEventLogsVertexFeatures) = - source.readClientEventLogsFeatures(dateInterval) - - val (flockEdgeFeatures, flockVertexFeatures) = source.readFlockFeatures() - - val (directInteractionsEdgeFeatures, directInteractionsVertexFeatures) = - source.readDirectInteractionsFeatures(dateInterval) - - val invalidUsers = UserUtil.getInvalidUsers(source.readFlatUsers()) - - val (prevAggEdge, prevAggVertex) = source.readAggregatedFeatures(yesterday) - - val prevAggregatedVertex: SCollection[Vertex] = - UserUtil - .filterUsersByIdMapping[Vertex]( - prevAggVertex, - invalidUsers, - v => v.userId - ) - - /** Remove status-based features (flock/ab) from current graph, because we only need the latest - * This is to allow us to filter and roll-up a smaller dataset, to which we will still add - * back the status-based features for the complete scoredAggregates (that other teams will read). 
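- * Note: the address-book and flock inputs are re-read as most-recent snapshots on - * every run (see InteractionGraphAggregationSource), so only their latest values matter. 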
- */ - val prevAggEdgeFiltered = prevAggEdge - .filter { e => - e.sourceId != e.destinationId - } - .withName("filtering status-based edges") - .flatMap(FeatureGeneratorUtil.removeStatusFeatures) - val prevAggEdgeValid: SCollection[Edge] = - UserUtil - .filterUsersByMultipleIdMappings[Edge]( - prevAggEdgeFiltered, - invalidUsers, - Seq(e => e.sourceId, e => e.destinationId) - ) - - val aggregatedActivityVertexDaily = UserUtil - .filterUsersByIdMapping[Vertex]( - FeatureGeneratorUtil - .combineVertexFeatures( - clientEventLogsVertexFeatures ++ - directInteractionsVertexFeatures ++ - addressVertexFeatures ++ - flockVertexFeatures - ), - invalidUsers, - v => v.userId - ) - - // we split up the roll-up of decayed counts between status vs activity/count-based features - val aggregatedActivityEdgeDaily = FeatureGeneratorUtil - .combineEdgeFeatures(clientEventLogsEdgeFeatures ++ directInteractionsEdgeFeatures) - - // Vertex level, Add the decay sum for history and daily - val aggregatedActivityVertex = FeatureGeneratorUtil - .combineVertexFeaturesWithDecay( - prevAggregatedVertex, - aggregatedActivityVertexDaily, - InteractionGraphScoringConfig.ONE_MINUS_ALPHA, - InteractionGraphScoringConfig.ALPHA - ) - - // Edge level, Add the decay sum for history and daily - val aggregatedActivityEdge = FeatureGeneratorUtil - .combineEdgeFeaturesWithDecay( - prevAggEdgeValid, - aggregatedActivityEdgeDaily, - InteractionGraphScoringConfig.ONE_MINUS_ALPHA, - InteractionGraphScoringConfig.ALPHA - ) - .filter(FeatureGeneratorUtil.edgeWithFeatureOtherThanDwellTime) - .withName("removing edges that only have dwell time features") - - val edgeKeyedScores = scoreExport.keyBy { e => (e.sourceId, e.destinationId) } - - val scoredAggregatedActivityEdge = aggregatedActivityEdge - .keyBy { e => (e.sourceId, e.destinationId) } - .withName("join with scores") - .leftOuterJoin(edgeKeyedScores) - .map { - case (_, (e, scoredEdgeOpt)) => - val scoreOpt = scoredEdgeOpt.map(_.score) - e.copy(weight = if (scoreOpt.nonEmpty) { - ScioMetrics.counter("after joining edge with scores", "has score").inc() - scoreOpt - } else { - ScioMetrics.counter("after joining edge with scores", "no score").inc() - None - }) - } - - val combinedFeatures = FeatureGeneratorUtil - .combineEdgeFeatures(aggregatedActivityEdge ++ addressEdgeFeatures ++ flockEdgeFeatures) - .keyBy { e => (e.sourceId, e.destinationId) } - - val aggregatedActivityScoredEdge = - edgeKeyedScores - .withName("join with combined edge features") - .leftOuterJoin(combinedFeatures) - .map { - case (_, (scoredEdge, combinedFeaturesOpt)) => - if (combinedFeaturesOpt.exists(_.features.nonEmpty)) { - ScioMetrics.counter("after joining scored edge with features", "has features").inc() - Edge( - sourceId = scoredEdge.sourceId, - destinationId = scoredEdge.destinationId, - weight = Some(scoredEdge.score), - features = combinedFeaturesOpt.map(_.features).getOrElse(Nil) - ) - } else { - ScioMetrics.counter("after joining scored edge with features", "no features").inc() - Edge( - sourceId = scoredEdge.sourceId, - destinationId = scoredEdge.destinationId, - weight = Some(scoredEdge.score), - features = Nil - ) - } - } - - val realGraphFeatures = - getTopKTimelineFeatures(aggregatedActivityScoredEdge, pipelineOptions.getMaxDestinationIds) - - aggregatedActivityVertex.saveAsCustomOutput( - "Write History Aggregated Vertex Records", - DAL.writeSnapshot[Vertex]( - dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset, - pathLayout = 
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex"), - endDate = Instant.ofEpochMilli(dateInterval.getEndMillis), - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10)) - ) - ) - - scoredAggregatedActivityEdge.saveAsCustomOutput( - "Write History Aggregated Edge Records", - DAL.writeSnapshot[Edge]( - dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset, - pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_raw_edge"), - endDate = Instant.ofEpochMilli(dateInterval.getEndMillis), - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - aggregatedActivityVertexDaily.saveAsCustomOutput( - "Write Daily Aggregated Vertex Records", - DAL.write[Vertex]( - dataset = InteractionGraphAggregatedVertexDailyScalaDataset, - pathLayout = - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex_daily"), - interval = dateInterval, - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10)) - ) - ) - - aggregatedActivityEdgeDaily.saveAsCustomOutput( - "Write Daily Aggregated Edge Records", - DAL.write[Edge]( - dataset = InteractionGraphAggregatedEdgeDailyScalaDataset, - pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_edge_daily"), - interval = dateInterval, - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - realGraphFeatures.saveAsCustomOutput( - "Write Timeline Real Graph Features", - DAL.writeVersionedKeyVal[KeyVal[Long, UserSession]]( - dataset = RealGraphFeaturesScalaDataset, - pathLayout = - PathLayout.VersionedPath(pipelineOptions.getOutputPath + "/real_graph_features"), - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx new file mode 100644 index 000000000..2eac1d68c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala deleted file mode 100644 index 94e7ffae6..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggregationOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def 
setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit - - @Description("BQ Table name for reading scores from") - def getBqTableName: String - def setBqTableName(value: String): Unit - - @Description("max destination ids that we will store for real graph features in TL") - def getMaxDestinationIds: Integer - def setMaxDestinationIds(value: Integer): Unit - - @Description("true if getting scores from BQ instead of DAL-based dataset in GCS") - def getScoresFromBQ: Boolean - def setScoresFromBQ(value: Boolean): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx new file mode 100644 index 000000000..eddb208e9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala deleted file mode 100644 index b1ea8ff05..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala +++ /dev/null @@ -1,182 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.ReadOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.dal.client.dataset.SnapshotDALDatasetBase -import com.twitter.dal.client.dataset.TimePartitionedDALDataset -import com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAggAddressBookEdgeSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAggAddressBookVertexSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsVertexDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsVertexDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockEdgeSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockVertexSnapshotScalaDataset -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import com.twitter.util.Duration -import org.joda.time.Interval - -case class InteractionGraphAggregationSource( - pipelineOptions: InteractionGraphAggregationOption -)( - implicit sc: ScioContext) { - val dalEnvironment: String = pipelineOptions - 
.as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readDALDataset[T: Manifest]( - dataset: TimePartitionedDALDataset[T], - interval: Interval, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading ${dataset.role.name}.${dataset.logicalName}", - DAL.read[T]( - dataset = dataset, - interval = interval, - environmentOverride = Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readMostRecentSnapshotDALDataset[T: Manifest]( - dataset: SnapshotDALDatasetBase[T], - dateInterval: Interval, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshot[T]( - dataset, - dateInterval, - Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readMostRecentSnapshotNoOlderThanDALDataset[T: Manifest]( - dataset: SnapshotDALDatasetBase[T], - noOlderThan: Duration, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshotNoOlderThan[T]( - dataset, - noOlderThan, - environmentOverride = Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readAddressBookFeatures(): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotNoOlderThanDALDataset[Edge]( - dataset = InteractionGraphAggAddressBookEdgeSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - val vertex = readMostRecentSnapshotNoOlderThanDALDataset[Vertex]( - dataset = InteractionGraphAggAddressBookVertexSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - (edges, vertex) - } - - def readClientEventLogsFeatures( - dateInterval: Interval - ): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readDALDataset[Edge]( - dataset = InteractionGraphAggClientEventLogsEdgeDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - val vertex = readDALDataset[Vertex]( - dataset = InteractionGraphAggClientEventLogsVertexDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - (edges, vertex) - } - - def readDirectInteractionsFeatures( - dateInterval: Interval - ): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readDALDataset[Edge]( - dataset = InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - val vertex = readDALDataset[Vertex]( - dataset = InteractionGraphAggDirectInteractionsVertexDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - (edges, vertex) - } - - def readFlockFeatures(): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotNoOlderThanDALDataset[Edge]( - dataset = InteractionGraphAggFlockEdgeSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - val vertex = readMostRecentSnapshotNoOlderThanDALDataset[Vertex]( - dataset = InteractionGraphAggFlockVertexSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - 
) - - (edges, vertex) - } - - def readAggregatedFeatures(dateInterval: Interval): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotDALDataset[Edge]( - dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset, - dalEnvironment = dalEnvironment, - dateInterval = dateInterval - ) - - val vertex = readMostRecentSnapshotDALDataset[Vertex]( - dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset, - dalEnvironment = dalEnvironment, - dateInterval = dateInterval - ) - - (edges, vertex) - } - - def readFlatUsers(): SCollection[FlatUser] = - readMostRecentSnapshotNoOlderThanDALDataset[FlatUser]( - dataset = UsersourceFlatScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - projections = Some(Seq("id", "valid_user")) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx new file mode 100644 index 000000000..4e0d0678e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala deleted file mode 100644 index c76592c10..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import collection.JavaConverters._ -import com.spotify.scio.values.SCollection -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.interaction_graph.scio.common.GraphUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeatures -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeaturesTest -import com.twitter.timelines.real_graph.v1.thriftscala.{RealGraphFeatures => RealGraphFeaturesV1} -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.interaction_graph.scio.common.ConversionUtil._ - -object InteractionGraphAggregationTransform { - val ordering: Ordering[Edge] = Ordering.by(-_.weight.getOrElse(0.0)) - - // converts our Edge thrift into timelines' thrift - def getTopKTimelineFeatures( - scoredAggregatedEdge: SCollection[Edge], - maxDestinationIds: Int - ): SCollection[KeyVal[Long, UserSession]] = { - scoredAggregatedEdge - .filter(_.weight.exists(_ > 0)) - .keyBy(_.sourceId) - .groupByKey - .map { - case (sourceId, edges) => - val (inEdges, outEdges) = edges.partition(GraphUtil.isFollow) - val inTopK = - if (inEdges.isEmpty) Nil - else { - val inTopKQueue = - new PriorityQueueMonoid[Edge](maxDestinationIds)(ordering) - inTopKQueue - .build(inEdges).iterator().asScala.toList.flatMap( - toRealGraphEdgeFeatures(hasTimelinesRequiredFeatures)) - } - val outTopK = - if (outEdges.isEmpty) Nil - else { - val outTopKQueue = - new PriorityQueueMonoid[Edge](maxDestinationIds)(ordering) - outTopKQueue - .build(outEdges).iterator().asScala.toList.flatMap( - toRealGraphEdgeFeatures(hasTimelinesRequiredFeatures)) - } - KeyVal( - sourceId, - UserSession( - userId = Some(sourceId), - realGraphFeatures = Some(RealGraphFeatures.V1(RealGraphFeaturesV1(inTopK, outTopK))), - 
realGraphFeaturesTest = - Some(RealGraphFeaturesTest.V1(RealGraphFeaturesV1(inTopK, outTopK))) - ) - ) - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx new file mode 100644 index 000000000..130f33aeb Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md deleted file mode 100644 index cedf39b12..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md +++ /dev/null @@ -1,38 +0,0 @@ -## InteractionGraphAggregationJob Dataflow Job - -This job aggregates the previous day's history with today's activities, and outputs an updated -history. This history is joined with the explicit scores from real graph's BQML pipeline, and -exported as features for timelines (which is why we're using their thrift). - -#### IntelliJ -``` -fastpass create --name rg_agg_all --intellij src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-aggregation-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_all/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-11-08 \ - --bind=profile.output_path=processed/interaction_graph_aggregation_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD deleted file mode 100644 index 9c14f4d38..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD +++ /dev/null @@ -1,61 +0,0 @@ -scala_library( - name = "agg_client_event_logs", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_client_event_logs_edge_daily-scala", - ":interaction_graph_agg_client_event_logs_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/scalding/jobs/client_event_processing:user_interaction-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_client_event_logs_scio", - main = "com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphClientEventLogsJob", - platform = "java8", - dependencies = [ - ":agg_client_event_logs", - ], -) - -create_datasets( - base_name = 
"interaction_graph_agg_client_event_logs_edge_daily", - description = "User-user directed edges with client events features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_client_event_logs_vertex_daily", - description = "User vertex with client events features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx new file mode 100644 index 000000000..4a80a6b86 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx new file mode 100644 index 000000000..872c12777 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala deleted file mode 100644 index cc9793ba8..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioMetrics - -trait InteractionGraphClientEventLogsCountersTrait { - val Namespace = "Interaction Graph Client Event Logs" - def profileViewFeaturesInc(): Unit - def linkOpenFeaturesInc(): Unit - def tweetClickFeaturesInc(): Unit - def tweetImpressionFeaturesInc(): Unit - def catchAllInc(): Unit -} - -case object InteractionGraphClientEventLogsCounters - extends InteractionGraphClientEventLogsCountersTrait { - - val profileViewCounter = ScioMetrics.counter(Namespace, "Profile View Features") - val linkOpenCounter = ScioMetrics.counter(Namespace, "Link Open Features") - val tweetClickCounter = ScioMetrics.counter(Namespace, "Tweet Click Features") - val tweetImpressionCounter = ScioMetrics.counter(Namespace, "Tweet Impression Features") - val catchAllCounter = ScioMetrics.counter(Namespace, "Catch All") - - override def profileViewFeaturesInc(): Unit = profileViewCounter.inc() - - override def linkOpenFeaturesInc(): Unit = linkOpenCounter.inc() - - override def tweetClickFeaturesInc(): Unit = tweetClickCounter.inc() - - override def tweetImpressionFeaturesInc(): Unit = 
tweetImpressionCounter.inc() - - override def catchAllInc(): Unit = catchAllCounter.inc() -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx new file mode 100644 index 000000000..2183163b9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala deleted file mode 100644 index 1a12b33d9..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioContext -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import org.joda.time.Interval - -object InteractionGraphClientEventLogsJob - extends ScioBeamJob[InteractionGraphClientEventLogsOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphClientEventLogsOption - ): Unit = { - - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val jobCounters: InteractionGraphClientEventLogsCountersTrait = - InteractionGraphClientEventLogsCounters - - lazy val dateInterval: Interval = pipelineOptions.interval - - val sources = InteractionGraphClientEventLogsSource(pipelineOptions) - - val userInteractions = sources.readUserInteractions(dateInterval) - val rawUsers = sources.readCombinedUsers() - val safeUsers = UserUtil.getValidUsers(rawUsers) - - val (vertex, edges) = InteractionGraphClientEventLogsUtil.process(userInteractions, safeUsers) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.write[Vertex]( - InteractionGraphAggClientEventLogsVertexDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_client_event_logs_vertex_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 32.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggClientEventLogsEdgeDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_client_event_logs_edge_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} 
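For orientation: every `agg_*` stage in this diff repeats the shape of `InteractionGraphClientEventLogsJob` above: an options trait, a `Source` that reads via DAL, a `Util.process` that turns raw records into vertex and edge features, and Parquet writes through `DAL.write`. Below is a condensed sketch of that skeleton; it reuses only APIs that appear elsewhere in this diff, `ExampleDailyAggJob` and the `???` stubs are placeholders, and the DAL write-environment fallback the real jobs perform is omitted:
```
package com.twitter.interaction_graph.scio.agg_client_event_logs

import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection
import com.twitter.beam.io.dal.DAL
import com.twitter.beam.io.dal.DAL.DiskFormat
import com.twitter.beam.io.dal.DAL.WriteOptions
import com.twitter.beam.io.fs.multiformat.PathLayout
import com.twitter.dal.client.dataset.TimePartitionedDALDataset
import com.twitter.interaction_graph.thriftscala.Edge
import com.twitter.scio_internal.job.ScioBeamJob
import com.twitter.statebird.v2.thriftscala.Environment

// Hypothetical skeleton of an agg_* stage; not part of the original tree.
object ExampleDailyAggJob extends ScioBeamJob[InteractionGraphClientEventLogsOption] {

  // Stand-in for a create_datasets-generated binding (see the BUILD targets).
  val exampleEdgeDailyDataset: TimePartitionedDALDataset[Edge] = ???

  override protected def configurePipeline(
    scioContext: ScioContext,
    pipelineOptions: InteractionGraphClientEventLogsOption
  ): Unit = {
    // 1. read the day's raw inputs (real jobs delegate to a Source class)
    // 2. aggregate them into Edge records (real jobs delegate to a Util.process)
    val edges: SCollection[Edge] = ???

    // 3. write one daily Parquet partition through DAL, as the jobs above do
    edges.saveAsCustomOutput(
      "Write Edge Records",
      DAL.write[Edge](
        exampleEdgeDailyDataset,
        PathLayout.DailyPath(pipelineOptions.getOutputPath + "/example_edge_daily"),
        pipelineOptions.interval,
        DiskFormat.Parquet,
        Environment.valueOf(pipelineOptions.getDALWriteEnvironment),
        writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
      )
    )
  }
}
```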
diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx new file mode 100644 index 000000000..a6bf3d26b Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala deleted file mode 100644 index 7a07a6913..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphClientEventLogsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx new file mode 100644 index 000000000..ca3e847f7 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala deleted file mode 100644 index 1cf2da318..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.util.Duration -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.wtf.scalding.client_event_processing.thriftscala.UserInteraction -import com.twitter.wtf.scalding.jobs.client_event_processing.UserInteractionScalaDataset -import org.joda.time.Interval - -case class InteractionGraphClientEventLogsSource( - pipelineOptions: InteractionGraphClientEventLogsOption -)( - implicit sc: ScioContext) { - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def 
readUserInteractions(dateInterval: Interval): SCollection[UserInteraction] = { - - SourceUtil.readDALDataset[UserInteraction]( - dataset = UserInteractionScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - } - - def readCombinedUsers(): SCollection[CombinedUser] = { - - SourceUtil.readMostRecentSnapshotNoOlderThanDALDataset[CombinedUser]( - dataset = UsersourceScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx new file mode 100644 index 000000000..0ebc65fc1 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala deleted file mode 100644 index 521a1f07f..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala +++ /dev/null @@ -1,137 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.values.SCollection -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.FeatureKey -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionDetails -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionType -import com.twitter.wtf.scalding.client_event_processing.thriftscala.UserInteraction - -object InteractionGraphClientEventLogsUtil { - - val DefaultAge = 1 - val DefaultFeatureValue = 1.0 - - def process( - userInteractions: SCollection[UserInteraction], - safeUsers: SCollection[Long] - )( - implicit jobCounters: InteractionGraphClientEventLogsCountersTrait - ): (SCollection[Vertex], SCollection[Edge]) = { - - val unfilteredFeatureInput = userInteractions - .flatMap { - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.ProfileClickDetails(profileClick)) - if interactionType == InteractionType.ProfileClicks && userId != profileClick.profileId => - jobCounters.profileViewFeaturesInc() - Seq( - FeatureKey( - userId, - profileClick.profileId, - FeatureName.NumProfileViews) -> DefaultFeatureValue - ) - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.TweetClickDetails(tweetClick)) - if interactionType == InteractionType.TweetClicks && - Some(userId) != tweetClick.authorId => - ( - for { - authorId <- tweetClick.authorId - } yield { - jobCounters.tweetClickFeaturesInc() - FeatureKey(userId, authorId, FeatureName.NumTweetClicks) -> DefaultFeatureValue - - } - ).toSeq - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.LinkClickDetails(linkClick)) - if interactionType == InteractionType.LinkClicks && - Some(userId) != linkClick.authorId => - ( - for { - authorId <- linkClick.authorId - } yield { - jobCounters.linkOpenFeaturesInc() - 
FeatureKey(userId, authorId, FeatureName.NumLinkClicks) -> DefaultFeatureValue - } - ).toSeq - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.TweetImpressionDetails(tweetImpression)) - if interactionType == InteractionType.TweetImpressions && - Some(userId) != tweetImpression.authorId => - ( - for { - authorId <- tweetImpression.authorId - dwellTime <- tweetImpression.dwellTimeInSec - } yield { - jobCounters.tweetImpressionFeaturesInc() - Seq( - FeatureKey( - userId, - authorId, - FeatureName.NumInspectedStatuses) -> DefaultFeatureValue, - FeatureKey(userId, authorId, FeatureName.TotalDwellTime) -> dwellTime.toDouble - ) - } - ).getOrElse(Nil) - - case _ => - jobCounters.catchAllInc() - Nil - } - .sumByKey - .collect { - case (FeatureKey(srcId, destId, featureName), featureValue) => - InteractionGraphRawInput( - src = srcId, - dst = destId, - name = featureName, - age = 1, - featureValue = featureValue - ) - } - - val filteredFeatureInput = filterForSafeUsers(unfilteredFeatureInput, safeUsers) - - // Calculate the Features - FeatureGeneratorUtil.getFeatures(filteredFeatureInput) - - } - - private def filterForSafeUsers( - featureInput: SCollection[InteractionGraphRawInput], - safeUsers: SCollection[Long] - ): SCollection[InteractionGraphRawInput] = { - - featureInput - .keyBy(_.src) - .withName("Filter out unsafe users") - .intersectByKey(safeUsers) - .values // Fetch only InteractionGraphRawInput - .keyBy(_.dst) - .withName("Filter out unsafe authors") - .intersectByKey(safeUsers) - .values // Fetch only InteractionGraphRawInput - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx new file mode 100644 index 000000000..a8b0f994c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md deleted file mode 100644 index 6bd1ea2cd..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphClientEventLogs Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-client-event-logs-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-27 \ - --bind=profile.output_path=processed/interaction_graph_agg_client_event_logs_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD 
b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD deleted file mode 100644 index 51479c70d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -scala_library( - name = "agg_direct_interactions", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_direct_interactions_edge_daily-scala", - ":interaction_graph_agg_direct_interactions_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/thrift/com/twitter/timelineservice/server/internal:thrift-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "twadoop_config/configuration/log_categories/group/tweetypie:tweetypie_media_tag_events-scala", - "tweetsource/common:unhydrated_flat-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_agg_direct_interactions_scio", - main = "com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_direct_interactions", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_direct_interactions_edge_daily", - description = "User-user directed edges with direct interactions features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_direct_interactions_vertex_daily", - description = "User vertex with direct interactions features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx new file mode 100644 index 000000000..ed63316c4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx new file mode 100644 index 000000000..414764dff Binary files /dev/null and 
b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala deleted file mode 100644 index 0b855cee2..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioContext -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import org.joda.time.Interval - -object InteractionGraphAggDirectInteractionsJob - extends ScioBeamJob[InteractionGraphAggDirectInteractionsOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAggDirectInteractionsOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - val source = InteractionGraphAggDirectInteractionsSource(pipelineOptions) - - val rawUsers = source.readCombinedUsers() - val safeUsers = UserUtil.getValidUsers(rawUsers) - - val rawFavorites = source.readFavorites(dateInterval) - val rawPhotoTags = source.readPhotoTags(dateInterval) - val tweetSource = source.readTweetSource(dateInterval) - - val (vertex, edges) = InteractionGraphAggDirectInteractionsUtil.process( - rawFavorites, - tweetSource, - rawPhotoTags, - safeUsers - ) - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.write[Vertex]( - InteractionGraphAggDirectInteractionsVertexDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_direct_interactions_vertex_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 8.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_direct_interactions_edge_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx new file mode 100644 index 
000000000..9de9a108c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala deleted file mode 100644 index 43d3d08df..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggDirectInteractionsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx new file mode 100644 index 000000000..1d195d603 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala deleted file mode 100644 index 9470b1980..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.tweetypie.thriftscala.TweetMediaTagEvent -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.util.Duration -import org.joda.time.Interval -import twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset -import twadoop_config.configuration.log_categories.group.tweetypie.TweetypieMediaTagEventsScalaDataset -import tweetsource.common.UnhydratedFlatScalaDataset - -case class InteractionGraphAggDirectInteractionsSource( - pipelineOptions: InteractionGraphAggDirectInteractionsOption -)( - implicit sc: 
ScioContext) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readFavorites(dateInterval: Interval): SCollection[ContextualizedFavoriteEvent] = - SourceUtil.readDALDataset[ContextualizedFavoriteEvent]( - dataset = TimelineServiceFavoritesScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment - ) - - def readPhotoTags(dateInterval: Interval): SCollection[TweetMediaTagEvent] = - SourceUtil.readDALDataset[TweetMediaTagEvent]( - dataset = TweetypieMediaTagEventsScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - def readTweetSource(dateInterval: Interval): SCollection[UnhydratedFlatTweet] = - SourceUtil.readDALDataset[UnhydratedFlatTweet]( - dataset = UnhydratedFlatScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - def readCombinedUsers(): SCollection[CombinedUser] = - SourceUtil.readMostRecentSnapshotNoOlderThanDALDataset[CombinedUser]( - dataset = UsersourceScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx new file mode 100644 index 000000000..dfbeddb5e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala deleted file mode 100644 index 1d996116e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala +++ /dev/null @@ -1,168 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.FeatureKey -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.scio.common.UserUtil.DUMMY_USER_ID -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.timelineservice.thriftscala.FavoriteEventUnion.Favorite -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.tweetypie.thriftscala.TweetMediaTagEvent - -object InteractionGraphAggDirectInteractionsUtil { - - val DefaultFeatureValue = 1L - - def favouriteFeatures( - rawFavorites: SCollection[ContextualizedFavoriteEvent] - ): SCollection[(FeatureKey, Long)] = { - rawFavorites - .withName("fav features") - .flatMap { event => - event.event match { - case Favorite(e) if e.userId != e.tweetUserId => - ScioMetrics.counter("process", "fav").inc() - Some( - FeatureKey(e.userId, e.tweetUserId, FeatureName.NumFavorites) -> DefaultFeatureValue) - case _ => None - } - } - - } - - def mentionFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - 
.withName("mention features") - .flatMap { - case s if s.shareSourceTweetId.isEmpty => // only for non-retweets - s.atMentionedUserIds - .map { users => - users.toSet.map { uid: Long => - ScioMetrics.counter("process", "mention").inc() - FeatureKey(s.userId, uid, FeatureName.NumMentions) -> DefaultFeatureValue - }.toSeq - } - .getOrElse(Nil) - case _ => - Nil - } - } - - def photoTagFeatures( - rawPhotoTags: SCollection[TweetMediaTagEvent] - ): SCollection[(FeatureKey, Long)] = { - rawPhotoTags - .withName("photo tag features") - .flatMap { p => - p.taggedUserIds.map { (p.userId, _) } - } - .collect { - case (src, dst) if src != dst => - ScioMetrics.counter("process", "photo tag").inc() - FeatureKey(src, dst, FeatureName.NumPhotoTags) -> DefaultFeatureValue - } - } - - def retweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("retweet features") - .collect { - case s if s.shareSourceUserId.exists(_ != s.userId) => - ScioMetrics.counter("process", "share tweet").inc() - FeatureKey( - s.userId, - s.shareSourceUserId.get, - FeatureName.NumRetweets) -> DefaultFeatureValue - } - } - - def quotedTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("quoted tweet features") - .collect { - case t if t.quotedTweetUserId.isDefined => - ScioMetrics.counter("process", "quote tweet").inc() - FeatureKey( - t.userId, - t.quotedTweetUserId.get, - FeatureName.NumTweetQuotes) -> DefaultFeatureValue - } - } - - def replyTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("reply tweet features") - .collect { - case t if t.inReplyToUserId.isDefined => - ScioMetrics.counter("process", "reply tweet").inc() - FeatureKey(t.userId, t.inReplyToUserId.get, FeatureName.NumReplies) -> DefaultFeatureValue - } - } - - // we create edges to a dummy user id since creating a tweet has no destination id - def createTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource.withName("create tweet features").map { tweet => - ScioMetrics.counter("process", "create tweet").inc() - FeatureKey(tweet.userId, DUMMY_USER_ID, FeatureName.NumCreateTweets) -> DefaultFeatureValue - } - } - - def process( - rawFavorites: SCollection[ContextualizedFavoriteEvent], - tweetSource: SCollection[UnhydratedFlatTweet], - rawPhotoTags: SCollection[TweetMediaTagEvent], - safeUsers: SCollection[Long] - ): (SCollection[Vertex], SCollection[Edge]) = { - val favouriteInput = favouriteFeatures(rawFavorites) - val mentionInput = mentionFeatures(tweetSource) - val photoTagInput = photoTagFeatures(rawPhotoTags) - val retweetInput = retweetFeatures(tweetSource) - val quotedTweetInput = quotedTweetFeatures(tweetSource) - val replyInput = replyTweetFeatures(tweetSource) - val createTweetInput = createTweetFeatures(tweetSource) - - val allInput = SCollection.unionAll( - Seq( - favouriteInput, - mentionInput, - photoTagInput, - retweetInput, - quotedTweetInput, - replyInput, - createTweetInput - )) - - val filteredFeatureInput = allInput - .keyBy(_._1.src) - .intersectByKey(safeUsers) // filter for safe users - .values - .collect { - case (FeatureKey(src, dst, feature), featureValue) if src != dst => - FeatureKey(src, dst, feature) -> featureValue - } - .sumByKey - .map { - case (FeatureKey(src, dst, feature), featureValue) => - val age = 1 - InteractionGraphRawInput(src, dst, 
feature, age, featureValue) - } - - FeatureGeneratorUtil.getFeatures(filteredFeatureInput) - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx new file mode 100644 index 000000000..de2e6d502 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md deleted file mode 100644 index a9e9d3610..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAggDirectInteractions Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-agg-direct-interactions-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_direct_interactions_dataflow -``` diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD deleted file mode 100644 index 3bf51323c..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD +++ /dev/null @@ -1,70 +0,0 @@ -scala_library( - name = "agg_flock", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_flock_edge_snapshot-scala", - ":interaction_graph_agg_flock_vertex_snapshot-scala", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "flockdb-tools/datasets/flock:flock-blocks-edges-scala", - "flockdb-tools/datasets/flock:flock-mutes-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala", - "src/thrift/com/twitter/core_workflows/user_model:user_model-scala", - "src/thrift/com/twitter/twadoop/user/gen:gen-java", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = 
"interaction_graph_agg_flock_scio", - main = "com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_flock", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_flock_edge_snapshot", - description = "User-user directed edges with flock features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_flock_vertex_snapshot", - description = "User vertex with flock features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx new file mode 100644 index 000000000..ea6103ce5 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx new file mode 100644 index 000000000..43b51729d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala deleted file mode 100644 index e0a9f934d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala +++ /dev/null @@ -1,84 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockUtil._ -import com.twitter.interaction_graph.scio.common.DateUtil -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.util.Duration -import java.time.Instant -import org.joda.time.Interval - -object InteractionGraphAggFlockJob extends ScioBeamJob[InteractionGraphAggFlockOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: 
InteractionGraphAggFlockOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - - val source = InteractionGraphAggFlockSource(pipelineOptions) - - val embiggenInterval = DateUtil.embiggen(dateInterval, Duration.fromDays(7)) - - val flockFollowsSnapshot = source.readFlockFollowsSnapshot(embiggenInterval) - - // the flock snapshot we're reading from has already been filtered for safe/valid users hence no filtering for safeUsers - val flockFollowsFeature = - getFlockFeatures(flockFollowsSnapshot, FeatureName.NumFollows, dateInterval) - - val flockMutualFollowsFeature = getMutualFollowFeature(flockFollowsFeature) - - val allSCollections = Seq(flockFollowsFeature, flockMutualFollowsFeature) - - val allFeatures = SCollection.unionAll(allSCollections) - - val (vertex, edges) = FeatureGeneratorUtil.getFeatures(allFeatures) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.writeSnapshot[Vertex]( - InteractionGraphAggFlockVertexSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_flock_vertex_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 64.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.writeSnapshot[Edge]( - InteractionGraphAggFlockEdgeSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_flock_edge_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx new file mode 100644 index 000000000..dfdc8fa22 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala deleted file mode 100644 index f5ef58b55..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggFlockOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx new file mode 100644 index 000000000..af0ac9daa Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala deleted file mode 100644 index 726293475..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.wtf.dataflow.user_events.ValidUserFollowsScalaDataset -import org.joda.time.Interval - -case class InteractionGraphAggFlockSource( - pipelineOptions: InteractionGraphAggFlockOption -)( - implicit sc: ScioContext) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readFlockFollowsSnapshot(dateInterval: Interval): SCollection[FlockEdge] = - SourceUtil.readMostRecentSnapshotDALDataset( - dataset = ValidUserFollowsScalaDataset, - dateInterval = dateInterval, - dalEnvironment = dalEnvironment) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx new file mode 100644 index 000000000..8b7d8a9cd Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala deleted file mode 100644 index 89858a89a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.values.SCollection -import com.twitter.algebird.Min -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.FeatureName -import java.time.Instant -import java.time.temporal.ChronoUnit -import org.joda.time.Interval - -object InteractionGraphAggFlockUtil { - - def getFlockFeatures( - edges: SCollection[FlockEdge], - featureName: FeatureName, - dateInterval: Interval - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting flock edge to interaction graph input") - .map { edge => - // NOTE: getUpdatedAt gives time in the seconds resolution - // 
Because we use .extend() when reading the data source, the updatedAt time might be larger than the dateRange. - We need to cap them; otherwise DateUtil.diffDays gives incorrect results. - val start = (edge.updatedAt * 1000L).min(dateInterval.getEnd.toInstant.getMillis) - val end = dateInterval.getStart.toInstant.getMillis - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(start), - Instant.ofEpochMilli(end) - ) + 1 - InteractionGraphRawInput(edge.sourceId, edge.destinationId, featureName, age.toInt, 1.0) - } - - } - - def getMutualFollowFeature( - flockFollowFeature: SCollection[InteractionGraphRawInput] - ): SCollection[InteractionGraphRawInput] = { - flockFollowFeature - .withName("Convert FlockFollows to Mutual Follows") - .map { input => - val sourceId = input.src - val destId = input.dst - - if (sourceId < destId) { - Tuple2(sourceId, destId) -> Tuple2(Set(true), Min(input.age)) // true means follow - } else { - Tuple2(destId, sourceId) -> Tuple2(Set(false), Min(input.age)) // false means followed_by - } - } - .sumByKey - .flatMap { - case ((id1, id2), (followSet, minAge)) if followSet.size == 2 => - val age = minAge.get - Seq( - InteractionGraphRawInput(id1, id2, FeatureName.NumMutualFollows, age, 1.0), - InteractionGraphRawInput(id2, id1, FeatureName.NumMutualFollows, age, 1.0)) - case _ => - Nil - } - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx new file mode 100644 index 000000000..14af45521 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md deleted file mode 100644 index 0ff797194..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAggFlock Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-agg-flock-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_flock/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_flock_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD deleted file mode 100644 index 1fbe57e1f..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD +++ /dev/null @@ -1,43 +0,0 @@ -scala_library( - name = "agg_negative", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":real_graph_negative_features-scala", - 
"beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "flockdb-tools/datasets/flock:flock-blocks-edges-scala", - "flockdb-tools/datasets/flock:flock-mutes-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala", - "socialgraph/hadoop/src/main/scala/com/twitter/socialgraph/hadoop:socialgraph-unfollows-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - ], -) - -jvm_binary( - name = "interaction_graph_negative_scio", - main = "com.twitter.interaction_graph.scio.agg_negative.InteractionGraphNegativeJob", - platform = "java8", - dependencies = [ - ":agg_negative", - ], -) - -create_datasets( - base_name = "real_graph_negative_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.user_session_store.thriftscala.UserSession", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:user_session_inj", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx new file mode 100644 index 000000000..8d696ebb4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx new file mode 100644 index 000000000..242e1b8c6 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala deleted file mode 100644 index 479b67524..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala +++ /dev/null @@ -1,155 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_negative - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.conversions.DurationOps._ -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.interaction_graph.scio.common.ConversionUtil.hasNegativeFeatures -import com.twitter.interaction_graph.scio.common.ConversionUtil.toRealGraphEdgeFeatures -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil.getEdgeFeature -import com.twitter.interaction_graph.scio.common.GraphUtil -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.job.ScioBeamJob 
-import com.twitter.scrooge.ThriftStruct -import com.twitter.socialgraph.hadoop.SocialgraphUnfollowsScalaDataset -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeaturesTest -import com.twitter.timelines.real_graph.v1.thriftscala.{RealGraphFeatures => RealGraphFeaturesV1} -import com.twitter.user_session_store.thriftscala.UserSession -import flockdb_tools.datasets.flock.FlockBlocksEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockMutesEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsAbuseEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsSpamEdgesScalaDataset -import java.time.Instant -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO - -object InteractionGraphNegativeJob extends ScioBeamJob[InteractionGraphNegativeOption] { - val maxDestinationIds = 500 // p99 is about 500 - def getFeatureCounts(e: Edge): Int = e.features.size - val negativeEdgeOrdering = Ordering.by[Edge, Int](getFeatureCounts) - val negativeEdgeReverseOrdering = negativeEdgeOrdering.reverse - implicit val pqMonoid: PriorityQueueMonoid[Edge] = - new PriorityQueueMonoid[Edge](maxDestinationIds)(negativeEdgeOrdering) - - override protected def configurePipeline( - sc: ScioContext, - opts: InteractionGraphNegativeOption - ): Unit = { - - val endTs = opts.interval.getEndMillis - - // read input datasets - val blocks: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockBlocksEdgesScalaDataset, sc), - FeatureName.NumBlocks, - endTs) - - val mutes: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockMutesEdgesScalaDataset, sc), - FeatureName.NumMutes, - endTs) - - val abuseReports: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockReportAsAbuseEdgesScalaDataset, sc), - FeatureName.NumReportAsAbuses, - endTs) - - val spamReports: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockReportAsSpamEdgesScalaDataset, sc), - FeatureName.NumReportAsSpams, - endTs) - - // we only keep unfollows in the past 90 days due to the huge size of this dataset, - // and to prevent permanent "shadow-banning" in the event of accidental unfollows. - // we treat unfollows as less critical than the above 4 negative signals, since they typically - // reflect interest rather than health, and interest might change over time. 
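// Illustrative example (assuming `age` is the edge age in days, consistent with the
// 90-day cutoff described above): the filter below keeps an unfollow edge aged 89 days
// and drops one aged 90 days or more.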
- val unfollows: SCollection[InteractionGraphRawInput] = - GraphUtil - .getSocialGraphFeatures( - readSnapshot(SocialgraphUnfollowsScalaDataset, sc), - FeatureName.NumUnfollows, - endTs) - .filter(_.age < 90) - - // group all features by (src, dest) - val allEdgeFeatures: SCollection[Edge] = - getEdgeFeature(SCollection.unionAll(Seq(blocks, mutes, abuseReports, spamReports, unfollows))) - - val negativeFeatures: SCollection[KeyVal[Long, UserSession]] = - allEdgeFeatures - .keyBy(_.sourceId) - .topByKey(maxDestinationIds)(Ordering.by(_.features.size)) - .map { - case (srcId, pqEdges) => - val topKNeg = - pqEdges.toSeq.flatMap(toRealGraphEdgeFeatures(hasNegativeFeatures)) - KeyVal( - srcId, - UserSession( - userId = Some(srcId), - realGraphFeaturesTest = - Some(RealGraphFeaturesTest.V1(RealGraphFeaturesV1(topKNeg))))) - } - - // save to GCS (via DAL) - negativeFeatures.saveAsCustomOutput( - "Write Negative Edge Label", - DAL.writeVersionedKeyVal( - dataset = RealGraphNegativeFeaturesScalaDataset, - pathLayout = PathLayout.VersionedPath(opts.getOutputPath), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis), - writeOption = WriteOptions(numOfShards = Some(3000)) - ) - ) - - // save to BQ - val ingestionDate = opts.getDate().value.getStart.toDate - val bqDataset = opts.getBqDataset - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionDate)) - val timePartitioning = new TimePartitioning() - .setType("DAY").setField("dateHour").setExpirationMs(21.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[Edge] - .to(s"${bqDataset}.interaction_graph_agg_negative_edge_snapshot") - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId("twttr-recos-ml-prod") - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition( - BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE - ) // we only want the latest snapshot - - allEdgeFeatures - .saveAsCustomOutput( - s"Save Recommendations to BQ interaction_graph_agg_negative_edge_snapshot", - bqWriter - ) - } - - def readSnapshot[T <: ThriftStruct]( - dataset: SnapshotDALDataset[T], - sc: ScioContext - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshotNoOlderThan[T](dataset, 7.days) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx new file mode 100644 index 000000000..659bb48b8 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala deleted file mode 100644 index c44dc3396..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_negative - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphNegativeOption 
extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("BQ dataset prefix") - def getBqDataset: String - def setBqDataset(value: String): Unit - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx new file mode 100644 index 000000000..bdeaa81c5 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md deleted file mode 100644 index 9df76e7ad..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md +++ /dev/null @@ -1,35 +0,0 @@ -## InteractionGraphNegative Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_neg --intellij src/scala/com/twitter/interaction_graph/scio/agg_negative -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_negative:interaction_graph_negative_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_negative:interaction_graph_negative_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-negative-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_negative/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-10-19 \ - --bind=profile.output_path=processed/interaction_graph_agg_negative_dataflow \ - --bind=profile.bq_dataset="twttr-bq-cassowary-prod:user" -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD deleted file mode 100644 index 25dfa572b..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -scala_library( - name = "agg_notifications", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_notifications_edge_daily-scala", - ":interaction_graph_agg_notifications_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/frigate/data_pipeline_beam/mr-client-event-filtering-job/src/main/scala/com/twitter/client_event_filtering:frigate_filtered_client_events_dataflow-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/scalding/jobs/client_event_processing:user_interaction-scala", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - "twadoop_config/configuration/log_categories/group/frigate:frigate_notifier-scala", - "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala", - 
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_notifications_scio", - main = "com.twitter.interaction_graph.scio.agg_notifications.InteractionGraphNotificationsJob", - platform = "java8", - dependencies = [ - ":agg_notifications", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_notifications_edge_daily", - description = "User-user directed edges with notification features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_notifications_vertex_daily", - description = "User vertex with notification features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx new file mode 100644 index 000000000..69e25dcc0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx new file mode 100644 index 000000000..d35c89e6e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala deleted file mode 100644 index 2ca5a9cf4..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala +++ /dev/null @@ -1,132 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.spotify.scio.ScioMetrics -import com.twitter.clientapp.thriftscala.EventNamespace -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.interaction_graph.thriftscala.FeatureName - -object InteractionGraphNotificationUtil { - - val PUSH_OPEN_ACTIONS = Set("open", "background_open") - val NTAB_CLICK_ACTIONS = Set("navigate", "click") - val STATUS_ID_REGEX = "^twitter:\\/\\/tweet\\?status_id=([0-9]+).*".r - val TWEET_ID_REGEX = "^twitter:\\/\\/tweet.id=([0-9]+).*".r - - def extractTweetIdFromUrl(url: String): Option[Long] = url match { - case STATUS_ID_REGEX(statusId) => - ScioMetrics.counter("regex matching", "status_id=").inc() - Some(statusId.toLong) - case TWEET_ID_REGEX(tweetId) => - ScioMetrics.counter("regex matching", "tweet?id=").inc() - Some(tweetId.toLong) - case _ => None - } - - def 
getPushNtabEvents(e: LogEvent): Seq[(Long, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, _, _, _, _, Some(action)) if PUSH_OPEN_ACTIONS.contains(action) => - (for { - details <- e.eventDetails - url <- details.url - tweetId <- extractTweetIdFromUrl(url) - } yield { - ScioMetrics.counter("event type", "push open").inc() - (tweetId, FeatureName.NumPushOpens) - }).toSeq - case EventNamespace(_, Some("ntab"), _, _, _, Some("navigate")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - ntabDetails <- item.notificationTabDetails.toSeq - clientEventMetadata <- ntabDetails.clientEventMetadata.toSeq - tweetIds <- clientEventMetadata.tweetIds.toSeq - tweetId <- tweetIds - } yield { - ScioMetrics.counter("event type", "ntab navigate").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case EventNamespace(_, Some("ntab"), _, _, _, Some("click")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - tweetId <- item.id - } yield { - ScioMetrics.counter("event type", "ntab click").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } - - /** - * Returns events corresponding to ntab clicks. We have the tweet id from ntab clicks and can join - * those with public tweets. - */ - def getNtabEvents(e: LogEvent): Seq[(Long, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, Some("ntab"), _, _, _, Some("navigate")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - ntabDetails <- item.notificationTabDetails.toSeq - clientEventMetadata <- ntabDetails.clientEventMetadata.toSeq - tweetIds <- clientEventMetadata.tweetIds.toSeq - tweetId <- tweetIds - } yield { - ScioMetrics.counter("event type", "ntab navigate").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case EventNamespace(_, Some("ntab"), _, _, _, Some("click")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - tweetId <- item.id - } yield { - ScioMetrics.counter("event type", "ntab click").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } - - /** - * get push open events, keyed by impressionId (as the client event does not always have the tweetId nor the authorId) - */ - def getPushOpenEvents(e: LogEvent): Seq[(String, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, _, _, _, _, Some(action)) if PUSH_OPEN_ACTIONS.contains(action) => - val impressionIdOpt = for { - details <- e.notificationDetails - impressionId <- details.impressionId - } yield { - ScioMetrics.counter("event type", "push open").inc() - impressionId - } - impressionIdOpt.map((_, FeatureName.NumPushOpens)).toSeq - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx 
b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx new file mode 100644 index 000000000..5f5b8237a Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala deleted file mode 100644 index 2a01988be..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.ReadOptions -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.client_event_filtering.FrigateFilteredClientEventsDataflowScalaDataset -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.thriftscala._ -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.tweetsource.public_tweets.PublicTweetsScalaDataset - -object InteractionGraphNotificationsJob extends ScioBeamJob[InteractionGraphNotificationsOption] { - override protected def configurePipeline( - sc: ScioContext, - opts: InteractionGraphNotificationsOption - ): Unit = { - - val pushClientEvents: SCollection[LogEvent] = sc - .customInput( - name = "Read Push Client Events", - DAL - .read( - FrigateFilteredClientEventsDataflowScalaDataset, - opts.interval, - DAL.Environment.Prod, - ) - ) - val pushNtabEvents = - pushClientEvents.flatMap(InteractionGraphNotificationUtil.getPushNtabEvents) - - // look back tweets for 2 days because MR gets tweets from 2 days ago. 
- // Allow a grace period of 24 hours to reduce oncall workload - val graceHours = 24 - val interval2DaysBefore = - opts.interval.withStart(opts.interval.getStart.minusDays(2).plusHours(graceHours)) - val tweetAuthors: SCollection[(Long, Long)] = sc - .customInput( - name = "Read Tweets", - DAL - .read( - dataset = PublicTweetsScalaDataset, - interval = interval2DaysBefore, - environmentOverride = DAL.Environment.Prod, - readOptions = ReadOptions(projections = Some(Seq("tweetId", "userId"))) - ) - ).map { t => (t.tweetId, t.userId) } - - val pushNtabEdgeCounts = pushNtabEvents - .join(tweetAuthors) - .map { - case (_, ((srcId, feature), destId)) => ((srcId, destId, feature), 1L) - } - .withName("summing edge feature counts") - .sumByKey - - val aggPushEdges = pushNtabEdgeCounts - .map { - case ((srcId, destId, featureName), count) => - (srcId, destId) -> Seq( - EdgeFeature(featureName, FeatureGeneratorUtil.initializeTSS(count))) - } - .sumByKey - .map { - case ((srcId, destId), edgeFeatures) => - Edge(srcId, destId, None, edgeFeatures.sortBy(_.name.value)) - } - - aggPushEdges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggNotificationsEdgeDailyScalaDataset, - PathLayout.DailyPath(opts.getOutputPath + "/aggregated_notifications_edge_daily"), - opts.interval, - DiskFormat.Parquet, - Environment.valueOf(opts.getDALWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(opts.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx new file mode 100644 index 000000000..bc002c17d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala deleted file mode 100644 index dd1b4c769..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphNotificationsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(8) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx new file mode 100644 index 000000000..b8d827be0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md deleted file mode 100644 index f5f274ad8..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphNotifications Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_labels --intellij src/scala/com/twitter/interaction_graph/scio/agg_notifications -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_notifications_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_notifications_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-notifications-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_notifications/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-05-10 \ - --bind=profile.output_path=processed/interaction_graph_agg_notifications_dataflow -``` diff --git a/src/scala/com/twitter/interaction_graph/scio/common/BUILD b/src/scala/com/twitter/interaction_graph/scio/common/BUILD deleted file mode 100644 index 4916728c5..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -scala_library( - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "flockdb-tools/datasets/flock/src/main/thrift:thrift-scala", - "src/scala/com/twitter/pluck/source/combined_user_scrooge_source", - "src/thrift/com/twitter/gizmoduck:user-thrift-scala", - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - "src/thrift/com/twitter/socialgraph:thrift-scala", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "src/thrift/com/twitter/user_session_store:thrift-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - ], -) - -scala_library( - name = "feature_groups", - sources = ["FeatureGroups.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -)
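The `common` library defined above is shared by all the interaction graph scio jobs in this change. As a rough sketch of how its pieces compose (the types and helpers are taken from the deleted sources that follow; the surrounding job wiring is assumed):

```
import com.spotify.scio.values.SCollection
import com.twitter.interaction_graph.scio.common.{FeatureGeneratorUtil, InteractionGraphRawInput}
import com.twitter.interaction_graph.thriftscala.{Edge, FeatureName, Vertex}

// Hypothetical job fragment: aggregate raw interaction records into
// per-user vertex features and per-(src, dst) edge features.
def buildGraph(
  raw: SCollection[InteractionGraphRawInput]
): (SCollection[Vertex], SCollection[Edge]) =
  FeatureGeneratorUtil.getFeatures(raw)

// A single raw record, e.g. "user 1 favorited a tweet of user 2 yesterday":
val sample = InteractionGraphRawInput(
  src = 1L,
  dst = 2L,
  name = FeatureName.NumFavorites,
  age = 1, // days since the interaction
  featureValue = 1.0)
```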
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx new file mode 100644 index 000000000..b2167503c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx new file mode 100644 index 000000000..9744a6eb9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala deleted file mode 100644 index d8264fd8e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.FeatureName - -/** Interaction Graph Raw Input type defines a common type for edge / vertex feature calculation - * It has fields: (source Id, destination Id, Feature Name, age of this relationship (in days), - * and value to be aggregated) - */ -case class InteractionGraphRawInput( - src: Long, - dst: Long, - name: FeatureName, - age: Int, - featureValue: Double) - -case class FeatureKey( - src: Long, - dest: Long, - name: FeatureName) - -case class Tweepcred(userId: Long, tweepcred: Short) diff --git a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx new file mode 100644 index 000000000..82dcb38cf Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala deleted file mode 100644 index a23816078..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics -import com.twitter.timelines.real_graph.v1.thriftscala.RealGraphEdgeFeatures -import com.twitter.timelines.real_graph.v1.thriftscala.{ RealGraphEdgeFeature => RealGraphEdgeFeatureV1 } - -object ConversionUtil { - def toRealGraphEdgeFeatureV1(tss: TimeSeriesStatistics): RealGraphEdgeFeatureV1 = { - RealGraphEdgeFeatureV1( - mean = Some(tss.mean), - ewma = Some(tss.ewma), - m2ForVariance = Some(tss.m2ForVariance), - daysSinceLast = tss.numDaysSinceLast.map(_.toShort), - nonZeroDays = Some(tss.numNonZeroDays.toShort), - elapsedDays = Some(tss.numElapsedDays.toShort), - isMissing = Some(false) - ) - } - - /** - * Checks if the converted `RealGraphEdgeFeatures` has negative edge features. - * Our pipeline includes other negative interactions that aren't in the UserSession thrift, - * so we'll just filter them away for now (for parity).
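- * (The fields checked here mirror HEALTH_FEATURE_LIST in FeatureGroups: mutes, blocks, and abuse/spam reports.)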
- */ - def hasNegativeFeatures(rgef: RealGraphEdgeFeatures): Boolean = { - rgef.numMutes.nonEmpty || - rgef.numBlocks.nonEmpty || - rgef.numReportAsAbuses.nonEmpty || - rgef.numReportAsSpams.nonEmpty - } - - /** - * Checks if the converted `RealGraphEdgeFeatures` has some of the key interaction features present. - * This is adapted from Timelines' code. - */ - def hasTimelinesRequiredFeatures(rgef: RealGraphEdgeFeatures): Boolean = { - rgef.retweetsFeature.nonEmpty || - rgef.favsFeature.nonEmpty || - rgef.mentionsFeature.nonEmpty || - rgef.tweetClicksFeature.nonEmpty || - rgef.linkClicksFeature.nonEmpty || - rgef.profileViewsFeature.nonEmpty || - rgef.dwellTimeFeature.nonEmpty || - rgef.inspectedStatusesFeature.nonEmpty || - rgef.photoTagsFeature.nonEmpty || - rgef.numTweetQuotes.nonEmpty || - rgef.followFeature.nonEmpty || - rgef.mutualFollowFeature.nonEmpty || - rgef.addressBookEmailFeature.nonEmpty || - rgef.addressBookPhoneFeature.nonEmpty - } - - /** - * Convert an Edge into a RealGraphEdgeFeature. - * We return the converted RealGraphEdgeFeature when filterFn is true. - * This is to allow us to filter early on during the conversion if required, rather than map over the whole - * collection of records again to filter. - * - * @param filterFn true if and only if we want to keep the converted feature - */ - def toRealGraphEdgeFeatures( - filterFn: RealGraphEdgeFeatures => Boolean - )( - e: Edge - ): Option[RealGraphEdgeFeatures] = { - val baseFeature = RealGraphEdgeFeatures(destId = e.destinationId) - val aggregatedFeature = e.features.foldLeft(baseFeature) { - case (aggregatedFeature, edgeFeature) => - val f = Some(toRealGraphEdgeFeatureV1(edgeFeature.tss)) - ScioMetrics.counter("toRealGraphEdgeFeatures", edgeFeature.name.name).inc() - edgeFeature.name match { - case FeatureName.NumRetweets => aggregatedFeature.copy(retweetsFeature = f) - case FeatureName.NumFavorites => aggregatedFeature.copy(favsFeature = f) - case FeatureName.NumMentions => aggregatedFeature.copy(mentionsFeature = f) - case FeatureName.NumTweetClicks => aggregatedFeature.copy(tweetClicksFeature = f) - case FeatureName.NumLinkClicks => aggregatedFeature.copy(linkClicksFeature = f) - case FeatureName.NumProfileViews => aggregatedFeature.copy(profileViewsFeature = f) - case FeatureName.TotalDwellTime => aggregatedFeature.copy(dwellTimeFeature = f) - case FeatureName.NumInspectedStatuses => - aggregatedFeature.copy(inspectedStatusesFeature = f) - case FeatureName.NumPhotoTags => aggregatedFeature.copy(photoTagsFeature = f) - case FeatureName.NumFollows => aggregatedFeature.copy(followFeature = f) - case FeatureName.NumMutualFollows => aggregatedFeature.copy(mutualFollowFeature = f) - case FeatureName.AddressBookEmail => aggregatedFeature.copy(addressBookEmailFeature = f) - case FeatureName.AddressBookPhone => aggregatedFeature.copy(addressBookPhoneFeature = f) - case FeatureName.AddressBookInBoth => aggregatedFeature.copy(addressBookInBothFeature = f) - case FeatureName.AddressBookMutualEdgeEmail => - aggregatedFeature.copy(addressBookMutualEdgeEmailFeature = f) - case FeatureName.AddressBookMutualEdgePhone => - aggregatedFeature.copy(addressBookMutualEdgePhoneFeature = f) - case FeatureName.AddressBookMutualEdgeInBoth => - aggregatedFeature.copy(addressBookMutualEdgeInBothFeature = f) - case FeatureName.NumTweetQuotes => aggregatedFeature.copy(numTweetQuotes = f) - case FeatureName.NumBlocks => aggregatedFeature.copy(numBlocks = f) - case FeatureName.NumMutes => aggregatedFeature.copy(numMutes = f) - case
FeatureName.NumReportAsSpams => aggregatedFeature.copy(numReportAsSpams = f) - case FeatureName.NumReportAsAbuses => aggregatedFeature.copy(numReportAsAbuses = f) - case _ => aggregatedFeature - } - } - if (filterFn(aggregatedFeature)) - Some(aggregatedFeature.copy(weight = e.weight.orElse(Some(0.0)))) - else None - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx new file mode 100644 index 000000000..a2eb39c5f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala deleted file mode 100644 index f791d538a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.util.Duration -import org.joda.time.Interval - -object DateUtil { - def embiggen(dateInterval: Interval, duration: Duration): Interval = { - - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - val newEnd = dateInterval.getEnd.plusDays(days) - new Interval(newStart, newEnd) - } - - def subtract(dateInterval: Interval, duration: Duration): Interval = { - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - val newEnd = dateInterval.getEnd.minusDays(days) - new Interval(newStart, newEnd) - } - - def prependDays(dateInterval: Interval, duration: Duration): Interval = { - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - new Interval(newStart, dateInterval.getEnd.toInstant) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx new file mode 100644 index 000000000..649513ba4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala deleted file mode 100644 index 004a141bb..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala +++ /dev/null @@ -1,350 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.EdgeFeature -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics - -object EdgeFeatureCombiner { - def apply(srcId: Long, destId: Long): EdgeFeatureCombiner = new EdgeFeatureCombiner( - instanceEdge = Edge(srcId, destId), - featureMap = Map( - FeatureName.NumRetweets -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumFavorites -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumMentions -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumTweetClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumLinkClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumProfileViews -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumFollows -> new BooleanOrEdgeCombiner, - FeatureName.NumUnfollows -> new BooleanOrEdgeCombiner, - FeatureName.NumMutualFollows -> new BooleanOrEdgeCombiner, - FeatureName.NumBlocks 
-> new BooleanOrEdgeCombiner, - FeatureName.NumMutes -> new BooleanOrEdgeCombiner, - FeatureName.NumReportAsAbuses -> new BooleanOrEdgeCombiner, - FeatureName.NumReportAsSpams -> new BooleanOrEdgeCombiner, - FeatureName.NumTweetQuotes -> new WeightedAdditiveEdgeCombiner, - FeatureName.AddressBookEmail -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookPhone -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookInBoth -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgeEmail -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgePhone -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgeInBoth -> new BooleanOrEdgeCombiner, - FeatureName.TotalDwellTime -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumInspectedStatuses -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumPhotoTags -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumPushOpens -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumNtabClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtMentions -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtReplies -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtRetweets -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtFavories -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtLinkClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtTweetClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtTweetQuotes -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumShares -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumEmailOpen -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumEmailClick -> new WeightedAdditiveEdgeCombiner, - ) - ) -} - -/** - * This class can take in a number of input Edge thrift objects (all of which are assumed to - * contain information about a single edge) and builds a combined Edge thrift object, which has - * the union of all the input. - *
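- * Each FeatureName is mapped to either a WeightedAdditiveEdgeCombiner (for counter-like features) - * or a BooleanOrEdgeCombiner (for status-like features such as follows, blocks, mutes, and - * address-book edges). - *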
- * There are two modes of aggregation: one of them simply adds the values, assuming they are all - * from the same day; the other adds them in a time-decayed manner using the passed-in weights. - *
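- * (Same-day records enter through addFeature(edge); history is folded in through - * addFeature(edge, alpha, day) with a decay weight.) - *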
- * The input objects' features must be disjoint. Also, remember that the edge is directed! - */ -class EdgeFeatureCombiner(instanceEdge: Edge, featureMap: Map[FeatureName, EFeatureCombiner]) { - - /** - * Adds features without any decay. To be used for the same day. - * - * @param edge edge to be added into the combiner - */ - def addFeature(edge: Edge): EdgeFeatureCombiner = { - - val newEdge = - if (edge.weight.isDefined) instanceEdge.copy(weight = edge.weight) else instanceEdge - val newFeatures = featureMap.map { - case (featureName, combiner) => - edge.features.find(_.name.equals(featureName)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature) else combiner.setFeature(feature) - (featureName, updatedCombiner) - case _ => (featureName, combiner) - } - } - - new EdgeFeatureCombiner(newEdge, newFeatures) - - } - - /** - * Adds features with decays. Used for combining multiple days. - * - * @param edge edge to be added into the combiner - * @param alpha parameters for the decay calculation - * @param day number of days from today - */ - def addFeature(edge: Edge, alpha: Double, day: Int): EdgeFeatureCombiner = { - - // build on the instance edge (as in the non-decayed overload); only the weight comes from the incoming edge - val newEdge = - if (edge.weight.isDefined) instanceEdge.copy(weight = edge.weight) else instanceEdge - val newFeatures = featureMap.map { - case (featureName, combiner) => - edge.features.find(_.name.equals(featureName)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature, alpha, day) - else combiner.setFeature(feature, alpha, day) - ScioMetrics.counter("EdgeFeatureCombiner.addFeature", feature.name.name).inc() - (featureName, updatedCombiner) - case _ => (featureName, combiner) - } - } - new EdgeFeatureCombiner(newEdge, newFeatures) - } - - /** - * Generate the final combined Edge instance. - * We return a deterministically sorted list of edge features. - * - * @param totalDays total number of days to be combined together - */ - def getCombinedEdge(totalDays: Int): Edge = { - val moreFeatures = featureMap.values - .flatMap { combiner => - combiner.getFinalFeature(totalDays) - }.toList.sortBy(_.name.value) - instanceEdge.copy( - features = moreFeatures - ) - } - -} - -/** - * This portion contains the actual combination logic. For now, we only implement a simple - * additive combiner, but in the future we'd like to have things like time-weighted (exponential - * decay, maybe) values.
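- * (For instance, assuming a daily decay weight of 0.955, the base used for MIN_FEATURE_VALUE in - * InteractionGraphUtils, a one-off interaction decays to 0.955^60 ≈ 0.063 over the 60-day - * retention window, the floor at which dropFeature discards it.)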
- */ - -trait EFeatureCombiner { - val edgeFeature: Option[EdgeFeature] - val startingDay: Int - val endingDay: Int - val timeSeriesStatistics: Option[TimeSeriesStatistics] - - def updateTSS(feature: EdgeFeature, alpha: Double): Option[TimeSeriesStatistics] - - def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] - - def updateFeature(feature: EdgeFeature): EFeatureCombiner - - def updateFeature(feature: EdgeFeature, alpha: Double, day: Int): EFeatureCombiner - - def isSet: Boolean - - def dropFeature: Boolean - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): EFeatureCombiner - - def setFeature(feature: EdgeFeature): EFeatureCombiner - - def getFinalFeature(totalDays: Int): Option[EdgeFeature] - -} - -case class WeightedAdditiveEdgeCombiner( - override val edgeFeature: Option[EdgeFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends EFeatureCombiner { - - override def updateTSS( - feature: EdgeFeature, - alpha: Double - ): Option[TimeSeriesStatistics] = { - timeSeriesStatistics.map(tss => - InteractionGraphUtils.updateTimeSeriesStatistics(tss, feature.tss.mean, alpha)) - } - - override def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] = { - timeSeriesStatistics.map(tss => - InteractionGraphUtils.addToTimeSeriesStatistics(tss, feature.tss.mean)) - } - - override def updateFeature(feature: EdgeFeature): WeightedAdditiveEdgeCombiner = { - WeightedAdditiveEdgeCombiner( - edgeFeature, - startingDay, - endingDay, - addToTSS(feature) - ) - } - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): WeightedAdditiveEdgeCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast - else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1) - - val tss = feature.tss.copy( - numDaysSinceLast = numDaysSinceLast, - ewma = alpha * feature.tss.ewma - ) - - val newFeature = EdgeFeature( - name = feature.name, - tss = tss - ) - - WeightedAdditiveEdgeCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - def getFinalFeature(totalDays: Int): Option[EdgeFeature] = { - if (edgeFeature.isEmpty || dropFeature) return None - - val newTss = if (totalDays > 0) { - val elapsed = - timeSeriesStatistics.map(tss => tss.numElapsedDays + totalDays - 1 - startingDay) - - val latest = - if (endingDay > 0) Some(totalDays - endingDay) - else - timeSeriesStatistics.flatMap(tss => - tss.numDaysSinceLast.map(numDaysSinceLast => numDaysSinceLast + totalDays - 1)) - - timeSeriesStatistics.map(tss => - tss.copy( - numElapsedDays = elapsed.get, - numDaysSinceLast = latest - )) - } else timeSeriesStatistics - - edgeFeature.map(ef => ef.copy(tss = newTss.get)) - } - - override def updateFeature( - feature: EdgeFeature, - alpha: Double, - day: Int - ): WeightedAdditiveEdgeCombiner = copy( - endingDay = Math.max(endingDay, day), - timeSeriesStatistics = updateTSS(feature, alpha) - ) - - override def dropFeature: Boolean = timeSeriesStatistics.exists(tss => - tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) || - tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE) - - override def isSet = edgeFeature.isDefined - - override def setFeature(feature: EdgeFeature): WeightedAdditiveEdgeCombiner = - setFeature(feature, 
1.0, 0) - -} - -/** - * This combiner resets the value to 0 if the latest event being combined = 0. Ignores time decays. - */ -case class BooleanOrEdgeCombiner( - override val edgeFeature: Option[EdgeFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends EFeatureCombiner { - - override def updateTSS( - feature: EdgeFeature, - alpha: Double - ): Option[TimeSeriesStatistics] = { - val value = timeSeriesStatistics.map(tss => Math.floor(tss.ewma)) - val newValue = if (value.exists(_ == 1.0) || feature.tss.mean > 0.0) 1.0 else 0.0 - timeSeriesStatistics.map(tss => - tss.copy( - mean = newValue, - ewma = newValue, - numNonZeroDays = tss.numNonZeroDays + 1 - )) - } - - override def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] = { - val value = timeSeriesStatistics.map(tss => Math.floor(tss.ewma)) - val newValue = if (value.exists(_ == 1.0) || feature.tss.mean > 0.0) 1.0 else 0.0 - timeSeriesStatistics.map(tss => tss.copy(mean = newValue, ewma = newValue)) - } - - override def updateFeature(feature: EdgeFeature): BooleanOrEdgeCombiner = BooleanOrEdgeCombiner( - edgeFeature, - startingDay, - endingDay, - addToTSS(feature) - ) - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): BooleanOrEdgeCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast.get - else feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1 - - val tss = feature.tss.copy( - numDaysSinceLast = Some(numDaysSinceLast), - ewma = alpha * feature.tss.ewma - ) - - val newFeature = EdgeFeature( - name = feature.name, - tss = tss - ) - - BooleanOrEdgeCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - override def getFinalFeature(totalDays: Int): Option[EdgeFeature] = - if (timeSeriesStatistics.exists(tss => tss.ewma < 1.0)) None - else { - if (edgeFeature.isEmpty || dropFeature) return None - edgeFeature.map(ef => - ef.copy( - tss = timeSeriesStatistics.get - )) - } - - override def updateFeature( - feature: EdgeFeature, - alpha: Double, - day: Int - ): BooleanOrEdgeCombiner = copy( - endingDay = Math.max(endingDay, day), - timeSeriesStatistics = updateTSS(feature, alpha) - ) - - override def dropFeature: Boolean = false // we will keep rolling up status-based features - - override def isSet = edgeFeature.isDefined - - override def setFeature(feature: EdgeFeature): BooleanOrEdgeCombiner = setFeature(feature, 1.0, 0) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx new file mode 100644 index 000000000..e7e591e87 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala deleted file mode 100644 index 56c403522..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala +++ /dev/null @@ -1,263 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import 
com.twitter.interaction_graph.scio.common.FeatureGroups.DWELL_TIME_FEATURE_LIST -import com.twitter.interaction_graph.scio.common.FeatureGroups.STATUS_FEATURE_LIST -import com.twitter.interaction_graph.scio.common.UserUtil.DUMMY_USER_ID -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.EdgeFeature -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.interaction_graph.thriftscala.VertexFeature - -object FeatureGeneratorUtil { - - // Initialize a TimeSeriesStatistics object by (value, age) pair - def initializeTSS(featureValue: Double, age: Int = 1): TimeSeriesStatistics = - TimeSeriesStatistics( - mean = featureValue, - m2ForVariance = 0.0, - ewma = featureValue, - numElapsedDays = age, - numNonZeroDays = age, - numDaysSinceLast = Some(age) - ) - - /** - * Create vertex feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue) - * We will represent non-directional features (eg num_create_tweets) as "outgoing" values. - * @return - */ - def getVertexFeature( - input: SCollection[InteractionGraphRawInput] - ): SCollection[Vertex] = { - // For vertex features we need to calculate both in and out featureValue - val vertexAggregatedFeatureValues = input - .flatMap { input => - if (input.dst != DUMMY_USER_ID) { - Seq( - ((input.src, input.name.value), (input.featureValue, 0.0)), - ((input.dst, input.name.value), (0.0, input.featureValue)) - ) - } else { - // we put the non-directional features as "outgoing" values - Seq(((input.src, input.name.value), (input.featureValue, 0.0))) - } - } - .sumByKey - .map { - case ((userId, nameId), (outEdges, inEdges)) => - (userId, (FeatureName(nameId), outEdges, inEdges)) - }.groupByKey - - vertexAggregatedFeatureValues.map { - case (userId, records) => - // sort features by FeatureName for deterministic order (esp during testing) - val features = records.toSeq.sortBy(_._1.value).flatMap { - case (name, outEdges, inEdges) => - // create out vertex features - val outFeatures = if (outEdges > 0) { - val outTss = initializeTSS(outEdges) - List( - VertexFeature( - name = name, - outgoing = true, - tss = outTss - )) - } else Nil - - // create in vertex features - val inFeatures = if (inEdges > 0) { - val inTss = initializeTSS(inEdges) - List( - VertexFeature( - name = name, - outgoing = false, - tss = inTss - )) - } else Nil - - outFeatures ++ inFeatures - } - Vertex(userId = userId, features = features) - } - } - - /** - * Create edge feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue) - * We will exclude all non-directional features (eg num_create_tweets) from all edge aggregates - */ - def getEdgeFeature( - input: SCollection[InteractionGraphRawInput] - ): SCollection[Edge] = { - input - .withName("filter non-directional features") - .flatMap { input => - if (input.dst != DUMMY_USER_ID) { - ScioMetrics.counter("getEdgeFeature", s"directional feature ${input.name.name}").inc() - Some(((input.src, input.dst), (input.name, input.age, input.featureValue))) - } else { - ScioMetrics.counter("getEdgeFeature", s"non-directional feature ${input.name.name}").inc() - None - } - } - .withName("group features by pairs") - .groupByKey - .map { - case ((src, dst), records) => - // sort features by FeatureName for deterministic order (esp during testing) - val features = 
records.toSeq.sortBy(_._1.value).map { - case (name, age, featureValue) => - val tss = initializeTSS(featureValue, age) - EdgeFeature( - name = name, - tss = tss - ) - } - Edge( - sourceId = src, - destinationId = dst, - weight = Some(0.0), - features = features.toSeq - ) - } - } - - // For the same user id, combine the different vertex feature records into one record. - // The input assumes there is only one record per (userId, featureName, direction). - def combineVertexFeatures( - vertex: SCollection[Vertex], - ): SCollection[Vertex] = { - vertex - .groupBy { v: Vertex => - v.userId - } - .map { - case (userId, vertexes) => - val combiner = vertexes.foldLeft(VertexFeatureCombiner(userId)) { - case (combiner, vertex) => - combiner.addFeature(vertex) - } - combiner.getCombinedVertex(0) - } - - } - - def combineEdgeFeatures( - edge: SCollection[Edge] - ): SCollection[Edge] = { - edge - .groupBy { e => - (e.sourceId, e.destinationId) - } - .withName("combining edge features for each (src, dst)") - .map { - case ((src, dst), edges) => - val combiner = edges.foldLeft(EdgeFeatureCombiner(src, dst)) { - case (combiner, edge) => - combiner.addFeature(edge) - } - combiner.getCombinedEdge(0) - } - } - - def combineVertexFeaturesWithDecay( - history: SCollection[Vertex], - daily: SCollection[Vertex], - historyWeight: Double, - dailyWeight: Double - ): SCollection[Vertex] = { - - history - .keyBy(_.userId) - .cogroup(daily.keyBy(_.userId)).map { - case (userId, (h, d)) => - // Adding history iterators - val historyCombiner = h.toList.foldLeft(VertexFeatureCombiner(userId)) { - case (combiner, vertex) => - combiner.addFeature(vertex, historyWeight, 0) - } - // Adding daily iterators - val finalCombiner = d.toList.foldLeft(historyCombiner) { - case (combiner, vertex) => - combiner.addFeature(vertex, dailyWeight, 1) - } - - finalCombiner.getCombinedVertex( - 2 - ) // 2 because we are combining two days of data (yesterday and today) - } - } - - def combineEdgeFeaturesWithDecay( - history: SCollection[Edge], - daily: SCollection[Edge], - historyWeight: Double, - dailyWeight: Double - ): SCollection[Edge] = { - - history - .keyBy { e => - (e.sourceId, e.destinationId) - } - .withName("combine history and daily edges with decay") - .cogroup(daily.keyBy { e => - (e.sourceId, e.destinationId) - }).map { - case ((src, dst), (h, d)) => - // Adding history iterators - val historyCombiner = h.toList.foldLeft(EdgeFeatureCombiner(src, dst)) { - case (combiner, edge) => - combiner.addFeature(edge, historyWeight, 0) - } - - val finalCombiner = d.toList.foldLeft(historyCombiner) { - case (combiner, edge) => - combiner.addFeature(edge, dailyWeight, 1) - } - - finalCombiner.getCombinedEdge( - 2 - ) // 2 because we are combining two days of data (yesterday and today) - - } - } - - /** - * Create features from the input graph tuples (src, dst, age, featureValue). - * Note that we will filter out vertex features represented as edges from the edge output.
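- * (Vertex-only records carry UserUtil.DUMMY_USER_ID = -1 as their destination and are dropped by - * getEdgeFeature above.)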
- */ - def getFeatures( - input: SCollection[InteractionGraphRawInput] - ): (SCollection[Vertex], SCollection[Edge]) = { - (getVertexFeature(input), getEdgeFeature(input)) - } - - // remove the edge features that come from flock, address book, or sms, as we refresh them on a daily basis - def removeStatusFeatures(e: Edge): Seq[Edge] = { - val updatedFeatureList = e.features.filter { f => - !STATUS_FEATURE_LIST.contains(f.name) - } - if (updatedFeatureList.nonEmpty) { - val edge = Edge( - sourceId = e.sourceId, - destinationId = e.destinationId, - weight = e.weight, - features = updatedFeatureList - ) - Seq(edge) - } else - Nil - } - - // check if the edge has features other than the dwell time features - def edgeWithFeatureOtherThanDwellTime(e: Edge): Boolean = { - e.features.exists { f => - !DWELL_TIME_FEATURE_LIST.contains(f.name) - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx new file mode 100644 index 000000000..eacbc45ef Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala deleted file mode 100644 index 89887be99..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.FeatureName - -object FeatureGroups { - - val HEALTH_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.NumMutes, - FeatureName.NumBlocks, - FeatureName.NumReportAsSpams, - FeatureName.NumReportAsAbuses - ) - - val STATUS_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.AddressBookEmail, - FeatureName.AddressBookPhone, - FeatureName.AddressBookInBoth, - FeatureName.AddressBookMutualEdgeEmail, - FeatureName.AddressBookMutualEdgePhone, - FeatureName.AddressBookMutualEdgeInBoth, - FeatureName.NumFollows, - FeatureName.NumUnfollows, - FeatureName.NumMutualFollows - ) ++ HEALTH_FEATURE_LIST - - val DWELL_TIME_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.TotalDwellTime, - FeatureName.NumInspectedStatuses - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx new file mode 100644 index 000000000..1b0ca045b Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala deleted file mode 100644 index f94c136df..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.socialgraph.presto.thriftscala.{Edge => SocialGraphEdge} -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.interaction_graph.scio.common.FeatureGroups.HEALTH_FEATURE_LIST -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName - -import java.time.Instant -import java.time.temporal.ChronoUnit - -object GraphUtil { - - /** - * Convert FlockEdge into the common
InteractionGraphRawInput class. - * The updatedAt field is in seconds. - */ - def getFlockFeatures( - edges: SCollection[FlockEdge], - featureName: FeatureName, - currentTimeMillis: Long - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting flock edge to interaction graph input") - .map { edge => - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(edge.updatedAt * 1000L), // updatedAt is in seconds - Instant.ofEpochMilli(currentTimeMillis) - ) - InteractionGraphRawInput( - edge.sourceId, - edge.destinationId, - featureName, - age.max(0).toInt, - 1.0) - } - } - - /** - * Convert com.twitter.socialgraph.presto.thriftscala.Edge (from unfollows) into the common InteractionGraphRawInput class. - * The updatedAt field in socialgraph.unfollows is in seconds. - */ - def getSocialGraphFeatures( - edges: SCollection[SocialGraphEdge], - featureName: FeatureName, - currentTimeMillis: Long - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting social graph edge to interaction graph input") - .map { edge => - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(edge.updatedAt * 1000L), // updatedAt is in seconds - Instant.ofEpochMilli(currentTimeMillis) - ) - InteractionGraphRawInput( - edge.sourceId, - edge.destinationId, - featureName, - age.max(0).toInt, - 1.0) - } - } - - def isFollow(edge: Edge): Boolean = { - edge.features - .find(_.name == FeatureName.NumFollows) - .exists(_.tss.mean == 1.0) - } - - def filterExtremes(edge: Edge): Boolean = { - if (edge.weight.exists(_.isNaN)) { - ScioMetrics.counter("filter extremes", "nan").inc() - false - } else if (edge.weight.contains(Double.MaxValue)) { - ScioMetrics.counter("filter extremes", "max value").inc() - false - } else if (edge.weight.contains(Double.PositiveInfinity)) { - ScioMetrics.counter("filter extremes", "+ve inf").inc() - false - } else if (edge.weight.exists(_ < 0.0)) { - ScioMetrics.counter("filter extremes", "negative").inc() - false - } else { - true - } - } - - def filterNegative(edge: Edge): Boolean = { - !edge.features.find(ef => HEALTH_FEATURE_LIST.contains(ef.name)).exists(_.tss.mean > 0.0) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx new file mode 100644 index 000000000..3f5a229f9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala deleted file mode 100644 index be6aa0153..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics - -object InteractionGraphUtils { - final val MIN_FEATURE_VALUE = Math.pow(0.955, 60) - final val MAX_DAYS_RETENTION = 60L - final val MILLISECONDS_PER_DAY = 1000 * 60 * 60 * 24 - - def updateTimeSeriesStatistics( - timeSeriesStatistics: TimeSeriesStatistics, - currValue: Double, - alpha: Double - ): TimeSeriesStatistics = { - val numNonZeroDays = timeSeriesStatistics.numNonZeroDays + 1 - - val delta = currValue - timeSeriesStatistics.mean - val updatedMean = timeSeriesStatistics.mean + delta / numNonZeroDays
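- // Welford's online update: m2ForVariance accumulates the running sum of squared deviations, - // so the sample variance is m2ForVariance / (numNonZeroDays - 1); ewma is an exponentially - // weighted running sum, with the current value weighted by alpha.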
- val m2ForVariance = timeSeriesStatistics.m2ForVariance + delta * (currValue - updatedMean) - val ewma = alpha * currValue + timeSeriesStatistics.ewma - - timeSeriesStatistics.copy( - mean = updatedMean, - m2ForVariance = m2ForVariance, - ewma = ewma, - numNonZeroDays = numNonZeroDays - ) - } - - def addToTimeSeriesStatistics( - timeSeriesStatistics: TimeSeriesStatistics, - currValue: Double - ): TimeSeriesStatistics = { - timeSeriesStatistics.copy( - mean = timeSeriesStatistics.mean + currValue, - ewma = timeSeriesStatistics.ewma + currValue - ) - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx new file mode 100644 index 000000000..e703114c8 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala deleted file mode 100644 index 39ac51006..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala +++ /dev/null @@ -1,76 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser - -object UserUtil { - - /** - * placeholder for the destId when representing vertex features with no dest (eg create tweet) - * this will only be aggregated and saved in the vertex datasets but not the edge datasets - */ - val DUMMY_USER_ID = -1L - def getValidUsers(users: SCollection[CombinedUser]): SCollection[Long] = { - users - .flatMap { u => - for { - user <- u.user - if user.id != 0 - safety <- user.safety - if !(safety.suspended || safety.deactivated || safety.restricted || - safety.nsfwUser || safety.nsfwAdmin || safety.erased) - } yield { - user.id - } - } - } - - def getValidFlatUsers(users: SCollection[FlatUser]): SCollection[Long] = { - users - .flatMap { u => - for { - id <- u.id - if id != 0 && u.validUser.contains(true) - } yield { - id - } - } - } - - def getInvalidUsers(users: SCollection[FlatUser]): SCollection[Long] = { - users - .flatMap { user => - for { - valid <- user.validUser - if !valid - id <- user.id - } yield id - } - } - - def filterUsersByIdMapping[T: Coder]( - input: SCollection[T], - usersToBeFiltered: SCollection[Long], - userIdMapping: T => Long - ): SCollection[T] = { - input - .withName("filter users by id") - .keyBy(userIdMapping(_)) - .leftOuterJoin[Long](usersToBeFiltered.map(x => (x, x))) - .collect { - // only return data if the key is not in the list of usersToBeFiltered - case (_, (data, None)) => data - } - } - - def filterUsersByMultipleIdMappings[T: Coder]( - input: SCollection[T], - usersToBeFiltered: SCollection[Long], - userIdMappings: Seq[T => Long] - ): SCollection[T] = { - userIdMappings.foldLeft(input)((data, mapping) => - filterUsersByIdMapping(data, usersToBeFiltered, mapping)) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx new file mode 100644 index 000000000..fd0ac4588 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala 
b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala deleted file mode 100644 index fb7ae7947..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala +++ /dev/null @@ -1,342 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.interaction_graph.thriftscala.VertexFeature - -object VertexFeatureCombiner { - def apply(userId: Long): VertexFeatureCombiner = new VertexFeatureCombiner( - instanceVertex = Vertex(userId), - featureMap = Map( - (FeatureName.NumRetweets, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRetweets, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumFavorites, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumFavorites, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumMentions, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumMentions, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumTweetClicks, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumTweetClicks, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumLinkClicks, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumLinkClicks, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumProfileViews, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumProfileViews, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumFollows, true) -> new ReplacementVertexCombiner, - (FeatureName.NumFollows, false) -> new ReplacementVertexCombiner, - (FeatureName.NumUnfollows, true) -> new ReplacementVertexCombiner, - (FeatureName.NumUnfollows, false) -> new ReplacementVertexCombiner, - (FeatureName.NumMutualFollows, true) -> new ReplacementVertexCombiner, - (FeatureName.NumBlocks, true) -> new ReplacementVertexCombiner, - (FeatureName.NumBlocks, false) -> new ReplacementVertexCombiner, - (FeatureName.NumMutes, true) -> new ReplacementVertexCombiner, - (FeatureName.NumMutes, false) -> new ReplacementVertexCombiner, - (FeatureName.NumReportAsAbuses, true) -> new ReplacementVertexCombiner, - (FeatureName.NumReportAsAbuses, false) -> new ReplacementVertexCombiner, - (FeatureName.NumReportAsSpams, true) -> new ReplacementVertexCombiner, - (FeatureName.NumReportAsSpams, false) -> new ReplacementVertexCombiner, - (FeatureName.NumTweetQuotes, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumTweetQuotes, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumMutualFollows, false) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookEmail, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookEmail, false) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookPhone, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookPhone, false) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookInBoth, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookInBoth, false) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookMutualEdgeEmail, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookMutualEdgeEmail, false) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookMutualEdgePhone, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookMutualEdgePhone, false) -> new ReplacementVertexCombiner, - 
(FeatureName.AddressBookMutualEdgeInBoth, true) -> new ReplacementVertexCombiner, - (FeatureName.AddressBookMutualEdgeInBoth, false) -> new ReplacementVertexCombiner, - (FeatureName.TotalDwellTime, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.TotalDwellTime, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumInspectedStatuses, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumInspectedStatuses, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumPhotoTags, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumPhotoTags, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumPushOpens, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumPushOpens, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumNtabClicks, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumNtabClicks, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtFavories, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtFavories, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtTweetQuotes, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtTweetQuotes, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtTweetClicks, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtTweetClicks, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtRetweets, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtRetweets, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtReplies, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtReplies, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtLinkClicks, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtLinkClicks, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtMentions, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumRtMentions, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumShares, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumShares, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumEmailOpen, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumEmailOpen, false) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumEmailClick, true) -> new WeightedAdditiveVertexCombiner, - (FeatureName.NumEmailClick, false) -> new WeightedAdditiveVertexCombiner, - ) - ) -} - -/** - * This class can take in a number of input Vertex thrift objects (all of which are assumed to - * contain information about a single vertex) and builds a combined Vertex protobuf object, which - * has the union of all the input. Note that we do a weighted addition for a time-decayed value. - *
- * The input objects' features must be disjoint. Also, remember that the Vertex is directed! - */ -class VertexFeatureCombiner( - instanceVertex: Vertex, - featureMap: Map[(FeatureName, Boolean), VFeatureCombiner]) { - - /** - * Adds features without any decay. To be used for the same day. - * - * @param vertex vertex to be added into the combiner - */ - def addFeature(vertex: Vertex): VertexFeatureCombiner = { - val newVertex = instanceVertex.copy(weight = vertex.weight) - val newFeatures = featureMap.map { - case ((featureName, outgoing), combiner) => - vertex.features.find(f => f.name.equals(featureName) && f.outgoing.equals(outgoing)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature) else combiner.setFeature(feature) - ((featureName, outgoing), updatedCombiner) - case _ => ((featureName, outgoing), combiner) - } - } - - new VertexFeatureCombiner(newVertex, newFeatures) - } - - /** - * Adds features with decays. Used for combining multiple days. - * - * @param vertex vertex to be added into the combiner - * @param alpha parameters for the decay calculation - * @param day number of days from today - */ - def addFeature(vertex: Vertex, alpha: Double, day: Int): VertexFeatureCombiner = { - - val newVertex = instanceVertex.copy(weight = vertex.weight) - val newFeatures = featureMap.map { - case ((featureName, outgoing), combiner) => - vertex.features.find(f => f.name.equals(featureName) && f.outgoing.equals(outgoing)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature, alpha, day) - else combiner.setFeature(feature, alpha, day) - ((featureName, outgoing), updatedCombiner) - case _ => ((featureName, outgoing), combiner) - } - } - - new VertexFeatureCombiner(newVertex, newFeatures) - } - - /** - * Generate the final combined Vertex instance - * - * @param totalDays total number of days to be combined together - */ - def getCombinedVertex(totalDays: Int): Vertex = { - val moreFeatures = featureMap.values.flatMap { - case combiner => combiner.getFinalFeature(totalDays) - } - instanceVertex.copy(features = moreFeatures.toSeq) - } - -} - -/** - * This portion contains the actual combination logic. For now, we only implement a simple - * additive combiner, but in the future we'd like to have things like time-weighted (exponential - * decay, maybe) values.
- */ -trait VFeatureCombiner { - val startingDay: Int - val endingDay: Int - val timeSeriesStatistics: Option[TimeSeriesStatistics] - val vertexFeature: Option[VertexFeature] - - def updateTss(feature: VertexFeature, alpha: Double): VFeatureCombiner - def addToTss(feature: VertexFeature): VFeatureCombiner - def updateFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner - def updateFeature(feature: VertexFeature): VFeatureCombiner - def isSet: Boolean - def dropFeature: Boolean - def setFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner - def setFeature(feature: VertexFeature): VFeatureCombiner - def getFinalFeature(totalDays: Int): Option[VertexFeature] -} - -case class WeightedAdditiveVertexCombiner( - override val vertexFeature: Option[VertexFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends VFeatureCombiner { - override def updateTss( - feature: VertexFeature, - alpha: Double - ): WeightedAdditiveVertexCombiner = copy(timeSeriesStatistics = timeSeriesStatistics.map(tss => - InteractionGraphUtils.updateTimeSeriesStatistics(tss, feature.tss.mean, alpha))) - - override def addToTss(feature: VertexFeature): WeightedAdditiveVertexCombiner = - copy(timeSeriesStatistics = timeSeriesStatistics.map(tss => - InteractionGraphUtils.addToTimeSeriesStatistics(tss, feature.tss.mean))) - - override def updateFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner = { - updateTss(feature, alpha).copy( - vertexFeature, - startingDay = startingDay, - endingDay = Math.max(endingDay, day) - ) - } - - override def updateFeature(feature: VertexFeature): VFeatureCombiner = - addToTss(feature) - - override def setFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast - else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1) - - val tss = feature.tss.copy(numDaysSinceLast = numDaysSinceLast) - - val newFeature = VertexFeature( - name = feature.name, - outgoing = feature.outgoing, - tss = tss - ) - - WeightedAdditiveVertexCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - def getFinalFeature(totalDays: Int): Option[VertexFeature] = { - if (vertexFeature.isEmpty || dropFeature) return None - - val newTss = if (totalDays > 0) { - val elapsed = - timeSeriesStatistics.map(tss => tss.numElapsedDays + totalDays - 1 - startingDay) - val latest = - if (endingDay > 0) Some(totalDays - endingDay) - else timeSeriesStatistics.map(tss => tss.numDaysSinceLast.get + totalDays - 1) - - timeSeriesStatistics.map(tss => - tss.copy( - numElapsedDays = elapsed.get, - numDaysSinceLast = latest - )) - } else timeSeriesStatistics - - vertexFeature.map(vf => vf.copy(tss = newTss.get)) - } - - override def setFeature(feature: VertexFeature): VFeatureCombiner = setFeature(feature, 1.0, 0) - override def isSet: Boolean = vertexFeature.isDefined - override def dropFeature: Boolean = - timeSeriesStatistics.exists(tss => - tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) && - tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE) -} - -/** - * This combiner always replaces the old value with the current. Ignores time-decays. 
- */ -case class ReplacementVertexCombiner( - override val vertexFeature: Option[VertexFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends VFeatureCombiner { - override def updateTss( - feature: VertexFeature, - alpha: Double - ): ReplacementVertexCombiner = setFeature(feature, 1.0, 0) - - override def addToTss(feature: VertexFeature): ReplacementVertexCombiner = - setFeature(feature, 1.0, 0) - - override def updateFeature( - feature: VertexFeature, - alpha: Double, - day: Int - ): ReplacementVertexCombiner = updateTss(feature, alpha).copy( - vertexFeature, - startingDay = startingDay, - endingDay = Math.max(endingDay, day) - ) - - override def updateFeature(feature: VertexFeature): ReplacementVertexCombiner = - addToTss(feature) - - override def setFeature( - feature: VertexFeature, - alpha: Double, - day: Int - ): ReplacementVertexCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast - else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1) - - val tss = feature.tss.copy(numDaysSinceLast = numDaysSinceLast) - - val newFeature = VertexFeature( - name = feature.name, - outgoing = feature.outgoing, - tss = tss - ) - - ReplacementVertexCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - override def getFinalFeature(totalDays: Int): Option[VertexFeature] = { - if (vertexFeature.isEmpty || dropFeature) return None - if (timeSeriesStatistics.exists(tss => tss.ewma < 1.0)) return None - val newTss = if (totalDays > 0) { - val latest = - if (endingDay > 0) totalDays - endingDay - else timeSeriesStatistics.get.numDaysSinceLast.get + totalDays - 1 - - timeSeriesStatistics.map(tss => - tss.copy( - numElapsedDays = 1, - numDaysSinceLast = Some(latest) - )) - } else timeSeriesStatistics - - vertexFeature.map(vf => vf.copy(tss = newTss.get)) - } - - override def setFeature(feature: VertexFeature): VFeatureCombiner = setFeature(feature, 1.0, 0) - override def isSet: Boolean = vertexFeature.isDefined - override def dropFeature: Boolean = - timeSeriesStatistics.exists(tss => - tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) && - tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE) -}
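The combiner classes in the file above fold each day's features into a running TimeSeriesStatistics through the InteractionGraphUtils.updateTimeSeriesStatistics and addToTimeSeriesStatistics helpers from earlier in this diff. A minimal plain-Scala sketch of that update pattern, assuming the standard Welford recurrence for the running mean (the case class, alpha, and sample values below are illustrative stand-ins, not the thrift types):

```scala
// Stand-in for the thrift TimeSeriesStatistics, for illustration only.
case class Stats(mean: Double, m2ForVariance: Double, ewma: Double, numNonZeroDays: Int)

def update(prev: Stats, currValue: Double, alpha: Double): Stats = {
  val numNonZeroDays = prev.numNonZeroDays + 1
  // Welford-style running mean and M2 over the non-zero days seen so far.
  val delta = currValue - prev.mean
  val updatedMean = prev.mean + delta / numNonZeroDays
  val m2ForVariance = prev.m2ForVariance + delta * (currValue - updatedMean)
  // Today's value enters the decayed sum scaled by alpha, on top of the
  // already-accumulated history, mirroring the update shown at the top of this diff.
  val ewma = alpha * currValue + prev.ewma
  Stats(updatedMean, m2ForVariance, ewma, numNonZeroDays)
}

// Folding three daily observations with a hypothetical alpha of 0.8:
val folded = Seq(1.0, 2.0, 4.0).foldLeft(Stats(0, 0, 0, 0))(update(_, _, 0.8))
```

The resulting ewma is the quantity that dropFeature above compares against InteractionGraphUtils.MIN_FEATURE_VALUE when deciding whether a stale feature should be dropped.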
"src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_email:interaction_graph_extended_email_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_agg_notifications_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_retweets:interaction_graph_extended_retweet_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_shares:interaction_graph_extended_share_edge_daily-scala", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - ], -) - -jvm_binary( - name = "interaction_graph_labels", - main = "com.twitter.interaction_graph.scio.ml.labels.InteractionGraphLabelsJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":labels", - ], -) - -create_datasets( - base_name = "interaction_graph_labels_daily", - description = "Daily labels", - java_schema = "com.twitter.interaction_graph.thriftjava.EdgeLabel", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.EdgeLabel", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx new file mode 100644 index 000000000..07b5ce027 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx new file mode 100644 index 000000000..f99e381fd Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala deleted file mode 100644 index a6d9999c8..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala +++ /dev/null @@ -1,123 +0,0 @@ -package com.twitter.interaction_graph.scio.ml.labels - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.conversions.DurationOps._ -import com.twitter.dal.client.dataset.TimePartitionedDALDataset -import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_notifications.InteractionGraphAggNotificationsEdgeDailyScalaDataset -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.EdgeLabel 
-import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.socialgraph.event.thriftscala.FollowEvent -import com.twitter.socialgraph.hadoop.SocialgraphFollowEventsScalaDataset -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.joda.time.Interval - -object InteractionGraphLabelsJob extends ScioBeamJob[InteractionGraphLabelsOption] { - - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphLabelsOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - - val bqTableName: String = pipelineOptions.getBqTableName - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - def readPartition[T: Manifest](dataset: TimePartitionedDALDataset[T]): SCollection[T] = { - SourceUtil.readDALDataset[T]( - dataset = dataset, - interval = dateInterval, - dalEnvironment = dalEnvironment - ) - } - - val follows = readPartition[FollowEvent](SocialgraphFollowEventsScalaDataset) - .flatMap(LabelUtil.fromFollowEvent) - - val directInteractions = - readPartition[Edge](InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset) - .flatMap(LabelUtil.fromInteractionGraphEdge) - - val clientEvents = - readPartition[Edge](InteractionGraphAggClientEventLogsEdgeDailyScalaDataset) - .flatMap(LabelUtil.fromInteractionGraphEdge) - - val pushEvents = - readPartition[Edge](InteractionGraphAggNotificationsEdgeDailyScalaDataset) - .flatMap(LabelUtil.fromInteractionGraphEdge) - - - val labels = groupLabels( - follows ++ - directInteractions ++ - clientEvents ++ - pushEvents) - - labels.saveAsCustomOutput( - "Write Edge Labels", - DAL.write[EdgeLabel]( - InteractionGraphLabelsDailyScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - // save to BQ - if (pipelineOptions.getBqTableName != null) { - val ingestionTime = pipelineOptions.getDate().value.getStart.toDate - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionTime)) - val timePartitioning = new TimePartitioning() - .setType("DAY").setField("dateHour").setExpirationMs(90.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[EdgeLabel] - .to(bqTableName) - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId("twttr-recos-ml-prod") - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) - labels - .saveAsCustomOutput( - s"Save Recommendations to BQ $bqTableName", - bqWriter - ) - } - - } - - def groupLabels(labels: SCollection[EdgeLabel]): SCollection[EdgeLabel] = { - labels - .map { e: EdgeLabel => ((e.sourceId, e.destinationId), e.labels.toSet) } - .sumByKey - .map { case ((srcId, destId), labels) => 
EdgeLabel(srcId, destId, labels) } - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx new file mode 100644 index 000000000..aed20ad43 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala deleted file mode 100644 index 7c0a9a27a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.interaction_graph.scio.ml.labels - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphLabelsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Output bq table name") - def getBqTableName: String - def setBqTableName(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(10) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx new file mode 100644 index 000000000..a3794b998 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala deleted file mode 100644 index 350c86c84..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.interaction_graph.scio.ml.labels - -import com.spotify.scio.ScioMetrics -import com.twitter.interaction_graph.thriftscala.EdgeFeature -import com.twitter.interaction_graph.thriftscala.EdgeLabel -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.{Edge => TEdge} -import com.twitter.socialgraph.event.thriftscala.FollowEvent - -object LabelUtil { - - val LabelExplicit = Set( - FeatureName.NumFollows, - FeatureName.NumFavorites, - FeatureName.NumRetweets, - FeatureName.NumMentions, - FeatureName.NumTweetQuotes, - FeatureName.NumPhotoTags, - FeatureName.NumRtFavories, - FeatureName.NumRtReplies, - FeatureName.NumRtTweetQuotes, - FeatureName.NumRtRetweets, - FeatureName.NumRtMentions, - FeatureName.NumShares, - FeatureName.NumReplies, - ) - - val LabelImplicit = Set( - FeatureName.NumTweetClicks, - FeatureName.NumProfileViews, - FeatureName.NumLinkClicks, - FeatureName.NumPushOpens, - FeatureName.NumNtabClicks, - FeatureName.NumRtTweetClicks, - FeatureName.NumRtLinkClicks, - FeatureName.NumEmailOpen, - FeatureName.NumEmailClick, - ) 
- - val LabelSet = (LabelExplicit ++ LabelImplicit).map(_.value) - - def fromFollowEvent(f: FollowEvent): Option[EdgeLabel] = { - for { - srcId <- f.sourceId - destId <- f.targetId - } yield EdgeLabel(srcId, destId, labels = Set(FeatureName.NumFollows)) - } - - def fromInteractionGraphEdge(e: TEdge): Option[EdgeLabel] = { - val labels = e.features.collect { - case EdgeFeature(featureName: FeatureName, _) if LabelSet.contains(featureName.value) => - ScioMetrics.counter("fromInteractionGraphEdge", featureName.toString).inc() - featureName - }.toSet - if (labels.nonEmpty) { - Some(EdgeLabel(e.sourceId, e.destinationId, labels)) - } else None - } - - def toTEdge(e: EdgeLabel): EdgeLabel = { - EdgeLabel(e.sourceId, e.destinationId, labels = e.labels) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx new file mode 100644 index 000000000..b57cc7c91 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md deleted file mode 100644 index f67a624fb..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphLabels Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_labels --intellij src/scala/com/twitter/interaction_graph/scio/ml/labels -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/ml/labels:interaction_graph_labels -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/ml/labels:interaction_graph_labels -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-labels-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/ml/labels/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-05-15 \ - --bind=profile.output_path=processed/interaction_graph/labels -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD deleted file mode 100644 index f5f1cacc2..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD +++ /dev/null @@ -1,54 +0,0 @@ -scala_library( - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":real_graph_in_scores-scala", - ":real_graph_oon_scores-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - ], -) - -jvm_binary( - name = "interaction_graph_scores_scio", - main = "com.twitter.interaction_graph.scio.ml.scores.InteractionGraphScoreExportJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":scores", - ], -) - -create_datasets( - base_name = "real_graph_in_scores", - description = "Real Graph in network scores", - key_type = "Long", - platform = "java8", - role = 
"cassowary", - scala_schema = "com.twitter.wtf.scalding.jobs.injection.CandidateSeqInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.wtf.candidate.thriftscala.CandidateSeq", - scala_dependencies = [ - "src/scala/com/twitter/wtf/scalding/jobs/injection", - ], -) - -create_datasets( - base_name = "real_graph_oon_scores", - description = "Real Graph OON Scores", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.wtf.scalding.jobs.injection.CandidateSeqInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.wtf.candidate.thriftscala.CandidateSeq", - scala_dependencies = [ - "src/scala/com/twitter/wtf/scalding/jobs/injection", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx new file mode 100644 index 000000000..de94002c6 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx new file mode 100644 index 000000000..b51c1f022 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala deleted file mode 100644 index 85e2284c2..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala +++ /dev/null @@ -1,134 +0,0 @@ -package com.twitter.interaction_graph.scio.ml.scores - -import com.google.cloud.bigquery.BigQueryOptions -import com.google.cloud.bigquery.QueryJobConfiguration -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.exception.DataNotFoundException -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.wtf.candidate.thriftscala.Candidate -import com.twitter.wtf.candidate.thriftscala.CandidateSeq -import com.twitter.wtf.candidate.thriftscala.ScoredEdge -import org.apache.avro.generic.GenericRecord -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead -import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord -import org.apache.beam.sdk.transforms.SerializableFunction -import scala.collection.JavaConverters._ - -object InteractionGraphScoreExportJob extends ScioBeamJob[InteractionGraphScoreExportOption] { - - // to parse latest date from the BQ table we're reading from - val parseDateRow = new SerializableFunction[SchemaAndRecord, String] { - override def apply(input: SchemaAndRecord): String = { - val genericRecord: GenericRecord = input.getRecord() - genericRecord.get("ds").toString - } - } - - // to parse each row from the BQ table we're reading from - val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] { - override def apply(record: SchemaAndRecord): ScoredEdge = { - val genericRecord: GenericRecord = record.getRecord() - ScoredEdge( - 
genericRecord.get("source_id").asInstanceOf[Long], - genericRecord.get("destination_id").asInstanceOf[Long], - genericRecord.get("prob").asInstanceOf[Double], - genericRecord.get("followed").asInstanceOf[Boolean], - ) - } - } - - override def runPipeline( - sc: ScioContext, - opts: InteractionGraphScoreExportOption - ): Unit = { - - val dateStr: String = opts.getDate().value.getStart.toString("yyyyMMdd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - if (opts.getDALWriteEnvironment == "PROD") { - val bqClient = - BigQueryOptions.newBuilder.setProjectId("twttr-recos-ml-prod").build.getService - val query = - s""" - |SELECT total_rows - |FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS` - |WHERE partition_id ="$dateStr" AND - |table_name="$bqTableName" AND total_rows > 0 - |""".stripMargin - val queryConfig = QueryJobConfiguration.of(query) - val results = bqClient.query(queryConfig).getValues.asScala.toSeq - if (results.isEmpty || results.head.get(0).getLongValue == 0) { - throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.") - } - } - sc.run() - } - - override protected def configurePipeline( - sc: ScioContext, - opts: InteractionGraphScoreExportOption - ): Unit = { - - val dateStr: String = opts.getDate().value.getStart.toString("yyyy-MM-dd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - val scoreExport: SCollection[ScoredEdge] = sc - .customInput( - s"Read from BQ table $fullBqTableName", - BigQueryIO - .read(parseRow) - .from(fullBqTableName) - .withSelectedFields(List("source_id", "destination_id", "prob", "followed").asJava) - .withRowRestriction(s"ds = '$dateStr'") - .withMethod(TypedRead.Method.DIRECT_READ) - ) - - val inScores = scoreExport - .collect { - case ScoredEdge(src, dest, score, true) => - (src, Candidate(dest, score)) - } - .groupByKey - .map { - case (src, candidateIter) => KeyVal(src, CandidateSeq(candidateIter.toSeq.sortBy(-_.score))) - } - - val outScores = scoreExport - .collect { - case ScoredEdge(src, dest, score, false) => - (src, Candidate(dest, score)) - } - .groupByKey - .map { - case (src, candidateIter) => KeyVal(src, CandidateSeq(candidateIter.toSeq.sortBy(-_.score))) - } - - inScores.saveAsCustomOutput( - "Write real_graph_in_scores", - DAL.writeVersionedKeyVal( - RealGraphInScoresScalaDataset, - PathLayout.VersionedPath(opts.getOutputPath + "/in"), - ) - ) - outScores.saveAsCustomOutput( - "Write real_graph_oon_scores", - DAL.writeVersionedKeyVal( - RealGraphOonScoresScalaDataset, - PathLayout.VersionedPath(opts.getOutputPath + "/oon"), - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx new file mode 100644 index 000000000..3dcd0249f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala deleted file mode 100644 index 
3b55c517b..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.ml.scores - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphScoreExportOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Output bq table name") - def getBqTableName: String - def setBqTableName(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(1000) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx new file mode 100644 index 000000000..91fa58ef4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md deleted file mode 100644 index 51ace9d9a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphScores Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_scores --intellij src/scala/com/twitter/interaction_graph/scio/ml/scores -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/ml/scores -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/ml/scores -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-scores-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/ml/scores/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-06-23 \ - --bind=profile.output_path=manhattan_sequence_files/real_graph_scores_v2 -``` \ No newline at end of file diff --git a/src/scala/com/twitter/recos/decider/BUILD b/src/scala/com/twitter/recos/decider/BUILD deleted file mode 100644 index d1eb8d74f..000000000 --- a/src/scala/com/twitter/recos/decider/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "decider/src/main/scala", - "src/scala/com/twitter/recos/util:recos-util", - ], -) diff --git a/src/scala/com/twitter/recos/decider/BUILD.docx b/src/scala/com/twitter/recos/decider/BUILD.docx new file mode 100644 index 000000000..5ea409ce1 Binary files /dev/null and b/src/scala/com/twitter/recos/decider/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/decider/BaseDecider.docx b/src/scala/com/twitter/recos/decider/BaseDecider.docx new file mode 100644 index 000000000..613942ed5 Binary files /dev/null and
b/src/scala/com/twitter/recos/decider/BaseDecider.docx differ diff --git a/src/scala/com/twitter/recos/decider/BaseDecider.scala b/src/scala/com/twitter/recos/decider/BaseDecider.scala deleted file mode 100644 index 841963631..000000000 --- a/src/scala/com/twitter/recos/decider/BaseDecider.scala +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.recos.decider - -import com.twitter.decider.Decider -import com.twitter.decider.DeciderFactory -import com.twitter.decider.RandomRecipient -import com.twitter.decider.Recipient -import com.twitter.decider.SimpleRecipient -import com.twitter.recos.util.TeamUsers - -case class GuestRecipient(id: Long) extends Recipient { - override def isGuest: Boolean = true -} - -sealed trait BaseDecider { - def baseConfig: Option[String] = None - - def overlayConfig: Option[String] = None - - lazy val decider: Decider = DeciderFactory(baseConfig, overlayConfig)() - - def isAvailable(feature: String, recipient: Option[Recipient]): Boolean = - decider.isAvailable(feature, recipient) - - def isAvailable(feature: String): Boolean = isAvailable(feature, None) - - def isAvailableExceptTeam(feature: String, id: Long, isUser: Boolean = true): Boolean = { - if (isUser) TeamUsers.team.contains(id) || isAvailable(feature, Some(SimpleRecipient(id))) - else isAvailable(feature, Some(GuestRecipient(id))) - } -} - -case class RecosDecider(env: String, cluster: String = "atla") extends BaseDecider { - override val baseConfig = Some("/com/twitter/recos/config/decider.yml") - override val overlayConfig = Some( - s"/usr/local/config/overlays/recos/service/prod/$cluster/decider_overlay.yml" - ) - - def shouldCompute(id: Long, displayLocation: String, isUser: Boolean = true): Boolean = { - isAvailableExceptTeam(RecosDecider.recosIncomingTraffic + "_" + displayLocation, id, isUser) - } - - def shouldReturn(id: Long, displayLocation: String, isUser: Boolean = true): Boolean = { - isAvailableExceptTeam(RecosDecider.recosShouldReturn + "_" + displayLocation, id, isUser) - } - - def shouldDarkmode(experiment: String): Boolean = { - isAvailable(RecosDecider.recosShouldDark + "_exp_" + experiment, None) - } - - def shouldScribe(id: Long, isUser: Boolean = true): Boolean = { - if (isUser) (id > 0) && isAvailableExceptTeam(RecosDecider.recosShouldScribe, id, isUser) - else false // TODO: define the behavior for guests - } - - def shouldWriteMomentCapsuleOpenEdge(): Boolean = { - val capsuleOpenDecider = env match { - case "prod" => RecosDecider.recosShouldWriteMomentCapsuleOpenEdge - case _ => RecosDecider.recosShouldWriteMomentCapsuleOpenEdge + RecosDecider.testSuffix - } - - isAvailable(capsuleOpenDecider, Some(RandomRecipient)) - } -} - -object RecosDecider { - val testSuffix = "_test" - - val recosIncomingTraffic: String = "recos_incoming_traffic" - val recosShouldReturn: String = "recos_should_return" - val recosShouldDark: String = "recos_should_dark" - val recosRealtimeBlacklist: String = "recos_realtime_blacklist" - val recosRealtimeDeveloperlist: String = "recos_realtime_developerlist" - val recosShouldScribe: String = "recos_should_scribe" - val recosShouldWriteMomentCapsuleOpenEdge: String = "recos_should_write_moment_capsule_open_edge" -} - -trait GraphDecider extends BaseDecider { - val graphNamePrefix: String - - override val baseConfig = Some("/com/twitter/recos/config/decider.yml") - override val overlayConfig = Some( - "/usr/local/config/overlays/recos/service/prod/atla/decider_overlay.yml" - ) -} - -case class UserTweetEntityGraphDecider() extends GraphDecider { - override 
val graphNamePrefix: String = "user_tweet_entity_graph" - - def tweetSocialProof: Boolean = { - isAvailable("user_tweet_entity_graph_tweet_social_proof") - } - - def entitySocialProof: Boolean = { - isAvailable("user_tweet_entity_graph_entity_social_proof") - } - -} - -case class UserUserGraphDecider() extends GraphDecider { - override val graphNamePrefix: String = "user_user_graph" -} - -case class UserTweetGraphDecider(env: String, dc: String) extends GraphDecider { - override val graphNamePrefix: String = "user-tweet-graph" - - override val baseConfig = Some("/com/twitter/recos/config/user-tweet-graph_decider.yml") - override val overlayConfig = Some( - s"/usr/local/config/overlays/user-tweet-graph/user-tweet-graph/$env/$dc/decider_overlay.yml" - ) -} diff --git a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx new file mode 100644 index 000000000..429990e2c Binary files /dev/null and b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx differ diff --git a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala deleted file mode 100644 index 73a06e5af..000000000 --- a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.recos.decider - -import com.twitter.decider.Decider -import com.twitter.decider.RandomRecipient -import com.twitter.util.Future -import scala.util.control.NoStackTrace - -/* - Provides deciders-controlled load shedding for a given endpoint. - The format of the decider keys is: - - enable_loadshedding__ - E.g.: - enable_loadshedding_user-tweet-graph_relatedTweets - - Deciders are fractional, so a value of 50.00 will drop 50% of responses. If a decider key is not - defined for a particular endpoint, those requests will always be - served. - - We should therefore aim to define keys for the endpoints we care most about in decider.yml, - so that we can control them during incidents. - */ -class EndpointLoadShedder( - decider: GraphDecider) { - import EndpointLoadShedder._ - - private val keyPrefix = "enable_loadshedding" - - def apply[T](endpointName: String)(serve: => Future[T]): Future[T] = { - val key = s"${keyPrefix}_${decider.graphNamePrefix}_${endpointName}" - if (decider.isAvailable(key, recipient = Some(RandomRecipient))) - Future.exception(LoadSheddingException) - else serve - } -} - -object EndpointLoadShedder { - object LoadSheddingException extends Exception with NoStackTrace -} diff --git a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx new file mode 100644 index 000000000..aa4488e12 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala deleted file mode 100644 index d29b12bc4..000000000 --- a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.recos_common.thriftscala.SocialProofType - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. favorite, retweet, reply and click. Under current segment configuration, each segment - * stores up to 128M edges. 
diff --git a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx new file mode 100644 index 000000000..aa4488e12 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala b/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala deleted file mode 100644 index d29b12bc4..000000000 --- a/src/scala/com/twitter/recos/graph_common/ActionEdgeTypeMask.scala +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.recos_common.thriftscala.SocialProofType - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. favorite, retweet, reply and click. Under current segment configuration, each segment - * stores up to 128M edges. Assuming that each node on one side is unique, each segment - * stores up to 128M unique nodes on one side, which occupies the lower 27 bits of an integer. - * This leaves five bits to encode the edge types, which at max can store 32 edge types. - * The following implementation utilizes the top four bits and leaves one free bit out. - */ -class ActionEdgeTypeMask extends EdgeTypeMask { - import ActionEdgeTypeMask._ - - override def encode(node: Int, edgeType: Byte): Int = { - if (edgeType == FAVORITE) { - node | EDGEARRAY(FAVORITE) - } else if (edgeType == RETWEET) { - node | EDGEARRAY(RETWEET) - } else if (edgeType == REPLY) { - node | EDGEARRAY(REPLY) - } else if (edgeType == TWEET) { - node | EDGEARRAY(TWEET) - } else { - // Anything that is not a public engagement (i.e. openlink, share, select, etc.) is a "click" - node | EDGEARRAY(CLICK) - } - } - - override def edgeType(node: Int): Byte = { - (node >> 28).toByte - } - - override def restore(node: Int): Int = { - node & MASK - } -} - -object ActionEdgeTypeMask { - - /** - * Reserve the top four bits of each integer to encode the edge type information. - */ - val MASK: Int = - Integer.parseInt("00001111111111111111111111111111", 2) - val CLICK: Byte = 0 - val FAVORITE: Byte = 1 - val RETWEET: Byte = 2 - val REPLY: Byte = 3 - val TWEET: Byte = 4 - val SIZE: Byte = 5 - val UNUSED6: Byte = 6 - val UNUSED7: Byte = 7 - val UNUSED8: Byte = 8 - val UNUSED9: Byte = 9 - val UNUSED10: Byte = 10 - val UNUSED11: Byte = 11 - val UNUSED12: Byte = 12 - val UNUSED13: Byte = 13 - val UNUSED14: Byte = 14 - val UNUSED15: Byte = 15 - val EDGEARRAY: Array[Int] = Array( - 0, - 1 << 28, - 2 << 28, - 3 << 28, - 4 << 28, - 5 << 28, - 6 << 28, - 7 << 28, - 8 << 28, - 9 << 28, - 10 << 28, - 11 << 28, - 12 << 28, - 13 << 28, - 14 << 28, - 15 << 28 - ) - - /** - * Map valid social proof types specified by clients to an array of bytes. If clients do not - * specify any social proof types in thrift, it will return all available social types by - * default.
- * - * @param socialProofTypes are the valid socialProofTypes specified by clients - * @return an array of bytes representing valid social proof types - */ - def getUserTweetGraphSocialProofTypes( - socialProofTypes: Option[Seq[SocialProofType]] - ): Array[Byte] = { - socialProofTypes - .map { _.map { _.getValue }.toArray } - .getOrElse((0 until SIZE).toArray) - .map { _.toByte } - } -} diff --git a/src/scala/com/twitter/recos/graph_common/BUILD b/src/scala/com/twitter/recos/graph_common/BUILD deleted file mode 100644 index dd1f455ef..000000000 --- a/src/scala/com/twitter/recos/graph_common/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - strict_deps = False, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "finagle/finagle-stats/src/main/scala", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos:recos-common-scala", - ], -) diff --git a/src/scala/com/twitter/recos/graph_common/BUILD.docx b/src/scala/com/twitter/recos/graph_common/BUILD.docx new file mode 100644 index 000000000..4fd2b893f Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.docx b/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.docx new file mode 100644 index 000000000..3bd083d71 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.scala b/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.scala deleted file mode 100644 index 645bb900c..000000000 --- a/src/scala/com/twitter/recos/graph_common/BipartiteGraphHelper.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import scala.collection.mutable.ListBuffer - -/* - * The helper class encodes and decodes tweet ids with tweetypie's card information - * when querying recos salsa library. Inside salsa library, all tweet ids are - * encoded with card information for the purpose of inline filtering. 
- */ -class BipartiteGraphHelper(graph: BipartiteGraph) { - private val tweetIDMask = new TweetIDMask - - def getLeftNodeEdges(leftNode: Long): Seq[(Long, Byte)] = { - val iterator = graph.getLeftNodeEdges(leftNode) - - val edges: ListBuffer[(Long, Byte)] = ListBuffer() - if (iterator != null) { - while (iterator.hasNext) { - val node = iterator.nextLong() - val engagementType = iterator.currentEdgeType() - edges += ((tweetIDMask.restore(node), engagementType)) - } - } - edges.reverse.distinct // Most recent edges first, no duplications - } - - def getRightNodeEdges(rightNode: Long): Seq[Long] = { - val iterator = graph.getRightNodeEdges(rightNode) - val leftNodes: ListBuffer[Long] = ListBuffer() - if (iterator != null) { - while (iterator.hasNext) { - leftNodes += iterator.nextLong() - } - } - - leftNodes.reverse.distinct // Most recent edges first, no duplications - } -} diff --git a/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.docx b/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.docx new file mode 100644 index 000000000..4eccb4b0a Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.scala b/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.scala deleted file mode 100644 index 3c4d62b1d..000000000 --- a/src/scala/com/twitter/recos/graph_common/FinagleCounterWrapper.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.finagle.stats.Counter -import com.twitter.graphjet.stats.{Counter => GraphCounter} - -/** - * FinagleCounterWrapper wraps Twitter's Finagle Counter. - * - * This is because GraphJet is an openly available library which does not - * depend on Finagle, but tracks stats using a similar interface. - */ -class FinagleCounterWrapper(counter: Counter) extends GraphCounter { - def incr() = counter.incr() - def incr(delta: Int) = counter.incr(delta) -} diff --git a/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.docx b/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.docx new file mode 100644 index 000000000..44efcb341 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.scala b/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.scala deleted file mode 100644 index ac8bfc883..000000000 --- a/src/scala/com/twitter/recos/graph_common/FinagleStatsReceiverWrapper.scala +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.stats.{StatsReceiver => GraphStatsReceiver} - -/** - * FinagleStatsReceiverWrapper wraps Twitter's Finagle StatsReceiver. - * - * This is because GraphJet is an openly available library which does not - * depend on Finagle, but tracks stats using a similar interface. 
- */ -case class FinagleStatsReceiverWrapper(statsReceiver: StatsReceiver) extends GraphStatsReceiver { - - def scope(namespace: String) = new FinagleStatsReceiverWrapper(statsReceiver.scope(namespace)) - def counter(name: String) = new FinagleCounterWrapper(statsReceiver.counter(name)) -} diff --git a/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx b/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx new file mode 100644 index 000000000..ce4610c7b Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala b/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala deleted file mode 100644 index 7e21b82c7..000000000 --- a/src/scala/com/twitter/recos/graph_common/LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.bipartite.LeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.graphjet.stats.StatsReceiver - -/** - * The GraphBuilder builds a LeftIndexedPowerLawMultiSegmentBipartiteGraph given a set of - * parameters. - */ -object LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder { - - /** - * This encapsulates all the state needed to initialize the in-memory graph. - * - * @param maxNumSegments is the maximum number of segments we'll add to the graph. - * At that point, the oldest segments will start getting dropped - * @param maxNumEdgesPerSegment determines when the implementation decides to fork off a - * new segment - * @param expectedNumLeftNodes is the expected number of left nodes that would be inserted in - * the segment - * @param expectedMaxLeftDegree is the maximum degree expected for any left node - * @param leftPowerLawExponent is the exponent of the LHS power-law graph. 
see - * [[com.twitter.graphjet.bipartite.edgepool.PowerLawDegreeEdgePool]] - * for details - * @param expectedNumRightNodes is the expected number of right nodes that would be inserted in - * the segment - */ - case class GraphBuilderConfig( - maxNumSegments: Int, - maxNumEdgesPerSegment: Int, - expectedNumLeftNodes: Int, - expectedMaxLeftDegree: Int, - leftPowerLawExponent: Double, - expectedNumRightNodes: Int, - edgeTypeMask: EdgeTypeMask) - - /** - * This apply function returns a mutable bipartite graph - * - * @param graphBuilderConfig is the graph builder config - * - */ - def apply( - graphBuilderConfig: GraphBuilderConfig, - statsReceiverWrapper: StatsReceiver - ): LeftIndexedPowerLawMultiSegmentBipartiteGraph = { - new LeftIndexedPowerLawMultiSegmentBipartiteGraph( - graphBuilderConfig.maxNumSegments, - graphBuilderConfig.maxNumEdgesPerSegment, - graphBuilderConfig.expectedNumLeftNodes, - graphBuilderConfig.expectedMaxLeftDegree, - graphBuilderConfig.leftPowerLawExponent, - graphBuilderConfig.expectedNumRightNodes, - graphBuilderConfig.edgeTypeMask, - statsReceiverWrapper - ) - } -}
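As a usage sketch for the builder above, a hypothetical configuration (every sizing number here is invented; FinagleStatsReceiverWrapper and ActionEdgeTypeMask are the helpers defined earlier in this diff):

```scala
import com.twitter.finagle.stats.NullStatsReceiver

// Invented sizing for illustration; real services tune these to traffic.
val config = LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.GraphBuilderConfig(
  maxNumSegments = 8,              // oldest segment is dropped once this many exist
  maxNumEdgesPerSegment = 1 << 27, // ~134M edges before forking a new segment
  expectedNumLeftNodes = 1 << 24,
  expectedMaxLeftDegree = 5000,
  leftPowerLawExponent = 2.0,      // see PowerLawDegreeEdgePool for the semantics
  expectedNumRightNodes = 1 << 24,
  edgeTypeMask = new ActionEdgeTypeMask()
)

val graph = LeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder(
  config,
  FinagleStatsReceiverWrapper(NullStatsReceiver)
)
```

The apply call is a thin argument-forwarding shim, so the config case class doubles as documentation of the graph's memory-sizing knobs.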
diff --git a/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.docx b/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.docx new file mode 100644 index 000000000..c77e249d7 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.scala b/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.scala deleted file mode 100644 index ca777c97d..000000000 --- a/src/scala/com/twitter/recos/graph_common/MultiSegmentPowerLawBipartiteGraphBuilder.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.stats.StatsReceiver -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph - -/** - * The GraphBuilder builds a MultiSegmentPowerLawBipartiteGraph given a set of parameters. - */ -object MultiSegmentPowerLawBipartiteGraphBuilder { - - /** - * This encapsulates all the state needed to initialize the in-memory graph. - * - * @param maxNumSegments is the maximum number of segments we'll add to the graph. - * At that point, the oldest segments will start getting dropped - * @param maxNumEdgesPerSegment determines when the implementation decides to fork off a - * new segment - * @param expectedNumLeftNodes is the expected number of left nodes that would be inserted in - * the segment - * @param expectedMaxLeftDegree is the maximum degree expected for any left node - * @param leftPowerLawExponent is the exponent of the LHS power-law graph. see - * [[com.twitter.graphjet.bipartite.edgepool.PowerLawDegreeEdgePool]] - * for details - * @param expectedNumRightNodes is the expected number of right nodes that would be inserted in - * the segment - * @param expectedMaxRightDegree is the maximum degree expected for any right node - * @param rightPowerLawExponent is the exponent of the RHS power-law graph. see - * [[com.twitter.graphjet.bipartite.edgepool.PowerLawDegreeEdgePool]] - * for details - */ - case class GraphBuilderConfig( - maxNumSegments: Int, - maxNumEdgesPerSegment: Int, - expectedNumLeftNodes: Int, - expectedMaxLeftDegree: Int, - leftPowerLawExponent: Double, - expectedNumRightNodes: Int, - expectedMaxRightDegree: Int, - rightPowerLawExponent: Double) - - /** - * This apply function returns a mutable bipartite graph - * - * @param graphBuilderConfig is the graph builder config - * - */ - def apply( - graphBuilderConfig: GraphBuilderConfig, - statsReceiver: StatsReceiver - ): MultiSegmentPowerLawBipartiteGraph = { - new MultiSegmentPowerLawBipartiteGraph( - graphBuilderConfig.maxNumSegments, - graphBuilderConfig.maxNumEdgesPerSegment, - graphBuilderConfig.expectedNumLeftNodes, - graphBuilderConfig.expectedMaxLeftDegree, - graphBuilderConfig.leftPowerLawExponent, - graphBuilderConfig.expectedNumRightNodes, - graphBuilderConfig.expectedMaxRightDegree, - graphBuilderConfig.rightPowerLawExponent, - new ActionEdgeTypeMask(), - statsReceiver - ) - } -}
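The builder above hard-wires ActionEdgeTypeMask, whose bit layout is defined earlier in this diff. A small worked example of that encoding (the node id and action are arbitrary):

```scala
val mask = new ActionEdgeTypeMask()

val node = 123456 // any id that fits in the lower 28 bits
val encoded = mask.encode(node, ActionEdgeTypeMask.RETWEET)

// RETWEET = 2, stored as 2 << 28 in the top four bits of the int:
assert(encoded == (node | (2 << 28)))
assert(mask.edgeType(encoded) == ActionEdgeTypeMask.RETWEET) // recover the action
assert(mask.restore(encoded) == node)                        // strip the type bits
```

Keeping the type in the top bits lets a segment store the action alongside the node id in a single int, at the cost of the id range spelled out in the mask's class comment.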
- */ -class LeftNodeEdgesHandler(graphHelper: BipartiteGraphHelper, statsReceiver: StatsReceiver) - extends RequestHandler[GetRecentEdgesRequest, GetRecentEdgesResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - private val CLICK = 0 - private val FAVORITE = 1 - private val RETWEET = 2 - private val REPLY = 3 - private val TWEET = 4 - - override def apply(request: GetRecentEdgesRequest): Future[GetRecentEdgesResponse] = { - trackFutureBlockStats(stats) { - val recentEdges = graphHelper.getLeftNodeEdges(request.requestId).flatMap { - case (node, engagementType) if engagementType == CLICK => - Some(RecentEdge(node, SocialProofType.Click)) - case (node, engagementType) if engagementType == FAVORITE => - Some(RecentEdge(node, SocialProofType.Favorite)) - case (node, engagementType) if engagementType == RETWEET => - Some(RecentEdge(node, SocialProofType.Retweet)) - case (node, engagementType) if engagementType == REPLY => - Some(RecentEdge(node, SocialProofType.Reply)) - case (node, engagementType) if engagementType == TWEET => - Some(RecentEdge(node, SocialProofType.Tweet)) - case _ => - None - } - Future.value(GetRecentEdgesResponse(recentEdges)) - } - } -} - -class RightNodeInfoHandler(graphHelper: BipartiteGraphHelper, statsReceiver: StatsReceiver) - extends RequestHandler[Long, NodeInfo] { - private[this] val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(rightNode: Long): Future[NodeInfo] = { - trackFutureBlockStats(stats) { - val edges = graphHelper.getRightNodeEdges(rightNode) - Future.value(NodeInfo(edges = edges)) - } - } -} diff --git a/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx b/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx new file mode 100644 index 000000000..664f0af09 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala b/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala deleted file mode 100644 index ce63644a6..000000000 --- a/src/scala/com/twitter/recos/graph_common/NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.graphjet.stats.StatsReceiver - -/** - * The GraphBuilder builds a NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph given a set of - * parameters. - */ -object NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder { - - /** - * This encapsulates all the state needed to initialize the in-memory graph. - * - * @param maxNumSegments is the maximum number of segments we'll add to the graph.
- * At that point, the oldest segments will start getting dropped - * @param maxNumEdgesPerSegment determines when the implementation decides to fork off a - * new segment - * @param expectedNumLeftNodes is the expected number of left nodes that would be inserted in - * the segment - * @param expectedMaxLeftDegree is the maximum degree expected for any left node - * @param leftPowerLawExponent is the exponent of the LHS power-law graph. see - * [[com.twitter.graphjet.bipartite.edgepool.PowerLawDegreeEdgePool]] - * for details - * @param expectedNumRightNodes is the expected number of right nodes that would be inserted in - * the segment - * @param numRightNodeMetadataTypes is the max number of node metadata types associated with the - * right nodes - */ - case class GraphBuilderConfig( - maxNumSegments: Int, - maxNumEdgesPerSegment: Int, - expectedNumLeftNodes: Int, - expectedMaxLeftDegree: Int, - leftPowerLawExponent: Double, - expectedNumRightNodes: Int, - numRightNodeMetadataTypes: Int, - edgeTypeMask: EdgeTypeMask) - - /** - * This apply function returns a mutable bipartite graph - * - * @param graphBuilderConfig is the graph builder config - * - */ - def apply( - graphBuilderConfig: GraphBuilderConfig, - statsReceiverWrapper: StatsReceiver - ): NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph = { - new NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph( - graphBuilderConfig.maxNumSegments, - graphBuilderConfig.maxNumEdgesPerSegment, - graphBuilderConfig.expectedNumLeftNodes, - graphBuilderConfig.expectedMaxLeftDegree, - graphBuilderConfig.leftPowerLawExponent, - graphBuilderConfig.expectedNumRightNodes, - graphBuilderConfig.numRightNodeMetadataTypes, - graphBuilderConfig.edgeTypeMask, - statsReceiverWrapper - ) - } -} diff --git a/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx b/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx new file mode 100644 index 000000000..e68256550 Binary files /dev/null and b/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.docx differ diff --git a/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala b/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala deleted file mode 100644 index 353b47d92..000000000 --- a/src/scala/com/twitter/recos/graph_common/RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.recos.graph_common - -import com.twitter.graphjet.bipartite.RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.graphjet.stats.StatsReceiver - -/** - * The GraphBuilder builds a RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph given a set of - * parameters. - */ -object RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder { - - /** - * This encapsulates all the state needed to initialize the in-memory graph. - * - * @param maxNumSegments is the maximum number of segments we'll add to the graph.
- * At that point, the oldest segments will start getting dropped - * @param maxNumEdgesPerSegment determines when the implementation decides to fork off a - * new segment - * @param expectedNumLeftNodes is the expected number of left nodes that would be inserted in - * the segment - * @param expectedMaxLeftDegree is the maximum degree expected for any left node - * @param leftPowerLawExponent is the exponent of the LHS power-law graph. see - * [[com.twitter.graphjet.bipartite.edgepool.PowerLawDegreeEdgePool]] - * for details - * @param expectedNumRightNodes is the expected number of right nodes that would be inserted in - * the segment - * @param numRightNodeMetadataTypes is the max number of node metadata types associated with the - * right nodes - */ - case class GraphBuilderConfig( - maxNumSegments: Int, - maxNumEdgesPerSegment: Int, - expectedNumLeftNodes: Int, - expectedMaxLeftDegree: Int, - leftPowerLawExponent: Double, - expectedNumRightNodes: Int, - numRightNodeMetadataTypes: Int, - edgeTypeMask: EdgeTypeMask) - - /** - * This apply function returns a mutable bipartite graph - * - * @param graphBuilderConfig is the graph builder config - * - */ - def apply( - graphBuilderConfig: GraphBuilderConfig, - statsReceiverWrapper: StatsReceiver - ): RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph = { - new RightNodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph( - graphBuilderConfig.maxNumSegments, - graphBuilderConfig.maxNumEdgesPerSegment, - graphBuilderConfig.expectedNumLeftNodes, - graphBuilderConfig.expectedMaxLeftDegree, - graphBuilderConfig.leftPowerLawExponent, - graphBuilderConfig.expectedNumRightNodes, - graphBuilderConfig.numRightNodeMetadataTypes, - graphBuilderConfig.edgeTypeMask, - statsReceiverWrapper - ) - } -} diff --git a/src/scala/com/twitter/recos/hose/common/BUILD b/src/scala/com/twitter/recos/hose/common/BUILD deleted file mode 100644 index 9fcb19b5f..000000000 --- a/src/scala/com/twitter/recos/hose/common/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -scala_library( - sources = ["*.scala"], - strict_deps = False, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "3rdparty/jvm/org/apache/kafka:rosette-kafka", - "finagle/finagle-stats/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "kafka/libs/src/main/scala/com/twitter/kafka/client/processor", - "servo/repo/src/main/scala", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos:recos-internal-scala", - ], -) diff --git a/src/scala/com/twitter/recos/hose/common/BUILD.docx b/src/scala/com/twitter/recos/hose/common/BUILD.docx new file mode 100644 index 000000000..adc501ac9 Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.docx b/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.docx new file mode 100644 index 000000000..8cba1c34c Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.scala b/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.scala deleted file mode 100644 index f2f5ee056..000000000 --- a/src/scala/com/twitter/recos/hose/common/BufferedEdgeWriter.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.recos.hose.common - -import com.twitter.finagle.stats.{Stat, StatsReceiver} -import com.twitter.logging.Logger -import 
com.twitter.recos.internal.thriftscala.RecosHoseMessage -import java.util.concurrent.Semaphore - -/** - * This class reads a buffer of edges from the concurrently linked queue - * and inserts each edge into the recos graph. - * If the queue is empty the thread will sleep for 100ms and attempt to read from the queue again. - */ -case class BufferedEdgeWriter( - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - edgeCollector: EdgeCollector, - statsReceiver: StatsReceiver, - isRunning: () => Boolean) - extends Runnable { - val logger = Logger() - private val queueRemoveCounter = statsReceiver.counter("queueRemove") - private val queueSleepCounter = statsReceiver.counter("queueSleep") - - def running: Boolean = { - isRunning() - } - - override def run(): Unit = { - while (running) { - val currentBatch = queue.poll - if (currentBatch != null) { - queueRemoveCounter.incr() - queuelimit.release() - var i = 0 - Stat.time(statsReceiver.stat("batchAddEdge")) { - while (i < currentBatch.length) { - edgeCollector.addEdge(currentBatch(i)) - i = i + 1 - } - } - } else { - queueSleepCounter.incr() - Thread.sleep(100L) - } - } - logger.info(this.getClass.getSimpleName + " is done") - } -} diff --git a/src/scala/com/twitter/recos/hose/common/EdgeCollector.docx b/src/scala/com/twitter/recos/hose/common/EdgeCollector.docx new file mode 100644 index 000000000..8d9e6eec2 Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/EdgeCollector.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/EdgeCollector.scala b/src/scala/com/twitter/recos/hose/common/EdgeCollector.scala deleted file mode 100644 index c5279496c..000000000 --- a/src/scala/com/twitter/recos/hose/common/EdgeCollector.scala +++ /dev/null @@ -1,42 +0,0 @@ -package com.twitter.recos.hose.common - -import com.twitter.finagle.stats.{Stat, StatsReceiver} -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import java.util.concurrent.Semaphore - -trait EdgeCollector { - def addEdge(message: RecosHoseMessage): Unit -} - -/** - * The class consumes incoming edges and inserts them into a buffer of a specified bufferSize. - * Once the buffer is full of edges, it is written to a concurrently linked queue where the size is bounded by queuelimit. 
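To make the buffering contract above concrete, here is a self-contained sketch of the same producer/consumer handoff. This is an editor's illustration, not code from this diff: String stands in for RecosHoseMessage, and the permit count is arbitrary.

    import java.util.concurrent.{ConcurrentLinkedQueue, Semaphore}

    object QueueHandoffSketch {
      // Shared state, as in initHose: an unbounded queue whose depth is bounded
      // in practice by the semaphore's permits.
      val queue = new ConcurrentLinkedQueue[Array[String]]()
      val queuelimit = new Semaphore(4)

      // Producer side (BufferedEdgeCollector): block until a slot frees up, then enqueue.
      def enqueue(buffer: Array[String]): Unit = {
        queuelimit.acquireUninterruptibly()
        queue.add(buffer)
      }

      // Consumer side (BufferedEdgeWriter.run): release one permit per batch drained,
      // and back off 100ms when the queue is empty, mirroring the writer above.
      def drainOnce(addEdge: String => Unit): Unit = {
        val batch = queue.poll()
        if (batch != null) {
          queuelimit.release()
          batch.foreach(addEdge)
        } else {
          Thread.sleep(100L)
        }
      }
    }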
- */ -case class BufferedEdgeCollector( - bufferSize: Int, - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - statsReceiver: StatsReceiver) - extends EdgeCollector { - - private var buffer = new Array[RecosHoseMessage](bufferSize) - private var index = 0 - private val queueAddCounter = statsReceiver.counter("queueAdd") - - override def addEdge(message: RecosHoseMessage): Unit = { - buffer(index) = message - index = index + 1 - if (index >= bufferSize) { - val oldBuffer = buffer - buffer = new Array[RecosHoseMessage](bufferSize) - index = 0 - - Stat.time(statsReceiver.stat("waitEnqueue")) { - queuelimit.acquireUninterruptibly() - } - - queue.add(oldBuffer) - queueAddCounter.incr() - } - } -} diff --git a/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.docx b/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.docx new file mode 100644 index 000000000..2582771ce Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.scala b/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.scala deleted file mode 100644 index 243fce628..000000000 --- a/src/scala/com/twitter/recos/hose/common/RecosEdgeProcessor.scala +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.recos.hose.common - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.util.Future -import org.apache.kafka.clients.consumer.ConsumerRecord - -/** - * The class processes RecosHoseMessage and inserts the message as an edge into a recos graph. - */ -case class RecosEdgeProcessor( - edgeCollector: EdgeCollector -)( - implicit statsReceiver: StatsReceiver) { - - private val scopedStats = statsReceiver.scope("RecosEdgeProcessor") - - private val processEventsCounter = scopedStats.counter("process_events") - private val nullPointerEventCounter = scopedStats.counter("null_pointer_num") - private val errorCounter = scopedStats.counter("process_errors") - - def process(record: ConsumerRecord[String, RecosHoseMessage]): Future[Unit] = { - processEventsCounter.incr() - val message = record.value() - try { - // the message is nullable - if (message != null) { - edgeCollector.addEdge(message) - } else { - nullPointerEventCounter.incr() - } - Future.Unit - } catch { - case e: Throwable => - errorCounter.incr() - e.printStackTrace() - Future.Unit - } - } - -} diff --git a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.docx b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.docx new file mode 100644 index 000000000..bf4ff89c6 Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.scala b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.scala deleted file mode 100644 index bac62e418..000000000 --- a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriter.scala +++ /dev/null @@ -1,217 +0,0 @@ -package com.twitter.recos.hose.common - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.bipartite.LeftIndexedMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.segment.LeftIndexedBipartiteGraphSegment -import com.twitter.kafka.client.processor.{AtLeastOnceProcessor, ThreadSafeKafkaConsumerClient} -import com.twitter.logging.Logger -import 
com.twitter.recos.internal.thriftscala.RecosHoseMessage -import java.util.concurrent.atomic.AtomicBoolean -import java.util.concurrent.{ConcurrentLinkedQueue, ExecutorService, Executors, Semaphore} - -/** - * The class submits a number of graph writer threads, BufferedEdgeWriter, - * during service startup. One of them is the live writer thread, and the other $(numBootstrapWriters - 1) - * are catchup writer threads. All of them consume kafka events from an internal concurrent queue, - * which is populated by kafka reader threads. At bootstrap time, the kafka reader threads rewind - * the kafka offset to several hours ago and populate the internal concurrent queue. - * Each graph writer thread writes to an individual graph segment separately. - * The (numBootstrapWriters - 1) catchup writer threads will stop once all events - * between the current system time at startup and the time in memcache are processed. - * The live writer thread will continue to write all incoming kafka events. - * It lives through the entire life cycle of the recos graph service. - */ -trait UnifiedGraphWriter[ - TSegment <: LeftIndexedBipartiteGraphSegment, - TGraph <: LeftIndexedMultiSegmentBipartiteGraph[TSegment]] { writer => - - import UnifiedGraphWriter._ - - def shardId: String - def env: String - def hosename: String - def bufferSize: Int - def consumerNum: Int - def catchupWriterNum: Int - def kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage] - def clientId: String - def statsReceiver: StatsReceiver - - /** - * Adds a RecosHoseMessage to the graph. Used by the live writer to insert edges to the - * current segment - */ - def addEdgeToGraph(graph: TGraph, recosHoseMessage: RecosHoseMessage): Unit - - /** - * Adds a RecosHoseMessage to the given segment in the graph.
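The bootstrap rewind described above is driven by the Kafka consumer's seek strategy rather than by this trait itself; for reference, Main later in this diff configures roughly the following builder fragment (the 20-hour window is the value shown there):

    // From the consumer setup in Main: rewind the committed offset ~20 hours
    // so catchup writers can rebuild recent segments at startup.
    kafkaConsumerBuilder
      .seekStrategy(SeekStrategy.REWIND)
      .rewindDuration(20.hours)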
Used by catch up writers to - * insert edges to non-current (old) segments - */ - def addEdgeToSegment(segment: TSegment, recosHoseMessage: RecosHoseMessage): Unit - - private val log = Logger() - private val isRunning: AtomicBoolean = new AtomicBoolean(true) - private val initialized: AtomicBoolean = new AtomicBoolean(false) - private var processors: Seq[AtLeastOnceProcessor[String, RecosHoseMessage]] = Seq.empty - private var consumers: Seq[ThreadSafeKafkaConsumerClient[String, RecosHoseMessage]] = Seq.empty - private val threadPool: ExecutorService = Executors.newCachedThreadPool() - - def shutdown(): Unit = { - processors.foreach { processor => - processor.close() - } - processors = Seq.empty - consumers.foreach { consumer => - consumer.close() - } - consumers = Seq.empty - threadPool.shutdown() - isRunning.set(false) - } - - def initHose(liveGraph: TGraph): Unit = this.synchronized { - if (!initialized.get) { - initialized.set(true) - - val queue: java.util.Queue[Array[RecosHoseMessage]] = - new ConcurrentLinkedQueue[Array[RecosHoseMessage]]() - val queuelimit: Semaphore = new Semaphore(1024) - - initRecosHoseKafka(queue, queuelimit) - initGrpahWriters(liveGraph, queue, queuelimit) - } else { - throw new RuntimeException("attempt to re-init kafka hose") - } - } - - private def initRecosHoseKafka( - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - ): Unit = { - try { - consumers = (0 until consumerNum).map { index => - new ThreadSafeKafkaConsumerClient( - kafkaConsumerBuilder.clientId(s"clientId-$index").enableAutoCommit(false).config) - } - processors = consumers.zipWithIndex.map { - case (consumer, index) => - val bufferedWriter = BufferedEdgeCollector(bufferSize, queue, queuelimit, statsReceiver) - val processor = RecosEdgeProcessor(bufferedWriter)(statsReceiver) - - AtLeastOnceProcessor[String, RecosHoseMessage]( - s"recos-injector-kafka-$index", - hosename, - consumer, - processor.process, - maxPendingRequests = MaxPendingRequests * bufferSize, - workerThreads = ProcessorThreads, - commitIntervalMs = CommitIntervalMs, - statsReceiver = statsReceiver - ) - } - - log.info(s"starting ${processors.size} recosKafka processors") - processors.foreach { processor => - processor.start() - } - } catch { - case e: Throwable => - e.printStackTrace() - log.error(e, e.toString) - processors.foreach { processor => - processor.close() - } - processors = Seq.empty - consumers.foreach { consumer => - consumer.close() - } - consumers = Seq.empty - } - } - - /** - * Initialize the graph writers, - * by first creating catch up writers to bootstrap the older segments, - * and then assigning a live writer to populate the live segment. - */ - private def initGrpahWriters( - liveGraph: TGraph, - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore - ): Unit = { - // define a number of (numBootstrapWriters - 1) catchup writer threads, each of which will write - // to a separate graph segment. 
- val catchupWriters = (0 until (catchupWriterNum - 1)).map { index => - val segment = liveGraph.getLiveSegment - liveGraph.rollForwardSegment() - getCatchupWriter(segment, queue, queuelimit, index) - } - val threadPool: ExecutorService = Executors.newCachedThreadPool() - - // define one live writer thread - val liveWriter = getLiveWriter(liveGraph, queue, queuelimit) - log.info("starting live graph writer that runs until service shutdown") - threadPool.submit(liveWriter) - log.info( - "starting catchup graph writer, which will terminate as soon as the catchup segment is full" - ) - catchupWriters.map(threadPool.submit(_)) - } - - private def getLiveWriter( - liveGraph: TGraph, - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore - ): BufferedEdgeWriter = { - val liveEdgeCollector = new EdgeCollector { - override def addEdge(message: RecosHoseMessage): Unit = addEdgeToGraph(liveGraph, message) - } - BufferedEdgeWriter( - queue, - queuelimit, - liveEdgeCollector, - statsReceiver.scope("liveWriter"), - isRunning.get - ) - } - - private def getCatchupWriter( - segment: TSegment, - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - catchupWriterIndex: Int - ): BufferedEdgeWriter = { - val catchupEdgeCollector = new EdgeCollector { - var currentNumEdges = 0 - - override def addEdge(message: RecosHoseMessage): Unit = { - currentNumEdges += 1 - addEdgeToSegment(segment, message) - } - } - val maxEdges = segment.getMaxNumEdges - - def runCondition(): Boolean = { - isRunning.get && ((maxEdges - catchupEdgeCollector.currentNumEdges) > bufferSize) - } - - BufferedEdgeWriter( - queue, - queuelimit, - catchupEdgeCollector, - statsReceiver.scope("catcher_" + catchupWriterIndex), - runCondition - ) - } -} - -private object UnifiedGraphWriter { - - // The RecosEdgeProcessor is not thread-safe. Only use one thread to process each instance. - val ProcessorThreads = 1 - // Each one cache at most 1000 * bufferSize requests. - val MaxPendingRequests = 1000 - // Short Commit MS to reduce duplicate messages. - val CommitIntervalMs: Long = 5000 // 5 seconds, Default Kafka value. 
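One detail worth spelling out: the catchup writer's run condition stops a full buffer early, i.e. it keeps running only while (maxEdges - currentNumEdges) > bufferSize. Since each drained batch adds at most bufferSize edges and the check happens between batches, the bootstrap segment can never overflow mid-batch. A worked instance, using values that appear elsewhere in this diff (editor's arithmetic):

    // With a 2^27-edge segment (RecosConfig) and the 128-message buffers Main passes:
    val maxEdges = 1 << 27 // 134,217,728 edge slots
    val bufferSize = 128
    // remaining > 128 before a batch of <= 128 edges implies remaining >= 1 after it,
    // so the segment's capacity is never exceeded.
    def runCondition(written: Long): Boolean = (maxEdges - written) > bufferSize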
-} diff --git a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.docx b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.docx new file mode 100644 index 000000000..4849d2f7a Binary files /dev/null and b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.docx differ diff --git a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.scala b/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.scala deleted file mode 100644 index af69a9c2c..000000000 --- a/src/scala/com/twitter/recos/hose/common/UnifiedGraphWriterMulti.scala +++ /dev/null @@ -1,228 +0,0 @@ -package src.scala.com.twitter.recos.hose.common - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.bipartite.LeftIndexedMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.segment.LeftIndexedBipartiteGraphSegment -import com.twitter.kafka.client.processor.AtLeastOnceProcessor -import com.twitter.kafka.client.processor.ThreadSafeKafkaConsumerClient -import com.twitter.logging.Logger -import com.twitter.recos.hose.common.BufferedEdgeCollector -import com.twitter.recos.hose.common.BufferedEdgeWriter -import com.twitter.recos.hose.common.EdgeCollector -import com.twitter.recos.hose.common.RecosEdgeProcessor -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.util.Action -import java.util.concurrent.atomic.AtomicBoolean -import java.util.concurrent.ConcurrentLinkedQueue -import java.util.concurrent.ExecutorService -import java.util.concurrent.Executors -import java.util.concurrent.Semaphore - -/** - * The class is a variation of UnifiedGraphWriter which allows one instance to hold multiple graphs - */ -trait UnifiedGraphWriterMulti[ - TSegment <: LeftIndexedBipartiteGraphSegment, - TGraph <: LeftIndexedMultiSegmentBipartiteGraph[TSegment]] { writer => - - import UnifiedGraphWriterMulti._ - - def shardId: String - def env: String - def hosename: String - def bufferSize: Int - def consumerNum: Int - def catchupWriterNum: Int - def kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage] - def clientId: String - def statsReceiver: StatsReceiver - - /** - * Adds a RecosHoseMessage to the graph. Used by the live writer to insert edges to the - * current segment - */ - def addEdgeToGraph( - graphs: Seq[(TGraph, Set[Action.Value])], - recosHoseMessage: RecosHoseMessage - ): Unit - - /** - * Adds a RecosHoseMessage to the given segment in the graph.
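Compared with the single-graph trait, each insertion hook here receives every (graph, action set) pair and is expected to fan a message out only to the graphs whose action set matches it. A hedged sketch of that routing; the action decoding is left as a parameter since RecosHoseMessage's fields are not shown in this diff:

    // Generic fan-out: insert into each graph whose Set[Action.Value] contains
    // the action decoded from the message.
    def routeByAction[G](
      graphs: Seq[(G, Set[Action.Value])],
      action: Action.Value
    )(insert: G => Unit): Unit =
      graphs.foreach { case (graph, actions) => if (actions.contains(action)) insert(graph) }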
Used by catch up writers to - * insert edges to non-current (old) segments - */ - def addEdgeToSegment( - segment: Seq[(TSegment, Set[Action.Value])], - recosHoseMessage: RecosHoseMessage - ): Unit - - private val log = Logger() - private val isRunning: AtomicBoolean = new AtomicBoolean(true) - private val initialized: AtomicBoolean = new AtomicBoolean(false) - private var processors: Seq[AtLeastOnceProcessor[String, RecosHoseMessage]] = Seq.empty - private var consumers: Seq[ThreadSafeKafkaConsumerClient[String, RecosHoseMessage]] = Seq.empty - private val threadPool: ExecutorService = Executors.newCachedThreadPool() - - def shutdown(): Unit = { - processors.foreach { processor => - processor.close() - } - processors = Seq.empty - consumers.foreach { consumer => - consumer.close() - } - consumers = Seq.empty - threadPool.shutdown() - isRunning.set(false) - } - - def initHose(liveGraphs: Seq[(TGraph, Set[Action.Value])]): Unit = this.synchronized { - if (!initialized.get) { - initialized.set(true) - - val queue: java.util.Queue[Array[RecosHoseMessage]] = - new ConcurrentLinkedQueue[Array[RecosHoseMessage]]() - val queuelimit: Semaphore = new Semaphore(1024) - - initRecosHoseKafka(queue, queuelimit) - initGrpahWriters(liveGraphs, queue, queuelimit) - } else { - throw new RuntimeException("attempt to re-init kafka hose") - } - } - - private def initRecosHoseKafka( - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - ): Unit = { - try { - consumers = (0 until consumerNum).map { index => - new ThreadSafeKafkaConsumerClient( - kafkaConsumerBuilder.clientId(s"clientId-$index").enableAutoCommit(false).config) - } - processors = consumers.zipWithIndex.map { - case (consumer, index) => - val bufferedWriter = BufferedEdgeCollector(bufferSize, queue, queuelimit, statsReceiver) - val processor = RecosEdgeProcessor(bufferedWriter)(statsReceiver) - - AtLeastOnceProcessor[String, RecosHoseMessage]( - s"recos-injector-kafka-$index", - hosename, - consumer, - processor.process, - maxPendingRequests = MaxPendingRequests * bufferSize, - workerThreads = ProcessorThreads, - commitIntervalMs = CommitIntervalMs, - statsReceiver = statsReceiver - ) - } - - log.info(s"starting ${processors.size} recosKafka processors") - processors.foreach { processor => - processor.start() - } - } catch { - case e: Throwable => - e.printStackTrace() - log.error(e, e.toString) - processors.foreach { processor => - processor.close() - } - processors = Seq.empty - consumers.foreach { consumer => - consumer.close() - } - consumers = Seq.empty - } - } - - /** - * Initialize the graph writers, - * by first creating catch up writers to bootstrap the older segments, - * and then assigning a live writer to populate the live segment. - */ - private def initGrpahWriters( - liveGraphs: Seq[(TGraph, Set[Action.Value])], - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore - ): Unit = { - // define a number of (numBootstrapWriters - 1) catchup writer threads, each of which will write - // to a separate graph segment. 
- val catchupWriters = (0 until (catchupWriterNum - 1)).map { index => - val segments = liveGraphs.map { case (graph, actions) => (graph.getLiveSegment, actions) } - for (liveGraph <- liveGraphs) { - liveGraph._1.rollForwardSegment() - } - getCatchupWriter(segments, queue, queuelimit, index) - } - val threadPool: ExecutorService = Executors.newCachedThreadPool() - - log.info("starting live graph writer that runs until service shutdown") - - // define one live writer thread - val liveWriter = getLiveWriter(liveGraphs, queue, queuelimit) - threadPool.submit(liveWriter) - - log.info( - "starting catchup graph writer, which will terminate as soon as the catchup segment is full" - ) - catchupWriters.map(threadPool.submit(_)) - } - - private def getLiveWriter( - liveGraphs: Seq[(TGraph, Set[Action.Value])], - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - ): BufferedEdgeWriter = { - val liveEdgeCollector = new EdgeCollector { - override def addEdge(message: RecosHoseMessage): Unit = - addEdgeToGraph(liveGraphs, message) - } - BufferedEdgeWriter( - queue, - queuelimit, - liveEdgeCollector, - statsReceiver.scope("liveWriter"), - isRunning.get - ) - } - - private def getCatchupWriter( - segments: Seq[(TSegment, Set[Action.Value])], - queue: java.util.Queue[Array[RecosHoseMessage]], - queuelimit: Semaphore, - catchupWriterIndex: Int, - ): BufferedEdgeWriter = { - val catchupEdgeCollector = new EdgeCollector { - var currentNumEdges = 0 - - override def addEdge(message: RecosHoseMessage): Unit = { - currentNumEdges += 1 - addEdgeToSegment(segments, message) - } - } - val maxEdges = segments.map(_._1.getMaxNumEdges).sum - - def runCondition(): Boolean = { - isRunning.get && ((maxEdges - catchupEdgeCollector.currentNumEdges) > bufferSize) - } - - BufferedEdgeWriter( - queue, - queuelimit, - catchupEdgeCollector, - statsReceiver.scope("catcher_" + catchupWriterIndex), - runCondition - ) - } -} - -private object UnifiedGraphWriterMulti { - - // The RecosEdgeProcessor is not thread-safe. Only use one thread to process each instance. - val ProcessorThreads = 1 - // Each one cache at most 1000 * bufferSize requests. - val MaxPendingRequests = 1000 - // Short Commit MS to reduce duplicate messages. - val CommitIntervalMs: Long = 5000 // 5 seconds, Default Kafka value. 
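A rough bound on buffered memory implied by the constants above (editor's arithmetic, assuming the bufferSize of 128 that Main passes later in this diff): the Semaphore(1024) created in initHose caps in-flight buffers, so at most 1024 * 128 messages sit in the shared queue, on top of the maxPendingRequests = 1000 * bufferSize each AtLeastOnceProcessor may hold.

    // permits * messages-per-buffer = worst-case queued messages
    val maxQueuedMessages = 1024 * 128 // = 131,072 RecosHoseMessages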
-} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD b/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD deleted file mode 100644 index 779703f07..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD +++ /dev/null @@ -1,67 +0,0 @@ -scala_library( - name = "user_tweet_entity_graph", - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/cascading:cascading-local", - "3rdparty/jvm/com/backtype:dfs-datastores", - "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/netflix/curator:curator-framework", - "3rdparty/jvm/com/twitter/graphjet", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/kafka:rosette-kafka", - "3rdparty/jvm/org/apache/thrift:libthrift", - "abdecider/src/main/scala", - "decider/src/main/scala", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/server", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-memcached/src/main/scala", - "finagle/finagle-stats/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util", - "scrooge/scrooge-core/src/main/scala", - "servo/repo/src/main/scala", - "servo/request/src/main/scala", - "servo/util/src/main/scala", - "src/resources/com/twitter/recos:decider", - "src/scala/com/twitter/recos/decider", - "src/scala/com/twitter/recos/graph_common", - "src/scala/com/twitter/recos/hose/common", - "src/scala/com/twitter/recos/model:recos-model", - "src/scala/com/twitter/recos/serviceapi", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos:recos-common-scala", - "src/thrift/com/twitter/recos:recos-internal-scala", - "src/thrift/com/twitter/recos/user_tweet_entity_graph:user_tweet_entity_graph-scala", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/model", - "twitter-server-internal/src/main/scala", - "twitter-server/server/src/main/scala", - "twitter-server/slf4j-jdk14/src/main/scala/com/twitter/server/logging", - "util/util-app/src/main/scala", - "util/util-hashing/src/main/scala", - "util/util-logging/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "bin", - basename = "user_tweet_entity_graph-server", - main = "com.twitter.recos.user_tweet_entity_graph.Main", - runtime_platform = "java11", - tags = [ - "bazel-compatible", - "known-to-fail-jira:SD-20990", - ], - dependencies = [ - ":user_tweet_entity_graph", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "twitter-server/slf4j-jdk14/src/main/scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD.docx new file mode 100644 index 000000000..54c54ab58 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.docx new file mode 100644 index 000000000..9bc958748 Binary files /dev/null and 
b/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.scala deleted file mode 100644 index 2f5806fea..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/EntitySocialProofRunner.scala +++ /dev/null @@ -1,167 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import java.util.Random -import com.twitter.concurrent.AsyncQueue -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.graphjet.algorithms.{ - RecommendationInfo, - RecommendationType => JavaRecommendationType -} -import com.twitter.graphjet.algorithms.socialproof.{ - NodeMetadataSocialProofGenerator, - NodeMetadataSocialProofResult, - NodeMetadataSocialProofRequest => SocialProofJavaRequest, - SocialProofResponse => SocialProofJavaResponse -} -import com.twitter.logging.Logger -import com.twitter.recos.model.SalsaQueryRunner.SalsaRunnerConfig -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - RecommendationType => ThriftRecommendationType, - RecommendationSocialProofRequest => SocialProofThriftRequest -} -import com.twitter.util.{Future, Try} -import it.unimi.dsi.fastutil.bytes.{Byte2ObjectArrayMap, Byte2ObjectMap} -import it.unimi.dsi.fastutil.ints.{IntOpenHashSet, IntSet} -import it.unimi.dsi.fastutil.longs.{Long2DoubleMap, Long2DoubleOpenHashMap} -import scala.collection.JavaConverters._ - -/** - * EntitySocialProofRunner creates a queue of reader threads, NodeMetadataSocialProofGenerator, - * and each one reads from the graph and computes social proofs.
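The "queue of reader threads" is really an object pool built on AsyncQueue: poll() leases a generator, offer() returns it. A minimal sketch of the same pattern with a placeholder resource type (editor's illustration, mirroring the poll()/ensure/offer() sequence in the runner below):

    import com.twitter.concurrent.AsyncQueue
    import com.twitter.util.Future

    class RunnerPoolSketch[R](make: () => R, size: Int) {
      private val pool = new AsyncQueue[R]
      (0 until size).foreach(_ => pool.offer(make()))

      // Lease a resource, run f, and always return the lease, even on failure.
      def withResource[T](f: R => T): Future[T] =
        pool.poll().map { r =>
          try f(r)
          finally pool.offer(r)
        }
    }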
- */ -class EntitySocialProofRunner( - graph: NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph, - salsaRunnerConfig: SalsaRunnerConfig, - statsReceiver: StatsReceiver) { - private val log: Logger = Logger() - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - private val socialProofSizeStat = stats.stat("socialProofSize") - - private val socialProofFailureCounter = stats.counter("failure") - private val pollCounter = stats.counter("poll") - private val pollTimeoutCounter = stats.counter("pollTimeout") - private val offerCounter = stats.counter("offer") - private val pollLatencyStat = stats.stat("pollLatency") - private val socialProofRunnerPool = initSocialProofRunnerPool() - - private def initSocialProofRunnerPool(): AsyncQueue[NodeMetadataSocialProofGenerator] = { - val socialProofQueue = new AsyncQueue[NodeMetadataSocialProofGenerator] - (0 until salsaRunnerConfig.numSalsaRunners).foreach { _ => - socialProofQueue.offer(new NodeMetadataSocialProofGenerator(graph)) - } - socialProofQueue - } - - /** - * Helper method to interpret the output of SocialProofJavaResponse - * - * @param socialProofResponse is the response from running NodeMetadataSocialProof - * @return a sequence of SocialProofResult - */ - private def transformSocialProofResponse( - socialProofResponse: Option[SocialProofJavaResponse] - ): Seq[RecommendationInfo] = { - socialProofResponse match { - case Some(response) => - val scalaResponse = response.getRankedRecommendations.asScala - scalaResponse.foreach { result => - socialProofSizeStat.add( - result.asInstanceOf[NodeMetadataSocialProofResult].getSocialProofSize) - } - scalaResponse.toSeq - case _ => Nil - } - } - - /** - * Helper method to run social proof computation and convert the results to Option - * - * @param socialProof is socialProof reader on bipartite graph - * @param request is the socialProof request - * @return is an option of SocialProofJavaResponse - */ - private def getSocialProofResponse( - socialProof: NodeMetadataSocialProofGenerator, - request: SocialProofJavaRequest, - random: Random - )( - implicit statsReceiver: StatsReceiver - ): Option[SocialProofJavaResponse] = { - val attempt = Try(socialProof.computeRecommendations(request, random)).onFailure { e => - socialProofFailureCounter.incr() - log.error(e, "SocialProof computation failed") - } - attempt.toOption - } - - /** - * Attempt to retrieve a NodeMetadataSocialProof thread from the runner pool - * to execute a socialProofRequest - */ - private def handleSocialProofRequest(socialProofRequest: SocialProofJavaRequest) = { - pollCounter.incr() - val t0 = System.currentTimeMillis() - socialProofRunnerPool.poll().map { entitySocialProof => - val pollTime = System.currentTimeMillis - t0 - pollLatencyStat.add(pollTime) - val socialProofResponse = Try { - if (pollTime < salsaRunnerConfig.timeoutSalsaRunner) { - val response = - getSocialProofResponse(entitySocialProof, socialProofRequest, new Random())( - statsReceiver - ) - transformSocialProofResponse(response) - } else { - // if we did not get a social proof in time, then fail fast here and immediately put it back - log.warning("socialProof polling timeout") - pollTimeoutCounter.incr() - throw new RuntimeException("socialProof poll timeout") - Nil - } - } ensure { - socialProofRunnerPool.offer(entitySocialProof) - offerCounter.incr() - } - socialProofResponse.toOption getOrElse Nil - } - } - - /** - * This apply() supports requests coming from the new social proof endpoint in UTEG that works for - * tweet social proof 
generation, as well as hashtag and url social proof generation. - * Currently this endpoint supports url social proof generation for Guide. - */ - def apply(request: SocialProofThriftRequest): Future[Seq[RecommendationInfo]] = { - val nodeMetadataTypeToIdsMap: Byte2ObjectMap[IntSet] = new Byte2ObjectArrayMap[IntSet]() - request.recommendationIdsForSocialProof.collect { - case (ThriftRecommendationType.Url, urlIds) => - // We must convert the Long url ids into type Int since the underlying library expects Int type metadata ids. - val urlIntIds = urlIds.map(_.toInt) - nodeMetadataTypeToIdsMap.put( - JavaRecommendationType.URL.getValue.toByte, - new IntOpenHashSet(urlIntIds.toArray) - ) - case (ThriftRecommendationType.Hashtag, hashtagIds) => - // We must convert the Long hashtag ids into type Int since the underlying library expects Int type metadata ids. - val hashtagIntIds = hashtagIds.map(_.toInt) - nodeMetadataTypeToIdsMap.put( - JavaRecommendationType.HASHTAG.getValue.toByte, - new IntOpenHashSet(hashtagIntIds.toArray) - ) - } - - val leftSeedNodes: Long2DoubleMap = new Long2DoubleOpenHashMap( - request.seedsWithWeights.keys.toArray, - request.seedsWithWeights.values.toArray - ) - - val socialProofRequest = new SocialProofJavaRequest( - nodeMetadataTypeToIdsMap, - leftSeedNodes, - UserTweetEdgeTypeMask.getUserTweetGraphSocialProofTypes(request.socialProofTypes) - ) - - handleSocialProofRequest(socialProofRequest) - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.docx new file mode 100644 index 000000000..fbad62f41 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.scala deleted file mode 100644 index ab1a44324..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/LoggingUserTweetEntityGraph.scala +++ /dev/null @@ -1,103 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.tracing.Trace -import com.twitter.logging.Logger -import com.twitter.recos.user_tweet_entity_graph.thriftscala._ -import com.twitter.util.Future - -trait LoggingUserTweetEntityGraph extends thriftscala.UserTweetEntityGraph.MethodPerEndpoint { - private[this] val accessLog = Logger("access") - - abstract override def recommendTweets( - request: RecommendTweetEntityRequest - ): Future[RecommendTweetEntityResponse] = { - val time = System.currentTimeMillis - super.recommendTweets(request) onSuccess { resp => - accessLog.info( - "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\tRecommendTweetResponse size: %s\t%s in %d ms" - .format( - time, - Trace.id.toString(), - request.requesterId, - request.displayLocation, - request.recommendationTypes, - request.maxResultsByType, - request.excludedTweetIds.map(_.take(5)), - request.excludedTweetIds.map(_.size), - request.seedsWithWeights.take(5), - request.seedsWithWeights.size, - request.maxTweetAgeInMillis, - request.maxUserSocialProofSize, - request.maxTweetSocialProofSize, - request.minUserSocialProofSizes, - request.tweetTypes, - request.socialProofTypes, - request.socialProofTypeUnions, - resp.recommendations.size, - resp.recommendations.take(20).toList map { - case UserTweetEntityRecommendationUnion.TweetRec(tweetRec) => - 
(tweetRec.tweetId, tweetRec.socialProofByType.map { case (k, v) => (k, v.size) }) - case UserTweetEntityRecommendationUnion.HashtagRec(hashtagRec) => - (hashtagRec.id, hashtagRec.socialProofByType.map { case (k, v) => (k, v.size) }) - case UserTweetEntityRecommendationUnion.UrlRec(urlRec) => - (urlRec.id, urlRec.socialProofByType.map { case (k, v) => (k, v.size) }) - case _ => - throw new Exception("Unsupported recommendation types") - }, - System.currentTimeMillis - time - ) - ) - } onFailure { exc => - accessLog.error( - "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s in %d ms".format( - time, - Trace.id.toString(), - request.requesterId, - request.displayLocation, - request.recommendationTypes, - request.maxResultsByType, - request.excludedTweetIds.map(_.take(5)), - request.excludedTweetIds.map(_.size), - request.seedsWithWeights.take(5), - request.seedsWithWeights.size, - request.maxTweetAgeInMillis, - request.maxUserSocialProofSize, - request.maxTweetSocialProofSize, - request.minUserSocialProofSizes, - request.tweetTypes, - request.socialProofTypes, - request.socialProofTypeUnions, - exc, - System.currentTimeMillis - time - ) - ) - } - } - - abstract override def findTweetSocialProofs( - request: SocialProofRequest - ): Future[SocialProofResponse] = { - val time = System.currentTimeMillis - super.findTweetSocialProofs(request) onSuccess { resp => - accessLog.info( - "%s\t%s\t%d\tResponse: %s\tin %d ms".format( - Trace.id.toString, - request.requesterId, - request.seedsWithWeights.size, - resp.socialProofResults.toList, - System.currentTimeMillis - time - ) - ) - } onFailure { exc => - accessLog.info( - "%s\t%s\t%d\tException: %s\tin %d ms".format( - Trace.id.toString, - request.requesterId, - request.seedsWithWeights.size, - exc, - System.currentTimeMillis - time - ) - ) - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.docx new file mode 100644 index 000000000..5a31b7202 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.scala deleted file mode 100644 index 9bd39d57e..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/Main.scala +++ /dev/null @@ -1,258 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.abdecider.ABDeciderFactory -import com.twitter.abdecider.LoggingABDecider -import com.twitter.app.Flag -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.ThriftMux -import com.twitter.finagle.http.HttpMuxer -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.mtls.server.MtlsStackServer._ -import com.twitter.finagle.mux.transport.OpportunisticTls -import com.twitter.finagle.thrift.ClientId -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.finatra.kafka.domain.KafkaGroupId -import com.twitter.finatra.kafka.domain.SeekStrategy -import com.twitter.finatra.kafka.serde.ScalaSerdes -import com.twitter.frigate.common.util.ElfOwlFilter -import com.twitter.frigate.common.util.ElfOwlFilter.ByLdapGroup -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.logging._ -import com.twitter.recos.decider.UserTweetEntityGraphDecider -import 
com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.graph_common.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.model.Constants -import com.twitter.recos.user_tweet_entity_graph.RecosConfig._ -import com.twitter.server.logging.{Logging => JDK14Logging} -import com.twitter.server.Deciderable -import com.twitter.server.TwitterServer -import com.twitter.thriftwebforms.MethodOptions -import com.twitter.thriftwebforms.TwitterServerThriftWebForms -import com.twitter.util.Await -import com.twitter.util.Duration -import java.net.InetSocketAddress -import java.util.concurrent.TimeUnit -import org.apache.kafka.clients.CommonClientConfigs -import org.apache.kafka.common.config.SaslConfigs -import org.apache.kafka.common.config.SslConfigs -import org.apache.kafka.common.security.auth.SecurityProtocol -import org.apache.kafka.common.serialization.StringDeserializer - -object Main extends TwitterServer with JDK14Logging with Deciderable { - profile => - - val shardId: Flag[Int] = flag("shardId", 0, "Shard ID") - val servicePort: Flag[InetSocketAddress] = - flag("service.port", new InetSocketAddress(10143), "Thrift service port") - val logDir: Flag[String] = flag("logdir", "recos", "Logging directory") - val numShards: Flag[Int] = flag("numShards", 1, "Number of shards for this service") - val truststoreLocation: Flag[String] = - flag[String]("truststore_location", "", "Truststore file location") - val hoseName: Flag[String] = - flag("hosename", "recos_injector_user_user", "the kafka stream used for incoming edges") - - val dataCenter: Flag[String] = flag("service.cluster", "atla", "Data Center") - val serviceRole: Flag[String] = flag("service.role", "Service Role") - val serviceEnv: Flag[String] = flag("service.env", "Service Env") - val serviceName: Flag[String] = flag("service.name", "Service Name") - - private val maxNumSegments = - flag("maxNumSegments", graphBuilderConfig.maxNumSegments, "the number of segments in the graph") - - private val statsReceiverWrapper = FinagleStatsReceiverWrapper(statsReceiver) - - lazy val clientId = ClientId(s"usertweetentitygraph.${serviceEnv()}") - - private val shutdownTimeout = flag( - "service.shutdownTimeout", - 5.seconds, - "Maximum amount of time to wait for pending requests to complete on shutdown" - ) - - // ********* logging ********** - - lazy val loggingLevel: Level = Level.INFO - lazy val recosLogPath: String = logDir() + "/recos.log" - lazy val graphLogPath: String = logDir() + "/graph.log" - lazy val accessLogPath: String = logDir() + "/access.log" - - override def loggerFactories: List[LoggerFactory] = - List( - LoggerFactory( - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = recosLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "graph", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = graphLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "access", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = accessLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = 
new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "client_event", - level = Some(loggingLevel), - useParents = false, - handlers = QueueingHandler( - maxQueueSize = 10000, - handler = ScribeHandler( - category = "client_event", - formatter = BareFormatter - ) - ) :: Nil - ) - ) - // ******** Decider ************* - - val graphDecider: UserTweetEntityGraphDecider = UserTweetEntityGraphDecider() - - // ********* ABdecider ********** - - val abDeciderYmlPath: String = "/usr/local/config/abdecider/abdecider.yml" - - val scribeLogger: Option[Logger] = Some(Logger.get("client_event")) - - val abDecider: LoggingABDecider = - ABDeciderFactory( - abDeciderYmlPath = abDeciderYmlPath, - scribeLogger = scribeLogger, - environment = Some("production") - ).buildWithLogging() - - // ********* Recos service ********** - - private def getKafkaBuilder() = { - FinagleKafkaConsumerBuilder[String, RecosHoseMessage]() - .dest("/s/kafka/recommendations:kafka-tls") - .groupId(KafkaGroupId(f"user_tweet_entity_graph-${shardId()}%06d")) - .keyDeserializer(new StringDeserializer) - .valueDeserializer(ScalaSerdes.Thrift[RecosHoseMessage].deserializer) - .seekStrategy(SeekStrategy.REWIND) - .rewindDuration(20.hours) - .withConfig(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, SecurityProtocol.SASL_SSL.toString) - .withConfig(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, truststoreLocation()) - .withConfig(SaslConfigs.SASL_MECHANISM, SaslConfigs.GSSAPI_MECHANISM) - .withConfig(SaslConfigs.SASL_KERBEROS_SERVICE_NAME, "kafka") - .withConfig(SaslConfigs.SASL_KERBEROS_SERVER_NAME, "kafka") - } - def main(): Unit = { - log.info("building graph with maxNumSegments = " + profile.maxNumSegments()) - val graph = NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder( - graphBuilderConfig.copy(maxNumSegments = profile.maxNumSegments()), - statsReceiverWrapper - ) - - val kafkaConfigBuilder = getKafkaBuilder() - - val graphWriter = - UserTweetEntityGraphWriter( - shardId().toString, - serviceEnv(), - hoseName(), - 128, // keep the original setting. - kafkaConfigBuilder, - clientId.name, - statsReceiver, - ) - graphWriter.initHose(graph) - - val tweetRecsRunner = new TweetRecommendationsRunner( - graph, - Constants.salsaRunnerConfig, - statsReceiverWrapper - ) - - val tweetSocialProofRunner = new TweetSocialProofRunner( - graph, - Constants.salsaRunnerConfig, - statsReceiver - ) - - val entitySocialProofRunner = new EntitySocialProofRunner( - graph, - Constants.salsaRunnerConfig, - statsReceiver - ) - - val recommendationHandler = new RecommendationHandler(tweetRecsRunner, statsReceiver) - - /* - * Old social proof handler retained to support old tweet social proof endpoint. 
- * Future clients should utilize the findRecommendationSocialProofs endpoint which will use - * the broader "SocialProofHandler" - */ - val tweetSocialProofHandler = new TweetSocialProofHandler( - tweetSocialProofRunner, - graphDecider, - statsReceiver - ) - val socialProofHandler = new SocialProofHandler( - tweetSocialProofRunner, - entitySocialProofRunner, - graphDecider, - statsReceiver - ) - val userTweetEntityGraph = new UserTweetEntityGraph( - recommendationHandler, - tweetSocialProofHandler, - socialProofHandler - ) with LoggingUserTweetEntityGraph - - // For MutualTLS - val serviceIdentifier = ServiceIdentifier( - role = serviceRole(), - service = serviceName(), - environment = serviceEnv(), - zone = dataCenter() - ) - log.info(s"ServiceIdentifier = ${serviceIdentifier.toString}") - - val thriftServer = ThriftMux.server - .withOpportunisticTls(OpportunisticTls.Required) - .withMutualTls(serviceIdentifier) - .serveIface(servicePort(), userTweetEntityGraph) - - log.info("clientid: " + clientId.toString) - log.info("servicePort: " + servicePort().toString) - - log.info("adding shutdown hook") - onExit { - graphWriter.shutdown() - thriftServer.close(shutdownTimeout().fromNow) - } - log.info("added shutdown hook") - - // Wait on the thriftServer so that shutdownTimeout is respected. - Await.result(thriftServer) - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/README.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/README.docx new file mode 100644 index 000000000..08d742c9d Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/README.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/README.md b/src/scala/com/twitter/recos/user_tweet_entity_graph/README.md deleted file mode 100644 index 39af44deb..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# UserTweetEntityGraph (UTEG) - -## What is it -User Tweet Entity Graph (UTEG) is a Finagle thrift service built on the GraphJet framework. It maintains a graph of user-tweet relationships and serves user recommendations based on traversals in this graph. - -## How is it used on Twitter -UTEG generates the "XXX Liked" out-of-network tweets seen on Twitter's Home Timeline. -The core idea behind UTEG is collaborative filtering. UTEG takes a user's weighted follow graph (i.e. a list of weighted userIds) as input, -performs efficient traversal & aggregation, and returns the top tweets, weighted by the number of input users who engaged with each tweet -as well as by those users' weights. - -UTEG is a stateful service and relies on a Kafka stream to ingest & persist state. It maintains in-memory user engagements over the past -24-48 hours. Older events are dropped and GC'ed. - -For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose high-performance in-memory storage engine.
-- https://github.com/twitter/GraphJet -- http://www.vldb.org/pvldb/vol9/p1281-sharma.pdf diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.docx new file mode 100644 index 000000000..fa82c810e Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.scala deleted file mode 100644 index 80749cd76..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecommendationHandler.scala +++ /dev/null @@ -1,78 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.util.StatsUtil -import com.twitter.graphjet.algorithms.RecommendationType -import com.twitter.graphjet.algorithms.counting.tweet.TweetMetadataRecommendationInfo -import com.twitter.graphjet.algorithms.counting.tweet.TweetRecommendationInfo -import com.twitter.recos.user_tweet_entity_graph.thriftscala._ -import com.twitter.recos.util.Stats -import com.twitter.servo.request._ -import com.twitter.util.Future - -/** - * Implementation of the Thrift-defined service interface. - * -* A wrapper of magicRecsRunner. - */ -class RecommendationHandler( - tweetRecsRunner: TweetRecommendationsRunner, - statsReceiver: StatsReceiver) - extends RequestHandler[RecommendTweetEntityRequest, RecommendTweetEntityResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - private val socialProofHydrator = new SocialProofHydrator(stats) - - override def apply(request: RecommendTweetEntityRequest): Future[RecommendTweetEntityResponse] = { - val scopedStats: StatsReceiver = stats.scope(request.displayLocation.toString) - - StatsUtil.trackBlockStats(scopedStats) { - val candidatesFuture = tweetRecsRunner.apply(request) - - candidatesFuture.map { candidates => - if (candidates.isEmpty) scopedStats.counter(Stats.EmptyResult).incr() - else scopedStats.counter(Stats.Served).incr(candidates.size) - - RecommendTweetEntityResponse(candidates.flatMap { - _ match { - case tweetRec: TweetRecommendationInfo => - Some( - UserTweetEntityRecommendationUnion.TweetRec( - TweetRecommendation( - tweetRec.getRecommendation, - tweetRec.getWeight, - socialProofHydrator.addTweetSocialProofByType(tweetRec), - socialProofHydrator.addTweetSocialProofs(tweetRec) - ) - ) - ) - case tweetMetadataRec: TweetMetadataRecommendationInfo => - if (tweetMetadataRec.getRecommendationType == RecommendationType.HASHTAG) { - Some( - UserTweetEntityRecommendationUnion.HashtagRec( - HashtagRecommendation( - tweetMetadataRec.getRecommendation, - tweetMetadataRec.getWeight, - socialProofHydrator.addMetadataSocialProofByType(tweetMetadataRec) - ) - ) - ) - } else if (tweetMetadataRec.getRecommendationType == RecommendationType.URL) { - Some( - UserTweetEntityRecommendationUnion.UrlRec( - UrlRecommendation( - tweetMetadataRec.getRecommendation, - tweetMetadataRec.getWeight, - socialProofHydrator.addMetadataSocialProofByType(tweetMetadataRec) - ) - ) - ) - } else { - None: Option[UserTweetEntityRecommendationUnion] - } - case _ => None - } - }) - } - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.docx new file mode 100644 index 
000000000..2d7ebf53d Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.scala deleted file mode 100644 index c37d2911d..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/RecosConfig.scala +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.graphjet.algorithms.RecommendationType -import com.twitter.recos.model.Constants -import com.twitter.recos.graph_common.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.GraphBuilderConfig - -/** - * The class holds all the config parameters for recos graph. - */ -object RecosConfig { - val maxNumSegments: Int = 8 // this value will be overwritten by a parameter from profile config - val maxNumEdgesPerSegment: Int = 1 << 27 // 134M edges per segment - val expectedNumLeftNodes: Int = 1 << 24 // 16M nodes - val expectedMaxLeftDegree: Int = 64 - val leftPowerLawExponent: Double = 16.0 // steep power law as most nodes will have a small degree - val expectedNumRightNodes: Int = 1 << 24 // 16M nodes - val numRightNodeMetadataTypes: Int = - RecommendationType.METADATASIZE.getValue // two node metadata types: hashtag and url - - val graphBuilderConfig = GraphBuilderConfig( - maxNumSegments = maxNumSegments, - maxNumEdgesPerSegment = maxNumEdgesPerSegment, - expectedNumLeftNodes = expectedNumLeftNodes, - expectedMaxLeftDegree = expectedMaxLeftDegree, - leftPowerLawExponent = leftPowerLawExponent, - expectedNumRightNodes = expectedNumRightNodes, - numRightNodeMetadataTypes = numRightNodeMetadataTypes, - edgeTypeMask = new UserTweetEdgeTypeMask() - ) - - val maxUserSocialProofSize: Int = 10 - val maxTweetSocialProofSize: Int = 10 - val maxTweetAgeInMillis: Long = 24 * 60 * 60 * 1000 - val maxEngagementAgeInMillis: Long = Long.MaxValue - - println("RecosConfig - maxNumSegments " + maxNumSegments) - println("RecosConfig - maxNumEdgesPerSegment " + maxNumEdgesPerSegment) - println("RecosConfig - expectedNumLeftNodes " + expectedNumLeftNodes) - println("RecosConfig - expectedMaxLeftDegree " + expectedMaxLeftDegree) - println("RecosConfig - leftPowerLawExponent " + leftPowerLawExponent) - println("RecosConfig - expectedNumRightNodes " + expectedNumRightNodes) - println("RecosConfig - numRightNodeMetadataTypes " + numRightNodeMetadataTypes) - println("RecosConfig - salsaRunnerConfig " + Constants.salsaRunnerConfig) -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.docx new file mode 100644 index 000000000..b457a2d9b Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.scala deleted file mode 100644 index 8d74cbe37..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHandler.scala +++ /dev/null @@ -1,165 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.algorithms.{ - RecommendationInfo, - RecommendationType => JavaRecommendationType -} -import com.twitter.graphjet.algorithms.socialproof.{ - NodeMetadataSocialProofResult => 
EntitySocialProofJavaResult, - SocialProofResult => SocialProofJavaResult -} -import com.twitter.recos.decider.UserTweetEntityGraphDecider -import com.twitter.recos.util.Stats -import com.twitter.recos.util.Stats._ -import com.twitter.recos.recos_common.thriftscala.{SocialProofType => SocialProofThriftType} -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - HashtagRecommendation, - TweetRecommendation, - UrlRecommendation, - UserTweetEntityRecommendationUnion, - RecommendationSocialProofRequest => SocialProofThriftRequest, - RecommendationSocialProofResponse => SocialProofThriftResponse, - RecommendationType => ThriftRecommendationType -} -import com.twitter.servo.request.RequestHandler -import com.twitter.util.{Future, Try} -import scala.collection.JavaConverters._ - -class SocialProofHandler( - tweetSocialProofRunner: TweetSocialProofRunner, - entitySocialProofRunner: EntitySocialProofRunner, - decider: UserTweetEntityGraphDecider, - statsReceiver: StatsReceiver) - extends RequestHandler[SocialProofThriftRequest, SocialProofThriftResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - private def getThriftSocialProof( - entitySocialProof: EntitySocialProofJavaResult - ): Map[SocialProofThriftType, Map[Long, Seq[Long]]] = { - val socialProofAttempt = Try(entitySocialProof.getSocialProof) - .onFailure { e => - stats.counter(e.getClass.getSimpleName).incr() - } - - socialProofAttempt.toOption match { - case Some(socialProof) if socialProof.isEmpty => - stats.counter(Stats.EmptyResult).incr() - Map.empty[SocialProofThriftType, Map[Long, Seq[Long]]] - case Some(socialProof) if !socialProof.isEmpty => - socialProof.asScala.map { - case (socialProofType, socialProofUserToTweetsMap) => - val userToTweetsSocialProof = socialProofUserToTweetsMap.asScala.map { - case (socialProofUser, connectingTweets) => - (socialProofUser.toLong, connectingTweets.asScala.map(Long2long).toSeq) - }.toMap - (SocialProofThriftType(socialProofType.toInt), userToTweetsSocialProof) - }.toMap - case _ => - Map.empty[SocialProofThriftType, Map[Long, Seq[Long]]] - } - } - - private def getThriftSocialProof( - tweetSocialProof: SocialProofJavaResult - ): Map[SocialProofThriftType, Seq[Long]] = { - val socialProofAttempt = Try(tweetSocialProof.getSocialProof) - .onFailure { e => - stats.counter(e.getClass.getSimpleName).incr() - } - - socialProofAttempt.toOption match { - case Some(socialProof) if socialProof.isEmpty => - stats.counter(Stats.EmptyResult).incr() - Map.empty[SocialProofThriftType, Seq[Long]] - case Some(socialProof) if !socialProof.isEmpty => - socialProof.asScala.map { - case (socialProofType, connectingUsers) => - ( - SocialProofThriftType(socialProofType.toInt), - connectingUsers.asScala.map { Long2long }.toSeq) - }.toMap - case _ => - Map.empty[SocialProofThriftType, Seq[Long]] - } - } - - private def getEntitySocialProof( - request: SocialProofThriftRequest - ): Future[Seq[UserTweetEntityRecommendationUnion]] = { - val socialProofsFuture = entitySocialProofRunner(request) - - socialProofsFuture.map { socialProofs: Seq[RecommendationInfo] => - stats.counter(Stats.Served).incr(socialProofs.size) - socialProofs.flatMap { entitySocialProof: RecommendationInfo => - val entitySocialProofJavaResult = - entitySocialProof.asInstanceOf[EntitySocialProofJavaResult] - if (entitySocialProofJavaResult.getRecommendationType == JavaRecommendationType.URL) { - Some( - UserTweetEntityRecommendationUnion.UrlRec( - UrlRecommendation( - 
entitySocialProofJavaResult.getNodeMetadataId, - entitySocialProofJavaResult.getWeight, - getThriftSocialProof(entitySocialProofJavaResult) - ) - ) - ) - } else if (entitySocialProofJavaResult.getRecommendationType == JavaRecommendationType.HASHTAG) { - Some( - UserTweetEntityRecommendationUnion.HashtagRec( - HashtagRecommendation( - entitySocialProofJavaResult.getNodeMetadataId, - entitySocialProofJavaResult.getWeight, - getThriftSocialProof(entitySocialProofJavaResult) - ) - ) - ) - } else { - None - } - } - } - } - - private def getTweetSocialProof( - request: SocialProofThriftRequest - ): Future[Seq[UserTweetEntityRecommendationUnion]] = { - val socialProofsFuture = tweetSocialProofRunner(request) - - socialProofsFuture.map { socialProofs: Seq[RecommendationInfo] => - stats.counter(Stats.Served).incr(socialProofs.size) - socialProofs.flatMap { tweetSocialProof: RecommendationInfo => - val tweetSocialProofJavaResult = tweetSocialProof.asInstanceOf[SocialProofJavaResult] - Some( - UserTweetEntityRecommendationUnion.TweetRec( - TweetRecommendation( - tweetSocialProofJavaResult.getNode, - tweetSocialProofJavaResult.getWeight, - getThriftSocialProof(tweetSocialProofJavaResult) - ) - ) - ) - } - } - } - - def apply(request: SocialProofThriftRequest): Future[SocialProofThriftResponse] = { - trackFutureBlockStats(stats) { - val recommendationsWithSocialProofFut = Future - .collect { - request.recommendationIdsForSocialProof.keys.map { - case ThriftRecommendationType.Tweet if decider.tweetSocialProof => - getTweetSocialProof(request) - case (ThriftRecommendationType.Url | ThriftRecommendationType.Hashtag) - if decider.entitySocialProof => - getEntitySocialProof(request) - case _ => - Future.Nil - }.toSeq - }.map(_.flatten) - recommendationsWithSocialProofFut.map { recommendationsWithSocialProof => - SocialProofThriftResponse(recommendationsWithSocialProof) - } - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.docx new file mode 100644 index 000000000..70885dae4 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.scala deleted file mode 100644 index ed44de053..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/SocialProofHydrator.scala +++ /dev/null @@ -1,111 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.algorithms.counting.tweet.{ - TweetMetadataRecommendationInfo, - TweetRecommendationInfo -} -import com.twitter.recos.recos_common.thriftscala.{SocialProof, SocialProofType} - -import scala.collection.JavaConverters._ - -class SocialProofHydrator(statsReceiver: StatsReceiver) { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - private val socialProofsDup = stats.counter("socialProofsDup") - private val socialProofsUni = stats.counter("socialProofsUni") - private val socialProofByTypeDup = stats.counter("socialProofByTypeDup") - private val socialProofByTypeUni = stats.counter("socialProofByTypeUni") - - // If the social proof type is favorite, there are cases that one user favs, unfavs and then favs the same tweet again. - // In this case, UTEG only returns one valid social proof. 
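As an aside, a standalone toy illustration of the first-occurrence dedup this comment describes (the real logic lives in getSocialProofs below; data here is made up):

```scala
// User 7 faved, unfaved, then faved the tweet again: keep only the
// first (user, metadata) pair per user, preserving order.
val users    = Seq(7L, 8L, 7L)
val metadata = Seq(11L, 22L, 33L)
val deduped = users.zip(metadata).foldLeft(Seq.empty[(Long, Long)]) { (acc, next) =>
  if (acc.exists(_._1 == next._1)) acc else acc :+ next
}
assert(deduped == Seq((7L, 11L), (8L, 22L))) // user 7's second engagement is dropped
```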
Note that GraphJet library compares the number of unique users - // with the minSocialProofThreshold, so the threshold checking logic is correct. - // If the social proof type is reply or quote, there are valid cases that one user replies the same tweet multiple times. - // GraphJet does not handle this deduping because this is Twitter specific logic. - def getSocialProofs( - socialProofType: SocialProofType, - users: Seq[Long], - metadata: Seq[Long] - ): Seq[SocialProof] = { - if (socialProofType == SocialProofType.Favorite && users.size > 1 && users.size != users.distinct.size) { - socialProofsDup.incr() - val unique = users - .zip(metadata) - .foldLeft[Seq[(Long, Long)]](Nil) { (list, next) => - { - val test = list find { _._1 == next._1 } - if (test.isEmpty) next +: list else list - } - } - .reverse - unique.map { case (user, data) => SocialProof(user, Some(data)) } - } else { - socialProofsUni.incr() - users.zip(metadata).map { case (user, data) => SocialProof(user, Some(data)) } - } - - } - - // Extract and dedup social proofs from GraphJet. Only Favorite based social proof needs to dedup. - // Return the social proofs (userId, metadata) pair in SocialProof thrift objects. - def addTweetSocialProofs( - tweet: TweetRecommendationInfo - ): Option[Map[SocialProofType, Seq[SocialProof]]] = { - Some( - tweet.getSocialProof.asScala.map { - case (socialProofType, socialProof) => - val socialProofThriftType = SocialProofType(socialProofType.toByte) - ( - socialProofThriftType, - getSocialProofs( - socialProofThriftType, - socialProof.getConnectingUsers.asScala.map(_.toLong), - socialProof.getMetadata.asScala.map(_.toLong) - ) - ) - }.toMap - ) - } - - def getSocialProofs(users: Seq[Long]): Seq[Long] = { - if (users.size > 1) { - val distinctUsers = users.distinct - if (users.size != distinctUsers.size) { - socialProofByTypeDup.incr() - } else { - socialProofByTypeUni.incr() - } - distinctUsers - } else { - socialProofByTypeUni.incr() - users - } - } - - // Extract and dedup social proofs from GraphJet. All social proof types need to dedup. - // Return the userId social proofs without metadata. - def addTweetSocialProofByType(tweet: TweetRecommendationInfo): Map[SocialProofType, Seq[Long]] = { - tweet.getSocialProof.asScala.map { - case (socialProofType, socialProof) => - ( - SocialProofType(socialProofType.toByte), - getSocialProofs(socialProof.getConnectingUsers.asScala.map(_.toLong)) - ) - }.toMap - } - - // The Hashtag and URL Social Proof. Dedup is not necessary. 
- def addMetadataSocialProofByType( - tweetMetadataRec: TweetMetadataRecommendationInfo - ): Map[SocialProofType, Map[Long, Seq[Long]]] = { - tweetMetadataRec.getSocialProof.asScala.map { - case (socialProofType, socialProof) => - ( - SocialProofType(socialProofType.toByte), - socialProof.asScala.map { - case (authorId, tweetIds) => - (authorId.toLong, tweetIds.asScala.map(_.toLong)) - }.toMap) - }.toMap - } - -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.docx new file mode 100644 index 000000000..3c0dbec2c Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.scala deleted file mode 100644 index 428c2dd6d..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetRecommendationsRunner.scala +++ /dev/null @@ -1,322 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import java.util.Random -import com.twitter.concurrent.AsyncQueue -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.algorithms._ -import com.twitter.graphjet.algorithms.filters._ -import com.twitter.graphjet.algorithms.counting.TopSecondDegreeByCountResponse -import com.twitter.graphjet.algorithms.counting.tweet.TopSecondDegreeByCountForTweet -import com.twitter.graphjet.algorithms.counting.tweet.TopSecondDegreeByCountRequestForTweet -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedMultiSegmentBipartiteGraph -import com.twitter.logging.Logger -import com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.model.SalsaQueryRunner.SalsaRunnerConfig -import com.twitter.recos.recos_common.thriftscala.SocialProofType -import com.twitter.recos.user_tweet_entity_graph.thriftscala.RecommendTweetEntityRequest -import com.twitter.recos.user_tweet_entity_graph.thriftscala.TweetEntityDisplayLocation -import com.twitter.recos.user_tweet_entity_graph.thriftscala.TweetType -import com.twitter.recos.util.Stats.trackBlockStats -import com.twitter.util.Future -import com.twitter.util.JavaTimer -import com.twitter.util.Try -import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap -import it.unimi.dsi.fastutil.longs.LongOpenHashSet -import scala.collection.JavaConverters._ - -import com.twitter.graphjet.algorithms.RecommendationType -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - RecommendationType => ThriftRecommendationType -} -import scala.collection.Map -import scala.collection.Set - -object TweetRecommendationsRunner { - private val DefaultTweetTypes: Seq[TweetType] = - Seq(TweetType.Regular, TweetType.Summary, TweetType.Photo, TweetType.Player) - private val DefaultF1ExactSocialProofSize = 1 - private val DefaultRareTweetRecencyMillis: Long = 7.days.inMillis - - /** - * Map valid social proof types specified by clients to an array of bytes. If clients do not - * specify any social proof type unions in thrift, it will return an empty set by default. 
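A small sketch of that defaulting behavior, with plain Ints standing in for the thrift SocialProofType (names are illustrative): an absent socialProofTypeUnions collapses to an empty set, unlike the per-type helper in UserTweetEdgeTypeMask, which defaults to all types.

```scala
// Option[Set[Seq[SocialProofType]]] -> Set[Array[Byte]], defaulting to empty.
val unions: Option[Set[Seq[Int]]] = None
val asBytes: Set[Array[Byte]] =
  unions.map(_.map(_.map(_.toByte).toArray)).getOrElse(Set.empty)
assert(asBytes.isEmpty)
```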
- */ - private def getSocialProofTypeUnions( - socialProofTypeUnions: Option[Set[Seq[SocialProofType]]] - ): Set[Array[Byte]] = { - socialProofTypeUnions - .map { - _.map { - _.map { - _.getValue.toByte - }.toArray - } - } - .getOrElse(Set.empty) - } - - private def getRecommendationTypes( - recommendationTypes: Seq[ThriftRecommendationType] - ): Set[RecommendationType] = { - recommendationTypes.flatMap { - _ match { - case ThriftRecommendationType.Tweet => Some(RecommendationType.TWEET) - case ThriftRecommendationType.Hashtag => Some(RecommendationType.HASHTAG) - case ThriftRecommendationType.Url => Some(RecommendationType.URL) - case _ => - throw new Exception("Unmatched Recommendation Type in getRecommendationTypes") - } - }.toSet - } - - private def convertThriftEnumsToJavaEnums( - maxResults: Option[Map[ThriftRecommendationType, Int]] - ): Map[RecommendationType, Integer] = { - maxResults - .map { - _.flatMap { - _ match { - case (ThriftRecommendationType.Tweet, v) => Some((RecommendationType.TWEET, v: Integer)) - case (ThriftRecommendationType.Hashtag, v) => - Some((RecommendationType.HASHTAG, v: Integer)) - case (ThriftRecommendationType.Url, v) => Some((RecommendationType.URL, v: Integer)) - case _ => - throw new Exception("Unmatched Recommendation Type in convertThriftEnumsToJavaEnums") - } - } - } - .getOrElse(Map.empty) - } - -} - -/** - * The MagicRecsRunner creates a queue of reader threads, MagicRecs, and each one reads from the - * graph and computes recommendations. - */ -class TweetRecommendationsRunner( - bipartiteGraph: NodeMetadataLeftIndexedMultiSegmentBipartiteGraph, - salsaRunnerConfig: SalsaRunnerConfig, - statsReceiverWrapper: FinagleStatsReceiverWrapper) { - - import TweetRecommendationsRunner._ - - private val log: Logger = Logger() - - private val stats = statsReceiverWrapper.statsReceiver.scope(this.getClass.getSimpleName) - private val magicRecsFailureCounter = stats.counter("failure") - private val pollCounter = stats.counter("poll") - private val pollTimeoutCounter = stats.counter("pollTimeout") - private val offerCounter = stats.counter("offer") - private val pollLatencyStat = stats.stat("pollLatency") - - private val magicRecsQueue = new AsyncQueue[TopSecondDegreeByCountForTweet] - (0 until salsaRunnerConfig.numSalsaRunners).foreach { _ => - magicRecsQueue.offer( - new TopSecondDegreeByCountForTweet( - bipartiteGraph, - salsaRunnerConfig.expectedNodesToHitInSalsa, - statsReceiverWrapper.scope(this.getClass.getSimpleName) - ) - ) - } - - private implicit val timer: JavaTimer = new JavaTimer(true) - - private def getBaseFilters( - staleTweetDuration: Long, - tweetTypes: Seq[TweetType] - ) = { - List( - // Keep RecentTweetFilter first since it's the cheapest - new RecentTweetFilter(staleTweetDuration, statsReceiverWrapper), - new TweetCardFilter( - tweetTypes.contains(TweetType.Regular), - tweetTypes.contains(TweetType.Summary), - tweetTypes.contains(TweetType.Photo), - tweetTypes.contains(TweetType.Player), - false, // no promoted tweets - statsReceiverWrapper - ), - new DirectInteractionsFilter(bipartiteGraph, statsReceiverWrapper), - new RequestedSetFilter(statsReceiverWrapper), - new SocialProofTypesFilter(statsReceiverWrapper) - ) - } - - /** - * Helper method to interpret the output of MagicRecs graph - * - * @param magicRecsResponse is the response from running MagicRecs - * @return a sequence of candidate ids, with score and list of social proofs - */ - private def transformMagicRecsResponse( - magicRecsResponse: Option[TopSecondDegreeByCountResponse] 
- ): Seq[RecommendationInfo] = { - val responses = magicRecsResponse match { - case Some(response) => response.getRankedRecommendations.asScala.toSeq - case _ => Nil - } - responses - } - - /** - * Helper function to determine different post-process filtering logic in GraphJet, - * based on display locations - */ - private def getFiltersByDisplayLocations( - displayLocation: TweetEntityDisplayLocation, - whitelistAuthors: LongOpenHashSet, - blacklistAuthors: LongOpenHashSet, - validSocialProofs: Array[Byte] - ) = { - displayLocation match { - case TweetEntityDisplayLocation.MagicRecsF1 => - Seq( - new ANDFilters( - List[ResultFilter]( - new TweetAuthorFilter( - bipartiteGraph, - whitelistAuthors, - new LongOpenHashSet(), - statsReceiverWrapper), - new ExactUserSocialProofSizeFilter( - DefaultF1ExactSocialProofSize, - validSocialProofs, - statsReceiverWrapper - ) - ).asJava, - statsReceiverWrapper - ), - // Blacklist filter must be applied separately from F1's AND filter chain - new TweetAuthorFilter( - bipartiteGraph, - new LongOpenHashSet(), - blacklistAuthors, - statsReceiverWrapper) - ) - case TweetEntityDisplayLocation.MagicRecsRareTweet => - Seq( - new TweetAuthorFilter( - bipartiteGraph, - whitelistAuthors, - blacklistAuthors, - statsReceiverWrapper), - new RecentEdgeMetadataFilter( - DefaultRareTweetRecencyMillis, - UserTweetEdgeTypeMask.Tweet.id.toByte, - statsReceiverWrapper - ) - ) - case _ => - Seq( - new TweetAuthorFilter( - bipartiteGraph, - whitelistAuthors, - blacklistAuthors, - statsReceiverWrapper)) - } - } - - /** - * Helper method to run salsa computation and convert the results to Option - * - * @param magicRecs is magicRecs reader on bipartite graph - * @param magicRecsRequest is the magicRecs request - * @return is an option of MagicRecsResponse - */ - private def getMagicRecsResponse( - magicRecs: TopSecondDegreeByCountForTweet, - magicRecsRequest: TopSecondDegreeByCountRequestForTweet - )( - implicit statsReceiver: StatsReceiver - ): Option[TopSecondDegreeByCountResponse] = { - trackBlockStats(stats) { - val random = new Random() - // compute recs -- need to catch and print exceptions here otherwise they are swallowed - val magicRecsAttempt = - Try(magicRecs.computeRecommendations(magicRecsRequest, random)).onFailure { e => - magicRecsFailureCounter.incr() - log.error(e, "MagicRecs computation failed") - } - magicRecsAttempt.toOption - } - } - - private def getMagicRecsRequest( - request: RecommendTweetEntityRequest - ): TopSecondDegreeByCountRequestForTweet = { - val requesterId = request.requesterId - val leftSeedNodes = new Long2DoubleOpenHashMap( - request.seedsWithWeights.keys.toArray, - request.seedsWithWeights.values.toArray - ) - val tweetsToExcludeArray = new LongOpenHashSet(request.excludedTweetIds.getOrElse(Nil).toArray) - val staleTweetDuration = request.maxTweetAgeInMillis.getOrElse(RecosConfig.maxTweetAgeInMillis) - val staleEngagementDuration = - request.maxEngagementAgeInMillis.getOrElse(RecosConfig.maxEngagementAgeInMillis) - val tweetTypes = request.tweetTypes.getOrElse(DefaultTweetTypes) - val tweetAuthors = new LongOpenHashSet(request.tweetAuthors.getOrElse(Nil).toArray) - val excludedTweetAuthors = new LongOpenHashSet( - request.excludedTweetAuthors.getOrElse(Nil).toArray) - val validSocialProofs = - UserTweetEdgeTypeMask.getUserTweetGraphSocialProofTypes(request.socialProofTypes) - - val resultFilterChain = new ResultFilterChain( - ( - getBaseFilters(staleTweetDuration, tweetTypes) ++ - getFiltersByDisplayLocations( - displayLocation = 
request.displayLocation, - whitelistAuthors = tweetAuthors, - blacklistAuthors = excludedTweetAuthors, - validSocialProofs = validSocialProofs - ) - ).asJava - ) - - new TopSecondDegreeByCountRequestForTweet( - requesterId, - leftSeedNodes, - tweetsToExcludeArray, - getRecommendationTypes(request.recommendationTypes).asJava, - convertThriftEnumsToJavaEnums(request.maxResultsByType).asJava, - UserTweetEdgeTypeMask.SIZE, - request.maxUserSocialProofSize.getOrElse(RecosConfig.maxUserSocialProofSize), - request.maxTweetSocialProofSize.getOrElse(RecosConfig.maxTweetSocialProofSize), - convertThriftEnumsToJavaEnums(request.minUserSocialProofSizes).asJava, - validSocialProofs, - staleTweetDuration, - staleEngagementDuration, - resultFilterChain, - getSocialProofTypeUnions(request.socialProofTypeUnions).asJava - ) - } - - def apply(request: RecommendTweetEntityRequest): Future[Seq[RecommendationInfo]] = { - pollCounter.incr() - val t0 = System.currentTimeMillis - magicRecsQueue.poll().map { magicRecs => - val pollTime = System.currentTimeMillis - t0 - pollLatencyStat.add(pollTime) - val magicRecsResponse = Try { - if (pollTime < salsaRunnerConfig.timeoutSalsaRunner) { - val magicRecsRequest = getMagicRecsRequest(request) - transformMagicRecsResponse( - getMagicRecsResponse(magicRecs, magicRecsRequest)(statsReceiverWrapper.statsReceiver) - ) - } else { - // if we did not get a magicRecs in time, then fail fast here and immediately put it back - log.warning("magicRecsQueue polling timeout") - pollTimeoutCounter.incr() - throw new RuntimeException("magicRecs poll timeout") - Nil - } - } ensure { - magicRecsQueue.offer(magicRecs) - offerCounter.incr() - } - magicRecsResponse.toOption getOrElse Nil - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.docx new file mode 100644 index 000000000..cf3fcc236 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.scala deleted file mode 100644 index 6ab493589..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofHandler.scala +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.util.StatsUtil -import com.twitter.graphjet.algorithms.RecommendationInfo -import com.twitter.graphjet.algorithms.socialproof.{SocialProofResult => SocialProofJavaResult} -import com.twitter.recos.decider.UserTweetEntityGraphDecider -import com.twitter.recos.util.Stats -import com.twitter.recos.util.Stats._ -import com.twitter.recos.recos_common.thriftscala.{SocialProofType => SocialProofThriftType} -import com.twitter.recos.user_tweet_entity_graph.thriftscala.TweetRecommendation -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - SocialProofRequest => SocialProofThriftRequest -} -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - SocialProofResponse => SocialProofThriftResponse -} -import com.twitter.servo.request.RequestHandler -import com.twitter.util.Future -import scala.collection.JavaConverters._ - -class TweetSocialProofHandler( - tweetSocialProofRunner: TweetSocialProofRunner, - decider: UserTweetEntityGraphDecider, - statsReceiver: StatsReceiver) - 
extends RequestHandler[SocialProofThriftRequest, SocialProofThriftResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - def getThriftSocialProof( - tweetSocialProof: SocialProofJavaResult - ): Map[SocialProofThriftType, Seq[Long]] = { - Option(tweetSocialProof.getSocialProof) match { - case Some(socialProof) if socialProof.isEmpty => - stats.counter(Stats.EmptyResult).incr() - Map.empty[SocialProofThriftType, Seq[Long]] - case Some(socialProof) if !socialProof.isEmpty => - socialProof.asScala.map { - case (socialProofType, connectingUsers) => - ( - SocialProofThriftType(socialProofType.toInt), - connectingUsers.asScala.map { Long2long }.toSeq) - }.toMap - case _ => - throw new Exception("TweetSocialProofHandler gets wrong TweetSocialProof response") - } - } - - def apply(request: SocialProofThriftRequest): Future[SocialProofThriftResponse] = { - StatsUtil.trackBlockStats(stats) { - if (decider.tweetSocialProof) { - val socialProofsFuture = tweetSocialProofRunner(request) - - socialProofsFuture map { socialProofs: Seq[RecommendationInfo] => - stats.counter(Stats.Served).incr(socialProofs.size) - SocialProofThriftResponse( - socialProofs.flatMap { tweetSocialProof: RecommendationInfo => - val tweetSocialProofJavaResult = tweetSocialProof.asInstanceOf[SocialProofJavaResult] - Some( - TweetRecommendation( - tweetSocialProofJavaResult.getNode, - tweetSocialProofJavaResult.getWeight, - getThriftSocialProof(tweetSocialProofJavaResult) - ) - ) - } - ) - } - } else { - Future.value(SocialProofThriftResponse()) - } - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.docx new file mode 100644 index 000000000..1c1b433e6 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.scala deleted file mode 100644 index e0b38e067..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/TweetSocialProofRunner.scala +++ /dev/null @@ -1,168 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import java.util.Random -import com.twitter.concurrent.AsyncQueue -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedMultiSegmentBipartiteGraph -import com.twitter.graphjet.algorithms.RecommendationInfo -import com.twitter.graphjet.algorithms.socialproof.{ - SocialProofResult, - TweetSocialProofGenerator, - SocialProofRequest => SocialProofJavaRequest, - SocialProofResponse => SocialProofJavaResponse -} -import com.twitter.logging.Logger -import com.twitter.recos.model.SalsaQueryRunner.SalsaRunnerConfig -import com.twitter.recos.user_tweet_entity_graph.thriftscala.{ - RecommendationType, - RecommendationSocialProofRequest => RecommendationSocialProofThriftRequest, - SocialProofRequest => SocialProofThriftRequest -} -import com.twitter.util.{Future, Try} -import it.unimi.dsi.fastutil.longs.{Long2DoubleMap, Long2DoubleOpenHashMap, LongArraySet} -import scala.collection.JavaConverters._ - -/** - * TweetSocialProofRunner creates a queue of reader threads, TweetSocialProofGenerator, and each one - * reads from the graph and computes social proofs. 
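The queue-of-readers pattern, shared by this runner and TweetRecommendationsRunner, reduces to a few lines. This is a hedged sketch, not the production class: the generic R stands in for TweetSocialProofGenerator, and stats and timeout handling are omitted.

```scala
import com.twitter.concurrent.AsyncQueue
import com.twitter.util.Future

// A fixed pool of graph readers: each request borrows one with poll(),
// runs the computation, and always hands the reader back to the queue.
class ReaderPoolSketch[R](readers: Seq[R]) {
  private val pool = new AsyncQueue[R]
  readers.foreach(pool.offer)

  def withReader[T](f: R => T): Future[T] =
    pool.poll().map { reader =>
      try f(reader)
      finally pool.offer(reader) // mirrors the `ensure { offer }` in the real runners
    }
}
```

The real runners additionally fail fast when the measured poll latency exceeds timeoutSalsaRunner, returning the reader immediately so a slow request cannot hold it hostage.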
- */ -class TweetSocialProofRunner( - bipartiteGraph: NodeMetadataLeftIndexedMultiSegmentBipartiteGraph, - salsaRunnerConfig: SalsaRunnerConfig, - statsReceiver: StatsReceiver) { - private val log: Logger = Logger() - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - private val socialProofSizeStat = stats.stat("socialProofSize") - - private val socialProofFailureCounter = stats.counter("failure") - private val pollCounter = stats.counter("poll") - private val pollTimeoutCounter = stats.counter("pollTimeout") - private val offerCounter = stats.counter("offer") - private val pollLatencyStat = stats.stat("pollLatency") - private val socialProofRunnerPool = initSocialProofRunnerPool() - - private def initSocialProofRunnerPool(): AsyncQueue[TweetSocialProofGenerator] = { - val socialProofQueue = new AsyncQueue[TweetSocialProofGenerator] - (0 until salsaRunnerConfig.numSalsaRunners).foreach { _ => - socialProofQueue.offer(new TweetSocialProofGenerator(bipartiteGraph)) - } - socialProofQueue - } - - /** - * Helper method to interpret the output of SocialProofJavaResponse - * - * @param socialProofResponse is the response from running TweetSocialProof - * @return a sequence of SocialProofResult - */ - private def transformSocialProofResponse( - socialProofResponse: Option[SocialProofJavaResponse] - ): Seq[RecommendationInfo] = { - socialProofResponse match { - case Some(response) => - val scalaResponse = response.getRankedRecommendations.asScala - scalaResponse.foreach { result => - socialProofSizeStat.add(result.asInstanceOf[SocialProofResult].getSocialProofSize) - } - scalaResponse.toSeq - case _ => Nil - } - } - - /** - * Helper method to run social proof computation and convert the results to Option - * - * @param socialProof is socialProof reader on bipartite graph - * @param request is the socialProof request - * @return is an option of SocialProofJavaResponse - */ - private def getSocialProofResponse( - socialProof: TweetSocialProofGenerator, - request: SocialProofJavaRequest, - random: Random - )( - implicit statsReceiver: StatsReceiver - ): Option[SocialProofJavaResponse] = { - val attempt = Try(socialProof.computeRecommendations(request, random)).onFailure { e => - socialProofFailureCounter.incr() - log.error(e, "SocialProof computation failed") - } - attempt.toOption - } - - /** - * Attempt to retrieve a TweetSocialProof thread from the runner pool - * to execute a socialProofRequest - */ - private def handleSocialProofRequest(socialProofRequest: SocialProofJavaRequest) = { - pollCounter.incr() - val t0 = System.currentTimeMillis() - socialProofRunnerPool.poll().map { tweetSocialProof => - val pollTime = System.currentTimeMillis - t0 - pollLatencyStat.add(pollTime) - val socialProofResponse = Try { - if (pollTime < salsaRunnerConfig.timeoutSalsaRunner) { - val response = getSocialProofResponse(tweetSocialProof, socialProofRequest, new Random())( - statsReceiver - ) - transformSocialProofResponse(response) - } else { - // if we did not get a social proof in time, then fail fast here and immediately put it back - log.warning("socialProof polling timeout") - pollTimeoutCounter.incr() - throw new RuntimeException("socialProof poll timeout") - Nil - } - } ensure { - socialProofRunnerPool.offer(tweetSocialProof) - offerCounter.incr() - } - socialProofResponse.toOption getOrElse Nil - } - } - - /** - * This apply() supports requests coming from the old tweet social proof endpoint. 
- * Currently this supports clients such as Email Recommendations, MagicRecs, and HomeTimeline. - * In order to avoid heavy migration work, we are retaining this endpoint. - */ - def apply(request: SocialProofThriftRequest): Future[Seq[RecommendationInfo]] = { - val tweetSet = new LongArraySet(request.inputTweets.toArray) - val leftSeedNodes: Long2DoubleMap = new Long2DoubleOpenHashMap( - request.seedsWithWeights.keys.toArray, - request.seedsWithWeights.values.toArray - ) - - val socialProofRequest = new SocialProofJavaRequest( - tweetSet, - leftSeedNodes, - UserTweetEdgeTypeMask.getUserTweetGraphSocialProofTypes(request.socialProofTypes) - ) - - handleSocialProofRequest(socialProofRequest) - } - - /** - * This apply() supports requests coming from the new social proof endpoint in UTEG that works for - * tweet social proof generation, as well as hashtag and url social proof generation. - * Currently this endpoint supports url social proof generation for Guide. - */ - def apply(request: RecommendationSocialProofThriftRequest): Future[Seq[RecommendationInfo]] = { - val tweetIds = request.recommendationIdsForSocialProof.collect { - case (RecommendationType.Tweet, ids) => ids - }.flatten - val tweetSet = new LongArraySet(tweetIds.toArray) - val leftSeedNodes: Long2DoubleMap = new Long2DoubleOpenHashMap( - request.seedsWithWeights.keys.toArray, - request.seedsWithWeights.values.toArray - ) - - val socialProofRequest = new SocialProofJavaRequest( - tweetSet, - leftSeedNodes, - UserTweetEdgeTypeMask.getUserTweetGraphSocialProofTypes(request.socialProofTypes) - ) - - handleSocialProofRequest(socialProofRequest) - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.docx new file mode 100644 index 000000000..2776d0a96 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.scala deleted file mode 100644 index b8e855ffd..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEdgeTypeMask.scala +++ /dev/null @@ -1,95 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.recos_common.thriftscala.SocialProofType -import com.twitter.recos.util.Action - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. favorite, retweet, reply and click. Under current segment configuration, each segment - * stores up to 128M edges. Assuming that each node on one side is unique, each segment - * stores up to 128M unique nodes on one side, which occupies the lower 27 bits of an integer. - * This leaves five bits to encode the edge types, which at max can store 32 edge types. - * The following implementation utilizes the top four bits and leaves one free bit out. 
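A quick, hypothetical round trip of that bit-mask scheme (validation and the Click special case omitted; Mask here equals the 28-bit MASK constant defined below):

```scala
// Edge type lives in the top four bits; the lower 28 bits hold the node id.
object EdgeMaskRoundTrip {
  val Mask: Int = 0x0FFFFFFF // 28 low bits, same value as UserTweetEdgeTypeMask.MASK

  def encode(node: Int, edgeType: Byte): Int = node | (edgeType << 28)
  def edgeType(encoded: Int): Byte = (encoded >>> 28).toByte
  def restore(encoded: Int): Int = encoded & Mask

  def main(args: Array[String]): Unit = {
    val favorite: Byte = 1 // Favorite's byte value in the enumeration below
    val encoded = encode(123456, favorite)
    assert(edgeType(encoded) == favorite) // recovers the action
    assert(restore(encoded) == 123456)    // recovers the node id
  }
}
```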
- */ -class UserTweetEdgeTypeMask extends EdgeTypeMask { - import UserTweetEdgeTypeMask._ - - override def encode(node: Int, edgeType: Byte): Int = { - if (edgeType < 0 || edgeType > SIZE || edgeType == Click.id.toByte) { - throw new IllegalArgumentException("encode: Illegal edge type argument " + edgeType) - } else { - node | (edgeType << 28) - } - } - - override def edgeType(node: Int): Byte = { - (node >>> 28).toByte - } - - override def restore(node: Int): Int = { - node & MASK - } -} - -object UserTweetEdgeTypeMask extends Enumeration { - - type UserTweetEdgeTypeMask = Value - - /** - * Byte values corresponding to the action taken on a tweet, which will be encoded in the - * top 4 bits in a tweet Id - * NOTE: THERE CAN ONLY BE UP TO 16 TYPES - */ - val Click: UserTweetEdgeTypeMask = Value(0) - val Favorite: UserTweetEdgeTypeMask = Value(1) - val Retweet: UserTweetEdgeTypeMask = Value(2) - val Reply: UserTweetEdgeTypeMask = Value(3) - val Tweet: UserTweetEdgeTypeMask = Value(4) - val IsMentioned: UserTweetEdgeTypeMask = Value(5) - val IsMediatagged: UserTweetEdgeTypeMask = Value(6) - val Quote: UserTweetEdgeTypeMask = Value(7) - val Unfavorite: UserTweetEdgeTypeMask = Value(8) - - /** - * Reserve the top four bits of each integer to encode the edge type information. - */ - val MASK: Int = Integer.parseInt("00001111111111111111111111111111", 2) - val SIZE: Int = this.values.size - - /** - * Map valid social proof types specified by clients to an array of bytes. If clients do not - * specify any social proof types in thrift, it will return all available social types by - * default. - * - * @param socialProofTypes are the valid socialProofTypes specified by clients - * @return an array of bytes representing valid social proof types - */ - def getUserTweetGraphSocialProofTypes( - socialProofTypes: Option[Seq[SocialProofType]] - ): Array[Byte] = { - socialProofTypes - .map { _.map { _.getValue }.toArray } - .getOrElse((0 until SIZE).toArray) - .map { _.toByte } - } - - /** - * Converts the action byte in the RecosHoseMessage into GraphJet internal byte mapping - */ - def actionTypeToEdgeType(actionByte: Byte): Byte = { - val edgeType = Action(actionByte) match { - case Action.Favorite => Favorite.id - case Action.Retweet => Retweet.id - case Action.Reply => Reply.id - case Action.Tweet => Tweet.id - case Action.IsMentioned => IsMentioned.id - case Action.IsMediaTagged => IsMediatagged.id - case Action.Quote => Quote.id - case Action.Unfavorite => Unfavorite.id - case _ => - throw new IllegalArgumentException("getEdgeType: Illegal edge type argument " + actionByte) - } - edgeType.toByte - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.docx new file mode 100644 index 000000000..03913c18c Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.scala deleted file mode 100644 index 1ac23fb3b..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraph.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.thrift.ClientId -import com.twitter.finagle.tracing.{Trace, TraceId} -import com.twitter.recos.user_tweet_entity_graph.thriftscala._ -import 
com.twitter.util.Future - -object UserTweetEntityGraph { - def traceId: TraceId = Trace.id - def clientId: Option[ClientId] = ClientId.current -} - -class UserTweetEntityGraph( - recommendationHandler: RecommendationHandler, - tweetSocialProofHandler: TweetSocialProofHandler, - socialProofHandler: SocialProofHandler) - extends thriftscala.UserTweetEntityGraph.MethodPerEndpoint { - - override def recommendTweets( - request: RecommendTweetEntityRequest - ): Future[RecommendTweetEntityResponse] = recommendationHandler(request) - - /** - * Given a query user, its seed users, and a set of input tweets, return the social proofs of - * input tweets if any. - * - * Currently this supports clients such as Email Recommendations, MagicRecs, and HomeTimeline. - * In order to avoid heavy migration work, we are retaining this endpoint. - */ - override def findTweetSocialProofs( - request: SocialProofRequest - ): Future[SocialProofResponse] = tweetSocialProofHandler(request) - - /** - * Find social proof for the specified RecommendationType given a set of input ids of that type. - * Only find social proofs from the specified seed users with the specified social proof types. - * - * Currently this supports url social proof generation for Guide. - * - * This endpoint is flexible enough to support social proof generation for all recommendation - * types, and should be used for all future clients of this service. - */ - override def findRecommendationSocialProofs( - request: RecommendationSocialProofRequest - ): Future[RecommendationSocialProofResponse] = socialProofHandler(request) -} diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.docx b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.docx new file mode 100644 index 000000000..8aa6f3d6f Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.scala b/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.scala deleted file mode 100644 index eff1b22bd..000000000 --- a/src/scala/com/twitter/recos/user_tweet_entity_graph/UserTweetEntityGraphWriter.scala +++ /dev/null @@ -1,105 +0,0 @@ -package com.twitter.recos.user_tweet_entity_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.algorithms.{RecommendationType, TweetIDMask} -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.segment.NodeMetadataLeftIndexedBipartiteGraphSegment -import com.twitter.recos.hose.common.UnifiedGraphWriter -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.serviceapi.Tweetypie._ - -/** - * The class submits a number of $numBootstrapWriters graph writer threads, BufferedEdgeWriter, - * during service startup. One of them is live writer thread, and the other $(numBootstrapWriters - 1) - * are catchup writer threads. All of them consume kafka events from an internal concurrent queue, - * which is populated by kafka reader threads. At bootstrap time, the kafka reader threads look - * back kafka offset from several hours ago and populate the internal concurrent queue. - * Each graph writer thread writes to an individual graph segment separately. 
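In miniature, the fan-out just described looks like the following hedged sketch, with plain threads and strings standing in for the real Kafka consumers and RecosHoseMessage events:

```scala
import java.util.concurrent.ArrayBlockingQueue

// One shared queue fed by reader threads; several writers drain it,
// each standing in for a writer bound to its own graph segment.
object WriterFanOutSketch {
  def main(args: Array[String]): Unit = {
    val queue = new ArrayBlockingQueue[String](128)
    val writers = (1 to 3).map { id => // e.g. one live + two catchup writers
      new Thread(() => {
        try {
          while (!Thread.currentThread().isInterrupted) {
            val event = queue.take() // blocks until an event arrives
            println(s"writer-$id wrote $event to segment-$id")
          }
        } catch { case _: InterruptedException => () } // shut down cleanly
      })
    }
    writers.foreach(_.start())
    (1 to 5).foreach(i => queue.put(s"event-$i"))
    Thread.sleep(200)
    writers.foreach(_.interrupt())
  }
}
```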
- * The $(numBootstrapWriters - 1) catchup writer threads will stop once all events - * between current system time at startup and the time in memcache are processed. - * The live writer thread will continue to write all incoming kafka events. - * It lives through the entire life cycle of recos graph service. - */ -case class UserTweetEntityGraphWriter( - shardId: String, - env: String, - hosename: String, - bufferSize: Int, - kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage], - clientId: String, - statsReceiver: StatsReceiver) - extends UnifiedGraphWriter[ - NodeMetadataLeftIndexedBipartiteGraphSegment, - NodeMetadataLeftIndexedMultiSegmentBipartiteGraph - ] { - writer => - // The max throughput for each kafka consumer is around 25MB/s - // Use 4 processors for 100MB/s catch-up speed. - val consumerNum: Int = 4 - // Leave 1 Segments to LiveWriter - val catchupWriterNum: Int = RecosConfig.maxNumSegments - 1 - - private final val EMTPY_LEFT_NODE_METADATA = new Array[Array[Int]](1) - - /** - * Adds a RecosHoseMessage to the graph. used by live writer to insert edges to the - * current segment - */ - override def addEdgeToGraph( - graph: NodeMetadataLeftIndexedMultiSegmentBipartiteGraph, - recosHoseMessage: RecosHoseMessage - ): Unit = { - graph.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserTweetEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action), - recosHoseMessage.edgeMetadata.getOrElse(0L), - EMTPY_LEFT_NODE_METADATA, - extractEntities(recosHoseMessage) - ) - } - - /** - * Adds a RecosHoseMessage to the given segment in the graph. Used by catch up writers to - * insert edges to non-current (old) segments - */ - override def addEdgeToSegment( - segment: NodeMetadataLeftIndexedBipartiteGraphSegment, - recosHoseMessage: RecosHoseMessage - ): Unit = { - segment.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserTweetEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action), - recosHoseMessage.edgeMetadata.getOrElse(0L), - EMTPY_LEFT_NODE_METADATA, - extractEntities(recosHoseMessage) - ) - } - - private def getMetaEdge(rightId: Long, cardOption: Option[Byte]): Long = { - cardOption - .map { card => - if (isPhotoCard(card)) TweetIDMask.photo(rightId) - else if (isPlayerCard(card)) TweetIDMask.player(rightId) - else if (isSummaryCard(card)) TweetIDMask.summary(rightId) - else if (isPromotionCard(card)) TweetIDMask.promotion(rightId) - else rightId - } - .getOrElse(rightId) - } - - private def extractEntities(message: RecosHoseMessage): Array[Array[Int]] = { - val entities: Array[Array[Int]] = - new Array[Array[Int]](RecommendationType.METADATASIZE.getValue) - message.entities.foreach { - _.foreach { - case (entityType, ids) => - entities.update(entityType, ids.toArray) - } - } - entities - } - -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/BUILD b/src/scala/com/twitter/recos/user_tweet_graph/BUILD deleted file mode 100644 index 92f06d1c9..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/BUILD +++ /dev/null @@ -1,66 +0,0 @@ -scala_library( - name = "user-tweet-graph", - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/cascading:cascading-local", - "3rdparty/jvm/com/backtype:dfs-datastores", - "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/netflix/curator:curator-framework", - 
"3rdparty/jvm/com/twitter/graphjet", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/kafka:rosette-kafka", - "3rdparty/jvm/org/apache/thrift:libthrift", - "abdecider/src/main/scala", - "decider/src/main/scala", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/server", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-memcached/src/main/scala", - "finagle/finagle-stats/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util", - "scrooge/scrooge-core/src/main/scala", - "servo/repo/src/main/scala", - "servo/request/src/main/scala", - "servo/util/src/main/scala", - "src/resources/com/twitter/recos:decider", - "src/scala/com/twitter/recos/decider", - "src/scala/com/twitter/recos/graph_common", - "src/scala/com/twitter/recos/hose/common", - "src/scala/com/twitter/recos/model:recos-model", - "src/scala/com/twitter/recos/serviceapi", - "src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers", - "src/scala/com/twitter/recos/util:recos-util", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/recos:recos-common-scala", - "src/thrift/com/twitter/recos:recos-internal-scala", - "src/thrift/com/twitter/recos/user_tweet_graph:user_tweet_graph-scala", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/model", - "twitter-server-internal/src/main/scala", - "twitter-server/server/src/main/scala", - "twitter-server/slf4j-jdk14/src/main/scala/com/twitter/server/logging", - "util/util-app/src/main/scala", - "util/util-hashing/src/main/scala", - "util/util-logging/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "bin", - basename = "user-tweet-graph-server", - main = "com.twitter.recos.user_tweet_graph.Main", - runtime_platform = "java11", - tags = ["known-to-fail-jira:SD-20771"], - dependencies = [ - ":user-tweet-graph", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "twitter-server/slf4j-jdk14/src/main/scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_tweet_graph/BUILD.docx b/src/scala/com/twitter/recos/user_tweet_graph/BUILD.docx new file mode 100644 index 000000000..258e4dfd5 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/Main.docx b/src/scala/com/twitter/recos/user_tweet_graph/Main.docx new file mode 100644 index 000000000..27448bc83 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/Main.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/Main.scala b/src/scala/com/twitter/recos/user_tweet_graph/Main.scala deleted file mode 100644 index 2920481f3..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/Main.scala +++ /dev/null @@ -1,291 +0,0 @@ -package com.twitter.recos.user_tweet_graph - -import com.twitter.abdecider.ABDeciderFactory -import com.twitter.abdecider.LoggingABDecider -import com.twitter.app.Flag -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.ThriftMux -import com.twitter.finagle.http.HttpMuxer -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import 
com.twitter.finagle.mtls.client.MtlsStackClient.MtlsThriftMuxClientSyntax -import com.twitter.finagle.mtls.server.MtlsStackServer._ -import com.twitter.finagle.mux.ClientDiscardedRequestException -import com.twitter.finagle.mux.transport.OpportunisticTls -import com.twitter.finagle.service.ReqRep -import com.twitter.finagle.service.ResponseClass -import com.twitter.finagle.thrift.ClientId -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.finatra.kafka.domain.KafkaGroupId -import com.twitter.finatra.kafka.domain.SeekStrategy -import com.twitter.finatra.kafka.serde.ScalaSerdes -import com.twitter.frigate.common.util.ElfOwlFilter -import com.twitter.frigate.common.util.ElfOwlFilter.ByLdapGroup -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph -import com.twitter.logging._ -import com.twitter.recos.decider.EndpointLoadShedder -import com.twitter.recos.decider.UserTweetGraphDecider -import com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.graph_common.MultiSegmentPowerLawBipartiteGraphBuilder -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.user_tweet_graph.RecosConfig._ -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ConsumersBasedRelatedTweetsHandler -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ProducerBasedRelatedTweetsHandler -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.TweetBasedRelatedTweetsHandler -import com.twitter.recos.user_tweet_graph.store.UserRecentFollowersStore -import com.twitter.server.Deciderable -import com.twitter.server.TwitterServer -import com.twitter.server.logging.{Logging => JDK14Logging} -import com.twitter.servo.request._ -import com.twitter.servo.util.ExceptionCounter -import com.twitter.simclusters_v2.common.UserId -import com.twitter.socialgraph.thriftscala.SocialGraphService -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Await -import com.twitter.util.Duration -import com.twitter.util.JavaTimer -import com.twitter.util.Throw -import com.twitter.util.Timer -import java.net.InetSocketAddress -import java.util.concurrent.TimeUnit -import org.apache.kafka.clients.CommonClientConfigs -import org.apache.kafka.common.config.SaslConfigs -import org.apache.kafka.common.config.SslConfigs -import org.apache.kafka.common.security.auth.SecurityProtocol -import org.apache.kafka.common.serialization.StringDeserializer -import scala.reflect.ClassTag - -object Main extends TwitterServer with JDK14Logging with Deciderable { - profile => - - val shardId: Flag[Int] = flag("shardId", 0, "Shard ID") - val servicePort: Flag[InetSocketAddress] = - flag("service.port", new InetSocketAddress(10143), "Thrift service port") - val logDir: Flag[String] = flag("logdir", "recos", "Logging directory") - val numShards: Flag[Int] = flag("numShards", 1, "Number of shards for this service") - val truststoreLocation: Flag[String] = - flag[String]("truststore_location", "", "Truststore file location") - val hoseName: Flag[String] = - flag("hosename", "recos_injector_user_user", "the kafka stream used for incoming edges") - - val dataCenter: Flag[String] = flag("service.cluster", "atla", "Data Center") - val serviceRole: Flag[String] = flag("service.role", "Service Role") - val serviceEnv: Flag[String] = flag("service.env", "Service Env") - val serviceName: Flag[String] = flag("service.name", "Service Name") - - private val maxNumSegments = - flag("maxNumSegments", 
graphBuilderConfig.maxNumSegments, "the number of segments in the graph") - - private val statsReceiverWrapper = FinagleStatsReceiverWrapper(statsReceiver) - - /** - * A ClientRequestAuthorizer to be used in a request-authorization RequestFilter. - */ - lazy val clientAuthorizer: ClientRequestAuthorizer = - ClientRequestAuthorizer.observed( - ClientRequestAuthorizer.permissive, - new ClientRequestObserver(statsReceiver) - ) - - lazy val clientId = ClientId(s"usertweetgraph.${serviceEnv()}") - - private def makeThriftClient[ThriftServiceType: ClassTag]( - dest: String, - label: String, - serviceIdentifier: ServiceIdentifier, - requestTimeout: Duration = 100.milliseconds - ): ThriftServiceType = { - ThriftMux.client - .withClientId(ClientId("usertweetgraph.prod")) - .withOpportunisticTls(OpportunisticTls.Required) - .withMutualTls(serviceIdentifier) - .withRequestTimeout(requestTimeout) - .withStatsReceiver(statsReceiver.scope("clnt")) - .withResponseClassifier { - case ReqRep(_, Throw(_: ClientDiscardedRequestException)) => ResponseClass.Ignorable - }.build[ThriftServiceType](dest, label) - } - - private val shutdownTimeout = flag( - "service.shutdownTimeout", - 5.seconds, - "Maximum amount of time to wait for pending requests to complete on shutdown" - ) - - /** - * ExceptionCounter for tracking failures from RequestHandler(s). - */ - lazy val exceptionCounter = new ExceptionCounter(statsReceiver) - - /** - * Function for translating exceptions returned by a RequestHandler. Useful - * for cases where underlying exception types should be wrapped in those - * defined in the project's Thrift IDL. - */ - lazy val translateExceptions: PartialFunction[Throwable, Throwable] = { - case t => t - } - - // ********* logging ********** - - lazy val loggingLevel: Level = Level.INFO - lazy val recosLogPath: String = logDir() + "/recos.log" - lazy val graphLogPath: String = logDir() + "/graph.log" - lazy val accessLogPath: String = logDir() + "/access.log" - - override def loggerFactories: List[LoggerFactory] = - List( - LoggerFactory( - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = recosLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "graph", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = graphLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "access", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = accessLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "client_event", - level = Some(loggingLevel), - useParents = false, - handlers = QueueingHandler( - maxQueueSize = 10000, - handler = ScribeHandler( - category = "client_event", - formatter = BareFormatter - ) - ) :: Nil - ) - ) - // ******** Decider ************* - - // ********* ABdecider ********** - - val abDeciderYmlPath: String = "/usr/local/config/abdecider/abdecider.yml" - - val scribeLogger: Option[Logger] = Some(Logger.get("client_event")) - - val abDecider: LoggingABDecider = - ABDeciderFactory( - abDeciderYmlPath = abDeciderYmlPath, - scribeLogger = scribeLogger, - environment = Some("production") - 
).buildWithLogging() - - // ********* Recos service ********** - def main(): Unit = { - log.info("building graph with maxNumSegments = " + profile.maxNumSegments()) - - implicit val timer: Timer = new JavaTimer(true) - - val graph = MultiSegmentPowerLawBipartiteGraphBuilder( - graphBuilderConfig.copy(maxNumSegments = profile.maxNumSegments()), - statsReceiverWrapper - ) - - val kafkaConfigBuilder = FinagleKafkaConsumerBuilder[String, RecosHoseMessage]() - .dest("/s/kafka/recommendations:kafka-tls") - .groupId(KafkaGroupId(f"user_tweet_graph-${shardId()}%06d")) - .keyDeserializer(new StringDeserializer) - .valueDeserializer(ScalaSerdes.Thrift[RecosHoseMessage].deserializer) - .seekStrategy(SeekStrategy.REWIND) - .rewindDuration(48.hours) - .withConfig(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, SecurityProtocol.SASL_SSL.toString) - .withConfig(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, truststoreLocation()) - .withConfig(SaslConfigs.SASL_MECHANISM, SaslConfigs.GSSAPI_MECHANISM) - .withConfig(SaslConfigs.SASL_KERBEROS_SERVICE_NAME, "kafka") - .withConfig(SaslConfigs.SASL_KERBEROS_SERVER_NAME, "kafka") - - val graphWriter = - UserTweetGraphWriter( - shardId().toString, - serviceEnv(), - hoseName(), - 128, // keep the original setting. - kafkaConfigBuilder, - clientId.name, - statsReceiver, - ) - graphWriter.initHose(graph) - - // For MutualTLS - val serviceIdentifier = ServiceIdentifier( - role = serviceRole(), - service = serviceName(), - environment = serviceEnv(), - zone = dataCenter() - ) - log.info(s"ServiceIdentifier = ${serviceIdentifier.toString}") - - val socialGraphClient: SocialGraphService.MethodPerEndpoint = - makeThriftClient[SocialGraphService.MethodPerEndpoint]( - "/s/socialgraph/socialgraph", - "socialgraph", - serviceIdentifier) - val userRecentFollowersStore: ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]] = - new UserRecentFollowersStore(socialGraphClient) - - val tweetBasedRelatedTweetsHandler = new TweetBasedRelatedTweetsHandler(graph, statsReceiver) - val consumersBasedRelatedTweetsHandler = - new ConsumersBasedRelatedTweetsHandler(graph, statsReceiver) - val producerBasedRelatedTweetsHandler = - new ProducerBasedRelatedTweetsHandler(graph, userRecentFollowersStore, statsReceiver) - - val decider = UserTweetGraphDecider(serviceEnv(), dataCenter()) - val endpointLoadShedder = new EndpointLoadShedder(decider) - val userTweetGraph = - new UserTweetGraph( - tweetBasedRelatedTweetsHandler, - producerBasedRelatedTweetsHandler, - consumersBasedRelatedTweetsHandler, - endpointLoadShedder)(timer) - - val thriftServer = ThriftMux.server - .withOpportunisticTls(OpportunisticTls.Required) - .withMutualTls(serviceIdentifier) - .serveIface(servicePort(), userTweetGraph) - - log.info("clientid: " + clientId.toString) - log.info("servicePort: " + servicePort().toString) - - log.info("adding shutdown hook") - onExit { - graphWriter.shutdown() - thriftServer.close(shutdownTimeout().fromNow) - } - log.info("added shutdown hook") - - // Wait on the thriftServer so that shutdownTimeout is respected. 
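- // (Descriptive note: Await.result blocks the main thread until the server is closed by the
- // shutdown hook above, so the shutdownTimeout flag is honored.)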
- Await.result(thriftServer) - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/README.docx b/src/scala/com/twitter/recos/user_tweet_graph/README.docx new file mode 100644 index 000000000..cfde653cc Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/README.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/README.md b/src/scala/com/twitter/recos/user_tweet_graph/README.md deleted file mode 100644 index e5e8fe35a..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/README.md +++ /dev/null @@ -1,17 +0,0 @@ -# UserTweetGraph (UTG) - -## What is it -User Tweet Graph (UTG) is a Finagle thrift service built on the GraphJet framework. It maintains a graph of user-tweet engagements and serves user recommendations based on traversals of this graph. - -## How is it used on Twitter -UTG recommends tweets based on collaborative filtering & random walks. UTG takes a set of seed users or seed tweets as input, and performs -1-hop, 2-hop, or even 3+ hop traversals on the engagement graph. -UTG's user-tweet engagement edges are bi-directional, and this enables it to perform flexible multi-hop traversals. The flip side is that -UTG is more memory-demanding than other GraphJet services like UTEG, whose engagement edges are single-directional. - -UTG is a stateful service and relies on a Kafka stream to ingest & persist state. The Kafka stream is processed and generated by Recos-Injector. -It maintains in-memory user engagements from the past 24-48 hours. Older events are dropped and GC'ed. - -For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose high-performance in-memory storage engine. -- https://github.com/twitter/GraphJet -- http://www.vldb.org/pvldb/vol9/p1281-sharma.pdf diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.docx b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.docx new file mode 100644 index 000000000..57546e55a Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.scala b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.scala deleted file mode 100644 index 6c7ab1bf6..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraph.scala +++ /dev/null @@ -1,98 +0,0 @@ -package com.twitter.recos.user_tweet_graph - -import com.twitter.finagle.thrift.ClientId -import com.twitter.finagle.tracing.Trace -import com.twitter.finagle.tracing.TraceId -import com.twitter.recos.decider.EndpointLoadShedder -import com.twitter.recos.recos_common.thriftscala._ -import com.twitter.recos.user_tweet_graph.thriftscala._ -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Timer -import scala.concurrent.duration.MILLISECONDS -import com.twitter.logging.Logger -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.TweetBasedRelatedTweetsHandler -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ProducerBasedRelatedTweetsHandler -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ConsumersBasedRelatedTweetsHandler -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.common.UserId - -object UserTweetGraph { - def traceId: TraceId = Trace.id - def clientId: Option[ClientId] = ClientId.current -} - -class UserTweetGraph( - tweetBasedRelatedTweetsHandler: TweetBasedRelatedTweetsHandler, -
producerBasedRelatedTweetsHandler: ProducerBasedRelatedTweetsHandler, - consumersBasedRelatedTweetsHandler: ConsumersBasedRelatedTweetsHandler, - endpointLoadShedder: EndpointLoadShedder -)( - implicit timer: Timer) - extends thriftscala.UserTweetGraph.MethodPerEndpoint { - - private val defaultTimeout: Duration = Duration(50, MILLISECONDS) - private val EmptyResponse = Future.value(RelatedTweetResponse()) - private val EmptyFeatureResponse = Future.value(UserTweetFeatureResponse()) - - private val log = Logger() - - override def recommendTweets(request: RecommendTweetRequest): Future[RecommendTweetResponse] = - Future.value(RecommendTweetResponse()) - - override def getLeftNodeEdges(request: GetRecentEdgesRequest): Future[GetRecentEdgesResponse] = - Future.value(GetRecentEdgesResponse()) - - override def getRightNode(tweet: Long): Future[NodeInfo] = Future.value(NodeInfo()) - - // deprecated - override def relatedTweets(request: RelatedTweetRequest): Future[RelatedTweetResponse] = - EmptyResponse - - override def tweetBasedRelatedTweets( - request: TweetBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("tweetBasedRelatedTweets") { - tweetBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-tweet-graph_tweetBasedRelatedTweets: " + e) - EmptyResponse - } - - override def producerBasedRelatedTweets( - request: ProducerBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("producerBasedRelatedTweets") { - producerBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-tweet-graph_producerBasedRelatedTweets: " + e) - EmptyResponse - } - - override def consumersBasedRelatedTweets( - request: ConsumersBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("consumersBasedRelatedTweets") { - consumersBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-tweet-graph_consumersBasedRelatedTweets: " + e) - EmptyResponse - } - - // deprecated - override def userTweetFeatures( - userId: UserId, - tweetId: TweetId - ): Future[UserTweetFeatureResponse] = - EmptyFeatureResponse - -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.docx b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.docx new file mode 100644 index 000000000..8dfdabacd Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.scala b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.scala deleted file mode 100644 index 7bd9f08eb..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphConfig.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.recos.user_tweet_graph - -import com.twitter.recos.graph_common.MultiSegmentPowerLawBipartiteGraphBuilder.GraphBuilderConfig - -/** - * The class holds all the config parameters for the recos graph.
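- * A rough sizing sketch (our arithmetic, not from the original source): with maxNumSegments = 8
- * and maxNumEdgesPerSegment = 1 << 28 (~268M), the graph retains on the order of
- * 8 * 268M ~= 2.1B edges before the oldest segment is dropped.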
- */ -object RecosConfig { - val maxNumSegments: Int = 8 - val maxNumEdgesPerSegment: Int = - (1 << 28) // 268M edges per segment, should be able to include 2 days' data - val expectedNumLeftNodes: Int = - (1 << 26) // should correspond to 67M nodes storage - val expectedMaxLeftDegree: Int = 64 - val leftPowerLawExponent: Double = 16.0 // steep power law as most nodes will have a small degree - val expectedNumRightNodes: Int = (1 << 26) // 67M nodes - val expectedMaxRightDegree: Int = scala.math.pow(1024, 2).toInt // some nodes will be very popular - val rightPowerLawExponent: Double = 4.0 // this will be less steep - - val graphBuilderConfig = GraphBuilderConfig( - maxNumSegments = maxNumSegments, - maxNumEdgesPerSegment = maxNumEdgesPerSegment, - expectedNumLeftNodes = expectedNumLeftNodes, - expectedMaxLeftDegree = expectedMaxLeftDegree, - leftPowerLawExponent = leftPowerLawExponent, - expectedNumRightNodes = expectedNumRightNodes, - expectedMaxRightDegree = expectedMaxRightDegree, - rightPowerLawExponent = rightPowerLawExponent - ) - - println("RecosConfig - maxNumSegments " + maxNumSegments) - println("RecosConfig - maxNumEdgesPerSegment " + maxNumEdgesPerSegment) - println("RecosConfig - expectedNumLeftNodes " + expectedNumLeftNodes) - println("RecosConfig - expectedMaxLeftDegree " + expectedMaxLeftDegree) - println("RecosConfig - leftPowerLawExponent " + leftPowerLawExponent) - println("RecosConfig - expectedNumRightNodes " + expectedNumRightNodes) - println("RecosConfig - expectedMaxRightDegree " + expectedMaxRightDegree) - println("RecosConfig - rightPowerLawExponent " + rightPowerLawExponent) -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.docx b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.docx new file mode 100644 index 000000000..405285da6 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.scala b/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.scala deleted file mode 100644 index bd7f238a1..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/UserTweetGraphWriter.scala +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.recos.user_tweet_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.recos.util.Action -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import com.twitter.recos.hose.common.UnifiedGraphWriter -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.serviceapi.Tweetypie._ -import com.twitter.recos.user_tweet_graph.util.UserTweetEdgeTypeMask - -/** - * The class submits a number of $numBootstrapWriters graph writer threads, BufferedEdgeWriter, - * during service startup. One of them is the live writer thread, and the other $(numBootstrapWriters - 1) - * are catchup writer threads. All of them consume kafka events from an internal concurrent queue, - * which is populated by kafka reader threads. At bootstrap time, the kafka reader threads look - * back to a Kafka offset from several hours ago and populate the internal concurrent queue. - * Each graph writer thread writes to an individual graph segment separately.
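- * (By our reading of the fields below, that is catchupWriterNum = maxNumSegments - 1 = 7 catchup
- * writer threads plus one live writer, fed by consumerNum = 4 Kafka consumer threads.)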
- * The $(numBootstrapWriters - 1) catchup writer threads will stop once all events - * between the time recorded in memcache and the current system time at startup are processed. - * The live writer thread will continue to write all incoming kafka events. - * It lives through the entire life cycle of the recos graph service. - */ -case class UserTweetGraphWriter( - shardId: String, - env: String, - hosename: String, - bufferSize: Int, - kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage], - clientId: String, - statsReceiver: StatsReceiver) - extends UnifiedGraphWriter[BipartiteGraphSegment, MultiSegmentPowerLawBipartiteGraph] { - writer => - // The max throughput for each kafka consumer is around 25MB/s - // Use 4 processors for 100MB/s catch-up speed. - val consumerNum: Int = 4 - // Leave 1 segment to the live writer - val catchupWriterNum: Int = RecosConfig.maxNumSegments - 1 - - /** - * Adds a RecosHoseMessage to the graph. Used by the live writer to insert edges into the - * current segment - */ - override def addEdgeToGraph( - graph: MultiSegmentPowerLawBipartiteGraph, - recosHoseMessage: RecosHoseMessage - ): Unit = { - if (Action(recosHoseMessage.action) == Action.Favorite || Action( - recosHoseMessage.action) == Action.Retweet) - graph.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserTweetEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action), - ) - } - - /** - * Adds a RecosHoseMessage to the given segment in the graph. Used by catchup writers to - * insert edges into non-current (old) segments - */ - override def addEdgeToSegment( - segment: BipartiteGraphSegment, - recosHoseMessage: RecosHoseMessage - ): Unit = { - if (Action(recosHoseMessage.action) == Action.Favorite || Action( - recosHoseMessage.action) == Action.Retweet) - segment.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserTweetEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action) - ) - } - - private def getMetaEdge(rightId: Long, cardOption: Option[Byte]): Long = { - cardOption - .map { card => - if (isPhotoCard(card)) TweetIDMask.photo(rightId) - else if (isPlayerCard(card)) TweetIDMask.player(rightId) - else if (isSummaryCard(card)) TweetIDMask.summary(rightId) - else if (isPromotionCard(card)) TweetIDMask.promotion(rightId) - else rightId - } - .getOrElse(rightId) - } - -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD deleted file mode 100644 index 898e5f6ab..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "servo/request/src/main/scala", - "src/scala/com/twitter/recos/user_tweet_graph/store", - "src/scala/com/twitter/recos/user_tweet_graph/util", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos/user_tweet_graph:user_tweet_graph-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD.docx b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD.docx new file mode 100644 index 000000000..3d346103b Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/BUILD.docx differ diff --git
a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..d92f7294b Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala deleted file mode 100644 index 9f807029b..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,68 +0,0 @@ -package com.twitter.recos.user_tweet_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_tweet_graph.thriftscala._ -import com.twitter.recos.user_tweet_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_tweet_graph.util.FilterUtil -import com.twitter.recos.user_tweet_graph.util.GetRelatedTweetCandidatesUtil -import com.twitter.recos.util.Action -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import com.twitter.util.Future -import scala.concurrent.duration.HOURS - -/** - * Implementation of the Thrift-defined service interface for consumersBasedRelatedTweets. - * Given a list of consumer userIds, find the tweets they co-engaged with (the input userIds are treated as consumers, hence "consumersBasedRelatedTweets"). - * Example use case: given a list of a user's contacts from their address book, find tweets those contacts engaged with. - */ -class ConsumersBasedRelatedTweetsHandler( - bipartiteGraph: BipartiteGraph, - statsReceiver: StatsReceiver) - extends RequestHandler[ConsumersBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: ConsumersBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - - val maxResults = request.maxResults.getOrElse(200) - val minScore = request.minScore.getOrElse(0.0) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minCooccurrence = request.minCooccurrence.getOrElse(3) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val consumerSeedSet = request.consumerSeedSet.distinct.filter { userId => - val userDegree = bipartiteGraph.getLeftNodeDegree(userId) - // constrain to users that have <100 engagements to avoid spammy behavior - userDegree < 100 - } - - val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - consumerSeedSet, - bipartiteGraph, - Set(Action.Favorite, Action.Retweet) - ) - - val scorePreFactor = 1000.0 / consumerSeedSet.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rhsTweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter(relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - .contains(relatedTweet.tweetId))).take(maxResults) - -
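- // Descriptive note: at this point relatedTweets holds the top-scored co-engaged candidates,
- // already filtered by tweet age, minimum score, and the caller-supplied exclusion set.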
stats.stat("response_size").add(relatedTweets.size) - Future.value(RelatedTweetResponse(tweets = relatedTweets)) - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..c3a07e1f1 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala deleted file mode 100644 index dd73342ec..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.recos.user_tweet_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_tweet_graph.thriftscala._ -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import com.twitter.util.Future -import scala.concurrent.duration.HOURS -import com.twitter.simclusters_v2.common.UserId -import com.twitter.storehaus.ReadableStore -import com.twitter.recos.user_tweet_graph.store.UserRecentFollowersStore -import com.twitter.recos.user_tweet_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_tweet_graph.util.FilterUtil -import com.twitter.recos.user_tweet_graph.util.GetRelatedTweetCandidatesUtil -import com.twitter.recos.util.Action - -/** - * Implementation of the Thrift-defined service interface for producerBasedRelatedTweets. 
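- * Given a producer userId, the handler fetches that producer's recent followers from the
- * follower store, collects the tweets those followers faved or retweeted, and scores the
- * candidates by co-occurrence (a summary of the implementation below, added for orientation).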
- * - */ -class ProducerBasedRelatedTweetsHandler( - bipartiteGraph: BipartiteGraph, - userRecentFollowersStore: ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]], - statsReceiver: StatsReceiver) - extends RequestHandler[ProducerBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: ProducerBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - val maxResults = request.maxResults.getOrElse(200) - val maxNumFollowers = request.maxNumFollowers.getOrElse(500) - val minScore = request.minScore.getOrElse(0.0) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minCooccurrence = request.minCooccurrence.getOrElse(4) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val followersFut = fetchFollowers(request.producerId, Some(maxNumFollowers)) - followersFut.map { followers => - val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - followers, - bipartiteGraph, - Set(Action.Favorite, Action.Retweet) - ) - - val scorePreFactor = 1000.0 / followers.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rhsTweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter { relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - .contains(relatedTweet.tweetId)) - }.take(maxResults) - stats.stat("response_size").add(relatedTweets.size) - RelatedTweetResponse(tweets = relatedTweets) - } - } - } - - private def fetchFollowers( - producerId: Long, - maxNumFollower: Option[Int], - ): Future[Seq[Long]] = { - val query = - UserRecentFollowersStore.Query(producerId, maxNumFollower, None) - - val followersFut = userRecentFollowersStore.get(query) - followersFut.map { followersOpt => - val followers = followersOpt.getOrElse(Seq.empty) - val followerIds = followers.distinct.filter { userId => - val userDegree = bipartiteGraph.getLeftNodeDegree(userId) - // constrain to more active users that have >1 engagement to optimize latency, and <100 engagements to avoid spammy behavior - userDegree > 1 && userDegree < 100 - } - stats.stat("follower_size_after_filter").add(followerIds.size) - followerIds - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..183b0a139 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala deleted file mode 100644 index 6643bd408..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.recos.user_tweet_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForQuery 
-import com.twitter.recos.user_tweet_graph.thriftscala._ -import com.twitter.recos.user_tweet_graph.util.FilterUtil -import com.twitter.recos.user_tweet_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_tweet_graph.util.GetAllInternalTweetIdsUtil -import com.twitter.recos.user_tweet_graph.util.GetRelatedTweetCandidatesUtil -import com.twitter.recos.user_tweet_graph.util.SampleLHSUsersUtil -import com.twitter.recos.util.Action -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import com.twitter.util.Future -import scala.concurrent.duration.HOURS - -/** - * Implementation of the Thrift-defined service interface for tweetBasedRelatedTweets. - * - */ -class TweetBasedRelatedTweetsHandler(bipartiteGraph: BipartiteGraph, statsReceiver: StatsReceiver) - extends RequestHandler[TweetBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: TweetBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - val internalQueryTweetIds = - GetAllInternalTweetIdsUtil.getAllInternalTweetIds(request.tweetId, bipartiteGraph) - - val response = internalQueryTweetIds match { - case head +: Nil => getRelatedTweets(request, head) - case _ => RelatedTweetResponse() - } - Future.value(response) - } - } - - private def getRelatedTweets( - request: TweetBasedRelatedTweetRequest, - maskedTweetId: Long - ): RelatedTweetResponse = { - - val maxNumSamplesPerNeighbor = request.maxNumSamplesPerNeighbor.getOrElse(100) - val maxResults = request.maxResults.getOrElse(200) - val minScore = request.minScore.getOrElse(0.5) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minQueryDegree = request.minQueryDegree.getOrElse(10) - val minCooccurrence = request.minCooccurrence.getOrElse(3) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val queryTweetDegree = bipartiteGraph.getRightNodeDegree(maskedTweetId) - stats.stat("queryTweetDegree").add(queryTweetDegree) - - if (queryTweetDegree < minQueryDegree) { - stats.counter("queryTweetDegreeLessThanMinQueryDegree").incr() - RelatedTweetResponse() - } else { - - val sampledLHSuserIds = - SampleLHSUsersUtil.sampleLHSUsers(maskedTweetId, maxNumSamplesPerNeighbor, bipartiteGraph) - - val rHStweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - sampledLHSuserIds, - bipartiteGraph, - Set(Action.Favorite, Action.Retweet) - ) - - val scorePreFactor = - queryTweetDegree / math.log(queryTweetDegree) / sampledLHSuserIds.distinct.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rHStweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter(relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - .contains(relatedTweet.tweetId))).take(maxResults) - - stats.stat("response_size").add(relatedTweets.size) - RelatedTweetResponse( - tweets = relatedTweets, - queryTweetGraphFeatures = Some(GraphFeaturesForQuery(degree = Some(queryTweetDegree)))) - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD b/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD deleted file mode 100644 index b1c3562b7..000000000 --- 
a/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:core", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/socialgraph:thrift-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD.docx b/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD.docx new file mode 100644 index 000000000..1f761fb43 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/store/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.docx b/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.docx new file mode 100644 index 000000000..fe3d70ecf Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.scala b/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.scala deleted file mode 100644 index 4910e9d71..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/store/UserRecentFollowersStore.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.recos.user_tweet_graph.store - -import com.twitter.simclusters_v2.common.UserId -import com.twitter.socialgraph.thriftscala.EdgesRequest -import com.twitter.socialgraph.thriftscala.EdgesResult -import com.twitter.socialgraph.thriftscala.PageRequest -import com.twitter.socialgraph.thriftscala.RelationshipType -import com.twitter.socialgraph.thriftscala.SrcRelationship -import com.twitter.socialgraph.thriftscala.SocialGraphService -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Time - -class UserRecentFollowersStore( - sgsClient: SocialGraphService.MethodPerEndpoint) - extends ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]] { - - override def get(key: UserRecentFollowersStore.Query): Future[Option[Seq[UserId]]] = { - val edgeRequest = EdgesRequest( - relationship = SrcRelationship(key.userId, RelationshipType.FollowedBy), - // Could have a better guess at count when key.maxAge != None - pageRequest = Some(PageRequest(count = key.maxResults)) - ) - - val lookbackThresholdMillis = key.maxAge - .map(maxAge => (Time.now - maxAge).inMilliseconds) - .getOrElse(0L) - - sgsClient - .edges(Seq(edgeRequest)) - .map(_.flatMap { - case EdgesResult(edges, _, _) => - edges.collect { - case e if e.createdAt >= lookbackThresholdMillis => - e.target - } - }) - .map(Some(_)) - } -} - -object UserRecentFollowersStore { - case class Query( - userId: UserId, - // maxResults - if Some(count), we return only the `count` most recent follows - maxResults: Option[Int] = None, - // maxAge - if Some(duration), return only follows since `Time.now - duration` - maxAge: Option[Duration] = None) -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD b/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD deleted file mode 100644 index 789b5e3ad..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "snowflake:id", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "src/scala/com/twitter/recos/util:recos-util", -
"src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/recos/user_tweet_graph:user_tweet_graph-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD.docx new file mode 100644 index 000000000..f21e08583 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.docx new file mode 100644 index 000000000..e9d357429 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.scala deleted file mode 100644 index dc4ec3020..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/FetchRHSTweetsUtil.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.graphjet.bipartite.MultiSegmentIterator -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import scala.collection.mutable.ListBuffer -import com.twitter.recos.util.Action - -object FetchRHSTweetsUtil { - // get RHS tweets given LHS users - def fetchRHSTweets( - userIds: Seq[Long], - bipartiteGraph: BipartiteGraph, - allowedActions: Set[Action.Value] - ): Seq[Long] = { - val allowedActionStrings = allowedActions.map(_.toString) - userIds.distinct - .flatMap { userId => - val tweetIdsIterator = bipartiteGraph - .getLeftNodeEdges(userId).asInstanceOf[MultiSegmentIterator[BipartiteGraphSegment]] - - val tweetIds = new ListBuffer[Long]() - if (tweetIdsIterator != null) { - while (tweetIdsIterator.hasNext) { - val rightNode = tweetIdsIterator.nextLong() - val edgeType = tweetIdsIterator.currentEdgeType() - if (allowedActionStrings.contains(UserTweetEdgeTypeMask(edgeType).toString)) - tweetIds += rightNode - } - } - tweetIds.distinct - } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.docx new file mode 100644 index 000000000..8a2fcf446 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.scala deleted file mode 100644 index fb5928904..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/FilterUtil.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.util.Duration -import com.twitter.util.Time - -object FilterUtil { - def tweetAgeFilter(tweetId: TweetId, maxAge: Duration): Boolean = { - SnowflakeId - .timeFromIdOpt(tweetId) - .map { tweetTime => tweetTime > Time.now - maxAge }.getOrElse(false) - // If there's no snowflake timestamp, we have no idea when this tweet happened. 
- } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.docx new file mode 100644 index 000000000..4f926ebd1 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.scala deleted file mode 100644 index 0a5e6ee65..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/GetAllInternalTweetIdsUtil.scala +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.graphjet.bipartite.api.BipartiteGraph - -object GetAllInternalTweetIdsUtil { - - def getAllInternalTweetIds(tweetId: Long, bipartiteGraph: BipartiteGraph): Seq[Long] = { - val internalTweetIds = getAllMasks(tweetId) - sortByDegrees(internalTweetIds, bipartiteGraph) - } - - private def getAllMasks(tweetId: Long): Seq[Long] = { - Seq( - tweetId, - TweetIDMask.summary(tweetId), - TweetIDMask.photo(tweetId), - TweetIDMask.player(tweetId), - TweetIDMask.promotion(tweetId) - ) - } - - private def sortByDegrees( - encodedTweetIds: Seq[Long], - bipartiteGraph: BipartiteGraph - ): Seq[Long] = { - encodedTweetIds - .map { encodedTweetId => (encodedTweetId, bipartiteGraph.getRightNodeDegree(encodedTweetId)) } - .filter { case (_, degree) => degree > 0 } // keep only tweetIds with positive degree - .sortBy { case (_, degree) => -degree } // sort by degree in descending order - .map { case (encodedTweetId, _) => encodedTweetId } - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.docx new file mode 100644 index 000000000..2d8839b60 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.scala deleted file mode 100644 index b093e4c9e..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/GetRelatedTweetCandidatesUtil.scala +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_tweet_graph.thriftscala._ -import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForTweet -import com.twitter.graphjet.algorithms.TweetIDMask - -object GetRelatedTweetCandidatesUtil { - private val tweetIDMask = new TweetIDMask - - /** - * Calculate scores for each RHS tweet that we get back. - * For tweetBasedRelatedTweet, scorePreFactor = queryTweetDegree / log(queryTweetDegree) / LHSuserSize - * and the final score will be a log-cosine score. - * For non-tweetBasedRelatedTweet, we don't have a query tweet, so to keep the scoring function consistent, - * scorePreFactor = 1000.0 / LHSuserSize (queryTweetDegree's average is ~10k, 1000 ~= 10k/log(10k)). - * Though scorePreFactor is applied to all results within a request, it is still useful for making scores comparable across requests, - * so we can have a unified min_score and help with downstream score normalization. - */ - def getRelatedTweetCandidates( -
relatedTweetCandidates: Seq[Long], - minCooccurrence: Int, - minResultDegree: Int, - scorePreFactor: Double, - bipartiteGraph: BipartiteGraph, - ): Seq[RelatedTweet] = { - relatedTweetCandidates - .groupBy(tweetId => tweetId) - .filterKeys(tweetId => bipartiteGraph.getRightNodeDegree(tweetId) > minResultDegree) - .mapValues(_.size) - .filter { case (_, cooccurrence) => cooccurrence >= minCooccurrence } - .toSeq - .map { - case (relatedTweetId, cooccurrence) => - val relatedTweetDegree = bipartiteGraph.getRightNodeDegree(relatedTweetId) - val score = scorePreFactor * cooccurrence / math.log(relatedTweetDegree) - - toRelatedTweet(relatedTweetId, score, relatedTweetDegree, cooccurrence) - } - .sortBy(-_.score) - } - - def toRelatedTweet( - relatedTweetId: Long, - score: Double, - relatedTweetDegree: Int, - cooccurrence: Int - ): RelatedTweet = { - RelatedTweet( - tweetId = tweetIDMask.restore(relatedTweetId), - score = score, - relatedTweetGraphFeatures = Some( - GraphFeaturesForTweet(cooccurrence = Some(cooccurrence), degree = Some(relatedTweetDegree))) - ) - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.docx new file mode 100644 index 000000000..786f3b443 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.scala deleted file mode 100644 index f265eb9e0..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/SampleLHSUsersUtil.scala +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.graphjet.bipartite.MultiSegmentIterator -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import java.util.Random -import scala.collection.mutable.ListBuffer - -object SampleLHSUsersUtil { - // sample userId nodes - def sampleLHSUsers( - maskedTweetId: Long, - maxNumSamplesPerNeighbor: Int, - bipartiteGraph: BipartiteGraph - ): Seq[Long] = { - val sampledUserIdsIterator = bipartiteGraph - .getRandomRightNodeEdges( - maskedTweetId, - maxNumSamplesPerNeighbor, - new Random(System.currentTimeMillis)).asInstanceOf[MultiSegmentIterator[ - BipartiteGraphSegment - ]] - - val userIds = new ListBuffer[Long]() - if (sampledUserIdsIterator != null) { - while (sampledUserIdsIterator.hasNext) { - val leftNode = sampledUserIdsIterator.nextLong() - // If a user likes too many things, we risk including spammy behavior. 
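- // (Descriptive note: the <100 degree cap below mirrors the spam filter used by the
- // related-tweets handlers above.)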
- if (bipartiteGraph.getLeftNodeDegree(leftNode) < 100) - userIds += leftNode - } - } - userIds - } -} diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.docx b/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.docx new file mode 100644 index 000000000..d2299cff5 Binary files /dev/null and b/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.scala b/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.scala deleted file mode 100644 index 9a55c8e45..000000000 --- a/src/scala/com/twitter/recos/user_tweet_graph/util/UserTweetEdgeTypeMask.scala +++ /dev/null @@ -1,77 +0,0 @@ -package com.twitter.recos.user_tweet_graph.util - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.util.Action - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. favorite, retweet, reply and click. Under the current segment configuration, each segment - * stores up to 128M edges. Assuming that each node on one side is unique, each segment - * stores up to 128M unique nodes on one side, which occupies the lower 27 bits of an integer. - * This leaves five bits to encode the edge types, which at max can store 32 edge types. - * The following implementation uses the top four bits and leaves one bit unused. - */ -class UserTweetEdgeTypeMask extends EdgeTypeMask { - import UserTweetEdgeTypeMask._ - - override def encode(node: Int, edgeType: Byte): Int = { - if (edgeType < 0 || edgeType > SIZE || edgeType == Click.id.toByte) { - throw new IllegalArgumentException("encode: Illegal edge type argument " + edgeType) - } else { - node | (edgeType << 28) - } - } - - override def edgeType(node: Int): Byte = { - (node >>> 28).toByte - } - - override def restore(node: Int): Int = { - node & MASK - } -} - -object UserTweetEdgeTypeMask extends Enumeration { - - type UserTweetEdgeTypeMask = Value - - /** - * Byte values corresponding to the action taken on a tweet, which will be encoded in the - * top 4 bits in a tweet Id - * NOTE: THERE CAN ONLY BE UP TO 16 TYPES - */ - val Click: UserTweetEdgeTypeMask = Value(0) - val Favorite: UserTweetEdgeTypeMask = Value(1) - val Retweet: UserTweetEdgeTypeMask = Value(2) - val Reply: UserTweetEdgeTypeMask = Value(3) - val Tweet: UserTweetEdgeTypeMask = Value(4) - val IsMentioned: UserTweetEdgeTypeMask = Value(5) - val IsMediatagged: UserTweetEdgeTypeMask = Value(6) - val Quote: UserTweetEdgeTypeMask = Value(7) - val Unfavorite: UserTweetEdgeTypeMask = Value(8) - - /** - * Reserve the top four bits of each integer to encode the edge type information.
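- * Illustrative example (ours, not from the original): encode(node, Favorite.id.toByte)
- * computes node | (1 << 28), and restore(node) recovers the raw node id via node & MASK.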
- */ - val MASK: Int = Integer.parseInt("00001111111111111111111111111111", 2) - val SIZE: Int = this.values.size - - /** - * Converts the action byte in the RecosHoseMessage into the GraphJet internal byte mapping - */ - def actionTypeToEdgeType(actionByte: Byte): Byte = { - val edgeType = Action(actionByte) match { - case Action.Favorite => Favorite.id - case Action.Retweet => Retweet.id - case Action.Reply => Reply.id - case Action.Tweet => Tweet.id - case Action.IsMentioned => IsMentioned.id - case Action.IsMediaTagged => IsMediatagged.id - case Action.Quote => Quote.id - case Action.Unfavorite => Unfavorite.id - case _ => - throw new IllegalArgumentException("actionTypeToEdgeType: Illegal edge type argument " + actionByte) - } - edgeType.toByte - } -} diff --git a/src/scala/com/twitter/recos/user_user_graph/BUILD b/src/scala/com/twitter/recos/user_user_graph/BUILD deleted file mode 100644 index 12dcbd292..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/BUILD +++ /dev/null @@ -1,45 +0,0 @@ -scala_library( - name = "user_user_graph", - sources = ["*.scala"], - strict_deps = False, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/twitter/graphjet", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/org/apache/kafka:rosette-kafka", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/server", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-memcached/src/main/scala", - "finagle/finagle-stats/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "servo/request/src/main/scala", - "servo/util/src/main/scala", - "src/resources/com/twitter/recos:decider", - "src/scala/com/twitter/recos/decider", - "src/scala/com/twitter/recos/graph_common", - "src/scala/com/twitter/recos/hose/common", - "src/scala/com/twitter/recos/model:recos-model", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos/user_user_graph:user_user_graph-scala", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "twitter-server/slf4j-jdk14/src/main/scala/com/twitter/server/logging", - "util/util-logging/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "bin", - basename = "user_user_graph-server", - main = "com.twitter.recos.user_user_graph.Main", - runtime_platform = "java11", - tags = ["bazel-compatible"], - dependencies = [ - ":user_user_graph", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "twitter-server/slf4j-jdk14/src/main/scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_user_graph/BUILD.docx b/src/scala/com/twitter/recos/user_user_graph/BUILD.docx new file mode 100644 index 000000000..7ed66e731 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.docx b/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.docx new file mode 100644 index 000000000..7f8c76d59 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.scala b/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.scala deleted file mode 100644 index 4ee08df68..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/KafkaConfig.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.recos.user_user_graph - -/** - * The class
holds all the config parameters for the Kafka queue. - */ -object KafkaConfig { - // The size of the RecosHoseMessage array that is written to the concurrent linked queue. - // A buffer size of 64 keeps a message's time in the queue around 64 / (2K edgesPerSec / 150 kafka threads) ~= 5 seconds, which is lower - // than the young-gen GC cycle of about 20 seconds, so that all the incoming messages will be GC'ed in young gen instead of old gen. - val bufferSize = 64 - - println("KafkaConfig - bufferSize " + bufferSize) -} diff --git a/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.docx b/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.docx new file mode 100644 index 000000000..76bc85d6a Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.scala b/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.scala deleted file mode 100644 index f8353a975..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/LoggingUserUserGraph.scala +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.logging.Logger -import com.twitter.recos.user_user_graph.thriftscala._ -import com.twitter.util.Future - -trait LoggingUserUserGraph extends thriftscala.UserUserGraph.MethodPerEndpoint { - private[this] val accessLog = Logger("access") - - abstract override def recommendUsers( - request: RecommendUserRequest - ): Future[RecommendUserResponse] = { - val time = System.currentTimeMillis - super.recommendUsers(request) onSuccess { resp => - val timeTaken = System.currentTimeMillis - time - val logText = - s"In ${timeTaken}ms, recommendUsers(${requestToString(request)}), response ${responseToString(resp)}" - accessLog.info(logText) - } onFailure { exc => - val timeTaken = System.currentTimeMillis - time - val logText = s"In ${timeTaken}ms, recommendUsers(${requestToString(request)}) returned error" - accessLog.error(exc, logText) - } - } - - private def requestToString(request: RecommendUserRequest): String = { - Seq( - request.requesterId, - request.displayLocation, - request.seedsWithWeights.size, - request.seedsWithWeights.take(5), - request.excludedUserIds.map(_.size).getOrElse(0), - request.excludedUserIds.map(_.take(5)), - request.maxNumResults, - request.maxNumSocialProofs, - request.minUserPerSocialProof, - request.socialProofTypes, - request.maxEdgeEngagementAgeInMillis - ).mkString(",") - } - - private def responseToString(response: RecommendUserResponse): String = { - response.recommendedUsers.toList.map { recUser => - val socialProof = recUser.socialProofs.map { - case (proofType, proofs) => - (proofType, proofs) - } - (recUser.userId, recUser.score, socialProof) - }.toString - } -} diff --git a/src/scala/com/twitter/recos/user_user_graph/Main.docx b/src/scala/com/twitter/recos/user_user_graph/Main.docx new file mode 100644 index 000000000..b81133936 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/Main.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/Main.scala b/src/scala/com/twitter/recos/user_user_graph/Main.scala deleted file mode 100644 index 55f889c02..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/Main.scala +++ /dev/null @@ -1,255 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.abdecider.ABDeciderFactory -import com.twitter.abdecider.LoggingABDecider -import com.twitter.app.Flag -import com.twitter.conversions.DurationOps._ -import
com.twitter.finagle.ThriftMux -import com.twitter.finagle.http.HttpMuxer -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.mtls.server.MtlsStackServer._ -import com.twitter.finagle.mux.transport.OpportunisticTls -import com.twitter.finagle.thrift.ClientId -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.finatra.kafka.domain.KafkaGroupId -import com.twitter.finatra.kafka.domain.SeekStrategy -import com.twitter.finatra.kafka.serde.ScalaSerdes -import com.twitter.frigate.common.util.ElfOwlFilter -import com.twitter.frigate.common.util.ElfOwlFilter.ByLdapGroup -import com.twitter.logging._ -import com.twitter.recos.decider.UserUserGraphDecider -import com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.graph_common.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.model.Constants -import com.twitter.recos.user_user_graph.KafkaConfig._ -import com.twitter.recos.user_user_graph.RecosConfig._ -import com.twitter.server.Deciderable -import com.twitter.server.TwitterServer -import com.twitter.server.logging.{Logging => JDK14Logging} -import com.twitter.servo.request._ -import com.twitter.servo.util.ExceptionCounter -import com.twitter.thriftwebforms._ -import com.twitter.util.Await -import com.twitter.util.Duration -import java.net.InetSocketAddress -import java.util.concurrent.TimeUnit -import org.apache.kafka.clients.CommonClientConfigs -import org.apache.kafka.common.config.SaslConfigs -import org.apache.kafka.common.config.SslConfigs -import org.apache.kafka.common.security.auth.SecurityProtocol -import org.apache.kafka.common.serialization.StringDeserializer - -object Main extends TwitterServer with JDK14Logging with Deciderable { - profile => - - val shardId: Flag[Int] = flag("shardId", 0, "Shard ID") - val servicePort: Flag[InetSocketAddress] = - flag("service.port", new InetSocketAddress(10143), "Thrift service port") - val logDir: Flag[String] = flag("logdir", "recos", "Logging directory") - val hoseName: Flag[String] = - flag("hosename", "recos_injector_user_user", "the kafka stream used for incoming edges") - val maxNumSegments: Flag[Int] = - flag("maxNumSegments", graphBuilderConfig.maxNumSegments, "the number of segments in the graph") - val numShards: Flag[Int] = flag("numShards", 1, "Number of shards for this service") - val truststoreLocation: Flag[String] = - flag[String]("truststore_location", "", "Truststore file location") - - val dataCenter: Flag[String] = flag("service.cluster", "atla", "Data Center") - val serviceRole: Flag[String] = flag("service.role", "Service Role") - val serviceEnv: Flag[String] = flag("service.env", "Service Env") - val serviceName: Flag[String] = flag("service.name", "Service Name") - - val statsReceiverWrapper: FinagleStatsReceiverWrapper = FinagleStatsReceiverWrapper( - statsReceiver - ) - - /** - * A ClientRequestAuthorizer to be used in a request-authorization RequestFilter. 
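- * (Our reading of the servo API, not stated in this file: the permissive authorizer accepts
- * every caller, and the observed wrapper only records per-client stats.)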
- */ - lazy val clientAuthorizer: ClientRequestAuthorizer = - ClientRequestAuthorizer.observed( - ClientRequestAuthorizer.permissive, - new ClientRequestObserver(statsReceiver) - ) - - lazy val clientId = ClientId("userusergraph.%s".format(serviceEnv().replace("devel", "dev"))) - - val shutdownTimeout: Flag[Duration] = flag( - "service.shutdownTimeout", - 5.seconds, - "Maximum amount of time to wait for pending requests to complete on shutdown" - ) - - /** - * ExceptionCounter for tracking failures from RequestHandler(s). - */ - lazy val exceptionCounter = new ExceptionCounter(statsReceiver) - - /** - * Function for translating exceptions returned by a RequestHandler. Useful - * for cases where underlying exception types should be wrapped in those - * defined in the project's Thrift IDL. - */ - lazy val translateExceptions: PartialFunction[Throwable, Throwable] = { - case t => t - } - - // ********* logging ********** - - lazy val loggingLevel: Level = Level.INFO - lazy val recosLogPath: String = logDir() + "/recos.log" - lazy val graphLogPath: String = logDir() + "/graph.log" - lazy val accessLogPath: String = logDir() + "/access.log" - - override def loggerFactories: List[LoggerFactory] = - List( - LoggerFactory( - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = recosLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "graph", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = graphLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "access", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = accessLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "client_event", - level = Some(loggingLevel), - useParents = false, - handlers = QueueingHandler( - maxQueueSize = 10000, - handler = ScribeHandler( - category = "client_event", - formatter = BareFormatter - ) - ) :: Nil - ) - ) - // ******** Decider ************* - - val recosDecider: UserUserGraphDecider = UserUserGraphDecider() - - // ********* ABdecider ********** - - val abDeciderYmlPath: String = "/usr/local/config/abdecider/abdecider.yml" - - val scribeLogger: Option[Logger] = Some(Logger.get("client_event")) - - val abDecider: LoggingABDecider = - ABDeciderFactory( - abDeciderYmlPath = abDeciderYmlPath, - scribeLogger = scribeLogger, - environment = Some("production") - ).buildWithLogging() - - val ldapGroups = Seq("eng", "cassowary-group", "timeline-team") - - // ********* Recos service ********** - - def main(): Unit = { - log.info("building graph with maxNumSegments = " + profile.maxNumSegments()) - log.info("Reading from: " + hoseName()) - - val graph = NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder( - graphBuilderConfig.copy(maxNumSegments = profile.maxNumSegments()), - statsReceiverWrapper - ) - - val kafkaConfigBuilder = FinagleKafkaConsumerBuilder[String, RecosHoseMessage]() - .dest("/s/kafka/recommendations:kafka-tls") - .groupId(KafkaGroupId(f"user_user_graph-${shardId()}%06d")) - .keyDeserializer(new StringDeserializer) - .valueDeserializer(ScalaSerdes.Thrift[RecosHoseMessage].deserializer) - 
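- // Descriptive note: the REWIND seek strategy with the 24-hour rewindDuration below replays a
- // day of engagement events at startup to rebuild the in-memory graph state.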
.seekStrategy(SeekStrategy.REWIND)
- .rewindDuration(24.hours)
- .withConfig(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, SecurityProtocol.SASL_SSL.toString)
- .withConfig(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, truststoreLocation())
- .withConfig(SaslConfigs.SASL_MECHANISM, SaslConfigs.GSSAPI_MECHANISM)
- .withConfig(SaslConfigs.SASL_KERBEROS_SERVICE_NAME, "kafka")
- .withConfig(SaslConfigs.SASL_KERBEROS_SERVER_NAME, "kafka")
-
- val graphWriter = UserUserGraphWriter(
- shardId = shardId().toString,
- env = serviceEnv(),
- hosename = hoseName(),
- bufferSize = bufferSize,
- kafkaConsumerBuilder = kafkaConfigBuilder,
- clientId = clientId.name,
- statsReceiver = statsReceiver
- )
- graphWriter.initHose(graph)
-
- val recommendUsersHandler = RecommendUsersHandlerImpl(
- graph,
- Constants.salsaRunnerConfig,
- recosDecider,
- statsReceiverWrapper
- )
-
- val recos = new UserUserGraph(recommendUsersHandler) with LoggingUserUserGraph
-
- // For MutualTLS
- val serviceIdentifier = ServiceIdentifier(
- role = serviceRole(),
- service = serviceName(),
- environment = serviceEnv(),
- zone = dataCenter()
- )
-
- val thriftServer = ThriftMux.server
- .withOpportunisticTls(OpportunisticTls.Required)
- .withMutualTls(serviceIdentifier)
- .serveIface(servicePort(), recos)
-
- this.addAdminRoute(ElfOwlFilter.getPostbackRoute())
-
- val elfowlFilter = ElfOwlFilter(
- ByLdapGroup(ldapGroups),
- Duration.fromTimeUnit(5, TimeUnit.DAYS)
- )
-
- log.info(s"ServiceIdentifier = ${serviceIdentifier.toString}")
- log.info("clientid: " + clientId.toString)
- log.info("servicePort: " + servicePort().toString)
- log.info("adding shutdown hook")
- onExit {
- graphWriter.shutdown()
- thriftServer.close(shutdownTimeout().fromNow)
- }
- log.info("added shutdown hook")
- // Wait on the thriftServer so that shutdownTimeout is respected.
- Await.result(thriftServer)
- }
-}
diff --git a/src/scala/com/twitter/recos/user_user_graph/README.docx b/src/scala/com/twitter/recos/user_user_graph/README.docx
new file mode 100644
index 000000000..919bb9f2a
Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/README.docx differ
diff --git a/src/scala/com/twitter/recos/user_user_graph/README.md b/src/scala/com/twitter/recos/user_user_graph/README.md
deleted file mode 100644
index 6412f235c..000000000
--- a/src/scala/com/twitter/recos/user_user_graph/README.md
+++ /dev/null
@@ -1,17 +0,0 @@
-# UserUserGraph (UUG)
-
-## What is it
-User User Graph (UUG) is a Finagle thrift service built on the GraphJet framework. It maintains a graph of user-user relationships and serves user recommendations based on traversals of this graph.
-
-## How is it used on Twitter
-UUG recommends users to follow based on whom the accounts in your follow graph have recently followed.
-The core idea behind UUG is collaborative filtering. UUG takes a user's weighted follow graph (i.e. a list of weighted userIds) as input,
-performs an efficient traversal & aggregation, and returns the top users, scored by how many of the input users engaged with them as well as
-those engaging users' weights.
-
-UUG is a stateful service and relies on a Kafka stream to ingest & persist state. It maintains in-memory user engagements from the past
-week. Older events are dropped and GC'ed.
-
-For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose, high-performance in-memory storage engine.
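-For illustration, a much-simplified sketch of the scoring idea (GraphJet's actual implementation runs over its segmented in-memory index; `recentFollows` is a hypothetical lookup standing in for the graph traversal):
-
-```scala
-// Toy second-degree aggregation: candidates are users recently followed by
-// the seeds; each candidate's score is the sum of its engaging seeds' weights.
-def recommend(
-  seedsWithWeights: Map[Long, Double], // the requester's weighted follow graph
-  recentFollows: Long => Seq[Long],    // hypothetical: a seed's recent follows
-  topK: Int
-): Seq[(Long, Double)] = {
-  val scores = scala.collection.mutable.Map.empty[Long, Double].withDefaultValue(0.0)
-  for {
-    (seed, weight) <- seedsWithWeights
-    candidate <- recentFollows(seed)
-  } scores(candidate) += weight
-  scores.toSeq.sortBy { case (_, score) => -score }.take(topK)
-}
-```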
-- https://github.com/twitter/GraphJet -- http://www.vldb.org/pvldb/vol9/p1281-sharma.pdf diff --git a/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.docx b/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.docx new file mode 100644 index 000000000..629b920d3 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.scala b/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.scala deleted file mode 100644 index fa1978bbb..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/RecommendUsersHandler.scala +++ /dev/null @@ -1,221 +0,0 @@ -package com.twitter.recos.user_user_graph - -import java.util.Random -import com.google.common.collect.Lists -import com.twitter.concurrent.AsyncQueue -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.algorithms.counting.TopSecondDegreeByCountResponse -import com.twitter.graphjet.algorithms.counting.user.TopSecondDegreeByCountForUser -import com.twitter.graphjet.algorithms.counting.user.TopSecondDegreeByCountRequestForUser -import com.twitter.graphjet.algorithms.counting.user.UserRecommendationInfo -import com.twitter.graphjet.algorithms.ConnectingUsersWithMetadata -import com.twitter.graphjet.algorithms.filters._ -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.logging.Logger -import com.twitter.recos.decider.UserUserGraphDecider -import com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.model.SalsaQueryRunner.SalsaRunnerConfig -import com.twitter.recos.recos_common.thriftscala.UserSocialProofType -import com.twitter.recos.user_user_graph.thriftscala._ -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request.RequestHandler -import com.twitter.util.Future -import com.twitter.util.Try -import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap -import it.unimi.dsi.fastutil.longs.LongOpenHashSet -import scala.collection.JavaConverters._ - -trait RecommendUsersHandler extends RequestHandler[RecommendUserRequest, RecommendUserResponse] - -/** - * Computes user recommendations based on a RecommendUserRequest by using - * TopSecondDegree algorithm in GraphJet. 
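- *
- * A fixed pool of salsaRunnerConfig.numSalsaRunners TopSecondDegreeByCountForUser
- * runners is kept in an AsyncQueue; each request polls a runner, runs the
- * computation, and always offers the runner back in an ensure block. If the
- * poll takes longer than salsaRunnerConfig.timeoutSalsaRunner, the request
- * fails fast and an empty response is returned.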
- */ -case class RecommendUsersHandlerImpl( - bipartiteGraph: NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph, - salsaRunnerConfig: SalsaRunnerConfig, - decider: UserUserGraphDecider, - statsReceiverWrapper: FinagleStatsReceiverWrapper) - extends RecommendUsersHandler { - - private val log: Logger = Logger(this.getClass.getSimpleName) - private val stats = statsReceiverWrapper.statsReceiver.scope(this.getClass.getSimpleName) - private val failureCounter = stats.counter("failure") - private val recsStat = stats.stat("recs_count") - private val emptyCounter = stats.counter("empty") - private val pollCounter = stats.counter("poll") - private val pollTimeoutCounter = stats.counter("pollTimeout") - private val offerCounter = stats.counter("offer") - private val pollLatencyStat = stats.stat("pollLatency") - private val graphJetQueue = new AsyncQueue[TopSecondDegreeByCountForUser] - (0 until salsaRunnerConfig.numSalsaRunners).foreach { _ => - graphJetQueue.offer( - new TopSecondDegreeByCountForUser( - bipartiteGraph, - salsaRunnerConfig.expectedNodesToHitInSalsa, - statsReceiverWrapper.scope(this.getClass.getSimpleName) - ) - ) - } - - /** - * Given a user_user_graph request, make it conform to GraphJet's request format - */ - private def convertRequestToJava( - request: RecommendUserRequest - ): TopSecondDegreeByCountRequestForUser = { - val queryNode = request.requesterId - val leftSeedNodesWithWeight = new Long2DoubleOpenHashMap( - request.seedsWithWeights.keys.toArray, - request.seedsWithWeights.values.toArray - ) - val toBeFiltered = new LongOpenHashSet(request.excludedUserIds.getOrElse(Nil).toArray) - val maxNumResults = request.maxNumResults.getOrElse(DefaultRequestParams.MaxNumResults) - val maxNumSocialProofs = - request.maxNumSocialProofs.getOrElse(DefaultRequestParams.MaxNumSocialProofs) - val minUserPerSocialProof = convertMinUserPerSocialProofToJava(request.minUserPerSocialProof) - val socialProofTypes = - UserEdgeTypeMask.getUserUserGraphSocialProofTypes(request.socialProofTypes) - val maxRightNodeAgeInMillis = DefaultRequestParams.MaxRightNodeAgeThreshold - val maxEdgeEngagementAgeInMillis = - request.maxEdgeEngagementAgeInMillis.getOrElse(DefaultRequestParams.MaxEdgeAgeThreshold) - val resultFilterChain = new ResultFilterChain( - Lists.newArrayList( - new SocialProofTypesFilter(statsReceiverWrapper), - new RequestedSetFilter(statsReceiverWrapper) - ) - ) - - new TopSecondDegreeByCountRequestForUser( - queryNode, - leftSeedNodesWithWeight, - toBeFiltered, - maxNumResults, - maxNumSocialProofs, - UserEdgeTypeMask.SIZE.toInt, - minUserPerSocialProof, - socialProofTypes, - maxRightNodeAgeInMillis, - maxEdgeEngagementAgeInMillis, - resultFilterChain - ) - } - - /** - * Converts the thrift scala type to the Java equivalent - */ - private def convertMinUserPerSocialProofToJava( - socialProofInScala: Option[scala.collection.Map[UserSocialProofType, Int]] - ): java.util.Map[java.lang.Byte, java.lang.Integer] = { - socialProofInScala - .map { - _.map { - case (key: UserSocialProofType, value: Int) => - (new java.lang.Byte(key.getValue.toByte), new java.lang.Integer(value)) - } - } - .getOrElse(Map.empty[java.lang.Byte, java.lang.Integer]) - .asJava - } - - /** - * Converts a byte-array format of social proofs in Java to its Scala equivalent - */ - private def convertSocialProofsToScala( - socialProofs: java.util.Map[java.lang.Byte, ConnectingUsersWithMetadata] - ): scala.collection.mutable.Map[UserSocialProofType, scala.Seq[Long]] = { - socialProofs.asScala.map { - case 
(socialProofByte, socialProof) => - val proofType = UserSocialProofType(socialProofByte.toByte) - val ids = socialProof.getConnectingUsers.asScala.map(_.toLong) - (proofType, ids) - } - } - - /** - * Converts Java recommendation results to its Scala equivalent - */ - private def convertResponseToScala( - responseOpt: Option[TopSecondDegreeByCountResponse] - ): RecommendUserResponse = { - responseOpt match { - case Some(rawResponse) => - val userSeq = rawResponse.getRankedRecommendations.asScala.toSeq.flatMap { - case userRecs: UserRecommendationInfo => - Some( - RecommendedUser( - userRecs.getRecommendation, - userRecs.getWeight, - convertSocialProofsToScala(userRecs.getSocialProof) - ) - ) - case _ => - None - } - recsStat.add(userSeq.size) - if (userSeq.isEmpty) { - emptyCounter.incr() - } - RecommendUserResponse(userSeq) - case None => - emptyCounter.incr() - RecommendUserResponse(Nil) - } - } - - private def getGraphJetResponse( - graphJet: TopSecondDegreeByCountForUser, - request: TopSecondDegreeByCountRequestForUser, - random: Random - )( - implicit statsReceiver: StatsReceiver - ): Option[TopSecondDegreeByCountResponse] = { - trackBlockStats(stats) { - // compute recs -- need to catch and print exceptions here otherwise they are swallowed - val recAttempt = Try(graphJet.computeRecommendations(request, random)).onFailure { e => - failureCounter.incr() - log.error(e, "GraphJet computation failed") - } - recAttempt.toOption - } - } - - override def apply(request: RecommendUserRequest): Future[RecommendUserResponse] = { - val random = new Random() - val graphJetRequest = convertRequestToJava(request) - pollCounter.incr() - val t0 = System.currentTimeMillis - graphJetQueue.poll().map { graphJetRunner => - val pollTime = System.currentTimeMillis - t0 - pollLatencyStat.add(pollTime) - val response = Try { - if (pollTime < salsaRunnerConfig.timeoutSalsaRunner) { - convertResponseToScala( - getGraphJetResponse( - graphJetRunner, - graphJetRequest, - random - )(statsReceiverWrapper.statsReceiver) - ) - } else { - // if we did not get a runner in time, then fail fast here and immediately put it back - log.warning("GraphJet Queue polling timeout") - pollTimeoutCounter.incr() - throw new RuntimeException("GraphJet poll timeout") - RecommendUserResponse(Nil) - } - } ensure { - graphJetQueue.offer(graphJetRunner) - offerCounter.incr() - } - response.toOption.getOrElse(RecommendUserResponse(Nil)) - } - } - - object DefaultRequestParams { - val MaxNumResults = 100 - val MaxNumSocialProofs = 100 - val MaxRightNodeAgeThreshold: Long = Long.MaxValue - val MaxEdgeAgeThreshold: Long = Long.MaxValue - } -} diff --git a/src/scala/com/twitter/recos/user_user_graph/RecosConfig.docx b/src/scala/com/twitter/recos/user_user_graph/RecosConfig.docx new file mode 100644 index 000000000..3b8e9d1e1 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/RecosConfig.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/RecosConfig.scala b/src/scala/com/twitter/recos/user_user_graph/RecosConfig.scala deleted file mode 100644 index 38c17fc5e..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/RecosConfig.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.recos.model.Constants -import com.twitter.recos.graph_common.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraphBuilder.GraphBuilderConfig - -/** - * The class holds all the config parameters for recos graph. 
- */ -object RecosConfig { - val maxNumSegments: Int = 5 - val maxNumEdgesPerSegment: Int = 1 << 26 // 64M edges per segment - val expectedNumLeftNodes: Int = 1 << 24 // should correspond to 16M nodes storage - val expectedMaxLeftDegree: Int = 64 - val leftPowerLawExponent: Double = 16.0 // steep power law as most nodes will have a small degree - val expectedNumRightNodes: Int = 1 << 24 // 16M nodes - val numRightNodeMetadataTypes = 1 // UUG does not have node metadata - - val graphBuilderConfig = GraphBuilderConfig( - maxNumSegments = maxNumSegments, - maxNumEdgesPerSegment = maxNumEdgesPerSegment, - expectedNumLeftNodes = expectedNumLeftNodes, - expectedMaxLeftDegree = expectedMaxLeftDegree, - leftPowerLawExponent = leftPowerLawExponent, - expectedNumRightNodes = expectedNumRightNodes, - numRightNodeMetadataTypes = numRightNodeMetadataTypes, - edgeTypeMask = new UserEdgeTypeMask() - ) - - println("RecosConfig - maxNumSegments " + maxNumSegments) - println("RecosConfig - maxNumEdgesPerSegment " + maxNumEdgesPerSegment) - println("RecosConfig - expectedNumLeftNodes " + expectedNumLeftNodes) - println("RecosConfig - expectedMaxLeftDegree " + expectedMaxLeftDegree) - println("RecosConfig - leftPowerLawExponent " + leftPowerLawExponent) - println("RecosConfig - expectedNumRightNodes " + expectedNumRightNodes) - println("RecosConfig - numRightNodeMetadataTypes " + numRightNodeMetadataTypes) - println("RecosConfig - salsaRunnerConfig " + Constants.salsaRunnerConfig) -} diff --git a/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.docx b/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.docx new file mode 100644 index 000000000..9fb6846e9 Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.scala b/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.scala deleted file mode 100644 index ac29bebf2..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/UserEdgeTypeMask.scala +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.recos_common.thriftscala.UserSocialProofType - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. Follow, Mention, and Mediatag. Under current segment configuration, each segment - * stores up to 128M edges. Assuming that each node on one side is unique, each segment - * stores up to 128M unique nodes on one side, which occupies the lower 27 bits of an integer. - * This leaves five bits to encode the edge types, which at max can store 32 edge types. - * The following implementation utilizes the top four bits and leaves one free bit out. - */ -class UserEdgeTypeMask extends EdgeTypeMask { - import UserEdgeTypeMask._ - override def encode(node: Int, edgeType: Byte): Int = { - require( - edgeType == FOLLOW || edgeType == MENTION || edgeType == MEDIATAG, - s"encode: Illegal edge type argument $edgeType") - node | EDGEARRAY(edgeType) - } - - override def edgeType(node: Int): Byte = { - (node >> 28).toByte - } - - override def restore(node: Int): Int = { - node & MASK - } -} - -object UserEdgeTypeMask { - - /** - * Reserve the top four bits of each integer to encode the edge type information. 
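- *
- * For example, encoding a MEDIATAG edge (edge type 2) to node 12345:
- * {{{
- * val encoded = 12345 | EDGEARRAY(2) // 12345 | (2 << 28) = 0x20003039
- * val tpe = (encoded >> 28).toByte   // = 2, i.e. MEDIATAG
- * val node = encoded & MASK          // = 12345
- * }}}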
- */ - val MASK: Int = - Integer.parseInt("00001111111111111111111111111111", 2) - val FOLLOW: Byte = 0 - val MENTION: Byte = 1 - val MEDIATAG: Byte = 2 - val SIZE: Byte = 3 - val UNUSED3: Byte = 3 - val UNUSED4: Byte = 4 - val UNUSED5: Byte = 5 - val UNUSED6: Byte = 6 - val UNUSED7: Byte = 7 - val UNUSED8: Byte = 8 - val UNUSED9: Byte = 9 - val UNUSED10: Byte = 10 - val UNUSED11: Byte = 11 - val UNUSED12: Byte = 12 - val UNUSED13: Byte = 13 - val UNUSED14: Byte = 14 - val UNUSED15: Byte = 15 - val EDGEARRAY: Array[Int] = Array( - 0, - 1 << 28, - 2 << 28, - 3 << 28, - 4 << 28, - 5 << 28, - 6 << 28, - 7 << 28, - 8 << 28, - 9 << 28, - 10 << 28, - 11 << 28, - 12 << 28, - 13 << 28, - 14 << 28, - 15 << 28 - ) - - /** - * Map valid social proof types specified by clients to an array of bytes. If clients do not - * specify any social proof types in thrift, it will return all available social types by - * default. - * - * @param socialProofTypes are the valid socialProofTypes specified by clients - * @return an array of bytes representing valid social proof types - */ - def getUserUserGraphSocialProofTypes( - socialProofTypes: Option[Seq[UserSocialProofType]] - ): Array[Byte] = { - socialProofTypes - .map { _.map { _.getValue }.toArray } - .getOrElse((0 until SIZE).toArray) - .map { _.toByte } - } -} diff --git a/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.docx b/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.docx new file mode 100644 index 000000000..fd641cd2e Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.scala b/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.scala deleted file mode 100644 index 128597f90..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/UserUserGraph.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.finagle.thrift.ClientId -import com.twitter.finagle.tracing.{Trace, TraceId} -import com.twitter.recos.user_user_graph.thriftscala._ -import com.twitter.util.Future - -object UserUserGraph { - def traceId: TraceId = Trace.id - def clientId: Option[ClientId] = ClientId.current -} - -class UserUserGraph(recommendUsersHandler: RecommendUsersHandler) - extends thriftscala.UserUserGraph.MethodPerEndpoint { - - override def recommendUsers(request: RecommendUserRequest): Future[RecommendUserResponse] = - recommendUsersHandler(request) -} diff --git a/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.docx b/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.docx new file mode 100644 index 000000000..011a474db Binary files /dev/null and b/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.docx differ diff --git a/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.scala b/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.scala deleted file mode 100644 index 637f29717..000000000 --- a/src/scala/com/twitter/recos/user_user_graph/UserUserGraphWriter.scala +++ /dev/null @@ -1,83 +0,0 @@ -package com.twitter.recos.user_user_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.bipartite.NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph -import com.twitter.graphjet.bipartite.segment.NodeMetadataLeftIndexedBipartiteGraphSegment -import com.twitter.recos.hose.common.UnifiedGraphWriter -import 
com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.util.Action - -case class UserUserGraphWriter( - shardId: String, - env: String, - hosename: String, - bufferSize: Int, - kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage], - clientId: String, - statsReceiver: StatsReceiver) - extends UnifiedGraphWriter[ - NodeMetadataLeftIndexedBipartiteGraphSegment, - NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph - ] { - - // The max throughput for each kafka consumer is around 25MB/s - // Use 3 processors for 75MB/s catch-up speed. - val consumerNum: Int = 3 - // Leave 2 Segments for live writer - val catchupWriterNum: Int = RecosConfig.maxNumSegments - 2 - - import UserUserGraphWriter._ - - private def getEdgeType(action: Byte): Byte = { - if (action == Action.Follow.id) { - UserEdgeTypeMask.FOLLOW - } else if (action == Action.Mention.id) { - UserEdgeTypeMask.MENTION - } else if (action == Action.MediaTag.id) { - UserEdgeTypeMask.MEDIATAG - } else { - throw new IllegalArgumentException("getEdgeType: Illegal edge type argument " + action) - } - } - - /** - * Adds a RecosHoseMessage to the graph. used by live writer to insert edges to the - * current segment - */ - override def addEdgeToGraph( - graph: NodeMetadataLeftIndexedPowerLawMultiSegmentBipartiteGraph, - recosHoseMessage: RecosHoseMessage - ): Unit = { - graph.addEdge( - recosHoseMessage.leftId, - recosHoseMessage.rightId, - getEdgeType(recosHoseMessage.action), - recosHoseMessage.edgeMetadata.getOrElse(0L), - EMTPY_NODE_METADATA, - EMTPY_NODE_METADATA - ) - } - - /** - * Adds a RecosHoseMessage to the given segment in the graph. Used by catch up writers to - * insert edges to non-current (old) segments - */ - override def addEdgeToSegment( - segment: NodeMetadataLeftIndexedBipartiteGraphSegment, - recosHoseMessage: RecosHoseMessage - ): Unit = { - segment.addEdge( - recosHoseMessage.leftId, - recosHoseMessage.rightId, - getEdgeType(recosHoseMessage.action), - recosHoseMessage.edgeMetadata.getOrElse(0L), - EMTPY_NODE_METADATA, - EMTPY_NODE_METADATA - ) - } -} - -private object UserUserGraphWriter { - final val EMTPY_NODE_METADATA = new Array[Array[Int]](1) -} diff --git a/src/scala/com/twitter/recos/user_video_graph/BUILD b/src/scala/com/twitter/recos/user_video_graph/BUILD deleted file mode 100644 index f85d7ba96..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/BUILD +++ /dev/null @@ -1,69 +0,0 @@ -scala_library( - name = "user-video-graph", - sources = ["*.scala"], - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/cascading:cascading-local", - "3rdparty/jvm/com/backtype:dfs-datastores", - "3rdparty/jvm/com/fasterxml/jackson/module:jackson-module-scala", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/netflix/curator:curator-framework", - "3rdparty/jvm/com/twitter/graphjet", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/kafka:rosette-kafka", - "3rdparty/jvm/org/apache/thrift:libthrift", - "abdecider/src/main/scala", - "decider/src/main/scala", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/server", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-memcached/src/main/scala", - "finagle/finagle-stats/src/main/scala", - 
"finagle/finagle-thriftmux/src/main/scala", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util", - "scrooge/scrooge-core/src/main/scala", - "servo/repo/src/main/scala", - "servo/request/src/main/scala", - "servo/util/src/main/scala", - "src/resources/com/twitter/recos:decider", - "src/scala/com/twitter/recos/decider", - "src/scala/com/twitter/recos/graph_common", - "src/scala/com/twitter/recos/hose/common", - "src/scala/com/twitter/recos/model:recos-model", - "src/scala/com/twitter/recos/serviceapi", - "src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers", - "src/scala/com/twitter/recos/user_video_graph/store", - "src/scala/com/twitter/recos/util:recos-util", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/recos:recos-common-scala", - "src/thrift/com/twitter/recos:recos-internal-scala", - "src/thrift/com/twitter/recos/user_video_graph:user_video_graph-scala", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/model", - "twitter-server-internal/src/main/scala", - "twitter-server/server/src/main/scala", - "twitter-server/slf4j-jdk14/src/main/scala/com/twitter/server/logging", - "util/util-app/src/main/scala", - "util/util-hashing/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "bin", - basename = "user-video-graph-server", - main = "com.twitter.recos.user_video_graph.Main", - runtime_platform = "java11", - tags = ["known-to-fail-jira:SD-20771"], - dependencies = [ - ":user-video-graph", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "twitter-server/slf4j-jdk14/src/main/scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_video_graph/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/BUILD.docx new file mode 100644 index 000000000..fd6f5956a Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.docx b/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.docx new file mode 100644 index 000000000..0cf013b67 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.scala b/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.scala deleted file mode 100644 index b7747596c..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/LoggingUserVideoGraph.scala +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.finagle.tracing.Trace -import com.twitter.logging.Logger -import com.twitter.recos.recos_common.thriftscala._ -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.util.Future - -trait LoggingUserVideoGraph extends thriftscala.UserVideoGraph.MethodPerEndpoint { - private[this] val accessLog = Logger("access") - -} diff --git a/src/scala/com/twitter/recos/user_video_graph/Main.docx b/src/scala/com/twitter/recos/user_video_graph/Main.docx new file mode 100644 index 000000000..9d1991be2 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/Main.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/Main.scala b/src/scala/com/twitter/recos/user_video_graph/Main.scala deleted file mode 100644 index 96b2a6218..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/Main.scala +++ /dev/null @@ -1,294 +0,0 @@ -package 
com.twitter.recos.user_video_graph - -import com.twitter.abdecider.ABDeciderFactory -import com.twitter.abdecider.LoggingABDecider -import com.twitter.app.Flag -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.ThriftMux -import com.twitter.finagle.http.HttpMuxer -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.mtls.client.MtlsStackClient.MtlsThriftMuxClientSyntax -import com.twitter.finagle.mtls.server.MtlsStackServer._ -import com.twitter.finagle.mux.ClientDiscardedRequestException -import com.twitter.finagle.mux.transport.OpportunisticTls -import com.twitter.finagle.service.ReqRep -import com.twitter.finagle.service.ResponseClass -import com.twitter.finagle.thrift.ClientId -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.finatra.kafka.domain.KafkaGroupId -import com.twitter.finatra.kafka.domain.SeekStrategy -import com.twitter.finatra.kafka.serde.ScalaSerdes -import com.twitter.frigate.common.util.ElfOwlFilter -import com.twitter.frigate.common.util.ElfOwlFilter.ByLdapGroup -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph -import com.twitter.logging._ -import com.twitter.recos.decider.EndpointLoadShedder -import com.twitter.recos.decider.UserTweetGraphDecider -import com.twitter.recos.graph_common.FinagleStatsReceiverWrapper -import com.twitter.recos.graph_common.MultiSegmentPowerLawBipartiteGraphBuilder -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.user_video_graph.RecosConfig._ -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ConsumersBasedRelatedTweetsHandler -import com.twitter.recos.user_video_graph.relatedTweetHandlers.TweetBasedRelatedTweetsHandler -import com.twitter.recos.user_video_graph.relatedTweetHandlers.ProducerBasedRelatedTweetsHandler -import com.twitter.recos.user_video_graph.store.UserRecentFollowersStore -import com.twitter.server.Deciderable -import com.twitter.server.TwitterServer -import com.twitter.server.logging.{Logging => JDK14Logging} -import com.twitter.servo.request._ -import com.twitter.servo.util.ExceptionCounter -import com.twitter.simclusters_v2.common.UserId -import com.twitter.socialgraph.thriftscala.SocialGraphService -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Await -import com.twitter.util.Duration -import com.twitter.util.JavaTimer -import com.twitter.util.Throw -import com.twitter.util.Timer -import java.net.InetSocketAddress -import java.util.concurrent.TimeUnit -import org.apache.kafka.clients.CommonClientConfigs -import org.apache.kafka.common.config.SaslConfigs -import org.apache.kafka.common.config.SslConfigs -import org.apache.kafka.common.security.auth.SecurityProtocol -import org.apache.kafka.common.serialization.StringDeserializer -import scala.reflect.ClassTag - -object Main extends TwitterServer with JDK14Logging with Deciderable { - profile => - - val shardId: Flag[Int] = flag("shardId", 0, "Shard ID") - val servicePort: Flag[InetSocketAddress] = - flag("service.port", new InetSocketAddress(10143), "Thrift service port") - val logDir: Flag[String] = flag("logdir", "recos", "Logging directory") - val numShards: Flag[Int] = flag("numShards", 1, "Number of shards for this service") - val truststoreLocation: Flag[String] = - flag[String]("truststore_location", "", "Truststore file location") - val hoseName: Flag[String] = - flag("hosename", "recos_injector_user_user", "the kafka stream used for incoming edges") - - val 
dataCenter: Flag[String] = flag("service.cluster", "atla", "Data Center") - val serviceRole: Flag[String] = flag("service.role", "Service Role") - val serviceEnv: Flag[String] = flag("service.env", "Service Env") - val serviceName: Flag[String] = flag("service.name", "Service Name") - - private val maxNumSegments = - flag("maxNumSegments", graphBuilderConfig.maxNumSegments, "the number of segments in the graph") - - private val statsReceiverWrapper = FinagleStatsReceiverWrapper(statsReceiver) - - /** - * A ClientRequestAuthorizer to be used in a request-authorization RequestFilter. - */ - lazy val clientAuthorizer: ClientRequestAuthorizer = - ClientRequestAuthorizer.observed( - ClientRequestAuthorizer.permissive, - new ClientRequestObserver(statsReceiver) - ) - - lazy val clientId = ClientId(s"usertweetgraph.${serviceEnv()}") - - private def makeThriftClient[ThriftServiceType: ClassTag]( - dest: String, - label: String, - serviceIdentifier: ServiceIdentifier, - requestTimeout: Duration = 100.milliseconds - ): ThriftServiceType = { - ThriftMux.client - .withClientId(ClientId("usertweetgraph.prod")) - .withOpportunisticTls(OpportunisticTls.Required) - .withMutualTls(serviceIdentifier) - .withRequestTimeout(requestTimeout) - .withStatsReceiver(statsReceiver.scope("clnt")) - .withResponseClassifier { - case ReqRep(_, Throw(_: ClientDiscardedRequestException)) => ResponseClass.Ignorable - }.build[ThriftServiceType](dest, label) - } - - private val shutdownTimeout = flag( - "service.shutdownTimeout", - 5.seconds, - "Maximum amount of time to wait for pending requests to complete on shutdown" - ) - - /** - * ExceptionCounter for tracking failures from RequestHandler(s). - */ - lazy val exceptionCounter = new ExceptionCounter(statsReceiver) - - /** - * Function for translating exceptions returned by a RequestHandler. Useful - * for cases where underlying exception types should be wrapped in those - * defined in the project's Thrift IDL. 
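- * Currently the identity mapping: exceptions are passed through untranslated.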
- */ - lazy val translateExceptions: PartialFunction[Throwable, Throwable] = { - case t => t - } - - val DefaultLdapAccessGroup: Seq[String] = Seq("eng", "cassowary-group", "timeline-team") - - // ********* logging ********** - - lazy val loggingLevel: Level = Level.INFO - lazy val recosLogPath: String = logDir() + "/recos.log" - lazy val graphLogPath: String = logDir() + "/graph.log" - lazy val accessLogPath: String = logDir() + "/access.log" - - override def loggerFactories: List[LoggerFactory] = - List( - LoggerFactory( - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = recosLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "graph", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = graphLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "access", - useParents = false, - level = Some(loggingLevel), - handlers = QueueingHandler( - handler = FileHandler( - filename = accessLogPath, - level = Some(loggingLevel), - rollPolicy = Policy.Hourly, - rotateCount = 6, - formatter = new Formatter - ) - ) :: Nil - ), - LoggerFactory( - node = "client_event", - level = Some(loggingLevel), - useParents = false, - handlers = QueueingHandler( - maxQueueSize = 10000, - handler = ScribeHandler( - category = "client_event", - formatter = BareFormatter - ) - ) :: Nil - ) - ) - // ******** Decider ************* - - // ********* ABdecider ********** - - val abDeciderYmlPath: String = "/usr/local/config/abdecider/abdecider.yml" - - val scribeLogger: Option[Logger] = Some(Logger.get("client_event")) - - val abDecider: LoggingABDecider = - ABDeciderFactory( - abDeciderYmlPath = abDeciderYmlPath, - scribeLogger = scribeLogger, - environment = Some("production") - ).buildWithLogging() - - // ********* Recos service ********** - - def main(): Unit = { - log.info("building graph with maxNumSegments = " + profile.maxNumSegments()) - - implicit val timer: Timer = new JavaTimer(true) - - val graph = MultiSegmentPowerLawBipartiteGraphBuilder( - graphBuilderConfig.copy(maxNumSegments = profile.maxNumSegments()), - statsReceiverWrapper - ) - - val kafkaConfigBuilder = FinagleKafkaConsumerBuilder[String, RecosHoseMessage]() - .dest("/s/kafka/recommendations:kafka-tls") - .groupId(KafkaGroupId(f"user_video_graph-${shardId()}%06d")) - .keyDeserializer(new StringDeserializer) - .valueDeserializer(ScalaSerdes.Thrift[RecosHoseMessage].deserializer) - .seekStrategy(SeekStrategy.REWIND) - .rewindDuration(48.hours) - .withConfig(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, SecurityProtocol.SASL_SSL.toString) - .withConfig(SslConfigs.SSL_TRUSTSTORE_LOCATION_CONFIG, truststoreLocation()) - .withConfig(SaslConfigs.SASL_MECHANISM, SaslConfigs.GSSAPI_MECHANISM) - .withConfig(SaslConfigs.SASL_KERBEROS_SERVICE_NAME, "kafka") - .withConfig(SaslConfigs.SASL_KERBEROS_SERVER_NAME, "kafka") - - val graphWriter = - UserVideoGraphWriter( - shardId().toString, - serviceEnv(), - hoseName(), - 128, // keep the original setting. 
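- // bufferSize; presumably the capacity of the internal queue between the
- // Kafka reader threads and the graph writer threads (see UnifiedGraphWriter).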
- kafkaConfigBuilder, - clientId.name, - statsReceiver, - ) - graphWriter.initHose(graph) - - // For MutualTLS - val serviceIdentifier = ServiceIdentifier( - role = serviceRole(), - service = serviceName(), - environment = serviceEnv(), - zone = dataCenter() - ) - log.info(s"ServiceIdentifier = ${serviceIdentifier.toString}") - - val socialGraphClient: SocialGraphService.MethodPerEndpoint = - makeThriftClient[SocialGraphService.MethodPerEndpoint]( - "/s/socialgraph/socialgraph", - "socialgraph", - serviceIdentifier) - val userRecentFollowersStore: ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]] = - new UserRecentFollowersStore(socialGraphClient) - - val tweetBasedRelatedTweetsHandler = new TweetBasedRelatedTweetsHandler(graph, statsReceiver) - val consumersBasedRelatedTweetsHandler = - new ConsumersBasedRelatedTweetsHandler(graph, statsReceiver) - val producerBasedRelatedTweetsHandler = - new ProducerBasedRelatedTweetsHandler(graph, userRecentFollowersStore, statsReceiver) - - val decider = UserTweetGraphDecider(serviceEnv(), dataCenter()) - val endpointLoadShedder = new EndpointLoadShedder(decider) - val userVideoGraph = - new UserVideoGraph( - tweetBasedRelatedTweetsHandler, - producerBasedRelatedTweetsHandler, - consumersBasedRelatedTweetsHandler, - endpointLoadShedder)(timer) with LoggingUserVideoGraph - - val thriftServer = ThriftMux.server - .withOpportunisticTls(OpportunisticTls.Required) - .withMutualTls(serviceIdentifier) - .serveIface(servicePort(), userVideoGraph) - - log.info("clientid: " + clientId.toString) - log.info("servicePort: " + servicePort().toString) - - log.info("adding shutdown hook") - onExit { - graphWriter.shutdown() - thriftServer.close(shutdownTimeout().fromNow) - } - log.info("added shutdown hook") - - // Wait on the thriftServer so that shutdownTimeout is respected. - Await.result(thriftServer) - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/README.docx b/src/scala/com/twitter/recos/user_video_graph/README.docx new file mode 100644 index 000000000..3933965c4 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/README.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/README.md b/src/scala/com/twitter/recos/user_video_graph/README.md deleted file mode 100644 index 71de5deef..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/README.md +++ /dev/null @@ -1,14 +0,0 @@ -# UserVideoGraph (UVG) - -## What is it -User Video Graph (UVG) is a Finalge thrift service built on the GraphJet framework. In maintains a graph of user-video engagements and serves user recommendations based on traversals in this graph. - -## How is it used on Twitter -UVG generates video recommendations from a given seed tweet set. It recommends tweets based on collaborative filtering & random walks. - -UVG is a stateful service and relies on a Kafka stream to ingest & persist states. The Kafka stream is processed and generated by Recos-Injector. -It maintains an in-memory user engagements over the past 24-48 hours. Older events are dropped and GC'ed. - -For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose high performance in-memory storage engine. 
-- https://github.com/twitter/GraphJet -- http://www.vldb.org/pvldb/vol9/p1281-sharma.pdf diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.docx new file mode 100644 index 000000000..302557a4a Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.scala deleted file mode 100644 index 9a6c577d2..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/UserVideoEdgeTypeMask.scala +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.graphjet.bipartite.api.EdgeTypeMask -import com.twitter.recos.util.Action - -/** - * The bit mask is used to encode edge types in the top bits of an integer, - * e.g. favorite, retweet, reply and click. Under current segment configuration, each segment - * stores up to 128M edges. Assuming that each node on one side is unique, each segment - * stores up to 128M unique nodes on one side, which occupies the lower 27 bits of an integer. - * This leaves five bits to encode the edge types, which at max can store 32 edge types. - * The following implementation utilizes the top four bits and leaves one free bit out. - */ -class UserVideoEdgeTypeMask extends EdgeTypeMask { - import UserVideoEdgeTypeMask._ - - override def encode(node: Int, edgeType: Byte): Int = { - if (edgeType < 0 || edgeType > SIZE) { - throw new IllegalArgumentException("encode: Illegal edge type argument " + edgeType) - } else { - node | (edgeType << 28) - } - } - - override def edgeType(node: Int): Byte = { - (node >>> 28).toByte - } - - override def restore(node: Int): Int = { - node & MASK - } -} - -object UserVideoEdgeTypeMask extends Enumeration { - - type UserTweetEdgeTypeMask = Value - - /** - * Byte values corresponding to the action taken on a tweet, which will be encoded in the - * top 4 bits in a tweet Id - * NOTE: THERE CAN ONLY BE UP TO 16 TYPES - */ - val VideoPlayback50: UserTweetEdgeTypeMask = Value(1) - - /** - * Reserve the top four bits of each integer to encode the edge type information. 
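- *
- * For example, encoding a VideoPlayback50 engagement (edge type 1) on node 98765:
- * {{{
- * val encoded = 98765 | (VideoPlayback50.id << 28) // 98765 | (1 << 28)
- * val tpe = (encoded >>> 28).toByte                // = 1, i.e. VideoPlayback50
- * val node = encoded & MASK                        // = 98765
- * }}}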
- */ - val MASK: Int = Integer.parseInt("00001111111111111111111111111111", 2) - val SIZE: Int = this.values.size - - /** - * Converts the action byte in the RecosHoseMessage into GraphJet internal byte mapping - */ - def actionTypeToEdgeType(actionByte: Byte): Byte = { - val edgeType = Action(actionByte) match { - case Action.VideoPlayback50 => VideoPlayback50.id - case _ => - throw new IllegalArgumentException("getEdgeType: Illegal edge type argument " + actionByte) - } - edgeType.toByte - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.docx new file mode 100644 index 000000000..5aad7e38c Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.scala deleted file mode 100644 index f22486ef3..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraph.scala +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.finagle.thrift.ClientId -import com.twitter.finagle.tracing.Trace -import com.twitter.finagle.tracing.TraceId -import com.twitter.recos.decider.EndpointLoadShedder -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Timer -import scala.concurrent.duration.MILLISECONDS -import com.twitter.logging.Logger -import com.twitter.recos.user_tweet_graph.relatedTweetHandlers.ConsumersBasedRelatedTweetsHandler -import com.twitter.recos.user_video_graph.relatedTweetHandlers.ProducerBasedRelatedTweetsHandler -import com.twitter.recos.user_video_graph.relatedTweetHandlers.TweetBasedRelatedTweetsHandler - -object UserVideoGraph { - def traceId: TraceId = Trace.id - def clientId: Option[ClientId] = ClientId.current -} - -class UserVideoGraph( - tweetBasedRelatedTweetsHandler: TweetBasedRelatedTweetsHandler, - producerBasedRelatedTweetsHandler: ProducerBasedRelatedTweetsHandler, - consumersBasedRelatedTweetsHandler: ConsumersBasedRelatedTweetsHandler, - endpointLoadShedder: EndpointLoadShedder -)( - implicit timer: Timer) - extends thriftscala.UserVideoGraph.MethodPerEndpoint { - - private val defaultTimeout: Duration = Duration(50, MILLISECONDS) - private val EmptyResponse = Future.value(RelatedTweetResponse()) - private val log = Logger() - - override def tweetBasedRelatedTweets( - request: TweetBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("videoGraphTweetBasedRelatedTweets") { - tweetBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-video-graph_tweetBasedRelatedTweets" + e) - EmptyResponse - } - - override def producerBasedRelatedTweets( - request: ProducerBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("producerBasedRelatedTweets") { - producerBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-video-graph_producerBasedRelatedTweets" + e) - EmptyResponse - } - - override def consumersBasedRelatedTweets( - request: ConsumersBasedRelatedTweetRequest - ): Future[RelatedTweetResponse] = - endpointLoadShedder("consumersBasedRelatedTweets") { - 
consumersBasedRelatedTweetsHandler(request).raiseWithin(defaultTimeout) - }.rescue { - case EndpointLoadShedder.LoadSheddingException => - EmptyResponse - case e => - log.info("user-video-graph_consumersBasedRelatedTweets" + e) - EmptyResponse - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.docx new file mode 100644 index 000000000..7e4de3b42 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.scala deleted file mode 100644 index c99280133..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphConfig.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.recos.graph_common.MultiSegmentPowerLawBipartiteGraphBuilder.GraphBuilderConfig - -/** - * The class holds all the config parameters for recos graph. - */ -object RecosConfig { - val maxNumSegments: Int = 8 - val maxNumEdgesPerSegment: Int = - (1 << 28) // 268M edges per segment, should be able to include 2 days' data - val expectedNumLeftNodes: Int = - (1 << 26) // should correspond to 67M nodes storage - val expectedMaxLeftDegree: Int = 64 - val leftPowerLawExponent: Double = 16.0 // steep power law as most nodes will have a small degree - val expectedNumRightNodes: Int = (1 << 26) // 67M nodes - val expectedMaxRightDegree: Int = scala.math.pow(1024, 2).toInt // some nodes will be very popular - val rightPowerLawExponent: Double = 4.0 // this will be less steep - - val graphBuilderConfig = GraphBuilderConfig( - maxNumSegments = maxNumSegments, - maxNumEdgesPerSegment = maxNumEdgesPerSegment, - expectedNumLeftNodes = expectedNumLeftNodes, - expectedMaxLeftDegree = expectedMaxLeftDegree, - leftPowerLawExponent = leftPowerLawExponent, - expectedNumRightNodes = expectedNumRightNodes, - expectedMaxRightDegree = expectedMaxRightDegree, - rightPowerLawExponent = rightPowerLawExponent - ) - - println("RecosConfig - maxNumSegments " + maxNumSegments) - println("RecosConfig - maxNumEdgesPerSegment " + maxNumEdgesPerSegment) - println("RecosConfig - expectedNumLeftNodes " + expectedNumLeftNodes) - println("RecosConfig - expectedMaxLeftDegree " + expectedMaxLeftDegree) - println("RecosConfig - leftPowerLawExponent " + leftPowerLawExponent) - println("RecosConfig - expectedNumRightNodes " + expectedNumRightNodes) - println("RecosConfig - expectedMaxRightDegree " + expectedMaxRightDegree) - println("RecosConfig - rightPowerLawExponent " + rightPowerLawExponent) -} diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.docx new file mode 100644 index 000000000..394cedd11 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.scala deleted file mode 100644 index b2464016c..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphEdgeHttpHandler.scala +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.finagle.Service -import com.twitter.finagle.http.Request -import 
com.twitter.finagle.http.Response -import com.twitter.finagle.http.Status -import com.twitter.finagle.http.Version -import com.twitter.frigate.common.util.HTMLUtil -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import com.twitter.graphjet.bipartite.MultiSegmentIterator -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph -import com.twitter.logging.Logger -import com.twitter.util.Future -import java.util.Random -import scala.collection.mutable.ListBuffer - -class UserTweetGraphEdgeHttpHandler(graph: MultiSegmentPowerLawBipartiteGraph) - extends Service[Request, Response] { - private val log = Logger("UserTweetGraphEdgeHttpHandler") - private val tweetIDMask = new TweetIDMask() - - def getCardInfo(rightNode: Long): String = { - val bits: Long = rightNode & TweetIDMask.METAMASK - bits match { - case TweetIDMask.PHOTO => "Photo" - case TweetIDMask.PLAYER => "Video" - case TweetIDMask.SUMMARY => "Url" - case TweetIDMask.PROMOTION => "Promotion" - case _ => "Regular" - } - } - - private def getUserEdges(userId: Long): ListBuffer[Edge] = { - val random = new Random() - val iterator = - graph - .getRandomLeftNodeEdges(userId, 10, random).asInstanceOf[MultiSegmentIterator[ - BipartiteGraphSegment - ]] - val tweets = new ListBuffer[Edge]() - if (iterator != null) { - while (iterator.hasNext) { - val rightNode = iterator.nextLong() - val edgeType = iterator.currentEdgeType() - tweets += Edge( - tweetIDMask.restore(rightNode), - UserVideoEdgeTypeMask(edgeType).toString, - getCardInfo(rightNode), - ) - } - } - tweets - } - - def apply(httpRequest: Request): Future[Response] = { - log.info("UserTweetGraphEdgeHttpHandler params: " + httpRequest.getParams()) - val time0 = System.currentTimeMillis - - val tweetId = httpRequest.getLongParam("tweetId") - val queryTweetDegree = graph.getRightNodeDegree(tweetId) - val tweetEdges = getTweetEdges(tweetId) - - val userId = httpRequest.getLongParam("userId") - val queryUserDegree = graph.getLeftNodeDegree(userId) - - val response = Response(Version.Http11, Status.Ok) - val userEdges = getUserEdges(userId) - val elapsed = System.currentTimeMillis - time0 - val comment = ("Please specify \"userId\" or \"tweetId\" param." + - "\n query tweet degree = " + queryTweetDegree + - "\n query user degree = " + queryUserDegree + - "\n done in %d ms
").format(elapsed) - val tweetContent = userEdges.toList - .map { edge => - s"TweetId: ${edge.tweetId},\nAction type: ${edge.actionType},\nCard type: ${edge.cardType}" - .replaceAll("\n", " ") - }.mkString("\n
\n") - - response.setContentString( - HTMLUtil.html.replace("XXXXX", comment + tweetContent + "\n


\n" + tweetEdges.toString())) - Future.value(response) - } - - private def getTweetEdges(tweetId: Long): ListBuffer[Long] = { - val random = new Random() - val iterator = - graph - .getRandomRightNodeEdges(tweetId, 500, random).asInstanceOf[MultiSegmentIterator[ - BipartiteGraphSegment - ]] - val terms = new ListBuffer[Long]() - if (iterator != null) { - while (iterator.hasNext) { terms += iterator.nextLong() } - } - terms.distinct - } - -} - -case class Edge(tweetId: Long, actionType: String, cardType: String) diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx new file mode 100644 index 000000000..946016494 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala deleted file mode 100644 index 4909e0386..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.recos.user_video_graph - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import com.twitter.recos.hose.common.UnifiedGraphWriter -import com.twitter.recos.internal.thriftscala.RecosHoseMessage -import com.twitter.recos.serviceapi.Tweetypie._ - -/** - * The class submits a number of $numBootstrapWriters graph writer threads, BufferedEdgeWriter, - * during service startup. One of them is live writer thread, and the other $(numBootstrapWriters - 1) - * are catchup writer threads. All of them consume kafka events from an internal concurrent queue, - * which is populated by kafka reader threads. At bootstrap time, the kafka reader threads look - * back kafka offset from several hours ago and populate the internal concurrent queue. - * Each graph writer thread writes to an individual graph segment separately. - * The $(numBootstrapWriters - 1) catchup writer threads will stop once all events - * between current system time at startup and the time in memcache are processed. - * The live writer thread will continue to write all incoming kafka events. - * It lives through the entire life cycle of recos graph service. - */ -case class UserVideoGraphWriter( - shardId: String, - env: String, - hosename: String, - bufferSize: Int, - kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage], - clientId: String, - statsReceiver: StatsReceiver) - extends UnifiedGraphWriter[BipartiteGraphSegment, MultiSegmentPowerLawBipartiteGraph] { - writer => - // The max throughput for each kafka consumer is around 25MB/s - // Use 4 processors for 100MB/s catch-up speed. - val consumerNum: Int = 4 - // Leave 1 Segments to LiveWriter - val catchupWriterNum: Int = RecosConfig.maxNumSegments - 1 - - /** - * Adds a RecosHoseMessage to the graph. 
used by live writer to insert edges to the - * current segment - */ - override def addEdgeToGraph( - graph: MultiSegmentPowerLawBipartiteGraph, - recosHoseMessage: RecosHoseMessage - ): Unit = { - graph.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserVideoEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action), - ) - } - - /** - * Adds a RecosHoseMessage to the given segment in the graph. Used by catch up writers to - * insert edges to non-current (old) segments - */ - override def addEdgeToSegment( - segment: BipartiteGraphSegment, - recosHoseMessage: RecosHoseMessage - ): Unit = { - segment.addEdge( - recosHoseMessage.leftId, - getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card), - UserVideoEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action) - ) - } - - private def getMetaEdge(rightId: Long, cardOption: Option[Byte]): Long = { - cardOption - .map { card => - if (isPhotoCard(card)) TweetIDMask.photo(rightId) - else if (isPlayerCard(card)) TweetIDMask.player(rightId) - else if (isSummaryCard(card)) TweetIDMask.summary(rightId) - else if (isPromotionCard(card)) TweetIDMask.promotion(rightId) - else rightId - } - .getOrElse(rightId) - } - -} diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD deleted file mode 100644 index ad9caf129..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "servo/request/src/main/scala", - "src/scala/com/twitter/recos/user_video_graph/store", - "src/scala/com/twitter/recos/user_video_graph/util", - "src/scala/com/twitter/recos/util:recos-util", - "src/thrift/com/twitter/recos/user_video_graph:user_video_graph-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx new file mode 100644 index 000000000..b8c1754a1 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..16ac865fe Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala deleted file mode 100644 index 44a190e0d..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,66 +0,0 @@ -package com.twitter.recos.user_tweet_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_video_graph.util.FilterUtil -import 
com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import com.twitter.util.Future -import scala.concurrent.duration.HOURS - -/** - * Implementation of the Thrift-defined service interface for consumersTweetBasedRelatedTweets. - * Given a list of consumer userIds, finds the tweets they co-engaged with (we're treating the input userIds as consumers, therefore "consumersTweetBasedRelatedTweets"). - * Example use case: given a list of a user's contacts in their address book, find tweets those contacts engaged with. - */ -class ConsumersBasedRelatedTweetsHandler( - bipartiteGraph: BipartiteGraph, - statsReceiver: StatsReceiver) - extends RequestHandler[ConsumersBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: ConsumersBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - - val maxResults = request.maxResults.getOrElse(200) - val minScore = request.minScore.getOrElse(0.0) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minCooccurrence = request.minCooccurrence.getOrElse(3) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val consumerSeedSet = request.consumerSeedSet.distinct.filter { userId => - val userDegree = bipartiteGraph.getLeftNodeDegree(userId) - // constrain to users that have <100 engagements to avoid spammy behavior - userDegree < 100 - } - - val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - consumerSeedSet, - bipartiteGraph - ) - - val scorePreFactor = 1000.0 / consumerSeedSet.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rhsTweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter(relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - .contains(relatedTweet.tweetId))).take(maxResults) - - stats.stat("response_size").add(relatedTweets.size) - Future.value(RelatedTweetResponse(tweets = relatedTweets)) - } - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..93367b0c3 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala deleted file mode 100644 index 5f26ded6e..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.recos.user_video_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import
com.twitter.util.Future -import scala.concurrent.duration.HOURS -import com.twitter.simclusters_v2.common.UserId -import com.twitter.storehaus.ReadableStore -import com.twitter.recos.user_video_graph.store.UserRecentFollowersStore -import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_video_graph.util.FilterUtil -import com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil - -/** - * Implementation of the Thrift-defined service interface for producerBasedRelatedTweets. - * - */ -class ProducerBasedRelatedTweetsHandler( - bipartiteGraph: BipartiteGraph, - userRecentFollowersStore: ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]], - statsReceiver: StatsReceiver) - extends RequestHandler[ProducerBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: ProducerBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - val maxResults = request.maxResults.getOrElse(200) - val maxNumFollowers = request.maxNumFollowers.getOrElse(500) - val minScore = request.minScore.getOrElse(0.0) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minCooccurrence = request.minCooccurrence.getOrElse(4) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val followersFut = fetchFollowers(request.producerId, Some(maxNumFollowers)) - followersFut.map { followers => - val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - followers, - bipartiteGraph - ) - - val scorePreFactor = 1000.0 / followers.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rhsTweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter { relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - .contains(relatedTweet.tweetId)) - }.take(maxResults) - stats.stat("response_size").add(relatedTweets.size) - RelatedTweetResponse(tweets = relatedTweets) - } - } - } - - private def fetchFollowers( - producerId: Long, - maxNumFollower: Option[Int], - ): Future[Seq[Long]] = { - val query = - UserRecentFollowersStore.Query(producerId, maxNumFollower, None) - - val followersFut = userRecentFollowersStore.get(query) - followersFut.map { followersOpt => - val followers = followersOpt.getOrElse(Seq.empty) - val followerIds = followers.distinct.filter { userId => - val userDegree = bipartiteGraph.getLeftNodeDegree(userId) - // constrain to more active users that have >1 engagement to optimize latency, and <500 engagements to avoid spammy behavior - userDegree > 1 && userDegree < 500 - } - stats.stat("follower_size_after_filter").add(followerIds.size) - followerIds - } - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx new file mode 100644 index 000000000..0beb7556e Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala 
b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala deleted file mode 100644 index 7150a2f0f..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.recos.user_video_graph.relatedTweetHandlers - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForQuery -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.recos.user_video_graph.util.FilterUtil -import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil -import com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil -import com.twitter.recos.user_video_graph.util.GetAllInternalTweetIdsUtil -import com.twitter.recos.user_video_graph.util.SampleLHSUsersUtil -import com.twitter.recos.util.Stats._ -import com.twitter.servo.request._ -import com.twitter.util.Duration -import com.twitter.util.Future -import scala.concurrent.duration.HOURS - -/** - * Implementation of the Thrift-defined service interface for tweetBasedRelatedTweets. - * - */ -class TweetBasedRelatedTweetsHandler(bipartiteGraph: BipartiteGraph, statsReceiver: StatsReceiver) - extends RequestHandler[TweetBasedRelatedTweetRequest, RelatedTweetResponse] { - private val stats = statsReceiver.scope(this.getClass.getSimpleName) - - override def apply(request: TweetBasedRelatedTweetRequest): Future[RelatedTweetResponse] = { - trackFutureBlockStats(stats) { - val internalQueryTweetIds = - GetAllInternalTweetIdsUtil.getAllInternalTweetIds(request.tweetId, bipartiteGraph) - - val response = internalQueryTweetIds match { - case head +: Nil => getRelatedTweets(request, head) - case _ => RelatedTweetResponse() - } - Future.value(response) - } - } - - private def getRelatedTweets( - request: TweetBasedRelatedTweetRequest, - maskedTweetId: Long - ): RelatedTweetResponse = { - - val maxNumSamplesPerNeighbor = request.maxNumSamplesPerNeighbor.getOrElse(100) - val maxResults = request.maxResults.getOrElse(200) - val minScore = request.minScore.getOrElse(0.5) - val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48) - val minResultDegree = request.minResultDegree.getOrElse(50) - val minQueryDegree = request.minQueryDegree.getOrElse(10) - val minCooccurrence = request.minCooccurrence.getOrElse(3) - val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet - - val queryTweetDegree = bipartiteGraph.getRightNodeDegree(maskedTweetId) - stats.stat("queryTweetDegree").add(queryTweetDegree) - - if (queryTweetDegree < minQueryDegree) { - stats.counter("queryTweetDegreeLessThanMinQueryDegree").incr() - RelatedTweetResponse() - } else { - - val sampledLHSuserIds = - SampleLHSUsersUtil.sampleLHSUsers(maskedTweetId, maxNumSamplesPerNeighbor, bipartiteGraph) - - val rHStweetIds = FetchRHSTweetsUtil.fetchRHSTweets( - sampledLHSuserIds, - bipartiteGraph, - ) - - val scorePreFactor = - queryTweetDegree / math.log(queryTweetDegree) / sampledLHSuserIds.distinct.size - val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates( - rHStweetIds, - minCooccurrence, - minResultDegree, - scorePreFactor, - bipartiteGraph) - - val relatedTweets = relatedTweetCandidates - .filter(relatedTweet => - FilterUtil.tweetAgeFilter( - relatedTweet.tweetId, - Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds - 
.contains(relatedTweet.tweetId))).take(maxResults) - - stats.stat("response_size").add(relatedTweets.size) - RelatedTweetResponse( - tweets = relatedTweets, - queryTweetGraphFeatures = Some(GraphFeaturesForQuery(degree = Some(queryTweetDegree)))) - } - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/store/BUILD b/src/scala/com/twitter/recos/user_video_graph/store/BUILD deleted file mode 100644 index b1c3562b7..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/store/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:core", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/socialgraph:thrift-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx new file mode 100644 index 000000000..eb2b1796e Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx new file mode 100644 index 000000000..396c73ef7 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala deleted file mode 100644 index 7d1b6df6f..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.recos.user_video_graph.store - -import com.twitter.simclusters_v2.common.UserId -import com.twitter.socialgraph.thriftscala.EdgesRequest -import com.twitter.socialgraph.thriftscala.EdgesResult -import com.twitter.socialgraph.thriftscala.PageRequest -import com.twitter.socialgraph.thriftscala.RelationshipType -import com.twitter.socialgraph.thriftscala.SrcRelationship -import com.twitter.socialgraph.thriftscala.SocialGraphService -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Time - -class UserRecentFollowersStore( - sgsClient: SocialGraphService.MethodPerEndpoint) - extends ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]] { - - override def get(key: UserRecentFollowersStore.Query): Future[Option[Seq[UserId]]] = { - val edgeRequest = EdgesRequest( - relationship = SrcRelationship(key.userId, RelationshipType.FollowedBy), - // Could have a better guess at count when k.maxAge != None - pageRequest = Some(PageRequest(count = key.maxResults)) - ) - - val lookbackThresholdMillis = key.maxAge - .map(maxAge => (Time.now - maxAge).inMilliseconds) - .getOrElse(0L) - - sgsClient - .edges(Seq(edgeRequest)) - .map(_.flatMap { - case EdgesResult(edges, _, _) => - edges.collect { - case e if e.createdAt >= lookbackThresholdMillis => - e.target - } - }) - .map(Some(_)) - } -} - -object UserRecentFollowersStore { - case class Query( - userId: UserId, - // maxResults - if Some(count), we return only the `count` most recent follows - maxResults: Option[Int] = None, - // maxAge - if Some(duration), return only follows since `Time.now - duration` - maxAge: Option[Duration] = None) -} diff --git a/src/scala/com/twitter/recos/user_video_graph/util/BUILD 
b/src/scala/com/twitter/recos/user_video_graph/util/BUILD deleted file mode 100644 index a8a1364e1..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/graphjet", - "snowflake:id", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "src/scala/com/twitter/recos/util:recos-util", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/recos/user_video_graph:user_video_graph-scala", - ], -) diff --git a/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx new file mode 100644 index 000000000..16679198c Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx new file mode 100644 index 000000000..9005a537b Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala deleted file mode 100644 index 63041c1d0..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.recos.user_video_graph.util - -import com.twitter.graphjet.bipartite.MultiSegmentIterator -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import scala.collection.mutable.ListBuffer - -object FetchRHSTweetsUtil { - // get RHS tweets given LHS users - def fetchRHSTweets( - userIds: Seq[Long], - bipartiteGraph: BipartiteGraph - ): Seq[Long] = { - userIds.distinct - .flatMap { userId => - val tweetIdsIterator = bipartiteGraph - .getLeftNodeEdges(userId).asInstanceOf[MultiSegmentIterator[BipartiteGraphSegment]] - - val tweetIds = new ListBuffer[Long]() - if (tweetIdsIterator != null) { - while (tweetIdsIterator.hasNext) { - val rightNode = tweetIdsIterator.nextLong() - tweetIds += rightNode - } - } - tweetIds.distinct - } - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx new file mode 100644 index 000000000..9dcdfb574 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala deleted file mode 100644 index ca827070d..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.recos.user_video_graph.util - -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.util.Duration -import com.twitter.util.Time - -object FilterUtil { - def tweetAgeFilter(tweetId: TweetId, maxAge: Duration): Boolean = { - SnowflakeId - .timeFromIdOpt(tweetId) - .map { tweetTime => tweetTime > Time.now - maxAge }.getOrElse(false) - // If there's no snowflake timestamp, we have no idea when this tweet happened. 
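- // e.g. tweetAgeFilter(tweetId, Duration(48, HOURS)) keeps only tweets whose snowflake
- // timestamp falls within the last 48 hours; pre-snowflake IDs yield None above and are
- // therefore dropped. (Illustrative comment; the call shape matches the handlers above.)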
- } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx new file mode 100644 index 000000000..3e984d05f Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala deleted file mode 100644 index 8628f3a10..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.recos.user_video_graph.util - -import com.twitter.graphjet.algorithms.TweetIDMask -import com.twitter.graphjet.bipartite.api.BipartiteGraph - -object GetAllInternalTweetIdsUtil { - - def getAllInternalTweetIds(tweetId: Long, bipartiteGraph: BipartiteGraph): Seq[Long] = { - val internalTweetIds = getAllMasks(tweetId) - sortByDegrees(internalTweetIds, bipartiteGraph) - } - - private def getAllMasks(tweetId: Long): Seq[Long] = { - Seq( - tweetId, - TweetIDMask.summary(tweetId), - TweetIDMask.photo(tweetId), - TweetIDMask.player(tweetId), - TweetIDMask.promotion(tweetId) - ) - } - - private def sortByDegrees( - encodedTweetIds: Seq[Long], - bipartiteGraph: BipartiteGraph - ): Seq[Long] = { - encodedTweetIds - .map { encodedTweetId => (encodedTweetId, bipartiteGraph.getRightNodeDegree(encodedTweetId)) } - .filter { case (_, degree) => degree > 0 } // keep only tweetIds with positive degree - .sortBy { case (_, degree) => -degree } // sort by degree in descending order - .map { case (encodedTweetId, _) => encodedTweetId } - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx new file mode 100644 index 000000000..ab9357599 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala deleted file mode 100644 index 176e129db..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.recos.user_video_graph.util - -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.recos.user_video_graph.thriftscala._ -import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForTweet -import com.twitter.graphjet.algorithms.TweetIDMask - -object GetRelatedTweetCandidatesUtil { - private val tweetIDMask = new TweetIDMask - - /** - * calculate scores for each RHS tweet that we get back - * for tweetBasedRelatedTweet, scorePreFactor = queryTweetDegree / log(queryTweetDegree) / LHSuserSize - * and the final score will be a log-cosine score - * for non-tweetBasedRelatedTweet, we don't have a query tweet; to keep the scoring function consistent, - * scorePreFactor = 1000.0 / LHSuserSize (queryTweetDegree's average is ~10k, 1000 ~= 10k/log(10k)) - * Though scorePreFactor is applied for all results within a request, it's still useful to make scores comparable across requests, - * so we can have a unified min_score and help with downstream score normalization - * **/ - def getRelatedTweetCandidates( - 
relatedTweetCandidates: Seq[Long], - minCooccurrence: Int, - minResultDegree: Int, - scorePreFactor: Double, - bipartiteGraph: BipartiteGraph - ): Seq[RelatedTweet] = { - relatedTweetCandidates - .groupBy(tweetId => tweetId) - .filterKeys(tweetId => bipartiteGraph.getRightNodeDegree(tweetId) > minResultDegree) - .mapValues(_.size) - .filter { case (_, cooccurrence) => cooccurrence >= minCooccurrence } - .toSeq - .map { - case (relatedTweetId, cooccurrence) => - val relatedTweetDegree = bipartiteGraph.getRightNodeDegree(relatedTweetId) - - val score = scorePreFactor * cooccurrence / math.log(relatedTweetDegree) - toRelatedTweet(relatedTweetId, score, relatedTweetDegree, cooccurrence) - } - .sortBy(-_.score) - } - - def toRelatedTweet( - relatedTweetId: Long, - score: Double, - relatedTweetDegree: Int, - cooccurrence: Int - ): RelatedTweet = { - RelatedTweet( - tweetId = tweetIDMask.restore(relatedTweetId), - score = score, - relatedTweetGraphFeatures = Some( - GraphFeaturesForTweet(cooccurrence = Some(cooccurrence), degree = Some(relatedTweetDegree))) - ) - } -} diff --git a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx new file mode 100644 index 000000000..e03440b41 Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx differ diff --git a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala deleted file mode 100644 index b8fd2c2f4..000000000 --- a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.recos.user_video_graph.util - -import com.twitter.graphjet.bipartite.MultiSegmentIterator -import com.twitter.graphjet.bipartite.api.BipartiteGraph -import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment -import java.util.Random -import scala.collection.mutable.ListBuffer - -object SampleLHSUsersUtil { - // sample userId nodes - def sampleLHSUsers( - maskedTweetId: Long, - maxNumSamplesPerNeighbor: Int, - bipartiteGraph: BipartiteGraph - ): Seq[Long] = { - val sampledUserIdsIterator = bipartiteGraph - .getRandomRightNodeEdges( - maskedTweetId, - maxNumSamplesPerNeighbor, - new Random(System.currentTimeMillis)).asInstanceOf[MultiSegmentIterator[ - BipartiteGraphSegment - ]] - - val userIds = new ListBuffer[Long]() - if (sampledUserIdsIterator != null) { - while (sampledUserIdsIterator.hasNext) { - val leftNode = sampledUserIdsIterator.nextLong() - // If a user likes too many things, we risk including spammy behavior. 
- if (bipartiteGraph.getLeftNodeDegree(leftNode) < 100) - userIds += leftNode - } - } - userIds - } -} diff --git a/src/scala/com/twitter/simclusters_v2/README.docx b/src/scala/com/twitter/simclusters_v2/README.docx new file mode 100644 index 000000000..4ce718d9a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/README.md b/src/scala/com/twitter/simclusters_v2/README.md deleted file mode 100644 index ae43836af..000000000 --- a/src/scala/com/twitter/simclusters_v2/README.md +++ /dev/null @@ -1,112 +0,0 @@ -# SimClusters: Community-based Representations for Heterogeneous Recommendations at Twitter - -## Overview -SimClusters is a general-purpose representation layer based on overlapping communities into which users as well as heterogeneous content can be captured as sparse, interpretable vectors to support a multitude of recommendation tasks. - -We build our user and tweet SimClusters embeddings based on the inferred communities, and the representations power our personalized tweet recommendation via our online serving service SimClusters ANN. - - -For more details, please read our paper that was published in KDD'2020 Applied Data Science Track: https://www.kdd.org/kdd2020/accepted-papers/view/simclusters-community-based-representations-for-heterogeneous-recommendatio - -## Brief introduction to Simclusters Algorithm - -### Follow relationships as a bipartite graph -Follow relationships on Twitter are perhaps most naturally thought of as a directed graph, where each node is a user and each edge represents a Follow. Edges are directed in that User 1 can follow User 2, User 2 can follow User 1, or both User 1 and User 2 can follow each other. - -This directed graph can be also viewed as a bipartite graph, where nodes are grouped into two sets, Producers and Consumers. In this bipartite graph, Producers are the users who are Followed and Consumers are the Followers. Below is a toy example of a follow graph for four users: - - - -> Figure 1 - Left panel: A directed follow graph; Right panel: A bipartite graph representation of the directed graph - -### Community Detection - Known For -The bipartite follow graph can be used to identify groups of Producers who have similar followers, or who are "Known For" a topic. Specifically, the bipartite follow graph can also be represented as an *m x n* matrix (*A*), where consumers are represented as *u* and producers are represented as *v*. - -Producer-producer similarity is computed as the cosine similarity between users who follow each producer. The resulting cosine similarity values can be used to construct a producer-producer similarity graph, where the nodes are producers and edges are weighted by the corresponding cosine similarity value. Noise removal is performed, such that edges with weights below a specified threshold are deleted from the graph. - -After noise removal has been completed, Metropolis-Hastings sampling-based community detection is then run on the Producer-Producer similarity graph to identify a community affiliation for each producer. This algorithm takes in a parameter *k* for the number of communities to be detected. - - - -> Figure 2 - Left panel: Matrix representation of the follow graph depicted in Figure 1; Middle panel: Producer-Producer similarity is estimated by calculating the cosine similarity between the users who follow each producer; Right panel: Cosine similarity scores are used to create the Producer-Producer similarity graph. 
A clustering algorithm is run on the graph to identify groups of Producers with similar followers. - -Community affiliation scores are then used to construct an *n x k* "Known For" matrix (*V*). This matrix is maximally sparse, and each Producer is affiliated with at most one community. In production, the Known For dataset covers the top 20M producers and k ~= 145000. In other words, we discover around 145k communities based on Twitter's user follow graph. - - - -> Figure 3 - The clustering algorithm returns community affiliation scores for each producer. These scores are represented in matrix V. - -In the example above, Producer 1 is "Known For" community 2, Producer 2 is "Known For" community 1, and so forth. - -### Consumer Embeddings - User InterestedIn -An Interested In matrix (*U*) can be computed by multiplying the matrix representation of the follow graph (*A*) by the Known For matrix (*V*): - - - -In this toy example, consumer 1 is interested in community 1 only, whereas consumer 3 is interested in all three communities. There is also a noise removal step applied to the Interested In matrix. - -We use the InterestedIn embeddings to capture consumers' long-term interests. The InterestedIn embeddings are one of our major sources for consumer-based tweet recommendations. - -### Producer Embeddings -When computing the Known For matrix, each producer can only be Known For a single community. Although this maximally sparse matrix is useful from a computational perspective, we know that our users tweet about many different topics and may be "Known" in many different communities. Producer embeddings ( *Ṽ* ) are used to capture this richer structure of the graph. - -To calculate producer embeddings, the cosine similarity is calculated between each Producer’s follow graph and the Interested In vector for each community. - - - -Producer embeddings are used for producer-based tweet recommendations. For example, we can recommend similar tweets based on an account you just followed. - -### Entity Embeddings -SimClusters can also be used to generate embeddings for different kinds of content, such as -- Tweets (used for Tweet recommendations) -- Topics (used for TopicFollow) - -#### Tweet embeddings -When a tweet is created, its tweet embedding is initialized as an empty vector. -Tweet embeddings are updated each time the tweet is favorited. Specifically, the InterestedIn vector of each user who Fav-ed the tweet is added to the tweet vector. -Since tweet embeddings are updated each time a tweet is favorited, they change over time. - -Tweet embeddings are critical for our tweet recommendation tasks. We can calculate tweet similarity and recommend similar tweets to users based on their tweet engagement history. - -We have an online Heron job that updates the tweet embeddings in realtime; check out [here](summingbird/README.md) for more. - -#### Topic embeddings -Topic embeddings (**R**) are determined by taking the cosine similarity between the consumers who are interested in a community and the aggregated (time-decayed) count of favorites each consumer has given to tweets carrying a given topic annotation. 
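Both entity embeddings above follow the same accumulation pattern: a piece of content is represented by (a function of) the sum of the InterestedIn vectors of the users who engaged with it. A minimal sketch of the fav-driven tweet-embedding update, assuming sparse `Map`-based vectors (the names here are illustrative, not the production Heron job's API; any normalization or decay the production job applies is omitted):

```scala
// Sparse SimClusters vectors: clusterId -> score.
type Embedding = Map[Int, Double]

// On each fav event, fold the fav-ing user's InterestedIn vector
// into the tweet's embedding.
def onFav(tweetEmbedding: Embedding, userInterestedIn: Embedding): Embedding =
  userInterestedIn.foldLeft(tweetEmbedding) {
    case (acc, (clusterId, score)) =>
      acc.updated(clusterId, acc.getOrElse(clusterId, 0.0) + score)
  }
```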
- - - - -## Project Directory Overview -The whole SimClusters project can be understood as 2 main components -- SimClusters Offline Jobs (Scalding / GCP) -- SimClusters Real-time Streaming Jobs - -### SimClusters Offline Jobs - -**SimClusters Scalding Jobs** - -| Jobs | Code | Description | -|---|---|---| -| KnownFor | [simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.scala](scalding/update_known_for/UpdateKnownFor20M145K2020.scala) | The job outputs the KnownFor dataset which stores the relationships between clusterId and producerUserId. KnownFor dataset covers the top 20M followed producers. We use this KnownFor dataset (or so-called clusters) to build all other entity embeddings. | -| InterestedIn Embeddings| [simclusters_v2/scalding/InterestedInFromKnownFor.scala](scalding/InterestedInFromKnownFor.scala) | This code implements the job for computing users' interestedIn embedding from the KnownFor dataset. We use this dataset for consumer-based tweet recommendations.| -| Producer Embeddings | [simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala](scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala) | The code implements the job for computing producer embeddings, which represent the content a user produces. We use this dataset for producer-based tweet recommendations.| -| Semantic Core Entity Embeddings | [simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.scala](scalding/embedding/EntityToSimClustersEmbeddingsJob.scala) | The job computes the semantic core entity embeddings. It outputs datasets that store the "SemanticCore entityId -> List(clusterId)" and "clusterId -> List(SemanticCore entityId)" relationships.| -| Topic Embeddings | [simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala](scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala) | Jobs to generate Fav-based Topic-Follow-Graph (TFG) topic embeddings. A topic's fav-based TFG embedding is the sum of its followers' fav-based InterestedIn. We use this embedding for topic-related recommendations.| - -**SimClusters GCP Jobs** - -We have a GCP pipeline where we build our SimClusters ANN index via BigQuery. This allows us to do fast iterations and build new embeddings more efficiently compared to Scalding. - -All SimClusters related GCP jobs are under [src/scala/com/twitter/simclusters_v2/scio/bq_generation](scio/bq_generation). - -| Jobs | Code | Description | -|---|---|---| -| PushOpenBased SimClusters ANN Index | [EngagementEventBasedClusterToTweetIndexGenerationJob.scala](scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala) | The job builds a clusterId -> TopTweet index based on user-open engagement history. This SANN source is used for candidate generation for Notifications. | -| VideoViewBased SimClusters Index| [EngagementEventBasedClusterToTweetIndexGenerationJob.scala](scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala) | The job builds a clusterId -> TopTweet index based on the user's video view history. 
This SANN source is used for video recommendation on Home.| - -### SimClusters Real-Time Streaming Tweets Jobs - -| Jobs | Code | Description | -|---|---|---| -| Tweet Embedding Job | [simclusters_v2/summingbird/storm/TweetJob.scala](summingbird/storm/TweetJob.scala) | Generates the Tweet embedding and index of tweets for the SimClusters | -| Persistent Tweet Embedding Job| [simclusters_v2/summingbird/storm/PersistentTweetJob.scala](summingbird/storm/PersistentTweetJob.scala) | Persists the tweet embeddings from MemCache into Manhattan.| \ No newline at end of file diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD deleted file mode 100644 index 7e242cbb9..000000000 --- a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:core", - "frigate/frigate-common:base", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/base", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/simclusters_v2/score", - "src/scala/com/twitter/simclusters_v2/summingbird/stores", - "src/scala/com/twitter/simclusters_v2/tweet_similarity", - "src/thrift/com/twitter/recos/user_tweet_entity_graph:user_tweet_entity_graph-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/wtf/interest:interest-thrift-scala", - "util/util-stats/src/main/scala/com/twitter/finagle/stats", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx new file mode 100644 index 000000000..c60fdd867 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx new file mode 100644 index 000000000..ec9a999d0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala deleted file mode 100644 index 9ef629a6c..000000000 --- a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.simclusters_v2.candidate_source - -import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores - -object ClusterRanker extends Enumeration { - val RankByNormalizedFavScore: ClusterRanker.Value = Value - val RankByFavScore: ClusterRanker.Value = Value - val RankByFollowScore: ClusterRanker.Value = Value - val RankByLogFavScore: ClusterRanker.Value = Value - val RankByNormalizedLogFavScore: ClusterRanker.Value = Value - - /** - * Given a map of clusters, sort out the top scoring clusters by a ranking scheme - * provided by the caller - */ - def getTopKClustersByScore( - clustersWithScores: Map[Int, UserToInterestedInClusterScores], - rankByScore: ClusterRanker.Value, - topK: Int - ): Map[Int, Double] = { - val rankedClustersWithScores = clustersWithScores.map { - case (clusterId, score) => - rankByScore match { - case ClusterRanker.RankByFavScore => - (clusterId, (score.favScore.getOrElse(0.0), score.followScore.getOrElse(0.0))) - case 
ClusterRanker.RankByFollowScore => - (clusterId, (score.followScore.getOrElse(0.0), score.favScore.getOrElse(0.0))) - case ClusterRanker.RankByLogFavScore => - (clusterId, (score.logFavScore.getOrElse(0.0), score.followScore.getOrElse(0.0))) - case ClusterRanker.RankByNormalizedLogFavScore => - ( - clusterId, - ( - score.logFavScoreClusterNormalizedOnly.getOrElse(0.0), - score.followScore.getOrElse(0.0))) - case ClusterRanker.RankByNormalizedFavScore => - ( - clusterId, - ( - score.favScoreProducerNormalizedOnly.getOrElse(0.0), - score.followScore.getOrElse(0.0))) - case _ => - ( - clusterId, - ( - score.favScoreProducerNormalizedOnly.getOrElse(0.0), - score.followScore.getOrElse(0.0))) - } - } - rankedClustersWithScores.toSeq - .sortBy(_._2) // sort in ascending order - .takeRight(topK) - .map { case (clusterId, scores) => clusterId -> math.max(scores._1, 1e-4) } - .toMap - } -} diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx new file mode 100644 index 000000000..2e861a518 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala deleted file mode 100644 index 407558ee3..000000000 --- a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.simclusters_v2.candidate_source - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.base.Stats -import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore} -import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId} -import com.twitter.util.Future -import com.twitter.storehaus.ReadableStore - -object HeavyRanker { - trait HeavyRanker { - def rank( - scoringAlgorithm: ScoringAlgorithm, - sourceEmbeddingId: SimClustersEmbeddingId, - candidateEmbeddingType: EmbeddingType, - minScore: Double, - candidates: Seq[SimClustersTweetCandidate] - ): Future[Seq[SimClustersTweetCandidate]] - } - - class UniformScoreStoreRanker( - uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore], - stats: StatsReceiver) - extends HeavyRanker { - val fetchCandidateEmbeddingsStat = stats.scope("fetchCandidateEmbeddings") - - def rank( - scoringAlgorithm: ScoringAlgorithm, - sourceEmbeddingId: SimClustersEmbeddingId, - candidateEmbeddingType: EmbeddingType, - minScore: Double, - candidates: Seq[SimClustersTweetCandidate] - ): Future[Seq[SimClustersTweetCandidate]] = { - val pairScoreIds = candidates.map { candidate => - ThriftScoreId( - scoringAlgorithm, - ScoreInternalId.SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingPairScoreId( - sourceEmbeddingId, - SimClustersEmbeddingId( - candidateEmbeddingType, - sourceEmbeddingId.modelVersion, - InternalId.TweetId(candidate.tweetId) - ) - )) - ) -> candidate.tweetId - }.toMap - - Future - 
.collect { - Stats.trackMap(fetchCandidateEmbeddingsStat) { - uniformScoringStore.multiGet(pairScoreIds.keySet) - } - } - .map { candidateScores => - candidateScores.toSeq - .collect { - case (pairScoreId, Some(score)) if score.score >= minScore => - SimClustersTweetCandidate(pairScoreIds(pairScoreId), score.score, sourceEmbeddingId) - } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx new file mode 100644 index 000000000..de812c266 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala deleted file mode 100644 index eb6684e7c..000000000 --- a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala +++ /dev/null @@ -1,637 +0,0 @@ -package com.twitter.simclusters_v2.candidate_source - -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.base.CandidateSource -import com.twitter.frigate.common.base.Stats -import com.twitter.simclusters_v2.candidate_source.HeavyRanker.UniformScoreStoreRanker -import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersANNConfig -import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.stores.ClusterKey -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore} -import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId} -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Time -import scala.collection.mutable - -/** - * This store looks for tweets whose similarity is close to a Source SimClustersEmbeddingId. - * - * Approximate cosine similarity is the core algorithm to drive this store. - * - * Steps 1 - 4 are in the "fetchCandidates" method. - * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId - * 2. Fetch top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index). - * 3. Calculate all the tweet candidates' dot-product or approximate cosine similarity to source tweets. - * 4. Take top M tweet candidates by step 3's score - * Steps 5 - 6 are in the "reranking" method. - * 5. Calculate the similarity score between source and candidates. - * 6. Return top N candidates by step 5's score. - * - * Warning: Only turn off step 5 for User InterestedIn candidate generation. 
It's the only use - * case in Recos where we use dot-product to rank the tweet candidates. - */ -case class SimClustersANNCandidateSource( - clusterTweetCandidatesStore: ReadableStore[ClusterKey, Seq[(TweetId, Double)]], - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - heavyRanker: HeavyRanker.HeavyRanker, - configs: Map[EmbeddingType, SimClustersANNConfig], - statsReceiver: StatsReceiver) - extends CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate] { - - import SimClustersANNCandidateSource._ - - override val name: String = this.getClass.getName - private val stats = statsReceiver.scope(this.getClass.getName) - - private val fetchSourceEmbeddingStat = stats.scope("fetchSourceEmbedding") - protected val fetchCandidateEmbeddingsStat = stats.scope("fetchCandidateEmbeddings") - private val fetchCandidatesStat = stats.scope("fetchCandidates") - private val rerankingStat = stats.scope("reranking") - - override def get( - query: SimClustersANNCandidateSource.Query - ): Future[Option[Seq[SimClustersTweetCandidate]]] = { - val sourceEmbeddingId = query.sourceEmbeddingId - loadConfig(query) match { - case Some(config) => - for { - maybeSimClustersEmbedding <- Stats.track(fetchSourceEmbeddingStat) { - simClustersEmbeddingStore.get(query.sourceEmbeddingId) - } - maybeFilteredCandidates <- maybeSimClustersEmbedding match { - case Some(sourceEmbedding) => - for { - rawCandidates <- Stats.trackSeq(fetchCandidatesStat) { - fetchCandidates(sourceEmbeddingId, config, sourceEmbedding) - } - rankedCandidates <- Stats.trackSeq(rerankingStat) { - reranking(sourceEmbeddingId, config, rawCandidates) - } - } yield { - fetchCandidatesStat - .stat( - sourceEmbeddingId.embeddingType.name, - sourceEmbeddingId.modelVersion.name).add(rankedCandidates.size) - Some(rankedCandidates) - } - case None => - fetchCandidatesStat - .stat( - sourceEmbeddingId.embeddingType.name, - sourceEmbeddingId.modelVersion.name).add(0) - Future.None - } - } yield { - maybeFilteredCandidates - } - case _ => - // Skip over queries whose config is not defined - Future.None - } - } - - private def fetchCandidates( - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - sourceEmbedding: SimClustersEmbedding - ): Future[Seq[SimClustersTweetCandidate]] = { - val now = Time.now - val earliestTweetId = SnowflakeId.firstIdFor(now - config.maxTweetCandidateAge) - val latestTweetId = SnowflakeId.firstIdFor(now - config.minTweetCandidateAge) - val clusterIds = - sourceEmbedding - .truncate(config.maxScanClusters).clusterIds - .map { clusterId: ClusterId => - ClusterKey(clusterId, sourceEmbeddingId.modelVersion, config.candidateEmbeddingType) - }.toSet - - Future - .collect { - clusterTweetCandidatesStore.multiGet(clusterIds) - }.map { clusterTweetsMap => - // Use Mutable map to optimize performance. The method is thread-safe. 
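- // (Both maps below are created locally inside this callback, so no state is shared
- // across concurrent requests.)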
- // Set initial map size to around p75 of the map size distribution to avoid too much copying - // from extending the size of the mutable hashmap - val candidateScoresMap = - new SimClustersANNCandidateSource.HashMap[TweetId, Double](InitialCandidateMapSize) - val candidateNormalizationMap = - new SimClustersANNCandidateSource.HashMap[TweetId, Double](InitialCandidateMapSize) - - clusterTweetsMap.foreach { - case (ClusterKey(clusterId, _, _, _), Some(tweetScores)) - if sourceEmbedding.contains(clusterId) => - val sourceClusterScore = sourceEmbedding.getOrElse(clusterId) - - for (i <- 0 until Math.min(tweetScores.size, config.maxTopTweetsPerCluster)) { - val (tweetId, score) = tweetScores(i) - - if (!parseTweetId(sourceEmbeddingId).contains(tweetId) && - tweetId >= earliestTweetId && tweetId <= latestTweetId) { - candidateScoresMap.put( - tweetId, - candidateScoresMap.getOrElse(tweetId, 0.0) + score * sourceClusterScore) - if (config.enablePartialNormalization) { - candidateNormalizationMap - .put(tweetId, candidateNormalizationMap.getOrElse(tweetId, 0.0) + score * score) - } - } - } - case _ => () - } - - stats.stat("candidateScoresMap").add(candidateScoresMap.size) - stats.stat("candidateNormalizationMap").add(candidateNormalizationMap.size) - - // Re-rank the candidates by configuration - val processedCandidateScores = candidateScoresMap.map { - case (candidateId, score) => - // Enable Partial Normalization - val processedScore = - if (config.enablePartialNormalization) { - // We applied the "log" version of partial normalization when we rank candidates - // by log cosine similarity - if (config.rankingAlgorithm == ScoringAlgorithm.PairEmbeddingLogCosineSimilarity) { - score / sourceEmbedding.l2norm / math.log( - 1 + candidateNormalizationMap(candidateId)) - } else { - score / sourceEmbedding.l2norm / math.sqrt(candidateNormalizationMap(candidateId)) - } - } else score - SimClustersTweetCandidate(candidateId, processedScore, sourceEmbeddingId) - }.toSeq - - processedCandidateScores - .sortBy(-_.score) - } - } - - private def reranking( - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - candidates: Seq[SimClustersTweetCandidate] - ): Future[Seq[SimClustersTweetCandidate]] = { - val rankedCandidates = if (config.enableHeavyRanking) { - heavyRanker - .rank( - scoringAlgorithm = config.rankingAlgorithm, - sourceEmbeddingId = sourceEmbeddingId, - candidateEmbeddingType = config.candidateEmbeddingType, - minScore = config.minScore, - candidates = candidates.take(config.maxReRankingCandidates) - ).map(_.sortBy(-_.score)) - } else { - Future.value(candidates) - } - rankedCandidates.map(_.take(config.maxNumResults)) - } - - private[candidate_source] def loadConfig(query: Query): Option[SimClustersANNConfig] = { - configs.get(query.sourceEmbeddingId.embeddingType).map { baseConfig => - // apply overrides if any - query.overrideConfig match { - case Some(overrides) => - baseConfig.copy( - maxNumResults = overrides.maxNumResults.getOrElse(baseConfig.maxNumResults), - maxTweetCandidateAge = - overrides.maxTweetCandidateAge.getOrElse(baseConfig.maxTweetCandidateAge), - minScore = overrides.minScore.getOrElse(baseConfig.minScore), - candidateEmbeddingType = - overrides.candidateEmbeddingType.getOrElse(baseConfig.candidateEmbeddingType), - enablePartialNormalization = - overrides.enablePartialNormalization.getOrElse(baseConfig.enablePartialNormalization), - enableHeavyRanking = - overrides.enableHeavyRanking.getOrElse(baseConfig.enableHeavyRanking), - rankingAlgorithm = 
overrides.rankingAlgorithm.getOrElse(baseConfig.rankingAlgorithm), - maxReRankingCandidates = - overrides.maxReRankingCandidates.getOrElse(baseConfig.maxReRankingCandidates), - maxTopTweetsPerCluster = - overrides.maxTopTweetsPerCluster.getOrElse(baseConfig.maxTopTweetsPerCluster), - maxScanClusters = overrides.maxScanClusters.getOrElse(baseConfig.maxScanClusters), - minTweetCandidateAge = - overrides.minTweetCandidateAge.getOrElse(baseConfig.minTweetCandidateAge) - ) - case _ => baseConfig - } - } - } -} - -object SimClustersANNCandidateSource { - - final val ProductionMaxNumResults = 200 - final val InitialCandidateMapSize = 16384 - - def apply( - clusterTweetCandidatesStore: ReadableStore[ClusterKey, Seq[(TweetId, Double)]], - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore], - configs: Map[EmbeddingType, SimClustersANNConfig], - statsReceiver: StatsReceiver - ) = new SimClustersANNCandidateSource( - clusterTweetCandidatesStore = clusterTweetCandidatesStore, - simClustersEmbeddingStore = simClustersEmbeddingStore, - heavyRanker = new UniformScoreStoreRanker(uniformScoringStore, statsReceiver), - configs = configs, - statsReceiver = statsReceiver - ) - - private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = { - embeddingId.internalId match { - case InternalId.TweetId(tweetId) => - Some(tweetId) - case _ => - None - } - } - - case class Query( - sourceEmbeddingId: SimClustersEmbeddingId, - // Only override the config in DDG and Debuggers. - // Use Post-filter for the holdbacks for better cache hit rate. - overrideConfig: Option[SimClustersANNConfigOverride] = None) - - case class SimClustersTweetCandidate( - tweetId: TweetId, - score: Double, - sourceEmbeddingId: SimClustersEmbeddingId) - - class HashMap[A, B](initSize: Int) extends mutable.HashMap[A, B] { - override def initialSize: Int = initSize // 16 - by default - } - - /** - * The Configuration of Each SimClusters ANN Candidate Source. - * We expect one SimClusters Embedding Type to map to one SimClusters ANN Configuration in Production. - */ - case class SimClustersANNConfig( - // The max number of candidates for an ANN Query - // Please don't override this value in Production. - maxNumResults: Int = ProductionMaxNumResults, - // The max tweet candidate duration from now. - maxTweetCandidateAge: Duration, - // The min score of the candidates - minScore: Double, - // The Candidate Embedding Type of Tweet. - candidateEmbeddingType: EmbeddingType, - // Enables normalization of approximate SimClusters vectors to remove popularity bias - enablePartialNormalization: Boolean, - // Whether to enable Embedding Similarity ranking - enableHeavyRanking: Boolean, - // The ranking algorithm for Source Candidate Similarity - rankingAlgorithm: ScoringAlgorithm, - // The max number of candidates in ReRanking Step - maxReRankingCandidates: Int, - // The max number of Top Tweets from every cluster tweet index - maxTopTweetsPerCluster: Int, - // The max number of Clusters in the source Embeddings. - maxScanClusters: Int, - // The min tweet candidate duration from now. - minTweetCandidateAge: Duration) - - /** - * Contains the same fields as [[SimClustersANNConfig]], to specify which fields are to be overridden - * for experimental purposes. - * - * All fields in this class must be optional. 
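- * e.g. an experiment can set only minScore and maxTopTweetsPerCluster and leave the remaining
- * fields as None, so the base config's values apply (see loadConfig above).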
- */ - case class SimClustersANNConfigOverride( - maxNumResults: Option[Int] = None, - maxTweetCandidateAge: Option[Duration] = None, - minScore: Option[Double] = None, - candidateEmbeddingType: Option[EmbeddingType] = None, - enablePartialNormalization: Option[Boolean] = None, - enableHeavyRanking: Option[Boolean] = None, - rankingAlgorithm: Option[ScoringAlgorithm] = None, - maxReRankingCandidates: Option[Int] = None, - maxTopTweetsPerCluster: Option[Int] = None, - maxScanClusters: Option[Int] = None, - minTweetCandidateAge: Option[Duration] = None, - enableLookbackSource: Option[Boolean] = None) - - final val DefaultMaxTopTweetsPerCluster = 200 - final val DefaultEnableHeavyRanking = false - object SimClustersANNConfig { - val DefaultSimClustersANNConfig: SimClustersANNConfig = - SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.7, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = false, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = 200, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ) - } - - val LookbackMediaMinDays: Int = 0 - val LookbackMediaMaxDays: Int = 2 - val LookbackMediaMaxTweetsPerDay: Int = 2000 - val maxTopTweetsPerCluster: Int = - (LookbackMediaMaxDays - LookbackMediaMinDays + 1) * LookbackMediaMaxTweetsPerDay - - val LookbackMediaTweetConfig: Map[EmbeddingType, SimClustersANNConfig] = { - val candidateEmbeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet - val minTweetAge = LookbackMediaMinDays.days - val maxTweetAge = - LookbackMediaMaxDays.days - 1.hour // To compensate for the cache TTL that might push the tweet age beyond max age - val rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity - - val maxScanClusters = 50 - val minScore = 0.5 - Map( - EmbeddingType.FavBasedProducer -> SimClustersANNConfig( - minTweetCandidateAge = minTweetAge, - maxTweetCandidateAge = maxTweetAge, - minScore = - minScore, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = candidateEmbeddingType, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = rankingAlgorithm, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = maxTopTweetsPerCluster, - maxScanClusters = maxScanClusters, - ), - EmbeddingType.LogFavLongestL2EmbeddingTweet -> SimClustersANNConfig( - minTweetCandidateAge = minTweetAge, - maxTweetCandidateAge = maxTweetAge, - minScore = - minScore, // for twistly candidates. 
To specify a higher threshold, use a post-filter - candidateEmbeddingType = candidateEmbeddingType, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = rankingAlgorithm, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = maxTopTweetsPerCluster, - maxScanClusters = maxScanClusters, - ), - EmbeddingType.FavTfgTopic -> SimClustersANNConfig( - minTweetCandidateAge = minTweetAge, - maxTweetCandidateAge = maxTweetAge, - minScore = minScore, - candidateEmbeddingType = candidateEmbeddingType, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = rankingAlgorithm, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = 200, - maxScanClusters = maxScanClusters, - ), - EmbeddingType.LogFavBasedKgoApeTopic -> SimClustersANNConfig( - minTweetCandidateAge = minTweetAge, - maxTweetCandidateAge = maxTweetAge, - minScore = minScore, - candidateEmbeddingType = candidateEmbeddingType, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = rankingAlgorithm, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = 200, - maxScanClusters = maxScanClusters, - ), - ) - } - - val DefaultConfigMappings: Map[EmbeddingType, SimClustersANNConfig] = Map( - EmbeddingType.FavBasedProducer -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedAverageAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. 
To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.RelaxedAggregatableLogFavBasedProducer -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.25, // for twistly candidates. To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 250, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavLongestL2EmbeddingTweet -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.3, // for twistly candidates. 
To specify a higher threshold, use a post-filter - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.FilteredUserInterestedInFromPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.7, // unused, heavy ranking disabled - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = false, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = - ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Unused, heavy ranking disabled - maxReRankingCandidates = 150, // unused, heavy ranking disabled - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.FilteredUserInterestedIn -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.7, // unused, heavy ranking disabled - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = false, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = - ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Unused, heavy ranking disabled - maxReRankingCandidates = 150, // unused, heavy ranking disabled - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.UnfilteredUserInterestedIn -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingLogCosineSimilarity, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.FollowBasedUserInterestedInFromAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 200, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.LogFavBasedUserInterestedInFromAPE -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 200, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.FavTfgTopic -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.5, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 
0.seconds - ), - EmbeddingType.LogFavBasedKgoApeTopic -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.5, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 400, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ), - EmbeddingType.UserNextInterestedIn -> SimClustersANNConfig( - maxTweetCandidateAge = 1.days, - minScore = 0.0, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - enablePartialNormalization = true, - enableHeavyRanking = DefaultEnableHeavyRanking, - rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - maxReRankingCandidates = 200, - maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster, - maxScanClusters = 50, - minTweetCandidateAge = 0.seconds - ) - ) - - /** - * Only cache the candidates if it's not a Consumer-source, e.g. TweetSource, ProducerSource, - * TopicSource. We don't cache consumer-sources (e.g. UserInterestedIn) because a cached consumer - * object will rarely be hit, as it can't be shared by multiple users. - */ - val CacheableShortTTLEmbeddingTypes: Set[EmbeddingType] = - Set( - EmbeddingType.FavBasedProducer, - EmbeddingType.LogFavLongestL2EmbeddingTweet, - ) - - val CacheableLongTTLEmbeddingTypes: Set[EmbeddingType] = - Set( - EmbeddingType.FavTfgTopic, - EmbeddingType.LogFavBasedKgoApeTopic - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx new file mode 100644 index 000000000..2174d0c6c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala deleted file mode 100644 index 2ad19e50f..000000000 --- a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.simclusters_v2.candidate_source - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.base.CandidateSource -import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.LookbackMediaTweetConfig -import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate -import com.twitter.util.Future - -/** - * An abstraction layer that implements a lambda structure for the ANN candidate source. - * Allows us to call an online store as well as an offline store from a single query.
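// Editor's note: a minimal sketch, not part of the original diff, of how the
// all-Option SimClustersANNConfigOverride above might be merged into a base
// config selected from DefaultConfigMappings. `applyOverride` is a hypothetical
// helper name; the sketch assumes SimClustersANNConfig is a case class with the
// fields shown above (maxNumResults is omitted because the base configs above
// never set it).
def applyOverride(
  base: SimClustersANNConfig,
  o: SimClustersANNConfigOverride
): SimClustersANNConfig =
  base.copy(
    maxTweetCandidateAge = o.maxTweetCandidateAge.getOrElse(base.maxTweetCandidateAge),
    minScore = o.minScore.getOrElse(base.minScore),
    candidateEmbeddingType = o.candidateEmbeddingType.getOrElse(base.candidateEmbeddingType),
    enablePartialNormalization = o.enablePartialNormalization.getOrElse(base.enablePartialNormalization),
    enableHeavyRanking = o.enableHeavyRanking.getOrElse(base.enableHeavyRanking),
    rankingAlgorithm = o.rankingAlgorithm.getOrElse(base.rankingAlgorithm),
    maxReRankingCandidates = o.maxReRankingCandidates.getOrElse(base.maxReRankingCandidates),
    maxTopTweetsPerCluster = o.maxTopTweetsPerCluster.getOrElse(base.maxTopTweetsPerCluster),
    maxScanClusters = o.maxScanClusters.getOrElse(base.maxScanClusters),
    minTweetCandidateAge = o.minTweetCandidateAge.getOrElse(base.minTweetCandidateAge)
  )
// Per-embedding-type lookup with fallback, then optional override:
//   val base = DefaultConfigMappings.getOrElse(embeddingType, DefaultSimClustersANNConfig)
//   val effective = overrideOpt.fold(base)(o => applyOverride(base, o))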
- */ -case class SimClustersANNWrapperCandidateSource( - onlineANNSource: CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate], - lookbackANNSource: CandidateSource[ - SimClustersANNCandidateSource.Query, - SimClustersTweetCandidate - ], -)( - statsReceiver: StatsReceiver) - extends CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate] { - - override def get( - query: SimClustersANNCandidateSource.Query - ): Future[Option[Seq[SimClustersTweetCandidate]]] = { - - val enableLookbackSource = - query.overrideConfig.exists(_.enableLookbackSource.getOrElse(false)) - - val embeddingType = query.sourceEmbeddingId.embeddingType - val lookbackCandidatesFut = - if (enableLookbackSource && - LookbackMediaTweetConfig.contains(embeddingType)) { - statsReceiver - .counter("lookback_source", embeddingType.toString, "enable").incr() - statsReceiver.counter("lookback_source", "enable").incr() - lookbackANNSource.get(query) - } else { - statsReceiver - .counter("lookback_source", embeddingType.toString, "disable").incr() - Future.None - } - - Future.join(onlineANNSource.get(query), lookbackCandidatesFut).map { - case (onlineCandidates, lookbackCandidates) => - Some( - onlineCandidates.getOrElse(Nil) ++ lookbackCandidates.getOrElse(Nil) - ) - } - } - - override def name: String = this.getClass.getCanonicalName -} diff --git a/src/scala/com/twitter/simclusters_v2/common/BUILD b/src/scala/com/twitter/simclusters_v2/common/BUILD deleted file mode 100644 index 9cf3b3fd7..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "servo/decider", - "src/scala/com/twitter/storehaus_internal/manhattan", - "src/thrift/com/twitter/ml/api:interpretable-model-java", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/common/BUILD.docx new file mode 100644 index 000000000..e418604c8 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx new file mode 100644 index 000000000..f0627ebcb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala deleted file mode 100644 index 2a8cc1c46..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala +++ /dev/null @@ -1,251 +0,0 @@ -package com.twitter.simclusters_v2.common - -object CosineSimilarityUtil { - - /** - * Sum of squared elements for a given vector v - */ - def sumOfSquares[T](v: Map[T, Double]): Double = { - v.values.foldLeft(0.0) { (sum, value) => sum + value * value } - } - - /** - * Sum of squared elements for a given vector v - */ - def sumOfSquaresArray(v: Array[Double]): Double = { - v.foldLeft(0.0) { (sum, value) => sum + value * value } - } - - /** - * Calculate the l2Norm score - */ - def norm[T](v: Map[T, Double]): Double = { - math.sqrt(sumOfSquares(v)) - } - - /** - * Calculate the l2Norm score - */ - def normArray(v: Array[Double]): Double = { - 
math.sqrt(sumOfSquaresArray(v)) - } - - /** - * Calculate the logNorm score - */ - def logNorm[T](v: Map[T, Double]): Double = { - math.log(sumOfSquares(v) + 1) - } - - /** - * Calculate the logNorm score - */ - def logNormArray(v: Array[Double]): Double = { - math.log(sumOfSquaresArray(v) + 1) - } - - /** - * Calculate the exp scaled norm score - * */ - def expScaledNorm[T](v: Map[T, Double], exponent: Double): Double = { - math.pow(sumOfSquares(v), exponent) - } - - /** - * Calculate the exp scaled norm score - * */ - def expScaledNormArray(v: Array[Double], exponent: Double): Double = { - math.pow(sumOfSquaresArray(v), exponent) - } - - /** - * Calculate the l1Norm score - */ - def l1Norm[T](v: Map[T, Double]): Double = { - v.values.foldLeft(0.0) { (sum, value) => sum + Math.abs(value) } - } - - /** - * Calculate the l1Norm score - */ - def l1NormArray(v: Array[Double]): Double = { - v.foldLeft(0.0) { (sum, value) => sum + Math.abs(value) } - } - - /** - * Divide the weight vector by the given norm. - * Return the original object if the norm is 0. - * - * @param v a map from cluster id to its weight - * @param norm a calculated norm from the given map v - * - * @return a map with normalized weight - */ - def applyNorm[T](v: Map[T, Double], norm: Double): Map[T, Double] = { - if (norm == 0) v else v.mapValues(x => x / norm) - } - - /** - * Divide the weight vector by the given norm. - * Return the original object if the norm is 0. - * - * @param v an array of weights - * @param norm a calculated norm from the given array v - * - * @return an array with normalized weight in the same order as v - */ - def applyNormArray(v: Array[Double], norm: Double): Array[Double] = { - if (norm == 0) v else v.map(_ / norm) - } - - /** - * Normalize the weight vector for easy cosine similarity calculation. If the input weight vector - * is empty or its norm is 0, return the original map. - * - * @param v a map from cluster id to its weight - * - * @return a map with normalized weight (the norm of the weight vector is 1) - */ - def normalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.norm(v)) - applyNorm(v, norm) - } - - /** - * Normalize the weight vector for easy cosine similarity calculation. If the input weight vector - * is empty or its norm is 0, return the original array. - * - * @param v an array of weights - * - * @return an array with normalized weight (the norm of the weight vector is 1), in the same order as v - */ - def normalizeArray( - v: Array[Double], - maybeNorm: Option[Double] = None - ): Array[Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.normArray(v)) - applyNormArray(v, norm) - } - - /** - * Normalize the weight vector with log norm. If the input weight vector - * is empty or its norm is 0, return the original map. - * - * @param v a map from cluster id to its weight - * - * @return a map with log normalized weight - * */ - def logNormalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.logNorm(v)) - applyNorm(v, norm) - } - - /** - * Normalize the weight vector with log norm. If the input weight vector - * is empty or its norm is 0, return the original array.
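// Editor's note: a small usage sketch, added by the editor, for the
// normalization helpers above together with the sparse dotProduct defined
// later in this object; the values are chosen so the arithmetic is easy to check.
object CosineSimilarityUtilExample {
  def main(args: Array[String]): Unit = {
    val v1 = Map(1 -> 3.0, 2 -> 4.0) // l2 norm = 5.0
    val v2 = Map(2 -> 1.0, 3 -> 1.0) // l2 norm = sqrt(2)
    // Cosine similarity = dot product of the l2-normalized vectors.
    // Only the shared cluster 2 contributes: (4.0 / 5.0) * (1.0 / sqrt(2)) ≈ 0.566
    val cosine = CosineSimilarityUtil.dotProduct(
      CosineSimilarityUtil.normalize(v1),
      CosineSimilarityUtil.normalize(v2))
    println(cosine)
  }
}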
- * - * @param v an array of weights - * - * @return an array with log normalized weight, in the same order as v - * */ - def logNormalizeArray( - v: Array[Double], - maybeNorm: Option[Double] = None - ): Array[Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.logNormArray(v)) - applyNormArray(v, norm) - } - - /** - * Normalize the weight vector with exponentially scaled norm. If the input weight vector - * is empty or its norm is 0, return the original map. - * - * @param v a map from cluster id to its weight - * @param exponent the exponent we apply to the weight vector's norm - * - * @return a map with exp scaled normalized weight - * */ - def expScaledNormalize[T]( - v: Map[T, Double], - exponent: Option[Double] = None, - maybeNorm: Option[Double] = None - ): Map[T, Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.expScaledNorm(v, exponent.getOrElse(0.3))) - applyNorm(v, norm) - } - - /** - * Normalize the weight vector with exponentially scaled norm. If the input weight vector - * is empty or its norm is 0, return the original map. - * - * @param v an array of weights - * @param exponent the exponent we apply to the weight vector's norm - * - * @return an array with exp scaled normalized weight, in the same order as v - * */ - def expScaledNormalizeArray( - v: Array[Double], - exponent: Double, - maybeNorm: Option[Double] = None - ): Array[Double] = { - val norm = maybeNorm.getOrElse(CosineSimilarityUtil.expScaledNormArray(v, exponent)) - applyNormArray(v, norm) - } - - /** - * Given two sparse vectors, calculate its dot product. - * - * @param v1 the first map from cluster id to its weight - * @param v2 the second map from cluster id to its weight - * - * @return the dot product of above two sparse vector - */ - def dotProduct[T](v1: Map[T, Double], v2: Map[T, Double]): Double = { - val comparer = v1.size - v2.size - val smaller = if (comparer > 0) v2 else v1 - val bigger = if (comparer > 0) v1 else v2 - - smaller.foldLeft(0.0) { - case (sum, (id, value)) => - sum + bigger.getOrElse(id, 0.0) * value - } - } - - /** - * Given two sparse vectors, calculate its dot product. - * - * @param v1C an array of cluster ids. Must be sorted in ascending order - * @param v1S an array of corresponding cluster scores, of the same length and order as v1c - * @param v2C an array of cluster ids. Must be sorted in ascending order - * @param v2S an array of corresponding cluster scores, of the same length and order as v2c - * - * @return the dot product of above two sparse vector - */ - def dotProductForSortedClusterAndScores( - v1C: Array[Int], - v1S: Array[Double], - v2C: Array[Int], - v2S: Array[Double] - ): Double = { - require(v1C.size == v1S.size) - require(v2C.size == v2S.size) - var i1 = 0 - var i2 = 0 - var product: Double = 0.0 - - while (i1 < v1C.size && i2 < v2C.size) { - if (v1C(i1) == v2C(i2)) { - product += v1S(i1) * v2S(i2) - i1 += 1 - i2 += 1 - } else if (v1C(i1) > v2C(i2)) { - // v2 cluster is lower. Increment it to see if the next one matches v1's - i2 += 1 - } else { - // v1 cluster is lower. 
Increment it to see if the next one matches v2's - i1 += 1 - } - } - product - } -} diff --git a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx new file mode 100644 index 000000000..b6a5125d6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala deleted file mode 100644 index 76e10aaa0..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.decider.Decider -import com.twitter.servo.decider.{DeciderGateBuilder, DeciderKeyName} -import com.twitter.servo.util.Gate - -class DeciderGateBuilderWithIdHashing(decider: Decider) extends DeciderGateBuilder(decider) { - - def idGateWithHashing[T](key: DeciderKeyName): Gate[T] = { - val feature = keyToFeature(key) - // Only if the decider is neither fully on / off is the object hashed - // This does require an additional call to get the decider availability but that is comparatively cheaper - val convertToHash: T => Long = (obj: T) => { - val availability = feature.availability.getOrElse(0) - if (availability == 10000 || availability == 0) availability - else obj.hashCode - } - idGate(key).contramap[T](convertToHash) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx new file mode 100644 index 000000000..b32603fba Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala deleted file mode 100644 index 796474ccd..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.thriftscala.ModelVersion - -/** - * The utility to convert SimClusters Model version into different forms. - * Required to register any new SimClusters Model version here. 
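// Editor's note: a worked example, added by the editor, for the two-pointer
// dotProductForSortedClusterAndScores above. Cluster id arrays must be sorted
// ascending; only ids present in both vectors contribute to the product.
object SortedDotProductExample {
  def main(args: Array[String]): Unit = {
    val v1C = Array(1, 5, 9); val v1S = Array(0.1, 0.2, 0.3)
    val v2C = Array(5, 9, 12); val v2S = Array(1.0, 2.0, 3.0)
    // Shared clusters 5 and 9: 0.2 * 1.0 + 0.3 * 2.0 = 0.8
    println(CosineSimilarityUtil.dotProductForSortedClusterAndScores(v1C, v1S, v2C, v2S))
  }
}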
- */ -object ModelVersions { - - val Model20M145KDec11 = "20M_145K_dec11" - val Model20M145KUpdated = "20M_145K_updated" - val Model20M145K2020 = "20M_145K_2020" - - // Use Enum for feature switch - object Enum extends Enumeration { - val Model20M145K2020, Model20M145KUpdated: Value = Value - val enumToSimClustersModelVersionMap: Map[Enum.Value, ModelVersion] = Map( - Model20M145K2020 -> ModelVersion.Model20m145k2020, - Model20M145KUpdated -> ModelVersion.Model20m145kUpdated - ) - } - - // Add the new model version into this map - private val StringToThriftModelVersions: Map[String, ModelVersion] = - Map( - Model20M145KDec11 -> ModelVersion.Model20m145kDec11, - Model20M145KUpdated -> ModelVersion.Model20m145kUpdated, - Model20M145K2020 -> ModelVersion.Model20m145k2020 - ) - - private val ThriftModelVersionToStrings = StringToThriftModelVersions.map(_.swap) - - val AllModelVersions: Set[String] = StringToThriftModelVersions.keySet - - def toModelVersionOption(modelVersionStr: String): Option[ModelVersion] = { - StringToThriftModelVersions.get(modelVersionStr) - } - - implicit def toModelVersion(modelVersionStr: String): ModelVersion = { - StringToThriftModelVersions(modelVersionStr) - } - - implicit def toKnownForModelVersion(modelVersion: ModelVersion): String = { - ThriftModelVersionToStrings(modelVersion) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx new file mode 100644 index 000000000..9f1d31a18 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala deleted file mode 100644 index c8e11c41f..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.simclusters_v2.common - -object SeqStandardDeviation { - - def apply[T](t: Seq[T])(implicit mapper: T => Double): Double = { - if (t.isEmpty) { - 0.0 - } else { - val sum = t.foldLeft(0.0) { - case (temp, score) => - temp + score - } - val mean = sum / t.size - val variance = t.foldLeft(0.0) { (sum, score) => - val v = score - mean - sum + v * v - } / t.size - math.sqrt(variance) - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx new file mode 100644 index 000000000..83e660dc6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala deleted file mode 100644 index b8f0179cb..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala +++ /dev/null @@ -1,581 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import scala.collection.mutable -import scala.language.implicitConversions -import scala.util.hashing.MurmurHash3.arrayHash -import scala.util.hashing.MurmurHash3.productHash -import scala.math._ - -/** - * A representation of a SimClusters Embedding, designed for low memory footprint and performance. 
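// Editor's note: a quick usage sketch, added by the editor, for the
// ModelVersions and SeqStandardDeviation utilities above; it assumes the
// surrounding package's thrift imports are in scope.
object ModelVersionsExample {
  def main(args: Array[String]): Unit = {
    // String -> thrift and back; unknown strings return None via toModelVersionOption.
    val mv: ModelVersion = ModelVersions.toModelVersion(ModelVersions.Model20M145K2020)
    val str: String = ModelVersions.toKnownForModelVersion(mv) // "20M_145K_2020"
    // Population standard deviation of [1, 2, 3]: sqrt(2/3) ≈ 0.816
    val sd: Double = SeqStandardDeviation(Seq(1.0, 2.0, 3.0))
    println((mv, str, sd))
  }
}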
- * For services that cache millions of embeddings, we found this to significantly reduce allocations, - * memory footprint and overall performance. - * - * Embedding data is stored in pre-sorted arrays rather than structures which use a lot of pointers - * (e.g. Map). A minimal set of lazily-constructed intermediate data is kept. - * - * Be wary of adding further `val` or `lazy val`s to this class; materializing and storing more data - * on these objects could significantly affect in-memory cache performance. - * - * Also, if you are using this code in a place where you care about memory footprint, be careful - * not to materialize any of the lazy vals unless you need them. - */ -sealed trait SimClustersEmbedding extends Equals { - import SimClustersEmbedding._ - - /** - * Any compliant implementation of the SimClustersEmbedding trait must ensure that: - * - the cluster and score arrays are ordered as described below - * - the cluster and score arrays are treated as immutable (.hashCode is memoized) - * - the size of all cluster and score arrays is the same - * - all cluster scores are > 0 - * - cluster ids are unique - */ - // In descending score order - this is useful for truncation, where we care most about the highest scoring elements - private[simclusters_v2] val clusterIds: Array[ClusterId] - private[simclusters_v2] val scores: Array[Double] - // In ascending cluster order. This is useful for operations where we try to find the same cluster in another embedding, e.g. dot product - private[simclusters_v2] val sortedClusterIds: Array[ClusterId] - private[simclusters_v2] val sortedScores: Array[Double] - - /** - * Build and return a Set of all clusters in this embedding - */ - lazy val clusterIdSet: Set[ClusterId] = sortedClusterIds.toSet - - /** - * Build and return Seq representation of this embedding - */ - lazy val embedding: Seq[(ClusterId, Double)] = - sortedClusterIds.zip(sortedScores).sortBy(-_._2).toSeq - - /** - * Build and return a Map representation of this embedding - */ - lazy val map: Map[ClusterId, Double] = sortedClusterIds.zip(sortedScores).toMap - - lazy val l1norm: Double = CosineSimilarityUtil.l1NormArray(sortedScores) - - lazy val l2norm: Double = CosineSimilarityUtil.normArray(sortedScores) - - lazy val logNorm: Double = CosineSimilarityUtil.logNormArray(sortedScores) - - lazy val expScaledNorm: Double = - CosineSimilarityUtil.expScaledNormArray(sortedScores, DefaultExponent) - - /** - * The L2 Normalized Embedding. Optimize for Cosine Similarity Calculation. - */ - lazy val normalizedSortedScores: Array[Double] = - CosineSimilarityUtil.applyNormArray(sortedScores, l2norm) - - lazy val logNormalizedSortedScores: Array[Double] = - CosineSimilarityUtil.applyNormArray(sortedScores, logNorm) - - lazy val expScaledNormalizedSortedScores: Array[Double] = - CosineSimilarityUtil.applyNormArray(sortedScores, expScaledNorm) - - /** - * The Standard Deviation of an Embedding. - */ - lazy val std: Double = { - if (scores.isEmpty) { - 0.0 - } else { - val sum = scores.sum - val mean = sum / scores.length - var variance: Double = 0.0 - for (i <- scores.indices) { - val v = scores(i) - mean - variance += (v * v) - } - math.sqrt(variance / scores.length) - } - } - - /** - * Return the score of a given clusterId. 
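// Editor's note: an illustration, added by the editor, of the dual array
// layout described above, using the varargs constructor defined later in this
// file. For the embedding {3 -> 0.9, 7 -> 0.2, 1 -> 0.5}:
//   clusterIds       = [3, 1, 7]   scores       = [0.9, 0.5, 0.2]  // descending score
//   sortedClusterIds = [1, 3, 7]   sortedScores = [0.5, 0.9, 0.2]  // ascending cluster id
object EmbeddingLayoutExample {
  def main(args: Array[String]): Unit = {
    val e = SimClustersEmbedding(3 -> 0.9, 7 -> 0.2, 1 -> 0.5)
    assert(e.topClusterIds(2) == Seq(3, 1)) // taken from the descending-score arrays
    assert(e.getOrElse(7) == 0.2 && e.getOrElse(42) == 0.0) // linear scan over the ascending ids
  }
}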
- */ - def get(clusterId: ClusterId): Option[Double] = { - var i = 0 - while (i < sortedClusterIds.length) { - val thisId = sortedClusterIds(i) - if (clusterId == thisId) return Some(sortedScores(i)) - if (thisId > clusterId) return None - i += 1 - } - None - } - - /** - * Return the score of a given clusterId. If not present, return the default. - */ - def getOrElse(clusterId: ClusterId, default: Double = 0.0): Double = { - require(default >= 0.0) - var i = 0 - while (i < sortedClusterIds.length) { - val thisId = sortedClusterIds(i) - if (clusterId == thisId) return sortedScores(i) - if (thisId > clusterId) return default - i += 1 - } - default - } - - /** - * Return the cluster ids - */ - def getClusterIds(): Array[ClusterId] = clusterIds - - /** - * Return the cluster ids with the highest scores - */ - def topClusterIds(size: Int): Seq[ClusterId] = clusterIds.take(size) - - /** - * Return true if this embedding contains a given clusterId - */ - def contains(clusterId: ClusterId): Boolean = clusterIdSet.contains(clusterId) - - def sum(another: SimClustersEmbedding): SimClustersEmbedding = { - if (another.isEmpty) this - else if (this.isEmpty) another - else { - var i1 = 0 - var i2 = 0 - val l = scala.collection.mutable.ArrayBuffer.empty[(Int, Double)] - while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) { - if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) { - l += Tuple2(sortedClusterIds(i1), sortedScores(i1) + another.sortedScores(i2)) - i1 += 1 - i2 += 1 - } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) { - l += Tuple2(another.sortedClusterIds(i2), another.sortedScores(i2)) - // another's cluster is lower. Increment it to see if the next one matches this embedding's - i2 += 1 - } else { - l += Tuple2(sortedClusterIds(i1), sortedScores(i1)) - // this cluster is lower. Increment it to see if the next one matches another's - i1 += 1 - } - } - if (i1 == sortedClusterIds.length && i2 != another.sortedClusterIds.length) - // this was shorter. Append remaining elements from another - l ++= another.sortedClusterIds.drop(i2).zip(another.sortedScores.drop(i2)) - else if (i1 != sortedClusterIds.length && i2 == another.sortedClusterIds.length) - // another was shorter.
Append remaining elements from this - l ++= sortedClusterIds.drop(i1).zip(sortedScores.drop(i1)) - SimClustersEmbedding(l) - } - } - - def scalarMultiply(multiplier: Double): SimClustersEmbedding = { - require(multiplier > 0.0, "SimClustersEmbedding.scalarMultiply requires multiplier > 0.0") - DefaultSimClustersEmbedding( - clusterIds, - scores.map(_ * multiplier), - sortedClusterIds, - sortedScores.map(_ * multiplier) - ) - } - - def scalarDivide(divisor: Double): SimClustersEmbedding = { - require(divisor > 0.0, "SimClustersEmbedding.scalarDivide requires divisor > 0.0") - DefaultSimClustersEmbedding( - clusterIds, - scores.map(_ / divisor), - sortedClusterIds, - sortedScores.map(_ / divisor) - ) - } - - def dotProduct(another: SimClustersEmbedding): Double = { - CosineSimilarityUtil.dotProductForSortedClusterAndScores( - sortedClusterIds, - sortedScores, - another.sortedClusterIds, - another.sortedScores) - } - - def cosineSimilarity(another: SimClustersEmbedding): Double = { - CosineSimilarityUtil.dotProductForSortedClusterAndScores( - sortedClusterIds, - normalizedSortedScores, - another.sortedClusterIds, - another.normalizedSortedScores) - } - - def logNormCosineSimilarity(another: SimClustersEmbedding): Double = { - CosineSimilarityUtil.dotProductForSortedClusterAndScores( - sortedClusterIds, - logNormalizedSortedScores, - another.sortedClusterIds, - another.logNormalizedSortedScores) - } - - def expScaledCosineSimilarity(another: SimClustersEmbedding): Double = { - CosineSimilarityUtil.dotProductForSortedClusterAndScores( - sortedClusterIds, - expScaledNormalizedSortedScores, - another.sortedClusterIds, - another.expScaledNormalizedSortedScores) - } - - /** - * Return true if this is an empty embedding - */ - def isEmpty: Boolean = sortedClusterIds.isEmpty - - /** - * Return the Jaccard Similarity Score between two embeddings. - * Note: this implementation should be optimized if we start to use it in production - */ - def jaccardSimilarity(another: SimClustersEmbedding): Double = { - if (this.isEmpty || another.isEmpty) { - 0.0 - } else { - val intersect = clusterIdSet.intersect(another.clusterIdSet).size - val union = clusterIdSet.union(another.clusterIdSet).size - intersect.toDouble / union - } - } - - /** - * Return the Fuzzy Jaccard Similarity Score between two embeddings. - * Treat each SimClusters embedding as a fuzzy set and calculate the fuzzy-set similarity - * metrics of the two embeddings - * - * Paper 2.2.1: https://openreview.net/pdf?id=SkxXg2C5FX - */ - def fuzzyJaccardSimilarity(another: SimClustersEmbedding): Double = { - if (this.isEmpty || another.isEmpty) { - 0.0 - } else { - val v1C = sortedClusterIds - val v1S = sortedScores - val v2C = another.sortedClusterIds - val v2S = another.sortedScores - - require(v1C.length == v1S.length) - require(v2C.length == v2S.length) - - var i1 = 0 - var i2 = 0 - var numerator = 0.0 - var denominator = 0.0 - - while (i1 < v1C.length && i2 < v2C.length) { - if (v1C(i1) == v2C(i2)) { - numerator += min(v1S(i1), v2S(i2)) - denominator += max(v1S(i1), v2S(i2)) - i1 += 1 - i2 += 1 - } else if (v1C(i1) > v2C(i2)) { - denominator += v2S(i2) - i2 += 1 - } else { - denominator += v1S(i1) - i1 += 1 - } - } - - while (i1 < v1C.length) { - denominator += v1S(i1) - i1 += 1 - } - while (i2 < v2C.length) { - denominator += v2S(i2) - i2 += 1 - } - - numerator / denominator - } - } - - /** - * Return the Euclidean Distance Score between two embeddings.
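// Editor's note: a worked example, added by the editor, for
// fuzzyJaccardSimilarity above, using the varargs constructor from this file.
object FuzzyJaccardExample {
  def main(args: Array[String]): Unit = {
    val a = SimClustersEmbedding(1 -> 0.4, 2 -> 0.6)
    val b = SimClustersEmbedding(2 -> 0.3, 3 -> 0.7)
    // numerator   = min over shared cluster 2  = 0.3
    // denominator = max(0.6, 0.3) + 0.4 + 0.7  = 1.7
    println(a.fuzzyJaccardSimilarity(b)) // 0.3 / 1.7 ≈ 0.176
    // Plain Jaccard ignores weights: |{2}| / |{1, 2, 3}| ≈ 0.333
    println(a.jaccardSimilarity(b))
  }
}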
- * Note: this implementation should be optimized if we start to use it in production - */ - def euclideanDistance(another: SimClustersEmbedding): Double = { - val unionClusters = clusterIdSet.union(another.clusterIdSet) - val variance = unionClusters.foldLeft(0.0) { - case (sum, clusterId) => - val distance = math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId)) - sum + distance * distance - } - math.sqrt(variance) - } - - /** - * Return the Manhattan Distance Score between two embeddings. - * Note: this implementation should be optimized if we start to use it in production - */ - def manhattanDistance(another: SimClustersEmbedding): Double = { - val unionClusters = clusterIdSet.union(another.clusterIdSet) - unionClusters.foldLeft(0.0) { - case (sum, clusterId) => - sum + math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId)) - } - } - - /** - * Return the number of overlapping clusters between two embeddings. - */ - def overlappingClusters(another: SimClustersEmbedding): Int = { - var i1 = 0 - var i2 = 0 - var count = 0 - - while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) { - if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) { - count += 1 - i1 += 1 - i2 += 1 - } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) { - // v2 cluster is lower. Increment it to see if the next one matches v1's - i2 += 1 - } else { - // v1 cluster is lower. Increment it to see if the next one matches v2's - i1 += 1 - } - } - count - } - - /** - * Return the largest product cluster scores - */ - def maxElementwiseProduct(another: SimClustersEmbedding): Double = { - var i1 = 0 - var i2 = 0 - var maxProduct: Double = 0.0 - - while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) { - if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) { - val product = sortedScores(i1) * another.sortedScores(i2) - if (product > maxProduct) maxProduct = product - i1 += 1 - i2 += 1 - } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) { - // v2 cluster is lower. Increment it to see if the next one matches v1's - i2 += 1 - } else { - // v1 cluster is lower. Increment it to see if the next one matches v2's - i1 += 1 - } - } - maxProduct - } - - /** - * Return a new SimClustersEmbedding with Max Embedding Size. - * - * Prefer to truncate on embedding construction where possible. Doing so is cheaper. - */ - def truncate(size: Int): SimClustersEmbedding = { - if (clusterIds.length <= size) { - this - } else { - val truncatedClusterIds = clusterIds.take(size) - val truncatedScores = scores.take(size) - val (sortedClusterIds, sortedScores) = - truncatedClusterIds.zip(truncatedScores).sortBy(_._1).unzip - - DefaultSimClustersEmbedding( - truncatedClusterIds, - truncatedScores, - sortedClusterIds, - sortedScores) - } - } - - def toNormalized: SimClustersEmbedding = { - // Additional safety check. Only EmptyEmbedding's l2norm is 0.0. - if (l2norm == 0.0) { - EmptyEmbedding - } else { - this.scalarDivide(l2norm) - } - } - - implicit def toThrift: ThriftSimClustersEmbedding = { - ThriftSimClustersEmbedding( - embedding.map { - case (clusterId, score) => - SimClusterWithScore(clusterId, score) - } - ) - } - - def canEqual(a: Any): Boolean = a.isInstanceOf[SimClustersEmbedding] - - /* We define equality as having the same clusters and scores. 
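// Editor's note: a usage sketch, added by the editor, for truncate and
// toNormalized above.
object TruncateExample {
  def main(args: Array[String]): Unit = {
    val e = SimClustersEmbedding(1 -> 3.0, 2 -> 4.0, 3 -> 1.0)
    val top2 = e.truncate(2)     // keeps the two highest-scoring clusters: {2 -> 4.0, 1 -> 3.0}
    val unit = top2.toNormalized // scalarDivide by the l2 norm 5.0: {2 -> 0.8, 1 -> 0.6}
    assert(math.abs(unit.cosineSimilarity(unit) - 1.0) < 1e-9) // a unit vector vs itself
  }
}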
- * This implementation is arguably incorrect in this case: - * (1 -> 1.0, 2 -> 0.0) == (1 -> 1.0) // equals returns false - * However, compliant implementations of SimClustersEmbedding should not include zero-weight - * clusters, so this implementation should work correctly. - */ - override def equals(that: Any): Boolean = - that match { - case that: SimClustersEmbedding => - that.canEqual(this) && - this.sortedClusterIds.sameElements(that.sortedClusterIds) && - this.sortedScores.sameElements(that.sortedScores) - case _ => false - } - - /** - * hashcode implementation based on the contents of the embedding. As a lazy val, this relies on - * the embedding contents being immutable. - */ - override lazy val hashCode: Int = { - /* Arrays uses object id as hashCode, so different arrays with the same contents hash - * differently. To provide a stable hash code, we take the same approach as how a - * `case class(clusters: Seq[Int], scores: Seq[Double])` would be hashed. See - * ScalaRunTime._hashCode and MurmurHash3.productHash - * https://github.com/scala/scala/blob/2.12.x/src/library/scala/runtime/ScalaRunTime.scala#L167 - * https://github.com/scala/scala/blob/2.12.x/src/library/scala/util/hashing/MurmurHash3.scala#L64 - * - * Note that the hashcode is arguably incorrect in this case: - * (1 -> 1.0, 2 -> 0.0).hashcode == (1 -> 1.0).hashcode // returns false - * However, compliant implementations of SimClustersEmbedding should not include zero-weight - * clusters, so this implementation should work correctly. - */ - productHash((arrayHash(sortedClusterIds), arrayHash(sortedScores))) - } -} - -object SimClustersEmbedding { - val EmptyEmbedding: SimClustersEmbedding = - DefaultSimClustersEmbedding(Array.empty, Array.empty, Array.empty, Array.empty) - - val DefaultExponent: Double = 0.3 - - // Descending by score then ascending by ClusterId - implicit val order: Ordering[(ClusterId, Double)] = - (a: (ClusterId, Double), b: (ClusterId, Double)) => { - b._2 compare a._2 match { - case 0 => a._1 compare b._1 - case c => c - } - } - - /** - * Constructors - * - * These constructors: - * - do not make assumptions about the ordering of the cluster/scores. 
- * - do assume that cluster ids are unique - * - ignore (drop) any cluster whose score is <= 0 - */ - def apply(embedding: (ClusterId, Double)*): SimClustersEmbedding = - buildDefaultSimClustersEmbedding(embedding) - - def apply(embedding: Iterable[(ClusterId, Double)]): SimClustersEmbedding = - buildDefaultSimClustersEmbedding(embedding) - - def apply(embedding: Iterable[(ClusterId, Double)], size: Int): SimClustersEmbedding = - buildDefaultSimClustersEmbedding(embedding, truncate = Some(size)) - - implicit def apply(thriftEmbedding: ThriftSimClustersEmbedding): SimClustersEmbedding = - buildDefaultSimClustersEmbedding(thriftEmbedding.embedding.map(_.toTuple)) - - def apply(thriftEmbedding: ThriftSimClustersEmbedding, truncate: Int): SimClustersEmbedding = - buildDefaultSimClustersEmbedding( - thriftEmbedding.embedding.map(_.toTuple), - truncate = Some(truncate)) - - private def buildDefaultSimClustersEmbedding( - embedding: Iterable[(ClusterId, Double)], - truncate: Option[Int] = None - ): SimClustersEmbedding = { - val truncatedIdAndScores = { - val idsAndScores = embedding.filter(_._2 > 0.0).toArray.sorted(order) - truncate match { - case Some(t) => idsAndScores.take(t) - case _ => idsAndScores - } - } - - if (truncatedIdAndScores.isEmpty) { - EmptyEmbedding - } else { - val (clusterIds, scores) = truncatedIdAndScores.unzip - val (sortedClusterIds, sortedScores) = truncatedIdAndScores.sortBy(_._1).unzip - DefaultSimClustersEmbedding(clusterIds, scores, sortedClusterIds, sortedScores) - } - } - - /** ***** Aggregation Methods ******/ - /** - * A high performance version of Sum a list of SimClustersEmbeddings. - * Suggest using in Online Services to avoid the unnecessary GC. - * For offline or streaming. Please check [[SimClustersEmbeddingMonoid]] - */ - def sum(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = { - if (simClustersEmbeddings.isEmpty) { - EmptyEmbedding - } else { - val sum = simClustersEmbeddings.foldLeft(mutable.Map[ClusterId, Double]()) { - (sum, embedding) => - for (i <- embedding.sortedClusterIds.indices) { - val clusterId = embedding.sortedClusterIds(i) - sum.put(clusterId, embedding.sortedScores(i) + sum.getOrElse(clusterId, 0.0)) - } - sum - } - SimClustersEmbedding(sum) - } - } - - /** - * Support a fixed size SimClustersEmbedding Sum - */ - def sum( - simClustersEmbeddings: Iterable[SimClustersEmbedding], - maxSize: Int - ): SimClustersEmbedding = { - sum(simClustersEmbeddings).truncate(maxSize) - } - - /** - * A high performance version of Mean a list of SimClustersEmbeddings. - * Suggest using in Online Services to avoid the unnecessary GC. 
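// Editor's note: a behavior sketch, added by the editor, for the constructors
// and the high-performance sum above.
object ConstructorAndSumExample {
  def main(args: Array[String]): Unit = {
    // Non-positive scores are dropped; ordering is descending by score,
    // ties broken by ascending cluster id.
    val e = SimClustersEmbedding(Seq(4 -> 0.5, 2 -> 0.0, 3 -> -0.1, 1 -> 0.5))
    // e's clusterIds are [1, 4]; clusters 2 and 3 were filtered out.
    val a = SimClustersEmbedding(1 -> 1.0, 2 -> 2.0)
    val b = SimClustersEmbedding(2 -> 3.0)
    val total = SimClustersEmbedding.sum(Seq(a, b)) // {1 -> 1.0, 2 -> 5.0}
    println((e, total))
  }
}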
- */ - def mean(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = { - if (simClustersEmbeddings.isEmpty) { - EmptyEmbedding - } else { - sum(simClustersEmbeddings).scalarDivide(simClustersEmbeddings.size) - } - } - - /** - * Support a fixed size SimClustersEmbedding Mean - */ - def mean( - simClustersEmbeddings: Iterable[SimClustersEmbedding], - maxSize: Int - ): SimClustersEmbedding = { - mean(simClustersEmbeddings).truncate(maxSize) - } -} - -case class DefaultSimClustersEmbedding( - override val clusterIds: Array[ClusterId], - override val scores: Array[Double], - override val sortedClusterIds: Array[ClusterId], - override val sortedScores: Array[Double]) - extends SimClustersEmbedding { - - override def toString: String = - s"DefaultSimClustersEmbedding(${clusterIds.zip(scores).mkString(",")})" -} - -object DefaultSimClustersEmbedding { - // To support existing code which builds embeddings from a Seq - def apply(embedding: Seq[(ClusterId, Double)]): SimClustersEmbedding = SimClustersEmbedding( - embedding) -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx new file mode 100644 index 000000000..2cafa5991 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala deleted file mode 100644 index 0a2fc592f..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala +++ /dev/null @@ -1,209 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.LocaleEntityId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.TopicId -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersEmbeddingId => ThriftSimClustersEmbeddingId } -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.InternalId.EntityId -import com.twitter.simclusters_v2.thriftscala.InternalId.TweetId -import com.twitter.simclusters_v2.thriftscala.InternalId.UserId -import com.twitter.simclusters_v2.thriftscala.{EmbeddingType => SimClustersEmbeddingType} - -object SimClustersEmbeddingId { - - val DefaultModelVersion: ModelVersion = ModelVersion.Model20m145k2020 - - // Embeddings which are available in Content-Recommender - val TweetEmbeddingTypes: Set[EmbeddingType] = - Set( - FavBasedTweet, - FollowBasedTweet, - LogFavBasedTweet, - LogFavLongestL2EmbeddingTweet - ) - val DefaultTweetEmbeddingType: EmbeddingType = LogFavLongestL2EmbeddingTweet - - val UserInterestedInEmbeddingTypes: Set[EmbeddingType] = - Set( - FavBasedUserInterestedIn, - FollowBasedUserInterestedIn, - LogFavBasedUserInterestedIn, - RecentFollowBasedUserInterestedIn, - FilteredUserInterestedIn, - FavBasedUserInterestedInFromPE, - FollowBasedUserInterestedInFromPE, - LogFavBasedUserInterestedInFromPE, - FilteredUserInterestedInFromPE, - LogFavBasedUserInterestedInFromAPE, - FollowBasedUserInterestedInFromAPE, - UnfilteredUserInterestedIn - ) - val DefaultUserInterestInEmbeddingType: EmbeddingType = FavBasedUserInterestedIn - - val ProducerEmbeddingTypes: Set[EmbeddingType] = - Set( - FavBasedProducer, - FollowBasedProducer,
- AggregatableFavBasedProducer, - AggregatableLogFavBasedProducer, - RelaxedAggregatableLogFavBasedProducer, - KnownFor - ) - val DefaultProducerEmbeddingType: EmbeddingType = FavBasedProducer - - val LocaleEntityEmbeddingTypes: Set[EmbeddingType] = - Set( - FavTfgTopic, - LogFavTfgTopic - ) - val DefaultLocaleEntityEmbeddingType: EmbeddingType = FavTfgTopic - - val TopicEmbeddingTypes: Set[EmbeddingType] = - Set( - LogFavBasedKgoApeTopic - ) - val DefaultTopicEmbeddingType: EmbeddingType = LogFavBasedKgoApeTopic - - val AllEmbeddingTypes: Set[EmbeddingType] = - TweetEmbeddingTypes ++ - UserInterestedInEmbeddingTypes ++ - ProducerEmbeddingTypes ++ - LocaleEntityEmbeddingTypes ++ - TopicEmbeddingTypes - - def buildTweetId( - tweetId: TweetId, - embeddingType: EmbeddingType = DefaultTweetEmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - assert(TweetEmbeddingTypes.contains(embeddingType)) - ThriftSimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.TweetId(tweetId) - ) - } - - def buildUserInterestedInId( - userId: UserId, - embeddingType: EmbeddingType = DefaultUserInterestInEmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - assert(UserInterestedInEmbeddingTypes.contains(embeddingType)) - ThriftSimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.UserId(userId) - ) - } - - def buildProducerId( - userId: UserId, - embeddingType: EmbeddingType = DefaultProducerEmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - assert(ProducerEmbeddingTypes.contains(embeddingType)) - ThriftSimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.UserId(userId) - ) - } - - def buildLocaleEntityId( - entityId: SemanticCoreEntityId, - language: String, - embeddingType: EmbeddingType = DefaultLocaleEntityEmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - ThriftSimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.LocaleEntityId( - LocaleEntityId(entityId, language) - ) - ) - } - - def buildTopicId( - topicId: TopicId, - language: Option[String] = None, - country: Option[String] = None, - embeddingType: EmbeddingType = DefaultTopicEmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - ThriftSimClustersEmbeddingId( - embeddingType, - modelVersion, - InternalId.TopicId( - TopicId(topicId, language, country) - ) - ) - } - - // Extractor object for InternalIds that wrap Long - object LongInternalId { - def unapply(iid: InternalId): Option[Long] = iid match { - case InternalId.TweetId(id) => Some(id) - case InternalId.UserId(id) => Some(id) - case InternalId.EntityId(id) => Some(id) - case _ => None - } - } - - // Extractor object for SimClusterEmbeddingIds with InternalIds that wrap Long - object LongSimClustersEmbeddingId { - def unapply(id: ThriftSimClustersEmbeddingId): Option[Long] = - LongInternalId.unapply(id.internalId) - } - - // Only for debuggers. 
- def buildEmbeddingId( - entityId: String, - embeddingType: EmbeddingType, - modelVersion: ModelVersion = DefaultModelVersion - ): ThriftSimClustersEmbeddingId = { - if (TweetEmbeddingTypes.contains(embeddingType)) { - buildTweetId(entityId.toLong, embeddingType, modelVersion) - } else if (UserInterestedInEmbeddingTypes.contains(embeddingType)) { - buildUserInterestedInId(entityId.toLong, embeddingType, modelVersion) - } else if (ProducerEmbeddingTypes.contains(embeddingType)) { - buildProducerId(entityId.toLong, embeddingType, modelVersion) - } else if (LocaleEntityEmbeddingTypes.contains(embeddingType)) { - buildLocaleEntityId(entityId.toLong, "en", embeddingType, modelVersion) - } else if (TopicEmbeddingTypes.contains(embeddingType)) { - buildTopicId( - entityId.toLong, - Some("en"), - embeddingType = embeddingType, - modelVersion = modelVersion) - } else { - throw new IllegalArgumentException(s"Invalid embedding type: $embeddingType") - } - } - - implicit val internalIdOrdering: Ordering[InternalId] = - Ordering.by(internalId => internalId.hashCode()) - - implicit val simClustersEmbeddingIdOrdering: Ordering[ThriftSimClustersEmbeddingId] = - Ordering.by(embeddingId => - (embeddingId.embeddingType.value, embeddingId.modelVersion.value, embeddingId.internalId)) - - // Use Enum for feature switch - object TopicEnum extends Enumeration { - protected case class EmbeddingType(embeddingType: SimClustersEmbeddingType) extends super.Val - import scala.language.implicitConversions - implicit def valueToEmbeddingType(value: Value): EmbeddingType = - value.asInstanceOf[EmbeddingType] - - val FavTfgTopic: Value = EmbeddingType(SimClustersEmbeddingType.FavTfgTopic) - val LogFavBasedKgoApeTopic: Value = EmbeddingType( - SimClustersEmbeddingType.LogFavBasedKgoApeTopic) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx new file mode 100644 index 000000000..208b7310c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala deleted file mode 100644 index 21a54e96c..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId - -/** - * A common library to construct Cache Key for SimClustersEmbeddingId. 
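// Editor's note: a usage sketch, added by the editor, for the id builders and
// the Long extractor above; 123L is a placeholder tweet id, not from the source.
object EmbeddingIdExample {
  def main(args: Array[String]): Unit = {
    // Defaults: LogFavLongestL2EmbeddingTweet embedding, Model20m145k2020 version.
    val id = SimClustersEmbeddingId.buildTweetId(tweetId = 123L)
    id match {
      case SimClustersEmbeddingId.LongSimClustersEmbeddingId(raw) =>
        println(s"internal id wraps Long $raw")
      case _ =>
        println("internal id is not Long-based (e.g. topic or locale entity)")
    }
  }
}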
- */ -case class SimClustersEmbeddingIdCacheKeyBuilder( - hash: Array[Byte] => Long, - prefix: String = "") { - - // Example: "CR:SCE:1:2:1234567890ABCDEF" - def apply(embeddingId: SimClustersEmbeddingId): String = { - f"$prefix:SCE:${embeddingId.embeddingType.getValue()}%X:" + - f"${embeddingId.modelVersion.getValue()}%X" + - f":${hash(embeddingId.internalId.toString.getBytes)}%X" - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx new file mode 100644 index 000000000..7ec5e2478 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala deleted file mode 100644 index 1b17c9705..000000000 --- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.simclusters_v2.common - -import com.twitter.algebird.Monoid - -case class SimClustersEmbeddingMonoid() extends Monoid[SimClustersEmbedding] { - - override val zero: SimClustersEmbedding = SimClustersEmbedding.EmptyEmbedding - - override def plus(x: SimClustersEmbedding, y: SimClustersEmbedding): SimClustersEmbedding = { - x.sum(y) - } -} - -object SimClustersEmbeddingMonoid { - - val monoid: Monoid[SimClustersEmbedding] = SimClustersEmbeddingMonoid() - -}
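// Editor's note: a usage sketch, added by the editor, for the cache key
// builder and the monoid above. The hash below is a toy stand-in for
// illustration only, not the production hash function.
import com.twitter.algebird.Monoid

object CacheKeyAndMonoidExample {
  def main(args: Array[String]): Unit = {
    val keyBuilder = SimClustersEmbeddingIdCacheKeyBuilder(
      hash = bytes => bytes.foldLeft(0L)((h, b) => h * 31 + b), // illustrative hash only
      prefix = "CR")
    // keyBuilder(embeddingId) renders keys shaped like "CR:SCE:1:2:1234567890ABCDEF".

    // Algebird integration: Monoid.sum folds embeddings pairwise via x.sum(y),
    // starting from EmptyEmbedding.
    implicit val monoid: Monoid[SimClustersEmbedding] = SimClustersEmbeddingMonoid.monoid
    val total = Monoid.sum(
      Seq(SimClustersEmbedding(1 -> 1.0), SimClustersEmbedding(1 -> 2.0, 2 -> 1.0))
    ) // {1 -> 3.0, 2 -> 1.0}
    println(total)
  }
}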