diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD deleted file mode 100644 index a834ba69e..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -python3_library( - name = "libs_py3", - sources = ["*.py"], - dependencies = [ - "src/python/twitter/deepbird/io", - "twml:twml-nodeps", - ], -) - -python37_binary( - name = "score", - source = "score.py", - dependencies = [ - ":libs_py3", - "3rdparty/python/_closures/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly:score", - "twml", - ], -) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx new file mode 100644 index 000000000..e977474f9 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/BUILD.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx new file mode 100644 index 000000000..0fcd18354 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx new file mode 100644 index 000000000..b614c22b8 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py deleted file mode 100644 index 723dd626c..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/data_helpers.py +++ /dev/null @@ -1,23 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from ..constants import EB_SCORE_IDX - -# The rationale behind this logic is available at TQ-9678. -def get_lolly_logits(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly logits. - ''' - eb_lolly_scores = get_lolly_scores(labels) - inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores) - lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores)) - return lolly_activations - -def get_lolly_scores(labels): - ''' - :param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config. - :return: tf.Tensor of shape (batch size) with the extracted lolly scores. 
- ''' - logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1)) - eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0) - return eb_lolly_scores diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx new file mode 100644 index 000000000..3490d33aa Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py deleted file mode 100644 index cb39c67a7..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/parsers.py +++ /dev/null @@ -1,145 +0,0 @@ -import re - -from twitter.deepbird.io.util import _get_feature_id - - -class Parser(object): - def parse(self, line): - match = re.search(self.pattern(), line) - if match: - return self._parse_match(match) - return None - - def pattern(self): - raise NotImplementedError - - def _parse_match(self, match): - raise NotImplementedError - - -class BiasParser(Parser): - ''' - Parses the bias feature available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement bias -0.935945 - :return: a RegEx that extracts feature weight. - ''' - return r"\t(bias)\t([^\s]+)" - - def _parse_match(self, match): - return float(match.group(2)) - - -class BinaryFeatureParser(Parser): - ''' - Parses binary features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130 - :return: a RegEx that extracts feature name and weight. - ''' - return r"\t([\w\.]+)\t([^\s]+)" - - def _parse_match(self, match): - return (match.group(1), float(match.group(2))) - - -class DiscretizedFeatureParser(Parser): - ''' - Parses discretized features available in lolly model tsv files. - ''' - - def pattern(self): - ''' - Matches lines like: - unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004 - :return: a RegEx that extracts feature name, bin boundaries and weight. 
- ''' - return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)" - - def _parse_match(self, match): - left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")] - return ( - match.group(1), - left_bin_side, - right_bin_side, - float(match.group(3)) - ) - - -class LollyModelFeaturesParser(Parser): - def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()): - self._bias_parser = bias_parser - self._binary_feature_parser = binary_feature_parser - self._discretized_feature_parser = discretized_feature_parser - - def parse(self, lolly_model_reader): - parsed_features = { - "bias": None, - "binary": {}, - "discretized": {} - } - def process_line_fn(line): - bias_parser_result = self._bias_parser.parse(line) - if bias_parser_result: - parsed_features["bias"] = bias_parser_result - return - - binary_feature_parser_result = self._binary_feature_parser.parse(line) - if binary_feature_parser_result: - name, value = binary_feature_parser_result - parsed_features["binary"][name] = value - return - - discretized_feature_parser_result = self._discretized_feature_parser.parse(line) - if discretized_feature_parser_result: - name, left_bin, right_bin, weight = discretized_feature_parser_result - discretized_features = parsed_features["discretized"] - if name not in discretized_features: - discretized_features[name] = [] - discretized_features[name].append((left_bin, right_bin, weight)) - - lolly_model_reader.read(process_line_fn) - - return parsed_features - - -class DBv2DataExampleParser(Parser): - ''' - Parses data records printed by the DBv2 train.py build_graph function. - Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]] - ''' - - def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()): - self.features = lolly_model_features_parser.parse(lolly_model_reader) - self.feature_name_by_dbv2_id = {} - - for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()): - self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name - - def pattern(self): - ''' - :return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values. - ''' - return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. 
]+)\]\]" - - def _parse_match(self, match): - feature_ids = match.group(3).split(" ") - feature_values = match.group(4).split(" ") - - value_by_feature_name = {} - for index in range(len(feature_ids)): - feature_id = feature_ids[index] - if feature_id not in self.feature_name_by_dbv2_id: - print("Missing feature with id: " + str(feature_id)) - continue - value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index]) - - return value_by_feature_name diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx new file mode 100644 index 000000000..1fb5fc64c Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py deleted file mode 100644 index ab33ee4e7..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/reader.py +++ /dev/null @@ -1,8 +0,0 @@ -class LollyModelReader(object): - def __init__(self, lolly_model_file_path): - self._lolly_model_file_path = lolly_model_file_path - - def read(self, process_line_fn): - with open(self._lolly_model_file_path, "r") as file: - for line in file: - process_line_fn(line) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx new file mode 100644 index 000000000..4ff26fdc5 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py deleted file mode 100644 index 5692616c2..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/score.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -from .parsers import DBv2DataExampleParser -from .reader import LollyModelReader -from .scorer import LollyModelScorer - - -if __name__ == "__main__": - lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1]) - lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader)) - - score = lolly_model_scorer.score(data_example=sys.argv[2]) - print(score) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx new file mode 100644 index 000000000..330524c62 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py deleted file mode 100644 index 621c43388..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/scorer.py +++ /dev/null @@ -1,37 +0,0 @@ -class LollyModelScorer(object): - def __init__(self, data_example_parser): - self._data_example_parser = data_example_parser - - def score(self, data_example): - 
value_by_feature_name = self._data_example_parser.parse(data_example) - features = self._data_example_parser.features - return self._score(value_by_feature_name, features) - - def _score(self, value_by_feature_name, features): - score = features["bias"] - score += self._score_binary_features(features["binary"], value_by_feature_name) - score += self._score_discretized_features(features["discretized"], value_by_feature_name) - return score - - def _score_binary_features(self, binary_features, value_by_feature_name): - score = 0.0 - for binary_feature_name, binary_feature_weight in binary_features.items(): - if binary_feature_name in value_by_feature_name: - score += binary_feature_weight - return score - - def _score_discretized_features(self, discretized_features, value_by_feature_name): - score = 0.0 - for discretized_feature_name, buckets in discretized_features.items(): - if discretized_feature_name in value_by_feature_name: - feature_value = value_by_feature_name[discretized_feature_name] - score += self._find_matching_bucket_weight(buckets, feature_value) - return score - - def _find_matching_bucket_weight(self, buckets, feature_value): - for left_side, right_side, weight in buckets: - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - if feature_value >= left_side and feature_value < right_side: - return weight - - raise LookupError("Couldn't find a matching bucket for the given feature value.") diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx new file mode 100644 index 000000000..032e3e96d Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py deleted file mode 100644 index 2d0342551..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly/tf_model_initializer_builder.py +++ /dev/null @@ -1,91 +0,0 @@ -from .parsers import LollyModelFeaturesParser - - -class TFModelInitializerBuilder: - - def __init__(self, model_features_parser=LollyModelFeaturesParser()): - self._model_features_parser = model_features_parser - - def build(self, lolly_model_reader): - ''' - :param lolly_model_reader: LollyModelReader instance - :return: tf_model_initializer dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... - "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... 
- "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - ''' - tf_model_initializer = { - "features": {} - } - - features = self._model_features_parser.parse(lolly_model_reader) - tf_model_initializer["features"]["bias"] = features["bias"] - self._set_discretized_features(features["discretized"], tf_model_initializer) - - self._dedup_binary_features(features["binary"], features["discretized"]) - tf_model_initializer["features"]["binary"] = features["binary"] - - return tf_model_initializer - - def _set_discretized_features(self, discretized_features, tf_model_initializer): - if len(discretized_features) == 0: - return - - num_bins = max([len(bins) for bins in discretized_features.values()]) - - bin_boundaries_and_weights = {} - for feature_name in discretized_features: - bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights( - discretized_features[feature_name], num_bins) - - tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights - - def _dedup_binary_features(self, binary_features, discretized_features): - [binary_features.pop(feature_name) for feature_name in discretized_features] - - def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins): - bin_boundary_weight_pairs = [] - - for bucket in discretized_feature_buckets: - bin_boundary_weight_pairs.append([bucket[0], bucket[2]]) - - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # Thus, convert (a, b] to [a, b) by inverting the bin boundaries. - for bin_boundary_weight_pair in bin_boundary_weight_pairs: - if bin_boundary_weight_pair[0] < float("inf"): - bin_boundary_weight_pair[0] *= -1 - - while len(bin_boundary_weight_pairs) < num_bins: - bin_boundary_weight_pairs.append([float("inf"), float(0)]) - - bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0]) - - bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs)) - - return { - "bin_boundaries": bin_boundaries, - "weights": weights - } diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx new file mode 100644 index 000000000..c016c5b4e Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py deleted file mode 100644 index 6919914f8..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/metrics.py +++ /dev/null @@ -1,120 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from collections import OrderedDict -from .constants import EB_SCORE_IDX -from .lolly.data_helpers import get_lolly_scores - -import twml - -def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1): - """ - This function was copied from twml/metrics.py with the following adjustments: - - Override example weights with the ones set in graph_output. - - Tile labels in order to support per engagement metrics for both TF and Lolly scores. - - Add lolly_tf_score_MSE metric. 
- Note: All custom lines have a comment that starts with 'Added' - """ - # pylint: disable=invalid-name,dict-keys-not-iterating - if metrics is None: - # remove expensive metrics by default for faster eval - metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys()) - metrics.remove('pr_curve') - - def get_eval_metric_ops(graph_output, labels, weights): - """ - graph_output: - dict that is returned by build_graph given input features. - labels: - target labels associated with the batch. - weights: - weights of the samples. - """ - - # Added to support the example weights overriding. - weights = graph_output["weights"] - # Added to support per engagement metrics for both TF and Lolly scores. - labels = tf.tile(labels, [1, 2]) - - eval_metric_ops = OrderedDict() - - preds = graph_output['output'] - - threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5 - - hard_preds = graph_output.get('hard_output') - if not hard_preds: - hard_preds = tf.greater_equal(preds, threshold) - - shape = labels.get_shape() - - # basic sanity check: multi_metric dimension must exist - assert len(shape) > class_dim, "Dimension specified by class_dim does not exist." - - num_labels = shape[class_dim] - # If we are doing multi-class / multi-label metric, the number of classes / labels must - be known at graph construction time. This dimension cannot have size None. - assert num_labels is not None, "The multi-metric dimension cannot be None." - assert classes is None or len(classes) == num_labels, ( - "Number of classes must match the number of labels") - - weights_shape = weights.get_shape() if weights is not None else None - if weights_shape is None: - num_weights = None - elif len(weights_shape) > 1: - num_weights = weights_shape[class_dim] - else: - num_weights = 1 - - for i in range(num_labels): - - # add metrics to eval_metric_ops dict - for metric_name in metrics: - metric_name = metric_name.lower() # metric names are case-insensitive. - - class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i)) - - if class_metric_name in eval_metric_ops: - # avoid adding duplicate metrics. - continue - - class_labels = tf.gather(labels, indices=[i], axis=class_dim) - class_preds = tf.gather(preds, indices=[i], axis=class_dim) - class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim) - - if num_weights is None: - class_weights = None - elif num_weights == num_labels: - class_weights = tf.gather(weights, indices=[i], axis=class_dim) - elif num_weights == 1: - class_weights = weights - else: - raise ValueError("num_weights (%d) and num_labels (%d) do not match" - % (num_weights, num_labels)) - - metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name) - if metric_factory: - value_op, update_op = metric_factory( - labels=class_labels, - predictions=(class_hard_preds if requires_threshold else class_preds), - weights=class_weights, name=class_metric_name) - eval_metric_ops[class_metric_name] = (value_op, update_op) - else: - raise ValueError('Cannot find the metric named ' + metric_name) - - # Added to compare TF and Lolly scores.
- eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels) - - return eval_metric_ops - - return get_eval_metric_ops - - -def get_mse(predictions, labels): - lolly_scores = get_lolly_scores(labels) - tf_scores = predictions[:, EB_SCORE_IDX] - squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores)) - - value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op") - update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op") - - return value_op, update_op diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD deleted file mode 100644 index d8cd264ad..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD +++ /dev/null @@ -1,8 +0,0 @@ -python3_library( - name = "libs_py3", - sources = ["*.py"], - dependencies = [ - "src/python/twitter/deepbird/io", - "twml:twml-nodeps", - ], -) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx new file mode 100644 index 000000000..d79bef99e Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/BUILD.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx new file mode 100644 index 000000000..0fcd18354 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx new file mode 100644 index 000000000..f79cd3496 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py deleted file mode 100644 index 82c31bde0..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/discretizer_builder.py +++ /dev/null @@ -1,62 +0,0 @@ -from .hashing_utils import make_feature_id - -from twml.contrib.layers.hashing_discretizer import HashingDiscretizer -import numpy as np - - -class TFModelDiscretizerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :param tf_model_initializer: dictionary of the following format: - { - "features": { - "bias": 0.0, - "binary": { - # (feature name : feature weight) pairs - "feature_name_1": 0.0, - ... 
- "feature_nameN": 0.0 - }, - "discretized": { - # (feature name : index aligned lists of bin_boundaries and weights - "feature_name_1": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - ... - "feature_name_K": { - "bin_boundaries": [1, ..., inf], - "weights": [0.0, ..., 0.0] - } - } - } - } - :return: a HashingDiscretizer instance. - ''' - discretized_features = tf_model_initializer["features"]["discretized"] - - max_bins = 0 - - feature_ids = [] - bin_vals = [] - for feature_name in discretized_features: - bin_boundaries = discretized_features[feature_name]["bin_boundaries"] - feature_id = make_feature_id(feature_name, self.num_bits) - feature_ids.append(feature_id) - np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries] - bin_vals.append(np_bin_boundaries) - - max_bins = max(max_bins, len(np_bin_boundaries)) - - feature_ids_np = np.array(feature_ids) - bin_vals_np = np.array(bin_vals).flatten() - - return HashingDiscretizer( - feature_ids=feature_ids_np, - bin_vals=bin_vals_np, - n_bin=max_bins, - out_bits=self.num_bits - ) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx new file mode 100644 index 000000000..ab43793e7 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py deleted file mode 100644 index 2c57f8d63..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/hashing_utils.py +++ /dev/null @@ -1,29 +0,0 @@ -from twitter.deepbird.io.util import _get_feature_id - -import numpy as np - - -def numpy_hashing_uniform(the_id, bin_idx, output_bits): - """ - integer_multiplicative_hashing - This is a reimplementation, for testing purposes, of the - c++ version found in hashing_discretizer_impl.cpp - """ - hashing_constant = 2654435761 - N = 32 - with np.errstate(over='ignore'): - the_id *= hashing_constant - the_id += bin_idx - the_id *= hashing_constant - the_id >>= N - output_bits - the_id &= (1 << output_bits) - 1 - return the_id - - -def make_feature_id(name, num_bits): - feature_id = _get_feature_id(name) - return np.int64(limit_bits(feature_id, num_bits)) - - -def limit_bits(value, num_bits): - return value & ((2 ** num_bits) - 1) diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx new file mode 100644 index 000000000..95378abd9 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py deleted file mode 100644 index 63491ea38..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/tf_model/weights_initializer_builder.py +++ /dev/null @@ -1,34 +0,0 @@ -from .hashing_utils import make_feature_id, 
numpy_hashing_uniform - -import numpy as np -import tensorflow.compat.v1 as tf -import twml - - -class TFModelWeightsInitializerBuilder(object): - def __init__(self, num_bits): - self.num_bits = num_bits - - def build(self, tf_model_initializer): - ''' - :return: (bias_initializer, weight_initializer) - ''' - initial_weights = np.zeros((2 ** self.num_bits, 1)) - - features = tf_model_initializer["features"] - self._set_binary_feature_weights(initial_weights, features["binary"]) - self._set_discretized_feature_weights(initial_weights, features["discretized"]) - - return tf.constant_initializer(features["bias"]), twml.contrib.initializers.PartitionConstant(initial_weights) - - def _set_binary_feature_weights(self, initial_weights, binary_features): - for feature_name, weight in binary_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - initial_weights[feature_id][0] = weight - - def _set_discretized_feature_weights(self, initial_weights, discretized_features): - for feature_name, discretized_feature in discretized_features.items(): - feature_id = make_feature_id(feature_name, self.num_bits) - for bin_idx, weight in enumerate(discretized_feature["weights"]): - final_bucket_id = numpy_hashing_uniform(feature_id, bin_idx, self.num_bits) - initial_weights[final_bucket_id][0] = weight diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx new file mode 100644 index 000000000..c507836a7 Binary files /dev/null and b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.docx differ diff --git a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py b/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py deleted file mode 100644 index 6ef181f5f..000000000 --- a/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/train.py +++ /dev/null @@ -1,212 +0,0 @@ -# checkstyle: noqa -import tensorflow.compat.v1 as tf -from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn -from tensorflow.python.framework import dtypes -from tensorflow.python.ops import array_ops -import tensorflow_hub as hub - -from datetime import datetime -from tensorflow.compat.v1 import logging -from twitter.deepbird.projects.timelines.configs import all_configs -from twml.trainers import DataRecordTrainer -from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph -from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export -from .metrics import get_multi_binary_class_metric_fn -from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES -from .example_weights import add_weight_arguments, make_weights_tensor -from .lolly.data_helpers import get_lolly_logits -from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder -from .lolly.reader import LollyModelReader -from .tf_model.discretizer_builder import TFModelDiscretizerBuilder -from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder - -import twml - -def get_feature_values(features_values, params): - if params.lolly_model_tsv: - # The default DBv2 HashingDiscretizer bin membership interval is (a, b] - # - # The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b) - # - # TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries. 
- # - # Thus, invert the feature values, so that HashingDiscretizer can find the correct bucket. - return tf.multiply(features_values, -1.0) - else: - return features_values - -def build_graph(features, label, mode, params, config=None): - weights = None - if "weights" in features: - weights = make_weights_tensor(features["weights"], label, params) - - num_bits = params.input_size_bits - - if mode == "infer": - indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits) - dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits]) - sparse_tf = tf.SparseTensor( - indices=indices, - values=get_feature_values(features["input_sparse_tensor_values"], params), - dense_shape=dense_shape - ) - else: - features["values"] = get_feature_values(features["values"], params) - sparse_tf = twml.util.convert_to_sparse(features, num_bits) - - if params.lolly_model_tsv: - tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv)) - bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer) - discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer) - else: - discretizer = hub.Module(params.discretizer_save_dir) - bias_initializer, weight_initializer = None, None - - input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator") - - logits = twml.layers.full_sparse( - inputs=input_sparse, - output_size=1, - bias_initializer=bias_initializer, - weight_initializer=weight_initializer, - use_sparse_grads=(mode == "train"), - use_binary_values=True, - name="full_sparse_1" - ) - - loss = None - - if mode != "infer": - lolly_activations = get_lolly_logits(label) - - if opt.print_data_examples: - logits = print_data_example(logits, lolly_activations, features) - - if params.replicate_lolly: - loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations)) - else: - batch_size = tf.shape(label)[0] - target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1)) - loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits) - loss = twml.util.weighted_average(loss, weights) - - num_labels = tf.shape(label)[1] - eb_scores = tf.tile(lolly_activations, [1, num_labels]) - logits = tf.tile(logits, [1, num_labels]) - logits = tf.concat([logits, eb_scores], axis=1) - - output = tf.nn.sigmoid(logits) - - return {"output": output, "loss": loss, "weights": weights} - -def print_data_example(logits, lolly_activations, features): - return tf.Print( - logits, - [logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))], - message="DATA EXAMPLE = ", - summarize=10000 - ) - -def earlybird_output_fn(graph_output): - export_outputs = { - tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: - tf.estimator.export.PredictOutput( - {"prediction": tf.identity(graph_output["output"], name="output_scores")} - ) - } - return export_outputs - -if __name__ == "__main__": - parser = DataRecordTrainer.add_parser_arguments() - - parser = twml.contrib.calibrators.add_discretizer_arguments(parser) - - parser.add_argument("--label", type=str, help="label for the engagement") - parser.add_argument("--model.use_existing_discretizer", action="store_true", - dest="model_use_existing_discretizer", - help="Load a pre-trained calibration or train a new one") - parser.add_argument("--input_size_bits", type=int) -
parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name") - parser.add_argument("--feature_config", type=str) - parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly", - help="Train a regression model with MSE loss and the logged Earlybird score as a label") - parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv", - help="Initialize with weights and discretizer bins available in the given Lolly model tsv file" - "No discretizer gets trained or loaded if set.") - parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples", - help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'") - add_weight_arguments(parser) - - opt = parser.parse_args() - - feature_config_module = all_configs.select_feature_config(opt.feature_config) - - feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label) - - parse_fn = twml.parsers.get_sparse_parse_fn( - feature_config, - keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes")) - - if not opt.lolly_model_tsv: - if opt.model_use_existing_discretizer: - logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]") - logging.info(f"Using calibration at {opt.discretizer_save_dir}") - else: - logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]") - calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator( - opt.discretizer_num_bins, - opt.discretizer_output_size_bits - ) - calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer", - params=opt, - calibrator=calibrator, - build_graph_fn=build_percentile_discretizer_graph, - feature_config=feature_config) - - trainer = DataRecordTrainer( - name="earlybird", - params=opt, - build_graph_fn=build_graph, - save_dir=opt.save_dir, - feature_config=feature_config, - metric_fn=get_multi_binary_class_metric_fn( - metrics=["roc_auc"], - classes=PREDICTED_CLASSES - ), - warm_start_from=None - ) - - train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn) - eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn) - - logging.info("Training and Evaluation ...") - trainingStartTime = datetime.now() - trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn) - trainingEndTime = datetime.now() - logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime)) - - if trainer._estimator.config.is_chief: - serving_input_in_earlybird = { - "input_sparse_tensor_indices": array_ops.placeholder( - name="input_sparse_tensor_indices", - shape=[None, 2], - dtype=dtypes.int64), - "input_sparse_tensor_values": array_ops.placeholder( - name="input_sparse_tensor_values", - shape=[None], - dtype=dtypes.float32), - "input_sparse_tensor_shape": array_ops.placeholder( - name="input_sparse_tensor_shape", - shape=[2], - dtype=dtypes.int64) - } - serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird) - twml.contrib.export.export_fn.export_all_models( - trainer=trainer, - export_dir=opt.export_dir, - parse_fn=parse_fn, - serving_input_receiver_fn=serving_input_receiver_fn, - export_output_fn=earlybird_output_fn, - feature_spec=feature_config.get_feature_spec() - ) - logging.info("The export model path is: " + opt.export_dir) diff --git a/src/scala/com/twitter/graph/batch/BUILD.bazel 
b/src/scala/com/twitter/graph/batch/BUILD.bazel deleted file mode 100644 index 0dcfc85cf..000000000 --- a/src/scala/com/twitter/graph/batch/BUILD.bazel +++ /dev/null @@ -1,91 +0,0 @@ -JOB = ["job/**/*"] - -scala_library( - name = "batch", - sources = ["**/*.scala"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/cascading:cascading-core", - "3rdparty/jvm/cascading:cascading-hadoop", - "3rdparty/jvm/cascading:cascading-local", - "3rdparty/jvm/cascading:cascading-thrift", - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/scalding:args", - "3rdparty/src/jvm/com/twitter/scalding:commons", - "3rdparty/src/jvm/com/twitter/scalding:core", - "3rdparty/src/jvm/com/twitter/scalding:date", - "3rdparty/src/jvm/com/twitter/scalding:parquet", - "3rdparty/src/jvm/com/twitter/summingbird:batch", - "3rdparty/src/jvm/com/twitter/summingbird:client", - "graphstore/common:flock_follows-java", - "src/java/com/twitter/common_internal/util:date_util", - "src/java/com/twitter/twadoop/batch", - "src/java/com/twitter/twadoop/util/dbconfig", - "src/java/com/twitter/twadoop/util/yaml", - "src/protobuf/com/twitter/twadoop", - "src/scala/com/twitter/pluck", - "src/scala/com/twitter/pluck/source/combined_user_source", - "src/scala/com/twitter/pluck/source/jdbc", - "src/scala/com/twitter/scalding_internal/error_handling", - "src/scala/com/twitter/scalding_internal/job", - "src/scala/com/twitter/scalding_internal/job/analytics_batch", - "src/scala/com/twitter/scalding_internal/multiformat", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/wtf/scalding/jobs/common:date_util", - "src/thrift/com/twitter/gizmoduck:user-thrift-java", - "src/thrift/com/twitter/twadoop/user/gen:gen-java", - "util/util-core:scala", - ], -) - -#pants.new build target for the old "dist" -hadoop_binary( - name = "graph-batch-deploy", - main = "com.twitter.scalding.Tool", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweepcred", - ], -) - -# Generated with `capesospy-v2 create_target tweepcred_job science/scalding/mesos/wtf/recos_platform_atla_proc.yaml`, config hash d63a47. 
-scalding_job( - name = "tweepcred_job", - main = "com.twitter.graph.batch.job.tweepcred.TweepcredBatchJob", - args = ["--weighted false --hadoop_config /etc/hadoop/hadoop-conf-proc-atla"], - config = [ - ("hadoop.combine-input", "true"), - ("hadoop.map.jvm.total-memory", "3072m"), - ("hadoop.queue", "cassowary.default"), - ("hadoop.reduce.jvm.total-memory", "3072m"), - ("hadoop.reducers", "1200"), - ("hadoop.submitter.disk", "200000m"), - ("hadoop.submitter.jvm.total-memory", "5120m"), - ("submitter.tier", "preemptible"), - ], - cron = "24,44,04 * * * *", - hadoop_cluster = "atla-proc", - platform = "java8", - role = "cassowary", - runtime_platform = "java8", - tags = [ - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":tweepcred", - ], -) diff --git a/src/scala/com/twitter/graph/batch/BUILD.docx b/src/scala/com/twitter/graph/batch/BUILD.docx new file mode 100644 index 000000000..ef24c6dc8 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/BUILD.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx new file mode 100644 index 000000000..22adbe9d1 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala deleted file mode 100644 index 568e85251..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/ExtractTweepcred.scala +++ /dev/null @@ -1,83 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource -import com.twitter.scalding._ - -/** - * Calculate tweepcred from the given pagerank file. If post_adjust is true, - * reduce pagerank for users with low followers compared to number of - * followings based on existing reputation code. 
- * Options: - * --input_pagerank: given pagerank - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * optional arguments: - * --post_adjust: whether to do post adjust, default true - * - */ -class ExtractTweepcred(args: Args) extends Job(args) { - val POST_ADJUST = args.getOrElse("post_adjust", "true").toBoolean - - val inputPagerank = getInputPagerank(args("input_pagerank")) - .map(() -> ('num_followers, 'num_followings)) { (u: Unit) => - (0, 0) - } - - val userInfo = TypedPipe - .from(MostRecentCombinedUserSnapshotSource) - .flatMap { combinedUser => - val user = Option(combinedUser.user) - val userId = user.map(_.id).getOrElse(0L) - val userExtended = Option(combinedUser.user_extended) - val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0) - val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0) - - if (userId == 0L || user.map(_.safety).exists(_.deactivated)) { - None - } else { - Some((userId, 0.0, numFollowers, numFollowings)) - } - } - .toPipe[(Long, Double, Int, Int)]('src_id, 'mass_input, 'num_followers, 'num_followings) - - val pagerankWithSuspended = (inputPagerank ++ userInfo) - .groupBy('src_id) { - _.max('mass_input) - .max('num_followers) - .max('num_followings) - } - - pagerankWithSuspended - .discard('num_followers, 'num_followings) - .write(Tsv(args("output_pagerank"))) - - val adjustedPagerank = - if (POST_ADJUST) { - pagerankWithSuspended - .map(('mass_input, 'num_followers, 'num_followings) -> 'mass_input) { - input: (Double, Int, Int) => - Reputation.adjustReputationsPostCalculation(input._1, input._2, input._3) - } - .normalize('mass_input) - } else { - pagerankWithSuspended - .discard('num_followers, 'num_followings) - } - - val tweepcred = adjustedPagerank - .map('mass_input -> 'mass_input) { input: Double => - Reputation.scaledReputation(input) - } - - tweepcred.write(Tsv(args("output_tweepcred"))) - tweepcred.write(Tsv(args("current_tweepcred"))) - tweepcred.write(Tsv(args("today_tweepcred"))) - - def getInputPagerank(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1) -> ('src_id, 'mass_input)) { input: (Long, Double) => - input - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx new file mode 100644 index 000000000..82345af6f Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala deleted file mode 100644 index 284ba45f8..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/PreparePageRankData.scala +++ /dev/null @@ -1,275 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.data.proto.Flock -import com.twitter.scalding._ -import com.twitter.pluck.source._ -import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource -import com.twitter.scalding_internal.dalv2.DAL -import com.twitter.service.interactions.InteractionGraph -import graphstore.common.FlockFollowsJavaDataset -import java.util.TimeZone - -/** - * Prepare the graph data for page rank calculation. Also generate the initial - * pagerank as the starting point. Afterwards, start WeightedPageRank job. 
- * - * Either read a tsv file for testing or read the following to build the graph - * flock edges Flock.Edge - * real graph input for weights InteractionGraph.Edge - * - * Options: - * --pwd: working directory, will generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * Optional arguments: - * --input: use the given tsv file instead of flock and real graph - * --weighted: do weighted pagerank, default false - * --flock_edges_only: restrict graph to flock edges, default true - * --input_pagerank: continue pagerank from this - * - * Plus the following options for WeightedPageRank and ExtractTweepcred: - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * Optional: - * --maxiterations: how many iterations to run. Default is 20 - * --jumpprob: probability of a random jump, default is 0.1 - * --threshold: total difference before finishing early, default 0.001 - * --post_adjust: whether to do post adjust, default true - */ -class PreparePageRankData(args: Args) extends Job(args) { - implicit val timeZone: TimeZone = DateOps.UTC - val PWD = args("pwd") - val WEIGHTED = args.getOrElse("weighted", "false").toBoolean - val FLOCK_EDGES_ONLY = args.getOrElse("flock_edges_only", "true").toBoolean - - val ROW_TYPE_1 = 1 - val ROW_TYPE_2 = 2 - - // graph data and user mass - val userMass = getUserMass - val nodesWithPrior = getGraphData(userMass) - val numNodes = nodesWithPrior.groupAll { _.size } - numNodes.write(Tsv(PWD + "/numnodes")) - dumpNodes(nodesWithPrior, PWD + "/nodes"); - - // initial pagerank to start computation - generateInitialPagerank(nodesWithPrior) - - // continue with the calculation - override def next = { - Some(new WeightedPageRank(args)) - } - - /** - * read flock edges - */ - def getFlockEdges = { - DAL - .readMostRecentSnapshotNoOlderThan(FlockFollowsJavaDataset, Days(7)) - .toTypedSource - .flatMapTo('src_id, 'dst_id) { edge: Flock.Edge => - if (edge.getStateId() == Flock.State.Positive.getNumber()) { - Some((edge.getSourceId(), edge.getDestinationId())) - } else { - None - } - } - } - - /** - * read real graph edges with weights - */ - def getRealGraphEdges = { - RealGraphEdgeSource() - .flatMapTo('src_id, 'dst_id, 'weight) { edge: InteractionGraph.Edge => - if (edge.getSourceId() != edge.getDestinationId()) { - val srcId = edge.getSourceId() - val dstId = edge.getDestinationId() - val weight = edge.getWeight().toFloat - Some((srcId, dstId, weight)) - } else { - None - } - } - } - - /** - * combine real graph and flock. If flock_edges_only is true, only take the - * flock edges; otherwise edges are either from flock or from real graph. 
- * edge weights default to 1, overwritten by weights from real graph - */ - def getFlockRealGraphEdges = { - val flock = getFlockEdges - - if (WEIGHTED) { - val flockWithWeight = flock - .map(() -> ('weight, 'rowtype)) { (u: Unit) => - (1.0f, ROW_TYPE_1) - } - - val realGraph = getRealGraphEdges - .map(() -> 'rowtype) { (u: Unit) => - (ROW_TYPE_2) - } - - val combined = (flockWithWeight ++ realGraph) - .groupBy('src_id, 'dst_id) { - _.min('rowtype) - .max('weight) // take whichever is bigger - } - - if (FLOCK_EDGES_ONLY) { - combined.filter('rowtype) { (rowtype: Int) => - rowtype == ROW_TYPE_1 - } - } else { - combined - } - } else { - flock.map(() -> ('weight)) { (u: Unit) => - 1.0f - } - }.project('src_id, 'dst_id, 'weight) - } - - def getCsvEdges(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1, 2) -> ('src_id, 'dst_id, 'weight)) { input: (Long, Long, Float) => - input - } - } - - /* - * Compute user mass based on combined user - */ - def getUserMass = - TypedPipe - .from(MostRecentCombinedUserSnapshotSource) - .flatMap { user => - UserMass.getUserMass(user) - } - .map { userMassInfo => - (userMassInfo.userId, userMassInfo.mass) - } - .toPipe[(Long, Double)]('src_id_input, 'mass_prior) - .normalize('mass_prior) - - /** - * Read either flock/real_graph or a given tsv file - * group by the source id, and output node data structure - * merge with the user_mass. - * return <'src_id, 'dst_ids, 'weights, 'mass_prior> - * - * make sure src_id is the same set as in user_mass, and dst_ids - * are a subset of user_mass. e.g. flock has edges like 1->2, - * where both users 1 and 2 do not exist anymore - */ - def getGraphData(userMass: RichPipe) = { - val edges: RichPipe = args.optional("input") match { - case None => getFlockRealGraphEdges - case Some(input) => getCsvEdges(input) - } - - // remove edges where dst_id is not in userMass - val filterByDst = userMass - .joinWithLarger('src_id_input -> 'dst_id, edges) - .discard('src_id_input, 'mass_prior) - - // aggregate by the source id - val nodes = filterByDst - .groupBy('src_id) { - _.mapReduceMap(('dst_id, 'weight) -> ('dst_ids, 'weights)) /* map1 */ { a: (Long, Float) => - (Vector(a._1), if (WEIGHTED) Vector(a._2) else Vector()) - } /* reduce */ { (a: (Vector[Long], Vector[Float]), b: (Vector[Long], Vector[Float])) => - { - (a._1 ++ b._1, a._2 ++ b._2) - } - } /* map2 */ { a: (Vector[Long], Vector[Float]) => - a - } - } - .mapTo( - ('src_id, 'dst_ids, 'weights) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) { - input: (Long, Vector[Long], Vector[Float]) => - { - (input._1, input._2.toArray, input._3.toArray, 0.0, ROW_TYPE_1) - } - } - - // get to the same schema - val userMassNodes = userMass - .mapTo(('src_id_input, 'mass_prior) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) { - input: (Long, Double) => - { - (input._1, Array[Long](), Array[Float](), input._2, ROW_TYPE_2) - } - } - - // make src_id the same set as in userMass - (nodes ++ userMassNodes) - .groupBy('src_id) { - _.sortBy('rowtype) - .head('dst_ids, 'weights) - .last('mass_prior, 'rowtype) - } - .filter('rowtype) { input: Int => - input == ROW_TYPE_2 - } - } - - /** - * generate the graph data output - */ - def dumpNodes(nodes: RichPipe, fileName: String) = { - mode match { - case Hdfs(_, conf) => nodes.write(SequenceFile(fileName)) - case _ => - nodes - .mapTo((0, 1, 2, 3) -> (0, 1, 2, 3)) { input: (Long, Array[Long], Array[Float], Double) => - (input._1, input._2.mkString(","), input._3.mkString(","), input._4) - } - .write(Tsv(fileName)) - } - } - -
/* - * output prior mass or copy the given mass file (merge, normalize) - * to be used as the starting point - */ - def generateInitialPagerank(nodes: RichPipe) = { - val prior = nodes - .project('src_id, 'mass_prior) - - val combined = args.optional("input_pagerank") match { - case None => prior - case Some(fileName) => { - val massInput = Tsv(fileName).read - .mapTo((0, 1) -> ('src_id, 'mass_prior, 'rowtype)) { input: (Long, Double) => - (input._1, input._2, ROW_TYPE_2) - } - - val priorRow = prior - .map(() -> ('rowtype)) { (u: Unit) => - ROW_TYPE_1 - } - - (priorRow ++ massInput) - .groupBy('src_id) { - _.sortBy('rowtype) - .last('mass_prior) - .head('rowtype) - } - // throw away extra nodes from input file - .filter('rowtype) { (rowtype: Int) => - rowtype == ROW_TYPE_1 - } - .discard('rowtype) - .normalize('mass_prior) - } - } - - combined.write(Tsv(PWD + "/pagerank_0")) - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/README b/src/scala/com/twitter/graph/batch/job/tweepcred/README deleted file mode 100644 index 55ef3b093..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/README +++ /dev/null @@ -1,75 +0,0 @@ -Tweepcred - -Tweepcred is a social network analysis tool that calculates the influence of Twitter users based on their interactions with other users. It uses the PageRank algorithm to rank users by that influence. - -PageRank Algorithm -PageRank is a graph algorithm that was originally developed by Google to determine the importance of web pages in search results. The algorithm works by assigning a numerical score to each page based on the number and quality of other pages that link to it. The more links a page has from other high-quality pages, the higher its PageRank score. - -In the Tweepcred project, the PageRank algorithm is used to determine the influence of Twitter users based on their interactions with other users. The graph is constructed by treating Twitter users as nodes, and their interactions (mentions, retweets, etc.) as edges. The PageRank score of a user represents their influence in the network. - -Tweepcred PageRank Implementation -The implementation of the PageRank algorithm in Tweepcred is based on the Hadoop MapReduce framework. The algorithm is split into two stages: preparation and iteration. - -The preparation stage involves constructing the graph of Twitter users and their interactions, and initializing each user's PageRank score to a default value. This stage is implemented in the PreparePageRankData class. - -The iteration stage involves repeatedly calculating and updating the PageRank scores of each user until convergence is reached. This stage is implemented in the WeightedPageRank class, which is run multiple times until the algorithm converges. - -The Tweepcred PageRank implementation also includes a number of optimizations to improve performance and reduce memory usage. These optimizations include block compression, lazy loading, and in-memory caching. - - -========================================== TweepcredBatchJob.scala ========================================== - - -This is a Scala class that represents a batch job for computing the "tweepcred" (Twitter credibility) score for Twitter users using a weighted or unweighted PageRank algorithm. The class extends the AnalyticsIterativeBatchJob class, which is part of the Scalding framework used for data processing on Hadoop. - -The class defines various properties and methods that are used to configure and run the batch job. The args parameter represents the command-line arguments that are passed to the batch job, such as the --weighted flag that determines whether to use the weighted PageRank algorithm or not. - -The run method overrides the run method of the base class and prints the batch statistics after the job has finished. The children method defines a list of child jobs that need to be executed as part of the batch job. The messageHeader method returns a string that represents the header of the batch job message. - -========================================== ExtractTweepcred.scala ========================================== - -This class is a Scalding job that calculates "tweepcred" from a given pagerank file. Tweepcred is a measure of reputation for Twitter users that takes into account the number of followers they have and the number of people they follow. If the optional argument post_adjust is set to true (the default), then the pagerank values are adjusted based on the user's follower-to-following ratio. - -The class takes several command-line arguments specifying input and output files and options, and it uses the Scalding library to perform distributed data processing on the input files. It reads in the pagerank file and a user mass file, both in TSV format, and combines them to produce a new pagerank file with the adjusted values. The adjusted pagerank is then used to calculate tweepcred values, which are written to output files. - -The code makes use of the MostRecentCombinedUserSnapshotSource class from the com.twitter.pluck.source.combined_user_source package to obtain per-user follower and following counts. It also uses the Reputation class to perform the tweepcred calculations and adjustments. - - -========================================== UserMass.scala ========================================== - -The UserMass class is a helper class used to calculate the "mass" of a user on Twitter, as defined by a certain algorithm. The mass score represents the user's reputation and is used in various applications, such as in determining which users should be recommended to follow or which users should have their content highlighted. - -The getUserMass method of the UserMass class takes in a CombinedUser object, which contains information about a Twitter user, and returns an optional UserMassInfo object, which contains the user's ID and calculated mass score. - -The algorithm used to calculate the mass score takes into account various factors such as the user's account age, number of followers and followings, device usage, and safety status (restricted, suspended, verified). The calculation involves adding and multiplying weight factors and adjusting the mass score based on a threshold for the number of friends and followers. - - -========================================== PreparePageRankData.scala ========================================== - -The PreparePageRankData class prepares the graph data for the page rank calculation. It generates the initial pagerank and then starts the WeightedPageRank job. It has the following functionalities: - -It reads the user mass TSV file generated by the twadoop user_mass job. -It reads the graph data, which is either a TSV file or a combination of flock edges and real graph inputs for weights. -It generates the initial pagerank as the starting point for the pagerank computation. -It writes the number of nodes to a TSV file and dumps the nodes to another TSV file.
-It has several options like weighted, flock_edges_only, and input_pagerank to fine-tune the pagerank calculation. -It also has options for the WeightedPageRank and ExtractTweepcred jobs, like output_pagerank, output_tweepcred, maxiterations, jumpprob, threshold, and post_adjust. -The PreparePageRankData class has several helper functions like getFlockEdges, getRealGraphEdges, getFlockRealGraphEdges, and getCsvEdges that read the graph data from different sources like DAL, InteractionGraph, or CSV files. It also has the generateInitialPagerank function that generates the initial pagerank from the graph data. - -========================================== WeightedPageRank.scala ========================================== - -WeightedPageRank is a class that performs the weighted PageRank algorithm on a given graph. - -The algorithm starts from a given PageRank value and performs one iteration, then tests for convergence. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job with the updated PageRank as input. If convergence has been reached, the algorithm starts the ExtractTweepcred job instead. - -The class takes in several options, including the working directory, total number of nodes, nodes file, PageRank file, total difference, whether to perform weighted PageRank, the current iteration, maximum iterations to run, probability of a random jump, and whether to do post adjust. - -The algorithm reads a nodes file that includes the source node ID, destination node IDs, weights, and mass prior. The algorithm also reads an input PageRank file that includes the source node ID and mass input. The algorithm then performs one iteration of the PageRank algorithm and writes the output PageRank to a file. - -The algorithm tests for convergence by calculating the total difference between the input and output PageRank masses. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job. If convergence has been reached, the algorithm starts the ExtractTweepcred job. - -========================================== Reputation.scala ========================================== - -This is a helper class called Reputation that contains methods for calculating a user's reputation score. The first method called scaledReputation takes a Double parameter raw which represents the user's page rank, and returns a Byte value that represents the user's reputation on a scale of 0 to 100. This method uses a formula that involves converting the logarithm of the page rank to a number between 0 and 100. - -The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank. 
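
To make the two formulas above concrete, here is a minimal, self-contained Scala sketch; the constants are copied from the Reputation.scala source that appears later in this diff, and the worked values in the comments are illustrative only.

```
// Minimal sketch of the two Reputation calculations described above.
// Constants are copied from Reputation.scala (shown later in this diff).
object ReputationSketch {
  // convert pagerank to a 0-100 score: e = 130 + 5.21 * ln(raw), clamped;
  // 130 and 5.21 come from a linear fit mapping min/max pagerank to 15/95
  def scaledReputation(raw: Double): Byte =
    if (raw < 1.0e-20) 0
    else {
      val e = 130d + 5.21 * math.log(raw) // natural log
      math.rint(e).max(0.0).min(100.0).toByte
    }

  // penalize users who follow many accounts but are followed by few
  def adjustReputationsPostCalculation(
    mass: Double,
    numFollowers: Int,
    numFollowings: Int
  ): Double =
    if (numFollowings > 2500) {
      val ratio = (1.0 + numFollowings) / (1.0 + numFollowers)
      val divFactor =
        math.exp(3.0 * (ratio - 0.6) * math.log(math.log(numFollowings)))
      mass / (divFactor.min(50.0).max(1.0)) // penalty capped at 50x
    } else mass

  def main(args: Array[String]): Unit = {
    println(scaledReputation(1e-6)) // 130 + 5.21 * (-13.82) ≈ 58
    // 5000 followings vs 100 followers: divFactor explodes, so the cap binds
    println(adjustReputationsPostCalculation(1e-6, 100, 5000)) // 1e-6 / 50 = 2e-8
  }
}
```

Note how the 50x cap bounds the penalty even for extreme following-to-follower ratios, so an aggressive follower can lose most, but never all, of their pagerank.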
diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx new file mode 100644 index 000000000..f74bf915d Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/README.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx new file mode 100644 index 000000000..0375417bb Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala deleted file mode 100644 index 6c81805fd..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -/** - * helper class to calculate reputation, borrowed from repo reputations - */ -object Reputation { - - /** - * convert pagerank to tweepcred between 0 and 100, - * taken from repo reputations, util/Utils.scala - */ - def scaledReputation(raw: Double): Byte = { - if (raw == 0 || (raw < 1.0e-20)) { - 0 - } else { - // convert log(pagerank) to a number between 0 and 100 - // the two parameters are from a linear fit by converting - // max pagerank -> 95 - // min pagerank -> 15 - val e: Double = 130d + 5.21 * scala.math.log(raw) // log to the base e - val pos = scala.math.rint(e) - val v = if (pos > 100) 100.0 else if (pos < 0) 0.0 else pos - v.toByte - } - } - - // these constants are taken from repo reputations, config/production.conf - private val threshAbsNumFriendsReps = 2500 - private val constantDivisionFactorGt_threshFriendsToFollowersRatioReps = 3.0 - private val threshFriendsToFollowersRatioUMass = 0.6 - private val maxDivFactorReps = 50 - - /** - * reduce pagerank of users with low followers but high followings - */ - def adjustReputationsPostCalculation(mass: Double, numFollowers: Int, numFollowings: Int) = { - if (numFollowings > threshAbsNumFriendsReps) { - val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers) - val divFactor = - scala.math.exp( - constantDivisionFactorGt_threshFriendsToFollowersRatioReps * - (friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) * - scala.math.log(scala.math.log(numFollowings)) - ) - mass / ((divFactor min maxDivFactorReps) max 1.0) - } else { - mass - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx new file mode 100644 index 000000000..585d64dde Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala deleted file mode 100644 index 48c06027b..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/TweepcredBatchJob.scala +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.scalding._ -import com.twitter.scalding_internal.job._ -import com.twitter.scalding_internal.job.analytics_batch._ - -/** - * Register the beginning of the tweepcred job in analytic batch table - * - * Options: - * --weighted: do weighted pagerank - * --hadoop_config: /etc/hadoop/hadoop-conf-proc-atla - * - */ -class TweepcredBatchJob(args: Args) extends
AnalyticsIterativeBatchJob(args) { - - def WEIGHTED = args("weighted").toBoolean - - override def timeout = Hours(36) - override def hasFlow = false - def descriptionSuffix = " weighted=" + args("weighted") - override def batchIncrement = Hours(24) - override def firstTime = RichDate("2015-10-02") - override def batchDescription = classOf[TweepcredBatchJob].getCanonicalName + descriptionSuffix - - override def run = { - val success = super.run - println("Batch Stat: " + messageHeader + " " + jobStat.get.toString) - success - } - - def startTime = dateRange.start - def dateString = startTime.toString("yyyy/MM/dd") - - override def children = { - val BASEDIR = "/user/cassowary/tweepcred/" - val baseDir = BASEDIR + (if (WEIGHTED) "weighted" else "unweighted") + "/daily/" - val tmpDir = baseDir + "tmp" - val outputDir = baseDir + dateString - val pageRankDir = outputDir + "/finalmass" - val tweepcredDir = outputDir + "/finaltweepcred" - val yesterdayStr = (startTime - Days(1)).toString("yyyy/MM/dd") - val yestPageRankDir = baseDir + yesterdayStr + "/finalmass" - val TWEEPCRED = "/tweepcred" - val curRep = (if (WEIGHTED) baseDir else BASEDIR) + "current" - val todayRep = (if (WEIGHTED) baseDir else BASEDIR) + dateString - val newArgs = args + ("pwd", Some(tmpDir)) + - ("output_pagerank", Some(pageRankDir)) + - ("output_tweepcred", Some(tweepcredDir)) + - ("input_pagerank", Some(yestPageRankDir)) + - ("current_tweepcred", Some(curRep + TWEEPCRED)) + - ("today_tweepcred", Some(todayRep + TWEEPCRED)) - - val prJob = new PreparePageRankData(newArgs) - - List(prJob) - } - - private def messageHeader = { - val dateString = dateRange.start.toString("yyyy/MM/dd") - classOf[TweepcredBatchJob].getSimpleName + - (if (WEIGHTED) " weighted " else " unweighted ") + dateString - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx new file mode 100644 index 000000000..3c81d3f6b Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala deleted file mode 100644 index 064819bb0..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.scala +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.twadoop.user.gen.CombinedUser -import com.twitter.util.Time -import com.twitter.wtf.scalding.jobs.common.DateUtil - -case class UserMassInfo(userId: Long, mass: Double) - -/** - * helper class to calculate user mass, borrowed from repo reputations - */ -object UserMass { - - private val currentTimestamp = Time.now.inMilliseconds - private val constantDivisionFactorGt_threshFriendsToFollowersRatioUMass = 5.0 - private val threshAbsNumFriendsUMass = 500 - private val threshFriendsToFollowersRatioUMass = 0.6 - private val deviceWeightAdditive = 0.5 - private val ageWeightAdditive = 0.2 - private val restrictedWeightMultiplicative = 0.1 - - def getUserMass(combinedUser: CombinedUser): Option[UserMassInfo] = { - val user = Option(combinedUser.user) - val userId = user.map(_.id).getOrElse(0L) - val userExtended = Option(combinedUser.user_extended) - val age = user.map(_.created_at_msec).map(DateUtil.diffDays(_, currentTimestamp)).getOrElse(0) - val isRestricted = user.map(_.safety).exists(_.restricted) - val isSuspended = user.map(_.safety).exists(_.suspended) - val isVerified = 
user.map(_.safety).exists(_.verified) - val hasValidDevice = user.flatMap(u => Option(u.devices)).exists(_.isSetMessaging_devices) - val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0) - val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0) - - if (userId == 0L || user.map(_.safety).exists(_.deactivated)) { - None - } else { - val mass = - if (isSuspended) - 0 - else if (isVerified) - 100 - else { - var score = deviceWeightAdditive * 0.1 + - (if (hasValidDevice) deviceWeightAdditive else 0) - val normalizedAge = if (age > 30) 1.0 else (1.0 min scala.math.log(1.0 + age / 15.0)) - score *= normalizedAge - if (score < 0.01) score = 0.01 - if (isRestricted) score *= restrictedWeightMultiplicative - score = (score min 1.0) max 0 - score *= 100 - score - } - - val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers) - val adjustedMass = - if (numFollowings > threshAbsNumFriendsUMass && - friendsToFollowersRatio > threshFriendsToFollowersRatioUMass) { - mass / scala.math.exp( - constantDivisionFactorGt_threshFriendsToFollowersRatioUMass * - (friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) - ) - } else { - mass - } - - Some(UserMassInfo(userId, adjustedMass)) - } - } -} diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx new file mode 100644 index 000000000..ba44df6d2 Binary files /dev/null and b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.docx differ diff --git a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala b/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala deleted file mode 100644 index 7e06077a1..000000000 --- a/src/scala/com/twitter/graph/batch/job/tweepcred/WeightedPageRank.scala +++ /dev/null @@ -1,235 +0,0 @@ -package com.twitter.graph.batch.job.tweepcred - -import com.twitter.scalding._ - -/** - * weighted page rank for the given graph, start from the given pagerank, - * perform one iteration, test for convergence, if not yet, clone itself - * and start the next page rank job with updated pagerank as input; - * if converged, start ExtractTweepcred job instead - * - * Options: - * --pwd: working directory, will read/generate the following files there - * numnodes: total number of nodes - * nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - * pagerank: the page rank file eg pagerank_0, pagerank_1 etc - * totaldiff: the current max pagerank delta - * Optional arguments: - * --weighted: do weighted pagerank, default false - * --curiteration: what is the current iteration, default 0 - * --maxiterations: how many iterations to run. 
Default is 20 - * --jumpprob: probability of a random jump, default is 0.1 - * --threshold: total difference before finishing early, default 0.001 - * - * plus the following options for ExtractTweepcred: - * --user_mass: user mass tsv file, generated by twadoop user_mass job - * --output_pagerank: where to put pagerank file - * --output_tweepcred: where to put tweepcred file - * Optional: - * --post_adjust: whether to do post adjust, default true - * - */ -class WeightedPageRank(args: Args) extends Job(args) { - val ROW_TYPE_1 = 1 - val ROW_TYPE_2 = 2 - - val PWD = args("pwd") - val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble - val WEIGHTED = args.getOrElse("weighted", "false").toBoolean - val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble - val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt - val CURITERATION = args.getOrElse("curiteration", "0").toInt - - // 'size - val numNodes = getNumNodes(PWD + "/numnodes") - - // 'src_id, 'dst_ids, 'weights, 'mass_prior - val nodes = getNodes(PWD + "/nodes") - - // 'src_id_input, 'mass_input - val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION) - - // one iteration of pagerank - val outputPagerank = doPageRank(nodes, inputPagerank) - val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1) - outputPagerank - .project('src_id, 'mass_n) - .write(Tsv(outputFileName)) - - // detect convergence - val totalDiff = outputPagerank - .mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) => - scala.math.abs(args._1 - args._2) - } - .groupAll { _.sum[Double]('mass_diff) } - .write(Tsv(PWD + "/totaldiff")) - - /** - * test convergence, if not yet, kick off the next iteration - */ - override def next = { - // the max diff generated above - val totalDiff = Tsv(PWD + "/totaldiff").readAtSubmitter[Double].head - - if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) { - val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString)) - Some(clone(newArgs)) - } else { - val newArgs = args + ("input_pagerank", Some(outputFileName)) - Some(new ExtractTweepcred(newArgs)) - } - } - - def getInputPagerank(fileName: String) = { - Tsv(fileName).read - .mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Long, Double) => - input - } - } - - /** - * read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior> - */ - def getNodes(fileName: String) = { - mode match { - case Hdfs(_, conf) => { - SequenceFile(fileName).read - .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input: (Long, Array[Long], Array[Float], Double) => - input - } - } - case _ => { - Tsv(fileName).read - .mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) { - input: (Long, String, String, Double) => - { - ( - input._1, - // convert string to int array - if (input._2 != null && input._2.length > 0) { - input._2.split(",").map { _.toLong } - } else { - Array[Long]() - }, - // convert string to float array - if (input._3 != null && input._3.length > 0) { - input._3.split(",").map { _.toFloat } - } else { - Array[Float]() - }, - input._4 - ) - } - } - } - } - } - - /** - * the total number of nodes, single line file - */ - def getNumNodes(fileName: String) = { - Tsv(fileName).read - .mapTo(0 -> 'size) { input: Long => - input - } - } - - /** - * one iteration of pagerank - * inputPagerank: <'src_id_input, 'mass_input> - * return <'src_id, 'mass_n, 'mass_input> - * - * Here is a highlevel view of the unweighted algorithm: - * let - * N: number of nodes - * 
inputPagerank(N_i): prob of walking to node i, - * d(N_j): N_j's out degree - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j) - * deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N - * randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA) - * pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1-ALPHA) - * - * For weighted algorithm: - * let - * w(N_j, N_i): weight from N_j to N_i - * tw(N_j): N_j's total out weights - * then - * pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j)) - * - */ - def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = { - // 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input - val nodeJoined = nodeRows - .joinWithSmaller('src_id -> 'src_id_input, inputPagerank) - .discard('src_id_input) - - // 'src_id, 'mass_n - val pagerankNext = nodeJoined - .flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) { - args: (Array[Long], Array[Float], Double) => - { - if (args._1.length > 0) { - if (WEIGHTED) { - // weighted distribution - val total: Double = args._2.sum - (args._1 zip args._2).map { idWeight: (Long, Float) => - (idWeight._1, args._3 * idWeight._2 / total) - } - } else { - // equal distribution - val dist: Double = args._3 / args._1.length - args._1.map { id: Long => - (id, dist) - } - } - } else { - //Here is a node that points to no other nodes (dangling) - Nil - } - } - } - .groupBy('src_id) { - _.sum[Double]('mass_n) - } - - // 'sum_mass - val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) } - - // 'deadMass - // single row jobs - // the dead page rank equally distributed to every node - val deadPagerank = sumPagerankNext - .crossWithTiny(numNodes) - .map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Long) => - (1.0 - input._1) / input._2 - } - .discard('size, 'sum_mass) - - // 'src_id_r, 'mass_n_r - // random jump probability plus dead page rank - val randomPagerank = nodeJoined - .crossWithTiny(deadPagerank) - .mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) { - ranks: (Long, Double, Double, Double) => - (ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4) - } - - // 'src_id, 'mass_n - // scale next page rank to 1-ALPHA - val pagerankNextScaled = pagerankNext - .map('mass_n -> ('mass_n, 'mass_input)) { m: Double => - ((1 - ALPHA) * m, 0.0) - } - - // 'src_id, 'mass_n, 'mass_input - // random probability + next probability - (randomPagerank ++ pagerankNextScaled) - .groupBy('src_id) { - _.sum[Double]('mass_input) // keep the input pagerank - .sum[Double]('mass_n) // take the sum - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/README.docx b/src/scala/com/twitter/interaction_graph/README.docx new file mode 100644 index 000000000..dc53d8739 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/README.md b/src/scala/com/twitter/interaction_graph/README.md deleted file mode 100644 index 31b4cf00b..000000000 --- a/src/scala/com/twitter/interaction_graph/README.md +++ /dev/null @@ -1,19 +0,0 @@ -## Real Graph (bqe) - -This project builds a machine learning model using a gradient boosting tree classifier to predict the likelihood of a Twitter user interacting with another user. - -The algorithm works by first creating a labeled dataset of user interactions from a graph of Twitter users. 
This graph is represented in a BigQuery table where each row represents a directed edge between two users, along with various features such as the number of tweets, follows, favorites, and other metrics related to user behavior. - -To create the labeled dataset, the algorithm first selects a set of candidate interactions by identifying all edges that were active during a certain time period. It then joins this candidate set with a set of labeled interactions that occurred one day after the candidate period. Positive interactions are labeled as "1" and negative interactions are labeled as "0". The resulting labeled dataset is then used to train a boosted tree classifier model. - -The model is trained using the labeled dataset and various hyperparameters, including the maximum number of iterations and the subsample rate. The algorithm splits the labeled dataset into training and testing sets based on the source user's ID, using a custom data split method. - -Once the model is trained, it can be used to generate a score estimating the probability of a user interacting with another user. - -## Real Graph (scio) - -This project aggregates the number of interactions between pairs of users on Twitter. On a daily basis, there are multiple dataflow jobs that perform this aggregation, which includes public engagements like favorites, retweets, follows, etc., as well as private engagements like profile views, tweet clicks, and whether or not a user has another user in their address book (given the user has opted in to sharing their address book). - -After the daily aggregation of interactions, there is a rollup job that aggregates yesterday's aggregation with today's interactions. The rollup job outputs several results, including the daily count of interactions per interaction type between a pair of users, the daily incoming interactions made on a user per interaction type, the rollup aggregation of interactions as a decayed sum between a pair of users, and the rollup aggregation of incoming interactions made on a user. - -Finally, the rollup job outputs the ML predicted interaction score between the pair of users alongside the rollup aggregation of interactions as a decayed sum between them. diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx new file mode 100644 index 000000000..ed5558f1c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md b/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md deleted file mode 100644 index 0e435feb8..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/README.md +++ /dev/null @@ -1,58 +0,0 @@ -# Scoring - -This folder contains the sql files that we'll use for scoring the real graph edges in BQ. We have 4 steps that take place: -- check to make sure that our models are in place. the feature importance query should return 20 rows in total: 10 rows per model, 1 for each feature. -- follow graph feature generation. this is to ensure that we have features for all users regardless of whether they have had any recent activity. -- candidate generation. this query combines the candidates from the follow graph and the activity graph, and the features from both. -- scoring. this query scores with 2 of our prod models and saves the scores to a table, with an additional field that distinguishes whether an edge is in- or out-of-network (see the sketch below).
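
To make that last step concrete, here is a tiny Scala model of the join semantics it relies on; the case class and sample values are hypothetical, and only the COALESCE/`followed` logic mirrors the scoring.sql source that appears in full later in this diff.

```
// Hypothetical, minimal model of scoring.sql's final SELECT: a full outer
// join between model scores and the follow graph, where missing scores
// default to 0.0 (COALESCE) and `followed` marks in-network edges.
case class Scores(prob: Double, probExplicit: Double)

object ScoringJoinSketch {
  def main(args: Array[String]): Unit = {
    val predicted: Map[(Long, Long), Scores] =
      Map((1L, 2L) -> Scores(0.83, 0.41), (1L, 3L) -> Scores(0.12, 0.02))
    val follows: Set[(Long, Long)] = Set((1L, 2L), (1L, 4L))

    // union of keys = full outer join on (source_id, destination_id)
    val rows = (predicted.keySet ++ follows).toSeq.sorted.map { case (src, dst) =>
      val s = predicted.getOrElse((src, dst), Scores(0.0, 0.0))
      (src, dst, s.prob, s.probExplicit, follows.contains((src, dst)))
    }
    rows.foreach(println)
    // (1,2,0.83,0.41,true)  in-network, scored
    // (1,3,0.12,0.02,false) out-of-network, scored
    // (1,4,0.0,0.0,true)    in-network, unscored -> score defaults to 0.0
  }
}
```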
- -## Instructions - -For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora. - -``` -zip -jr real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring && \ -packer add_version --cluster=atla cassowary real_graph_scoring real_graph_scoring.zip -aurora cron schedule atla/cassowary/prod/real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.aurora && \ -aurora cron start atla/cassowary/prod/real_graph_scoring -``` - -# candidates.sql - -This BigQuery (BQ) query does the following: - -1. Declares two variables, date_start and date_end, which are both of type DATE. -2. Sets the date_end variable to the maximum partition ID of the interaction_graph_labels_daily table, using the PARSE_DATE() function to convert the partition ID to a date format. -3. Sets the date_start variable to 30 days prior to the date_end variable, using the DATE_SUB() function. -4. Creates a new table called candidates in the realgraph dataset, partitioned by ds. -5. The query uses three common table expressions (T1, T2, and T3) to join data from two tables (interaction_graph_labels_daily and tweeting_follows) to generate a table containing candidate information and features. -6. The table T3 is the result of a full outer join between T1 and T2, grouping by source_id and destination_id, and aggregating values such as num_tweets, label_types, and the counts of different types of labels (e.g. num_follows, num_favorites, etc.). -7. The T4 table ranks each source_id by the number of num_days and num_tweets, and selects the top 2000 rows for each source_id. -8. Finally, the query selects all columns from the T4 table and appends the date_end variable as a new column named ds. - -Overall, the query generates a table of candidates and their associated features for a particular date range, using data from two tables in the twttr-bq-cassowary-prod and twttr-recos-ml-prod datasets. - -# follow_graph_features.sql - -This BigQuery script creates a table twttr-recos-ml-prod.realgraph.tweeting_follows that includes features for Twitter user interactions, specifically tweet counts and follows. - -First, it sets two variables date_latest_tweet and date_latest_follows to the most recent dates available in two separate tables: twttr-bq-tweetsource-pub-prod.user.public_tweets and twttr-recos-ml-prod.user_events.valid_user_follows, respectively. - -Then, it creates the tweet_count and all_follows CTEs. - -The tweet_count CTE counts the number of tweets made by each user within the last 3 days prior to date_latest_tweet. - -The all_follows CTE retrieves all the follows from the valid_user_follows table that happened on date_latest_follows and left joins it with the tweet_count CTE. It also adds a row number that partitions by the source user ID and orders by the number of tweets in descending order. The final output is filtered to keep only the top 2000 follows per user based on the row number. - -The final SELECT statement combines the all_follows CTE with the date_latest_tweet variable and inserts the results into the twttr-recos-ml-prod.realgraph.tweeting_follows table partitioned by date. - -# scoring.sql - -This BQ code performs operations on a BigQuery table called twttr-recos-ml-prod.realgraph.scores. 
Here is a step-by-step breakdown of what the code does: - -Declare two variables, date_end and date_latest_follows, and set their values based on the latest partitions listed in the twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS and twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS views, filtered to the interaction_graph_labels_daily and valid_user_follows tables, respectively. The PARSE_DATE() function is used to convert the partition IDs to date format. - -Delete rows from the twttr-recos-ml-prod.realgraph.scores table where the value of the ds column is equal to date_end. - -Insert rows into the twttr-recos-ml-prod.realgraph.scores table based on a query that generates predicted scores for pairs of user IDs using two machine learning models. Specifically, the query uses the ML.PREDICT() function to apply two machine learning models (twttr-recos-ml-prod.realgraph.prod and twttr-recos-ml-prod.realgraph.prod_explicit) to the twttr-recos-ml-prod.realgraph.candidates table. The resulting predicted scores are joined with the twttr-recos-ml-prod.realgraph.tweeting_follows table, which contains information about the number of tweets made by users and their follow relationships, using a full outer join. The final result includes columns for the source ID, destination ID, predicted score (prob), explicit predicted score (prob_explicit), a binary variable indicating whether the destination ID is followed by the source ID (followed), and the value of date_end for the ds column. If there is no match in the predicted_scores table for a given pair of user IDs, the COALESCE() function is used to return the corresponding values from the tweeting_follows table, with default values of 0.0 for the predicted scores. - diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx new file mode 100644 index 000000000..5b9364ab7 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql deleted file mode 100644 index 89bd30d38..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/candidates.sql +++ /dev/null @@ -1,42 +0,0 @@ -DECLARE date_start, date_end DATE; -SET date_end = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily" -); -SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY); - --- all candidates and their features -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates` -PARTITION BY ds -AS -WITH T1 AS ( - SELECT source_id, destination_id, label, dateHour - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - LEFT JOIN UNNEST(labels) AS label - WHERE DATE(dateHour) BETWEEN date_start AND date_end -), T2 AS ( - SELECT source_id, destination_id, num_tweets - FROM `twttr-recos-ml-prod.realgraph.tweeting_follows` -), T3 AS ( -SELECT -COALESCE(T1.source_id, T2.source_id) AS source_id, -COALESCE(T1.destination_id, T2.destination_id) AS destination_id, -COUNT(DISTINCT(T1.dateHour)) AS num_days, -MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same -COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction, -COUNT(DISTINCT(label)) AS label_types,
-COUNTIF(label="num_follows") AS num_follows, -COUNTIF(label="num_favorites") AS num_favorites, -COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks, -COUNTIF(label="num_profile_views") AS num_profile_views, -FROM T1 -FULL JOIN T2 -USING (source_id, destination_id) -GROUP BY 1,2 -ORDER BY 3 DESC,4 DESC -), T4 AS ( - SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, * - FROM T3 -) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000 - diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx new file mode 100644 index 000000000..4970b73d0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql deleted file mode 100644 index 6baecc2ed..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/check_models.sql +++ /dev/null @@ -1,5 +0,0 @@ -(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod`) -ORDER BY importance_gain DESC) -UNION ALL -(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`) -ORDER BY importance_gain DESC) diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx new file mode 100644 index 000000000..a0e1cb10c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql deleted file mode 100644 index ace7e2f36..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/follow_graph_features.sql +++ /dev/null @@ -1,28 +0,0 @@ -DECLARE date_latest_tweet, date_latest_follows DATE; -SET date_latest_tweet = ( - SELECT PARSE_DATE('%Y%m%d', SUBSTRING(MAX(partition_id), 1, 8)) AS partition_id - FROM `twttr-bq-tweetsource-pub-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="public_tweets"); -SET date_latest_follows = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows"); - --- tweet count candidate features -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.tweeting_follows` -PARTITION BY ds -AS -WITH tweet_count AS ( - SELECT userId, COUNT(userId) AS num_tweets - FROM `twttr-bq-tweetsource-pub-prod.user.public_tweets` - WHERE DATE(ts) BETWEEN DATE_SUB(date_latest_tweet, INTERVAL 3 DAY) AND date_latest_tweet - GROUP BY 1 -), all_follows AS ( - SELECT F.sourceId AS source_id, F.destinationId AS destination_id, COALESCE(T.num_tweets,0) AS num_tweets, - ROW_NUMBER() OVER (PARTITION BY F.sourceId ORDER BY T.num_tweets DESC) AS rn - FROM `twttr-recos-ml-prod.user_events.valid_user_follows` F - LEFT JOIN tweet_count T - ON F.destinationId=T.userId - WHERE DATE(F._PARTITIONTIME) = date_latest_follows -) SELECT *, date_latest_tweet AS ds FROM all_follows WHERE rn <= 2000 -; diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx 
b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx new file mode 100644 index 000000000..9a148c116 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql b/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql deleted file mode 100644 index 5694c0988..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.sql +++ /dev/null @@ -1,52 +0,0 @@ -DECLARE date_end, date_latest_follows DATE; -SET date_end = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily" -); -SET date_latest_follows = ( - SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id - FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS` - WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows"); - -DELETE -FROM `twttr-recos-ml-prod.realgraph.scores` -WHERE ds = date_end; - --- score candidates (59m) -INSERT INTO `twttr-recos-ml-prod.realgraph.scores` -WITH predicted_scores AS ( - SELECT - source_id, - destination_id, - p1.prob AS prob, - p2.prob AS prob_explicit - FROM ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod`, - ( - SELECT - * - FROM - `twttr-recos-ml-prod.realgraph.candidates` ) ) S1 - CROSS JOIN UNNEST(S1.predicted_label_probs) AS p1 - JOIN ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`, - ( - SELECT - * - FROM - `twttr-recos-ml-prod.realgraph.candidates` ) ) S2 - USING (source_id, destination_id) - CROSS JOIN UNNEST(S2.predicted_label_probs) AS p2 - WHERE p1.label=1 AND p2.label=1 -) -SELECT - COALESCE(predicted_scores.source_id, tweeting_follows.source_id) AS source_id, - COALESCE(predicted_scores.destination_id, tweeting_follows.destination_id) AS destination_id, - COALESCE(prob, 0.0) AS prob, - COALESCE(prob_explicit, 0.0) AS prob_explicit, - (tweeting_follows.source_id IS NOT NULL) AND (tweeting_follows.destination_id IS NOT NULL) AS followed, - date_end AS ds -FROM - predicted_scores - FULL JOIN - `twttr-recos-ml-prod.realgraph.tweeting_follows` tweeting_follows - USING (source_id, destination_id) diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/README.docx b/src/scala/com/twitter/interaction_graph/bqe/training/README.docx new file mode 100644 index 000000000..10ba2fed0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/README.md b/src/scala/com/twitter/interaction_graph/bqe/training/README.md deleted file mode 100644 index 17e94e7f5..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/README.md +++ /dev/null @@ -1,60 +0,0 @@ -# Training - -This folder contains the sql files that we'll use for training the prod real graph models: -- prod (predicts any interactions the next day) -- prod_explicit (predicts any explicit interactions the next day) - -We have 3 steps that take place: -- candidate generation + feature hydration. this query samples 1% of edges from the `twttr-recos-ml-prod.realgraph.candidates` table which is already produced daily and saves it to `twttr-recos-ml-prod.realgraph.candidates_sampled`. 
we save each day's data according to the statebird batch run date and hence require checks to make sure that the data exists to begin with. -- label candidates. we join day T's candidates with day T+1's labels while filtering out any negative interactions to get our labeled dataset. we append an additional day's worth of segments for each day. we finally generate the training dataset, which uses all days' labeled data for training, performing negative downsampling to get a roughly 50-50 split of positive to negative labels. -- training. we use bqml for training our xgboost models. - -## Instructions - -For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora. - -``` -zip -jr real_graph_training src/scala/com/twitter/interaction_graph/bqe/training && \ -packer add_version --cluster=atla cassowary real_graph_training real_graph_training.zip -aurora cron schedule atla/cassowary/prod/real_graph_training src/scala/com/twitter/interaction_graph/bqe/training/training.aurora && \ -aurora cron start atla/cassowary/prod/real_graph_training -``` - -# candidates.sql - -1. Sets the value of the variable date_candidates to the current batch run's date, derived from the $start_time$ parameter. -2. Creates a new table candidates_sampled if it does not exist already, which will contain a sample of 100 rows from the candidates_for_training table. -3. Deletes any existing rows from the candidates_sampled table where the ds column matches the date_candidates value, to avoid double-writing. -4. Inserts a sample of rows into the candidates_sampled table from the candidates_for_training table, where the modulo of the absolute value of the FARM_FINGERPRINT of the concatenation of source_id and destination_id is equal to the value of the $mod_remainder$ variable, and where the ds column matches the date_candidates value. - -# check_candidates_exist.sql - -This BigQuery script prepares a table of candidates for training a machine learning model. It does the following: - -1. Declares two variables date_start and date_end that are 30 days apart, and date_end is set to the value of the $start_time$ parameter (a Unix timestamp in milliseconds). -2. Creates a table candidates_for_training that is partitioned by ds (date) and populated with data from several other tables in the database. It joins information from tables of user interactions, tweeting, and interaction graph aggregates, filters out negative edge snapshots, calculates some statistics and aggregates them by source_id and destination_id. Then, it ranks each source_id by the number of days and tweets, selects the top 2000 rows per source_id, and adds date_end as a new column ds. -3. Finally, it selects the ds column from candidates_for_training where ds equals date_end. - -Overall, this script prepares a table of up to 2000 candidate pairs per source user, with statistics and labels, which can be used to train a machine learning model for recommendation purposes. - -# labeled_candidates.sql - -The BQ script does the following: - -1. Defines two variables date_candidates and date_labels as dates based on the $start_time$ parameter. -2. Creates a new table twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ with default values. -3. Deletes any prior data in the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table for the current date_candidates. -4.
Joins the twttr-recos-ml-prod.realgraph.candidates_sampled table with the twttr-bq-cassowary-prod.user.interaction_graph_labels_daily table and the twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot table. It assigns a label of 1 for positive interactions and 0 for negative interactions, and selects only the rows where there is no negative interaction. -5. Inserts the joined data into the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table. -6. Calculates the positive rate by counting the number of positive labels and dividing it by the total number of labels. -7. Creates a new table twttr-recos-ml-prod.realgraph.train$table_suffix$ by sampling from the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table, with a downsampling of negative examples to balance the number of positive and negative examples, based on the positive rate calculated in step 6. - -The resulting twttr-recos-ml-prod.realgraph.train$table_suffix$ table is used as a training dataset for a machine learning model. - -# train_model.sql - -This BQ command creates or replaces a machine learning model called twttr-recos-ml-prod.realgraph.prod$table_suffix$. The model is a boosted tree classifier, which is used for binary classification problems. - -The options provided in the command configure the specific settings for the model, such as the number of parallel trees, the maximum number of iterations, and the data split method. The DATA_SPLIT_METHOD parameter is set to CUSTOM, and DATA_SPLIT_COL is set to if_eval, which means the data will be split into training and evaluation sets based on the if_eval column. The IF function is used to assign a boolean value of true or false to if_eval based on the modulo operation performed on source_id. - -The SELECT statement specifies the input data for the model. The columns selected include label (the target variable to be predicted), as well as various features such as num_days, num_tweets, and num_follows that are used to predict the target variable. 
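
Because the custom split and the negative downsampling are the two steps that are easiest to get wrong, here is a small Scala sketch of the same logic; MurmurHash3 stands in for BigQuery's FARM_FINGERPRINT, the Edge case class is hypothetical, and only the shape of the logic mirrors the labeled_candidates.sql and train_model.sql sources that follow in this diff.

```
// Sketch of the source_id-based train/eval split and the negative
// downsampling described above. Assumptions: MurmurHash3 stands in for
// BigQuery's FARM_FINGERPRINT, and Edge is a hypothetical record type.
import scala.util.Random
import scala.util.hashing.MurmurHash3

case class Edge(sourceId: Long, destinationId: Long, label: Int)

object TrainingSplitSketch {
  // Hash the source user, not the edge: every edge of a given user lands on
  // the same side of the split, so one user's behaviour cannot leak from
  // train into eval (the role of DATA_SPLIT_COL = 'if_eval').
  def isEval(sourceId: Long, evalBuckets: Int = 10): Boolean =
    (MurmurHash3.stringHash(sourceId.toString).toLong.abs % evalBuckets) == 0

  // Keep a negative with probability positiveRate and a positive with
  // probability (1 - positiveRate); in expectation this yields a 50-50 mix.
  def downsample(edges: Seq[Edge], positiveRate: Double, rng: Random): Seq[Edge] =
    edges.filter { e =>
      if (e.label == 0) rng.nextDouble() < positiveRate
      else rng.nextDouble() < (1 - positiveRate)
    }

  def main(args: Array[String]): Unit = {
    val rng = new Random(42)
    // 10k edges over 500 users, 5% positive
    val edges = (1L to 10000L).map(i => Edge(i % 500, i, if (i % 20 == 0) 1 else 0))
    val positiveRate = edges.count(_.label == 1).toDouble / edges.size // 0.05
    val balanced = downsample(edges, positiveRate, rng)
    val (eval, train) = balanced.partition(e => isEval(e.sourceId))
    println(s"positives kept: ${balanced.count(_.label == 1)} of ${balanced.size}")
    println(s"train=${train.size} eval=${eval.size}") // roughly 9:1, split by user
  }
}
```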
\ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx new file mode 100644 index 000000000..6b4c42926 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql deleted file mode 100644 index 8c47b8184..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/candidates.sql +++ /dev/null @@ -1,18 +0,0 @@ --- get latest partition of candidates with data -DECLARE date_candidates DATE; -SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); - -CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.candidates_sampled` AS -SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` LIMIT 100; - --- remove previous output snapshot (if exists) to avoid double-writing -DELETE -FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` -WHERE ds = date_candidates; - --- sample from candidates table instead of recomputing features -INSERT INTO `twttr-recos-ml-prod.realgraph.candidates_sampled` -SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` -WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(source_id, '_', destination_id))), 100) = $mod_remainder$ -AND ds = date_candidates; - diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx new file mode 100644 index 000000000..fd4412aad Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql b/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql deleted file mode 100644 index 5cb380b4f..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/check_candidates_exist.sql +++ /dev/null @@ -1,43 +0,0 @@ -DECLARE date_start, date_end DATE; -SET date_end = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); -SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY); - -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates_for_training` -PARTITION BY ds -AS -WITH T1 AS ( - SELECT source_id, destination_id, label, dateHour - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - LEFT JOIN UNNEST(labels) AS label - WHERE DATE(dateHour) BETWEEN date_start AND date_end -), T2 AS ( - SELECT source_id, destination_id, num_tweets - FROM `twttr-recos-ml-prod.realgraph.tweeting_follows` -), T3 AS ( -SELECT -COALESCE(T1.source_id, T2.source_id) AS source_id, -COALESCE(T1.destination_id, T2.destination_id) AS destination_id, -COUNT(DISTINCT(T1.dateHour)) AS num_days, -MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same -COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction, -COUNT(DISTINCT(label)) AS label_types, -COUNTIF(label="num_follows") AS num_follows, -COUNTIF(label="num_favorites") AS num_favorites, -COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks, -COUNTIF(label="num_profile_views") AS num_profile_views, -FROM T1 -FULL JOIN T2 -USING (source_id, destination_id) -LEFT JOIN `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` N -USING (source_id, destination_id) -WHERE N.source_id 
IS NULL AND N.destination_id IS NULL -GROUP BY 1,2 -ORDER BY 3 DESC,4 DESC -), T4 AS ( - SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, * - FROM T3 -) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000; - -SELECT ds FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` -WHERE ds = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))) -LIMIT 1 diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx new file mode 100644 index 000000000..e816d3e90 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql b/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql deleted file mode 100644 index 20a372b4a..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/check_labels_exist.sql +++ /dev/null @@ -1,4 +0,0 @@ -SELECT dateHour FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` -WHERE dateHour = (SELECT TIMESTAMP_ADD(TIMESTAMP_MILLIS($start_time$), INTERVAL 1 DAY)) -LIMIT 1 - diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx new file mode 100644 index 000000000..c636b043c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql b/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql deleted file mode 100644 index 4230ee5c5..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/labeled_candidates.sql +++ /dev/null @@ -1,67 +0,0 @@ --- date_labels is 1 day after date_candidates (which is the current batch run's start date) -DECLARE date_candidates, date_labels DATE; -DECLARE positive_rate FLOAT64; -SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$))); -SET date_labels = DATE_ADD(date_candidates, INTERVAL 1 DAY); - -CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` AS -SELECT - 0 AS source_id, - 1 AS destination_id, - 1 AS label, - 1 AS num_days, - 1 AS num_tweets, - 1 AS num_follows, - 1 AS num_favorites, - 1 AS num_tweet_clicks, - 1 AS num_profile_views, - 1 AS days_since_last_interaction, - 1 AS label_types, - DATE("2023-01-08") AS ds; - --- delete any prior data to avoid double writing -DELETE -FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WHERE ds = date_candidates; - --- join labels with candidates with 1 day attribution delay and insert new segment -INSERT INTO `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WITH label_positive AS ( - SELECT source_id, destination_id - FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily` - WHERE DATE(dateHour)=date_labels -), label_negative AS ( - SELECT source_id, destination_id - FROM `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` -) SELECT - F.source_id, - F.destination_id, - CASE WHEN P.source_id IS NULL THEN 0 ELSE 1 END AS label, - num_days, - num_tweets, - num_follows, - num_favorites, - num_tweet_clicks, - num_profile_views, - days_since_last_interaction, - label_types, - date_candidates AS ds -FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` F -LEFT JOIN 
label_positive P USING(source_id, destination_id) -LEFT JOIN label_negative N USING(source_id, destination_id) -WHERE N.source_id IS NULL AND N.destination_id IS NULL -AND F.ds=date_candidates -; - --- get positive rate -SET positive_rate = -(SELECT SUM(label)/COUNT(label) AS pct_positive -FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -); - --- create training dataset with negative downsampling (should get ~50-50 split) --- this spans over the cumulative date range of the labeled candidates table. -CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.train$table_suffix$` AS -SELECT * FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` -WHERE CASE WHEN label = 0 AND RAND() < positive_rate THEN true WHEN label = 1 AND RAND() < (1-positive_rate) THEN true ELSE false END -; diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx new file mode 100644 index 000000000..4da040141 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.docx differ diff --git a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql b/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql deleted file mode 100644 index c7a5df501..000000000 --- a/src/scala/com/twitter/interaction_graph/bqe/training/train_model.sql +++ /dev/null @@ -1,27 +0,0 @@ -CREATE OR REPLACE MODEL `twttr-recos-ml-prod.realgraph.prod$table_suffix$` -OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER', - BOOSTER_TYPE = 'GBTREE', - NUM_PARALLEL_TREE = 1, - MAX_ITERATIONS = 20, - TREE_METHOD = 'HIST', - EARLY_STOP = TRUE, - SUBSAMPLE = 0.01, - INPUT_LABEL_COLS = ['label'], - DATA_SPLIT_METHOD = 'CUSTOM', - DATA_SPLIT_COL = 'if_eval') -AS SELECT - label, - source_id, - destination_id, - num_days, - num_tweets, - num_follows, - num_favorites, - num_tweet_clicks, - num_profile_views, - days_since_last_interaction, - label_types, - -- partition train/test by source_id's - IF(MOD(ABS(FARM_FINGERPRINT(CAST(source_id AS STRING))), 10) = 0, true, false) AS if_eval, -FROM `twttr-recos-ml-prod.realgraph.train$table_suffix$` -; diff --git a/src/scala/com/twitter/interaction_graph/injection/BUILD b/src/scala/com/twitter/interaction_graph/injection/BUILD deleted file mode 100644 index 3e9d55ccf..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -scala_library( - name = "user_session_inj", - sources = ["UserSessionInjection.scala"], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/bijection:scrooge", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/thrift/com/twitter/user_session_store:thrift-scala", - ], -) - -scala_library( - name = "edge_list_injection", - sources = ["EdgeListInjection.scala"], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/bijection:scrooge", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/injection/BUILD.docx b/src/scala/com/twitter/interaction_graph/injection/BUILD.docx new file mode 100644 index 000000000..3a4ab2c90 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/BUILD.docx differ diff --git 
a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx new file mode 100644 index 000000000..1970d5d0f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.docx differ diff --git a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala b/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala deleted file mode 100644 index c03ad097c..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/EdgeListInjection.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.injection - -import com.twitter.interaction_graph.thriftscala.EdgeList -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift - -object EdgeListInjection { - final val injection: KeyValInjection[Long, EdgeList] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(EdgeList) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx new file mode 100644 index 000000000..9fd0a4d8d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.docx differ diff --git a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala b/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala deleted file mode 100644 index f6c84e184..000000000 --- a/src/scala/com/twitter/interaction_graph/injection/UserSessionInjection.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.injection - -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian - -object UserSessionInjection { - final val injection: KeyValInjection[Long, UserSession] = - KeyValInjection( - Long2BigEndian, - ScalaCompactThrift(UserSession) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/README.docx b/src/scala/com/twitter/interaction_graph/scio/README.docx new file mode 100644 index 000000000..7de6543b4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/README.md b/src/scala/com/twitter/interaction_graph/scio/README.md deleted file mode 100644 index c7ef6d713..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Interaction Graph - -This folder contains the code used in the offline pipeline for real graph v2. - -The ETL jobs are contained in folders prefaced with `agg_*`, while the jobs powering the ml pipeline are in the ml folder. - -Note that the jobs in the ml folder are mostly ETL jobs; the main training and scoring happens within BQML. 
diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD deleted file mode 100644 index 3f7e0491e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD +++ /dev/null @@ -1,62 +0,0 @@ -scala_library( - name = "agg_address_book", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_address_book_edge_snapshot-scala", - ":interaction_graph_agg_address_book_vertex_snapshot-scala", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "addressbook/jobs/src/main/scala/com/twitter/addressbook/jobs/simplematches:simple_user_matches-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - ], -) - -jvm_binary( - name = "interaction_graph_address_book_scio", - main = "com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAddressBookJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_address_book", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_address_book_edge_snapshot", - description = "User-user directed edges with addressbook features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_address_book_vertex_snapshot", - description = "User vertex with addressbook features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx new file mode 100644 index 000000000..94ce50873 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx new file mode 100644 index 000000000..495967902 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala deleted file mode 100644 index 0d57c4cae..000000000 --- 
a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookCounters.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioMetrics -import org.apache.beam.sdk.metrics.Counter - -trait InteractionGraphAddressBookCountersTrait { - val Namespace = "Interaction Graph Address Book" - - def emailFeatureInc(): Unit - - def phoneFeatureInc(): Unit - - def bothFeatureInc(): Unit -} - -/** - * SCIO counters are used to gather run time statistics - */ -case object InteractionGraphAddressBookCounters extends InteractionGraphAddressBookCountersTrait { - val emailFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Email Feature") - - val phoneFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Phone Feature") - - val bothFeatureCounter: Counter = - ScioMetrics.counter(Namespace, "Both Feature") - - override def emailFeatureInc(): Unit = emailFeatureCounter.inc() - - override def phoneFeatureInc(): Unit = phoneFeatureCounter.inc() - - override def bothFeatureInc(): Unit = bothFeatureCounter.inc() -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx new file mode 100644 index 000000000..3cc523b66 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala deleted file mode 100644 index 360b52cee..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookJob.scala +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import java.time.Instant -import org.joda.time.Interval - -object InteractionGraphAddressBookJob extends ScioBeamJob[InteractionGraphAddressBookOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAddressBookOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - implicit lazy val addressBookCounters: InteractionGraphAddressBookCountersTrait = - InteractionGraphAddressBookCounters - - val interactionGraphAddressBookSource = InteractionGraphAddressBookSource(pipelineOptions) - - val addressBook: SCollection[UserMatchesRecord] = - interactionGraphAddressBookSource.readSimpleUserMatches( - dateInterval.withStart(dateInterval.getStart.minusDays(3)) - ) - val (vertex, edges) = InteractionGraphAddressBookUtil.process(addressBook) - - val dalEnvironment: String = pipelineOptions - 
.as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.writeSnapshot[Vertex]( - InteractionGraphAggAddressBookVertexSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_vertex_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 16.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.writeSnapshot[Edge]( - InteractionGraphAggAddressBookEdgeSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_edge_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx new file mode 100644 index 000000000..f44fc3f62 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala deleted file mode 100644 index b5c34e94c..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAddressBookOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx new file mode 100644 index 000000000..3d597ff65 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala deleted file mode 100644 index 66e3903bc..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookSource.scala +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.jobs.simplematches.SimpleUserMatchesScalaDataset -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.cde.scio.dal_read.SourceUtil -import org.joda.time.Interval - -case class InteractionGraphAddressBookSource( - pipelineOptions: InteractionGraphAddressBookOption -)( - implicit sc: ScioContext, -) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readSimpleUserMatches( - dateInterval: Interval - ): SCollection[UserMatchesRecord] = { - SourceUtil.readMostRecentSnapshotDALDataset[UserMatchesRecord]( - SimpleUserMatchesScalaDataset, - dateInterval, - dalEnvironment) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx new file mode 100644 index 000000000..5e8c08be9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala deleted file mode 100644 index fc5898ce0..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/InteractionGraphAddressBookUtil.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_address_book - -import com.spotify.scio.values.SCollection -import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex - -object InteractionGraphAddressBookUtil { - val EMAIL = "email" - val PHONE = "phone" - val BOTH = "both" - - val DefaultAge = 1 - val DefaultFeatureValue = 1.0 - - def process( - addressBook: SCollection[UserMatchesRecord] - )( - implicit addressBookCounters: InteractionGraphAddressBookCountersTrait - ): (SCollection[Vertex], SCollection[Edge]) = { - // First construct the data as ((src, dst), name) tuples, where name can be "email", "phone", or "both" - val addressBookTypes: SCollection[((Long, Long), String)] = addressBook.flatMap { record => - record.forwardMatches.toSeq.flatMap { matchDetails => - val matchedUsers = (record.userId, matchDetails.userId) - (matchDetails.matchedByEmail, matchDetails.matchedByPhone) match { - case (true, true) => - Seq((matchedUsers, EMAIL), (matchedUsers, PHONE), (matchedUsers, BOTH)) - case (true, false) => Seq((matchedUsers, EMAIL)) - case (false, true) => Seq((matchedUsers, PHONE)) - case _ => Seq.empty - } - } - } - - // Then construct the input data for feature calculation - val addressBookFeatureInput: SCollection[InteractionGraphRawInput] = addressBookTypes - .map { - case ((src, dst), name) => - if (src < dst) - ((src, dst, name), false) - else - ((dst, src, name), true) - }.groupByKey - .flatMap { - case ((src, dst, name), iterator) => - val isReversedValues = iterator.toSeq - // check if (src, dst) is a mutual match - val isMutualFollow = isReversedValues.size == 2 - // get correct srcId and dstId if there is no mutual match and they are reversed - val (srcId, dstId) = { - if (!isMutualFollow && isReversedValues.head) - (dst, src) - else - (src, dst) - } - // get the feature name and the mutual-edge feature name - val (featureName, mfFeatureName) = name match { - case EMAIL => - addressBookCounters.emailFeatureInc() - (FeatureName.AddressBookEmail, FeatureName.AddressBookMutualEdgeEmail) - case PHONE => - addressBookCounters.phoneFeatureInc() - (FeatureName.AddressBookPhone, FeatureName.AddressBookMutualEdgePhone) - case BOTH => - addressBookCounters.bothFeatureInc() - (FeatureName.AddressBookInBoth, FeatureName.AddressBookMutualEdgeInBoth) - } - // construct the raw inputs for feature calculation - if (isMutualFollow) { - Iterator( - InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue), - InteractionGraphRawInput(dstId, srcId, featureName, DefaultAge, DefaultFeatureValue), - InteractionGraphRawInput( - srcId, - dstId, - mfFeatureName, - DefaultAge, - DefaultFeatureValue), - InteractionGraphRawInput(dstId, srcId, mfFeatureName, DefaultAge, DefaultFeatureValue) - ) - } else { - Iterator( - InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue)) - } - } - - // Calculate the Features - FeatureGeneratorUtil.getFeatures(addressBookFeatureInput) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx new file mode 100644 index 000000000..9c80b4658 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md deleted file mode 100644 index 4d895c71d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_address_book/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAddressBook Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Compile -``` -./bazel build
src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-address-book-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_address_book/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_address_book_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD deleted file mode 100644 index 61dc35906..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD +++ /dev/null @@ -1,175 +0,0 @@ -scala_library( - name = "agg_all", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_history_aggregated_raw_edge_daily-scala", - ":interaction_graph_history_aggregated_vertex_daily-scala", - ":interaction_graph_aggregated_edge_daily-scala", - ":interaction_graph_aggregated_vertex_daily-scala", - ":interaction_graph_history_aggregated_edge_snapshot-scala", - ":interaction_graph_history_aggregated_vertex_snapshot-scala", - ":real_graph_features-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_edge_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_vertex_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_vertex_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_vertex_daily-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_edge_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_vertex_snapshot-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_in_scores-scala", - "src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_oon_scores-scala", - "src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala", - "src/thrift/com/twitter/wtf/candidate:wtf-candidate-scala", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - ], -) - 
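Each `create_datasets` stanza that follows appears to generate a `<base_name>-scala` target (the same targets the `scala_library` above lists as dependencies) together with a `<BaseName>ScalaDataset` object that the jobs read and write through DAL. As a hedged sketch of the read side, here is the pattern that `InteractionGraphAggregationSource.readDALDataset` uses later in this diff, specialized to a time-partitioned edge dataset; the helper name is illustrative, the calls and signatures are taken from that source file:

```scala
import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection
import com.twitter.beam.io.dal.DAL
import com.twitter.beam.io.dal.DAL.ReadOptions
import com.twitter.dal.client.dataset.TimePartitionedDALDataset
import com.twitter.interaction_graph.thriftscala.Edge
import com.twitter.statebird.v2.thriftscala.Environment
import org.joda.time.Interval

// Illustrative helper: read one day range of a daily edge dataset generated by a
// create_datasets stanza below (e.g. InteractionGraphAggregatedEdgeDailyScalaDataset).
def readEdgeDaily(
  dataset: TimePartitionedDALDataset[Edge],
  interval: Interval,
  dalEnvironment: String
)(
  implicit sc: ScioContext
): SCollection[Edge] =
  sc.customInput(
    s"Reading ${dataset.role.name}.${dataset.logicalName}",
    DAL.read[Edge](
      dataset = dataset,
      interval = interval,
      environmentOverride = Environment.valueOf(dalEnvironment),
      readOptions = ReadOptions(None) // no column projection
    )
  )
```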
-create_datasets( - base_name = "interaction_graph_history_aggregated_raw_edge_daily", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_vertex_daily", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -jvm_binary( - name = "interaction_graph_aggregation_job_scio", - main = "com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_all", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_edge_snapshot", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_history_aggregated_vertex_snapshot", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_aggregated_edge_daily", - description = "User-user directed edges with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_aggregated_vertex_daily", - description = "User vertex with all features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = 
[ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "real_graph_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.user_session_store.thriftscala.UserSession", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:user_session_inj", - ], -) - -create_datasets( - base_name = "home_light_ranker_top_k_real_graph_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.EdgeListInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.interaction_graph.thriftscala.EdgeList", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:edge_list_injection", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx new file mode 100644 index 000000000..b94077877 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx new file mode 100644 index 000000000..121702e9f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala deleted file mode 100644 index 2f9b0da57..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationConfig.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -object InteractionGraphScoringConfig { - - /** - * This is alpha for a variant of the Exponentially weighted moving average, computed as: - * ewma_{t+1} = x_{t+1} + (1-alpha) * ewma_t (ewma_1 = x_1, t > 0) - * We choose alpha such that the half life of weights is roughly 15 days (see the worked example below). - * Note that we don't down-weight x_{t+1} (unlike in EWMA) as we only want to decay actions - * as they grow old, not compute the average value. - * ONE_MINUS_ALPHA below is this (1 - alpha) daily decay factor applied to the accumulated history, - * while ALPHA = 1.0 is the undamped weight on the new value x_{t+1}.
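- * Worked example (illustrative): with the daily decay factor 0.955, an action from 7 days ago retains 0.955^7 ≈ 0.72 of its weight, and one from 15 days ago retains 0.955^15 ≈ 0.5, i.e. a half life of about 15 days.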
- */ - val ALPHA = 1.0 - val ONE_MINUS_ALPHA = 0.955 -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx new file mode 100644 index 000000000..6bde8f011 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala deleted file mode 100644 index 06942205d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationJob.scala +++ /dev/null @@ -1,314 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.google.cloud.bigquery.BigQueryOptions -import com.google.cloud.bigquery.QueryJobConfiguration -import com.spotify.scio.ScioContext -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.io.exception.DataNotFoundException -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationTransform._ -import com.twitter.interaction_graph.scio.common.DateUtil -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.util.Duration -import com.twitter.wtf.candidate.thriftscala.ScoredEdge -import java.time.Instant -import org.apache.avro.generic.GenericRecord -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead -import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord -import org.apache.beam.sdk.transforms.SerializableFunction -import org.joda.time.Interval -import scala.collection.JavaConverters._ - -object InteractionGraphAggregationJob extends ScioBeamJob[InteractionGraphAggregationOption] { - - // to parse latest date from the BQ table we're reading from - val parseDateRow = new SerializableFunction[SchemaAndRecord, String] { - override def apply(input: SchemaAndRecord): String = { - val genericRecord: GenericRecord = input.getRecord() - genericRecord.get("ds").toString - } - } - - // note that we're using the prob_explicit for real_graph_features (for Home) - val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] { - override def apply(record: SchemaAndRecord): ScoredEdge = { - val genericRecord: GenericRecord = record.getRecord() - ScoredEdge( - genericRecord.get("source_id").asInstanceOf[Long], - genericRecord.get("destination_id").asInstanceOf[Long], - genericRecord.get("prob_explicit").asInstanceOf[Double], - genericRecord.get("followed").asInstanceOf[Boolean], - ) - } - } - - override def runPipeline( - sc: ScioContext, - opts: InteractionGraphAggregationOption - ): Unit = { - - val dateStr: String = 
opts.getDate().value.getStart.toString("yyyyMMdd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - if (opts.getDALWriteEnvironment.toLowerCase == "prod") { - val bqClient = - BigQueryOptions.newBuilder.setProjectId(project).build.getService - val query = - s""" - |SELECT total_rows - |FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS` - |WHERE partition_id ="$dateStr" AND - |table_name="$bqTableName" AND total_rows > 0 - |""".stripMargin - val queryConfig = QueryJobConfiguration.of(query) - val results = bqClient.query(queryConfig).getValues.asScala.toSeq - if (results.isEmpty || results.head.get(0).getLongValue == 0) { - throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.") - } - } - sc.run() - } - - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAggregationOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - val yesterday = DateUtil.subtract(dateInterval, Duration.fromDays(1)) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - val dateStr: String = pipelineOptions.getDate().value.getStart.toString("yyyy-MM-dd") - logger.info(s"dateStr $dateStr") - val project: String = "twttr-recos-ml-prod" - val datasetName: String = "realgraph" - val bqTableName: String = "scores" - val fullBqTableName: String = s"$project:$datasetName.$bqTableName" - - val scoreExport: SCollection[ScoredEdge] = - sc.customInput( - s"Read from BQ table $fullBqTableName", - BigQueryIO - .read(parseRow) - .fromQuery(s"""SELECT source_id, destination_id, prob_explicit, followed - |FROM `$project.$datasetName.$bqTableName` - |WHERE ds = '$dateStr'""".stripMargin) - .usingStandardSql() - .withMethod(TypedRead.Method.DEFAULT) - ) - - val source = InteractionGraphAggregationSource(pipelineOptions) - - val (addressEdgeFeatures, addressVertexFeatures) = source.readAddressBookFeatures() - - val (clientEventLogsEdgeFeatures, clientEventLogsVertexFeatures) = - source.readClientEventLogsFeatures(dateInterval) - - val (flockEdgeFeatures, flockVertexFeatures) = source.readFlockFeatures() - - val (directInteractionsEdgeFeatures, directInteractionsVertexFeatures) = - source.readDirectInteractionsFeatures(dateInterval) - - val invalidUsers = UserUtil.getInvalidUsers(source.readFlatUsers()) - - val (prevAggEdge, prevAggVertex) = source.readAggregatedFeatures(yesterday) - - val prevAggregatedVertex: SCollection[Vertex] = - UserUtil - .filterUsersByIdMapping[Vertex]( - prevAggVertex, - invalidUsers, - v => v.userId - ) - - /** Remove status-based features (flock/ab) from current graph, because we only need the latest - * This is to allow us to filter and roll-up a smaller dataset, to which we will still add - * back the status-based features for the complete scoredAggregates (that other teams will read). 
- */ - val prevAggEdgeFiltered = prevAggEdge - .filter { e => - e.sourceId != e.destinationId - } - .withName("filtering status-based edges") - .flatMap(FeatureGeneratorUtil.removeStatusFeatures) - val prevAggEdgeValid: SCollection[Edge] = - UserUtil - .filterUsersByMultipleIdMappings[Edge]( - prevAggEdgeFiltered, - invalidUsers, - Seq(e => e.sourceId, e => e.destinationId) - ) - - val aggregatedActivityVertexDaily = UserUtil - .filterUsersByIdMapping[Vertex]( - FeatureGeneratorUtil - .combineVertexFeatures( - clientEventLogsVertexFeatures ++ - directInteractionsVertexFeatures ++ - addressVertexFeatures ++ - flockVertexFeatures - ), - invalidUsers, - v => v.userId - ) - - // we split up the roll-up of decayed counts between status vs activity/count-based features - val aggregatedActivityEdgeDaily = FeatureGeneratorUtil - .combineEdgeFeatures(clientEventLogsEdgeFeatures ++ directInteractionsEdgeFeatures) - - // Vertex level, Add the decay sum for history and daily - val aggregatedActivityVertex = FeatureGeneratorUtil - .combineVertexFeaturesWithDecay( - prevAggregatedVertex, - aggregatedActivityVertexDaily, - InteractionGraphScoringConfig.ONE_MINUS_ALPHA, - InteractionGraphScoringConfig.ALPHA - ) - - // Edge level, Add the decay sum for history and daily - val aggregatedActivityEdge = FeatureGeneratorUtil - .combineEdgeFeaturesWithDecay( - prevAggEdgeValid, - aggregatedActivityEdgeDaily, - InteractionGraphScoringConfig.ONE_MINUS_ALPHA, - InteractionGraphScoringConfig.ALPHA - ) - .filter(FeatureGeneratorUtil.edgeWithFeatureOtherThanDwellTime) - .withName("removing edges that only have dwell time features") - - val edgeKeyedScores = scoreExport.keyBy { e => (e.sourceId, e.destinationId) } - - val scoredAggregatedActivityEdge = aggregatedActivityEdge - .keyBy { e => (e.sourceId, e.destinationId) } - .withName("join with scores") - .leftOuterJoin(edgeKeyedScores) - .map { - case (_, (e, scoredEdgeOpt)) => - val scoreOpt = scoredEdgeOpt.map(_.score) - e.copy(weight = if (scoreOpt.nonEmpty) { - ScioMetrics.counter("after joining edge with scores", "has score").inc() - scoreOpt - } else { - ScioMetrics.counter("after joining edge with scores", "no score").inc() - None - }) - } - - val combinedFeatures = FeatureGeneratorUtil - .combineEdgeFeatures(aggregatedActivityEdge ++ addressEdgeFeatures ++ flockEdgeFeatures) - .keyBy { e => (e.sourceId, e.destinationId) } - - val aggregatedActivityScoredEdge = - edgeKeyedScores - .withName("join with combined edge features") - .leftOuterJoin(combinedFeatures) - .map { - case (_, (scoredEdge, combinedFeaturesOpt)) => - if (combinedFeaturesOpt.exists(_.features.nonEmpty)) { - ScioMetrics.counter("after joining scored edge with features", "has features").inc() - Edge( - sourceId = scoredEdge.sourceId, - destinationId = scoredEdge.destinationId, - weight = Some(scoredEdge.score), - features = combinedFeaturesOpt.map(_.features).getOrElse(Nil) - ) - } else { - ScioMetrics.counter("after joining scored edge with features", "no features").inc() - Edge( - sourceId = scoredEdge.sourceId, - destinationId = scoredEdge.destinationId, - weight = Some(scoredEdge.score), - features = Nil - ) - } - } - - val realGraphFeatures = - getTopKTimelineFeatures(aggregatedActivityScoredEdge, pipelineOptions.getMaxDestinationIds) - - aggregatedActivityVertex.saveAsCustomOutput( - "Write History Aggregated Vertex Records", - DAL.writeSnapshot[Vertex]( - dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset, - pathLayout = 
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex"), - endDate = Instant.ofEpochMilli(dateInterval.getEndMillis), - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10)) - ) - ) - - scoredAggregatedActivityEdge.saveAsCustomOutput( - "Write History Aggregated Edge Records", - DAL.writeSnapshot[Edge]( - dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset, - pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_raw_edge"), - endDate = Instant.ofEpochMilli(dateInterval.getEndMillis), - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - aggregatedActivityVertexDaily.saveAsCustomOutput( - "Write Daily Aggregated Vertex Records", - DAL.write[Vertex]( - dataset = InteractionGraphAggregatedVertexDailyScalaDataset, - pathLayout = - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex_daily"), - interval = dateInterval, - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10)) - ) - ) - - aggregatedActivityEdgeDaily.saveAsCustomOutput( - "Write Daily Aggregated Edge Records", - DAL.write[Edge]( - dataset = InteractionGraphAggregatedEdgeDailyScalaDataset, - pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_edge_daily"), - interval = dateInterval, - diskFormat = DiskFormat.Parquet, - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - realGraphFeatures.saveAsCustomOutput( - "Write Timeline Real Graph Features", - DAL.writeVersionedKeyVal[KeyVal[Long, UserSession]]( - dataset = RealGraphFeaturesScalaDataset, - pathLayout = - PathLayout.VersionedPath(pipelineOptions.getOutputPath + "/real_graph_features"), - environmentOverride = Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx new file mode 100644 index 000000000..2eac1d68c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala deleted file mode 100644 index 94e7ffae6..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationOption.scala +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggregationOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def 
setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit - - @Description("BQ Table name for reading scores from") - def getBqTableName: String - def setBqTableName(value: String): Unit - - @Description("max destination ids that we will store for real graph features in TL") - def getMaxDestinationIds: Integer - def setMaxDestinationIds(value: Integer): Unit - - @Description("true if getting scores from BQ instead of DAL-based dataset in GCS") - def getScoresFromBQ: Boolean - def setScoresFromBQ(value: Boolean): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx new file mode 100644 index 000000000..eddb208e9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala deleted file mode 100644 index b1ea8ff05..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationSource.scala +++ /dev/null @@ -1,182 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.ReadOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.dal.client.dataset.SnapshotDALDatasetBase -import com.twitter.dal.client.dataset.TimePartitionedDALDataset -import com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAggAddressBookEdgeSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAggAddressBookVertexSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsVertexDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsVertexDailyScalaDataset -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockEdgeSnapshotScalaDataset -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockVertexSnapshotScalaDataset -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser -import com.twitter.util.Duration -import org.joda.time.Interval - -case class InteractionGraphAggregationSource( - pipelineOptions: InteractionGraphAggregationOption -)( - implicit sc: ScioContext) { - val dalEnvironment: String = pipelineOptions - 
.as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readDALDataset[T: Manifest]( - dataset: TimePartitionedDALDataset[T], - interval: Interval, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading ${dataset.role.name}.${dataset.logicalName}", - DAL.read[T]( - dataset = dataset, - interval = interval, - environmentOverride = Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readMostRecentSnapshotDALDataset[T: Manifest]( - dataset: SnapshotDALDatasetBase[T], - dateInterval: Interval, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshot[T]( - dataset, - dateInterval, - Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readMostRecentSnapshotNoOlderThanDALDataset[T: Manifest]( - dataset: SnapshotDALDatasetBase[T], - noOlderThan: Duration, - dalEnvironment: String, - projections: Option[Seq[String]] = None - )( - implicit sc: ScioContext, - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshotNoOlderThan[T]( - dataset, - noOlderThan, - environmentOverride = Environment.valueOf(dalEnvironment), - readOptions = ReadOptions(projections) - ) - ) - } - - def readAddressBookFeatures(): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotNoOlderThanDALDataset[Edge]( - dataset = InteractionGraphAggAddressBookEdgeSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - val vertex = readMostRecentSnapshotNoOlderThanDALDataset[Vertex]( - dataset = InteractionGraphAggAddressBookVertexSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - (edges, vertex) - } - - def readClientEventLogsFeatures( - dateInterval: Interval - ): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readDALDataset[Edge]( - dataset = InteractionGraphAggClientEventLogsEdgeDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - val vertex = readDALDataset[Vertex]( - dataset = InteractionGraphAggClientEventLogsVertexDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - (edges, vertex) - } - - def readDirectInteractionsFeatures( - dateInterval: Interval - ): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readDALDataset[Edge]( - dataset = InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - val vertex = readDALDataset[Vertex]( - dataset = InteractionGraphAggDirectInteractionsVertexDailyScalaDataset, - dalEnvironment = dalEnvironment, - interval = dateInterval - ) - - (edges, vertex) - } - - def readFlockFeatures(): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotNoOlderThanDALDataset[Edge]( - dataset = InteractionGraphAggFlockEdgeSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - ) - - val vertex = readMostRecentSnapshotNoOlderThanDALDataset[Vertex]( - dataset = InteractionGraphAggFlockVertexSnapshotScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - 
) - - (edges, vertex) - } - - def readAggregatedFeatures(dateInterval: Interval): (SCollection[Edge], SCollection[Vertex]) = { - val edges = readMostRecentSnapshotDALDataset[Edge]( - dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset, - dalEnvironment = dalEnvironment, - dateInterval = dateInterval - ) - - val vertex = readMostRecentSnapshotDALDataset[Vertex]( - dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset, - dalEnvironment = dalEnvironment, - dateInterval = dateInterval - ) - - (edges, vertex) - } - - def readFlatUsers(): SCollection[FlatUser] = - readMostRecentSnapshotNoOlderThanDALDataset[FlatUser]( - dataset = UsersourceFlatScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment, - projections = Some(Seq("id", "valid_user")) - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx new file mode 100644 index 000000000..4e0d0678e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala b/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala deleted file mode 100644 index c76592c10..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/InteractionGraphAggregationTransform.scala +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_all - -import collection.JavaConverters._ -import com.spotify.scio.values.SCollection -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.interaction_graph.scio.common.GraphUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeatures -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeaturesTest -import com.twitter.timelines.real_graph.v1.thriftscala.{RealGraphFeatures => RealGraphFeaturesV1} -import com.twitter.user_session_store.thriftscala.UserSession -import com.twitter.interaction_graph.scio.common.ConversionUtil._ - -object InteractionGraphAggregationTransform { - val ordering: Ordering[Edge] = Ordering.by(-_.weight.getOrElse(0.0)) - - // converts our Edge thrift into timelines' thrift - def getTopKTimelineFeatures( - scoredAggregatedEdge: SCollection[Edge], - maxDestinationIds: Int - ): SCollection[KeyVal[Long, UserSession]] = { - scoredAggregatedEdge - .filter(_.weight.exists(_ > 0)) - .keyBy(_.sourceId) - .groupByKey - .map { - case (sourceId, edges) => - val (inEdges, outEdges) = edges.partition(GraphUtil.isFollow) - val inTopK = - if (inEdges.isEmpty) Nil - else { - val inTopKQueue = - new PriorityQueueMonoid[Edge](maxDestinationIds)(ordering) - inTopKQueue - .build(inEdges).iterator().asScala.toList.flatMap( - toRealGraphEdgeFeatures(hasTimelinesRequiredFeatures)) - } - val outTopK = - if (outEdges.isEmpty) Nil - else { - val outTopKQueue = - new PriorityQueueMonoid[Edge](maxDestinationIds)(ordering) - outTopKQueue - .build(outEdges).iterator().asScala.toList.flatMap( - toRealGraphEdgeFeatures(hasTimelinesRequiredFeatures)) - } - KeyVal( - sourceId, - UserSession( - userId = Some(sourceId), - realGraphFeatures = Some(RealGraphFeatures.V1(RealGraphFeaturesV1(inTopK, outTopK))), - 
realGraphFeaturesTest = - Some(RealGraphFeaturesTest.V1(RealGraphFeaturesV1(inTopK, outTopK))) - ) - ) - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx new file mode 100644 index 000000000..130f33aeb Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md deleted file mode 100644 index cedf39b12..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_all/README.md +++ /dev/null @@ -1,38 +0,0 @@ -## InteractionGraphAggregationJob Dataflow Job - -This job aggregates the previous day's history with today's activities, and outputs an updated -history. This history is joined with the explicit scores from real graph's BQML pipeline, and -exported as features for timelines (which is why we're using their thrift). - -#### IntelliJ -``` -fastpass create --name rg_agg_all --intellij src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_all:interaction_graph_aggregation_job_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-aggregation-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_all/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-11-08 \ - --bind=profile.output_path=processed/interaction_graph_aggregation_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD deleted file mode 100644 index 9c14f4d38..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD +++ /dev/null @@ -1,61 +0,0 @@ -scala_library( - name = "agg_client_event_logs", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_client_event_logs_edge_daily-scala", - ":interaction_graph_agg_client_event_logs_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/scalding/jobs/client_event_processing:user_interaction-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_client_event_logs_scio", - main = "com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphClientEventLogsJob", - platform = "java8", - dependencies = [ - ":agg_client_event_logs", - ], -) - -create_datasets( - base_name = 
"interaction_graph_agg_client_event_logs_edge_daily", - description = "User-user directed edges with client events features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_client_event_logs_vertex_daily", - description = "User vertex with client events features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx new file mode 100644 index 000000000..4a80a6b86 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx new file mode 100644 index 000000000..872c12777 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala deleted file mode 100644 index cc9793ba8..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsCounters.scala +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioMetrics - -trait InteractionGraphClientEventLogsCountersTrait { - val Namespace = "Interaction Graph Client Event Logs" - def profileViewFeaturesInc(): Unit - def linkOpenFeaturesInc(): Unit - def tweetClickFeaturesInc(): Unit - def tweetImpressionFeaturesInc(): Unit - def catchAllInc(): Unit -} - -case object InteractionGraphClientEventLogsCounters - extends InteractionGraphClientEventLogsCountersTrait { - - val profileViewCounter = ScioMetrics.counter(Namespace, "Profile View Features") - val linkOpenCounter = ScioMetrics.counter(Namespace, "Link Open Features") - val tweetClickCounter = ScioMetrics.counter(Namespace, "Tweet Click Features") - val tweetImpressionCounter = ScioMetrics.counter(Namespace, "Tweet Impression Features") - val catchAllCounter = ScioMetrics.counter(Namespace, "Catch All") - - override def profileViewFeaturesInc(): Unit = profileViewCounter.inc() - - override def linkOpenFeaturesInc(): Unit = linkOpenCounter.inc() - - override def tweetClickFeaturesInc(): Unit = tweetClickCounter.inc() - - override def tweetImpressionFeaturesInc(): Unit = 
tweetImpressionCounter.inc() - - override def catchAllInc(): Unit = catchAllCounter.inc() -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx new file mode 100644 index 000000000..2183163b9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala deleted file mode 100644 index 1a12b33d9..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsJob.scala +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioContext -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import org.joda.time.Interval - -object InteractionGraphClientEventLogsJob - extends ScioBeamJob[InteractionGraphClientEventLogsOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphClientEventLogsOption - ): Unit = { - - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val jobCounters: InteractionGraphClientEventLogsCountersTrait = - InteractionGraphClientEventLogsCounters - - lazy val dateInterval: Interval = pipelineOptions.interval - - val sources = InteractionGraphClientEventLogsSource(pipelineOptions) - - val userInteractions = sources.readUserInteractions(dateInterval) - val rawUsers = sources.readCombinedUsers() - val safeUsers = UserUtil.getValidUsers(rawUsers) - - val (vertex, edges) = InteractionGraphClientEventLogsUtil.process(userInteractions, safeUsers) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.write[Vertex]( - InteractionGraphAggClientEventLogsVertexDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_client_event_logs_vertex_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 32.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggClientEventLogsEdgeDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_client_event_logs_edge_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - } -} 
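Both metrics objects in this diff (`InteractionGraphAddressBookCounters` earlier and `InteractionGraphClientEventLogsCounters` here) hide `ScioMetrics` behind a small trait, and the jobs take that trait implicitly, which keeps the aggregation logic testable: a unit test can swap in a fake instead of touching Beam's metrics machinery. A hypothetical test double, not part of this change, to illustrate the design:

```scala
// Illustrative in-memory fake; the trait and its method names come from
// InteractionGraphClientEventLogsCounters above, the fake itself is hypothetical.
class FakeClientEventLogsCounters extends InteractionGraphClientEventLogsCountersTrait {
  var profileViews = 0
  var linkOpens = 0
  var tweetClicks = 0
  var tweetImpressions = 0
  var catchAll = 0

  override def profileViewFeaturesInc(): Unit = profileViews += 1
  override def linkOpenFeaturesInc(): Unit = linkOpens += 1
  override def tweetClickFeaturesInc(): Unit = tweetClicks += 1
  override def tweetImpressionFeaturesInc(): Unit = tweetImpressions += 1
  override def catchAllInc(): Unit = catchAll += 1
}
```

A test can bring an instance into implicit scope before calling, say, `InteractionGraphClientEventLogsUtil.process` (which accepts the trait implicitly, as shown further down this diff) and then assert on the recorded counts.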
diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx new file mode 100644 index 000000000..a6bf3d26b Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala deleted file mode 100644 index 7a07a6913..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphClientEventLogsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx new file mode 100644 index 000000000..ca3e847f7 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala deleted file mode 100644 index 1cf2da318..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsSource.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.util.Duration -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.wtf.scalding.client_event_processing.thriftscala.UserInteraction -import com.twitter.wtf.scalding.jobs.client_event_processing.UserInteractionScalaDataset -import org.joda.time.Interval - -case class InteractionGraphClientEventLogsSource( - pipelineOptions: InteractionGraphClientEventLogsOption -)( - implicit sc: ScioContext) { - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def 
readUserInteractions(dateInterval: Interval): SCollection[UserInteraction] = { - - SourceUtil.readDALDataset[UserInteraction]( - dataset = UserInteractionScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - } - - def readCombinedUsers(): SCollection[CombinedUser] = { - - SourceUtil.readMostRecentSnapshotNoOlderThanDALDataset[CombinedUser]( - dataset = UsersourceScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx new file mode 100644 index 000000000..0ebc65fc1 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala deleted file mode 100644 index 521a1f07f..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/InteractionGraphClientEventLogsUtil.scala +++ /dev/null @@ -1,137 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_client_event_logs - -import com.spotify.scio.values.SCollection -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.FeatureKey -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionDetails -import com.twitter.wtf.scalding.client_event_processing.thriftscala.InteractionType -import com.twitter.wtf.scalding.client_event_processing.thriftscala.UserInteraction - -object InteractionGraphClientEventLogsUtil { - - val DefaultAge = 1 - val DefaultFeatureValue = 1.0 - - def process( - userInteractions: SCollection[UserInteraction], - safeUsers: SCollection[Long] - )( - implicit jobCounters: InteractionGraphClientEventLogsCountersTrait - ): (SCollection[Vertex], SCollection[Edge]) = { - - val unfilteredFeatureInput = userInteractions - .flatMap { - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.ProfileClickDetails(profileClick)) - if interactionType == InteractionType.ProfileClicks && userId != profileClick.profileId => - jobCounters.profileViewFeaturesInc() - Seq( - FeatureKey( - userId, - profileClick.profileId, - FeatureName.NumProfileViews) -> DefaultFeatureValue - ) - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.TweetClickDetails(tweetClick)) - if interactionType == InteractionType.TweetClicks && - Some(userId) != tweetClick.authorId => - ( - for { - authorId <- tweetClick.authorId - } yield { - jobCounters.tweetClickFeaturesInc() - FeatureKey(userId, authorId, FeatureName.NumTweetClicks) -> DefaultFeatureValue - - } - ).toSeq - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.LinkClickDetails(linkClick)) - if interactionType == InteractionType.LinkClicks && - Some(userId) != linkClick.authorId => - ( - for { - authorId <- linkClick.authorId - } yield { - jobCounters.linkOpenFeaturesInc() - 
FeatureKey(userId, authorId, FeatureName.NumLinkClicks) -> DefaultFeatureValue - } - ).toSeq - - case UserInteraction( - userId, - _, - interactionType, - InteractionDetails.TweetImpressionDetails(tweetImpression)) - if interactionType == InteractionType.TweetImpressions && - Some(userId) != tweetImpression.authorId => - ( - for { - authorId <- tweetImpression.authorId - dwellTime <- tweetImpression.dwellTimeInSec - } yield { - jobCounters.tweetImpressionFeaturesInc() - Seq( - FeatureKey( - userId, - authorId, - FeatureName.NumInspectedStatuses) -> DefaultFeatureValue, - FeatureKey(userId, authorId, FeatureName.TotalDwellTime) -> dwellTime.toDouble - ) - } - ).getOrElse(Nil) - - case _ => - jobCounters.catchAllInc() - Nil - } - .sumByKey - .collect { - case (FeatureKey(srcId, destId, featureName), featureValue) => - InteractionGraphRawInput( - src = srcId, - dst = destId, - name = featureName, - age = 1, - featureValue = featureValue - ) - } - - val filteredFeatureInput = filterForSafeUsers(unfilteredFeatureInput, safeUsers) - - // Calculate the Features - FeatureGeneratorUtil.getFeatures(filteredFeatureInput) - - } - - private def filterForSafeUsers( - featureInput: SCollection[InteractionGraphRawInput], - safeUsers: SCollection[Long] - ): SCollection[InteractionGraphRawInput] = { - - featureInput - .keyBy(_.src) - .withName("Filter out unsafe users") - .intersectByKey(safeUsers) - .values // Fetch only InteractionGraphRawInput - .keyBy(_.dst) - .withName("Filter out unsafe authors") - .intersectByKey(safeUsers) - .values // Fetch only InteractionGraphRawInput - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx new file mode 100644 index 000000000..a8b0f994c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md deleted file mode 100644 index 6bd1ea2cd..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphClientEventLogs Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_client_event_logs_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-client-event-logs-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-27 \ - --bind=profile.output_path=processed/interaction_graph_agg_client_event_logs_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD 
b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD deleted file mode 100644 index 51479c70d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -scala_library( - name = "agg_direct_interactions", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_direct_interactions_edge_daily-scala", - ":interaction_graph_agg_direct_interactions_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/thrift/com/twitter/timelineservice/server/internal:thrift-scala", - "twadoop_config/configuration/log_categories/group/timeline:timeline_service_favorites-scala", - "twadoop_config/configuration/log_categories/group/tweetypie:tweetypie_media_tag_events-scala", - "tweetsource/common:unhydrated_flat-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_agg_direct_interactions_scio", - main = "com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_direct_interactions", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_direct_interactions_edge_daily", - description = "User-user directed edges with direct interactions features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_direct_interactions_vertex_daily", - description = "User vertex with direct interactions features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx new file mode 100644 index 000000000..ed63316c4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx new file mode 100644 index 000000000..414764dff Binary files /dev/null and 
b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala deleted file mode 100644 index 0b855cee2..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsJob.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioContext -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.common.UserUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import org.joda.time.Interval - -object InteractionGraphAggDirectInteractionsJob - extends ScioBeamJob[InteractionGraphAggDirectInteractionsOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: InteractionGraphAggDirectInteractionsOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - val source = InteractionGraphAggDirectInteractionsSource(pipelineOptions) - - val rawUsers = source.readCombinedUsers() - val safeUsers = UserUtil.getValidUsers(rawUsers) - - val rawFavorites = source.readFavorites(dateInterval) - val rawPhotoTags = source.readPhotoTags(dateInterval) - val tweetSource = source.readTweetSource(dateInterval) - - val (vertex, edges) = InteractionGraphAggDirectInteractionsUtil.process( - rawFavorites, - tweetSource, - rawPhotoTags, - safeUsers - ) - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.write[Vertex]( - InteractionGraphAggDirectInteractionsVertexDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_direct_interactions_vertex_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 8.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset, - PathLayout.DailyPath( - pipelineOptions.getOutputPath + "/aggregated_direct_interactions_edge_daily"), - dateInterval, - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx new file mode 100644 index 
000000000..9de9a108c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala deleted file mode 100644 index 43d3d08df..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggDirectInteractionsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx new file mode 100644 index 000000000..1d195d603 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala deleted file mode 100644 index 9470b1980..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsSource.scala +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.tweetypie.thriftscala.TweetMediaTagEvent -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.util.Duration -import org.joda.time.Interval -import twadoop_config.configuration.log_categories.group.timeline.TimelineServiceFavoritesScalaDataset -import twadoop_config.configuration.log_categories.group.tweetypie.TweetypieMediaTagEventsScalaDataset -import tweetsource.common.UnhydratedFlatScalaDataset - -case class InteractionGraphAggDirectInteractionsSource( - pipelineOptions: InteractionGraphAggDirectInteractionsOption -)( - implicit sc: 
ScioContext) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readFavorites(dateInterval: Interval): SCollection[ContextualizedFavoriteEvent] = - SourceUtil.readDALDataset[ContextualizedFavoriteEvent]( - dataset = TimelineServiceFavoritesScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment - ) - - def readPhotoTags(dateInterval: Interval): SCollection[TweetMediaTagEvent] = - SourceUtil.readDALDataset[TweetMediaTagEvent]( - dataset = TweetypieMediaTagEventsScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - def readTweetSource(dateInterval: Interval): SCollection[UnhydratedFlatTweet] = - SourceUtil.readDALDataset[UnhydratedFlatTweet]( - dataset = UnhydratedFlatScalaDataset, - interval = dateInterval, - dalEnvironment = dalEnvironment) - - def readCombinedUsers(): SCollection[CombinedUser] = - SourceUtil.readMostRecentSnapshotNoOlderThanDALDataset[CombinedUser]( - dataset = UsersourceScalaDataset, - noOlderThan = Duration.fromDays(5), - dalEnvironment = dalEnvironment - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx new file mode 100644 index 000000000..dfbeddb5e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala deleted file mode 100644 index 1d996116e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/InteractionGraphAggDirectInteractionsUtil.scala +++ /dev/null @@ -1,168 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_direct_interactions - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.scio.common.FeatureKey -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.scio.common.UserUtil.DUMMY_USER_ID -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.timelineservice.thriftscala.ContextualizedFavoriteEvent -import com.twitter.timelineservice.thriftscala.FavoriteEventUnion.Favorite -import com.twitter.tweetsource.common.thriftscala.UnhydratedFlatTweet -import com.twitter.tweetypie.thriftscala.TweetMediaTagEvent - -object InteractionGraphAggDirectInteractionsUtil { - - val DefaultFeatureValue = 1L - - def favouriteFeatures( - rawFavorites: SCollection[ContextualizedFavoriteEvent] - ): SCollection[(FeatureKey, Long)] = { - rawFavorites - .withName("fav features") - .flatMap { event => - event.event match { - case Favorite(e) if e.userId != e.tweetUserId => - ScioMetrics.counter("process", "fav").inc() - Some( - FeatureKey(e.userId, e.tweetUserId, FeatureName.NumFavorites) -> DefaultFeatureValue) - case _ => None - } - } - - } - - def mentionFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - 
.withName("mention features") - .flatMap { - case s if s.shareSourceTweetId.isEmpty => // only for non-retweets - s.atMentionedUserIds - .map { users => - users.toSet.map { uid: Long => - ScioMetrics.counter("process", "mention").inc() - FeatureKey(s.userId, uid, FeatureName.NumMentions) -> DefaultFeatureValue - }.toSeq - } - .getOrElse(Nil) - case _ => - Nil - } - } - - def photoTagFeatures( - rawPhotoTags: SCollection[TweetMediaTagEvent] - ): SCollection[(FeatureKey, Long)] = { - rawPhotoTags - .withName("photo tag features") - .flatMap { p => - p.taggedUserIds.map { (p.userId, _) } - } - .collect { - case (src, dst) if src != dst => - ScioMetrics.counter("process", "photo tag").inc() - FeatureKey(src, dst, FeatureName.NumPhotoTags) -> DefaultFeatureValue - } - } - - def retweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("retweet features") - .collect { - case s if s.shareSourceUserId.exists(_ != s.userId) => - ScioMetrics.counter("process", "share tweet").inc() - FeatureKey( - s.userId, - s.shareSourceUserId.get, - FeatureName.NumRetweets) -> DefaultFeatureValue - } - } - - def quotedTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("quoted tweet features") - .collect { - case t if t.quotedTweetUserId.isDefined => - ScioMetrics.counter("process", "quote tweet").inc() - FeatureKey( - t.userId, - t.quotedTweetUserId.get, - FeatureName.NumTweetQuotes) -> DefaultFeatureValue - } - } - - def replyTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource - .withName("reply tweet features") - .collect { - case t if t.inReplyToUserId.isDefined => - ScioMetrics.counter("process", "reply tweet").inc() - FeatureKey(t.userId, t.inReplyToUserId.get, FeatureName.NumReplies) -> DefaultFeatureValue - } - } - - // we create edges to a dummy user id since creating a tweet has no destination id - def createTweetFeatures( - tweetSource: SCollection[UnhydratedFlatTweet] - ): SCollection[(FeatureKey, Long)] = { - tweetSource.withName("create tweet features").map { tweet => - ScioMetrics.counter("process", "create tweet").inc() - FeatureKey(tweet.userId, DUMMY_USER_ID, FeatureName.NumCreateTweets) -> DefaultFeatureValue - } - } - - def process( - rawFavorites: SCollection[ContextualizedFavoriteEvent], - tweetSource: SCollection[UnhydratedFlatTweet], - rawPhotoTags: SCollection[TweetMediaTagEvent], - safeUsers: SCollection[Long] - ): (SCollection[Vertex], SCollection[Edge]) = { - val favouriteInput = favouriteFeatures(rawFavorites) - val mentionInput = mentionFeatures(tweetSource) - val photoTagInput = photoTagFeatures(rawPhotoTags) - val retweetInput = retweetFeatures(tweetSource) - val quotedTweetInput = quotedTweetFeatures(tweetSource) - val replyInput = replyTweetFeatures(tweetSource) - val createTweetInput = createTweetFeatures(tweetSource) - - val allInput = SCollection.unionAll( - Seq( - favouriteInput, - mentionInput, - photoTagInput, - retweetInput, - quotedTweetInput, - replyInput, - createTweetInput - )) - - val filteredFeatureInput = allInput - .keyBy(_._1.src) - .intersectByKey(safeUsers) // filter for safe users - .values - .collect { - case (FeatureKey(src, dst, feature), featureValue) if src != dst => - FeatureKey(src, dst, feature) -> featureValue - } - .sumByKey - .map { - case (FeatureKey(src, dst, feature), featureValue) => - val age = 1 - InteractionGraphRawInput(src, dst, 
feature, age, featureValue) - } - - FeatureGeneratorUtil.getFeatures(filteredFeatureInput) - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx new file mode 100644 index 000000000..de2e6d502 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md deleted file mode 100644 index a9e9d3610..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAggDirectInteractions Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-agg-direct-interactions-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_direct_interactions_dataflow -``` diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD deleted file mode 100644 index 3bf51323c..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD +++ /dev/null @@ -1,70 +0,0 @@ -scala_library( - name = "agg_flock", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_flock_edge_snapshot-scala", - ":interaction_graph_agg_flock_vertex_snapshot-scala", - "3rdparty/jvm/com/twitter/storehaus:algebra", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "flockdb-tools/datasets/flock:flock-blocks-edges-scala", - "flockdb-tools/datasets/flock:flock-mutes-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala", - "src/thrift/com/twitter/core_workflows/user_model:user_model-scala", - "src/thrift/com/twitter/twadoop/user/gen:gen-java", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = 
"interaction_graph_agg_flock_scio", - main = "com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":agg_flock", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_flock_edge_snapshot", - description = "User-user directed edges with flock features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_flock_vertex_snapshot", - description = "User vertex with flock features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx new file mode 100644 index 000000000..ea6103ce5 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx new file mode 100644 index 000000000..43b51729d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala deleted file mode 100644 index e0a9f934d..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockJob.scala +++ /dev/null @@ -1,84 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.dal.DAL.DiskFormat -import com.twitter.beam.io.dal.DAL.PathLayout -import com.twitter.beam.io.dal.DAL.WriteOptions -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.interaction_graph.scio.agg_flock.InteractionGraphAggFlockUtil._ -import com.twitter.interaction_graph.scio.common.DateUtil -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.util.Duration -import java.time.Instant -import org.joda.time.Interval - -object InteractionGraphAggFlockJob extends ScioBeamJob[InteractionGraphAggFlockOption] { - override protected def configurePipeline( - scioContext: ScioContext, - pipelineOptions: 
InteractionGraphAggFlockOption - ): Unit = { - @transient - implicit lazy val sc: ScioContext = scioContext - implicit lazy val dateInterval: Interval = pipelineOptions.interval - - val source = InteractionGraphAggFlockSource(pipelineOptions) - - val embiggenInterval = DateUtil.embiggen(dateInterval, Duration.fromDays(7)) - - val flockFollowsSnapshot = source.readFlockFollowsSnapshot(embiggenInterval) - - // the flock snapshot we're reading from has already been filtered for safe/valid users hence no filtering for safeUsers - val flockFollowsFeature = - getFlockFeatures(flockFollowsSnapshot, FeatureName.NumFollows, dateInterval) - - val flockMutualFollowsFeature = getMutualFollowFeature(flockFollowsFeature) - - val allSCollections = Seq(flockFollowsFeature, flockMutualFollowsFeature) - - val allFeatures = SCollection.unionAll(allSCollections) - - val (vertex, edges) = FeatureGeneratorUtil.getFeatures(allFeatures) - - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) { - pipelineOptions.getDALWriteEnvironment - } else { - dalEnvironment - } - - vertex.saveAsCustomOutput( - "Write Vertex Records", - DAL.writeSnapshot[Vertex]( - InteractionGraphAggFlockVertexSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_flock_vertex_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = - WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 64.0).ceil.toInt)) - ) - ) - - edges.saveAsCustomOutput( - "Write Edge Records", - DAL.writeSnapshot[Edge]( - InteractionGraphAggFlockEdgeSnapshotScalaDataset, - PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_flock_edge_daily"), - Instant.ofEpochMilli(dateInterval.getEndMillis), - DiskFormat.Parquet, - Environment.valueOf(dalWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards)) - ) - ) - - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx new file mode 100644 index 000000000..dfdc8fa22 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala deleted file mode 100644 index f5ef58b55..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphAggFlockOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(16) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx new file mode 100644 index 000000000..af0ac9daa Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala deleted file mode 100644 index 726293475..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockSource.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.job.ServiceIdentifierOptions -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.cde.scio.dal_read.SourceUtil -import com.twitter.wtf.dataflow.user_events.ValidUserFollowsScalaDataset -import org.joda.time.Interval - -case class InteractionGraphAggFlockSource( - pipelineOptions: InteractionGraphAggFlockOption -)( - implicit sc: ScioContext) { - val dalEnvironment: String = pipelineOptions - .as(classOf[ServiceIdentifierOptions]) - .getEnvironment() - - def readFlockFollowsSnapshot(dateInterval: Interval): SCollection[FlockEdge] = - SourceUtil.readMostRecentSnapshotDALDataset( - dataset = ValidUserFollowsScalaDataset, - dateInterval = dateInterval, - dalEnvironment = dalEnvironment) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx new file mode 100644 index 000000000..8b7d8a9cd Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala deleted file mode 100644 index 89858a89a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/InteractionGraphAggFlockUtil.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_flock - -import com.spotify.scio.values.SCollection -import com.twitter.algebird.Min -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.FeatureName -import java.time.Instant -import java.time.temporal.ChronoUnit -import org.joda.time.Interval - -object InteractionGraphAggFlockUtil { - - def getFlockFeatures( - edges: SCollection[FlockEdge], - featureName: FeatureName, - dateInterval: Interval - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting flock edge to interaction graph input") - .map { edge => - // NOTE: getUpdatedAt gives time in the seconds resolution - // 
Because we use .extend() when reading the data source, the updatedAt time might be larger than the dateRange. - We need to cap them; otherwise DateUtil.diffDays gives incorrect results. - val start = (edge.updatedAt * 1000L).min(dateInterval.getEnd.toInstant.getMillis) - val end = dateInterval.getStart.toInstant.getMillis - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(start), - Instant.ofEpochMilli(end) - ) + 1 - InteractionGraphRawInput(edge.sourceId, edge.destinationId, featureName, age.toInt, 1.0) - } - - } - - def getMutualFollowFeature( - flockFollowFeature: SCollection[InteractionGraphRawInput] - ): SCollection[InteractionGraphRawInput] = { - flockFollowFeature - .withName("Convert FlockFollows to Mutual Follows") - .map { input => - val sourceId = input.src - val destId = input.dst - - if (sourceId < destId) { - Tuple2(sourceId, destId) -> Tuple2(Set(true), Min(input.age)) // true means follow - } else { - Tuple2(destId, sourceId) -> Tuple2(Set(false), Min(input.age)) // false means followed_by - } - } - .sumByKey - .flatMap { - case ((id1, id2), (followSet, minAge)) if followSet.size == 2 => - val age = minAge.get - Seq( - InteractionGraphRawInput(id1, id2, FeatureName.NumMutualFollows, age, 1.0), - InteractionGraphRawInput(id2, id1, FeatureName.NumMutualFollows, age, 1.0)) - case _ => - Nil - } - } - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx new file mode 100644 index 000000000..14af45521 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md deleted file mode 100644 index 0ff797194..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_flock/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphAggFlock Dataflow Job - -#### IntelliJ -``` -./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Compile -``` -./bazel build src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Build Jar -``` -./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-agg-flock-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_flock/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-04-13 \ - --bind=profile.output_path=processed/interaction_graph_agg_flock_dataflow -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD deleted file mode 100644 index 1fbe57e1f..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD +++ /dev/null @@ -1,43 +0,0 @@ -scala_library( - name = "agg_negative", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":real_graph_negative_features-scala", -
"beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "flockdb-tools/datasets/flock:flock-blocks-edges-scala", - "flockdb-tools/datasets/flock:flock-mutes-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala", - "socialgraph/hadoop/src/main/scala/com/twitter/socialgraph/hadoop:socialgraph-unfollows-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - ], -) - -jvm_binary( - name = "interaction_graph_negative_scio", - main = "com.twitter.interaction_graph.scio.agg_negative.InteractionGraphNegativeJob", - platform = "java8", - dependencies = [ - ":agg_negative", - ], -) - -create_datasets( - base_name = "real_graph_negative_features", - key_type = "Long", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.user_session_store.thriftscala.UserSession", - scala_dependencies = [ - "src/scala/com/twitter/interaction_graph/injection:user_session_inj", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx new file mode 100644 index 000000000..8d696ebb4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx new file mode 100644 index 000000000..242e1b8c6 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala deleted file mode 100644 index 479b67524..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeJob.scala +++ /dev/null @@ -1,155 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_negative - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.algebird.mutable.PriorityQueueMonoid -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.conversions.DurationOps._ -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.interaction_graph.scio.common.ConversionUtil.hasNegativeFeatures -import com.twitter.interaction_graph.scio.common.ConversionUtil.toRealGraphEdgeFeatures -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil.getEdgeFeature -import com.twitter.interaction_graph.scio.common.GraphUtil -import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.job.ScioBeamJob 
-import com.twitter.scrooge.ThriftStruct -import com.twitter.socialgraph.hadoop.SocialgraphUnfollowsScalaDataset -import com.twitter.tcdc.bqblaster.beam.syntax._ -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import com.twitter.timelines.real_graph.thriftscala.RealGraphFeaturesTest -import com.twitter.timelines.real_graph.v1.thriftscala.{RealGraphFeatures => RealGraphFeaturesV1} -import com.twitter.user_session_store.thriftscala.UserSession -import flockdb_tools.datasets.flock.FlockBlocksEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockMutesEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsAbuseEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsSpamEdgesScalaDataset -import java.time.Instant -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO - -object InteractionGraphNegativeJob extends ScioBeamJob[InteractionGraphNegativeOption] { - val maxDestinationIds = 500 // p99 is about 500 - def getFeatureCounts(e: Edge): Int = e.features.size - val negativeEdgeOrdering = Ordering.by[Edge, Int](getFeatureCounts) - val negativeEdgeReverseOrdering = negativeEdgeOrdering.reverse - implicit val pqMonoid: PriorityQueueMonoid[Edge] = - new PriorityQueueMonoid[Edge](maxDestinationIds)(negativeEdgeOrdering) - - override protected def configurePipeline( - sc: ScioContext, - opts: InteractionGraphNegativeOption - ): Unit = { - - val endTs = opts.interval.getEndMillis - - // read input datasets - val blocks: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockBlocksEdgesScalaDataset, sc), - FeatureName.NumBlocks, - endTs) - - val mutes: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockMutesEdgesScalaDataset, sc), - FeatureName.NumMutes, - endTs) - - val abuseReports: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockReportAsAbuseEdgesScalaDataset, sc), - FeatureName.NumReportAsAbuses, - endTs) - - val spamReports: SCollection[InteractionGraphRawInput] = - GraphUtil.getFlockFeatures( - readSnapshot(FlockReportAsSpamEdgesScalaDataset, sc), - FeatureName.NumReportAsSpams, - endTs) - - // we only keep unfollows in the past 90 days due to the huge size of this dataset, - // and to prevent permanent "shadow-banning" in the event of accidental unfollows. - // we treat unfollows as less critical than the 4 negative signals above, since unfollows typically - // reflect interest rather than health, and interest can change over time.
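An aside before the aggregation step below: the job keeps at most maxDestinationIds = 500 negative edges per source user, preferring edges that carry more negative features (the negativeEdgeOrdering above). A minimal, plain-Scala sketch of that per-source top-K selection, with hypothetical names and in-memory collections rather than the distributed topByKey the job actually uses:

```scala
// Hedged sketch of the per-source top-K selection done by InteractionGraphNegativeJob.
// `featureCount` stands in for e.features.size on the real Edge thrift struct.
case class NegEdge(sourceId: Long, destId: Long, featureCount: Int)

val maxDestinationIds = 500 // p99 of destination count per source, per the constant above

def topNegativeEdgesPerSource(edges: Seq[NegEdge]): Map[Long, Seq[NegEdge]] =
  edges
    .groupBy(_.sourceId)
    .map { case (src, es) =>
      // keep the edges carrying the most negative features for each source user
      src -> es.sortBy(-_.featureCount).take(maxDestinationIds)
    }
```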
- val unfollows: SCollection[InteractionGraphRawInput] = - GraphUtil - .getSocialGraphFeatures( - readSnapshot(SocialgraphUnfollowsScalaDataset, sc), - FeatureName.NumUnfollows, - endTs) - .filter(_.age < 90) - - // group all features by (src, dest) - val allEdgeFeatures: SCollection[Edge] = - getEdgeFeature(SCollection.unionAll(Seq(blocks, mutes, abuseReports, spamReports, unfollows))) - - val negativeFeatures: SCollection[KeyVal[Long, UserSession]] = - allEdgeFeatures - .keyBy(_.sourceId) - .topByKey(maxDestinationIds)(Ordering.by(_.features.size)) - .map { - case (srcId, pqEdges) => - val topKNeg = - pqEdges.toSeq.flatMap(toRealGraphEdgeFeatures(hasNegativeFeatures)) - KeyVal( - srcId, - UserSession( - userId = Some(srcId), - realGraphFeaturesTest = - Some(RealGraphFeaturesTest.V1(RealGraphFeaturesV1(topKNeg))))) - } - - // save to GCS (via DAL) - negativeFeatures.saveAsCustomOutput( - "Write Negative Edge Label", - DAL.writeVersionedKeyVal( - dataset = RealGraphNegativeFeaturesScalaDataset, - pathLayout = PathLayout.VersionedPath(opts.getOutputPath), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis), - writeOption = WriteOptions(numOfShards = Some(3000)) - ) - ) - - // save to BQ - val ingestionDate = opts.getDate().value.getStart.toDate - val bqDataset = opts.getBqDataset - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionDate)) - val timePartitioning = new TimePartitioning() - .setType("DAY").setField("dateHour").setExpirationMs(21.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[Edge] - .to(s"${bqDataset}.interaction_graph_agg_negative_edge_snapshot") - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId("twttr-recos-ml-prod") - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition( - BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE - ) // we only want the latest snapshot - - allEdgeFeatures - .saveAsCustomOutput( - s"Save Recommendations to BQ interaction_graph_agg_negative_edge_snapshot", - bqWriter - ) - } - - def readSnapshot[T <: ThriftStruct]( - dataset: SnapshotDALDataset[T], - sc: ScioContext - ): SCollection[T] = { - sc.customInput( - s"Reading most recent snapshot ${dataset.role.name}.${dataset.logicalName}", - DAL.readMostRecentSnapshotNoOlderThan[T](dataset, 7.days) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx new file mode 100644 index 000000000..659bb48b8 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala deleted file mode 100644 index c44dc3396..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/InteractionGraphNegativeOption.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_negative - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphNegativeOption 
extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("BQ dataset prefix") - def getBqDataset: String - def setBqDataset(value: String): Unit - -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx new file mode 100644 index 000000000..bdeaa81c5 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md deleted file mode 100644 index 9df76e7ad..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_negative/README.md +++ /dev/null @@ -1,35 +0,0 @@ -## InteractionGraphNegative Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_neg --intellij src/scala/com/twitter/interaction_graph/scio/agg_negative -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_negative:interaction_graph_negative_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_negative:interaction_graph_negative_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-negative-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_negative/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-10-19 \ - --bind=profile.output_path=processed/interaction_graph_agg_negative_dataflow \ - --bind=profile.bq_dataset="twttr-bq-cassowary-prod:user" -``` \ No newline at end of file diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD deleted file mode 100644 index 25dfa572b..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD +++ /dev/null @@ -1,65 +0,0 @@ -scala_library( - name = "agg_notifications", - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":interaction_graph_agg_notifications_edge_daily-scala", - ":interaction_graph_agg_notifications_vertex_daily-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read", - "src/scala/com/twitter/frigate/data_pipeline_beam/mr-client-event-filtering-job/src/main/scala/com/twitter/client_event_filtering:frigate_filtered_client_events_dataflow-scala", - "src/scala/com/twitter/interaction_graph/scio/common", - "src/scala/com/twitter/wtf/scalding/jobs/client_event_processing:user_interaction-scala", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - "twadoop_config/configuration/log_categories/group/frigate:frigate_notifier-scala", - "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala", - 
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/combined:usersource-scala", - ], -) - -jvm_binary( - name = "interaction_graph_notifications_scio", - main = "com.twitter.interaction_graph.scio.agg_notifications.InteractionGraphNotificationsJob", - platform = "java8", - dependencies = [ - ":agg_notifications", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_notifications_edge_daily", - description = "User-user directed edges with notification features", - java_schema = "com.twitter.interaction_graph.thriftjava.Edge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Edge", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) - -create_datasets( - base_name = "interaction_graph_agg_notifications_vertex_daily", - description = "User vertex with notification features", - java_schema = "com.twitter.interaction_graph.thriftjava.Vertex", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex", - segment_type = "partitioned", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx new file mode 100644 index 000000000..69e25dcc0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx new file mode 100644 index 000000000..d35c89e6e Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala deleted file mode 100644 index 2ca5a9cf4..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationUtil.scala +++ /dev/null @@ -1,132 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.spotify.scio.ScioMetrics -import com.twitter.clientapp.thriftscala.EventNamespace -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.interaction_graph.thriftscala.FeatureName - -object InteractionGraphNotificationUtil { - - val PUSH_OPEN_ACTIONS = Set("open", "background_open") - val NTAB_CLICK_ACTIONS = Set("navigate", "click") - val STATUS_ID_REGEX = "^twitter:\\/\\/tweet\\?status_id=([0-9]+).*".r - val TWEET_ID_REGEX = "^twitter:\\/\\/tweet.id=([0-9]+).*".r - - def extractTweetIdFromUrl(url: String): Option[Long] = url match { - case STATUS_ID_REGEX(statusId) => - ScioMetrics.counter("regex matching", "status_id=").inc() - Some(statusId.toLong) - case TWEET_ID_REGEX(tweetId) => - ScioMetrics.counter("regex matching", "tweet?id=").inc() - Some(tweetId.toLong) - case _ => None - } - - def 
getPushNtabEvents(e: LogEvent): Seq[(Long, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, _, _, _, _, Some(action)) if PUSH_OPEN_ACTIONS.contains(action) => - (for { - details <- e.eventDetails - url <- details.url - tweetId <- extractTweetIdFromUrl(url) - } yield { - ScioMetrics.counter("event type", "push open").inc() - (tweetId, FeatureName.NumPushOpens) - }).toSeq - case EventNamespace(_, Some("ntab"), _, _, _, Some("navigate")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - ntabDetails <- item.notificationTabDetails.toSeq - clientEventMetadata <- ntabDetails.clientEventMetadata.toSeq - tweetIds <- clientEventMetadata.tweetIds.toSeq - tweetId <- tweetIds - } yield { - ScioMetrics.counter("event type", "ntab navigate").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case EventNamespace(_, Some("ntab"), _, _, _, Some("click")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - tweetId <- item.id - } yield { - ScioMetrics.counter("event type", "ntab click").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } - - /** - * Returns events corresponding to ntab clicks. We have the tweet id from ntab clicks and can join - * those with public tweets. - */ - def getNtabEvents(e: LogEvent): Seq[(Long, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, Some("ntab"), _, _, _, Some("navigate")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - ntabDetails <- item.notificationTabDetails.toSeq - clientEventMetadata <- ntabDetails.clientEventMetadata.toSeq - tweetIds <- clientEventMetadata.tweetIds.toSeq - tweetId <- tweetIds - } yield { - ScioMetrics.counter("event type", "ntab navigate").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case EventNamespace(_, Some("ntab"), _, _, _, Some("click")) => - val tweetIds = for { - details <- e.eventDetails.toSeq - items <- details.items.toSeq - item <- items - tweetId <- item.id - } yield { - ScioMetrics.counter("event type", "ntab click").inc() - tweetId - } - tweetIds.map((_, FeatureName.NumNtabClicks)) - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } - - /** - * get push open events, keyed by impressionId (as the client event does not always have the tweetId nor the authorId) - */ - def getPushOpenEvents(e: LogEvent): Seq[(String, (Long, FeatureName))] = { - for { - logBase <- e.logBase.toSeq - userId <- logBase.userId.toSeq - namespace <- e.eventNamespace.toSeq - (tweetId, featureName) <- namespace match { - case EventNamespace(_, _, _, _, _, Some(action)) if PUSH_OPEN_ACTIONS.contains(action) => - val impressionIdOpt = for { - details <- e.notificationDetails - impressionId <- details.impressionId - } yield { - ScioMetrics.counter("event type", "push open").inc() - impressionId - } - impressionIdOpt.map((_, FeatureName.NumPushOpens)).toSeq - case _ => Nil - } - } yield (tweetId, (userId, featureName)) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx 
b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx new file mode 100644 index 000000000..5f5b8237a Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala deleted file mode 100644 index 2a01988be..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsJob.scala +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.io.fs.multiformat.ReadOptions -import com.twitter.beam.io.fs.multiformat.WriteOptions -import com.twitter.client_event_filtering.FrigateFilteredClientEventsDataflowScalaDataset -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil -import com.twitter.interaction_graph.thriftscala._ -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.statebird.v2.thriftscala.Environment -import com.twitter.tweetsource.public_tweets.PublicTweetsScalaDataset - -object InteractionGraphNotificationsJob extends ScioBeamJob[InteractionGraphNotificationsOption] { - override protected def configurePipeline( - sc: ScioContext, - opts: InteractionGraphNotificationsOption - ): Unit = { - - val pushClientEvents: SCollection[LogEvent] = sc - .customInput( - name = "Read Push Client Events", - DAL - .read( - FrigateFilteredClientEventsDataflowScalaDataset, - opts.interval, - DAL.Environment.Prod, - ) - ) - val pushNtabEvents = - pushClientEvents.flatMap(InteractionGraphNotificationUtil.getPushNtabEvents) - - // look back tweets for 2 days because MR gets tweets from 2 days ago. 
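An aside on the interval arithmetic here: a minimal, standalone joda-time sketch (dates are made up) of how the two-day lookback, net of the 24-hour grace period noted in the next comment line, widens the read window by exactly one day at the start.

```scala
import org.joda.time.{DateTime, DateTimeZone, Interval}

object LookbackSketch {
  def main(args: Array[String]): Unit = {
    // assume a one-day job interval, e.g. 2022-05-10 (illustrative date)
    val interval = new Interval(
      new DateTime(2022, 5, 10, 0, 0, DateTimeZone.UTC),
      new DateTime(2022, 5, 11, 0, 0, DateTimeZone.UTC))
    val graceHours = 24
    // same arithmetic as the job: start - 2 days + 24 hours = start - 1 day
    val widened = interval.withStart(interval.getStart.minusDays(2).plusHours(graceHours))
    println(widened) // 2022-05-09T00:00:00.000Z/2022-05-11T00:00:00.000Z
  }
}
```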
- // Allow a grace period of 24 hours to reduce oncall workload - val graceHours = 24 - val interval2DaysBefore = - opts.interval.withStart(opts.interval.getStart.minusDays(2).plusHours(graceHours)) - val tweetAuthors: SCollection[(Long, Long)] = sc - .customInput( - name = "Read Tweets", - DAL - .read( - dataset = PublicTweetsScalaDataset, - interval = interval2DaysBefore, - environmentOverride = DAL.Environment.Prod, - readOptions = ReadOptions(projections = Some(Seq("tweetId", "userId"))) - ) - ).map { t => (t.tweetId, t.userId) } - - val pushNtabEdgeCounts = pushNtabEvents - .join(tweetAuthors) - .map { - case (_, ((srcId, feature), destId)) => ((srcId, destId, feature), 1L) - } - .withName("summing edge feature counts") - .sumByKey - - val aggPushEdges = pushNtabEdgeCounts - .map { - case ((srcId, destId, featureName), count) => - (srcId, destId) -> Seq( - EdgeFeature(featureName, FeatureGeneratorUtil.initializeTSS(count))) - } - .sumByKey - .map { - case ((srcId, destId), edgeFeatures) => - Edge(srcId, destId, None, edgeFeatures.sortBy(_.name.value)) - } - - aggPushEdges.saveAsCustomOutput( - "Write Edge Records", - DAL.write[Edge]( - InteractionGraphAggNotificationsEdgeDailyScalaDataset, - PathLayout.DailyPath(opts.getOutputPath + "/aggregated_notifications_edge_daily"), - opts.interval, - DiskFormat.Parquet, - Environment.valueOf(opts.getDALWriteEnvironment), - writeOption = WriteOptions(numOfShards = Some(opts.getNumberOfShards)) - ) - ) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx new file mode 100644 index 000000000..bc002c17d Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala deleted file mode 100644 index dd1b4c769..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/InteractionGraphNotificationsOption.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.interaction_graph.scio.agg_notifications - -import com.twitter.beam.io.dal.DALOptions -import com.twitter.beam.job.DateRangeOptions -import org.apache.beam.sdk.options.Default -import org.apache.beam.sdk.options.Description -import org.apache.beam.sdk.options.Validation.Required - -trait InteractionGraphNotificationsOption extends DALOptions with DateRangeOptions { - @Required - @Description("Output path for storing the final dataset") - def getOutputPath: String - def setOutputPath(value: String): Unit - - @Description("Indicates DAL write environment. 
Can be set to dev/stg during local validation") - @Default.String("PROD") - def getDALWriteEnvironment: String - def setDALWriteEnvironment(value: String): Unit - - @Description("Number of shards/partitions for saving the final dataset.") - @Default.Integer(8) - def getNumberOfShards: Integer - def setNumberOfShards(value: Integer): Unit -} diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx new file mode 100644 index 000000000..b8d827be0 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md b/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md deleted file mode 100644 index f5f274ad8..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/agg_notifications/README.md +++ /dev/null @@ -1,34 +0,0 @@ -## InteractionGraphClientEventLogs Dataflow Job - -#### IntelliJ -``` -fastpass create --name rg_labels --intellij src/scala/com/twitter/interaction_graph/scio/agg_notifications -``` - -#### Compile -``` -bazel build src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_notifications_scio -``` - -#### Build Jar -``` -bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_notifications_scio -``` - -#### Run Scheduled Job -``` -export PROJECTID=twttr-recos-ml-prod -export REGION=us-central1 -export JOB_NAME=interaction-graph-notifications-dataflow - -bin/d6w schedule \ - ${PROJECTID}/${REGION}/${JOB_NAME} \ - src/scala/com/twitter/interaction_graph/scio/agg_notifications/config.d6w \ - --bind=profile.user_name=cassowary \ - --bind=profile.project=${PROJECTID} \ - --bind=profile.region=${REGION} \ - --bind=profile.job_name=${JOB_NAME} \ - --bind=profile.environment=prod \ - --bind=profile.date=2022-05-10 \ - --bind=profile.output_path=processed/interaction_graph_agg_notifications_dataflow -``` diff --git a/src/scala/com/twitter/interaction_graph/scio/common/BUILD b/src/scala/com/twitter/interaction_graph/scio/common/BUILD deleted file mode 100644 index 4916728c5..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -scala_library( - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "flockdb-tools/datasets/flock/src/main/thrift:thrift-scala", - "src/scala/com/twitter/pluck/source/combined_user_scrooge_source", - "src/thrift/com/twitter/gizmoduck:user-thrift-scala", - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - "src/thrift/com/twitter/socialgraph:thrift-scala", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "src/thrift/com/twitter/user_session_store:thrift-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - ], -) - -scala_library( - name = "feature_groups", - sources = ["FeatureGroups.scala"], - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/thrift/com/twitter/interaction_graph:interaction_graph-scala", - ], -) 
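Stepping back to the notifications job above: the core of its edge aggregation is a join of tweet-keyed events against tweet authors, followed by a keyed count. A runnable plain-Scala stand-in, with simplified types (feature names as strings rather than the thrift FeatureName enum) and Scio's sumByKey replaced by a local groupBy:

```scala
object NotificationEdgeCountSketch {
  def main(args: Array[String]): Unit = {
    // (tweetId, (srcUserId, featureName)) pairs, shaped like getPushNtabEvents output
    val events = Seq(
      (100L, (1L, "num_push_opens")),
      (100L, (1L, "num_push_opens")),
      (101L, (2L, "num_ntab_clicks")))
    // (tweetId, authorId) pairs, shaped like the public-tweets read
    val tweetAuthors = Map(100L -> 9L, 101L -> 9L)
    // join on tweetId, then count per (src, dest, feature): the local equivalent
    // of the job's .join(tweetAuthors) followed by .sumByKey
    val counts = events
      .flatMap { case (tweetId, (srcId, feature)) =>
        tweetAuthors.get(tweetId).map(destId => ((srcId, destId, feature), 1L))
      }
      .groupBy(_._1)
      .map { case (key, grouped) => key -> grouped.map(_._2).sum }
    counts.foreach(println) // ((1,9,num_push_opens),2) and ((2,9,num_ntab_clicks),1)
  }
}
```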
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx new file mode 100644 index 000000000..b2167503c Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/BUILD.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx new file mode 100644 index 000000000..9744a6eb9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala b/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala deleted file mode 100644 index d8264fd8e..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/CaseClasses.scala +++ /dev/null @@ -1,21 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.FeatureName - -/** Interaction Graph Raw Input type defines a common type for edge / vertex feature calculation - * It has fields: (source Id, destination Id, Feature Name, age of this relationship (in days), - * and value to be aggregated) - */ -case class InteractionGraphRawInput( - src: Long, - dst: Long, - name: FeatureName, - age: Int, - featureValue: Double) - -case class FeatureKey( - src: Long, - dest: Long, - name: FeatureName) - -case class Tweepcred(userId: Long, tweepcred: Short) diff --git a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx new file mode 100644 index 000000000..82dcb38cf Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala deleted file mode 100644 index a23816078..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/ConversionUtil.scala +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics -import com.twitter.timelines.real_graph.v1.thriftscala.RealGraphEdgeFeatures -import com.twitter.timelines.real_graph.v1.thriftscala.{ - RealGraphEdgeFeature => RealGraphEdgeFeatureV1 -} - -object ConversionUtil { - def toRealGraphEdgeFeatureV1(tss: TimeSeriesStatistics): RealGraphEdgeFeatureV1 = { - RealGraphEdgeFeatureV1( - mean = Some(tss.mean), - ewma = Some(tss.ewma), - m2ForVariance = Some(tss.m2ForVariance), - daysSinceLast = tss.numDaysSinceLast.map(_.toShort), - nonZeroDays = Some(tss.numNonZeroDays.toShort), - elapsedDays = Some(tss.numElapsedDays.toShort), - isMissing = Some(false) - ) - } - - /** - * Checks if the converted `RealGraphEdgeFeatures` has negative edges features. - * Our pipeline includes other negative interactions that aren't in the UserSession thrift - * so we'll just filter them away for now (for parity). 
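The parity filter described here is simple to see in miniature. A sketch with a hypothetical two-field stand-in for RealGraphEdgeFeatures (the real predicate checks all four negative-interaction fields):

```scala
// hypothetical, trimmed stand-in for RealGraphEdgeFeatures
final case class NegFields(numMutes: Option[Long], numBlocks: Option[Long])

object ParityFilterSketch {
  def hasNegative(n: NegFields): Boolean = n.numMutes.nonEmpty || n.numBlocks.nonEmpty

  def main(args: Array[String]): Unit = {
    val edges = Seq(NegFields(Some(1L), None), NegFields(None, None))
    // only edges carrying at least one negative interaction survive the filter
    println(edges.count(hasNegative)) // 1
  }
}
```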
- */ - def hasNegativeFeatures(rgef: RealGraphEdgeFeatures): Boolean = { - rgef.numMutes.nonEmpty || - rgef.numBlocks.nonEmpty || - rgef.numReportAsAbuses.nonEmpty || - rgef.numReportAsSpams.nonEmpty - } - - /** - * Checks if the converted `RealGraphEdgeFeatures` has some of the key interaction features present. - * This is adapted from timeline's code here: - */ - def hasTimelinesRequiredFeatures(rgef: RealGraphEdgeFeatures): Boolean = { - rgef.retweetsFeature.nonEmpty || - rgef.favsFeature.nonEmpty || - rgef.mentionsFeature.nonEmpty || - rgef.tweetClicksFeature.nonEmpty || - rgef.linkClicksFeature.nonEmpty || - rgef.profileViewsFeature.nonEmpty || - rgef.dwellTimeFeature.nonEmpty || - rgef.inspectedStatusesFeature.nonEmpty || - rgef.photoTagsFeature.nonEmpty || - rgef.numTweetQuotes.nonEmpty || - rgef.followFeature.nonEmpty || - rgef.mutualFollowFeature.nonEmpty || - rgef.addressBookEmailFeature.nonEmpty || - rgef.addressBookPhoneFeature.nonEmpty - } - - /** - * Convert an Edge into a RealGraphEdgeFeature. - * We return the converted RealGraphEdgeFeature when filterFn is true. - * This is to allow us to filter early on during the conversion if required, rather than map over the whole - * collection of records again to filter. - * - * @param filterFn true if and only if we want to keep the converted feature - */ - def toRealGraphEdgeFeatures( - filterFn: RealGraphEdgeFeatures => Boolean - )( - e: Edge - ): Option[RealGraphEdgeFeatures] = { - val baseFeature = RealGraphEdgeFeatures(destId = e.destinationId) - val aggregatedFeature = e.features.foldLeft(baseFeature) { - case (aggregatedFeature, edgeFeature) => - val f = Some(toRealGraphEdgeFeatureV1(edgeFeature.tss)) - ScioMetrics.counter("toRealGraphEdgeFeatures", edgeFeature.name.name).inc() - edgeFeature.name match { - case FeatureName.NumRetweets => aggregatedFeature.copy(retweetsFeature = f) - case FeatureName.NumFavorites => aggregatedFeature.copy(favsFeature = f) - case FeatureName.NumMentions => aggregatedFeature.copy(mentionsFeature = f) - case FeatureName.NumTweetClicks => aggregatedFeature.copy(tweetClicksFeature = f) - case FeatureName.NumLinkClicks => aggregatedFeature.copy(linkClicksFeature = f) - case FeatureName.NumProfileViews => aggregatedFeature.copy(profileViewsFeature = f) - case FeatureName.TotalDwellTime => aggregatedFeature.copy(dwellTimeFeature = f) - case FeatureName.NumInspectedStatuses => - aggregatedFeature.copy(inspectedStatusesFeature = f) - case FeatureName.NumPhotoTags => aggregatedFeature.copy(photoTagsFeature = f) - case FeatureName.NumFollows => aggregatedFeature.copy(followFeature = f) - case FeatureName.NumMutualFollows => aggregatedFeature.copy(mutualFollowFeature = f) - case FeatureName.AddressBookEmail => aggregatedFeature.copy(addressBookEmailFeature = f) - case FeatureName.AddressBookPhone => aggregatedFeature.copy(addressBookPhoneFeature = f) - case FeatureName.AddressBookInBoth => aggregatedFeature.copy(addressBookInBothFeature = f) - case FeatureName.AddressBookMutualEdgeEmail => - aggregatedFeature.copy(addressBookMutualEdgeEmailFeature = f) - case FeatureName.AddressBookMutualEdgePhone => - aggregatedFeature.copy(addressBookMutualEdgePhoneFeature = f) - case FeatureName.AddressBookMutualEdgeInBoth => - aggregatedFeature.copy(addressBookMutualEdgeInBothFeature = f) - case FeatureName.NumTweetQuotes => aggregatedFeature.copy(numTweetQuotes = f) - case FeatureName.NumBlocks => aggregatedFeature.copy(numBlocks = f) - case FeatureName.NumMutes => aggregatedFeature.copy(numMutes = f) - case 
FeatureName.NumReportAsSpams => aggregatedFeature.copy(numReportAsSpams = f) - case FeatureName.NumReportAsAbuses => aggregatedFeature.copy(numReportAsAbuses = f) - case _ => aggregatedFeature - } - } - if (filterFn(aggregatedFeature)) - Some(aggregatedFeature.copy(weight = e.weight.orElse(Some(0.0)))) - else None - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx new file mode 100644 index 000000000..a2eb39c5f Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala deleted file mode 100644 index f791d538a..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/DateUtil.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.util.Duration -import org.joda.time.Interval - -object DateUtil { - def embiggen(dateInterval: Interval, duration: Duration): Interval = { - - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - val newEnd = dateInterval.getEnd.plusDays(days) - new Interval(newStart, newEnd) - } - - def subtract(dateInterval: Interval, duration: Duration): Interval = { - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - val newEnd = dateInterval.getEnd.minusDays(days) - new Interval(newStart, newEnd) - } - - def prependDays(dateInterval: Interval, duration: Duration): Interval = { - val days = duration.inDays - val newStart = dateInterval.getStart.minusDays(days) - new Interval(newStart, dateInterval.getEnd.toInstant) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx new file mode 100644 index 000000000..649513ba4 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala b/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala deleted file mode 100644 index 004a141bb..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/EdgeFeatureCombiner.scala +++ /dev/null @@ -1,350 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.EdgeFeature -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics - -object EdgeFeatureCombiner { - def apply(srcId: Long, destId: Long): EdgeFeatureCombiner = new EdgeFeatureCombiner( - instanceEdge = Edge(srcId, destId), - featureMap = Map( - FeatureName.NumRetweets -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumFavorites -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumMentions -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumTweetClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumLinkClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumProfileViews -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumFollows -> new BooleanOrEdgeCombiner, - FeatureName.NumUnfollows -> new BooleanOrEdgeCombiner, - FeatureName.NumMutualFollows -> new BooleanOrEdgeCombiner, - FeatureName.NumBlocks 
-> new BooleanOrEdgeCombiner, - FeatureName.NumMutes -> new BooleanOrEdgeCombiner, - FeatureName.NumReportAsAbuses -> new BooleanOrEdgeCombiner, - FeatureName.NumReportAsSpams -> new BooleanOrEdgeCombiner, - FeatureName.NumTweetQuotes -> new WeightedAdditiveEdgeCombiner, - FeatureName.AddressBookEmail -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookPhone -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookInBoth -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgeEmail -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgePhone -> new BooleanOrEdgeCombiner, - FeatureName.AddressBookMutualEdgeInBoth -> new BooleanOrEdgeCombiner, - FeatureName.TotalDwellTime -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumInspectedStatuses -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumPhotoTags -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumPushOpens -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumNtabClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtMentions -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtReplies -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtRetweets -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtFavories -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtLinkClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtTweetClicks -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumRtTweetQuotes -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumShares -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumEmailOpen -> new WeightedAdditiveEdgeCombiner, - FeatureName.NumEmailClick -> new WeightedAdditiveEdgeCombiner, - ) - ) -} - -/** - * This class can take in a number of input Edge thrift objects, (all of which are assumed to - * contain information about a single edge) and builds a combined Edge protobuf object, which has - * the union of all the input. - *
- * There are two modes of aggregation: one simply adds the values, assuming they are all - * from the same day; the other adds them in a time-decayed manner using the passed-in weights. - *
- * The input objects features must be disjoint. Also, remember that the edge is directed! - */ -class EdgeFeatureCombiner(instanceEdge: Edge, featureMap: Map[FeatureName, EFeatureCombiner]) { - - /** - * Adds features without any decay. To be used for the same day. - * - * @param edge edge to be added into the combiner - */ - def addFeature(edge: Edge): EdgeFeatureCombiner = { - - val newEdge = - if (edge.weight.isDefined) instanceEdge.copy(weight = edge.weight) else instanceEdge - val newFeatures = featureMap.map { - case (featureName, combiner) => - edge.features.find(_.name.equals(featureName)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature) else combiner.setFeature(feature) - (featureName, updatedCombiner) - case _ => (featureName, combiner) - } - } - - new EdgeFeatureCombiner(newEdge, newFeatures) - - } - - /** - * Adds features with decays. Used for combining multiple days. - * - * @param edge edge to be added into the combiner - * @param alpha parameters for the decay calculation - * @param day number of days from today - */ - def addFeature(edge: Edge, alpha: Double, day: Int): EdgeFeatureCombiner = { - - val newEdge = if (edge.weight.isDefined) edge.copy(weight = edge.weight) else edge - val newFeatures = featureMap.map { - case (featureName, combiner) => - edge.features.find(_.name.equals(featureName)) match { - case Some(feature) => - val updatedCombiner = - if (combiner.isSet) combiner.updateFeature(feature, alpha, day) - else combiner.setFeature(feature, alpha, day) - ScioMetrics.counter("EdgeFeatureCombiner.addFeature", feature.name.name).inc() - (featureName, updatedCombiner) - case _ => (featureName, combiner) - } - } - new EdgeFeatureCombiner(newEdge, newFeatures) - } - - /** - * Generate the final combined Edge instance - * We return a deterministically sorted list of edge features - * - * @param totalDays total number of days to be combined together - */ - def getCombinedEdge(totalDays: Int): Edge = { - val moreFeatures = featureMap.values - .flatMap { combiner => - combiner.getFinalFeature(totalDays) - }.toList.sortBy(_.name.value) - instanceEdge.copy( - features = moreFeatures - ) - } - -} - -/** - * This portion contains the actual combination logic. For now, we only implement a simple - * additive combiner, but in future we'd like to have things like time-weighted (exponential - * decay, maybe) values. 
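Despite the note just above, the (alpha, day) overloads below already carry the time-decayed path described in the class comment. To make it concrete: for the ewma, seeding with a history edge at historyWeight and then folding in a daily edge at dailyWeight reduces to a weighted sum. A tiny sketch; the 0.955 figure is an assumption inferred from MIN_FEATURE_VALUE = 0.955^60 in InteractionGraphUtils further below.

```scala
object DecayedCombineSketch {
  def main(args: Array[String]): Unit = {
    val historyWeight = 0.955 // assumed daily decay, cf. MIN_FEATURE_VALUE = 0.955^60
    val dailyWeight = 1.0
    val historyEwma = 3.0 // yesterday's running, already-decayed value
    val todayValue = 1.0  // today's raw count
    // setFeature scales history by its weight; updateFeature adds the weighted daily value
    val combined = historyWeight * historyEwma + dailyWeight * todayValue
    println(combined) // 3.865
  }
}
```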
- */ - -trait EFeatureCombiner { - val edgeFeature: Option[EdgeFeature] - val startingDay: Int - val endingDay: Int - val timeSeriesStatistics: Option[TimeSeriesStatistics] - - def updateTSS(feature: EdgeFeature, alpha: Double): Option[TimeSeriesStatistics] - - def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] - - def updateFeature(feature: EdgeFeature): EFeatureCombiner - - def updateFeature(feature: EdgeFeature, alpha: Double, day: Int): EFeatureCombiner - - def isSet: Boolean - - def dropFeature: Boolean - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): EFeatureCombiner - - def setFeature(feature: EdgeFeature): EFeatureCombiner - - def getFinalFeature(totalDays: Int): Option[EdgeFeature] - -} - -case class WeightedAdditiveEdgeCombiner( - override val edgeFeature: Option[EdgeFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends EFeatureCombiner { - - override def updateTSS( - feature: EdgeFeature, - alpha: Double - ): Option[TimeSeriesStatistics] = { - timeSeriesStatistics.map(tss => - InteractionGraphUtils.updateTimeSeriesStatistics(tss, feature.tss.mean, alpha)) - } - - override def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] = { - timeSeriesStatistics.map(tss => - InteractionGraphUtils.addToTimeSeriesStatistics(tss, feature.tss.mean)) - } - - override def updateFeature(feature: EdgeFeature): WeightedAdditiveEdgeCombiner = { - WeightedAdditiveEdgeCombiner( - edgeFeature, - startingDay, - endingDay, - addToTSS(feature) - ) - } - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): WeightedAdditiveEdgeCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast - else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1) - - val tss = feature.tss.copy( - numDaysSinceLast = numDaysSinceLast, - ewma = alpha * feature.tss.ewma - ) - - val newFeature = EdgeFeature( - name = feature.name, - tss = tss - ) - - WeightedAdditiveEdgeCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - def getFinalFeature(totalDays: Int): Option[EdgeFeature] = { - if (edgeFeature.isEmpty || dropFeature) return None - - val newTss = if (totalDays > 0) { - val elapsed = - timeSeriesStatistics.map(tss => tss.numElapsedDays + totalDays - 1 - startingDay) - - val latest = - if (endingDay > 0) Some(totalDays - endingDay) - else - timeSeriesStatistics.flatMap(tss => - tss.numDaysSinceLast.map(numDaysSinceLast => numDaysSinceLast + totalDays - 1)) - - timeSeriesStatistics.map(tss => - tss.copy( - numElapsedDays = elapsed.get, - numDaysSinceLast = latest - )) - } else timeSeriesStatistics - - edgeFeature.map(ef => ef.copy(tss = newTss.get)) - } - - override def updateFeature( - feature: EdgeFeature, - alpha: Double, - day: Int - ): WeightedAdditiveEdgeCombiner = copy( - endingDay = Math.max(endingDay, day), - timeSeriesStatistics = updateTSS(feature, alpha) - ) - - override def dropFeature: Boolean = timeSeriesStatistics.exists(tss => - tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) || - tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE) - - override def isSet = edgeFeature.isDefined - - override def setFeature(feature: EdgeFeature): WeightedAdditiveEdgeCombiner = - setFeature(feature, 
1.0, 0) - -} - -/** - * This combiner resets the value to 0 if the latest event being combined = 0. Ignores time decays. - */ -case class BooleanOrEdgeCombiner( - override val edgeFeature: Option[EdgeFeature] = None, - override val startingDay: Int = Integer.MAX_VALUE, - override val endingDay: Int = Integer.MIN_VALUE, - override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None) - extends EFeatureCombiner { - - override def updateTSS( - feature: EdgeFeature, - alpha: Double - ): Option[TimeSeriesStatistics] = { - val value = timeSeriesStatistics.map(tss => Math.floor(tss.ewma)) - val newValue = if (value.exists(_ == 1.0) || feature.tss.mean > 0.0) 1.0 else 0.0 - timeSeriesStatistics.map(tss => - tss.copy( - mean = newValue, - ewma = newValue, - numNonZeroDays = tss.numNonZeroDays + 1 - )) - } - - override def addToTSS(feature: EdgeFeature): Option[TimeSeriesStatistics] = { - val value = timeSeriesStatistics.map(tss => Math.floor(tss.ewma)) - val newValue = if (value.exists(_ == 1.0) || feature.tss.mean > 0.0) 1.0 else 0.0 - timeSeriesStatistics.map(tss => tss.copy(mean = newValue, ewma = newValue)) - } - - override def updateFeature(feature: EdgeFeature): BooleanOrEdgeCombiner = BooleanOrEdgeCombiner( - edgeFeature, - startingDay, - endingDay, - addToTSS(feature) - ) - - def setFeature(feature: EdgeFeature, alpha: Double, day: Int): BooleanOrEdgeCombiner = { - val newStartingDay = Math.min(startingDay, day) - val newEndingDay = Math.max(endingDay, day) - - val numDaysSinceLast = - if (feature.tss.numDaysSinceLast.exists(_ > 0)) - feature.tss.numDaysSinceLast.get - else feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1 - - val tss = feature.tss.copy( - numDaysSinceLast = Some(numDaysSinceLast), - ewma = alpha * feature.tss.ewma - ) - - val newFeature = EdgeFeature( - name = feature.name, - tss = tss - ) - - BooleanOrEdgeCombiner( - Some(newFeature), - newStartingDay, - newEndingDay, - Some(tss) - ) - } - - override def getFinalFeature(totalDays: Int): Option[EdgeFeature] = - if (timeSeriesStatistics.exists(tss => tss.ewma < 1.0)) None - else { - if (edgeFeature.isEmpty || dropFeature) return None - edgeFeature.map(ef => - ef.copy( - tss = timeSeriesStatistics.get - )) - } - - override def updateFeature( - feature: EdgeFeature, - alpha: Double, - day: Int - ): BooleanOrEdgeCombiner = copy( - endingDay = Math.max(endingDay, day), - timeSeriesStatistics = updateTSS(feature, alpha) - ) - - override def dropFeature: Boolean = false // we will keep rolling up status-based features - - override def isSet = edgeFeature.isDefined - - override def setFeature(feature: EdgeFeature): BooleanOrEdgeCombiner = setFeature(feature, 1.0, 0) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx new file mode 100644 index 000000000..e7e591e87 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala deleted file mode 100644 index 56c403522..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGeneratorUtil.scala +++ /dev/null @@ -1,263 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import 
com.twitter.interaction_graph.scio.common.FeatureGroups.DWELL_TIME_FEATURE_LIST -import com.twitter.interaction_graph.scio.common.FeatureGroups.STATUS_FEATURE_LIST -import com.twitter.interaction_graph.scio.common.UserUtil.DUMMY_USER_ID -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.EdgeFeature -import com.twitter.interaction_graph.thriftscala.FeatureName -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics -import com.twitter.interaction_graph.thriftscala.Vertex -import com.twitter.interaction_graph.thriftscala.VertexFeature - -object FeatureGeneratorUtil { - - // Initialize a TimeSeriesStatistics object by (value, age) pair - def initializeTSS(featureValue: Double, age: Int = 1): TimeSeriesStatistics = - TimeSeriesStatistics( - mean = featureValue, - m2ForVariance = 0.0, - ewma = featureValue, - numElapsedDays = age, - numNonZeroDays = age, - numDaysSinceLast = Some(age) - ) - - /** - * Create vertex feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue) - * We will represent non-directional features (eg num_create_tweets) as "outgoing" values. - * @return - */ - def getVertexFeature( - input: SCollection[InteractionGraphRawInput] - ): SCollection[Vertex] = { - // For vertex features we need to calculate both in and out featureValue - val vertexAggregatedFeatureValues = input - .flatMap { input => - if (input.dst != DUMMY_USER_ID) { - Seq( - ((input.src, input.name.value), (input.featureValue, 0.0)), - ((input.dst, input.name.value), (0.0, input.featureValue)) - ) - } else { - // we put the non-directional features as "outgoing" values - Seq(((input.src, input.name.value), (input.featureValue, 0.0))) - } - } - .sumByKey - .map { - case ((userId, nameId), (outEdges, inEdges)) => - (userId, (FeatureName(nameId), outEdges, inEdges)) - }.groupByKey - - vertexAggregatedFeatureValues.map { - case (userId, records) => - // sort features by FeatureName for deterministic order (esp during testing) - val features = records.toSeq.sortBy(_._1.value).flatMap { - case (name, outEdges, inEdges) => - // create out vertex features - val outFeatures = if (outEdges > 0) { - val outTss = initializeTSS(outEdges) - List( - VertexFeature( - name = name, - outgoing = true, - tss = outTss - )) - } else Nil - - // create in vertex features - val inFeatures = if (inEdges > 0) { - val inTss = initializeTSS(inEdges) - List( - VertexFeature( - name = name, - outgoing = false, - tss = inTss - )) - } else Nil - - outFeatures ++ inFeatures - } - Vertex(userId = userId, features = features) - } - } - - /** - * Create edge feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue) - * We will exclude all non-directional features (eg num_create_tweets) from all edge aggregates - */ - def getEdgeFeature( - input: SCollection[InteractionGraphRawInput] - ): SCollection[Edge] = { - input - .withName("filter non-directional features") - .flatMap { input => - if (input.dst != DUMMY_USER_ID) { - ScioMetrics.counter("getEdgeFeature", s"directional feature ${input.name.name}").inc() - Some(((input.src, input.dst), (input.name, input.age, input.featureValue))) - } else { - ScioMetrics.counter("getEdgeFeature", s"non-directional feature ${input.name.name}").inc() - None - } - } - .withName("group features by pairs") - .groupByKey - .map { - case ((src, dst), records) => - // sort features by FeatureName for deterministic order (esp during testing) - val features = 
records.toSeq.sortBy(_._1.value).map { - case (name, age, featureValue) => - val tss = initializeTSS(featureValue, age) - EdgeFeature( - name = name, - tss = tss - ) - } - Edge( - sourceId = src, - destinationId = dst, - weight = Some(0.0), - features = features.toSeq - ) - } - } - - // For same user id, combine different vertex feature records into one record - // The input will assume for each (userId, featureName, direction), there will be only one record - def combineVertexFeatures( - vertex: SCollection[Vertex], - ): SCollection[Vertex] = { - vertex - .groupBy { v: Vertex => - v.userId - } - .map { - case (userId, vertexes) => - val combiner = vertexes.foldLeft(VertexFeatureCombiner(userId)) { - case (combiner, vertex) => - combiner.addFeature(vertex) - } - combiner.getCombinedVertex(0) - } - - } - - def combineEdgeFeatures( - edge: SCollection[Edge] - ): SCollection[Edge] = { - edge - .groupBy { e => - (e.sourceId, e.destinationId) - } - .withName("combining edge features for each (src, dst)") - .map { - case ((src, dst), edges) => - val combiner = edges.foldLeft(EdgeFeatureCombiner(src, dst)) { - case (combiner, edge) => - combiner.addFeature(edge) - } - combiner.getCombinedEdge(0) - } - } - - def combineVertexFeaturesWithDecay( - history: SCollection[Vertex], - daily: SCollection[Vertex], - historyWeight: Double, - dailyWeight: Double - ): SCollection[Vertex] = { - - history - .keyBy(_.userId) - .cogroup(daily.keyBy(_.userId)).map { - case (userId, (h, d)) => - // Adding history iterators - val historyCombiner = h.toList.foldLeft(VertexFeatureCombiner(userId)) { - case (combiner, vertex) => - combiner.addFeature(vertex, historyWeight, 0) - } - // Adding daily iterators - val finalCombiner = d.toList.foldLeft(historyCombiner) { - case (combiner, vertex) => - combiner.addFeature(vertex, dailyWeight, 1) - } - - finalCombiner.getCombinedVertex( - 2 - ) // 2 means totally we have 2 days(yesterday and today) data to combine together - } - } - - def combineEdgeFeaturesWithDecay( - history: SCollection[Edge], - daily: SCollection[Edge], - historyWeight: Double, - dailyWeight: Double - ): SCollection[Edge] = { - - history - .keyBy { e => - (e.sourceId, e.destinationId) - } - .withName("combine history and daily edges with decay") - .cogroup(daily.keyBy { e => - (e.sourceId, e.destinationId) - }).map { - case ((src, dst), (h, d)) => - //val combiner = EdgeFeatureCombiner(src, dst) - // Adding history iterators - - val historyCombiner = h.toList.foldLeft(EdgeFeatureCombiner(src, dst)) { - case (combiner, edge) => - combiner.addFeature(edge, historyWeight, 0) - } - - val finalCombiner = d.toList.foldLeft(historyCombiner) { - case (combiner, edge) => - combiner.addFeature(edge, dailyWeight, 1) - } - - finalCombiner.getCombinedEdge( - 2 - ) // 2 means totally we have 2 days(yesterday and today) data to combine together - - } - } - - /** - * Create features from following graph (src, dst, age, featureValue) - * Note that we will filter out vertex features represented as edges from the edge output. 
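The vertex/edge split described above hinges on the DUMMY_USER_ID placeholder (defined in UserUtil further below): non-directional activity is recorded against a sentinel destination and so never reaches the edge output. A small sketch of that filter with a simplified input type:

```scala
object VertexEdgeSplitSketch {
  val DummyUserId = -1L // sentinel dest for non-directional features, per UserUtil below
  final case class RawInput(src: Long, dst: Long, name: String, value: Double)

  def main(args: Array[String]): Unit = {
    val inputs = Seq(
      RawInput(1L, 2L, "num_favorites", 2.0), // directed: kept in the edge output
      RawInput(1L, DummyUserId, "num_create_tweets", 5.0)) // non-directional: vertex-only
    val edgeInputs = inputs.filter(_.dst != DummyUserId)
    println(edgeInputs.map(_.name)) // List(num_favorites)
  }
}
```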
- */ - def getFeatures( - input: SCollection[InteractionGraphRawInput] - ): (SCollection[Vertex], SCollection[Edge]) = { - (getVertexFeature(input), getEdgeFeature(input)) - } - - // remove the edge features that from flock, address book or sms as we will refresh them on a daily basis - def removeStatusFeatures(e: Edge): Seq[Edge] = { - val updatedFeatureList = e.features.filter { e => - !STATUS_FEATURE_LIST.contains(e.name) - } - if (updatedFeatureList.size > 0) { - val edge = Edge( - sourceId = e.sourceId, - destinationId = e.destinationId, - weight = e.weight, - features = updatedFeatureList - ) - Seq(edge) - } else - Nil - } - - // check if the edge feature has features other than dwell time feature - def edgeWithFeatureOtherThanDwellTime(e: Edge): Boolean = { - e.features.exists { f => - !DWELL_TIME_FEATURE_LIST.contains(f.name) - } - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx new file mode 100644 index 000000000..eacbc45ef Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala b/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala deleted file mode 100644 index 89887be99..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/FeatureGroups.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.FeatureName - -object FeatureGroups { - - val HEALTH_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.NumMutes, - FeatureName.NumBlocks, - FeatureName.NumReportAsSpams, - FeatureName.NumReportAsAbuses - ) - - val STATUS_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.AddressBookEmail, - FeatureName.AddressBookPhone, - FeatureName.AddressBookInBoth, - FeatureName.AddressBookMutualEdgeEmail, - FeatureName.AddressBookMutualEdgePhone, - FeatureName.AddressBookMutualEdgeInBoth, - FeatureName.NumFollows, - FeatureName.NumUnfollows, - FeatureName.NumMutualFollows - ) ++ HEALTH_FEATURE_LIST - - val DWELL_TIME_FEATURE_LIST: Set[FeatureName] = Set( - FeatureName.TotalDwellTime, - FeatureName.NumInspectedStatuses - ) -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx new file mode 100644 index 000000000..1b0ca045b Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala deleted file mode 100644 index f94c136df..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/GraphUtil.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.spotify.scio.ScioMetrics -import com.spotify.scio.values.SCollection -import com.twitter.socialgraph.presto.thriftscala.{Edge => SocialGraphEdge} -import com.twitter.flockdb.tools.datasets.flock.thriftscala.FlockEdge -import com.twitter.interaction_graph.scio.common.FeatureGroups.HEALTH_FEATURE_LIST -import com.twitter.interaction_graph.thriftscala.Edge -import com.twitter.interaction_graph.thriftscala.FeatureName - -import java.time.Instant -import java.time.temporal.ChronoUnit - -object GraphUtil { - - /** - * Convert FlockEdge into common 
InteractionGraphRawInput class. - * updatedAt field in socialgraph.unfollows is in seconds. - */ - def getFlockFeatures( - edges: SCollection[FlockEdge], - featureName: FeatureName, - currentTimeMillis: Long - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting flock edge to interaction graph input") - .map { edge => - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(edge.updatedAt * 1000L), // updatedAt is in seconds - Instant.ofEpochMilli(currentTimeMillis) - ) - InteractionGraphRawInput( - edge.sourceId, - edge.destinationId, - featureName, - age.max(0).toInt, - 1.0) - } - } - - /** - * Convert com.twitter.socialgraph.presto.thriftscala.Edge (from unfollows) into common InteractionGraphRawInput class. - * updatedAt field in socialgraph.unfollows is in seconds. - */ - def getSocialGraphFeatures( - edges: SCollection[SocialGraphEdge], - featureName: FeatureName, - currentTimeMillis: Long - ): SCollection[InteractionGraphRawInput] = { - edges - .withName(s"${featureName.toString} - Converting flock edge to interaction graph input") - .map { edge => - val age = ChronoUnit.DAYS.between( - Instant.ofEpochMilli(edge.updatedAt * 1000L), // updatedAt is in seconds - Instant.ofEpochMilli(currentTimeMillis) - ) - InteractionGraphRawInput( - edge.sourceId, - edge.destinationId, - featureName, - age.max(0).toInt, - 1.0) - } - } - def isFollow(edge: Edge): Boolean = { - val result = edge.features - .find(_.name == FeatureName.NumFollows) - .exists(_.tss.mean == 1.0) - result - } - - def filterExtremes(edge: Edge): Boolean = { - if (edge.weight.exists(_.isNaN)) { - ScioMetrics.counter("filter extremes", "nan").inc() - false - } else if (edge.weight.contains(Double.MaxValue)) { - ScioMetrics.counter("filter extremes", "max value").inc() - false - } else if (edge.weight.contains(Double.PositiveInfinity)) { - ScioMetrics.counter("filter extremes", "+ve inf").inc() - false - } else if (edge.weight.exists(_ < 0.0)) { - ScioMetrics.counter("filter extremes", "negative").inc() - false - } else { - true - } - } - - def filterNegative(edge: Edge): Boolean = { - !edge.features.find(ef => HEALTH_FEATURE_LIST.contains(ef.name)).exists(_.tss.mean > 0.0) - } -} diff --git a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx new file mode 100644 index 000000000..3f5a229f9 Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.docx differ diff --git a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala b/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala deleted file mode 100644 index be6aa0153..000000000 --- a/src/scala/com/twitter/interaction_graph/scio/common/InteractionGraphUtils.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.interaction_graph.scio.common - -import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics - -object InteractionGraphUtils { - final val MIN_FEATURE_VALUE = Math.pow(0.955, 60) - final val MAX_DAYS_RETENTION = 60L - final val MILLISECONDS_PER_DAY = 1000 * 60 * 60 * 24 - - def updateTimeSeriesStatistics( - timeSeriesStatistics: TimeSeriesStatistics, - currValue: Double, - alpha: Double - ): TimeSeriesStatistics = { - val numNonZeroDays = timeSeriesStatistics.numNonZeroDays + 1 - - val delta = currValue - timeSeriesStatistics.mean - val updatedMean = timeSeriesStatistics.mean + delta / numNonZeroDays 
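- // delta and updatedMean above, with m2ForVariance below, follow Welford's online algorithm - // over non-zero days (variance is recoverable as m2ForVariance / (numNonZeroDays - 1)); - // the ewma update below folds in the day's decay weight: ewma = alpha * x + ewma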
- val m2ForVariance = timeSeriesStatistics.m2ForVariance + delta * (currValue - updatedMean)
- val ewma = alpha * currValue + timeSeriesStatistics.ewma
-
- timeSeriesStatistics.copy(
- mean = updatedMean,
- m2ForVariance = m2ForVariance,
- ewma = ewma,
- numNonZeroDays = numNonZeroDays
- )
- }
-
- def addToTimeSeriesStatistics(
- timeSeriesStatistics: TimeSeriesStatistics,
- currValue: Double
- ): TimeSeriesStatistics = {
- timeSeriesStatistics.copy(
- mean = timeSeriesStatistics.mean + currValue,
- ewma = timeSeriesStatistics.ewma + currValue
- )
- }
-
-}
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx
new file mode 100644
index 000000000..e703114c8
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala b/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala
deleted file mode 100644
index 39ac51006..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/common/UserUtil.scala
+++ /dev/null
@@ -1,76 +0,0 @@
-package com.twitter.interaction_graph.scio.common
-
-import com.spotify.scio.coders.Coder
-import com.spotify.scio.values.SCollection
-import com.twitter.twadoop.user.gen.thriftscala.CombinedUser
-import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser
-
-object UserUtil {
-
- /**
- * placeholder for the destId when representing vertex features with no dest (eg create tweet)
- * this will only be aggregated and saved in the vertex datasets but not the edge datasets
- */
- val DUMMY_USER_ID = -1L
- def getValidUsers(users: SCollection[CombinedUser]): SCollection[Long] = {
- users
- .flatMap { u =>
- for {
- user <- u.user
- if user.id != 0
- safety <- user.safety
- if !(safety.suspended || safety.deactivated || safety.restricted ||
- safety.nsfwUser || safety.nsfwAdmin || safety.erased)
- } yield {
- user.id
- }
- }
- }
-
- def getValidFlatUsers(users: SCollection[FlatUser]): SCollection[Long] = {
- users
- .flatMap { u =>
- for {
- id <- u.id
- if id != 0 && u.validUser.contains(true)
- } yield {
- id
- }
- }
- }
-
- def getInvalidUsers(users: SCollection[FlatUser]): SCollection[Long] = {
- users
- .flatMap { user =>
- for {
- valid <- user.validUser
- if !valid
- id <- user.id
- } yield id
- }
- }
-
- def filterUsersByIdMapping[T: Coder](
- input: SCollection[T],
- usersToBeFiltered: SCollection[Long],
- userIdMapping: T => Long
- ): SCollection[T] = {
- input
- .withName("filter users by id")
- .keyBy(userIdMapping(_))
- .leftOuterJoin[Long](usersToBeFiltered.map(x => (x, x)))
- .collect {
- // only return data if the key is not in the list of usersToBeFiltered
- case (_, (data, None)) => data
- }
- }
-
- def filterUsersByMultipleIdMappings[T: Coder](
- input: SCollection[T],
- usersToBeFiltered: SCollection[Long],
- userIdMappings: Seq[T => Long]
- ): SCollection[T] = {
- userIdMappings.foldLeft(input)((data, mapping) =>
- filterUsersByIdMapping(data, usersToBeFiltered, mapping))
- }
-}
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx
new file mode 100644
index 000000000..fd0ac4588
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala b/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala
deleted file mode 100644
index fb7ae7947..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/common/VertexFeatureCombiner.scala
+++ /dev/null
@@ -1,342 +0,0 @@
-package com.twitter.interaction_graph.scio.common
-
-import com.twitter.interaction_graph.thriftscala.FeatureName
-import com.twitter.interaction_graph.thriftscala.TimeSeriesStatistics
-import com.twitter.interaction_graph.thriftscala.Vertex
-import com.twitter.interaction_graph.thriftscala.VertexFeature
-
-object VertexFeatureCombiner {
- def apply(userId: Long): VertexFeatureCombiner = new VertexFeatureCombiner(
- instanceVertex = Vertex(userId),
- featureMap = Map(
- (FeatureName.NumRetweets, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRetweets, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumFavorites, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumFavorites, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumMentions, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumMentions, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumTweetClicks, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumTweetClicks, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumLinkClicks, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumLinkClicks, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumProfileViews, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumProfileViews, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumFollows, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumFollows, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumUnfollows, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumUnfollows, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumMutualFollows, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumBlocks, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumBlocks, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumMutes, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumMutes, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumReportAsAbuses, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumReportAsAbuses, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumReportAsSpams, true) -> new ReplacementVertexCombiner,
- (FeatureName.NumReportAsSpams, false) -> new ReplacementVertexCombiner,
- (FeatureName.NumTweetQuotes, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumTweetQuotes, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumMutualFollows, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookEmail, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookEmail, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookPhone, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookPhone, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookInBoth, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookInBoth, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgeEmail, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgeEmail, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgePhone, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgePhone, false) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgeInBoth, true) -> new ReplacementVertexCombiner,
- (FeatureName.AddressBookMutualEdgeInBoth, false) -> new ReplacementVertexCombiner,
- (FeatureName.TotalDwellTime, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.TotalDwellTime, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumInspectedStatuses, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumInspectedStatuses, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumPhotoTags, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumPhotoTags, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumPushOpens, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumPushOpens, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumNtabClicks, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumNtabClicks, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtFavories, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtFavories, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtTweetQuotes, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtTweetQuotes, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtTweetClicks, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtTweetClicks, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtRetweets, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtRetweets, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtReplies, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtReplies, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtLinkClicks, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtLinkClicks, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtMentions, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumRtMentions, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumShares, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumShares, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumEmailOpen, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumEmailOpen, false) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumEmailClick, true) -> new WeightedAdditiveVertexCombiner,
- (FeatureName.NumEmailClick, false) -> new WeightedAdditiveVertexCombiner,
- )
- )
-}
-
-/**
- * This class can take in a number of input Vertex thrift objects (all of which are assumed to
- * contain information about a single vertex) and builds a combined Vertex protobuf object, which
- * has the union of all the input. Note that we do a weighted addition for a time-decayed value.
- *
- * The input objects' features must be disjoint. Also, remember that the Vertex is directed!
- */
-class VertexFeatureCombiner(
- instanceVertex: Vertex,
- featureMap: Map[(FeatureName, Boolean), VFeatureCombiner]) {
-
- /**
- * Adds features without any decay. To be used for the same day.
- *
- * @param vertex vertex to be added into the combiner
- */
- def addFeature(vertex: Vertex): VertexFeatureCombiner = {
- val newVertex = instanceVertex.copy(weight = vertex.weight)
- val newFeatures = featureMap.map {
- case ((featureName, outgoing), combiner) =>
- vertex.features.find(f => f.name.equals(featureName) && f.outgoing.equals(outgoing)) match {
- case Some(feature) =>
- val updatedCombiner =
- if (combiner.isSet) combiner.updateFeature(feature) else combiner.setFeature(feature)
- ((featureName, outgoing), updatedCombiner)
- case _ => ((featureName, outgoing), combiner)
- }
- }
-
- new VertexFeatureCombiner(newVertex, newFeatures)
- }
-
- /**
- * Adds features with decays. Used for combining multiple days.
- *
- * @param vertex vertex to be added into the combiner
- * @param alpha parameters for the decay calculation
- * @param day number of days from today
- */
- def addFeature(vertex: Vertex, alpha: Double, day: Int): VertexFeatureCombiner = {
-
- val newVertex = instanceVertex.copy(weight = vertex.weight)
- val newFeatures = featureMap.map {
- case ((featureName, outgoing), combiner) =>
- vertex.features.find(f => f.name.equals(featureName) && f.outgoing.equals(outgoing)) match {
- case Some(feature) =>
- val updatedCombiner =
- if (combiner.isSet) combiner.updateFeature(feature, alpha, day)
- else combiner.setFeature(feature, alpha, day)
- ((featureName, outgoing), updatedCombiner)
- case _ => ((featureName, outgoing), combiner)
- }
- }
-
- new VertexFeatureCombiner(newVertex, newFeatures)
- }
-
- /**
- * Generate the final combined Vertex instance
- *
- * @param totalDays total number of days to be combined together
- */
- def getCombinedVertex(totalDays: Int): Vertex = {
- val moreFeatures = featureMap.values.flatMap {
- case combiner => combiner.getFinalFeature(totalDays)
- }
- instanceVertex.copy(features = moreFeatures.toSeq)
- }
-
-}
-
-/**
- * This portion contains the actual combination logic. For now, we only implement a simple
- * additive combiner, but in future we'd like to have things like time-weighted (exponential
- * decay, maybe) values.
- */
-trait VFeatureCombiner {
- val startingDay: Int
- val endingDay: Int
- val timeSeriesStatistics: Option[TimeSeriesStatistics]
- val vertexFeature: Option[VertexFeature]
-
- def updateTss(feature: VertexFeature, alpha: Double): VFeatureCombiner
- def addToTss(feature: VertexFeature): VFeatureCombiner
- def updateFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner
- def updateFeature(feature: VertexFeature): VFeatureCombiner
- def isSet: Boolean
- def dropFeature: Boolean
- def setFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner
- def setFeature(feature: VertexFeature): VFeatureCombiner
- def getFinalFeature(totalDays: Int): Option[VertexFeature]
-}
-
-case class WeightedAdditiveVertexCombiner(
- override val vertexFeature: Option[VertexFeature] = None,
- override val startingDay: Int = Integer.MAX_VALUE,
- override val endingDay: Int = Integer.MIN_VALUE,
- override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None)
- extends VFeatureCombiner {
- override def updateTss(
- feature: VertexFeature,
- alpha: Double
- ): WeightedAdditiveVertexCombiner = copy(timeSeriesStatistics = timeSeriesStatistics.map(tss =>
- InteractionGraphUtils.updateTimeSeriesStatistics(tss, feature.tss.mean, alpha)))
-
- override def addToTss(feature: VertexFeature): WeightedAdditiveVertexCombiner =
- copy(timeSeriesStatistics = timeSeriesStatistics.map(tss =>
- InteractionGraphUtils.addToTimeSeriesStatistics(tss, feature.tss.mean)))
-
- override def updateFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner = {
- updateTss(feature, alpha).copy(
- vertexFeature,
- startingDay = startingDay,
- endingDay = Math.max(endingDay, day)
- )
- }
-
- override def updateFeature(feature: VertexFeature): VFeatureCombiner =
- addToTss(feature)
-
- override def setFeature(feature: VertexFeature, alpha: Double, day: Int): VFeatureCombiner = {
- val newStartingDay = Math.min(startingDay, day)
- val newEndingDay = Math.max(endingDay, day)
-
- val numDaysSinceLast =
- if (feature.tss.numDaysSinceLast.exists(_ > 0))
- feature.tss.numDaysSinceLast
- else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1)
-
- val tss = feature.tss.copy(numDaysSinceLast = numDaysSinceLast)
-
- val newFeature = VertexFeature(
- name = feature.name,
- outgoing = feature.outgoing,
- tss = tss
- )
-
- WeightedAdditiveVertexCombiner(
- Some(newFeature),
- newStartingDay,
- newEndingDay,
- Some(tss)
- )
- }
-
- def getFinalFeature(totalDays: Int): Option[VertexFeature] = {
- if (vertexFeature.isEmpty || dropFeature) return None
-
- val newTss = if (totalDays > 0) {
- val elapsed =
- timeSeriesStatistics.map(tss => tss.numElapsedDays + totalDays - 1 - startingDay)
- val latest =
- if (endingDay > 0) Some(totalDays - endingDay)
- else timeSeriesStatistics.map(tss => tss.numDaysSinceLast.get + totalDays - 1)
-
- timeSeriesStatistics.map(tss =>
- tss.copy(
- numElapsedDays = elapsed.get,
- numDaysSinceLast = latest
- ))
- } else timeSeriesStatistics
-
- vertexFeature.map(vf => vf.copy(tss = newTss.get))
- }
-
- override def setFeature(feature: VertexFeature): VFeatureCombiner = setFeature(feature, 1.0, 0)
- override def isSet: Boolean = vertexFeature.isDefined
- override def dropFeature: Boolean =
- timeSeriesStatistics.exists(tss =>
- tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) &&
- tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE)
-}
-
-/**
- * This combiner always replaces the old value with the current. Ignores time-decays.
- */
-case class ReplacementVertexCombiner(
- override val vertexFeature: Option[VertexFeature] = None,
- override val startingDay: Int = Integer.MAX_VALUE,
- override val endingDay: Int = Integer.MIN_VALUE,
- override val timeSeriesStatistics: Option[TimeSeriesStatistics] = None)
- extends VFeatureCombiner {
- override def updateTss(
- feature: VertexFeature,
- alpha: Double
- ): ReplacementVertexCombiner = setFeature(feature, 1.0, 0)
-
- override def addToTss(feature: VertexFeature): ReplacementVertexCombiner =
- setFeature(feature, 1.0, 0)
-
- override def updateFeature(
- feature: VertexFeature,
- alpha: Double,
- day: Int
- ): ReplacementVertexCombiner = updateTss(feature, alpha).copy(
- vertexFeature,
- startingDay = startingDay,
- endingDay = Math.max(endingDay, day)
- )
-
- override def updateFeature(feature: VertexFeature): ReplacementVertexCombiner =
- addToTss(feature)
-
- override def setFeature(
- feature: VertexFeature,
- alpha: Double,
- day: Int
- ): ReplacementVertexCombiner = {
- val newStartingDay = Math.min(startingDay, day)
- val newEndingDay = Math.max(endingDay, day)
-
- val numDaysSinceLast =
- if (feature.tss.numDaysSinceLast.exists(_ > 0))
- feature.tss.numDaysSinceLast
- else Some(feature.tss.numElapsedDays - feature.tss.numNonZeroDays + 1)
-
- val tss = feature.tss.copy(numDaysSinceLast = numDaysSinceLast)
-
- val newFeature = VertexFeature(
- name = feature.name,
- outgoing = feature.outgoing,
- tss = tss
- )
-
- ReplacementVertexCombiner(
- Some(newFeature),
- newStartingDay,
- newEndingDay,
- Some(tss)
- )
- }
-
- override def getFinalFeature(totalDays: Int): Option[VertexFeature] = {
- if (vertexFeature.isEmpty || dropFeature) return None
- if (timeSeriesStatistics.exists(tss => tss.ewma < 1.0)) return None
- val newTss = if (totalDays > 0) {
- val latest =
- if (endingDay > 0) totalDays - endingDay
- else timeSeriesStatistics.get.numDaysSinceLast.get + totalDays - 1
-
- timeSeriesStatistics.map(tss =>
- tss.copy(
- numElapsedDays = 1,
- numDaysSinceLast = Some(latest)
- ))
- } else timeSeriesStatistics
-
- vertexFeature.map(vf => vf.copy(tss = newTss.get))
- }
-
- override def setFeature(feature: VertexFeature): VFeatureCombiner = setFeature(feature, 1.0, 0)
- override def isSet: Boolean = vertexFeature.isDefined
- override def dropFeature: Boolean =
- timeSeriesStatistics.exists(tss =>
- tss.numDaysSinceLast.exists(_ > InteractionGraphUtils.MAX_DAYS_RETENTION) &&
- tss.ewma < InteractionGraphUtils.MIN_FEATURE_VALUE)
-}
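For reference, the daily fold the combiners above delegate to (InteractionGraphUtils.updateTimeSeriesStatistics, whose tail appears at the top of this section) is a Welford-style mean/variance update plus a decayed sum. Below is a minimal, dependency-free sketch of that update; `Tss` is a hypothetical stand-in for the thrift TimeSeriesStatistics struct, reduced to the fields the update touches.

```
object TssSketch {
  final case class Tss(
    mean: Double,
    m2ForVariance: Double,
    ewma: Double,
    numNonZeroDays: Int)

  // Welford-style running mean/variance plus a decayed sum, mirroring the
  // updateTimeSeriesStatistics body shown at the top of this section.
  def update(tss: Tss, currValue: Double, alpha: Double): Tss = {
    val numNonZeroDays = tss.numNonZeroDays + 1
    val delta = currValue - tss.mean
    val updatedMean = tss.mean + delta / numNonZeroDays
    Tss(
      mean = updatedMean,
      m2ForVariance = tss.m2ForVariance + delta * (currValue - updatedMean),
      ewma = alpha * currValue + tss.ewma, // same expression as the deleted code
      numNonZeroDays = numNonZeroDays
    )
  }

  def main(args: Array[String]): Unit = {
    val day0 = Tss(mean = 0.0, m2ForVariance = 0.0, ewma = 0.0, numNonZeroDays = 0)
    println(update(day0, currValue = 3.0, alpha = 1.0))
    // Tss(3.0, 0.0, 3.0, 1): after one non-zero day, mean == ewma == the day's value
  }
}
```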
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD b/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD
deleted file mode 100644
index f06c0c08d..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD
+++ /dev/null
@@ -1,49 +0,0 @@
-scala_library(
- name = "labels",
- sources = ["*.scala"],
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- ":interaction_graph_labels_daily-scala",
- "beam-internal/src/main/scala/com/twitter/beam/io/dal",
- "beam-internal/src/main/scala/com/twitter/scio_internal/job",
- "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
- "consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read",
- "socialgraph/hadoop/src/main/scala/com/twitter/socialgraph/hadoop:socialgraph-follow-events-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_edge_daily-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_email:interaction_graph_extended_email_edge_daily-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_notifications:interaction_graph_agg_notifications_edge_daily-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_retweets:interaction_graph_extended_retweet_edge_daily-scala",
- "src/scala/com/twitter/interaction_graph/scio/agg_shares:interaction_graph_extended_share_edge_daily-scala",
- "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam",
- ],
-)
-
-jvm_binary(
- name = "interaction_graph_labels",
- main = "com.twitter.interaction_graph.scio.ml.labels.InteractionGraphLabelsJob",
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- ":labels",
- ],
-)
-
-create_datasets(
- base_name = "interaction_graph_labels_daily",
- description = "Daily labels",
- java_schema = "com.twitter.interaction_graph.thriftjava.EdgeLabel",
- platform = "java8",
- role = "cassowary",
- scala_schema = "com.twitter.interaction_graph.thriftscala.EdgeLabel",
- segment_type = "partitioned",
- tags = ["bazel-compatible"],
- java_dependencies = [
- "src/thrift/com/twitter/interaction_graph:interaction_graph-java",
- ],
- scala_dependencies = [
- "src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
- ],
-)
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx
new file mode 100644
index 000000000..07b5ce027
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/BUILD.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx
new file mode 100644
index 000000000..f99e381fd
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala
deleted file mode 100644
index a6d9999c8..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsJob.scala
+++ /dev/null
@@ -1,123 +0,0 @@
-package com.twitter.interaction_graph.scio.ml.labels
-
-import com.google.api.services.bigquery.model.TimePartitioning
-import com.spotify.scio.ScioContext
-import com.spotify.scio.values.SCollection
-import com.twitter.beam.io.dal.DAL
-import com.twitter.beam.io.fs.multiformat.DiskFormat
-import com.twitter.beam.io.fs.multiformat.PathLayout
-import com.twitter.beam.io.fs.multiformat.WriteOptions
-import com.twitter.beam.job.ServiceIdentifierOptions
-import com.twitter.cde.scio.dal_read.SourceUtil
-import com.twitter.conversions.DurationOps._
-import com.twitter.dal.client.dataset.TimePartitionedDALDataset
-import com.twitter.interaction_graph.scio.agg_client_event_logs.InteractionGraphAggClientEventLogsEdgeDailyScalaDataset
-import com.twitter.interaction_graph.scio.agg_direct_interactions.InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset
-import com.twitter.interaction_graph.scio.agg_notifications.InteractionGraphAggNotificationsEdgeDailyScalaDataset
-import com.twitter.interaction_graph.thriftscala.Edge
-import com.twitter.interaction_graph.thriftscala.EdgeLabel
-import com.twitter.scio_internal.job.ScioBeamJob
-import com.twitter.socialgraph.event.thriftscala.FollowEvent
-import com.twitter.socialgraph.hadoop.SocialgraphFollowEventsScalaDataset
-import com.twitter.statebird.v2.thriftscala.Environment
-import com.twitter.tcdc.bqblaster.beam.syntax._
-import com.twitter.tcdc.bqblaster.core.avro.TypedProjection
-import com.twitter.tcdc.bqblaster.core.transform.RootTransform
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO
-import org.joda.time.Interval
-
-object InteractionGraphLabelsJob extends ScioBeamJob[InteractionGraphLabelsOption] {
-
- override protected def configurePipeline(
- scioContext: ScioContext,
- pipelineOptions: InteractionGraphLabelsOption
- ): Unit = {
- @transient
- implicit lazy val sc: ScioContext = scioContext
- implicit lazy val dateInterval: Interval = pipelineOptions.interval
-
- val bqTableName: String = pipelineOptions.getBqTableName
- val dalEnvironment: String = pipelineOptions
- .as(classOf[ServiceIdentifierOptions])
- .getEnvironment()
- val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) {
- pipelineOptions.getDALWriteEnvironment
- } else {
- dalEnvironment
- }
-
- def readPartition[T: Manifest](dataset: TimePartitionedDALDataset[T]): SCollection[T] = {
- SourceUtil.readDALDataset[T](
- dataset = dataset,
- interval = dateInterval,
- dalEnvironment = dalEnvironment
- )
- }
-
- val follows = readPartition[FollowEvent](SocialgraphFollowEventsScalaDataset)
- .flatMap(LabelUtil.fromFollowEvent)
-
- val directInteractions =
- readPartition[Edge](InteractionGraphAggDirectInteractionsEdgeDailyScalaDataset)
- .flatMap(LabelUtil.fromInteractionGraphEdge)
-
- val clientEvents =
- readPartition[Edge](InteractionGraphAggClientEventLogsEdgeDailyScalaDataset)
- .flatMap(LabelUtil.fromInteractionGraphEdge)
-
- val pushEvents =
- readPartition[Edge](InteractionGraphAggNotificationsEdgeDailyScalaDataset)
- .flatMap(LabelUtil.fromInteractionGraphEdge)
-
-
- val labels = groupLabels(
- follows ++
- directInteractions ++
- clientEvents ++
- pushEvents)
-
- labels.saveAsCustomOutput(
- "Write Edge Labels",
- DAL.write[EdgeLabel](
- InteractionGraphLabelsDailyScalaDataset,
- PathLayout.DailyPath(pipelineOptions.getOutputPath),
- dateInterval,
- DiskFormat.Parquet,
- Environment.valueOf(dalWriteEnvironment),
- writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
- )
- )
-
- // save to BQ
- if (pipelineOptions.getBqTableName != null) {
- val ingestionTime = pipelineOptions.getDate().value.getStart.toDate
- val bqFieldsTransform = RootTransform
- .Builder()
- .withPrependedFields("dateHour" -> TypedProjection.fromConstant(ingestionTime))
- val timePartitioning = new TimePartitioning()
- .setType("DAY").setField("dateHour").setExpirationMs(90.days.inMilliseconds)
- val bqWriter = BigQueryIO
- .write[EdgeLabel]
- .to(bqTableName)
- .withExtendedErrorInfo()
- .withTimePartitioning(timePartitioning)
- .withLoadJobProjectId("twttr-recos-ml-prod")
- .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy)
- .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
- .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND)
- labels
- .saveAsCustomOutput(
- s"Save Recommendations to BQ $bqTableName",
- bqWriter
- )
- }
-
- }
-
- def groupLabels(labels: SCollection[EdgeLabel]): SCollection[EdgeLabel] = {
- labels
- .map { e: EdgeLabel => ((e.sourceId, e.destinationId), e.labels.toSet) }
- .sumByKey
- .map { case ((srcId, destId), labels) => EdgeLabel(srcId, destId, labels) }
- }
-}
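The core of the job above is `groupLabels`: for each (source, destination) edge it unions the daily label sets (Scio's `sumByKey` with a Set semigroup). A collections-only sketch of the same computation, with `EdgeLabel` as a simplified stand-in for the thrift struct:

```
object GroupLabelsSketch {
  final case class EdgeLabel(sourceId: Long, destinationId: Long, labels: Set[String])

  // Same result as the Scio version: one record per edge, labels unioned across days.
  def groupLabels(labels: Seq[EdgeLabel]): Seq[EdgeLabel] =
    labels
      .groupBy(e => (e.sourceId, e.destinationId))
      .map { case ((src, dest), es) => EdgeLabel(src, dest, es.flatMap(_.labels).toSet) }
      .toSeq

  def main(args: Array[String]): Unit = {
    val in = Seq(
      EdgeLabel(1L, 2L, Set("NumFollows")),
      EdgeLabel(1L, 2L, Set("NumFavorites")),
      EdgeLabel(1L, 3L, Set("NumRetweets")))
    groupLabels(in).foreach(println)
    // EdgeLabel(1,2,Set(NumFollows, NumFavorites)) and EdgeLabel(1,3,Set(NumRetweets))
  }
}
```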
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx
new file mode 100644
index 000000000..aed20ad43
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala
deleted file mode 100644
index 7c0a9a27a..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/InteractionGraphLabelsOption.scala
+++ /dev/null
@@ -1,28 +0,0 @@
-package com.twitter.interaction_graph.scio.ml.labels
-
-import com.twitter.beam.io.dal.DALOptions
-import com.twitter.beam.job.DateRangeOptions
-import org.apache.beam.sdk.options.Default
-import org.apache.beam.sdk.options.Description
-import org.apache.beam.sdk.options.Validation.Required
-
-trait InteractionGraphLabelsOption extends DALOptions with DateRangeOptions {
- @Required
- @Description("Output path for storing the final dataset")
- def getOutputPath: String
- def setOutputPath(value: String): Unit
-
- @Description("Output bq table name")
- def getBqTableName: String
- def setBqTableName(value: String): Unit
-
- @Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
- @Default.String("PROD")
- def getDALWriteEnvironment: String
- def setDALWriteEnvironment(value: String): Unit
-
- @Description("Number of shards/partitions for saving the final dataset.")
- @Default.Integer(10)
- def getNumberOfShards: Integer
- def setNumberOfShards(value: Integer): Unit
-}
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx
new file mode 100644
index 000000000..a3794b998
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala b/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala
deleted file mode 100644
index 350c86c84..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/LabelUtil.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-package com.twitter.interaction_graph.scio.ml.labels
-
-import com.spotify.scio.ScioMetrics
-import com.twitter.interaction_graph.thriftscala.EdgeFeature
-import com.twitter.interaction_graph.thriftscala.EdgeLabel
-import com.twitter.interaction_graph.thriftscala.FeatureName
-import com.twitter.interaction_graph.thriftscala.{Edge => TEdge}
-import com.twitter.socialgraph.event.thriftscala.FollowEvent
-
-object LabelUtil {
-
- val LabelExplicit = Set(
- FeatureName.NumFollows,
- FeatureName.NumFavorites,
- FeatureName.NumRetweets,
- FeatureName.NumMentions,
- FeatureName.NumTweetQuotes,
- FeatureName.NumPhotoTags,
- FeatureName.NumRtFavories,
- FeatureName.NumRtReplies,
- FeatureName.NumRtTweetQuotes,
- FeatureName.NumRtRetweets,
- FeatureName.NumRtMentions,
- FeatureName.NumShares,
- FeatureName.NumReplies,
- )
-
- val LabelImplicit = Set(
- FeatureName.NumTweetClicks,
- FeatureName.NumProfileViews,
- FeatureName.NumLinkClicks,
- FeatureName.NumPushOpens,
- FeatureName.NumNtabClicks,
- FeatureName.NumRtTweetClicks,
- FeatureName.NumRtLinkClicks,
- FeatureName.NumEmailOpen,
- FeatureName.NumEmailClick,
- )
-
- val LabelSet = (LabelExplicit ++ LabelImplicit).map(_.value)
-
- def fromFollowEvent(f: FollowEvent): Option[EdgeLabel] = {
- for {
- srcId <- f.sourceId
- destId <- f.targetId
- } yield EdgeLabel(srcId, destId, labels = Set(FeatureName.NumFollows))
- }
-
- def fromInteractionGraphEdge(e: TEdge): Option[EdgeLabel] = {
- val labels = e.features.collect {
- case EdgeFeature(featureName: FeatureName, _) if LabelSet.contains(featureName.value) =>
- ScioMetrics.counter("fromInteractionGraphEdge", featureName.toString).inc()
- featureName
- }.toSet
- if (labels.nonEmpty) {
- Some(EdgeLabel(e.sourceId, e.destinationId, labels))
- } else None
- }
-
- def toTEdge(e: EdgeLabel): EdgeLabel = {
- EdgeLabel(e.sourceId, e.destinationId, labels = e.labels)
- }
-}
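`fromInteractionGraphEdge` keeps only the feature names that are in the allowed label set and emits a label record only when any survive. A self-contained sketch of that filter, with simplified stand-ins for the thrift types and an illustrative (much smaller) label set:

```
object LabelExtractionSketch {
  final case class EdgeFeature(name: String)
  final case class Edge(sourceId: Long, destinationId: Long, features: Seq[EdgeFeature])
  final case class EdgeLabel(sourceId: Long, destinationId: Long, labels: Set[String])

  // illustrative subset; the real LabelSet is LabelExplicit ++ LabelImplicit above
  val labelSet = Set("NumFollows", "NumFavorites", "NumTweetClicks")

  def fromEdge(e: Edge): Option[EdgeLabel] = {
    val labels = e.features.collect {
      case EdgeFeature(name) if labelSet.contains(name) => name
    }.toSet
    if (labels.nonEmpty) Some(EdgeLabel(e.sourceId, e.destinationId, labels)) else None
  }

  def main(args: Array[String]): Unit = {
    val e = Edge(1L, 2L, Seq(EdgeFeature("NumFavorites"), EdgeFeature("SomeOtherFeature")))
    println(fromEdge(e)) // Some(EdgeLabel(1,2,Set(NumFavorites)))
    println(fromEdge(Edge(1L, 3L, Seq(EdgeFeature("SomeOtherFeature"))))) // None
  }
}
```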
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx
new file mode 100644
index 000000000..b57cc7c91
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md b/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md
deleted file mode 100644
index f67a624fb..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/labels/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## InteractionGraphLabels Dataflow Job
-
-#### IntelliJ
-```
-fastpass create --name rg_labels --intellij src/scala/com/twitter/interaction_graph/scio/ml/labels
-```
-
-#### Compile
-```
-bazel build src/scala/com/twitter/interaction_graph/scio/ml/labels:interaction_graph_labels
-```
-
-#### Build Jar
-```
-bazel bundle src/scala/com/twitter/interaction_graph/scio/ml/labels:interaction_graph_labels
-```
-
-#### Run Scheduled Job
-```
-export PROJECTID=twttr-recos-ml-prod
-export REGION=us-central1
-export JOB_NAME=interaction-graph-labels-dataflow
-
-bin/d6w schedule \
- ${PROJECTID}/${REGION}/${JOB_NAME} \
- src/scala/com/twitter/interaction_graph/scio/ml/labels/config.d6w \
- --bind=profile.user_name=cassowary \
- --bind=profile.project=${PROJECTID} \
- --bind=profile.region=${REGION} \
- --bind=profile.job_name=${JOB_NAME} \
- --bind=profile.environment=prod \
- --bind=profile.date=2022-05-15 \
- --bind=profile.output_path=processed/interaction_graph/labels
-```
\ No newline at end of file
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD
deleted file mode 100644
index f5f1cacc2..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD
+++ /dev/null
@@ -1,54 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- ":real_graph_in_scores-scala",
- ":real_graph_oon_scores-scala",
- "beam-internal/src/main/scala/com/twitter/beam/io/dal",
- "beam-internal/src/main/scala/com/twitter/scio_internal/job",
- "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
- "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam",
- ],
-)
-
-jvm_binary(
- name = "interaction_graph_scores_scio",
- main = "com.twitter.interaction_graph.scio.ml.scores.InteractionGraphScoreExportJob",
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- ":scores",
- ],
-)
-
-create_datasets(
- base_name = "real_graph_in_scores",
- description = "Real Graph in network scores",
- key_type = "Long",
- platform = "java8",
- role = "cassowary",
- scala_schema = "com.twitter.wtf.scalding.jobs.injection.CandidateSeqInjection.injection",
- segment_type = "snapshot",
- tags = ["bazel-compatible"],
- val_type = "com.twitter.wtf.candidate.thriftscala.CandidateSeq",
- scala_dependencies = [
- "src/scala/com/twitter/wtf/scalding/jobs/injection",
- ],
-)
-
-create_datasets(
- base_name = "real_graph_oon_scores",
- description = "Real Graph OON Scores",
- key_type = "Long",
- platform = "java8",
- role = "cassowary",
- scala_schema = "com.twitter.wtf.scalding.jobs.injection.CandidateSeqInjection.injection",
- segment_type = "snapshot",
- tags = ["bazel-compatible"],
- val_type = "com.twitter.wtf.candidate.thriftscala.CandidateSeq",
- scala_dependencies = [
- "src/scala/com/twitter/wtf/scalding/jobs/injection",
- ],
-)
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx
new file mode 100644
index 000000000..de94002c6
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/BUILD.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx
new file mode 100644
index 000000000..b51c1f022
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala
deleted file mode 100644
index 85e2284c2..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportJob.scala
+++ /dev/null
@@ -1,134 +0,0 @@
-package com.twitter.interaction_graph.scio.ml.scores
-
-import com.google.cloud.bigquery.BigQueryOptions
-import com.google.cloud.bigquery.QueryJobConfiguration
-import com.spotify.scio.ScioContext
-import com.spotify.scio.values.SCollection
-import com.twitter.beam.io.dal.DAL
-import com.twitter.beam.io.exception.DataNotFoundException
-import com.twitter.beam.io.fs.multiformat.PathLayout
-import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
-import com.twitter.scio_internal.job.ScioBeamJob
-import com.twitter.wtf.candidate.thriftscala.Candidate
-import com.twitter.wtf.candidate.thriftscala.CandidateSeq
-import com.twitter.wtf.candidate.thriftscala.ScoredEdge
-import org.apache.avro.generic.GenericRecord
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO
-import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead
-import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord
-import org.apache.beam.sdk.transforms.SerializableFunction
-import scala.collection.JavaConverters._
-
-object InteractionGraphScoreExportJob extends ScioBeamJob[InteractionGraphScoreExportOption] {
-
- // to parse the latest date from the BQ table we're reading from
- val parseDateRow = new SerializableFunction[SchemaAndRecord, String] {
- override def apply(input: SchemaAndRecord): String = {
- val genericRecord: GenericRecord = input.getRecord()
- genericRecord.get("ds").toString
- }
- }
-
- // to parse each row from the BQ table we're reading from
- val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] {
- override def apply(record: SchemaAndRecord): ScoredEdge = {
- val genericRecord: GenericRecord = record.getRecord()
- ScoredEdge(
- genericRecord.get("source_id").asInstanceOf[Long],
- genericRecord.get("destination_id").asInstanceOf[Long],
- genericRecord.get("prob").asInstanceOf[Double],
- genericRecord.get("followed").asInstanceOf[Boolean],
- )
- }
- }
-
- override def runPipeline(
- sc: ScioContext,
- opts: InteractionGraphScoreExportOption
- ): Unit = {
-
- val dateStr: String = opts.getDate().value.getStart.toString("yyyyMMdd")
- logger.info(s"dateStr $dateStr")
- val project: String = "twttr-recos-ml-prod"
- val datasetName: String = "realgraph"
- val bqTableName: String = "scores"
- val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
-
- if (opts.getDALWriteEnvironment == "PROD") {
- val bqClient =
- BigQueryOptions.newBuilder.setProjectId("twttr-recos-ml-prod").build.getService
- val query =
- s"""
- |SELECT total_rows
- |FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS`
- |WHERE partition_id ="$dateStr" AND
- |table_name="$bqTableName" AND total_rows > 0
- |""".stripMargin
- val queryConfig = QueryJobConfiguration.of(query)
- val results = bqClient.query(queryConfig).getValues.asScala.toSeq
- if (results.isEmpty || results.head.get(0).getLongValue == 0) {
- throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.")
- }
- }
- sc.run()
- }
-
- override protected def configurePipeline(
- sc: ScioContext,
- opts: InteractionGraphScoreExportOption
- ): Unit = {
-
- val dateStr: String = opts.getDate().value.getStart.toString("yyyy-MM-dd")
- logger.info(s"dateStr $dateStr")
- val project: String = "twttr-recos-ml-prod"
- val datasetName: String = "realgraph"
- val bqTableName: String = "scores"
- val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
-
- val scoreExport: SCollection[ScoredEdge] = sc
- .customInput(
- s"Read from BQ table $fullBqTableName",
- BigQueryIO
- .read(parseRow)
- .from(fullBqTableName)
- .withSelectedFields(List("source_id", "destination_id", "prob", "followed").asJava)
- .withRowRestriction(s"ds = '$dateStr'")
- .withMethod(TypedRead.Method.DIRECT_READ)
- )
-
- val inScores = scoreExport
- .collect {
- case ScoredEdge(src, dest, score, true) =>
- (src, Candidate(dest, score))
- }
- .groupByKey
- .map {
- case (src, candidateIter) => KeyVal(src, CandidateSeq(candidateIter.toSeq.sortBy(-_.score)))
- }
-
- val outScores = scoreExport
- .collect {
- case ScoredEdge(src, dest, score, false) =>
- (src, Candidate(dest, score))
- }
- .groupByKey
- .map {
- case (src, candidateIter) => KeyVal(src, CandidateSeq(candidateIter.toSeq.sortBy(-_.score)))
- }
-
- inScores.saveAsCustomOutput(
- "Write real_graph_in_scores",
- DAL.writeVersionedKeyVal(
- RealGraphInScoresScalaDataset,
- PathLayout.VersionedPath(opts.getOutputPath + "/in"),
- )
- )
- outScores.saveAsCustomOutput(
- "Write real_graph_oon_scores",
- DAL.writeVersionedKeyVal(
- RealGraphOonScoresScalaDataset,
- PathLayout.VersionedPath(opts.getOutputPath + "/oon"),
- )
- )
- }
-}
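The export splits scored edges on the `followed` bit into in-network and out-of-network candidate lists, grouped by source user and sorted by descending score. A collections-only sketch of that split, with simplified stand-ins for the thrift types (the KeyVal packaging and DAL write are omitted):

```
object ScoreExportSketch {
  final case class ScoredEdge(src: Long, dest: Long, score: Double, followed: Boolean)
  final case class Candidate(dest: Long, score: Double)

  def toCandidateLists(
    edges: Seq[ScoredEdge],
    followed: Boolean
  ): Map[Long, Seq[Candidate]] =
    edges
      .filter(_.followed == followed)
      .groupBy(_.src)
      .map { case (src, es) => src -> es.map(e => Candidate(e.dest, e.score)).sortBy(-_.score) }

  def main(args: Array[String]): Unit = {
    val edges = Seq(
      ScoredEdge(1L, 10L, 0.9, followed = true),
      ScoredEdge(1L, 11L, 0.2, followed = false),
      ScoredEdge(1L, 12L, 0.7, followed = true))
    println(toCandidateLists(edges, followed = true))  // in-network: dests 10 then 12
    println(toCandidateLists(edges, followed = false)) // out-of-network: dest 11
  }
}
```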
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx
new file mode 100644
index 000000000..3dcd0249f
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala b/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala
deleted file mode 100644
index 3b55c517b..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/InteractionGraphScoreExportOption.scala
+++ /dev/null
@@ -1,24 +0,0 @@
-package com.twitter.interaction_graph.scio.ml.scores
-
-import com.twitter.beam.io.dal.DALOptions
-import com.twitter.beam.job.DateRangeOptions
-import org.apache.beam.sdk.options.Default
-import org.apache.beam.sdk.options.Description
-import org.apache.beam.sdk.options.Validation.Required
-
-trait InteractionGraphScoreExportOption extends DALOptions with DateRangeOptions {
- @Required
- @Description("Output path for storing the final dataset")
- def getOutputPath: String
- def setOutputPath(value: String): Unit
-
- @Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
- @Default.String("PROD")
- def getDALWriteEnvironment: String
- def setDALWriteEnvironment(value: String): Unit
-
- @Description("Number of shards/partitions for saving the final dataset.")
- @Default.Integer(1000)
- def getNumberOfShards: Integer
- def setNumberOfShards(value: Integer): Unit
-}
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx
new file mode 100644
index 000000000..91fa58ef4
Binary files /dev/null and b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.docx differ
diff --git a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md b/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md
deleted file mode 100644
index 51ace9d9a..000000000
--- a/src/scala/com/twitter/interaction_graph/scio/ml/scores/README.md
+++ /dev/null
@@ -1,34 +0,0 @@
-## InteractionGraphScoreExport Dataflow Job
-
-#### IntelliJ
-```
-fastpass create --name rg_scores --intellij src/scala/com/twitter/interaction_graph/scio/ml/scores
-```
-
-#### Compile
-```
-bazel build src/scala/com/twitter/interaction_graph/scio/ml/scores
-```
-
-#### Build Jar
-```
-bazel bundle src/scala/com/twitter/interaction_graph/scio/ml/scores
-```
-
-#### Run Scheduled Job
-```
-export PROJECTID=twttr-recos-ml-prod
-export REGION=us-central1
-export JOB_NAME=interaction-graph-scores-dataflow
-
-bin/d6w schedule \
- ${PROJECTID}/${REGION}/${JOB_NAME} \
- src/scala/com/twitter/interaction_graph/scio/ml/scores/config.d6w \
- --bind=profile.user_name=cassowary \
- --bind=profile.project=${PROJECTID} \
- --bind=profile.region=${REGION} \
- --bind=profile.job_name=${JOB_NAME} \
- --bind=profile.environment=prod \
- --bind=profile.date=2022-06-23 \
- --bind=profile.output_path=manhattan_sequence_files/real_graph_scores_v2
-```
\ No newline at end of file
diff --git a/src/scala/com/twitter/recos/decider/BUILD b/src/scala/com/twitter/recos/decider/BUILD
deleted file mode 100644
index d1eb8d74f..000000000
--- a/src/scala/com/twitter/recos/decider/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "decider/src/main/scala",
- "src/scala/com/twitter/recos/util:recos-util",
- ],
-)
diff --git a/src/scala/com/twitter/recos/decider/BUILD.docx b/src/scala/com/twitter/recos/decider/BUILD.docx
new file mode 100644
index 000000000..5ea409ce1
Binary files /dev/null and b/src/scala/com/twitter/recos/decider/BUILD.docx differ
diff --git a/src/scala/com/twitter/recos/decider/BaseDecider.docx b/src/scala/com/twitter/recos/decider/BaseDecider.docx
new file mode 100644
index 000000000..613942ed5
Binary files /dev/null and b/src/scala/com/twitter/recos/decider/BaseDecider.docx differ
diff --git a/src/scala/com/twitter/recos/decider/BaseDecider.scala b/src/scala/com/twitter/recos/decider/BaseDecider.scala
deleted file mode 100644
index 841963631..000000000
--- a/src/scala/com/twitter/recos/decider/BaseDecider.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-package com.twitter.recos.decider
-
-import com.twitter.decider.Decider
-import com.twitter.decider.DeciderFactory
-import com.twitter.decider.RandomRecipient
-import com.twitter.decider.Recipient
-import com.twitter.decider.SimpleRecipient
-import com.twitter.recos.util.TeamUsers
-
-case class GuestRecipient(id: Long) extends Recipient {
- override def isGuest: Boolean = true
-}
-
-sealed trait BaseDecider {
- def baseConfig: Option[String] = None
-
- def overlayConfig: Option[String] = None
-
- lazy val decider: Decider = DeciderFactory(baseConfig, overlayConfig)()
-
- def isAvailable(feature: String, recipient: Option[Recipient]): Boolean =
- decider.isAvailable(feature, recipient)
-
- def isAvailable(feature: String): Boolean = isAvailable(feature, None)
-
- def isAvailableExceptTeam(feature: String, id: Long, isUser: Boolean = true): Boolean = {
- if (isUser) TeamUsers.team.contains(id) || isAvailable(feature, Some(SimpleRecipient(id)))
- else isAvailable(feature, Some(GuestRecipient(id)))
- }
-}
-
-case class RecosDecider(env: String, cluster: String = "atla") extends BaseDecider {
- override val baseConfig = Some("/com/twitter/recos/config/decider.yml")
- override val overlayConfig = Some(
- s"/usr/local/config/overlays/recos/service/prod/$cluster/decider_overlay.yml"
- )
-
- def shouldCompute(id: Long, displayLocation: String, isUser: Boolean = true): Boolean = {
- isAvailableExceptTeam(RecosDecider.recosIncomingTraffic + "_" + displayLocation, id, isUser)
- }
-
- def shouldReturn(id: Long, displayLocation: String, isUser: Boolean = true): Boolean = {
- isAvailableExceptTeam(RecosDecider.recosShouldReturn + "_" + displayLocation, id, isUser)
- }
-
- def shouldDarkmode(experiment: String): Boolean = {
- isAvailable(RecosDecider.recosShouldDark + "_exp_" + experiment, None)
- }
-
- def shouldScribe(id: Long, isUser: Boolean = true): Boolean = {
- if (isUser) (id > 0) && isAvailableExceptTeam(RecosDecider.recosShouldScribe, id, isUser)
- else false // TODO: define the behavior for guests
- }
-
- def shouldWriteMomentCapsuleOpenEdge(): Boolean = {
- val capsuleOpenDecider = env match {
- case "prod" => RecosDecider.recosShouldWriteMomentCapsuleOpenEdge
- case _ => RecosDecider.recosShouldWriteMomentCapsuleOpenEdge + RecosDecider.testSuffix
- }
-
- isAvailable(capsuleOpenDecider, Some(RandomRecipient))
- }
-}
-
-object RecosDecider {
- val testSuffix = "_test"
-
- val recosIncomingTraffic: String = "recos_incoming_traffic"
- val recosShouldReturn: String = "recos_should_return"
- val recosShouldDark: String = "recos_should_dark"
- val recosRealtimeBlacklist: String = "recos_realtime_blacklist"
- val recosRealtimeDeveloperlist: String = "recos_realtime_developerlist"
- val recosShouldScribe: String = "recos_should_scribe"
- val recosShouldWriteMomentCapsuleOpenEdge: String = "recos_should_write_moment_capsule_open_edge"
-}
-
-trait GraphDecider extends BaseDecider {
- val graphNamePrefix: String
-
- override val baseConfig = Some("/com/twitter/recos/config/decider.yml")
- override val overlayConfig = Some(
- "/usr/local/config/overlays/recos/service/prod/atla/decider_overlay.yml"
- )
-}
-
-case class UserTweetEntityGraphDecider() extends GraphDecider {
- override val graphNamePrefix: String = "user_tweet_entity_graph"
-
- def tweetSocialProof: Boolean = {
- isAvailable("user_tweet_entity_graph_tweet_social_proof")
- }
-
- def entitySocialProof: Boolean = {
- isAvailable("user_tweet_entity_graph_entity_social_proof")
- }
-
-}
-
-case class UserUserGraphDecider() extends GraphDecider {
- override val graphNamePrefix: String = "user_user_graph"
-}
-
-case class UserTweetGraphDecider(env: String, dc: String) extends GraphDecider {
- override val graphNamePrefix: String = "user-tweet-graph"
-
- override val baseConfig = Some("/com/twitter/recos/config/user-tweet-graph_decider.yml")
- override val overlayConfig = Some(
- s"/usr/local/config/overlays/user-tweet-graph/user-tweet-graph/$env/$dc/decider_overlay.yml"
- )
-}
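`isAvailableExceptTeam` lets team members bypass the decider while everyone else is gated by it. A minimal sketch of that rule with the decider and team membership stubbed out; `deciderIsAvailable`, its hash-based gate, and the feature name are illustrative stand-ins, not the twitter decider API:

```
object DeciderGateSketch {
  val teamUsers: Set[Long] = Set(42L) // stand-in for TeamUsers.team

  // stand-in for Decider#isAvailable(feature, recipient); real deciders gate on a
  // configured availability percentage, modeled here with a simple hash
  def deciderIsAvailable(feature: String, id: Long, availabilityPct: Int): Boolean =
    math.abs((feature.hashCode + id).toInt % 100) < availabilityPct

  def isAvailableExceptTeam(feature: String, id: Long, availabilityPct: Int): Boolean =
    teamUsers.contains(id) || deciderIsAvailable(feature, id, availabilityPct)

  def main(args: Array[String]): Unit = {
    // a team member bypasses the decider even at 0% availability
    println(isAvailableExceptTeam("recos_incoming_traffic_home", 42L, availabilityPct = 0)) // true
    println(isAvailableExceptTeam("recos_incoming_traffic_home", 7L, availabilityPct = 0))  // false
  }
}
```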
diff --git a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx
new file mode 100644
index 000000000..429990e2c
Binary files /dev/null and b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.docx differ
diff --git a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala b/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala
deleted file mode 100644
index 73a06e5af..000000000
--- a/src/scala/com/twitter/recos/decider/EndpointLoadShedder.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-package com.twitter.recos.decider
-
-import com.twitter.decider.Decider
-import com.twitter.decider.RandomRecipient
-import com.twitter.util.Future
-import scala.util.control.NoStackTrace
-
-/*
- Provides deciders-controlled load shedding for a given endpoint.
- The format of the decider keys is:
-
- enable_loadshedding_
").format(elapsed)
- val tweetContent = userEdges.toList
- .map { edge =>
- s"TweetId: ${edge.tweetId},\nAction type: ${edge.actionType},\nCard type: ${edge.cardType}"
- .replaceAll("\n", " ")
- }.mkString("\n
\n")
-
- response.setContentString(
- HTMLUtil.html.replace("XXXXX", comment + tweetContent + "\n
\n" + tweetEdges.toString()))
- Future.value(response)
- }
-
- private def getTweetEdges(tweetId: Long): ListBuffer[Long] = {
- val random = new Random()
- val iterator =
- graph
- .getRandomRightNodeEdges(tweetId, 500, random).asInstanceOf[MultiSegmentIterator[
- BipartiteGraphSegment
- ]]
- val terms = new ListBuffer[Long]()
- if (iterator != null) {
- while (iterator.hasNext) { terms += iterator.nextLong() }
- }
- terms.distinct
- }
-
-}
-
-case class Edge(tweetId: Long, actionType: String, cardType: String)
diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx
new file mode 100644
index 000000000..946016494
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala b/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala
deleted file mode 100644
index 4909e0386..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/UserVideoGraphWriter.scala
+++ /dev/null
@@ -1,82 +0,0 @@
-package com.twitter.recos.user_video_graph
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.finatra.kafka.consumers.FinagleKafkaConsumerBuilder
-import com.twitter.graphjet.algorithms.TweetIDMask
-import com.twitter.graphjet.bipartite.MultiSegmentPowerLawBipartiteGraph
-import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment
-import com.twitter.recos.hose.common.UnifiedGraphWriter
-import com.twitter.recos.internal.thriftscala.RecosHoseMessage
-import com.twitter.recos.serviceapi.Tweetypie._
-
-/**
- * This class submits $numBootstrapWriters graph writer threads (BufferedEdgeWriter)
- * during service startup. One of them is the live writer thread; the other
- * $(numBootstrapWriters - 1) are catch-up writer threads. All of them consume kafka
- * events from an internal concurrent queue, which is populated by kafka reader threads.
- * At bootstrap time, the kafka reader threads look back to a kafka offset from several
- * hours ago and populate the internal concurrent queue. Each graph writer thread writes
- * to its own graph segment. The $(numBootstrapWriters - 1) catch-up writer threads stop
- * once all events between the current system time at startup and the time in memcache
- * are processed. The live writer thread continues to write all incoming kafka events
- * and lives through the entire life cycle of the recos graph service.
- */
-case class UserVideoGraphWriter(
- shardId: String,
- env: String,
- hosename: String,
- bufferSize: Int,
- kafkaConsumerBuilder: FinagleKafkaConsumerBuilder[String, RecosHoseMessage],
- clientId: String,
- statsReceiver: StatsReceiver)
- extends UnifiedGraphWriter[BipartiteGraphSegment, MultiSegmentPowerLawBipartiteGraph] {
- writer =>
- // The max throughput for each kafka consumer is around 25MB/s
- // Use 4 processors for 100MB/s catch-up speed.
- val consumerNum: Int = 4
- // Leave 1 segment to the live writer
- val catchupWriterNum: Int = RecosConfig.maxNumSegments - 1
-
- /**
- * Adds a RecosHoseMessage to the graph. Used by the live writer to insert edges into
- * the current segment.
- */
- override def addEdgeToGraph(
- graph: MultiSegmentPowerLawBipartiteGraph,
- recosHoseMessage: RecosHoseMessage
- ): Unit = {
- graph.addEdge(
- recosHoseMessage.leftId,
- getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card),
- UserVideoEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action),
- )
- }
-
- /**
- * Adds a RecosHoseMessage to the given segment in the graph. Used by catch-up writers
- * to insert edges into non-current (old) segments.
- */
- override def addEdgeToSegment(
- segment: BipartiteGraphSegment,
- recosHoseMessage: RecosHoseMessage
- ): Unit = {
- segment.addEdge(
- recosHoseMessage.leftId,
- getMetaEdge(recosHoseMessage.rightId, recosHoseMessage.card),
- UserVideoEdgeTypeMask.actionTypeToEdgeType(recosHoseMessage.action)
- )
- }
-
- private def getMetaEdge(rightId: Long, cardOption: Option[Byte]): Long = {
- cardOption
- .map { card =>
- if (isPhotoCard(card)) TweetIDMask.photo(rightId)
- else if (isPlayerCard(card)) TweetIDMask.player(rightId)
- else if (isSummaryCard(card)) TweetIDMask.summary(rightId)
- else if (isPromotionCard(card)) TweetIDMask.promotion(rightId)
- else rightId
- }
- .getOrElse(rightId)
- }
-
-}
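A toy sketch of the writer topology described in the class comment above: reader threads back-fill a shared queue, catch-up writers drain it until they pass a watermark and stop, and the live writer keeps consuming. All names, offsets, and termination conditions below are illustrative, not the deleted implementation:

```
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicInteger

object WriterTopologySketch {
  final case class Msg(offsetMillis: Long)

  def main(args: Array[String]): Unit = {
    val queue = new LinkedBlockingQueue[Msg]()
    val watermark = 100L // stand-in for "system time at startup"
    val catchupWritten = new AtomicInteger()
    val liveWritten = new AtomicInteger()

    // catch-up writer: drains the back-filled queue, stops at the watermark
    val catchup = new Thread(() => {
      var done = false
      while (!done) {
        val msg = queue.take()
        if (msg.offsetMillis >= watermark) done = true
        else catchupWritten.incrementAndGet()
      }
    })

    // live writer: would run for the service's lifetime; bounded here for the sketch
    val live = new Thread(() => {
      var draining = true
      while (draining) {
        val msg = queue.poll(200, TimeUnit.MILLISECONDS)
        if (msg == null) draining = false
        else liveWritten.incrementAndGet()
      }
    })

    catchup.start()
    (0L until 110L by 10L).foreach(o => queue.put(Msg(o))) // reader back-fills old events
    catchup.join()
    (110L until 160L by 10L).foreach(o => queue.put(Msg(o))) // fresh events keep arriving
    live.start()
    live.join()
    println(s"catch-up wrote ${catchupWritten.get}, live wrote ${liveWritten.get}")
    // catch-up wrote 10, live wrote 5
  }
}
```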
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD
deleted file mode 100644
index ad9caf129..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/twitter/graphjet",
- "servo/request/src/main/scala",
- "src/scala/com/twitter/recos/user_video_graph/store",
- "src/scala/com/twitter/recos/user_video_graph/util",
- "src/scala/com/twitter/recos/util:recos-util",
- "src/thrift/com/twitter/recos/user_video_graph:user_video_graph-scala",
- ],
-)
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx
new file mode 100644
index 000000000..b8c1754a1
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/BUILD.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx
new file mode 100644
index 000000000..16ac865fe
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala
deleted file mode 100644
index 44a190e0d..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ConsumersBasedRelatedTweetsHandler.scala
+++ /dev/null
@@ -1,66 +0,0 @@
-package com.twitter.recos.user_video_graph.relatedTweetHandlers
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.recos.user_video_graph.thriftscala._
-import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil
-import com.twitter.recos.user_video_graph.util.FilterUtil
-import com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil
-import com.twitter.recos.util.Stats._
-import com.twitter.servo.request._
-import com.twitter.util.Duration
-import com.twitter.util.Future
-import scala.concurrent.duration.HOURS
-
-/**
- * Implementation of the Thrift-defined service interface for consumersTweetBasedRelatedTweets.
- * Given a list of consumer userIds, finds the tweets they co-engaged with (the input
- * userIds are treated as consumers, hence "consumersTweetBasedRelatedTweets").
- * Example use case: given a list of a user's address-book contacts, find tweets those
- * contacts engaged with.
- */
-class ConsumersBasedRelatedTweetsHandler(
- bipartiteGraph: BipartiteGraph,
- statsReceiver: StatsReceiver)
- extends RequestHandler[ConsumersBasedRelatedTweetRequest, RelatedTweetResponse] {
- private val stats = statsReceiver.scope(this.getClass.getSimpleName)
-
- override def apply(request: ConsumersBasedRelatedTweetRequest): Future[RelatedTweetResponse] = {
- trackFutureBlockStats(stats) {
-
- val maxResults = request.maxResults.getOrElse(200)
- val minScore = request.minScore.getOrElse(0.0)
- val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48)
- val minResultDegree = request.minResultDegree.getOrElse(50)
- val minCooccurrence = request.minCooccurrence.getOrElse(3)
- val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet
-
- val consumerSeedSet = request.consumerSeedSet.distinct.filter { userId =>
- val userDegree = bipartiteGraph.getLeftNodeDegree(userId)
- // constrain to users that have <100 engagements to avoid spammy behavior
- userDegree < 100
- }
-
- val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets(
- consumerSeedSet,
- bipartiteGraph
- )
-
- val scorePreFactor = 1000.0 / consumerSeedSet.size
- val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates(
- rhsTweetIds,
- minCooccurrence,
- minResultDegree,
- scorePreFactor,
- bipartiteGraph)
-
- val relatedTweets = relatedTweetCandidates
- .filter(relatedTweet =>
- FilterUtil.tweetAgeFilter(
- relatedTweet.tweetId,
- Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds
- .contains(relatedTweet.tweetId))).take(maxResults)
-
- stats.stat("response_size").add(relatedTweets.size)
- Future.value(RelatedTweetResponse(tweets = relatedTweets))
- }
- }
-}
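The handler delegates scoring to GetRelatedTweetCandidatesUtil, which is not part of this diff. The sketch below shows the general shape under stated assumptions: count how often each candidate tweet co-occurs in the seed users' engagement lists, drop candidates below `minCooccurrence`, and score by co-occurrence times the pre-factor. The degree-based filtering and the exact scoring formula of the real util are omitted:

```
object CooccurrenceSketch {
  final case class RelatedTweet(tweetId: Long, score: Double)

  // rhsTweetIds: tweets engaged by the seed users, with repetition across users
  def relatedTweets(
    rhsTweetIds: Seq[Long],
    minCooccurrence: Int,
    scorePreFactor: Double
  ): Seq[RelatedTweet] =
    rhsTweetIds
      .groupBy(identity)
      .collect { case (tweetId, hits) if hits.size >= minCooccurrence =>
        RelatedTweet(tweetId, scorePreFactor * hits.size)
      }
      .toSeq
      .sortBy(-_.score)

  def main(args: Array[String]): Unit = {
    val rhs = Seq(10L, 11L, 10L, 12L, 10L, 11L) // three seed users' engagements
    println(relatedTweets(rhs, minCooccurrence = 2, scorePreFactor = 1000.0 / 3))
    // tweet 10 (3 hits) outranks tweet 11 (2 hits); tweet 12 falls below minCooccurrence
  }
}
```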
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx
new file mode 100644
index 000000000..93367b0c3
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala
deleted file mode 100644
index 5f26ded6e..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/ProducerBasedRelatedTweetsHandler.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-package com.twitter.recos.user_video_graph.relatedTweetHandlers
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.recos.user_video_graph.thriftscala._
-import com.twitter.recos.util.Stats._
-import com.twitter.servo.request._
-import com.twitter.util.Duration
-import com.twitter.util.Future
-import scala.concurrent.duration.HOURS
-import com.twitter.simclusters_v2.common.UserId
-import com.twitter.storehaus.ReadableStore
-import com.twitter.recos.user_video_graph.store.UserRecentFollowersStore
-import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil
-import com.twitter.recos.user_video_graph.util.FilterUtil
-import com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil
-
-/**
- * Implementation of the Thrift-defined service interface for producerBasedRelatedTweets.
- *
- */
-class ProducerBasedRelatedTweetsHandler(
- bipartiteGraph: BipartiteGraph,
- userRecentFollowersStore: ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]],
- statsReceiver: StatsReceiver)
- extends RequestHandler[ProducerBasedRelatedTweetRequest, RelatedTweetResponse] {
- private val stats = statsReceiver.scope(this.getClass.getSimpleName)
-
- override def apply(request: ProducerBasedRelatedTweetRequest): Future[RelatedTweetResponse] = {
- trackFutureBlockStats(stats) {
- val maxResults = request.maxResults.getOrElse(200)
- val maxNumFollowers = request.maxNumFollowers.getOrElse(500)
- val minScore = request.minScore.getOrElse(0.0)
- val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48)
- val minResultDegree = request.minResultDegree.getOrElse(50)
- val minCooccurrence = request.minCooccurrence.getOrElse(4)
- val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet
-
- val followersFut = fetchFollowers(request.producerId, Some(maxNumFollowers))
- followersFut.map { followers =>
- val rhsTweetIds = FetchRHSTweetsUtil.fetchRHSTweets(
- followers,
- bipartiteGraph
- )
-
- val scorePreFactor = 1000.0 / followers.size
- val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates(
- rhsTweetIds,
- minCooccurrence,
- minResultDegree,
- scorePreFactor,
- bipartiteGraph)
-
- val relatedTweets = relatedTweetCandidates
- .filter { relatedTweet =>
- FilterUtil.tweetAgeFilter(
- relatedTweet.tweetId,
- Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds
- .contains(relatedTweet.tweetId))
- }.take(maxResults)
- stats.stat("response_size").add(relatedTweets.size)
- RelatedTweetResponse(tweets = relatedTweets)
- }
- }
- }
-
- private def fetchFollowers(
- producerId: Long,
- maxNumFollower: Option[Int],
- ): Future[Seq[Long]] = {
- val query =
- UserRecentFollowersStore.Query(producerId, maxNumFollower, None)
-
- val followersFut = userRecentFollowersStore.get(query)
- followersFut.map { followersOpt =>
- val followers = followersOpt.getOrElse(Seq.empty)
- val followerIds = followers.distinct.filter { userId =>
- val userDegree = bipartiteGraph.getLeftNodeDegree(userId)
- // constrain to more active users that have >1 engagement (to optimize latency) and <500 engagements (to avoid spammy behavior)
- userDegree > 1 && userDegree < 500
- }
- stats.stat("follower_size_after_filter").add(followerIds.size)
- followerIds
- }
- }
-}
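The follower filter above keeps only followers whose left-node degree falls in a band: active enough to carry signal, not so active as to look spammy. A graph-free sketch with the degree lookup stubbed by a Map:

```
object FollowerFilterSketch {
  // degree: stand-in for bipartiteGraph.getLeftNodeDegree
  def filterFollowers(followers: Seq[Long], degree: Long => Int): Seq[Long] =
    followers.distinct.filter { id =>
      val d = degree(id)
      d > 1 && d < 500 // the band used by the deleted handler
    }

  def main(args: Array[String]): Unit = {
    val degrees = Map(1L -> 0, 2L -> 5, 3L -> 900).withDefaultValue(2)
    println(filterFollowers(Seq(1L, 2L, 2L, 3L, 4L), degrees)) // List(2, 4)
  }
}
```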
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx
new file mode 100644
index 000000000..0beb7556e
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala b/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala
deleted file mode 100644
index 7150a2f0f..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/relatedTweetHandlers/TweetBasedRelatedTweetsHandler.scala
+++ /dev/null
@@ -1,91 +0,0 @@
-package com.twitter.recos.user_video_graph.relatedTweetHandlers
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForQuery
-import com.twitter.recos.user_video_graph.thriftscala._
-import com.twitter.recos.user_video_graph.util.FilterUtil
-import com.twitter.recos.user_video_graph.util.FetchRHSTweetsUtil
-import com.twitter.recos.user_video_graph.util.GetRelatedTweetCandidatesUtil
-import com.twitter.recos.user_video_graph.util.GetAllInternalTweetIdsUtil
-import com.twitter.recos.user_video_graph.util.SampleLHSUsersUtil
-import com.twitter.recos.util.Stats._
-import com.twitter.servo.request._
-import com.twitter.util.Duration
-import com.twitter.util.Future
-import scala.concurrent.duration.HOURS
-
-/**
- * Implementation of the Thrift-defined service interface for tweetBasedRelatedTweets.
- *
- */
-class TweetBasedRelatedTweetsHandler(bipartiteGraph: BipartiteGraph, statsReceiver: StatsReceiver)
- extends RequestHandler[TweetBasedRelatedTweetRequest, RelatedTweetResponse] {
- private val stats = statsReceiver.scope(this.getClass.getSimpleName)
-
- override def apply(request: TweetBasedRelatedTweetRequest): Future[RelatedTweetResponse] = {
- trackFutureBlockStats(stats) {
- val internalQueryTweetIds =
- GetAllInternalTweetIdsUtil.getAllInternalTweetIds(request.tweetId, bipartiteGraph)
-
- val response = internalQueryTweetIds match {
- case head +: Nil => getRelatedTweets(request, head)
- case _ => RelatedTweetResponse()
- }
- Future.value(response)
- }
- }
-
- private def getRelatedTweets(
- request: TweetBasedRelatedTweetRequest,
- maskedTweetId: Long
- ): RelatedTweetResponse = {
-
- val maxNumSamplesPerNeighbor = request.maxNumSamplesPerNeighbor.getOrElse(100)
- val maxResults = request.maxResults.getOrElse(200)
- val minScore = request.minScore.getOrElse(0.5)
- val maxTweetAge = request.maxTweetAgeInHours.getOrElse(48)
- val minResultDegree = request.minResultDegree.getOrElse(50)
- val minQueryDegree = request.minQueryDegree.getOrElse(10)
- val minCooccurrence = request.minCooccurrence.getOrElse(3)
- val excludeTweetIds = request.excludeTweetIds.getOrElse(Seq.empty).toSet
-
- val queryTweetDegree = bipartiteGraph.getRightNodeDegree(maskedTweetId)
- stats.stat("queryTweetDegree").add(queryTweetDegree)
-
- if (queryTweetDegree < minQueryDegree) {
- stats.counter("queryTweetDegreeLessThanMinQueryDegree").incr()
- RelatedTweetResponse()
- } else {
-
- val sampledLHSuserIds =
- SampleLHSUsersUtil.sampleLHSUsers(maskedTweetId, maxNumSamplesPerNeighbor, bipartiteGraph)
-
- val rHStweetIds = FetchRHSTweetsUtil.fetchRHSTweets(
- sampledLHSuserIds,
- bipartiteGraph,
- )
-
- val scorePreFactor =
- queryTweetDegree / math.log(queryTweetDegree) / sampledLHSuserIds.distinct.size
- val relatedTweetCandidates = GetRelatedTweetCandidatesUtil.getRelatedTweetCandidates(
- rHStweetIds,
- minCooccurrence,
- minResultDegree,
- scorePreFactor,
- bipartiteGraph)
-
- val relatedTweets = relatedTweetCandidates
- .filter(relatedTweet =>
- FilterUtil.tweetAgeFilter(
- relatedTweet.tweetId,
- Duration(maxTweetAge, HOURS)) && (relatedTweet.score > minScore) && (!excludeTweetIds
- .contains(relatedTweet.tweetId))).take(maxResults)
-
- stats.stat("response_size").add(relatedTweets.size)
- RelatedTweetResponse(
- tweets = relatedTweets,
- queryTweetGraphFeatures = Some(GraphFeaturesForQuery(degree = Some(queryTweetDegree))))
- }
- }
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/store/BUILD b/src/scala/com/twitter/recos/user_video_graph/store/BUILD
deleted file mode 100644
index b1c3562b7..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/store/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/twitter/storehaus:core",
- "src/scala/com/twitter/simclusters_v2/common",
- "src/thrift/com/twitter/socialgraph:thrift-scala",
- ],
-)
diff --git a/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx
new file mode 100644
index 000000000..eb2b1796e
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/store/BUILD.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx
new file mode 100644
index 000000000..396c73ef7
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala b/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala
deleted file mode 100644
index 7d1b6df6f..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/store/UserRecentFollowersStore.scala
+++ /dev/null
@@ -1,50 +0,0 @@
-package com.twitter.recos.user_video_graph.store
-
-import com.twitter.simclusters_v2.common.UserId
-import com.twitter.socialgraph.thriftscala.EdgesRequest
-import com.twitter.socialgraph.thriftscala.EdgesResult
-import com.twitter.socialgraph.thriftscala.PageRequest
-import com.twitter.socialgraph.thriftscala.RelationshipType
-import com.twitter.socialgraph.thriftscala.SrcRelationship
-import com.twitter.socialgraph.thriftscala.SocialGraphService
-import com.twitter.storehaus.ReadableStore
-import com.twitter.util.Duration
-import com.twitter.util.Future
-import com.twitter.util.Time
-
-class UserRecentFollowersStore(
- sgsClient: SocialGraphService.MethodPerEndpoint)
- extends ReadableStore[UserRecentFollowersStore.Query, Seq[UserId]] {
-
- override def get(key: UserRecentFollowersStore.Query): Future[Option[Seq[UserId]]] = {
- val edgeRequest = EdgesRequest(
- relationship = SrcRelationship(key.userId, RelationshipType.FollowedBy),
- // Could make a better guess at count when key.maxAge != None
- pageRequest = Some(PageRequest(count = key.maxResults))
- )
-
- val lookbackThresholdMillis = key.maxAge
- .map(maxAge => (Time.now - maxAge).inMilliseconds)
- .getOrElse(0L)
-
- sgsClient
- .edges(Seq(edgeRequest))
- .map(_.flatMap {
- case EdgesResult(edges, _, _) =>
- edges.collect {
- case e if e.createdAt >= lookbackThresholdMillis =>
- e.target
- }
- })
- .map(Some(_))
- }
-}
-
-object UserRecentFollowersStore {
- case class Query(
- userId: UserId,
- // maxResults - if Some(count), we return only the `count` most recent follows
- maxResults: Option[Int] = None,
- // maxAge - if Some(duration), return only follows since `Time.now - duration`
- maxAge: Option[Duration] = None)
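-
- // Illustrative usage (hypothetical values): Query(userId, maxResults = Some(500),
- // maxAge = Some(7.days)) returns up to the 500 most recent follows from the last 7 days.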
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/BUILD b/src/scala/com/twitter/recos/user_video_graph/util/BUILD
deleted file mode 100644
index a8a1364e1..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/twitter/graphjet",
- "snowflake:id",
- "snowflake/src/main/scala/com/twitter/snowflake/id",
- "src/scala/com/twitter/recos/util:recos-util",
- "src/scala/com/twitter/simclusters_v2/common",
- "src/thrift/com/twitter/recos/user_video_graph:user_video_graph-scala",
- ],
-)
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx b/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx
new file mode 100644
index 000000000..16679198c
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/BUILD.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx
new file mode 100644
index 000000000..9005a537b
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala
deleted file mode 100644
index 63041c1d0..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/FetchRHSTweetsUtil.scala
+++ /dev/null
@@ -1,29 +0,0 @@
-package com.twitter.recos.user_video_graph.util
-
-import com.twitter.graphjet.bipartite.MultiSegmentIterator
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment
-import scala.collection.mutable.ListBuffer
-
-object FetchRHSTweetsUtil {
- // get RHS tweets given LHS users
- def fetchRHSTweets(
- userIds: Seq[Long],
- bipartiteGraph: BipartiteGraph
- ): Seq[Long] = {
- userIds.distinct
- .flatMap { userId =>
- val tweetIdsIterator = bipartiteGraph
- .getLeftNodeEdges(userId).asInstanceOf[MultiSegmentIterator[BipartiteGraphSegment]]
-
- val tweetIds = new ListBuffer[Long]()
- if (tweetIdsIterator != null) {
- while (tweetIdsIterator.hasNext) {
- val rightNode = tweetIdsIterator.nextLong()
- tweetIds += rightNode
- }
- }
- tweetIds.distinct
- }
- }
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx
new file mode 100644
index 000000000..9dcdfb574
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala
deleted file mode 100644
index ca827070d..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/FilterUtil.scala
+++ /dev/null
@@ -1,15 +0,0 @@
-package com.twitter.recos.user_video_graph.util
-
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.snowflake.id.SnowflakeId
-import com.twitter.util.Duration
-import com.twitter.util.Time
-
-object FilterUtil {
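- // Illustrative: tweetAgeFilter(tweetId, Duration(48, HOURS)) keeps only tweets whose snowflake
- // timestamp is within the last 48 hours; IDs without a snowflake timestamp are filtered out.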
- def tweetAgeFilter(tweetId: TweetId, maxAge: Duration): Boolean = {
- SnowflakeId
- .timeFromIdOpt(tweetId)
- .map { tweetTime => tweetTime > Time.now - maxAge }.getOrElse(false)
- // If there's no snowflake timestamp, we have no idea when this tweet happened.
- }
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx
new file mode 100644
index 000000000..3e984d05f
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala
deleted file mode 100644
index 8628f3a10..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/GetAllInternalTweetIdsUtil.scala
+++ /dev/null
@@ -1,33 +0,0 @@
-package com.twitter.recos.user_video_graph.util
-
-import com.twitter.graphjet.algorithms.TweetIDMask
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-
-object GetAllInternalTweetIdsUtil {
-
- def getAllInternalTweetIds(tweetId: Long, bipartiteGraph: BipartiteGraph): Seq[Long] = {
- val internalTweetIds = getAllMasks(tweetId)
- sortByDegrees(internalTweetIds, bipartiteGraph)
- }
-
- private def getAllMasks(tweetId: Long): Seq[Long] = {
- Seq(
- tweetId,
- TweetIDMask.summary(tweetId),
- TweetIDMask.photo(tweetId),
- TweetIDMask.player(tweetId),
- TweetIDMask.promotion(tweetId)
- )
- }
-
- private def sortByDegrees(
- encodedTweetIds: Seq[Long],
- bipartiteGraph: BipartiteGraph
- ): Seq[Long] = {
- encodedTweetIds
- .map { encodedTweetId => (encodedTweetId, bipartiteGraph.getRightNodeDegree(encodedTweetId)) }
- .filter { case (_, degree) => degree > 0 } // keep only tweets with positive degree
- .sortBy { case (_, degree) => -degree } // sort by degree in descending order
- .map { case (encodedTweetId, _) => encodedTweetId }
- }
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx
new file mode 100644
index 000000000..ab9357599
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala
deleted file mode 100644
index 176e129db..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/GetRelatedTweetCandidatesUtil.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-package com.twitter.recos.user_video_graph.util
-
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.recos.user_video_graph.thriftscala._
-import com.twitter.recos.features.tweet.thriftscala.GraphFeaturesForTweet
-import com.twitter.graphjet.algorithms.TweetIDMask
-
-object GetRelatedTweetCandidatesUtil {
- private val tweetIDMask = new TweetIDMask
-
- /**
-   * Calculates scores for each RHS tweet that we get back.
-   * For tweetBasedRelatedTweet, scorePreFactor = queryTweetDegree / log(queryTweetDegree) / LHSuserSize
-   * and the final score is a log-cosine score.
-   * For non-tweetBasedRelatedTweet, we don't have a query tweet; to keep the scoring function consistent,
-   * scorePreFactor = 1000.0 / LHSuserSize (queryTweetDegree's average is ~10k, and 1000 ~= 10k / log(10k)).
-   * Though scorePreFactor is the same for all results within a request, it still makes scores comparable
-   * across requests, so we can use a unified min_score and help with downstream score normalization.
-   */
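- // Illustrative arithmetic (example numbers, not from production): with queryTweetDegree = 10,000
- // and 500 sampled LHS users, scorePreFactor = 10000 / ln(10000) / 500 ~= 2.17; a candidate with
- // cooccurrence 5 and degree 100 then scores ~= 2.17 * 5 / ln(100) ~= 2.36.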
- def getRelatedTweetCandidates(
- relatedTweetCandidates: Seq[Long],
- minCooccurrence: Int,
- minResultDegree: Int,
- scorePreFactor: Double,
- bipartiteGraph: BipartiteGraph
- ): Seq[RelatedTweet] = {
- relatedTweetCandidates
- .groupBy(tweetId => tweetId)
- .filterKeys(tweetId => bipartiteGraph.getRightNodeDegree(tweetId) > minResultDegree)
- .mapValues(_.size)
- .filter { case (_, cooccurrence) => cooccurrence >= minCooccurrence }
- .toSeq
- .map {
- case (relatedTweetId, cooccurrence) =>
- val relatedTweetDegree = bipartiteGraph.getRightNodeDegree(relatedTweetId)
-
- val score = scorePreFactor * cooccurrence / math.log(relatedTweetDegree)
- toRelatedTweet(relatedTweetId, score, relatedTweetDegree, cooccurrence)
- }
- .sortBy(-_.score)
- }
-
- def toRelatedTweet(
- relatedTweetId: Long,
- score: Double,
- relatedTweetDegree: Int,
- cooccurrence: Int
- ): RelatedTweet = {
- RelatedTweet(
- tweetId = tweetIDMask.restore(relatedTweetId),
- score = score,
- relatedTweetGraphFeatures = Some(
- GraphFeaturesForTweet(cooccurrence = Some(cooccurrence), degree = Some(relatedTweetDegree)))
- )
- }
-}
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx
new file mode 100644
index 000000000..e03440b41
Binary files /dev/null and b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.docx differ
diff --git a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala b/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala
deleted file mode 100644
index b8fd2c2f4..000000000
--- a/src/scala/com/twitter/recos/user_video_graph/util/SampleLHSUsersUtil.scala
+++ /dev/null
@@ -1,35 +0,0 @@
-package com.twitter.recos.user_video_graph.util
-
-import com.twitter.graphjet.bipartite.MultiSegmentIterator
-import com.twitter.graphjet.bipartite.api.BipartiteGraph
-import com.twitter.graphjet.bipartite.segment.BipartiteGraphSegment
-import java.util.Random
-import scala.collection.mutable.ListBuffer
-
-object SampleLHSUsersUtil {
- // sample userId nodes
- def sampleLHSUsers(
- maskedTweetId: Long,
- maxNumSamplesPerNeighbor: Int,
- bipartiteGraph: BipartiteGraph
- ): Seq[Long] = {
- val sampledUserIdsIterator = bipartiteGraph
- .getRandomRightNodeEdges(
- maskedTweetId,
- maxNumSamplesPerNeighbor,
- new Random(System.currentTimeMillis)).asInstanceOf[MultiSegmentIterator[
- BipartiteGraphSegment
- ]]
-
- val userIds = new ListBuffer[Long]()
- if (sampledUserIdsIterator != null) {
- while (sampledUserIdsIterator.hasNext) {
- val leftNode = sampledUserIdsIterator.nextLong()
- // If a user likes too many things, we risk including spammy behavior.
- if (bipartiteGraph.getLeftNodeDegree(leftNode) < 100)
- userIds += leftNode
- }
- }
- userIds
- }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/README.docx b/src/scala/com/twitter/simclusters_v2/README.docx
new file mode 100644
index 000000000..4ce718d9a
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/README.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/README.md b/src/scala/com/twitter/simclusters_v2/README.md
deleted file mode 100644
index ae43836af..000000000
--- a/src/scala/com/twitter/simclusters_v2/README.md
+++ /dev/null
@@ -1,112 +0,0 @@
-# SimClusters: Community-based Representations for Heterogeneous Recommendations at Twitter
-
-## Overview
-SimClusters is a general-purpose representation layer based on overlapping communities, into which users as well as heterogeneous content can be captured as sparse, interpretable vectors to support a multitude of recommendation tasks.
-
-We build our user and tweet SimClusters embeddings based on the inferred communities, and these representations power our personalized tweet recommendations via our online serving service, SimClusters ANN.
-
-
-For more details, please read our paper that was published in KDD'2020 Applied Data Science Track: https://www.kdd.org/kdd2020/accepted-papers/view/simclusters-community-based-representations-for-heterogeneous-recommendatio
-
-## Brief introduction to the SimClusters algorithm
-
-### Follow relationships as a bipartite graph
-Follow relationships on Twitter are perhaps most naturally thought of as a directed graph, where each node is a user and each edge represents a Follow. Edges are directed: User 1 can follow User 2, User 2 can follow User 1, or both can follow each other.
-
-This directed graph can also be viewed as a bipartite graph, where nodes are grouped into two sets, Producers and Consumers. In this bipartite graph, Producers are the users who are followed and Consumers are the users who follow them. Below is a toy example of a follow graph for four users:
-
-
-
-> Figure 1 - Left panel: A directed follow graph; Right panel: A bipartite graph representation of the directed graph
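-
-As a toy sketch (illustrative only; the `Follow` type below is hypothetical), the conversion simply regroups the directed edges:
-
-```scala
-// Each directed follow edge (consumer -> producer) becomes an edge between the
-// Consumer partition and the Producer partition of the bipartite graph.
-case class Follow(consumer: Long, producer: Long)
-
-def toBipartite(follows: Seq[Follow]): Map[Long, Set[Long]] =
-  follows.groupBy(_.producer).mapValues(_.map(_.consumer).toSet).toMap // producer -> followers
-```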
-
-### Community Detection - Known For
-The bipartite follow graph can be used to identify groups of Producers who have similar followers, or who are "Known For" a topic. Specifically, the bipartite follow graph can also be represented as an *m x n* matrix (*A*), where consumers are represented as *u* and producers are represented as *v*.
-
-Producer-producer similarity is computed as the cosine similarity between the sets of users who follow each producer. The resulting cosine similarity values can be used to construct a producer-producer similarity graph, where the nodes are producers and edges are weighted by the corresponding cosine similarity value. Noise removal is performed, such that edges with weights below a specified threshold are deleted from the graph.
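-
-As a minimal sketch (illustrative only, not the production implementation), pairwise similarity over follower sets can be computed as:
-
-```scala
-// Cosine similarity between two producers' follower sets:
-// |A ∩ B| / (sqrt(|A|) * sqrt(|B|)), since follow indicators are binary vectors.
-def producerCosine(followersA: Set[Long], followersB: Set[Long]): Double =
-  if (followersA.isEmpty || followersB.isEmpty) 0.0
-  else (followersA intersect followersB).size.toDouble /
-    (math.sqrt(followersA.size.toDouble) * math.sqrt(followersB.size.toDouble))
-```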
-
-After noise removal, Metropolis-Hastings sampling-based community detection is run on the Producer-Producer similarity graph to identify a community affiliation for each producer. This algorithm takes in a parameter *k* for the number of communities to be detected.
-
-
-
-> Figure 2 - Left panel: Matrix representation of the follow graph depicted in Figure 1; Middle panel: Producer-Producer similarity is estimated by calculating the cosine similarity between the users who follow each producer; Right panel: Cosine similarity scores are used to create the Producer-Producer similarity graph. A clustering algorithm is run on the graph to identify groups of Producers with similar followers.
-
-Community affiliation scores are then used to construct an *n x k* "Known For" matrix (*V*). This matrix is maximally sparse, and each Producer is affiliated with at most one community. In production, the Known For dataset covers the top 20M producers and k ~= 145000. In other words, we discover around 145k communities based on Twitter's user follow graph.
-
-
-
-> Figure 3 - The clustering algorithm returns community affiliation scores for each producer. These scores are represented in matrix V.
-
-In the example above, Producer 1 is "Known For" community 2, Producer 2 is "Known For" community 1, and so forth.
-
-### Consumer Embeddings - User InterestedIn
-An Interested In matrix (*U*) can be computed by multiplying the matrix representation of the follow graph (*A*) by the Known For matrix (*V*):
-
-
-
-In this toy example, consumer 1 is interested in community 1 only, whereas consumer 3 is interested in all three communities. There is also a noise removal step applied to the Interested In matrix.
-
-We use the InterestedIn embeddings to capture consumers' long-term interests. The InterestedIn embeddings are one of our major sources for consumer-based tweet recommendations.
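-
-A toy sketch of one row of this multiplication (sparse representations assumed; `knownFor` maps each producer to its single community and score):
-
-```scala
-// One row of U = A * V: sum the KnownFor vectors of the producers this consumer follows.
-def interestedIn(
-  followedProducers: Seq[Long],
-  knownFor: Map[Long, (Int, Double)] // producerId -> (communityId, score)
-): Map[Int, Double] =
-  followedProducers
-    .flatMap(knownFor.get)
-    .groupBy { case (communityId, _) => communityId }
-    .mapValues(_.map { case (_, score) => score }.sum)
-    .toMap
-```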
-
-### Producer Embeddings
-When computing the Known For matrix, each producer can only be Known For a single community. Although this maximally sparse matrix is useful from a computational perspective, we know that our users tweet about many different topics and may be "Known" in many different communities. Producer embeddings (*Ṽ*) are used to capture this richer structure of the graph.
-
-To calculate producer embeddings, the cosine similarity is calculated between each Producer's follower vector and the InterestedIn vector of each community.
-
-
-
-Producer embeddings are used for producer-based tweet recommendations. For example, we can recommend similar tweets based on an account you just followed.
-
-### Entity Embeddings
-SimClusters can also be used to generate embeddings for different kinds of content, such as
-- Tweets (used for Tweet recommendations)
-- Topics (used for TopicFollow)
-
-#### Tweet embeddings
-When a tweet is created, its tweet embedding is initialized as an empty vector.
-Tweet embeddings are updated each time the tweet is favorited. Specifically, the InterestedIn vector of each user who Fav-ed the tweet is added to the tweet vector.
-Since tweet embeddings are updated each time a tweet is favorited, they change over time.
-
-Tweet embeddings are critical for our tweet recommendation tasks. We can calculate tweet similarity and recommend similar tweets to users based on their tweet engagement history.
-
-We have an online Heron job that updates the tweet embeddings in real time; check out [here](summingbird/README.md) for more.
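-
-The incremental update, sketched with sparse-vector maps (illustrative only):
-
-```scala
-// On each new Fav, add the fav-ing user's InterestedIn vector to the tweet's embedding.
-def onFav(tweetEmbedding: Map[Int, Double], userInterestedIn: Map[Int, Double]): Map[Int, Double] =
-  userInterestedIn.foldLeft(tweetEmbedding) { case (acc, (clusterId, score)) =>
-    acc.updated(clusterId, acc.getOrElse(clusterId, 0.0) + score)
-  }
-```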
-
-#### Topic embeddings
-Topic embeddings (**R**) are computed by taking the cosine similarity between the consumers who are interested in a community and the aggregated (time-decayed) favorites each consumer has given to tweets with a given topic annotation.
-
-
-
-
-## Project Directory Overview
-The whole SimClusters project can be understood as two main components:
-- SimClusters Offline Jobs (Scalding / GCP)
-- SimClusters Real-time Streaming Jobs
-
-### SimClusters Offline Jobs
-
-**SimClusters Scalding Jobs**
-
-| Jobs | Code | Description |
-|---|---|---|
-| KnownFor | [simclusters_v2/scalding/update_known_for/UpdateKnownFor20M145K2020.scala](scalding/update_known_for/UpdateKnownFor20M145K2020.scala) | The job outputs the KnownFor dataset which stores the relationships between clusterId and producerUserId. KnownFor dataset covers the top 20M followed producers. We use this KnownFor dataset (or so-called clusters) to build all other entity embeddings. |
-| InterestedIn Embeddings| [simclusters_v2/scalding/InterestedInFromKnownFor.scala](scalding/InterestedInFromKnownFor.scala) | This code implements the job for computing users' interestedIn embedding from the KnownFor dataset. We use this dataset for consumer-based tweet recommendations.|
-| Producer Embeddings | [simclusters_v2/scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala](scalding/embedding/ProducerEmbeddingsFromInterestedIn.scala) | The code implements the job for computing producer embeddings, which represent the content a user produces. We use this dataset for producer-based tweet recommendations.|
-| Semantic Core Entity Embeddings | [simclusters_v2/scalding/embedding/EntityToSimClustersEmbeddingsJob.scala](scalding/embedding/EntityToSimClustersEmbeddingsJob.scala) | The job computes the semantic core entity embeddings. It outputs datasets that store the "SemanticCore entityId -> List(clusterId)" and "clusterId -> List(SemanticCore entityId)" relationships.|
-| Topic Embeddings | [simclusters_v2/scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala](scalding/embedding/tfg/FavTfgBasedTopicEmbeddings.scala) | Jobs to generate Fav-based Topic-Follow-Graph (TFG) topic embeddings. A topic's fav-based TFG embedding is the sum of its followers' fav-based InterestedIn embeddings. We use this embedding for topic-related recommendations.|
-
-**SimClusters GCP Jobs**
-
-We have a GCP pipeline where we build our SimClusters ANN index via BigQuery. This allows us to do fast iterations and build new embeddings more efficiently compared to Scalding.
-
-All SimClusters related GCP jobs are under [src/scala/com/twitter/simclusters_v2/scio/bq_generation](scio/bq_generation).
-
-| Jobs | Code | Description |
-|---|---|---|
-| PushOpenBased SimClusters ANN Index | [EngagementEventBasedClusterToTweetIndexGenerationJob.scala](scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala) | The job builds a clusterId -> TopTweet index based on user-open engagement history. This SANN source is used for candidate generation for Notifications. |
-| VideoViewBased SimClusters Index| [EngagementEventBasedClusterToTweetIndexGenerationJob.scala](scio/bq_generation/simclusters_index_generation/EngagementEventBasedClusterToTweetIndexGenerationJob.scala) | The job builds a clusterId -> TopTweet index based on the user's video view history. This SANN source is used for video recommendation on Home.|
-
-### SimClusters Real-Time Streaming Tweets Jobs
-
-| Jobs | Code | Description |
-|---|---|---|
-| Tweet Embedding Job | [simclusters_v2/summingbird/storm/TweetJob.scala](summingbird/storm/TweetJob.scala) | Generates the tweet embeddings and the index of tweets for SimClusters. |
-| Persistent Tweet Embedding Job| [simclusters_v2/summingbird/storm/PersistentTweetJob.scala](summingbird/storm/PersistentTweetJob.scala) | Persists the tweet embeddings from MemCache into Manhattan.|
\ No newline at end of file
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD
deleted file mode 100644
index 7e242cbb9..000000000
--- a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-scala_library(
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/twitter/storehaus:core",
- "frigate/frigate-common:base",
- "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/base",
- "src/scala/com/twitter/simclusters_v2/common",
- "src/scala/com/twitter/simclusters_v2/score",
- "src/scala/com/twitter/simclusters_v2/summingbird/stores",
- "src/scala/com/twitter/simclusters_v2/tweet_similarity",
- "src/thrift/com/twitter/recos/user_tweet_entity_graph:user_tweet_entity_graph-scala",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- "src/thrift/com/twitter/wtf/interest:interest-thrift-scala",
- "util/util-stats/src/main/scala/com/twitter/finagle/stats",
- ],
-)
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx
new file mode 100644
index 000000000..c60fdd867
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/BUILD.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx
new file mode 100644
index 000000000..ec9a999d0
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala
deleted file mode 100644
index 9ef629a6c..000000000
--- a/src/scala/com/twitter/simclusters_v2/candidate_source/ClusterRanker.scala
+++ /dev/null
@@ -1,56 +0,0 @@
-package com.twitter.simclusters_v2.candidate_source
-
-import com.twitter.simclusters_v2.thriftscala.UserToInterestedInClusterScores
-
-object ClusterRanker extends Enumeration {
- val RankByNormalizedFavScore: ClusterRanker.Value = Value
- val RankByFavScore: ClusterRanker.Value = Value
- val RankByFollowScore: ClusterRanker.Value = Value
- val RankByLogFavScore: ClusterRanker.Value = Value
- val RankByNormalizedLogFavScore: ClusterRanker.Value = Value
-
- /**
- * Given a map of clusters, select the top-scoring clusters under a ranking scheme
- * provided by the caller
- */
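- // Illustrative usage: getTopKClustersByScore(scores, ClusterRanker.RankByLogFavScore, topK = 50)
- // returns the 50 highest-ranked clusters, each mapped to max(primaryScore, 1e-4).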
- def getTopKClustersByScore(
- clustersWithScores: Map[Int, UserToInterestedInClusterScores],
- rankByScore: ClusterRanker.Value,
- topK: Int
- ): Map[Int, Double] = {
- val rankedClustersWithScores = clustersWithScores.map {
- case (clusterId, score) =>
- rankByScore match {
- case ClusterRanker.RankByFavScore =>
- (clusterId, (score.favScore.getOrElse(0.0), score.followScore.getOrElse(0.0)))
- case ClusterRanker.RankByFollowScore =>
- (clusterId, (score.followScore.getOrElse(0.0), score.favScore.getOrElse(0.0)))
- case ClusterRanker.RankByLogFavScore =>
- (clusterId, (score.logFavScore.getOrElse(0.0), score.followScore.getOrElse(0.0)))
- case ClusterRanker.RankByNormalizedLogFavScore =>
- (
- clusterId,
- (
- score.logFavScoreClusterNormalizedOnly.getOrElse(0.0),
- score.followScore.getOrElse(0.0)))
- case ClusterRanker.RankByNormalizedFavScore =>
- (
- clusterId,
- (
- score.favScoreProducerNormalizedOnly.getOrElse(0.0),
- score.followScore.getOrElse(0.0)))
- case _ =>
- (
- clusterId,
- (
- score.favScoreProducerNormalizedOnly.getOrElse(0.0),
- score.followScore.getOrElse(0.0)))
- }
- }
- rankedClustersWithScores.toSeq
- .sortBy(_._2) // sort in ascending order
- .takeRight(topK)
- .map { case (clusterId, scores) => clusterId -> math.max(scores._1, 1e-4) }
- .toMap
- }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx
new file mode 100644
index 000000000..2e861a518
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala
deleted file mode 100644
index 407558ee3..000000000
--- a/src/scala/com/twitter/simclusters_v2/candidate_source/HeavyRanker.scala
+++ /dev/null
@@ -1,71 +0,0 @@
-package com.twitter.simclusters_v2.candidate_source
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.base.Stats
-import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId
-import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore}
-import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId}
-import com.twitter.util.Future
-import com.twitter.storehaus.ReadableStore
-
-object HeavyRanker {
- trait HeavyRanker {
- def rank(
- scoringAlgorithm: ScoringAlgorithm,
- sourceEmbeddingId: SimClustersEmbeddingId,
- candidateEmbeddingType: EmbeddingType,
- minScore: Double,
- candidates: Seq[SimClustersTweetCandidate]
- ): Future[Seq[SimClustersTweetCandidate]]
- }
-
- class UniformScoreStoreRanker(
- uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore],
- stats: StatsReceiver)
- extends HeavyRanker {
- val fetchCandidateEmbeddingsStat = stats.scope("fetchCandidateEmbeddings")
-
- def rank(
- scoringAlgorithm: ScoringAlgorithm,
- sourceEmbeddingId: SimClustersEmbeddingId,
- candidateEmbeddingType: EmbeddingType,
- minScore: Double,
- candidates: Seq[SimClustersTweetCandidate]
- ): Future[Seq[SimClustersTweetCandidate]] = {
- val pairScoreIds = candidates.map { candidate =>
- ThriftScoreId(
- scoringAlgorithm,
- ScoreInternalId.SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingPairScoreId(
- sourceEmbeddingId,
- SimClustersEmbeddingId(
- candidateEmbeddingType,
- sourceEmbeddingId.modelVersion,
- InternalId.TweetId(candidate.tweetId)
- )
- ))
- ) -> candidate.tweetId
- }.toMap
-
- Future
- .collect {
- Stats.trackMap(fetchCandidateEmbeddingsStat) {
- uniformScoringStore.multiGet(pairScoreIds.keySet)
- }
- }
- .map { candidateScores =>
- candidateScores.toSeq
- .collect {
- case (pairScoreId, Some(score)) if score.score >= minScore =>
- SimClustersTweetCandidate(pairScoreIds(pairScoreId), score.score, sourceEmbeddingId)
- }
- }
- }
- }
-}
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx
new file mode 100644
index 000000000..de812c266
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala
deleted file mode 100644
index eb6684e7c..000000000
--- a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNCandidateSource.scala
+++ /dev/null
@@ -1,637 +0,0 @@
-package com.twitter.simclusters_v2.candidate_source
-
-import com.twitter.conversions.DurationOps._
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.base.CandidateSource
-import com.twitter.frigate.common.base.Stats
-import com.twitter.simclusters_v2.candidate_source.HeavyRanker.UniformScoreStoreRanker
-import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersANNConfig
-import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate
-import com.twitter.simclusters_v2.common.ModelVersions._
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.summingbird.stores.ClusterKey
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId
-import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore}
-import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId}
-import com.twitter.snowflake.id.SnowflakeId
-import com.twitter.storehaus.ReadableStore
-import com.twitter.util.Duration
-import com.twitter.util.Future
-import com.twitter.util.Time
-import scala.collection.mutable
-
-/**
- * This store looks for tweets whose embeddings are close to a source SimClustersEmbeddingId.
- *
- * Approximate cosine similarity is the core algorithm that drives this store.
- *
- * Steps 1-4 are in the "fetchCandidates" method:
- * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId.
- * 2. Fetch the top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index).
- * 3. Calculate every tweet candidate's dot-product or approximate cosine similarity to the source tweet.
- * 4. Take the top M tweet candidates by step 3's score.
- * Steps 5-6 are in the "reranking" method:
- * 5. Calculate the similarity score between the source and the candidates.
- * 6. Return the top N candidates by step 5's score.
- *
- * Warning: Only turn off step 5 for User InterestedIn candidate generation. It's the only use
- * case in Recos where we use dot-product to rank the tweet candidates.
- */
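-// Illustrative usage (hypothetical wiring; the stores and stats receiver are assumed to exist):
-//   val source = SimClustersANNCandidateSource(
-//     clusterTweetCandidatesStore, simClustersEmbeddingStore, uniformScoringStore,
-//     SimClustersANNCandidateSource.DefaultConfigMappings, statsReceiver)
-//   source.get(SimClustersANNCandidateSource.Query(sourceEmbeddingId))
-//   // => Future[Option[Seq[SimClustersTweetCandidate]]]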
-case class SimClustersANNCandidateSource(
- clusterTweetCandidatesStore: ReadableStore[ClusterKey, Seq[(TweetId, Double)]],
- simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding],
- heavyRanker: HeavyRanker.HeavyRanker,
- configs: Map[EmbeddingType, SimClustersANNConfig],
- statsReceiver: StatsReceiver)
- extends CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate] {
-
- import SimClustersANNCandidateSource._
-
- override val name: String = this.getClass.getName
- private val stats = statsReceiver.scope(this.getClass.getName)
-
- private val fetchSourceEmbeddingStat = stats.scope("fetchSourceEmbedding")
- protected val fetchCandidateEmbeddingsStat = stats.scope("fetchCandidateEmbeddings")
- private val fetchCandidatesStat = stats.scope("fetchCandidates")
- private val rerankingStat = stats.scope("reranking")
-
- override def get(
- query: SimClustersANNCandidateSource.Query
- ): Future[Option[Seq[SimClustersTweetCandidate]]] = {
- val sourceEmbeddingId = query.sourceEmbeddingId
- loadConfig(query) match {
- case Some(config) =>
- for {
- maybeSimClustersEmbedding <- Stats.track(fetchSourceEmbeddingStat) {
- simClustersEmbeddingStore.get(query.sourceEmbeddingId)
- }
- maybeFilteredCandidates <- maybeSimClustersEmbedding match {
- case Some(sourceEmbedding) =>
- for {
- rawCandidates <- Stats.trackSeq(fetchCandidatesStat) {
- fetchCandidates(sourceEmbeddingId, config, sourceEmbedding)
- }
- rankedCandidates <- Stats.trackSeq(rerankingStat) {
- reranking(sourceEmbeddingId, config, rawCandidates)
- }
- } yield {
- fetchCandidatesStat
- .stat(
- sourceEmbeddingId.embeddingType.name,
- sourceEmbeddingId.modelVersion.name).add(rankedCandidates.size)
- Some(rankedCandidates)
- }
- case None =>
- fetchCandidatesStat
- .stat(
- sourceEmbeddingId.embeddingType.name,
- sourceEmbeddingId.modelVersion.name).add(0)
- Future.None
- }
- } yield {
- maybeFilteredCandidates
- }
- case _ =>
- // Skip over queries whose config is not defined
- Future.None
- }
- }
-
- private def fetchCandidates(
- sourceEmbeddingId: SimClustersEmbeddingId,
- config: SimClustersANNConfig,
- sourceEmbedding: SimClustersEmbedding
- ): Future[Seq[SimClustersTweetCandidate]] = {
- val now = Time.now
- val earliestTweetId = SnowflakeId.firstIdFor(now - config.maxTweetCandidateAge)
- val latestTweetId = SnowflakeId.firstIdFor(now - config.minTweetCandidateAge)
- val clusterIds =
- sourceEmbedding
- .truncate(config.maxScanClusters).clusterIds
- .map { clusterId: ClusterId =>
- ClusterKey(clusterId, sourceEmbeddingId.modelVersion, config.candidateEmbeddingType)
- }.toSet
-
- Future
- .collect {
- clusterTweetCandidatesStore.multiGet(clusterIds)
- }.map { clusterTweetsMap =>
- // Use a mutable map to optimize performance. The method is thread-safe.
- // Set the initial map size to around p75 of the map-size distribution to avoid
- // excessive copying when the mutable hashmap grows.
- val candidateScoresMap =
- new SimClustersANNCandidateSource.HashMap[TweetId, Double](InitialCandidateMapSize)
- val candidateNormalizationMap =
- new SimClustersANNCandidateSource.HashMap[TweetId, Double](InitialCandidateMapSize)
-
- clusterTweetsMap.foreach {
- case (ClusterKey(clusterId, _, _, _), Some(tweetScores))
- if sourceEmbedding.contains(clusterId) =>
- val sourceClusterScore = sourceEmbedding.getOrElse(clusterId)
-
- for (i <- 0 until Math.min(tweetScores.size, config.maxTopTweetsPerCluster)) {
- val (tweetId, score) = tweetScores(i)
-
- if (!parseTweetId(sourceEmbeddingId).contains(tweetId) &&
- tweetId >= earliestTweetId && tweetId <= latestTweetId) {
- candidateScoresMap.put(
- tweetId,
- candidateScoresMap.getOrElse(tweetId, 0.0) + score * sourceClusterScore)
- if (config.enablePartialNormalization) {
- candidateNormalizationMap
- .put(tweetId, candidateNormalizationMap.getOrElse(tweetId, 0.0) + score * score)
- }
- }
- }
- case _ => ()
- }
-
- stats.stat("candidateScoresMap").add(candidateScoresMap.size)
- stats.stat("candidateNormalizationMap").add(candidateNormalizationMap.size)
-
- // Re-Rank the candidate by configuration
- val processedCandidateScores = candidateScoresMap.map {
- case (candidateId, score) =>
- // Enable Partial Normalization
- val processedScore =
- if (config.enablePartialNormalization) {
- // We applied the "log" version of partial normalization when we rank candidates
- // by log cosine similarity
- if (config.rankingAlgorithm == ScoringAlgorithm.PairEmbeddingLogCosineSimilarity) {
- score / sourceEmbedding.l2norm / math.log(
- 1 + candidateNormalizationMap(candidateId))
- } else {
- score / sourceEmbedding.l2norm / math.sqrt(candidateNormalizationMap(candidateId))
- }
- } else score
- SimClustersTweetCandidate(candidateId, processedScore, sourceEmbeddingId)
- }.toSeq
-
- processedCandidateScores
- .sortBy(-_.score)
- }
- }
-
- private def reranking(
- sourceEmbeddingId: SimClustersEmbeddingId,
- config: SimClustersANNConfig,
- candidates: Seq[SimClustersTweetCandidate]
- ): Future[Seq[SimClustersTweetCandidate]] = {
- val rankedCandidates = if (config.enableHeavyRanking) {
- heavyRanker
- .rank(
- scoringAlgorithm = config.rankingAlgorithm,
- sourceEmbeddingId = sourceEmbeddingId,
- candidateEmbeddingType = config.candidateEmbeddingType,
- minScore = config.minScore,
- candidates = candidates.take(config.maxReRankingCandidates)
- ).map(_.sortBy(-_.score))
- } else {
- Future.value(candidates)
- }
- rankedCandidates.map(_.take(config.maxNumResults))
- }
-
- private[candidate_source] def loadConfig(query: Query): Option[SimClustersANNConfig] = {
- configs.get(query.sourceEmbeddingId.embeddingType).map { baseConfig =>
- // apply overrides if any
- query.overrideConfig match {
- case Some(overrides) =>
- baseConfig.copy(
- maxNumResults = overrides.maxNumResults.getOrElse(baseConfig.maxNumResults),
- maxTweetCandidateAge =
- overrides.maxTweetCandidateAge.getOrElse(baseConfig.maxTweetCandidateAge),
- minScore = overrides.minScore.getOrElse(baseConfig.minScore),
- candidateEmbeddingType =
- overrides.candidateEmbeddingType.getOrElse(baseConfig.candidateEmbeddingType),
- enablePartialNormalization =
- overrides.enablePartialNormalization.getOrElse(baseConfig.enablePartialNormalization),
- enableHeavyRanking =
- overrides.enableHeavyRanking.getOrElse(baseConfig.enableHeavyRanking),
- rankingAlgorithm = overrides.rankingAlgorithm.getOrElse(baseConfig.rankingAlgorithm),
- maxReRankingCandidates =
- overrides.maxReRankingCandidates.getOrElse(baseConfig.maxReRankingCandidates),
- maxTopTweetsPerCluster =
- overrides.maxTopTweetsPerCluster.getOrElse(baseConfig.maxTopTweetsPerCluster),
- maxScanClusters = overrides.maxScanClusters.getOrElse(baseConfig.maxScanClusters),
- minTweetCandidateAge =
- overrides.minTweetCandidateAge.getOrElse(baseConfig.minTweetCandidateAge)
- )
- case _ => baseConfig
- }
- }
- }
-}
-
-object SimClustersANNCandidateSource {
-
- final val ProductionMaxNumResults = 200
- final val InitialCandidateMapSize = 16384
-
- def apply(
- clusterTweetCandidatesStore: ReadableStore[ClusterKey, Seq[(TweetId, Double)]],
- simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding],
- uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore],
- configs: Map[EmbeddingType, SimClustersANNConfig],
- statsReceiver: StatsReceiver
- ) = new SimClustersANNCandidateSource(
- clusterTweetCandidatesStore = clusterTweetCandidatesStore,
- simClustersEmbeddingStore = simClustersEmbeddingStore,
- heavyRanker = new UniformScoreStoreRanker(uniformScoringStore, statsReceiver),
- configs = configs,
- statsReceiver = statsReceiver
- )
-
- private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = {
- embeddingId.internalId match {
- case InternalId.TweetId(tweetId) =>
- Some(tweetId)
- case _ =>
- None
- }
- }
-
- case class Query(
- sourceEmbeddingId: SimClustersEmbeddingId,
- // Only override the config in DDG and Debuggers.
- // Use Post-filter for the holdbacks for better cache hit rate.
- overrideConfig: Option[SimClustersANNConfigOverride] = None)
-
- case class SimClustersTweetCandidate(
- tweetId: TweetId,
- score: Double,
- sourceEmbeddingId: SimClustersEmbeddingId)
-
- class HashMap[A, B](initSize: Int) extends mutable.HashMap[A, B] {
- override def initialSize: Int = initSize // 16 - by default
- }
-
- /**
- * The configuration of each SimClusters ANN candidate source.
- * In production, expect each SimClusters embedding type to map to one SimClusters ANN configuration.
- */
- case class SimClustersANNConfig(
- // The max number of candidates for an ANN query.
- // Please don't override this value in Production.
- maxNumResults: Int = ProductionMaxNumResults,
- // The max tweet candidate duration from now.
- maxTweetCandidateAge: Duration,
- // The min score of the candidates
- minScore: Double,
- // The Candidate Embedding Type of Tweet.
- candidateEmbeddingType: EmbeddingType,
- // Enables normalization of approximate SimClusters vectors to remove popularity bias
- enablePartialNormalization: Boolean,
- // Whether to enable Embedding Similarity ranking
- enableHeavyRanking: Boolean,
- // The ranking algorithm for Source Candidate Similarity
- rankingAlgorithm: ScoringAlgorithm,
- // The max number of candidates in ReRanking Step
- maxReRankingCandidates: Int,
- // The max number of Top Tweets from every cluster tweet index
- maxTopTweetsPerCluster: Int,
- // The max number of Clusters in the source Embeddings.
- maxScanClusters: Int,
- // The min tweet candidate duration from now.
- minTweetCandidateAge: Duration)
-
- /**
- * Contains the same fields as [[SimClustersANNConfig]], to specify which fields are to be overridden
- * for experimental purposes.
- *
- * All fields in this class must be optional.
- */
- case class SimClustersANNConfigOverride(
- maxNumResults: Option[Int] = None,
- maxTweetCandidateAge: Option[Duration] = None,
- minScore: Option[Double] = None,
- candidateEmbeddingType: Option[EmbeddingType] = None,
- enablePartialNormalization: Option[Boolean] = None,
- enableHeavyRanking: Option[Boolean] = None,
- rankingAlgorithm: Option[ScoringAlgorithm] = None,
- maxReRankingCandidates: Option[Int] = None,
- maxTopTweetsPerCluster: Option[Int] = None,
- maxScanClusters: Option[Int] = None,
- minTweetCandidateAge: Option[Duration] = None,
- enableLookbackSource: Option[Boolean] = None)
-
- final val DefaultMaxTopTweetsPerCluster = 200
- final val DefaultEnableHeavyRanking = false
- object SimClustersANNConfig {
- val DefaultSimClustersANNConfig: SimClustersANNConfig =
- SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.7,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = false,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = 200,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- )
- }
-
- val LookbackMediaMinDays: Int = 0
- val LookbackMediaMaxDays: Int = 2
- val LookbackMediaMaxTweetsPerDay: Int = 2000
- val maxTopTweetsPerCluster: Int =
- (LookbackMediaMaxDays - LookbackMediaMinDays + 1) * LookbackMediaMaxTweetsPerDay
-
- val LookbackMediaTweetConfig: Map[EmbeddingType, SimClustersANNConfig] = {
- val candidateEmbeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet
- val minTweetAge = LookbackMediaMinDays.days
- val maxTweetAge =
- LookbackMediaMaxDays.days - 1.hour // To compensate for the cache TTL that might push the tweet age beyond max age
- val rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity
-
- val maxScanClusters = 50
- val minScore = 0.5
- Map(
- EmbeddingType.FavBasedProducer -> SimClustersANNConfig(
- minTweetCandidateAge = minTweetAge,
- maxTweetCandidateAge = maxTweetAge,
- minScore =
- minScore, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = candidateEmbeddingType,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = rankingAlgorithm,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = maxTopTweetsPerCluster,
- maxScanClusters = maxScanClusters,
- ),
- EmbeddingType.LogFavLongestL2EmbeddingTweet -> SimClustersANNConfig(
- minTweetCandidateAge = minTweetAge,
- maxTweetCandidateAge = maxTweetAge,
- minScore =
- minScore, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = candidateEmbeddingType,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = rankingAlgorithm,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = maxTopTweetsPerCluster,
- maxScanClusters = maxScanClusters,
- ),
- EmbeddingType.FavTfgTopic -> SimClustersANNConfig(
- minTweetCandidateAge = minTweetAge,
- maxTweetCandidateAge = maxTweetAge,
- minScore = minScore,
- candidateEmbeddingType = candidateEmbeddingType,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = rankingAlgorithm,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = 200,
- maxScanClusters = maxScanClusters,
- ),
- EmbeddingType.LogFavBasedKgoApeTopic -> SimClustersANNConfig(
- minTweetCandidateAge = minTweetAge,
- maxTweetCandidateAge = maxTweetAge,
- minScore = minScore,
- candidateEmbeddingType = candidateEmbeddingType,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = rankingAlgorithm,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = 200,
- maxScanClusters = maxScanClusters,
- ),
- )
- }
-
- val DefaultConfigMappings: Map[EmbeddingType, SimClustersANNConfig] = Map(
- EmbeddingType.FavBasedProducer -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedAverageAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.RelaxedAggregatableLogFavBasedProducer -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.25, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 250,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavLongestL2EmbeddingTweet -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.3, // for twistly candidates. To specify a higher threshold, use a post-filter
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.FilteredUserInterestedInFromPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.7, // unused, heavy ranking disabled
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = false,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm =
- ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Unused, heavy ranking disabled
- maxReRankingCandidates = 150, // unused, heavy ranking disabled
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.FilteredUserInterestedIn -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.7, // unused, heavy ranking disabled
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = false,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm =
- ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Unused, heavy ranking disabled
- maxReRankingCandidates = 150, // unused, heavy ranking disabled
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.UnfilteredUserInterestedIn -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingLogCosineSimilarity,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.FollowBasedUserInterestedInFromAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 200,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedUserInterestedInFromAPE -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 200,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.FavTfgTopic -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.5,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.LogFavBasedKgoApeTopic -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.5,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 400,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- ),
- EmbeddingType.UserNextInterestedIn -> SimClustersANNConfig(
- maxTweetCandidateAge = 1.days,
- minScore = 0.0,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- enablePartialNormalization = true,
- enableHeavyRanking = DefaultEnableHeavyRanking,
- rankingAlgorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- maxReRankingCandidates = 200,
- maxTopTweetsPerCluster = DefaultMaxTopTweetsPerCluster,
- maxScanClusters = 50,
- minTweetCandidateAge = 0.seconds
- )
- )
-
- /**
- * Only cache the candidates if the source is not a consumer source. For example, TweetSource,
- * ProducerSource and TopicSource are cacheable. We don't cache consumer sources
- * (e.g. UserInterestedIn), since a cached consumer object is rarely hit again: it can't be
- * shared by multiple users.
- */
- val CacheableShortTTLEmbeddingTypes: Set[EmbeddingType] =
- Set(
- EmbeddingType.FavBasedProducer,
- EmbeddingType.LogFavLongestL2EmbeddingTweet,
- )
-
- val CacheableLongTTLEmbeddingTypes: Set[EmbeddingType] =
- Set(
- EmbeddingType.FavTfgTopic,
- EmbeddingType.LogFavBasedKgoApeTopic
- )
-}
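To make the caching rule above concrete, here is a minimal sketch (not part of the deleted file) of how a read path might map an embedding type to a cache TTL. The enclosing object name SimClustersANNCandidateSource is inferred from imports elsewhere in this diff, and the TTL values are purely illustrative:

    import com.twitter.conversions.DurationOps._
    import com.twitter.simclusters_v2.thriftscala.EmbeddingType
    import com.twitter.util.Duration

    // Assumed: the two cacheable sets live on SimClustersANNCandidateSource.
    // TTL values are illustrative, not taken from any production config.
    def cacheTtlFor(embeddingType: EmbeddingType): Option[Duration] =
      if (SimClustersANNCandidateSource.CacheableShortTTLEmbeddingTypes.contains(embeddingType))
        Some(10.minutes) // tweet/producer sources: results churn quickly
      else if (SimClustersANNCandidateSource.CacheableLongTTLEmbeddingTypes.contains(embeddingType))
        Some(12.hours) // topic sources: stable and shared across many users
      else
        None // consumer sources (e.g. UserInterestedIn) are per-user, so caching rarely hits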
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx
new file mode 100644
index 000000000..2174d0c6c
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala b/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala
deleted file mode 100644
index 2ad19e50f..000000000
--- a/src/scala/com/twitter/simclusters_v2/candidate_source/SimClustersANNWrapperCandidateSource.scala
+++ /dev/null
@@ -1,53 +0,0 @@
-package com.twitter.simclusters_v2.candidate_source
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.base.CandidateSource
-import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.LookbackMediaTweetConfig
-import com.twitter.simclusters_v2.candidate_source.SimClustersANNCandidateSource.SimClustersTweetCandidate
-import com.twitter.util.Future
-
-/**
- * An abstraction layer that implements a lambda-style structure for the ANN candidate source.
- * Allows us to call an online store as well as an offline store from a single query.
- */
-case class SimClustersANNWrapperCandidateSource(
- onlineANNSource: CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate],
- lookbackANNSource: CandidateSource[
- SimClustersANNCandidateSource.Query,
- SimClustersTweetCandidate
- ],
-)(
- statsReceiver: StatsReceiver)
- extends CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate] {
-
- override def get(
- query: SimClustersANNCandidateSource.Query
- ): Future[Option[Seq[SimClustersTweetCandidate]]] = {
-
- val enableLookbackSource =
- query.overrideConfig.exists(_.enableLookbackSource.getOrElse(false))
-
- val embeddingType = query.sourceEmbeddingId.embeddingType
- val lookbackCandidatesFut =
- if (enableLookbackSource &&
- LookbackMediaTweetConfig.contains(embeddingType)) {
- statsReceiver
- .counter("lookback_source", embeddingType.toString, "enable").incr()
- statsReceiver.counter("lookback_source", "enable").incr()
- lookbackANNSource.get(query)
- } else {
- statsReceiver
- .counter("lookback_source", embeddingType.toString, "disable").incr()
- Future.None
- }
-
- Future.join(onlineANNSource.get(query), lookbackCandidatesFut).map {
- case (onlineCandidates, lookbackCandidates) =>
- Some(
- onlineCandidates.getOrElse(Nil) ++ lookbackCandidates.getOrElse(Nil)
- )
- }
- }
-
- override def name: String = this.getClass.getCanonicalName
-}
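The wrapper above fans out to both stores with Future.join and concatenates the results. A minimal usage sketch, assuming two pre-built candidate sources; the value names and the query are illustrative, not from this diff:

    import com.twitter.finagle.stats.NullStatsReceiver

    // `onlineSource`, `lookbackSource` and `query` are assumed to exist; both sources are
    // CandidateSource[SimClustersANNCandidateSource.Query, SimClustersTweetCandidate].
    val wrapped = SimClustersANNWrapperCandidateSource(
      onlineANNSource = onlineSource,
      lookbackANNSource = lookbackSource,
    )(NullStatsReceiver)

    // The lookback store is queried only when the override config enables it and the
    // source embedding type is in LookbackMediaTweetConfig; otherwise only online
    // candidates come back.
    val candidatesFut = wrapped.get(query)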
diff --git a/src/scala/com/twitter/simclusters_v2/common/BUILD b/src/scala/com/twitter/simclusters_v2/common/BUILD
deleted file mode 100644
index 9cf3b3fd7..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-scala_library(
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/twitter/algebird:core",
- "3rdparty/jvm/com/twitter/algebird:util",
- "servo/decider",
- "src/scala/com/twitter/storehaus_internal/manhattan",
- "src/thrift/com/twitter/ml/api:interpretable-model-java",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- ],
-)
diff --git a/src/scala/com/twitter/simclusters_v2/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/common/BUILD.docx
new file mode 100644
index 000000000..e418604c8
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/BUILD.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx
new file mode 100644
index 000000000..f0627ebcb
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala b/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala
deleted file mode 100644
index 2a8cc1c46..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/CosineSimilarityUtil.scala
+++ /dev/null
@@ -1,251 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-object CosineSimilarityUtil {
-
- /**
- * Sum of squared elements for a given vector v
- */
- def sumOfSquares[T](v: Map[T, Double]): Double = {
- v.values.foldLeft(0.0) { (sum, value) => sum + value * value }
- }
-
- /**
- * Sum of squared elements for a given vector v
- */
- def sumOfSquaresArray(v: Array[Double]): Double = {
- v.foldLeft(0.0) { (sum, value) => sum + value * value }
- }
-
- /**
- * Calculate the l2Norm score
- */
- def norm[T](v: Map[T, Double]): Double = {
- math.sqrt(sumOfSquares(v))
- }
-
- /**
- * Calculate the l2Norm score
- */
- def normArray(v: Array[Double]): Double = {
- math.sqrt(sumOfSquaresArray(v))
- }
-
- /**
- * Calculate the logNorm score
- */
- def logNorm[T](v: Map[T, Double]): Double = {
- math.log(sumOfSquares(v) + 1)
- }
-
- /**
- * Calculate the logNorm score
- */
- def logNormArray(v: Array[Double]): Double = {
- math.log(sumOfSquaresArray(v) + 1)
- }
-
- /**
- * Calculate the exp scaled norm score
- */
- def expScaledNorm[T](v: Map[T, Double], exponent: Double): Double = {
- math.pow(sumOfSquares(v), exponent)
- }
-
- /**
- * Calculate the exp scaled norm score
- */
- def expScaledNormArray(v: Array[Double], exponent: Double): Double = {
- math.pow(sumOfSquaresArray(v), exponent)
- }
-
- /**
- * Calculate the l1Norm score
- */
- def l1Norm[T](v: Map[T, Double]): Double = {
- v.values.foldLeft(0.0) { (sum, value) => sum + Math.abs(value) }
- }
-
- /**
- * Calculate the l1Norm score
- */
- def l1NormArray(v: Array[Double]): Double = {
- v.foldLeft(0.0) { (sum, value) => sum + Math.abs(value) }
- }
-
- /**
- * Divide the weight vector with the applied norm
- * Return the original object if the norm is 0
- *
- * @param v a map from cluster id to its weight
- * @param norm a calculated norm from the given map v
- *
- * @return a map with normalized weight
- */
- def applyNorm[T](v: Map[T, Double], norm: Double): Map[T, Double] = {
- if (norm == 0) v else v.mapValues(x => x / norm)
- }
-
- /**
- * Divide the weight vector with the applied norm
- * Return the original object if the norm is 0
- *
- * @param v an array of weights
- * @param norm a calculated norm from the given array v
- *
- * @return an array with normalized weight in the same order as v
- */
- def applyNormArray(v: Array[Double], norm: Double): Array[Double] = {
- if (norm == 0) v else v.map(_ / norm)
- }
-
- /**
- * Normalize the weight vector for easy cosine similarity calculation. If the input weight vector
- * is empty or its norm is 0, return the original map.
- *
- * @param v a map from cluster id to its weight
- *
- * @return a map with normalized weight (the norm of the weight vector is 1)
- */
- def normalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.norm(v))
- applyNorm(v, norm)
- }
-
- /**
- * Normalize the weight vector for easy cosine similarity calculation. If the input weight vector
- * is empty or its norm is 0, return the original array.
- *
- * @param v an array of weights
- *
- * @return an array with normalized weight (the norm of the weight vector is 1), in the same order as v
- */
- def normalizeArray(
- v: Array[Double],
- maybeNorm: Option[Double] = None
- ): Array[Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.normArray(v))
- applyNormArray(v, norm)
- }
-
- /**
- * Normalize the weight vector with log norm. If the input weight vector
- * is empty or its norm is 0, return the original map.
- *
- * @param v a map from cluster id to its weight
- *
- * @return a map with log normalized weight
- */
- def logNormalize[T](v: Map[T, Double], maybeNorm: Option[Double] = None): Map[T, Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.logNorm(v))
- applyNorm(v, norm)
- }
-
- /**
- * Normalize the weight vector with log norm. If the input weight vector
- * is empty or its norm is 0, return the original array.
- *
- * @param v an array of weights
- *
- * @return an array with log normalized weight, in the same order as v
- */
- def logNormalizeArray(
- v: Array[Double],
- maybeNorm: Option[Double] = None
- ): Array[Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.logNormArray(v))
- applyNormArray(v, norm)
- }
-
- /**
- * Normalize the weight vector with exponentially scaled norm. If the input weight vector
- * is empty or its norm is 0, return the original map.
- *
- * @param v a map from cluster id to its weight
- * @param exponent the exponent we apply to the weight vector's norm
- *
- * @return a map with exp scaled normalized weight
- */
- def expScaledNormalize[T](
- v: Map[T, Double],
- exponent: Option[Double] = None,
- maybeNorm: Option[Double] = None
- ): Map[T, Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.expScaledNorm(v, exponent.getOrElse(0.3)))
- applyNorm(v, norm)
- }
-
- /**
- * Normalize the weight vector with exponentially scaled norm. If the input weight vector
- * is empty or its norm is 0, return the original map.
- *
- * @param v an array of weights
- * @param exponent the exponent we apply to the weight vector's norm
- *
- * @return an array with exp scaled normalized weight, in the same order as v
- */
- def expScaledNormalizeArray(
- v: Array[Double],
- exponent: Double,
- maybeNorm: Option[Double] = None
- ): Array[Double] = {
- val norm = maybeNorm.getOrElse(CosineSimilarityUtil.expScaledNormArray(v, exponent))
- applyNormArray(v, norm)
- }
-
- /**
- * Given two sparse vectors, calculate its dot product.
- *
- * @param v1 the first map from cluster id to its weight
- * @param v2 the second map from cluster id to its weight
- *
- * @return the dot product of above two sparse vector
- */
- def dotProduct[T](v1: Map[T, Double], v2: Map[T, Double]): Double = {
- val comparer = v1.size - v2.size
- val smaller = if (comparer > 0) v2 else v1
- val bigger = if (comparer > 0) v1 else v2
-
- smaller.foldLeft(0.0) {
- case (sum, (id, value)) =>
- sum + bigger.getOrElse(id, 0.0) * value
- }
- }
-
- /**
- * Given two sparse vectors, calculate its dot product.
- *
- * @param v1C an array of cluster ids. Must be sorted in ascending order
- * @param v1S an array of corresponding cluster scores, of the same length and order as v1c
- * @param v2C an array of cluster ids. Must be sorted in ascending order
- * @param v2S an array of corresponding cluster scores, of the same length and order as v2c
- *
- * @return the dot product of above two sparse vector
- */
- def dotProductForSortedClusterAndScores(
- v1C: Array[Int],
- v1S: Array[Double],
- v2C: Array[Int],
- v2S: Array[Double]
- ): Double = {
- require(v1C.size == v1S.size)
- require(v2C.size == v2S.size)
- var i1 = 0
- var i2 = 0
- var product: Double = 0.0
-
- while (i1 < v1C.size && i2 < v2C.size) {
- if (v1C(i1) == v2C(i2)) {
- product += v1S(i1) * v2S(i2)
- i1 += 1
- i2 += 1
- } else if (v1C(i1) > v2C(i2)) {
- // v2 cluster is lower. Increment it to see if the next one matches v1's
- i2 += 1
- } else {
- // v1 cluster is lower. Increment it to see if the next one matches v2's
- i1 += 1
- }
- }
- product
- }
-}
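A worked example of the merge-style dot product above, with illustrative values (both cluster arrays must be sorted ascending, as the docstring requires):

    // Two sparse vectors over cluster ids, both pre-sorted by cluster id.
    val aClusters = Array(1, 3, 7)
    val aScores = Array(0.5, 0.5, 0.5)
    val bClusters = Array(3, 7, 9)
    val bScores = Array(1.0, 2.0, 3.0)

    // Only clusters 3 and 7 overlap: 0.5 * 1.0 + 0.5 * 2.0 = 1.5
    val dot = CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      aClusters, aScores, bClusters, bScores)

    // Cosine similarity is the dot product of the two l2-normalized vectors:
    // 1.5 / (sqrt(0.75) * sqrt(14.0)) ≈ 0.463
    val cosine = CosineSimilarityUtil.dotProductForSortedClusterAndScores(
      aClusters,
      CosineSimilarityUtil.normalizeArray(aScores),
      bClusters,
      CosineSimilarityUtil.normalizeArray(bScores))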
diff --git a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx
new file mode 100644
index 000000000..b6a5125d6
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala b/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala
deleted file mode 100644
index 76e10aaa0..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/DeciderGateBuilderWithIdHashing.scala
+++ /dev/null
@@ -1,21 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.decider.Decider
-import com.twitter.servo.decider.{DeciderGateBuilder, DeciderKeyName}
-import com.twitter.servo.util.Gate
-
-class DeciderGateBuilderWithIdHashing(decider: Decider) extends DeciderGateBuilder(decider) {
-
- def idGateWithHashing[T](key: DeciderKeyName): Gate[T] = {
- val feature = keyToFeature(key)
- // The object is hashed only if the decider is neither fully on nor fully off.
- // This requires an extra call to fetch the decider availability, but that call is comparatively cheap.
- val convertToHash: T => Long = (obj: T) => {
- val availability = feature.availability.getOrElse(0)
- if (availability == 10000 || availability == 0) availability
- else obj.hashCode
- }
- idGate(key).contramap[T](convertToHash)
- }
-
-}
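A usage sketch for the builder above: gates built this way bucket objects deterministically by hash when the decider is partially ramped, and short-circuit to a constant when it is fully on or off. The decider and key are assumed to come from the surrounding service; nothing below is from the deleted file:

    import com.twitter.decider.Decider
    import com.twitter.servo.decider.DeciderKeyName
    import com.twitter.servo.util.Gate

    def buildUserGate(decider: Decider, key: DeciderKeyName): Gate[Long] = {
      val builder = new DeciderGateBuilderWithIdHashing(decider)
      // At 50% availability roughly half of all user ids pass, and a given
      // user id always gets the same answer across calls.
      builder.idGateWithHashing[Long](key)
    }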
diff --git a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx
new file mode 100644
index 000000000..b32603fba
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala b/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala
deleted file mode 100644
index 796474ccd..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/ModelVersions.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-
-/**
- * Utilities to convert a SimClusters model version between its different forms.
- * Any new SimClusters model version must be registered here.
- */
-object ModelVersions {
-
- val Model20M145KDec11 = "20M_145K_dec11"
- val Model20M145KUpdated = "20M_145K_updated"
- val Model20M145K2020 = "20M_145K_2020"
-
- // Use Enum for feature switch
- object Enum extends Enumeration {
- val Model20M145K2020, Model20M145KUpdated: Value = Value
- val enumToSimClustersModelVersionMap: Map[Enum.Value, ModelVersion] = Map(
- Model20M145K2020 -> ModelVersion.Model20m145k2020,
- Model20M145KUpdated -> ModelVersion.Model20m145kUpdated
- )
- }
-
- // Add the new model version into this map
- private val StringToThriftModelVersions: Map[String, ModelVersion] =
- Map(
- Model20M145KDec11 -> ModelVersion.Model20m145kDec11,
- Model20M145KUpdated -> ModelVersion.Model20m145kUpdated,
- Model20M145K2020 -> ModelVersion.Model20m145k2020
- )
-
- private val ThriftModelVersionToStrings = StringToThriftModelVersions.map(_.swap)
-
- val AllModelVersions: Set[String] = StringToThriftModelVersions.keySet
-
- def toModelVersionOption(modelVersionStr: String): Option[ModelVersion] = {
- StringToThriftModelVersions.get(modelVersionStr)
- }
-
- implicit def toModelVersion(modelVersionStr: String): ModelVersion = {
- StringToThriftModelVersions(modelVersionStr)
- }
-
- implicit def toKnownForModelVersion(modelVersion: ModelVersion): String = {
- ThriftModelVersionToStrings(modelVersion)
- }
-
-}
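For illustration, the registered mappings convert in both directions (the string constants and thrift values are from the deleted file):

    import com.twitter.simclusters_v2.thriftscala.ModelVersion

    // Safe, explicit lookup:
    ModelVersions.toModelVersionOption("20M_145K_2020") // Some(ModelVersion.Model20m145k2020)
    ModelVersions.toModelVersionOption("not_registered") // None

    // The implicit conversions are convenient but throw for unregistered versions:
    val thriftVersion: ModelVersion =
      ModelVersions.toModelVersion(ModelVersions.Model20M145KUpdated)
    val versionString: String =
      ModelVersions.toKnownForModelVersion(ModelVersion.Model20m145kDec11) // "20M_145K_dec11"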
diff --git a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx
new file mode 100644
index 000000000..9f1d31a18
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala b/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala
deleted file mode 100644
index c8e11c41f..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/SeqStandardDeviation.scala
+++ /dev/null
@@ -1,22 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-object SeqStandardDeviation {
-
- def apply[T](t: Seq[T])(implicit mapper: T => Double): Double = {
- if (t.isEmpty) {
- 0.0
- } else {
- val sum = t.foldLeft(0.0) {
- case (temp, score) =>
- temp + score
- }
- val mean = sum / t.size
- val variance = t.foldLeft(0.0) { (sum, score) =>
- val v = score - mean
- sum + v * v
- } / t.size
- math.sqrt(variance)
- }
- }
-
-}
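This computes the population standard deviation (dividing by t.size, not t.size - 1). A worked example:

    // mean = 2.0; squared deviations sum to 1 + 0 + 1 = 2; variance = 2 / 3
    // so the result is sqrt(2.0 / 3.0) ≈ 0.8165
    val std: Double = SeqStandardDeviation(Seq(1.0, 2.0, 3.0))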
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx
new file mode 100644
index 000000000..83e660dc6
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala
deleted file mode 100644
index b8f0179cb..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbedding.scala
+++ /dev/null
@@ -1,581 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.simclusters_v2.thriftscala.SimClusterWithScore
-import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
-import scala.collection.mutable
-import scala.language.implicitConversions
-import scala.util.hashing.MurmurHash3.arrayHash
-import scala.util.hashing.MurmurHash3.productHash
-import scala.math._
-
-/**
- * A representation of a SimClusters Embedding, designed for low memory footprint and performance.
- * For services that cache millions of embeddings, we found this to significantly reduce
- * allocations and memory footprint, and to improve overall performance.
- *
- * Embedding data is stored in pre-sorted arrays rather than structures which use a lot of pointers
- * (e.g. Map). A minimal set of lazily-constructed intermediate data is kept.
- *
- * Be wary of adding further `val` or `lazy val`s to this class; materializing and storing more data
- * on these objects could significantly affect in-memory cache performance.
- *
- * Also, if you are using this code in a place where you care about memory footprint, be careful
- * not to materialize any of the lazy vals unless you need them.
- */
-sealed trait SimClustersEmbedding extends Equals {
- import SimClustersEmbedding._
-
- /**
- * Any compliant implementation of the SimClustersEmbedding trait must ensure that:
- * - the cluster and score arrays are ordered as described below
- * - the cluster and score arrays are treated as immutable (.hashCode is memoized)
- * - the size of all cluster and score arrays is the same
- * - all cluster scores are > 0
- * - cluster ids are unique
- */
- // In descending score order - this is useful for truncation, where we care most about the highest scoring elements
- private[simclusters_v2] val clusterIds: Array[ClusterId]
- private[simclusters_v2] val scores: Array[Double]
- // In ascending cluster order. This is useful for operations where we try to find the same cluster in another embedding, e.g. dot product
- private[simclusters_v2] val sortedClusterIds: Array[ClusterId]
- private[simclusters_v2] val sortedScores: Array[Double]
-
- /**
- * Build and return a Set of all clusters in this embedding
- */
- lazy val clusterIdSet: Set[ClusterId] = sortedClusterIds.toSet
-
- /**
- * Build and return a Seq representation of this embedding
- */
- lazy val embedding: Seq[(ClusterId, Double)] =
- sortedClusterIds.zip(sortedScores).sortBy(-_._2).toSeq
-
- /**
- * Build and return a Map representation of this embedding
- */
- lazy val map: Map[ClusterId, Double] = sortedClusterIds.zip(sortedScores).toMap
-
- lazy val l1norm: Double = CosineSimilarityUtil.l1NormArray(sortedScores)
-
- lazy val l2norm: Double = CosineSimilarityUtil.normArray(sortedScores)
-
- lazy val logNorm: Double = CosineSimilarityUtil.logNormArray(sortedScores)
-
- lazy val expScaledNorm: Double =
- CosineSimilarityUtil.expScaledNormArray(sortedScores, DefaultExponent)
-
- /**
- * The L2 Normalized Embedding. Optimize for Cosine Similarity Calculation.
- */
- lazy val normalizedSortedScores: Array[Double] =
- CosineSimilarityUtil.applyNormArray(sortedScores, l2norm)
-
- lazy val logNormalizedSortedScores: Array[Double] =
- CosineSimilarityUtil.applyNormArray(sortedScores, logNorm)
-
- lazy val expScaledNormalizedSortedScores: Array[Double] =
- CosineSimilarityUtil.applyNormArray(sortedScores, expScaledNorm)
-
- /**
- * The Standard Deviation of an Embedding.
- */
- lazy val std: Double = {
- if (scores.isEmpty) {
- 0.0
- } else {
- val sum = scores.sum
- val mean = sum / scores.length
- var variance: Double = 0.0
- for (i <- scores.indices) {
- val v = scores(i) - mean
- variance += (v * v)
- }
- math.sqrt(variance / scores.length)
- }
- }
-
- /**
- * Return the score of a given clusterId.
- */
- def get(clusterId: ClusterId): Option[Double] = {
- var i = 0
- while (i < sortedClusterIds.length) {
- val thisId = sortedClusterIds(i)
- if (clusterId == thisId) return Some(sortedScores(i))
- if (thisId > clusterId) return None
- i += 1
- }
- None
- }
-
- /**
- * Return the score of a given clusterId, or the given default if the cluster is not present.
- */
- def getOrElse(clusterId: ClusterId, default: Double = 0.0): Double = {
- require(default >= 0.0)
- var i = 0
- while (i < sortedClusterIds.length) {
- val thisId = sortedClusterIds(i)
- if (clusterId == thisId) return sortedScores(i)
- if (thisId > clusterId) return default
- i += 1
- }
- default
- }
-
- /**
- * Return the cluster ids
- */
- def getClusterIds(): Array[ClusterId] = clusterIds
-
- /**
- * Return the cluster ids with the highest scores
- */
- def topClusterIds(size: Int): Seq[ClusterId] = clusterIds.take(size)
-
- /**
- * Return true if this embedding contains a given clusterId
- */
- def contains(clusterId: ClusterId): Boolean = clusterIdSet.contains(clusterId)
-
- def sum(another: SimClustersEmbedding): SimClustersEmbedding = {
- if (another.isEmpty) this
- else if (this.isEmpty) another
- else {
- var i1 = 0
- var i2 = 0
- val l = scala.collection.mutable.ArrayBuffer.empty[(Int, Double)]
- while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
- if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
- l += Tuple2(sortedClusterIds(i1), sortedScores(i1) + another.sortedScores(i2))
- i1 += 1
- i2 += 1
- } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
- l += Tuple2(another.sortedClusterIds(i2), another.sortedScores(i2))
- // another's cluster is lower. Increment it to see if the next one matches this one's
- i2 += 1
- } else {
- l += Tuple2(sortedClusterIds(i1), sortedScores(i1))
- // this cluster is lower. Increment it to see if the next one matches another's
- i1 += 1
- }
- }
- if (i1 == sortedClusterIds.length && i2 != another.sortedClusterIds.length)
- // this was shorter. Append remaining elements from another
- l ++= another.sortedClusterIds.drop(i2).zip(another.sortedScores.drop(i2))
- else if (i1 != sortedClusterIds.length && i2 == another.sortedClusterIds.length)
- // another was shorter. Append remaining elements from this
- l ++= sortedClusterIds.drop(i1).zip(sortedScores.drop(i1))
- SimClustersEmbedding(l)
- }
- }
-
- def scalarMultiply(multiplier: Double): SimClustersEmbedding = {
- require(multiplier > 0.0, "SimClustersEmbedding.scalarMultiply requires multiplier > 0.0")
- DefaultSimClustersEmbedding(
- clusterIds,
- scores.map(_ * multiplier),
- sortedClusterIds,
- sortedScores.map(_ * multiplier)
- )
- }
-
- def scalarDivide(divisor: Double): SimClustersEmbedding = {
- require(divisor > 0.0, "SimClustersEmbedding.scalarDivide requires divisor > 0.0")
- DefaultSimClustersEmbedding(
- clusterIds,
- scores.map(_ / divisor),
- sortedClusterIds,
- sortedScores.map(_ / divisor)
- )
- }
-
- def dotProduct(another: SimClustersEmbedding): Double = {
- CosineSimilarityUtil.dotProductForSortedClusterAndScores(
- sortedClusterIds,
- sortedScores,
- another.sortedClusterIds,
- another.sortedScores)
- }
-
- def cosineSimilarity(another: SimClustersEmbedding): Double = {
- CosineSimilarityUtil.dotProductForSortedClusterAndScores(
- sortedClusterIds,
- normalizedSortedScores,
- another.sortedClusterIds,
- another.normalizedSortedScores)
- }
-
- def logNormCosineSimilarity(another: SimClustersEmbedding): Double = {
- CosineSimilarityUtil.dotProductForSortedClusterAndScores(
- sortedClusterIds,
- logNormalizedSortedScores,
- another.sortedClusterIds,
- another.logNormalizedSortedScores)
- }
-
- def expScaledCosineSimilarity(another: SimClustersEmbedding): Double = {
- CosineSimilarityUtil.dotProductForSortedClusterAndScores(
- sortedClusterIds,
- expScaledNormalizedSortedScores,
- another.sortedClusterIds,
- another.expScaledNormalizedSortedScores)
- }
-
- /**
- * Return true if this is an empty embedding
- */
- def isEmpty: Boolean = sortedClusterIds.isEmpty
-
- /**
- * Return the Jaccard Similarity Score between two embeddings.
- * Note: this implementation should be optimized if we start to use it in production
- */
- def jaccardSimilarity(another: SimClustersEmbedding): Double = {
- if (this.isEmpty || another.isEmpty) {
- 0.0
- } else {
- val intersect = clusterIdSet.intersect(another.clusterIdSet).size
- val union = clusterIdSet.union(another.clusterIdSet).size
- intersect.toDouble / union
- }
- }
-
- /**
- * Return the Fuzzy Jaccard Similarity Score between two embeddings.
- * Treat each SimClusters embedding as a fuzzy set and calculate the fuzzy-set similarity
- * of the two embeddings.
- *
- * Paper 2.2.1: https://openreview.net/pdf?id=SkxXg2C5FX
- */
- def fuzzyJaccardSimilarity(another: SimClustersEmbedding): Double = {
- if (this.isEmpty || another.isEmpty) {
- 0.0
- } else {
- val v1C = sortedClusterIds
- val v1S = sortedScores
- val v2C = another.sortedClusterIds
- val v2S = another.sortedScores
-
- require(v1C.length == v1S.length)
- require(v2C.length == v2S.length)
-
- var i1 = 0
- var i2 = 0
- var numerator = 0.0
- var denominator = 0.0
-
- while (i1 < v1C.length && i2 < v2C.length) {
- if (v1C(i1) == v2C(i2)) {
- numerator += min(v1S(i1), v2S(i2))
- denominator += max(v1S(i1), v2S(i2))
- i1 += 1
- i2 += 1
- } else if (v1C(i1) > v2C(i2)) {
- denominator += v2S(i2)
- i2 += 1
- } else {
- denominator += v1S(i1)
- i1 += 1
- }
- }
-
- while (i1 < v1C.length) {
- denominator += v1S(i1)
- i1 += 1
- }
- while (i2 < v2C.length) {
- denominator += v2S(i2)
- i2 += 1
- }
-
- numerator / denominator
- }
- }
-
- /**
- * Return the Euclidean Distance Score between two embeddings.
- * Note: this implementation should be optimized if we start to use it in production
- */
- def euclideanDistance(another: SimClustersEmbedding): Double = {
- val unionClusters = clusterIdSet.union(another.clusterIdSet)
- val variance = unionClusters.foldLeft(0.0) {
- case (sum, clusterId) =>
- val distance = math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId))
- sum + distance * distance
- }
- math.sqrt(variance)
- }
-
- /**
- * Return the Manhattan Distance Score between two embeddings.
- * Note: this implementation should be optimized if we start to use it in production
- */
- def manhattanDistance(another: SimClustersEmbedding): Double = {
- val unionClusters = clusterIdSet.union(another.clusterIdSet)
- unionClusters.foldLeft(0.0) {
- case (sum, clusterId) =>
- sum + math.abs(this.getOrElse(clusterId) - another.getOrElse(clusterId))
- }
- }
-
- /**
- * Return the number of overlapping clusters between two embeddings.
- */
- def overlappingClusters(another: SimClustersEmbedding): Int = {
- var i1 = 0
- var i2 = 0
- var count = 0
-
- while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
- if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
- count += 1
- i1 += 1
- i2 += 1
- } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
- // v2 cluster is lower. Increment it to see if the next one matches v1's
- i2 += 1
- } else {
- // v1 cluster is lower. Increment it to see if the next one matches v2's
- i1 += 1
- }
- }
- count
- }
-
- /**
- * Return the largest product cluster scores
- */
- def maxElementwiseProduct(another: SimClustersEmbedding): Double = {
- var i1 = 0
- var i2 = 0
- var maxProduct: Double = 0.0
-
- while (i1 < sortedClusterIds.length && i2 < another.sortedClusterIds.length) {
- if (sortedClusterIds(i1) == another.sortedClusterIds(i2)) {
- val product = sortedScores(i1) * another.sortedScores(i2)
- if (product > maxProduct) maxProduct = product
- i1 += 1
- i2 += 1
- } else if (sortedClusterIds(i1) > another.sortedClusterIds(i2)) {
- // v2 cluster is lower. Increment it to see if the next one matches v1's
- i2 += 1
- } else {
- // v1 cluster is lower. Increment it to see if the next one matches v2's
- i1 += 1
- }
- }
- maxProduct
- }
-
- /**
- * Return a new SimClustersEmbedding with Max Embedding Size.
- *
- * Prefer to truncate on embedding construction where possible. Doing so is cheaper.
- */
- def truncate(size: Int): SimClustersEmbedding = {
- if (clusterIds.length <= size) {
- this
- } else {
- val truncatedClusterIds = clusterIds.take(size)
- val truncatedScores = scores.take(size)
- val (sortedClusterIds, sortedScores) =
- truncatedClusterIds.zip(truncatedScores).sortBy(_._1).unzip
-
- DefaultSimClustersEmbedding(
- truncatedClusterIds,
- truncatedScores,
- sortedClusterIds,
- sortedScores)
- }
- }
-
- def toNormalized: SimClustersEmbedding = {
- // Additional safety check. Only EmptyEmbedding's l2norm is 0.0.
- if (l2norm == 0.0) {
- EmptyEmbedding
- } else {
- this.scalarDivide(l2norm)
- }
- }
-
- implicit def toThrift: ThriftSimClustersEmbedding = {
- ThriftSimClustersEmbedding(
- embedding.map {
- case (clusterId, score) =>
- SimClusterWithScore(clusterId, score)
- }
- )
- }
-
- def canEqual(a: Any): Boolean = a.isInstanceOf[SimClustersEmbedding]
-
- /* We define equality as having the same clusters and scores.
- * This implementation is arguably incorrect in this case:
- * (1 -> 1.0, 2 -> 0.0) == (1 -> 1.0) // equals returns false
- * However, compliant implementations of SimClustersEmbedding should not include zero-weight
- * clusters, so this implementation should work correctly.
- */
- override def equals(that: Any): Boolean =
- that match {
- case that: SimClustersEmbedding =>
- that.canEqual(this) &&
- this.sortedClusterIds.sameElements(that.sortedClusterIds) &&
- this.sortedScores.sameElements(that.sortedScores)
- case _ => false
- }
-
- /**
- * hashcode implementation based on the contents of the embedding. As a lazy val, this relies on
- * the embedding contents being immutable.
- */
- override lazy val hashCode: Int = {
- /* Arrays uses object id as hashCode, so different arrays with the same contents hash
- * differently. To provide a stable hash code, we take the same approach as how a
- * `case class(clusters: Seq[Int], scores: Seq[Double])` would be hashed. See
- * ScalaRunTime._hashCode and MurmurHash3.productHash
- * https://github.com/scala/scala/blob/2.12.x/src/library/scala/runtime/ScalaRunTime.scala#L167
- * https://github.com/scala/scala/blob/2.12.x/src/library/scala/util/hashing/MurmurHash3.scala#L64
- *
- * Note that the hash code is arguably incorrect in this case:
- * (1 -> 1.0, 2 -> 0.0).hashCode == (1 -> 1.0).hashCode // returns false
- * However, compliant implementations of SimClustersEmbedding should not include zero-weight
- * clusters, so this implementation should work correctly.
- */
- productHash((arrayHash(sortedClusterIds), arrayHash(sortedScores)))
- }
-}
-
-object SimClustersEmbedding {
- val EmptyEmbedding: SimClustersEmbedding =
- DefaultSimClustersEmbedding(Array.empty, Array.empty, Array.empty, Array.empty)
-
- val DefaultExponent: Double = 0.3
-
- // Descending by score then ascending by ClusterId
- implicit val order: Ordering[(ClusterId, Double)] =
- (a: (ClusterId, Double), b: (ClusterId, Double)) => {
- b._2 compare a._2 match {
- case 0 => a._1 compare b._1
- case c => c
- }
- }
-
- /**
- * Constructors
- *
- * These constructors:
- * - do not make assumptions about the ordering of the cluster/scores.
- * - do assume that cluster ids are unique
- * - ignore (drop) any cluster whose score is <= 0
- */
- def apply(embedding: (ClusterId, Double)*): SimClustersEmbedding =
- buildDefaultSimClustersEmbedding(embedding)
-
- def apply(embedding: Iterable[(ClusterId, Double)]): SimClustersEmbedding =
- buildDefaultSimClustersEmbedding(embedding)
-
- def apply(embedding: Iterable[(ClusterId, Double)], size: Int): SimClustersEmbedding =
- buildDefaultSimClustersEmbedding(embedding, truncate = Some(size))
-
- implicit def apply(thriftEmbedding: ThriftSimClustersEmbedding): SimClustersEmbedding =
- buildDefaultSimClustersEmbedding(thriftEmbedding.embedding.map(_.toTuple))
-
- def apply(thriftEmbedding: ThriftSimClustersEmbedding, truncate: Int): SimClustersEmbedding =
- buildDefaultSimClustersEmbedding(
- thriftEmbedding.embedding.map(_.toTuple),
- truncate = Some(truncate))
-
- private def buildDefaultSimClustersEmbedding(
- embedding: Iterable[(ClusterId, Double)],
- truncate: Option[Int] = None
- ): SimClustersEmbedding = {
- val truncatedIdAndScores = {
- val idsAndScores = embedding.filter(_._2 > 0.0).toArray.sorted(order)
- truncate match {
- case Some(t) => idsAndScores.take(t)
- case _ => idsAndScores
- }
- }
-
- if (truncatedIdAndScores.isEmpty) {
- EmptyEmbedding
- } else {
- val (clusterIds, scores) = truncatedIdAndScores.unzip
- val (sortedClusterIds, sortedScores) = truncatedIdAndScores.sortBy(_._1).unzip
- DefaultSimClustersEmbedding(clusterIds, scores, sortedClusterIds, sortedScores)
- }
- }
-
- /* ***** Aggregation Methods ***** */
- /**
- * A high-performance version of summing a list of SimClustersEmbeddings.
- * Suggested for use in online services to avoid unnecessary GC.
- * For offline or streaming use cases, see [[SimClustersEmbeddingMonoid]].
- */
- def sum(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = {
- if (simClustersEmbeddings.isEmpty) {
- EmptyEmbedding
- } else {
- val sum = simClustersEmbeddings.foldLeft(mutable.Map[ClusterId, Double]()) {
- (sum, embedding) =>
- for (i <- embedding.sortedClusterIds.indices) {
- val clusterId = embedding.sortedClusterIds(i)
- sum.put(clusterId, embedding.sortedScores(i) + sum.getOrElse(clusterId, 0.0))
- }
- sum
- }
- SimClustersEmbedding(sum)
- }
- }
-
- /**
- * Sum a list of SimClustersEmbeddings, truncating the result to a fixed maximum size.
- */
- def sum(
- simClustersEmbeddings: Iterable[SimClustersEmbedding],
- maxSize: Int
- ): SimClustersEmbedding = {
- sum(simClustersEmbeddings).truncate(maxSize)
- }
-
- /**
- * A high-performance version of taking the mean of a list of SimClustersEmbeddings.
- * Suggested for use in online services to avoid unnecessary GC.
- */
- def mean(simClustersEmbeddings: Iterable[SimClustersEmbedding]): SimClustersEmbedding = {
- if (simClustersEmbeddings.isEmpty) {
- EmptyEmbedding
- } else {
- sum(simClustersEmbeddings).scalarDivide(simClustersEmbeddings.size)
- }
- }
-
- /**
- * Take the mean of a list of SimClustersEmbeddings, truncating the result to a fixed maximum size.
- */
- def mean(
- simClustersEmbeddings: Iterable[SimClustersEmbedding],
- maxSize: Int
- ): SimClustersEmbedding = {
- mean(simClustersEmbeddings).truncate(maxSize)
- }
-}
-
-case class DefaultSimClustersEmbedding(
- override val clusterIds: Array[ClusterId],
- override val scores: Array[Double],
- override val sortedClusterIds: Array[ClusterId],
- override val sortedScores: Array[Double])
- extends SimClustersEmbedding {
-
- override def toString: String =
- s"DefaultSimClustersEmbedding(${clusterIds.zip(scores).mkString(",")})"
-}
-
-object DefaultSimClustersEmbedding {
- // To support existing code which builds embeddings from a Seq
- def apply(embedding: Seq[(ClusterId, Double)]): SimClustersEmbedding = SimClustersEmbedding(
- embedding)
-}
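An end-to-end sketch of the constructors and similarity methods above, with illustrative values:

    // Scores <= 0 are dropped at construction time (cluster 9 below disappears).
    val a = SimClustersEmbedding(1 -> 0.6, 5 -> 0.8, 9 -> 0.0)
    val b = SimClustersEmbedding(5 -> 1.0, 7 -> 1.0)

    a.clusterIdSet // Set(1, 5)
    a.get(5) // Some(0.8)
    a.dotProduct(b) // only cluster 5 overlaps: 0.8 * 1.0 = 0.8
    a.overlappingClusters(b) // 1

    // Aggregation helpers merge by cluster id:
    SimClustersEmbedding.mean(Seq(a, b)) // (1 -> 0.3, 5 -> 0.9, 7 -> 0.5)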
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx
new file mode 100644
index 000000000..2cafa5991
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala
deleted file mode 100644
index 0a2fc592f..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingId.scala
+++ /dev/null
@@ -1,209 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.LocaleEntityId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.TopicId
-import com.twitter.simclusters_v2.thriftscala.{
- SimClustersEmbeddingId => ThriftSimClustersEmbeddingId
-}
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.InternalId.EntityId
-import com.twitter.simclusters_v2.thriftscala.InternalId.TweetId
-import com.twitter.simclusters_v2.thriftscala.InternalId.UserId
-import com.twitter.simclusters_v2.thriftscala.{EmbeddingType => SimClustersEmbeddingType}
-
-object SimClustersEmbeddingId {
-
- val DefaultModelVersion: ModelVersion = ModelVersion.Model20m145k2020
-
- // Embedding types that are available in Content-Recommender
- val TweetEmbeddingTypes: Set[EmbeddingType] =
- Set(
- FavBasedTweet,
- FollowBasedTweet,
- LogFavBasedTweet,
- LogFavLongestL2EmbeddingTweet
- )
- val DefaultTweetEmbeddingType: EmbeddingType = LogFavLongestL2EmbeddingTweet
-
- val UserInterestedInEmbeddingTypes: Set[EmbeddingType] =
- Set(
- FavBasedUserInterestedIn,
- FollowBasedUserInterestedIn,
- LogFavBasedUserInterestedIn,
- RecentFollowBasedUserInterestedIn,
- FilteredUserInterestedIn,
- FavBasedUserInterestedInFromPE,
- FollowBasedUserInterestedInFromPE,
- LogFavBasedUserInterestedInFromPE,
- FilteredUserInterestedInFromPE,
- LogFavBasedUserInterestedInFromAPE,
- FollowBasedUserInterestedInFromAPE,
- UnfilteredUserInterestedIn
- )
- val DefaultUserInterestInEmbeddingType: EmbeddingType = FavBasedUserInterestedIn
-
- val ProducerEmbeddingTypes: Set[EmbeddingType] =
- Set(
- FavBasedProducer,
- FollowBasedProducer,
- AggregatableFavBasedProducer,
- AggregatableLogFavBasedProducer,
- RelaxedAggregatableLogFavBasedProducer,
- KnownFor
- )
- val DefaultProducerEmbeddingType: EmbeddingType = FavBasedProducer
-
- val LocaleEntityEmbeddingTypes: Set[EmbeddingType] =
- Set(
- FavTfgTopic,
- LogFavTfgTopic
- )
- val DefaultLocaleEntityEmbeddingType: EmbeddingType = FavTfgTopic
-
- val TopicEmbeddingTypes: Set[EmbeddingType] =
- Set(
- LogFavBasedKgoApeTopic
- )
- val DefaultTopicEmbeddingType: EmbeddingType = LogFavBasedKgoApeTopic
-
- val AllEmbeddingTypes: Set[EmbeddingType] =
- TweetEmbeddingTypes ++
- UserInterestedInEmbeddingTypes ++
- ProducerEmbeddingTypes ++
- LocaleEntityEmbeddingTypes ++
- TopicEmbeddingTypes
-
- def buildTweetId(
- tweetId: TweetId,
- embeddingType: EmbeddingType = DefaultTweetEmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- assert(TweetEmbeddingTypes.contains(embeddingType))
- ThriftSimClustersEmbeddingId(
- embeddingType,
- modelVersion,
- InternalId.TweetId(tweetId)
- )
- }
-
- def buildUserInterestedInId(
- userId: UserId,
- embeddingType: EmbeddingType = DefaultUserInterestInEmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- assert(UserInterestedInEmbeddingTypes.contains(embeddingType))
- ThriftSimClustersEmbeddingId(
- embeddingType,
- modelVersion,
- InternalId.UserId(userId)
- )
- }
-
- def buildProducerId(
- userId: UserId,
- embeddingType: EmbeddingType = DefaultProducerEmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- assert(ProducerEmbeddingTypes.contains(embeddingType))
- ThriftSimClustersEmbeddingId(
- embeddingType,
- modelVersion,
- InternalId.UserId(userId)
- )
- }
-
- def buildLocaleEntityId(
- entityId: SemanticCoreEntityId,
- language: String,
- embeddingType: EmbeddingType = DefaultLocaleEntityEmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- ThriftSimClustersEmbeddingId(
- embeddingType,
- modelVersion,
- InternalId.LocaleEntityId(
- LocaleEntityId(entityId, language)
- )
- )
- }
-
- def buildTopicId(
- topicId: TopicId,
- language: Option[String] = None,
- country: Option[String] = None,
- embeddingType: EmbeddingType = DefaultTopicEmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- ThriftSimClustersEmbeddingId(
- embeddingType,
- modelVersion,
- InternalId.TopicId(
- TopicId(topicId, language, country)
- )
- )
- }
-
- // Extractor object for InternalIds that wrap Long
- object LongInternalId {
- def unapply(iid: InternalId): Option[Long] = iid match {
- case InternalId.TweetId(id) => Some(id)
- case InternalId.UserId(id) => Some(id)
- case InternalId.EntityId(id) => Some(id)
- case _ => None
- }
- }
-
- // Extractor object for SimClusterEmbeddingIds with InternalIds that wrap Long
- object LongSimClustersEmbeddingId {
- def unapply(id: ThriftSimClustersEmbeddingId): Option[Long] =
- LongInternalId.unapply(id.internalId)
- }
-
- // Only for debugging purposes.
- def buildEmbeddingId(
- entityId: String,
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion = DefaultModelVersion
- ): ThriftSimClustersEmbeddingId = {
- if (TweetEmbeddingTypes.contains(embeddingType)) {
- buildTweetId(entityId.toLong, embeddingType, modelVersion)
- } else if (UserInterestedInEmbeddingTypes.contains(embeddingType)) {
- buildUserInterestedInId(entityId.toLong, embeddingType, modelVersion)
- } else if (ProducerEmbeddingTypes.contains(embeddingType)) {
- buildProducerId(entityId.toLong, embeddingType, modelVersion)
- } else if (LocaleEntityEmbeddingTypes.contains(embeddingType)) {
- buildLocaleEntityId(entityId.toLong, "en", embeddingType, modelVersion)
- } else if (TopicEmbeddingTypes.contains(embeddingType)) {
- buildTopicId(
- entityId.toLong,
- Some("en"),
- embeddingType = embeddingType,
- modelVersion = modelVersion)
- } else {
- throw new IllegalArgumentException(s"Invalid embedding type: $embeddingType")
- }
- }
-
- implicit val internalIdOrdering: Ordering[InternalId] =
- Ordering.by(internalId => internalId.hashCode())
-
- implicit val simClustersEmbeddingIdOrdering: Ordering[ThriftSimClustersEmbeddingId] =
- Ordering.by(embeddingId =>
- (embeddingId.embeddingType.value, embeddingId.modelVersion.value, embeddingId.internalId))
-
- // Use Enum for feature switch
- object TopicEnum extends Enumeration {
- protected case class EmbeddingType(embeddingType: SimClustersEmbeddingType) extends super.Val
- import scala.language.implicitConversions
- implicit def valueToEmbeddingType(value: Value): EmbeddingType =
- value.asInstanceOf[EmbeddingType]
-
- val FavTfgTopic: Value = EmbeddingType(SimClustersEmbeddingType.FavTfgTopic)
- val LogFavBasedKgoApeTopic: Value = EmbeddingType(
- SimClustersEmbeddingType.LogFavBasedKgoApeTopic)
- }
-
-}
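A short sketch of the builders above (the ids are illustrative):

    import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion}

    // Uses the defaults: LogFavLongestL2EmbeddingTweet / Model20m145k2020.
    val tweetEmbeddingId = SimClustersEmbeddingId.buildTweetId(tweetId = 20L)

    // Each builder asserts that the embedding type belongs to the right family.
    val userEmbeddingId = SimClustersEmbeddingId.buildUserInterestedInId(
      userId = 12L,
      embeddingType = EmbeddingType.UnfilteredUserInterestedIn,
      modelVersion = ModelVersion.Model20m145kUpdated)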
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx
new file mode 100644
index 000000000..208b7310c
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala
deleted file mode 100644
index 21a54e96c..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingIdCacheKeyBuilder.scala
+++ /dev/null
@@ -1,19 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-
-/**
- * A common library to construct Cache Key for SimClustersEmbeddingId.
- */
-case class SimClustersEmbeddingIdCacheKeyBuilder(
- hash: Array[Byte] => Long,
- prefix: String = "") {
-
- // Example: "CR:SCE:1:2:1234567890ABCDEF"
- def apply(embeddingId: SimClustersEmbeddingId): String = {
- f"$prefix:SCE:${embeddingId.embeddingType.getValue()}%X:" +
- f"${embeddingId.modelVersion.getValue()}%X" +
- f":${hash(embeddingId.internalId.toString.getBytes)}%X"
- }
-
-}
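A usage sketch; the hash function is deliberately pluggable in the deleted code, so the MurmurHash3 choice below is only an illustration:

    import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
    import scala.util.hashing.MurmurHash3

    val keyBuilder = SimClustersEmbeddingIdCacheKeyBuilder(
      hash = bytes => MurmurHash3.bytesHash(bytes).toLong, // any Array[Byte] => Long works
      prefix = "CR")

    // Yields keys shaped like the example above, e.g. "CR:SCE:3:14:4A3F21".
    def cacheKeyFor(embeddingId: SimClustersEmbeddingId): String = keyBuilder(embeddingId)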
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx
new file mode 100644
index 000000000..7ec5e2478
Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.docx differ
diff --git a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala b/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala
deleted file mode 100644
index 1b17c9705..000000000
--- a/src/scala/com/twitter/simclusters_v2/common/SimClustersEmbeddingMonoid.scala
+++ /dev/null
@@ -1,18 +0,0 @@
-package com.twitter.simclusters_v2.common
-
-import com.twitter.algebird.Monoid
-
-case class SimClustersEmbeddingMonoid() extends Monoid[SimClustersEmbedding] {
-
- override val zero: SimClustersEmbedding = SimClustersEmbedding.EmptyEmbedding
-
- override def plus(x: SimClustersEmbedding, y: SimClustersEmbedding): SimClustersEmbedding = {
- x.sum(y)
- }
-}
-
-object SimClustersEmbeddingMonoid {
-
- val monoid: Monoid[SimClustersEmbedding] = SimClustersEmbeddingMonoid()
-
-}
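Because this is an Algebird Monoid, it composes with generic aggregation code (Scalding, Summingbird, or plain Monoid.sum); a short sketch:

    import com.twitter.algebird.Monoid

    implicit val embeddingMonoid: Monoid[SimClustersEmbedding] = SimClustersEmbeddingMonoid.monoid

    // Element-wise sum by cluster id: (1 -> 3.0, 2 -> 1.0)
    val combined = Monoid.sum(
      Seq(
        SimClustersEmbedding(1 -> 1.0),
        SimClustersEmbedding(1 -> 2.0, 2 -> 1.0)
      ))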