mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-01-26 10:45:34 +01:00
[docx] split commit for file 4800
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
parent
470dc00686
commit
c4b4b821a3
@ -1,18 +0,0 @@
|
||||
python3_library(
|
||||
name = "libs_py3",
|
||||
sources = ["*.py"],
|
||||
dependencies = [
|
||||
"src/python/twitter/deepbird/io",
|
||||
"twml:twml-nodeps",
|
||||
],
|
||||
)
|
||||
|
||||
python37_binary(
|
||||
name = "score",
|
||||
source = "score.py",
|
||||
dependencies = [
|
||||
":libs_py3",
|
||||
"3rdparty/python/_closures/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly:score",
|
||||
"twml",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,23 +0,0 @@
|
||||
# checkstyle: noqa
|
||||
import tensorflow.compat.v1 as tf
|
||||
from ..constants import EB_SCORE_IDX
|
||||
|
||||
# The rationale behind this logic is available at TQ-9678.
|
||||
def get_lolly_logits(labels):
|
||||
'''
|
||||
:param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
|
||||
:return: tf.Tensor of shape (batch size) with the extracted lolly logits.
|
||||
'''
|
||||
eb_lolly_scores = get_lolly_scores(labels)
|
||||
inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores)
|
||||
lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores))
|
||||
return lolly_activations
|
||||
|
||||
def get_lolly_scores(labels):
|
||||
'''
|
||||
:param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
|
||||
:return: tf.Tensor of shape (batch size) with the extracted lolly scores.
|
||||
'''
|
||||
logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1))
|
||||
eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0)
|
||||
return eb_lolly_scores
|
Binary file not shown.
@ -1,145 +0,0 @@
|
||||
import re
|
||||
|
||||
from twitter.deepbird.io.util import _get_feature_id
|
||||
|
||||
|
||||
class Parser(object):
|
||||
def parse(self, line):
|
||||
match = re.search(self.pattern(), line)
|
||||
if match:
|
||||
return self._parse_match(match)
|
||||
return None
|
||||
|
||||
def pattern(self):
|
||||
raise NotImplementedError
|
||||
|
||||
def _parse_match(self, match):
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
class BiasParser(Parser):
|
||||
'''
|
||||
Parses the bias feature available in lolly model tsv files.
|
||||
'''
|
||||
|
||||
def pattern(self):
|
||||
'''
|
||||
Matches lines like:
|
||||
unified_engagement bias -0.935945
|
||||
:return: a RegEx that extracts feature weight.
|
||||
'''
|
||||
return r"\t(bias)\t([^\s]+)"
|
||||
|
||||
def _parse_match(self, match):
|
||||
return float(match.group(2))
|
||||
|
||||
|
||||
class BinaryFeatureParser(Parser):
|
||||
'''
|
||||
Parses binary features available in lolly model tsv files.
|
||||
'''
|
||||
|
||||
def pattern(self):
|
||||
'''
|
||||
Matches lines like:
|
||||
unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130
|
||||
:return: a RegEx that extracts feature name and weight.
|
||||
'''
|
||||
return r"\t([\w\.]+)\t([^\s]+)"
|
||||
|
||||
def _parse_match(self, match):
|
||||
return (match.group(1), float(match.group(2)))
|
||||
|
||||
|
||||
class DiscretizedFeatureParser(Parser):
|
||||
'''
|
||||
Parses discretized features available in lolly model tsv files.
|
||||
'''
|
||||
|
||||
def pattern(self):
|
||||
'''
|
||||
Matches lines like:
|
||||
unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004
|
||||
:return: a RegEx that extracts feature name, bin boundaries and weight.
|
||||
'''
|
||||
return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)"
|
||||
|
||||
def _parse_match(self, match):
|
||||
left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")]
|
||||
return (
|
||||
match.group(1),
|
||||
left_bin_side,
|
||||
right_bin_side,
|
||||
float(match.group(3))
|
||||
)
|
||||
|
||||
|
||||
class LollyModelFeaturesParser(Parser):
|
||||
def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()):
|
||||
self._bias_parser = bias_parser
|
||||
self._binary_feature_parser = binary_feature_parser
|
||||
self._discretized_feature_parser = discretized_feature_parser
|
||||
|
||||
def parse(self, lolly_model_reader):
|
||||
parsed_features = {
|
||||
"bias": None,
|
||||
"binary": {},
|
||||
"discretized": {}
|
||||
}
|
||||
def process_line_fn(line):
|
||||
bias_parser_result = self._bias_parser.parse(line)
|
||||
if bias_parser_result:
|
||||
parsed_features["bias"] = bias_parser_result
|
||||
return
|
||||
|
||||
binary_feature_parser_result = self._binary_feature_parser.parse(line)
|
||||
if binary_feature_parser_result:
|
||||
name, value = binary_feature_parser_result
|
||||
parsed_features["binary"][name] = value
|
||||
return
|
||||
|
||||
discretized_feature_parser_result = self._discretized_feature_parser.parse(line)
|
||||
if discretized_feature_parser_result:
|
||||
name, left_bin, right_bin, weight = discretized_feature_parser_result
|
||||
discretized_features = parsed_features["discretized"]
|
||||
if name not in discretized_features:
|
||||
discretized_features[name] = []
|
||||
discretized_features[name].append((left_bin, right_bin, weight))
|
||||
|
||||
lolly_model_reader.read(process_line_fn)
|
||||
|
||||
return parsed_features
|
||||
|
||||
|
||||
class DBv2DataExampleParser(Parser):
|
||||
'''
|
||||
Parses data records printed by the DBv2 train.py build_graph function.
|
||||
Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
|
||||
'''
|
||||
|
||||
def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()):
|
||||
self.features = lolly_model_features_parser.parse(lolly_model_reader)
|
||||
self.feature_name_by_dbv2_id = {}
|
||||
|
||||
for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()):
|
||||
self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name
|
||||
|
||||
def pattern(self):
|
||||
'''
|
||||
:return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
|
||||
'''
|
||||
return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]"
|
||||
|
||||
def _parse_match(self, match):
|
||||
feature_ids = match.group(3).split(" ")
|
||||
feature_values = match.group(4).split(" ")
|
||||
|
||||
value_by_feature_name = {}
|
||||
for index in range(len(feature_ids)):
|
||||
feature_id = feature_ids[index]
|
||||
if feature_id not in self.feature_name_by_dbv2_id:
|
||||
print("Missing feature with id: " + str(feature_id))
|
||||
continue
|
||||
value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index])
|
||||
|
||||
return value_by_feature_name
|
Binary file not shown.
@ -1,8 +0,0 @@
|
||||
class LollyModelReader(object):
|
||||
def __init__(self, lolly_model_file_path):
|
||||
self._lolly_model_file_path = lolly_model_file_path
|
||||
|
||||
def read(self, process_line_fn):
|
||||
with open(self._lolly_model_file_path, "r") as file:
|
||||
for line in file:
|
||||
process_line_fn(line)
|
Binary file not shown.
@ -1,13 +0,0 @@
|
||||
import sys
|
||||
|
||||
from .parsers import DBv2DataExampleParser
|
||||
from .reader import LollyModelReader
|
||||
from .scorer import LollyModelScorer
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1])
|
||||
lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader))
|
||||
|
||||
score = lolly_model_scorer.score(data_example=sys.argv[2])
|
||||
print(score)
|
Binary file not shown.
@ -1,37 +0,0 @@
|
||||
class LollyModelScorer(object):
|
||||
def __init__(self, data_example_parser):
|
||||
self._data_example_parser = data_example_parser
|
||||
|
||||
def score(self, data_example):
|
||||
value_by_feature_name = self._data_example_parser.parse(data_example)
|
||||
features = self._data_example_parser.features
|
||||
return self._score(value_by_feature_name, features)
|
||||
|
||||
def _score(self, value_by_feature_name, features):
|
||||
score = features["bias"]
|
||||
score += self._score_binary_features(features["binary"], value_by_feature_name)
|
||||
score += self._score_discretized_features(features["discretized"], value_by_feature_name)
|
||||
return score
|
||||
|
||||
def _score_binary_features(self, binary_features, value_by_feature_name):
|
||||
score = 0.0
|
||||
for binary_feature_name, binary_feature_weight in binary_features.items():
|
||||
if binary_feature_name in value_by_feature_name:
|
||||
score += binary_feature_weight
|
||||
return score
|
||||
|
||||
def _score_discretized_features(self, discretized_features, value_by_feature_name):
|
||||
score = 0.0
|
||||
for discretized_feature_name, buckets in discretized_features.items():
|
||||
if discretized_feature_name in value_by_feature_name:
|
||||
feature_value = value_by_feature_name[discretized_feature_name]
|
||||
score += self._find_matching_bucket_weight(buckets, feature_value)
|
||||
return score
|
||||
|
||||
def _find_matching_bucket_weight(self, buckets, feature_value):
|
||||
for left_side, right_side, weight in buckets:
|
||||
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
|
||||
if feature_value >= left_side and feature_value < right_side:
|
||||
return weight
|
||||
|
||||
raise LookupError("Couldn't find a matching bucket for the given feature value.")
|
Binary file not shown.
@ -1,91 +0,0 @@
|
||||
from .parsers import LollyModelFeaturesParser
|
||||
|
||||
|
||||
class TFModelInitializerBuilder:
|
||||
|
||||
def __init__(self, model_features_parser=LollyModelFeaturesParser()):
|
||||
self._model_features_parser = model_features_parser
|
||||
|
||||
def build(self, lolly_model_reader):
|
||||
'''
|
||||
:param lolly_model_reader: LollyModelReader instance
|
||||
:return: tf_model_initializer dictionary of the following format:
|
||||
{
|
||||
"features": {
|
||||
"bias": 0.0,
|
||||
"binary": {
|
||||
# (feature name : feature weight) pairs
|
||||
"feature_name_1": 0.0,
|
||||
...
|
||||
"feature_nameN": 0.0
|
||||
},
|
||||
"discretized": {
|
||||
# (feature name : index aligned lists of bin_boundaries and weights
|
||||
"feature_name_1": {
|
||||
"bin_boundaries": [1, ..., inf],
|
||||
"weights": [0.0, ..., 0.0]
|
||||
}
|
||||
...
|
||||
"feature_name_K": {
|
||||
"bin_boundaries": [1, ..., inf],
|
||||
"weights": [0.0, ..., 0.0]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
'''
|
||||
tf_model_initializer = {
|
||||
"features": {}
|
||||
}
|
||||
|
||||
features = self._model_features_parser.parse(lolly_model_reader)
|
||||
tf_model_initializer["features"]["bias"] = features["bias"]
|
||||
self._set_discretized_features(features["discretized"], tf_model_initializer)
|
||||
|
||||
self._dedup_binary_features(features["binary"], features["discretized"])
|
||||
tf_model_initializer["features"]["binary"] = features["binary"]
|
||||
|
||||
return tf_model_initializer
|
||||
|
||||
def _set_discretized_features(self, discretized_features, tf_model_initializer):
|
||||
if len(discretized_features) == 0:
|
||||
return
|
||||
|
||||
num_bins = max([len(bins) for bins in discretized_features.values()])
|
||||
|
||||
bin_boundaries_and_weights = {}
|
||||
for feature_name in discretized_features:
|
||||
bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights(
|
||||
discretized_features[feature_name], num_bins)
|
||||
|
||||
tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights
|
||||
|
||||
def _dedup_binary_features(self, binary_features, discretized_features):
|
||||
[binary_features.pop(feature_name) for feature_name in discretized_features]
|
||||
|
||||
def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins):
|
||||
bin_boundary_weight_pairs = []
|
||||
|
||||
for bucket in discretized_feature_buckets:
|
||||
bin_boundary_weight_pairs.append([bucket[0], bucket[2]])
|
||||
|
||||
# The default DBv2 HashingDiscretizer bin membership interval is (a, b]
|
||||
#
|
||||
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
|
||||
#
|
||||
# Thus, convert (a, b] to [a, b) by inverting the bin boundaries.
|
||||
for bin_boundary_weight_pair in bin_boundary_weight_pairs:
|
||||
if bin_boundary_weight_pair[0] < float("inf"):
|
||||
bin_boundary_weight_pair[0] *= -1
|
||||
|
||||
while len(bin_boundary_weight_pairs) < num_bins:
|
||||
bin_boundary_weight_pairs.append([float("inf"), float(0)])
|
||||
|
||||
bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0])
|
||||
|
||||
bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs))
|
||||
|
||||
return {
|
||||
"bin_boundaries": bin_boundaries,
|
||||
"weights": weights
|
||||
}
|
Binary file not shown.
@ -1,120 +0,0 @@
|
||||
# checkstyle: noqa
|
||||
import tensorflow.compat.v1 as tf
|
||||
from collections import OrderedDict
|
||||
from .constants import EB_SCORE_IDX
|
||||
from .lolly.data_helpers import get_lolly_scores
|
||||
|
||||
import twml
|
||||
|
||||
def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1):
|
||||
"""
|
||||
This function was copied from twml/metrics.py with the following adjustments:
|
||||
- Override example weights with the ones set in graph_output.
|
||||
- Tile labels in order to support per engagement metrics for both TF and Lolly scores.
|
||||
- Add lolly_tf_score_MSE metric.
|
||||
Note: All custom lines have a comment that starts with 'Added'
|
||||
"""
|
||||
# pylint: disable=invalid-name,dict-keys-not-iterating
|
||||
if metrics is None:
|
||||
# remove expensive metrics by default for faster eval
|
||||
metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys())
|
||||
metrics.remove('pr_curve')
|
||||
|
||||
def get_eval_metric_ops(graph_output, labels, weights):
|
||||
"""
|
||||
graph_output:
|
||||
dict that is returned by build_graph given input features.
|
||||
labels:
|
||||
target labels associated to batch.
|
||||
weights:
|
||||
weights of the samples..
|
||||
"""
|
||||
|
||||
# Added to support the example weights overriding.
|
||||
weights = graph_output["weights"]
|
||||
# Added to support per engagement metrics for both TF and Lolly scores.
|
||||
labels = tf.tile(labels, [1, 2])
|
||||
|
||||
eval_metric_ops = OrderedDict()
|
||||
|
||||
preds = graph_output['output']
|
||||
|
||||
threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
|
||||
|
||||
hard_preds = graph_output.get('hard_output')
|
||||
if not hard_preds:
|
||||
hard_preds = tf.greater_equal(preds, threshold)
|
||||
|
||||
shape = labels.get_shape()
|
||||
|
||||
# basic sanity check: multi_metric dimension must exist
|
||||
assert len(shape) > class_dim, "Dimension specified by class_dim does not exist."
|
||||
|
||||
num_labels = shape[class_dim]
|
||||
# If we are doing multi-class / multi-label metric, the number of classes / labels must
|
||||
# be know at graph construction time. This dimension cannot have size None.
|
||||
assert num_labels is not None, "The multi-metric dimension cannot be None."
|
||||
assert classes is None or len(classes) == num_labels, (
|
||||
"Number of classes must match the number of labels")
|
||||
|
||||
weights_shape = weights.get_shape() if weights is not None else None
|
||||
if weights_shape is None:
|
||||
num_weights = None
|
||||
elif len(weights_shape) > 1:
|
||||
num_weights = weights_shape[class_dim]
|
||||
else:
|
||||
num_weights = 1
|
||||
|
||||
for i in range(num_labels):
|
||||
|
||||
# add metrics to eval_metric_ops dict
|
||||
for metric_name in metrics:
|
||||
metric_name = metric_name.lower() # metric name are case insensitive.
|
||||
|
||||
class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i))
|
||||
|
||||
if class_metric_name in eval_metric_ops:
|
||||
# avoid adding duplicate metrics.
|
||||
continue
|
||||
|
||||
class_labels = tf.gather(labels, indices=[i], axis=class_dim)
|
||||
class_preds = tf.gather(preds, indices=[i], axis=class_dim)
|
||||
class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim)
|
||||
|
||||
if num_weights is None:
|
||||
class_weights = None
|
||||
elif num_weights == num_labels:
|
||||
class_weights = tf.gather(weights, indices=[i], axis=class_dim)
|
||||
elif num_weights == 1:
|
||||
class_weights = weights
|
||||
else:
|
||||
raise ValueError("num_weights (%d) and num_labels (%d) do not match"
|
||||
% (num_weights, num_labels))
|
||||
|
||||
metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
|
||||
if metric_factory:
|
||||
value_op, update_op = metric_factory(
|
||||
labels=class_labels,
|
||||
predictions=(class_hard_preds if requires_threshold else class_preds),
|
||||
weights=class_weights, name=class_metric_name)
|
||||
eval_metric_ops[class_metric_name] = (value_op, update_op)
|
||||
else:
|
||||
raise ValueError('Cannot find the metric named ' + metric_name)
|
||||
|
||||
# Added to compare TF and Lolly scores.
|
||||
eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels)
|
||||
|
||||
return eval_metric_ops
|
||||
|
||||
return get_eval_metric_ops
|
||||
|
||||
|
||||
def get_mse(predictions, labels):
|
||||
lolly_scores = get_lolly_scores(labels)
|
||||
tf_scores = predictions[:, EB_SCORE_IDX]
|
||||
squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores))
|
||||
|
||||
value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op")
|
||||
update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op")
|
||||
|
||||
return value_op, update_op
|
@ -1,8 +0,0 @@
|
||||
python3_library(
|
||||
name = "libs_py3",
|
||||
sources = ["*.py"],
|
||||
dependencies = [
|
||||
"src/python/twitter/deepbird/io",
|
||||
"twml:twml-nodeps",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1,62 +0,0 @@
|
||||
from .hashing_utils import make_feature_id
|
||||
|
||||
from twml.contrib.layers.hashing_discretizer import HashingDiscretizer
|
||||
import numpy as np
|
||||
|
||||
|
||||
class TFModelDiscretizerBuilder(object):
|
||||
def __init__(self, num_bits):
|
||||
self.num_bits = num_bits
|
||||
|
||||
def build(self, tf_model_initializer):
|
||||
'''
|
||||
:param tf_model_initializer: dictionary of the following format:
|
||||
{
|
||||
"features": {
|
||||
"bias": 0.0,
|
||||
"binary": {
|
||||
# (feature name : feature weight) pairs
|
||||
"feature_name_1": 0.0,
|
||||
...
|
||||
"feature_nameN": 0.0
|
||||
},
|
||||
"discretized": {
|
||||
# (feature name : index aligned lists of bin_boundaries and weights
|
||||
"feature_name_1": {
|
||||
"bin_boundaries": [1, ..., inf],
|
||||
"weights": [0.0, ..., 0.0]
|
||||
}
|
||||
...
|
||||
"feature_name_K": {
|
||||
"bin_boundaries": [1, ..., inf],
|
||||
"weights": [0.0, ..., 0.0]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
:return: a HashingDiscretizer instance.
|
||||
'''
|
||||
discretized_features = tf_model_initializer["features"]["discretized"]
|
||||
|
||||
max_bins = 0
|
||||
|
||||
feature_ids = []
|
||||
bin_vals = []
|
||||
for feature_name in discretized_features:
|
||||
bin_boundaries = discretized_features[feature_name]["bin_boundaries"]
|
||||
feature_id = make_feature_id(feature_name, self.num_bits)
|
||||
feature_ids.append(feature_id)
|
||||
np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries]
|
||||
bin_vals.append(np_bin_boundaries)
|
||||
|
||||
max_bins = max(max_bins, len(np_bin_boundaries))
|
||||
|
||||
feature_ids_np = np.array(feature_ids)
|
||||
bin_vals_np = np.array(bin_vals).flatten()
|
||||
|
||||
return HashingDiscretizer(
|
||||
feature_ids=feature_ids_np,
|
||||
bin_vals=bin_vals_np,
|
||||
n_bin=max_bins,
|
||||
out_bits=self.num_bits
|
||||
)
|
Binary file not shown.
@ -1,29 +0,0 @@
|
||||
from twitter.deepbird.io.util import _get_feature_id
|
||||
|
||||
import numpy as np
|
||||
|
||||
|
||||
def numpy_hashing_uniform(the_id, bin_idx, output_bits):
|
||||
"""
|
||||
integer_multiplicative_hashing
|
||||
This is a reimplementation, for testing purposes, of the
|
||||
c++ version found in hashing_discretizer_impl.cpp
|
||||
"""
|
||||
hashing_constant = 2654435761
|
||||
N = 32
|
||||
with np.errstate(over='ignore'):
|
||||
the_id *= hashing_constant
|
||||
the_id += bin_idx
|
||||
the_id *= hashing_constant
|
||||
the_id >>= N - output_bits
|
||||
the_id &= (1 << output_bits) - 1
|
||||
return the_id
|
||||
|
||||
|
||||
def make_feature_id(name, num_bits):
|
||||
feature_id = _get_feature_id(name)
|
||||
return np.int64(limit_bits(feature_id, num_bits))
|
||||
|
||||
|
||||
def limit_bits(value, num_bits):
|
||||
return value & ((2 ** num_bits) - 1)
|
Binary file not shown.
@ -1,34 +0,0 @@
|
||||
from .hashing_utils import make_feature_id, numpy_hashing_uniform
|
||||
|
||||
import numpy as np
|
||||
import tensorflow.compat.v1 as tf
|
||||
import twml
|
||||
|
||||
|
||||
class TFModelWeightsInitializerBuilder(object):
|
||||
def __init__(self, num_bits):
|
||||
self.num_bits = num_bits
|
||||
|
||||
def build(self, tf_model_initializer):
|
||||
'''
|
||||
:return: (bias_initializer, weight_initializer)
|
||||
'''
|
||||
initial_weights = np.zeros((2 ** self.num_bits, 1))
|
||||
|
||||
features = tf_model_initializer["features"]
|
||||
self._set_binary_feature_weights(initial_weights, features["binary"])
|
||||
self._set_discretized_feature_weights(initial_weights, features["discretized"])
|
||||
|
||||
return tf.constant_initializer(features["bias"]), twml.contrib.initializers.PartitionConstant(initial_weights)
|
||||
|
||||
def _set_binary_feature_weights(self, initial_weights, binary_features):
|
||||
for feature_name, weight in binary_features.items():
|
||||
feature_id = make_feature_id(feature_name, self.num_bits)
|
||||
initial_weights[feature_id][0] = weight
|
||||
|
||||
def _set_discretized_feature_weights(self, initial_weights, discretized_features):
|
||||
for feature_name, discretized_feature in discretized_features.items():
|
||||
feature_id = make_feature_id(feature_name, self.num_bits)
|
||||
for bin_idx, weight in enumerate(discretized_feature["weights"]):
|
||||
final_bucket_id = numpy_hashing_uniform(feature_id, bin_idx, self.num_bits)
|
||||
initial_weights[final_bucket_id][0] = weight
|
Binary file not shown.
@ -1,212 +0,0 @@
|
||||
# checkstyle: noqa
|
||||
import tensorflow.compat.v1 as tf
|
||||
from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
|
||||
from tensorflow.python.framework import dtypes
|
||||
from tensorflow.python.ops import array_ops
|
||||
import tensorflow_hub as hub
|
||||
|
||||
from datetime import datetime
|
||||
from tensorflow.compat.v1 import logging
|
||||
from twitter.deepbird.projects.timelines.configs import all_configs
|
||||
from twml.trainers import DataRecordTrainer
|
||||
from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph
|
||||
from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export
|
||||
from .metrics import get_multi_binary_class_metric_fn
|
||||
from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES
|
||||
from .example_weights import add_weight_arguments, make_weights_tensor
|
||||
from .lolly.data_helpers import get_lolly_logits
|
||||
from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder
|
||||
from .lolly.reader import LollyModelReader
|
||||
from .tf_model.discretizer_builder import TFModelDiscretizerBuilder
|
||||
from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder
|
||||
|
||||
import twml
|
||||
|
||||
def get_feature_values(features_values, params):
|
||||
if params.lolly_model_tsv:
|
||||
# The default DBv2 HashingDiscretizer bin membership interval is (a, b]
|
||||
#
|
||||
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
|
||||
#
|
||||
# TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries.
|
||||
#
|
||||
# Thus, invert the feature values, so that HashingDiscretizer can to find the correct bucket.
|
||||
return tf.multiply(features_values, -1.0)
|
||||
else:
|
||||
return features_values
|
||||
|
||||
def build_graph(features, label, mode, params, config=None):
|
||||
weights = None
|
||||
if "weights" in features:
|
||||
weights = make_weights_tensor(features["weights"], label, params)
|
||||
|
||||
num_bits = params.input_size_bits
|
||||
|
||||
if mode == "infer":
|
||||
indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits)
|
||||
dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits])
|
||||
sparse_tf = tf.SparseTensor(
|
||||
indices=indices,
|
||||
values=get_feature_values(features["input_sparse_tensor_values"], params),
|
||||
dense_shape=dense_shape
|
||||
)
|
||||
else:
|
||||
features["values"] = get_feature_values(features["values"], params)
|
||||
sparse_tf = twml.util.convert_to_sparse(features, num_bits)
|
||||
|
||||
if params.lolly_model_tsv:
|
||||
tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv))
|
||||
bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer)
|
||||
discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer)
|
||||
else:
|
||||
discretizer = hub.Module(params.discretizer_save_dir)
|
||||
bias_initializer, weight_initializer = None, None
|
||||
|
||||
input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator")
|
||||
|
||||
logits = twml.layers.full_sparse(
|
||||
inputs=input_sparse,
|
||||
output_size=1,
|
||||
bias_initializer=bias_initializer,
|
||||
weight_initializer=weight_initializer,
|
||||
use_sparse_grads=(mode == "train"),
|
||||
use_binary_values=True,
|
||||
name="full_sparse_1"
|
||||
)
|
||||
|
||||
loss = None
|
||||
|
||||
if mode != "infer":
|
||||
lolly_activations = get_lolly_logits(label)
|
||||
|
||||
if opt.print_data_examples:
|
||||
logits = print_data_example(logits, lolly_activations, features)
|
||||
|
||||
if params.replicate_lolly:
|
||||
loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations))
|
||||
else:
|
||||
batch_size = tf.shape(label)[0]
|
||||
target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1))
|
||||
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits)
|
||||
loss = twml.util.weighted_average(loss, weights)
|
||||
|
||||
num_labels = tf.shape(label)[1]
|
||||
eb_scores = tf.tile(lolly_activations, [1, num_labels])
|
||||
logits = tf.tile(logits, [1, num_labels])
|
||||
logits = tf.concat([logits, eb_scores], axis=1)
|
||||
|
||||
output = tf.nn.sigmoid(logits)
|
||||
|
||||
return {"output": output, "loss": loss, "weights": weights}
|
||||
|
||||
def print_data_example(logits, lolly_activations, features):
|
||||
return tf.Print(
|
||||
logits,
|
||||
[logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))],
|
||||
message="DATA EXAMPLE = ",
|
||||
summarize=10000
|
||||
)
|
||||
|
||||
def earlybird_output_fn(graph_output):
|
||||
export_outputs = {
|
||||
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
|
||||
tf.estimator.export.PredictOutput(
|
||||
{"prediction": tf.identity(graph_output["output"], name="output_scores")}
|
||||
)
|
||||
}
|
||||
return export_outputs
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = DataRecordTrainer.add_parser_arguments()
|
||||
|
||||
parser = twml.contrib.calibrators.add_discretizer_arguments(parser)
|
||||
|
||||
parser.add_argument("--label", type=str, help="label for the engagement")
|
||||
parser.add_argument("--model.use_existing_discretizer", action="store_true",
|
||||
dest="model_use_existing_discretizer",
|
||||
help="Load a pre-trained calibration or train a new one")
|
||||
parser.add_argument("--input_size_bits", type=int)
|
||||
parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name")
|
||||
parser.add_argument("--feature_config", type=str)
|
||||
parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly",
|
||||
help="Train a regression model with MSE loss and the logged Earlybird score as a label")
|
||||
parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv",
|
||||
help="Initialize with weights and discretizer bins available in the given Lolly model tsv file"
|
||||
"No discretizer gets trained or loaded if set.")
|
||||
parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples",
|
||||
help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'")
|
||||
add_weight_arguments(parser)
|
||||
|
||||
opt = parser.parse_args()
|
||||
|
||||
feature_config_module = all_configs.select_feature_config(opt.feature_config)
|
||||
|
||||
feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label)
|
||||
|
||||
parse_fn = twml.parsers.get_sparse_parse_fn(
|
||||
feature_config,
|
||||
keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes"))
|
||||
|
||||
if not opt.lolly_model_tsv:
|
||||
if opt.model_use_existing_discretizer:
|
||||
logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]")
|
||||
logging.info(f"Using calibration at {opt.discretizer_save_dir}")
|
||||
else:
|
||||
logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]")
|
||||
calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator(
|
||||
opt.discretizer_num_bins,
|
||||
opt.discretizer_output_size_bits
|
||||
)
|
||||
calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer",
|
||||
params=opt,
|
||||
calibrator=calibrator,
|
||||
build_graph_fn=build_percentile_discretizer_graph,
|
||||
feature_config=feature_config)
|
||||
|
||||
trainer = DataRecordTrainer(
|
||||
name="earlybird",
|
||||
params=opt,
|
||||
build_graph_fn=build_graph,
|
||||
save_dir=opt.save_dir,
|
||||
feature_config=feature_config,
|
||||
metric_fn=get_multi_binary_class_metric_fn(
|
||||
metrics=["roc_auc"],
|
||||
classes=PREDICTED_CLASSES
|
||||
),
|
||||
warm_start_from=None
|
||||
)
|
||||
|
||||
train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn)
|
||||
eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn)
|
||||
|
||||
logging.info("Training and Evaluation ...")
|
||||
trainingStartTime = datetime.now()
|
||||
trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn)
|
||||
trainingEndTime = datetime.now()
|
||||
logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime))
|
||||
|
||||
if trainer._estimator.config.is_chief:
|
||||
serving_input_in_earlybird = {
|
||||
"input_sparse_tensor_indices": array_ops.placeholder(
|
||||
name="input_sparse_tensor_indices",
|
||||
shape=[None, 2],
|
||||
dtype=dtypes.int64),
|
||||
"input_sparse_tensor_values": array_ops.placeholder(
|
||||
name="input_sparse_tensor_values",
|
||||
shape=[None],
|
||||
dtype=dtypes.float32),
|
||||
"input_sparse_tensor_shape": array_ops.placeholder(
|
||||
name="input_sparse_tensor_shape",
|
||||
shape=[2],
|
||||
dtype=dtypes.int64)
|
||||
}
|
||||
serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird)
|
||||
twml.contrib.export.export_fn.export_all_models(
|
||||
trainer=trainer,
|
||||
export_dir=opt.export_dir,
|
||||
parse_fn=parse_fn,
|
||||
serving_input_receiver_fn=serving_input_receiver_fn,
|
||||
export_output_fn=earlybird_output_fn,
|
||||
feature_spec=feature_config.get_feature_spec()
|
||||
)
|
||||
logging.info("The export model path is: " + opt.export_dir)
|
@ -1,91 +0,0 @@
|
||||
JOB = ["job/**/*"]
|
||||
|
||||
scala_library(
|
||||
name = "batch",
|
||||
sources = ["**/*.scala"],
|
||||
platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/cascading:cascading-core",
|
||||
"3rdparty/jvm/cascading:cascading-hadoop",
|
||||
"3rdparty/jvm/cascading:cascading-local",
|
||||
"3rdparty/jvm/cascading:cascading-thrift",
|
||||
"3rdparty/jvm/com/twitter/algebird:core",
|
||||
"3rdparty/jvm/com/twitter/algebird:util",
|
||||
"3rdparty/jvm/com/twitter/storehaus:algebra",
|
||||
"3rdparty/jvm/com/twitter/storehaus:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:args",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:commons",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:date",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:parquet",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:batch",
|
||||
"3rdparty/src/jvm/com/twitter/summingbird:client",
|
||||
"graphstore/common:flock_follows-java",
|
||||
"src/java/com/twitter/common_internal/util:date_util",
|
||||
"src/java/com/twitter/twadoop/batch",
|
||||
"src/java/com/twitter/twadoop/util/dbconfig",
|
||||
"src/java/com/twitter/twadoop/util/yaml",
|
||||
"src/protobuf/com/twitter/twadoop",
|
||||
"src/scala/com/twitter/pluck",
|
||||
"src/scala/com/twitter/pluck/source/combined_user_source",
|
||||
"src/scala/com/twitter/pluck/source/jdbc",
|
||||
"src/scala/com/twitter/scalding_internal/error_handling",
|
||||
"src/scala/com/twitter/scalding_internal/job",
|
||||
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat",
|
||||
"src/scala/com/twitter/scalding_internal/source",
|
||||
"src/scala/com/twitter/wtf/scalding/jobs/common:date_util",
|
||||
"src/thrift/com/twitter/gizmoduck:user-thrift-java",
|
||||
"src/thrift/com/twitter/twadoop/user/gen:gen-java",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
||||
|
||||
#pants.new build target for the old "dist"
|
||||
hadoop_binary(
|
||||
name = "graph-batch-deploy",
|
||||
main = "com.twitter.scalding.Tool",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":tweepcred",
|
||||
],
|
||||
)
|
||||
|
||||
# Generated with `capesospy-v2 create_target tweepcred_job science/scalding/mesos/wtf/recos_platform_atla_proc.yaml`, config hash d63a47.
|
||||
scalding_job(
|
||||
name = "tweepcred_job",
|
||||
main = "com.twitter.graph.batch.job.tweepcred.TweepcredBatchJob",
|
||||
args = ["--weighted false --hadoop_config /etc/hadoop/hadoop-conf-proc-atla"],
|
||||
config = [
|
||||
("hadoop.combine-input", "true"),
|
||||
("hadoop.map.jvm.total-memory", "3072m"),
|
||||
("hadoop.queue", "cassowary.default"),
|
||||
("hadoop.reduce.jvm.total-memory", "3072m"),
|
||||
("hadoop.reducers", "1200"),
|
||||
("hadoop.submitter.disk", "200000m"),
|
||||
("hadoop.submitter.jvm.total-memory", "5120m"),
|
||||
("submitter.tier", "preemptible"),
|
||||
],
|
||||
cron = "24,44,04 * * * *",
|
||||
hadoop_cluster = "atla-proc",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":tweepcred",
|
||||
],
|
||||
)
|
BIN
src/scala/com/twitter/graph/batch/BUILD.docx
Normal file
BIN
src/scala/com/twitter/graph/batch/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,83 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource
|
||||
import com.twitter.scalding._
|
||||
|
||||
/**
|
||||
* Calculate tweepcred from the given pagerank file. If post_adjust is true,
|
||||
* reduce pagerank for users with low followers compared to number of
|
||||
* followings based on existing reputation code.
|
||||
* Options:
|
||||
* --input_pagerank: given pagerank
|
||||
* --user_mass: user mass tsv file, generated by twadoop user_mass job
|
||||
* --output_pagerank: where to put pagerank file
|
||||
* --output_tweepcred: where to put tweepcred file
|
||||
* optional arguments:
|
||||
* --post_adjust: whether to do post adjust, default true
|
||||
*
|
||||
*/
|
||||
class ExtractTweepcred(args: Args) extends Job(args) {
|
||||
val POST_ADJUST = args.getOrElse("post_adjust", "true").toBoolean
|
||||
|
||||
val inputPagerank = getInputPagerank(args("input_pagerank"))
|
||||
.map(() -> ('num_followers, 'num_followings)) { (u: Unit) =>
|
||||
(0, 0)
|
||||
}
|
||||
|
||||
val userInfo = TypedPipe
|
||||
.from(MostRecentCombinedUserSnapshotSource)
|
||||
.flatMap { combinedUser =>
|
||||
val user = Option(combinedUser.user)
|
||||
val userId = user.map(_.id).getOrElse(0L)
|
||||
val userExtended = Option(combinedUser.user_extended)
|
||||
val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0)
|
||||
val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0)
|
||||
|
||||
if (userId == 0L || user.map(_.safety).exists(_.deactivated)) {
|
||||
None
|
||||
} else {
|
||||
Some((userId, 0.0, numFollowers, numFollowings))
|
||||
}
|
||||
}
|
||||
.toPipe[(Long, Double, Int, Int)]('src_id, 'mass_input, 'num_followers, 'num_followings)
|
||||
|
||||
val pagerankWithSuspended = (inputPagerank ++ userInfo)
|
||||
.groupBy('src_id) {
|
||||
_.max('mass_input)
|
||||
.max('num_followers)
|
||||
.max('num_followings)
|
||||
}
|
||||
|
||||
pagerankWithSuspended
|
||||
.discard('num_followers, 'num_followings)
|
||||
.write(Tsv(args("output_pagerank")))
|
||||
|
||||
val adjustedPagerank =
|
||||
if (POST_ADJUST) {
|
||||
pagerankWithSuspended
|
||||
.map(('mass_input, 'num_followers, 'num_followings) -> 'mass_input) {
|
||||
input: (Double, Int, Int) =>
|
||||
Reputation.adjustReputationsPostCalculation(input._1, input._2, input._3)
|
||||
}
|
||||
.normalize('mass_input)
|
||||
} else {
|
||||
pagerankWithSuspended
|
||||
.discard('num_followers, 'num_followings)
|
||||
}
|
||||
|
||||
val tweepcred = adjustedPagerank
|
||||
.map('mass_input -> 'mass_input) { input: Double =>
|
||||
Reputation.scaledReputation(input)
|
||||
}
|
||||
|
||||
tweepcred.write(Tsv(args("output_tweepcred")))
|
||||
tweepcred.write(Tsv(args("current_tweepcred")))
|
||||
tweepcred.write(Tsv(args("today_tweepcred")))
|
||||
|
||||
def getInputPagerank(fileName: String) = {
|
||||
Tsv(fileName).read
|
||||
.mapTo((0, 1) -> ('src_id, 'mass_input)) { input: (Long, Double) =>
|
||||
input
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,275 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
import com.twitter.data.proto.Flock
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.pluck.source._
|
||||
import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource
|
||||
import com.twitter.scalding_internal.dalv2.DAL
|
||||
import com.twitter.service.interactions.InteractionGraph
|
||||
import graphstore.common.FlockFollowsJavaDataset
|
||||
import java.util.TimeZone
|
||||
|
||||
/**
|
||||
* Prepare the graph data for page rank calculation. Also generate the initial
|
||||
* pagerank as the starting point. Afterwards, start WeightedPageRank job.
|
||||
*
|
||||
* Either read a tsv file for testing or read the following to build the graph
|
||||
* flock edges Flock.Edge
|
||||
* real graph input for weights InteractionGraph.Edge
|
||||
*
|
||||
* Options:
|
||||
* --pwd: working directory, will generate the following files there
|
||||
* numnodes: total number of nodes
|
||||
* nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
|
||||
* pagerank: the page rank file
|
||||
* --user_mass: user mass tsv file, generated by twadoop user_mass job
|
||||
* Optional arguments:
|
||||
* --input: use the given tsv file instead of flock and real graph
|
||||
* --weighted: do weighted pagerank, default false
|
||||
* --flock_edges_only: restrict graph to flock edges, default true
|
||||
* --input_pagerank: continue pagerank from this
|
||||
*
|
||||
* Plus the following options for WeightedPageRank and ExtractTweepcred:
|
||||
* --output_pagerank: where to put pagerank file
|
||||
* --output_tweepcred: where to put tweepcred file
|
||||
* Optional:
|
||||
* --maxiterations: how many iterations to run. Default is 20
|
||||
* --jumpprob: probability of a random jump, default is 0.1
|
||||
* --threshold: total difference before finishing early, default 0.001
|
||||
* --post_adjust: whether to do post adjust, default true
|
||||
*/
|
||||
class PreparePageRankData(args: Args) extends Job(args) {
|
||||
implicit val timeZone: TimeZone = DateOps.UTC
|
||||
val PWD = args("pwd")
|
||||
val WEIGHTED = args.getOrElse("weighted", "false").toBoolean
|
||||
val FLOCK_EDGES_ONLY = args.getOrElse("flock_edges_only", "true").toBoolean
|
||||
|
||||
val ROW_TYPE_1 = 1
|
||||
val ROW_TYPE_2 = 2
|
||||
|
||||
// graph data and user mass
|
||||
val userMass = getUserMass
|
||||
val nodesWithPrior = getGraphData(userMass)
|
||||
val numNodes = nodesWithPrior.groupAll { _.size }
|
||||
numNodes.write(Tsv(PWD + "/numnodes"))
|
||||
dumpNodes(nodesWithPrior, PWD + "/nodes");
|
||||
|
||||
// initial pagerank to start computation
|
||||
generateInitialPagerank(nodesWithPrior)
|
||||
|
||||
// continue with the calculation
|
||||
override def next = {
|
||||
Some(new WeightedPageRank(args))
|
||||
}
|
||||
|
||||
/**
|
||||
* read flock edges
|
||||
*/
|
||||
def getFlockEdges = {
|
||||
DAL
|
||||
.readMostRecentSnapshotNoOlderThan(FlockFollowsJavaDataset, Days(7))
|
||||
.toTypedSource
|
||||
.flatMapTo('src_id, 'dst_id) { edge: Flock.Edge =>
|
||||
if (edge.getStateId() == Flock.State.Positive.getNumber()) {
|
||||
Some((edge.getSourceId(), edge.getDestinationId()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* read real graph edges with weights
|
||||
*/
|
||||
def getRealGraphEdges = {
|
||||
RealGraphEdgeSource()
|
||||
.flatMapTo('src_id, 'dst_id, 'weight) { edge: InteractionGraph.Edge =>
|
||||
if (edge.getSourceId() != edge.getDestinationId()) {
|
||||
val srcId = edge.getSourceId()
|
||||
val dstId = edge.getDestinationId()
|
||||
val weight = edge.getWeight().toFloat
|
||||
Some((srcId, dstId, weight))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* combine real graph and flock. If flock_edges_only is true, only take the
|
||||
* flock edges; otherwise edges are either from flock or from real graph.
|
||||
* edges weights default to be 1, overwritten by weights from real graph
|
||||
*/
|
||||
def getFlockRealGraphEdges = {
|
||||
val flock = getFlockEdges
|
||||
|
||||
if (WEIGHTED) {
|
||||
val flockWithWeight = flock
|
||||
.map(() -> ('weight, 'rowtype)) { (u: Unit) =>
|
||||
(1.0f, ROW_TYPE_1)
|
||||
}
|
||||
|
||||
val realGraph = getRealGraphEdges
|
||||
.map(() -> 'rowtype) { (u: Unit) =>
|
||||
(ROW_TYPE_2)
|
||||
}
|
||||
|
||||
val combined = (flockWithWeight ++ realGraph)
|
||||
.groupBy('src_id, 'dst_id) {
|
||||
_.min('rowtype)
|
||||
.max('weight) // take whichever is bigger
|
||||
}
|
||||
|
||||
if (FLOCK_EDGES_ONLY) {
|
||||
combined.filter('rowtype) { (rowtype: Int) =>
|
||||
rowtype == ROW_TYPE_1
|
||||
}
|
||||
} else {
|
||||
combined
|
||||
}
|
||||
} else {
|
||||
flock.map(() -> ('weight)) { (u: Unit) =>
|
||||
1.0f
|
||||
}
|
||||
}.project('src_id, 'dst_id, 'weight)
|
||||
}
|
||||
|
||||
def getCsvEdges(fileName: String) = {
|
||||
Tsv(fileName).read
|
||||
.mapTo((0, 1, 2) -> ('src_id, 'dst_id, 'weight)) { input: (Long, Long, Float) =>
|
||||
input
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Compute user mass based on combined user
|
||||
*/
|
||||
def getUserMass =
|
||||
TypedPipe
|
||||
.from(MostRecentCombinedUserSnapshotSource)
|
||||
.flatMap { user =>
|
||||
UserMass.getUserMass(user)
|
||||
}
|
||||
.map { userMassInfo =>
|
||||
(userMassInfo.userId, userMassInfo.mass)
|
||||
}
|
||||
.toPipe[(Long, Double)]('src_id_input, 'mass_prior)
|
||||
.normalize('mass_prior)
|
||||
|
||||
/**
|
||||
* Read either flock/real_graph or a given tsv file
|
||||
* group by the source id, and output node data structure
|
||||
* merge with the user_mass.
|
||||
* return <'src_id, 'dst_ids, 'weights, 'mass_prior>
|
||||
*
|
||||
* make sure src_id is the same set as in user_mass, and dst_ids
|
||||
* are subset of user_mass. eg flock has edges like 1->2,
|
||||
* where both users 1 and 2 do not exist anymore
|
||||
*/
|
||||
def getGraphData(userMass: RichPipe) = {
|
||||
val edges: RichPipe = args.optional("input") match {
|
||||
case None => getFlockRealGraphEdges
|
||||
case Some(input) => getCsvEdges(input)
|
||||
}
|
||||
|
||||
// remove edges where dst_id is not in userMass
|
||||
val filterByDst = userMass
|
||||
.joinWithLarger('src_id_input -> 'dst_id, edges)
|
||||
.discard('src_id_input, 'mass_prior)
|
||||
|
||||
// aggreate by the source id
|
||||
val nodes = filterByDst
|
||||
.groupBy('src_id) {
|
||||
_.mapReduceMap(('dst_id, 'weight) -> ('dst_ids, 'weights)) /* map1 */ { a: (Long, Float) =>
|
||||
(Vector(a._1), if (WEIGHTED) Vector(a._2) else Vector())
|
||||
} /* reduce */ { (a: (Vector[Long], Vector[Float]), b: (Vector[Long], Vector[Float])) =>
|
||||
{
|
||||
(a._1 ++ b._1, a._2 ++ b._2)
|
||||
}
|
||||
} /* map2 */ { a: (Vector[Long], Vector[Float]) =>
|
||||
a
|
||||
}
|
||||
}
|
||||
.mapTo(
|
||||
('src_id, 'dst_ids, 'weights) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) {
|
||||
input: (Long, Vector[Long], Vector[Float]) =>
|
||||
{
|
||||
(input._1, input._2.toArray, input._3.toArray, 0.0, ROW_TYPE_1)
|
||||
}
|
||||
}
|
||||
|
||||
// get to the same schema
|
||||
val userMassNodes = userMass
|
||||
.mapTo(('src_id_input, 'mass_prior) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) {
|
||||
input: (Long, Double) =>
|
||||
{
|
||||
(input._1, Array[Long](), Array[Float](), input._2, ROW_TYPE_2)
|
||||
}
|
||||
}
|
||||
|
||||
// make src_id the same set as in userMass
|
||||
(nodes ++ userMassNodes)
|
||||
.groupBy('src_id) {
|
||||
_.sortBy('rowtype)
|
||||
.head('dst_ids, 'weights)
|
||||
.last('mass_prior, 'rowtype)
|
||||
}
|
||||
.filter('rowtype) { input: Int =>
|
||||
input == ROW_TYPE_2
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* generate the graph data output
|
||||
*/
|
||||
def dumpNodes(nodes: RichPipe, fileName: String) = {
|
||||
mode match {
|
||||
case Hdfs(_, conf) => nodes.write(SequenceFile(fileName))
|
||||
case _ =>
|
||||
nodes
|
||||
.mapTo((0, 1, 2, 3) -> (0, 1, 2, 3)) { input: (Long, Array[Long], Array[Float], Double) =>
|
||||
(input._1, input._2.mkString(","), input._3.mkString(","), input._4)
|
||||
}
|
||||
.write(Tsv(fileName))
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* output prior mass or copy the given mass file (merge, normalize)
|
||||
* to be used as the starting point
|
||||
*/
|
||||
def generateInitialPagerank(nodes: RichPipe) = {
|
||||
val prior = nodes
|
||||
.project('src_id, 'mass_prior)
|
||||
|
||||
val combined = args.optional("input_pagerank") match {
|
||||
case None => prior
|
||||
case Some(fileName) => {
|
||||
val massInput = Tsv(fileName).read
|
||||
.mapTo((0, 1) -> ('src_id, 'mass_prior, 'rowtype)) { input: (Long, Double) =>
|
||||
(input._1, input._2, ROW_TYPE_2)
|
||||
}
|
||||
|
||||
val priorRow = prior
|
||||
.map(() -> ('rowtype)) { (u: Unit) =>
|
||||
ROW_TYPE_1
|
||||
}
|
||||
|
||||
(priorRow ++ massInput)
|
||||
.groupBy('src_id) {
|
||||
_.sortBy('rowtype)
|
||||
.last('mass_prior)
|
||||
.head('rowtype)
|
||||
}
|
||||
// throw away extra nodes from input file
|
||||
.filter('rowtype) { (rowtype: Int) =>
|
||||
rowtype == ROW_TYPE_1
|
||||
}
|
||||
.discard('rowtype)
|
||||
.normalize('mass_prior)
|
||||
}
|
||||
}
|
||||
|
||||
combined.write(Tsv(PWD + "/pagerank_0"))
|
||||
}
|
||||
}
|
@ -1,75 +0,0 @@
|
||||
Tweepcred
|
||||
|
||||
Tweepcred is a social network analysis tool that calculates the influence of Twitter users based on their interactions with other users. The tool uses the PageRank algorithm to rank users based on their influence.
|
||||
|
||||
PageRank Algorithm
|
||||
PageRank is a graph algorithm that was originally developed by Google to determine the importance of web pages in search results. The algorithm works by assigning a numerical score to each page based on the number and quality of other pages that link to it. The more links a page has from other high-quality pages, the higher its PageRank score.
|
||||
|
||||
In the Tweepcred project, the PageRank algorithm is used to determine the influence of Twitter users based on their interactions with other users. The graph is constructed by treating Twitter users as nodes, and their interactions (mentions, retweets, etc.) as edges. The PageRank score of a user represents their influence in the network.
|
||||
|
||||
Tweepcred PageRank Implementation
|
||||
The implementation of the PageRank algorithm in Tweepcred is based on the Hadoop MapReduce framework. The algorithm is split into two stages: preparation and iteration.
|
||||
|
||||
The preparation stage involves constructing the graph of Twitter users and their interactions, and initializing each user's PageRank score to a default value. This stage is implemented in the PreparePageRankData class.
|
||||
|
||||
The iteration stage involves repeatedly calculating and updating the PageRank scores of each user until convergence is reached. This stage is implemented in the UpdatePageRank class, which is run multiple times until the algorithm converges.
|
||||
|
||||
The Tweepcred PageRank implementation also includes a number of optimizations to improve performance and reduce memory usage. These optimizations include block compression, lazy loading, and in-memory caching.
|
||||
|
||||
|
||||
========================================== TweepcredBatchJob.scala ==========================================
|
||||
|
||||
|
||||
This is a Scala class that represents a batch job for computing the "tweepcred" (Twitter credibility) score for Twitter users using weighted or unweighted PageRank algorithm. The class extends the AnalyticsIterativeBatchJob class, which is part of the Scalding framework used for data processing on Hadoop.
|
||||
|
||||
The class defines various properties and methods that are used to configure and run the batch job. The args parameter represents the command-line arguments that are passed to the batch job, such as the --weighted flag that determines whether to use the weighted PageRank algorithm or not.
|
||||
|
||||
The run method overrides the run method of the base class and prints the batch statistics after the job has finished. The children method defines a list of child jobs that need to be executed as part of the batch job. The messageHeader method returns a string that represents the header of the batch job message.
|
||||
|
||||
========================================== ExtractTweepcred.scala ==========================================
|
||||
|
||||
This class is a Scalding job that calculates "tweepcred" from a given pagerank file. Tweepcred is a measure of reputation for Twitter users that takes into account the number of followers they have and the number of people they follow. If the optional argument post_adjust is set to true (default value), then the pagerank values are adjusted based on the user's follower-to-following ratio.
|
||||
|
||||
The class takes several command-line arguments specifying input and output files and options, and it uses the Scalding library to perform distributed data processing on the input files. It reads in the pagerank file and a user mass file, both in TSV format, and combines them to produce a new pagerank file with the adjusted values. The adjusted pagerank is then used to calculate tweepcred values, which are written to output files.
|
||||
|
||||
The code makes use of the MostRecentCombinedUserSnapshotSource class from the com.twitter.pluck.source.combined_user_source package to obtain user information from the user mass file. It also uses the Reputation class to perform the tweepcred calculations and adjustments.
|
||||
|
||||
|
||||
========================================== UserMass.scala ==========================================
|
||||
|
||||
The UserMass class is a helper class used to calculate the "mass" of a user on Twitter, as defined by a certain algorithm. The mass score represents the user's reputation and is used in various applications, such as in determining which users should be recommended to follow or which users should have their content highlighted.
|
||||
|
||||
The getUserMass method of the UserMass class takes in a CombinedUser object, which contains information about a Twitter user, and returns an optional UserMassInfo object, which contains the user's ID and calculated mass score.
|
||||
|
||||
The algorithm used to calculate the mass score takes into account various factors such as the user's account age, number of followers and followings, device usage, and safety status (restricted, suspended, verified). The calculation involves adding and multiplying weight factors and adjusting the mass score based on a threshold for the number of friends and followers.
|
||||
|
||||
|
||||
========================================== PreparePageRankData.scala ==========================================
|
||||
|
||||
The PreparePageRankData class prepares the graph data for the page rank calculation. It generates the initial pagerank and then starts the WeightedPageRank job. It has the following functionalities:
|
||||
|
||||
It reads the user mass TSV file generated by the twadoop user_mass job.
|
||||
It reads the graph data, which is either a TSV file or a combination of flock edges and real graph inputs for weights.
|
||||
It generates the initial pagerank as the starting point for the pagerank computation.
|
||||
It writes the number of nodes to a TSV file and dumps the nodes to another TSV file.
|
||||
It has several options like weighted, flock_edges_only, and input_pagerank to fine-tune the pagerank calculation.
|
||||
It also has options for the WeightedPageRank and ExtractTweepcred jobs, like output_pagerank, output_tweepcred, maxiterations, jumpprob, threshold, and post_adjust.
|
||||
The PreparePageRankData class has several helper functions like getFlockEdges, getRealGraphEdges, getFlockRealGraphEdges, and getCsvEdges that read the graph data from different sources like DAL, InteractionGraph, or CSV files. It also has the generateInitialPagerank function that generates the initial pagerank from the graph data.
|
||||
|
||||
========================================== WeightedPageRank.scala ==========================================
|
||||
|
||||
WeightedPageRank is a class that performs the weighted PageRank algorithm on a given graph.
|
||||
|
||||
The algorithm starts from a given PageRank value and performs one iteration, then tests for convergence. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job with the updated PageRank as input. If convergence has been reached, the algorithm starts the ExtractTweepcred job instead.
|
||||
|
||||
The class takes in several options, including the working directory, total number of nodes, nodes file, PageRank file, total difference, whether to perform weighted PageRank, the current iteration, maximum iterations to run, probability of a random jump, and whether to do post adjust.
|
||||
|
||||
The algorithm reads a nodes file that includes the source node ID, destination node IDs, weights, and mass prior. The algorithm also reads an input PageRank file that includes the source node ID and mass input. The algorithm then performs one iteration of the PageRank algorithm and writes the output PageRank to a file.
|
||||
|
||||
The algorithm tests for convergence by calculating the total difference between the input and output PageRank masses. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job. If convergence has been reached, the algorithm starts the ExtractTweepcred job.
|
||||
|
||||
========================================== Reputation.scala ==========================================
|
||||
|
||||
This is a helper class called Reputation that contains methods for calculating a user's reputation score. The first method called scaledReputation takes a Double parameter raw which represents the user's page rank, and returns a Byte value that represents the user's reputation on a scale of 0 to 100. This method uses a formula that involves converting the logarithm of the page rank to a number between 0 and 100.
|
||||
|
||||
The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank.
|
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/README.docx
Normal file
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/README.docx
Normal file
Binary file not shown.
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx
Normal file
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/Reputation.docx
Normal file
Binary file not shown.
@ -1,50 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
/**
|
||||
* helper class to calculate reputation, borrowed from repo reputations
|
||||
*/
|
||||
object Reputation {
|
||||
|
||||
/**
|
||||
* convert pagerank to tweepcred between 0 and 100,
|
||||
* take from repo reputations, util/Utils.scala
|
||||
*/
|
||||
def scaledReputation(raw: Double): Byte = {
|
||||
if (raw == 0 || (raw < 1.0e-20)) {
|
||||
0
|
||||
} else {
|
||||
// convert log(pagerank) to a number between 0 and 100
|
||||
// the two parameters are from a linear fit by converting
|
||||
// max pagerank -> 95
|
||||
// min pagerank -> 15
|
||||
val e: Double = 130d + 5.21 * scala.math.log(raw) // log to the base e
|
||||
val pos = scala.math.rint(e)
|
||||
val v = if (pos > 100) 100.0 else if (pos < 0) 0.0 else pos
|
||||
v.toByte
|
||||
}
|
||||
}
|
||||
|
||||
// these constants are take from repo reputations, config/production.conf
|
||||
private val threshAbsNumFriendsReps = 2500
|
||||
private val constantDivisionFactorGt_threshFriendsToFollowersRatioReps = 3.0
|
||||
private val threshFriendsToFollowersRatioUMass = 0.6
|
||||
private val maxDivFactorReps = 50
|
||||
|
||||
/**
|
||||
* reduce pagerank of users with low followers but high followings
|
||||
*/
|
||||
def adjustReputationsPostCalculation(mass: Double, numFollowers: Int, numFollowings: Int) = {
|
||||
if (numFollowings > threshAbsNumFriendsReps) {
|
||||
val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers)
|
||||
val divFactor =
|
||||
scala.math.exp(
|
||||
constantDivisionFactorGt_threshFriendsToFollowersRatioReps *
|
||||
(friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) *
|
||||
scala.math.log(scala.math.log(numFollowings))
|
||||
)
|
||||
mass / ((divFactor min maxDivFactorReps) max 1.0)
|
||||
} else {
|
||||
mass
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,64 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
import com.twitter.scalding._
|
||||
import com.twitter.scalding_internal.job._
|
||||
import com.twitter.scalding_internal.job.analytics_batch._
|
||||
|
||||
/**
|
||||
* Register the beginning of the tweepcred job in analytic batch table
|
||||
*
|
||||
* Options:
|
||||
* --weighted: do weighted pagerank
|
||||
* --hadoop_config: /etc/hadoop/hadoop-conf-proc-atla
|
||||
*
|
||||
*/
|
||||
class TweepcredBatchJob(args: Args) extends AnalyticsIterativeBatchJob(args) {
|
||||
|
||||
def WEIGHTED = args("weighted").toBoolean
|
||||
|
||||
override def timeout = Hours(36)
|
||||
override def hasFlow = false
|
||||
def descriptionSuffix = " weighted=" + args("weighted")
|
||||
override def batchIncrement = Hours(24)
|
||||
override def firstTime = RichDate("2015-10-02")
|
||||
override def batchDescription = classOf[TweepcredBatchJob].getCanonicalName + descriptionSuffix
|
||||
|
||||
override def run = {
|
||||
val success = super.run
|
||||
println("Batch Stat: " + messageHeader + " " + jobStat.get.toString)
|
||||
success
|
||||
}
|
||||
|
||||
def startTime = dateRange.start
|
||||
def dateString = startTime.toString("yyyy/MM/dd")
|
||||
|
||||
override def children = {
|
||||
val BASEDIR = "/user/cassowary/tweepcred/"
|
||||
val baseDir = BASEDIR + (if (WEIGHTED) "weighted" else "unweighted") + "/daily/"
|
||||
val tmpDir = baseDir + "tmp"
|
||||
val outputDir = baseDir + dateString
|
||||
val pageRankDir = outputDir + "/finalmass"
|
||||
val tweepcredDir = outputDir + "/finaltweepcred"
|
||||
val yesterdayStr = (startTime - Days(1)).toString("yyyy/MM/dd")
|
||||
val yestPageRankDir = baseDir + yesterdayStr + "/finalmass"
|
||||
val TWEEPCRED = "/tweepcred"
|
||||
val curRep = (if (WEIGHTED) baseDir else BASEDIR) + "current"
|
||||
val todayRep = (if (WEIGHTED) baseDir else BASEDIR) + dateString
|
||||
val newArgs = args + ("pwd", Some(tmpDir)) +
|
||||
("output_pagerank", Some(pageRankDir)) +
|
||||
("output_tweepcred", Some(tweepcredDir)) +
|
||||
("input_pagerank", Some(yestPageRankDir)) +
|
||||
("current_tweepcred", Some(curRep + TWEEPCRED)) +
|
||||
("today_tweepcred", Some(todayRep + TWEEPCRED))
|
||||
|
||||
val prJob = new PreparePageRankData(newArgs)
|
||||
|
||||
List(prJob)
|
||||
}
|
||||
|
||||
private def messageHeader = {
|
||||
val dateString = dateRange.start.toString("yyyy/MM/dd")
|
||||
classOf[TweepcredBatchJob].getSimpleName +
|
||||
(if (WEIGHTED) " weighted " else " unweighted ") + dateString
|
||||
}
|
||||
}
|
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx
Normal file
BIN
src/scala/com/twitter/graph/batch/job/tweepcred/UserMass.docx
Normal file
Binary file not shown.
@ -1,69 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
import com.twitter.twadoop.user.gen.CombinedUser
|
||||
import com.twitter.util.Time
|
||||
import com.twitter.wtf.scalding.jobs.common.DateUtil
|
||||
|
||||
case class UserMassInfo(userId: Long, mass: Double)
|
||||
|
||||
/**
|
||||
* helper class to calculate user mass, borrowed from repo reputations
|
||||
*/
|
||||
object UserMass {
|
||||
|
||||
private val currentTimestamp = Time.now.inMilliseconds
|
||||
private val constantDivisionFactorGt_threshFriendsToFollowersRatioUMass = 5.0
|
||||
private val threshAbsNumFriendsUMass = 500
|
||||
private val threshFriendsToFollowersRatioUMass = 0.6
|
||||
private val deviceWeightAdditive = 0.5
|
||||
private val ageWeightAdditive = 0.2
|
||||
private val restrictedWeightMultiplicative = 0.1
|
||||
|
||||
def getUserMass(combinedUser: CombinedUser): Option[UserMassInfo] = {
|
||||
val user = Option(combinedUser.user)
|
||||
val userId = user.map(_.id).getOrElse(0L)
|
||||
val userExtended = Option(combinedUser.user_extended)
|
||||
val age = user.map(_.created_at_msec).map(DateUtil.diffDays(_, currentTimestamp)).getOrElse(0)
|
||||
val isRestricted = user.map(_.safety).exists(_.restricted)
|
||||
val isSuspended = user.map(_.safety).exists(_.suspended)
|
||||
val isVerified = user.map(_.safety).exists(_.verified)
|
||||
val hasValidDevice = user.flatMap(u => Option(u.devices)).exists(_.isSetMessaging_devices)
|
||||
val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0)
|
||||
val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0)
|
||||
|
||||
if (userId == 0L || user.map(_.safety).exists(_.deactivated)) {
|
||||
None
|
||||
} else {
|
||||
val mass =
|
||||
if (isSuspended)
|
||||
0
|
||||
else if (isVerified)
|
||||
100
|
||||
else {
|
||||
var score = deviceWeightAdditive * 0.1 +
|
||||
(if (hasValidDevice) deviceWeightAdditive else 0)
|
||||
val normalizedAge = if (age > 30) 1.0 else (1.0 min scala.math.log(1.0 + age / 15.0))
|
||||
score *= normalizedAge
|
||||
if (score < 0.01) score = 0.01
|
||||
if (isRestricted) score *= restrictedWeightMultiplicative
|
||||
score = (score min 1.0) max 0
|
||||
score *= 100
|
||||
score
|
||||
}
|
||||
|
||||
val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers)
|
||||
val adjustedMass =
|
||||
if (numFollowings > threshAbsNumFriendsUMass &&
|
||||
friendsToFollowersRatio > threshFriendsToFollowersRatioUMass) {
|
||||
mass / scala.math.exp(
|
||||
constantDivisionFactorGt_threshFriendsToFollowersRatioUMass *
|
||||
(friendsToFollowersRatio - threshFriendsToFollowersRatioUMass)
|
||||
)
|
||||
} else {
|
||||
mass
|
||||
}
|
||||
|
||||
Some(UserMassInfo(userId, adjustedMass))
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,235 +0,0 @@
|
||||
package com.twitter.graph.batch.job.tweepcred
|
||||
|
||||
import com.twitter.scalding._
|
||||
|
||||
/**
|
||||
* weighted page rank for the given graph, start from the given pagerank,
|
||||
* perform one iteration, test for convergence, if not yet, clone itself
|
||||
* and start the next page rank job with updated pagerank as input;
|
||||
* if converged, start ExtractTweepcred job instead
|
||||
*
|
||||
* Options:
|
||||
* --pwd: working directory, will read/generate the following files there
|
||||
* numnodes: total number of nodes
|
||||
* nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
|
||||
* pagerank: the page rank file eg pagerank_0, pagerank_1 etc
|
||||
* totaldiff: the current max pagerank delta
|
||||
* Optional arguments:
|
||||
* --weighted: do weighted pagerank, default false
|
||||
* --curiteration: what is the current iteration, default 0
|
||||
* --maxiterations: how many iterations to run. Default is 20
|
||||
* --jumpprob: probability of a random jump, default is 0.1
|
||||
* --threshold: total difference before finishing early, default 0.001
|
||||
*
|
||||
* plus the following options for ExtractTweepcred:
|
||||
* --user_mass: user mass tsv file, generated by twadoop user_mass job
|
||||
* --output_pagerank: where to put pagerank file
|
||||
* --output_tweepcred: where to put tweepcred file
|
||||
* Optional:
|
||||
* --post_adjust: whether to do post adjust, default true
|
||||
*
|
||||
*/
|
||||
class WeightedPageRank(args: Args) extends Job(args) {
|
||||
val ROW_TYPE_1 = 1
|
||||
val ROW_TYPE_2 = 2
|
||||
|
||||
val PWD = args("pwd")
|
||||
val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble
|
||||
val WEIGHTED = args.getOrElse("weighted", "false").toBoolean
|
||||
val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble
|
||||
val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt
|
||||
val CURITERATION = args.getOrElse("curiteration", "0").toInt
|
||||
|
||||
// 'size
|
||||
val numNodes = getNumNodes(PWD + "/numnodes")
|
||||
|
||||
// 'src_id, 'dst_ids, 'weights, 'mass_prior
|
||||
val nodes = getNodes(PWD + "/nodes")
|
||||
|
||||
// 'src_id_input, 'mass_input
|
||||
val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION)
|
||||
|
||||
// one iteration of pagerank
|
||||
val outputPagerank = doPageRank(nodes, inputPagerank)
|
||||
val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1)
|
||||
outputPagerank
|
||||
.project('src_id, 'mass_n)
|
||||
.write(Tsv(outputFileName))
|
||||
|
||||
// detect convergence
|
||||
val totalDiff = outputPagerank
|
||||
.mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) =>
|
||||
scala.math.abs(args._1 - args._2)
|
||||
}
|
||||
.groupAll { _.sum[Double]('mass_diff) }
|
||||
.write(Tsv(PWD + "/totaldiff"))
|
||||
|
||||
/**
|
||||
* test convergence, if not yet, kick off the next iteration
|
||||
*/
|
||||
override def next = {
|
||||
// the max diff generated above
|
||||
val totalDiff = Tsv(PWD + "/totaldiff").readAtSubmitter[Double].head
|
||||
|
||||
if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) {
|
||||
val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString))
|
||||
Some(clone(newArgs))
|
||||
} else {
|
||||
val newArgs = args + ("input_pagerank", Some(outputFileName))
|
||||
Some(new ExtractTweepcred(newArgs))
|
||||
}
|
||||
}
|
||||
|
||||
def getInputPagerank(fileName: String) = {
|
||||
Tsv(fileName).read
|
||||
.mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Long, Double) =>
|
||||
input
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
|
||||
*/
|
||||
def getNodes(fileName: String) = {
|
||||
mode match {
|
||||
case Hdfs(_, conf) => {
|
||||
SequenceFile(fileName).read
|
||||
.mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
|
||||
input: (Long, Array[Long], Array[Float], Double) =>
|
||||
input
|
||||
}
|
||||
}
|
||||
case _ => {
|
||||
Tsv(fileName).read
|
||||
.mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
|
||||
input: (Long, String, String, Double) =>
|
||||
{
|
||||
(
|
||||
input._1,
|
||||
// convert string to int array
|
||||
if (input._2 != null && input._2.length > 0) {
|
||||
input._2.split(",").map { _.toLong }
|
||||
} else {
|
||||
Array[Long]()
|
||||
},
|
||||
// convert string to float array
|
||||
if (input._3 != null && input._3.length > 0) {
|
||||
input._3.split(",").map { _.toFloat }
|
||||
} else {
|
||||
Array[Float]()
|
||||
},
|
||||
input._4
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* the total number of nodes, single line file
|
||||
*/
|
||||
def getNumNodes(fileName: String) = {
|
||||
Tsv(fileName).read
|
||||
.mapTo(0 -> 'size) { input: Long =>
|
||||
input
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* one iteration of pagerank
|
||||
* inputPagerank: <'src_id_input, 'mass_input>
|
||||
* return <'src_id, 'mass_n, 'mass_input>
|
||||
*
|
||||
* Here is a highlevel view of the unweighted algorithm:
|
||||
* let
|
||||
* N: number of nodes
|
||||
* inputPagerank(N_i): prob of walking to node i,
|
||||
* d(N_j): N_j's out degree
|
||||
* then
|
||||
* pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j)
|
||||
* deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N
|
||||
* randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA)
|
||||
* pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1-ALPHA)
|
||||
*
|
||||
* For weighted algorithm:
|
||||
* let
|
||||
* w(N_j, N_i): weight from N_j to N_i
|
||||
* tw(N_j): N_j's total out weights
|
||||
* then
|
||||
* pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j))
|
||||
*
|
||||
*/
|
||||
def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = {
|
||||
// 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input
|
||||
val nodeJoined = nodeRows
|
||||
.joinWithSmaller('src_id -> 'src_id_input, inputPagerank)
|
||||
.discard('src_id_input)
|
||||
|
||||
// 'src_id, 'mass_n
|
||||
val pagerankNext = nodeJoined
|
||||
.flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) {
|
||||
args: (Array[Long], Array[Float], Double) =>
|
||||
{
|
||||
if (args._1.length > 0) {
|
||||
if (WEIGHTED) {
|
||||
// weighted distribution
|
||||
val total: Double = args._2.sum
|
||||
(args._1 zip args._2).map { idWeight: (Long, Float) =>
|
||||
(idWeight._1, args._3 * idWeight._2 / total)
|
||||
}
|
||||
} else {
|
||||
// equal distribution
|
||||
val dist: Double = args._3 / args._1.length
|
||||
args._1.map { id: Long =>
|
||||
(id, dist)
|
||||
}
|
||||
}
|
||||
} else {
|
||||
//Here is a node that points to no other nodes (dangling)
|
||||
Nil
|
||||
}
|
||||
}
|
||||
}
|
||||
.groupBy('src_id) {
|
||||
_.sum[Double]('mass_n)
|
||||
}
|
||||
|
||||
// 'sum_mass
|
||||
val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) }
|
||||
|
||||
// 'deadMass
|
||||
// single row jobs
|
||||
// the dead page rank equally distributed to every node
|
||||
val deadPagerank = sumPagerankNext
|
||||
.crossWithTiny(numNodes)
|
||||
.map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Long) =>
|
||||
(1.0 - input._1) / input._2
|
||||
}
|
||||
.discard('size, 'sum_mass)
|
||||
|
||||
// 'src_id_r, 'mass_n_r
|
||||
// random jump probability plus dead page rank
|
||||
val randomPagerank = nodeJoined
|
||||
.crossWithTiny(deadPagerank)
|
||||
.mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) {
|
||||
ranks: (Long, Double, Double, Double) =>
|
||||
(ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4)
|
||||
}
|
||||
|
||||
// 'src_id, 'mass_n
|
||||
// scale next page rank to 1-ALPHA
|
||||
val pagerankNextScaled = pagerankNext
|
||||
.map('mass_n -> ('mass_n, 'mass_input)) { m: Double =>
|
||||
((1 - ALPHA) * m, 0.0)
|
||||
}
|
||||
|
||||
// 'src_id, 'mass_n, 'mass_input
|
||||
// random probability + next probability
|
||||
(randomPagerank ++ pagerankNextScaled)
|
||||
.groupBy('src_id) {
|
||||
_.sum[Double]('mass_input) // keep the input pagerank
|
||||
.sum[Double]('mass_n) // take the sum
|
||||
}
|
||||
}
|
||||
}
|
BIN
src/scala/com/twitter/interaction_graph/README.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/README.docx
Normal file
Binary file not shown.
@ -1,19 +0,0 @@
|
||||
## Real Graph (bqe)
|
||||
|
||||
This project builds a machine learning model using a gradient boosting tree classifier to predict the likelihood of a Twitter user interacting with another user.
|
||||
|
||||
The algorithm works by first creating a labeled dataset of user interactions from a graph of Twitter users. This graph is represented in a BigQuery table where each row represents a directed edge between two users, along with various features such as the number of tweets, follows, favorites, and other metrics related to user behavior.
|
||||
|
||||
To create the labeled dataset, the algorithm first selects a set of candidate interactions by identifying all edges that were active during a certain time period. It then joins this candidate set with a set of labeled interactions that occurred one day after the candidate period. Positive interactions are labeled as "1" and negative interactions are labeled as "0". The resulting labeled dataset is then used to train a boosted tree classifier model.
|
||||
|
||||
The model is trained using the labeled dataset and various hyperparameters, including the maximum number of iterations and the subsample rate. The algorithm splits the labeled dataset into training and testing sets based on the source user's ID, using a custom data split method.
|
||||
|
||||
Once the model is trained, it can be used to generate a score estimating the probability of a user interacting with another user.
|
||||
|
||||
## Real Graph (scio)
|
||||
|
||||
This project aggregates the number of interactions between pairs of users on Twitter. On a daily basis, there are multiple dataflow jobs that perform this aggregation, which includes public engagements like favorites, retweets, follows, etc. as well as private engagements like profile views, tweet clicks, and whether or not a user has another user in their address book (given a user opt-in to share address book).
|
||||
|
||||
After the daily aggregation of interactions, there is a rollup job that aggregates yesterday's aggregation with today's interactions. The rollup job outputs several results, including the daily count of interactions per interaction types between a pair of users, the daily incoming interactions made on a user per interaction type, the rollup aggregation of interactions as a decayed sum between a pair of users, and the rollup aggregation of incoming interactions made on a user.
|
||||
|
||||
Finally, the rollup job outputs the ML predicted interaction score between the pair of users alongside the rollup aggregation of interactions as a decayed sum between them.
|
BIN
src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/bqe/scoring/README.docx
Normal file
Binary file not shown.
@ -1,58 +0,0 @@
|
||||
# Scoring
|
||||
|
||||
This folder contains the sql files that we'll use for scoring the real graph edges in BQ. We have 4 steps that take place:
|
||||
- check to make sure that our models are in place. the feature importance query should return 20 rows in total: 10 rows per model, 1 for each feature.
|
||||
- follow graph feature generation. this is to ensure that we have features for all users regardless if they have had any recent activity.
|
||||
- candidate generation. this query combines the candidates from the follow graph and the activity graph, and the features from both.
|
||||
- scoring. this query scores with 2 of our prod models and saves the scores to a table, with an additional field that distinguishes if an edge in in/out of network.
|
||||
|
||||
## Instructions
|
||||
|
||||
For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora.
|
||||
|
||||
```
|
||||
zip -jr real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring && \
|
||||
packer add_version --cluster=atla cassowary real_graph_scoring real_graph_scoring.zip
|
||||
aurora cron schedule atla/cassowary/prod/real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.aurora && \
|
||||
aurora cron start atla/cassowary/prod/real_graph_scoring
|
||||
```
|
||||
|
||||
# candidates.sql
|
||||
|
||||
This BigQuery (BQ) query does the following:
|
||||
|
||||
1. Declares two variables, date_start and date_end, which are both of type DATE.
|
||||
2. Sets the date_end variable to the maximum partition ID of the interaction_graph_labels_daily table, using the PARSE_DATE() function to convert the partition ID to a date format.
|
||||
3. Sets the date_start variable to 30 days prior to the date_end variable, using the DATE_SUB() function.
|
||||
4. Creates a new table called candidates in the realgraph dataset, partitioned by ds.
|
||||
5. The query uses three common table expressions (T1, T2, and T3) to join data from two tables (interaction_graph_labels_daily and tweeting_follows) to generate a table containing candidate information and features.
|
||||
6. The table T3 is the result of a full outer join between T1 and T2, grouping by source_id and destination_id, and aggregating values such as num_tweets, label_types, and the counts of different types of labels (e.g. num_follows, num_favorites, etc.).
|
||||
7. The T4 table ranks each source_id by the number of num_days and num_tweets, and selects the top 2000 rows for each source_id.
|
||||
8. Finally, the query selects all columns from the T4 table and appends the date_end variable as a new column named ds.
|
||||
|
||||
Overall, the query generates a table of candidates and their associated features for a particular date range, using data from two tables in the twttr-bq-cassowary-prod and twttr-recos-ml-prod datasets.
|
||||
|
||||
# follow_graph_features.sql
|
||||
|
||||
This BigQuery script creates a table twttr-recos-ml-prod.realgraph.tweeting_follows that includes features for Twitter user interactions, specifically tweet counts and follows.
|
||||
|
||||
First, it sets two variables date_latest_tweet and date_latest_follows to the most recent dates available in two separate tables: twttr-bq-tweetsource-pub-prod.user.public_tweets and twttr-recos-ml-prod.user_events.valid_user_follows, respectively.
|
||||
|
||||
Then, it creates the tweet_count and all_follows CTEs.
|
||||
|
||||
The tweet_count CTE counts the number of tweets made by each user within the last 3 days prior to date_latest_tweet.
|
||||
|
||||
The all_follows CTE retrieves all the follows from the valid_user_follows table that happened on date_latest_follows and left joins it with the tweet_count CTE. It also adds a row number that partitions by the source user ID and orders by the number of tweets in descending order. The final output is filtered to keep only the top 2000 follows per user based on the row number.
|
||||
|
||||
The final SELECT statement combines the all_follows CTE with the date_latest_tweet variable and inserts the results into the twttr-recos-ml-prod.realgraph.tweeting_follows table partitioned by date.
|
||||
|
||||
# scoring.sql
|
||||
|
||||
This BQ code performs operations on a BigQuery table called twttr-recos-ml-prod.realgraph.scores. Here is a step-by-step breakdown of what the code does:
|
||||
|
||||
Declare two variables, date_end and date_latest_follows, and set their values based on the latest partitions in the twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS and twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS tables that correspond to specific tables, respectively. The PARSE_DATE() function is used to convert the partition IDs to date format.
|
||||
|
||||
Delete rows from the twttr-recos-ml-prod.realgraph.scores table where the value of the ds column is equal to date_end.
|
||||
|
||||
Insert rows into the twttr-recos-ml-prod.realgraph.scores table based on a query that generates predicted scores for pairs of user IDs using two machine learning models. Specifically, the query uses the ML.PREDICT() function to apply two machine learning models (twttr-recos-ml-prod.realgraph.prod and twttr-recos-ml-prod.realgraph.prod_explicit) to the twttr-recos-ml-prod.realgraph.candidates table. The resulting predicted scores are joined with the twttr-recos-ml-prod.realgraph.tweeting_follows table, which contains information about the number of tweets made by users and their follow relationships, using a full outer join. The final result includes columns for the source ID, destination ID, predicted score (prob), explicit predicted score (prob_explicit), a binary variable indicating whether the destination ID is followed by the source ID (followed), and the value of date_end for the ds column. If there is no match in the predicted_scores table for a given pair of user IDs, the COALESCE() function is used to return the corresponding values from the tweeting_follows table, with default values of 0.0 for the predicted scores.
|
||||
|
Binary file not shown.
@ -1,42 +0,0 @@
|
||||
DECLARE date_start, date_end DATE;
|
||||
SET date_end = (
|
||||
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
|
||||
FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS`
|
||||
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily"
|
||||
);
|
||||
SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY);
|
||||
|
||||
-- all candidates and their features
|
||||
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates`
|
||||
PARTITION BY ds
|
||||
AS
|
||||
WITH T1 AS (
|
||||
SELECT source_id, destination_id, label, dateHour
|
||||
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
|
||||
LEFT JOIN UNNEST(labels) AS label
|
||||
WHERE DATE(dateHour) BETWEEN date_start AND date_end
|
||||
), T2 AS (
|
||||
SELECT source_id, destination_id, num_tweets
|
||||
FROM `twttr-recos-ml-prod.realgraph.tweeting_follows`
|
||||
), T3 AS (
|
||||
SELECT
|
||||
COALESCE(T1.source_id, T2.source_id) AS source_id,
|
||||
COALESCE(T1.destination_id, T2.destination_id) AS destination_id,
|
||||
COUNT(DISTINCT(T1.dateHour)) AS num_days,
|
||||
MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same
|
||||
COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction,
|
||||
COUNT(DISTINCT(label)) AS label_types,
|
||||
COUNTIF(label="num_follows") AS num_follows,
|
||||
COUNTIF(label="num_favorites") AS num_favorites,
|
||||
COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks,
|
||||
COUNTIF(label="num_profile_views") AS num_profile_views,
|
||||
FROM T1
|
||||
FULL JOIN T2
|
||||
USING (source_id, destination_id)
|
||||
GROUP BY 1,2
|
||||
ORDER BY 3 DESC,4 DESC
|
||||
), T4 AS (
|
||||
SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, *
|
||||
FROM T3
|
||||
) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000
|
||||
|
Binary file not shown.
@ -1,5 +0,0 @@
|
||||
(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod`)
|
||||
ORDER BY importance_gain DESC)
|
||||
UNION ALL
|
||||
(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`)
|
||||
ORDER BY importance_gain DESC)
|
Binary file not shown.
@ -1,28 +0,0 @@
|
||||
DECLARE date_latest_tweet, date_latest_follows DATE;
|
||||
SET date_latest_tweet = (
|
||||
SELECT PARSE_DATE('%Y%m%d', SUBSTRING(MAX(partition_id), 1, 8)) AS partition_id
|
||||
FROM `twttr-bq-tweetsource-pub-prod.user.INFORMATION_SCHEMA.PARTITIONS`
|
||||
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="public_tweets");
|
||||
SET date_latest_follows = (
|
||||
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
|
||||
FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS`
|
||||
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows");
|
||||
|
||||
-- tweet count candidate features
|
||||
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.tweeting_follows`
|
||||
PARTITION BY ds
|
||||
AS
|
||||
WITH tweet_count AS (
|
||||
SELECT userId, COUNT(userId) AS num_tweets
|
||||
FROM `twttr-bq-tweetsource-pub-prod.user.public_tweets`
|
||||
WHERE DATE(ts) BETWEEN DATE_SUB(date_latest_tweet, INTERVAL 3 DAY) AND date_latest_tweet
|
||||
GROUP BY 1
|
||||
), all_follows AS (
|
||||
SELECT F.sourceId AS source_id, F.destinationId AS destination_id, COALESCE(T.num_tweets,0) AS num_tweets,
|
||||
ROW_NUMBER() OVER (PARTITION BY F.sourceId ORDER BY T.num_tweets DESC) AS rn
|
||||
FROM `twttr-recos-ml-prod.user_events.valid_user_follows` F
|
||||
LEFT JOIN tweet_count T
|
||||
ON F.destinationId=T.userId
|
||||
WHERE DATE(F._PARTITIONTIME) = date_latest_follows
|
||||
) SELECT *, date_latest_tweet AS ds FROM all_follows WHERE rn <= 2000
|
||||
;
|
BIN
src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.docx
Normal file
Binary file not shown.
@ -1,52 +0,0 @@
|
||||
DECLARE date_end, date_latest_follows DATE;
|
||||
SET date_end = (
|
||||
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
|
||||
FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS`
|
||||
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily"
|
||||
);
|
||||
SET date_latest_follows = (
|
||||
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
|
||||
FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS`
|
||||
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows");
|
||||
|
||||
DELETE
|
||||
FROM `twttr-recos-ml-prod.realgraph.scores`
|
||||
WHERE ds = date_end;
|
||||
|
||||
-- score candidates (59m)
|
||||
INSERT INTO `twttr-recos-ml-prod.realgraph.scores`
|
||||
WITH predicted_scores AS (
|
||||
SELECT
|
||||
source_id,
|
||||
destination_id,
|
||||
p1.prob AS prob,
|
||||
p2.prob AS prob_explicit
|
||||
FROM ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod`,
|
||||
(
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
`twttr-recos-ml-prod.realgraph.candidates` ) ) S1
|
||||
CROSS JOIN UNNEST(S1.predicted_label_probs) AS p1
|
||||
JOIN ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`,
|
||||
(
|
||||
SELECT
|
||||
*
|
||||
FROM
|
||||
`twttr-recos-ml-prod.realgraph.candidates` ) ) S2
|
||||
USING (source_id, destination_id)
|
||||
CROSS JOIN UNNEST(S2.predicted_label_probs) AS p2
|
||||
WHERE p1.label=1 AND p2.label=1
|
||||
)
|
||||
SELECT
|
||||
COALESCE(predicted_scores.source_id, tweeting_follows.source_id) AS source_id,
|
||||
COALESCE(predicted_scores.destination_id, tweeting_follows.destination_id) AS destination_id,
|
||||
COALESCE(prob, 0.0) AS prob,
|
||||
COALESCE(prob_explicit, 0.0) AS prob_explicit,
|
||||
(tweeting_follows.source_id IS NOT NULL) AND (tweeting_follows.destination_id IS NOT NULL) AS followed,
|
||||
date_end AS ds
|
||||
FROM
|
||||
predicted_scores
|
||||
FULL JOIN
|
||||
`twttr-recos-ml-prod.realgraph.tweeting_follows` tweeting_follows
|
||||
USING (source_id, destination_id)
|
BIN
src/scala/com/twitter/interaction_graph/bqe/training/README.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/bqe/training/README.docx
Normal file
Binary file not shown.
@ -1,60 +0,0 @@
|
||||
# Training
|
||||
|
||||
This folder contains the sql files that we'll use for training the prod real graph models:
|
||||
- prod (predicts any interactions the next day)
|
||||
- prod_explicit (predicts any explicit interactions the next day)
|
||||
|
||||
We have 3 steps that take place:
|
||||
- candidate generation + feature hydration. this query samples 1% of edges from the `twttr-recos-ml-prod.realgraph.candidates` table which is already produced daily and saves it to `twttr-recos-ml-prod.realgraph.candidates_sampled`. we save each day's data according to the statebird batch run date and hence require checks to make sure that the data exists to begin with.
|
||||
- label candidates. we join day T's candidates with day T+1's labels while filtering out any negative interactions to get our labeled dataset. we append an additional day's worth of segments for each day. we finally generate the training dataset which uses all day's labeled data for training, performing negative downsampling to get a roughly 50-50 split of positive to negative labels.
|
||||
- training. we use bqml for training our xgboost models.
|
||||
|
||||
## Instructions
|
||||
|
||||
For deploying the job, you would need to create a zip file, upload to packer, and then schedule it with aurora.
|
||||
|
||||
```
|
||||
zip -jr real_graph_training src/scala/com/twitter/interaction_graph/bqe/training && \
|
||||
packer add_version --cluster=atla cassowary real_graph_training real_graph_training.zip
|
||||
aurora cron schedule atla/cassowary/prod/real_graph_training src/scala/com/twitter/interaction_graph/bqe/training/training.aurora && \
|
||||
aurora cron start atla/cassowary/prod/real_graph_training
|
||||
```
|
||||
|
||||
# candidates.sql
|
||||
|
||||
1. Sets the value of the variable date_candidates to the date of the latest partition of the candidates_for_training table.
|
||||
2. Creates a new table candidates_sampled if it does not exist already, which will contain a sample of 100 rows from the candidates_for_training table.
|
||||
3. Deletes any existing rows from the candidates_sampled table where the ds column matches the date_candidates value, to avoid double-writing.
|
||||
4. Inserts a sample of rows into the candidates_sampled table from the candidates_for_training table, where the modulo of the absolute value of the FARM_FINGERPRINT of the concatenation of source_id and destination_id is equal to the value of the $mod_remainder$ variable, and where the ds column matches the date_candidates value.
|
||||
|
||||
# check_candidates_exist.sql
|
||||
|
||||
This BigQuery prepares a table of candidates for training a machine learning model. It does the following:
|
||||
|
||||
1. Declares two variables date_start and date_end that are 30 days apart, and date_end is set to the value of $start_time$ parameter (which is a Unix timestamp).
|
||||
2. Creates a table candidates_for_training that is partitioned by ds (date) and populated with data from several other tables in the database. It joins information from tables of user interactions, tweeting, and interaction graph aggregates, filters out negative edge snapshots, calculates some statistics and aggregates them by source_id and destination_id. Then, it ranks each source_id by the number of days and tweets, selects top 2000, and adds date_end as a new column ds.
|
||||
3. Finally, it selects the ds column from candidates_for_training where ds equals date_end.
|
||||
|
||||
Overall, this script prepares a table of 2000 candidate pairs of user interactions with statistics and labels, which can be used to train a machine learning model for recommendation purposes.
|
||||
|
||||
# labeled_candidates.sql
|
||||
|
||||
The BQ does the following:
|
||||
|
||||
1. Defines two variables date_candidates and date_labels as dates based on the $start_time$ parameter.
|
||||
2. Creates a new table twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ with default values.
|
||||
3. Deletes any prior data in the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table for the current date_candidates.
|
||||
4. Joins the twttr-recos-ml-prod.realgraph.candidates_sampled table with the twttr-bq-cassowary-prod.user.interaction_graph_labels_daily table and the twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot table. It assigns a label of 1 for positive interactions and 0 for negative interactions, and selects only the rows where there is no negative interaction.
|
||||
5. Inserts the joined data into the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table.
|
||||
6. Calculates the positive rate by counting the number of positive labels and dividing it by the total number of labels.
|
||||
7. Creates a new table twttr-recos-ml-prod.realgraph.train$table_suffix$ by sampling from the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table, with a downsampling of negative examples to balance the number of positive and negative examples, based on the positive rate calculated in step 6.
|
||||
|
||||
The resulting twttr-recos-ml-prod.realgraph.train$table_suffix$ table is used as a training dataset for a machine learning model.
|
||||
|
||||
# train_model.sql
|
||||
|
||||
This BQ command creates or replaces a machine learning model called twttr-recos-ml-prod.realgraph.prod$table_suffix$. The model is a boosted tree classifier, which is used for binary classification problems.
|
||||
|
||||
The options provided in the command configure the specific settings for the model, such as the number of parallel trees, the maximum number of iterations, and the data split method. The DATA_SPLIT_METHOD parameter is set to CUSTOM, and DATA_SPLIT_COL is set to if_eval, which means the data will be split into training and evaluation sets based on the if_eval column. The IF function is used to assign a boolean value of true or false to if_eval based on the modulo operation performed on source_id.
|
||||
|
||||
The SELECT statement specifies the input data for the model. The columns selected include label (the target variable to be predicted), as well as various features such as num_days, num_tweets, and num_follows that are used to predict the target variable.
|
Binary file not shown.
@ -1,18 +0,0 @@
|
||||
-- get latest partition of candidates with data
|
||||
DECLARE date_candidates DATE;
|
||||
SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.candidates_sampled` AS
|
||||
SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` LIMIT 100;
|
||||
|
||||
-- remove previous output snapshot (if exists) to avoid double-writing
|
||||
DELETE
|
||||
FROM `twttr-recos-ml-prod.realgraph.candidates_sampled`
|
||||
WHERE ds = date_candidates;
|
||||
|
||||
-- sample from candidates table instead of recomputing features
|
||||
INSERT INTO `twttr-recos-ml-prod.realgraph.candidates_sampled`
|
||||
SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training`
|
||||
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(source_id, '_', destination_id))), 100) = $mod_remainder$
|
||||
AND ds = date_candidates;
|
||||
|
Binary file not shown.
@ -1,43 +0,0 @@
|
||||
DECLARE date_start, date_end DATE;
|
||||
SET date_end = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
|
||||
SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY);
|
||||
|
||||
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates_for_training`
|
||||
PARTITION BY ds
|
||||
AS
|
||||
WITH T1 AS (
|
||||
SELECT source_id, destination_id, label, dateHour
|
||||
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
|
||||
LEFT JOIN UNNEST(labels) AS label
|
||||
WHERE DATE(dateHour) BETWEEN date_start AND date_end
|
||||
), T2 AS (
|
||||
SELECT source_id, destination_id, num_tweets
|
||||
FROM `twttr-recos-ml-prod.realgraph.tweeting_follows`
|
||||
), T3 AS (
|
||||
SELECT
|
||||
COALESCE(T1.source_id, T2.source_id) AS source_id,
|
||||
COALESCE(T1.destination_id, T2.destination_id) AS destination_id,
|
||||
COUNT(DISTINCT(T1.dateHour)) AS num_days,
|
||||
MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same
|
||||
COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction,
|
||||
COUNT(DISTINCT(label)) AS label_types,
|
||||
COUNTIF(label="num_follows") AS num_follows,
|
||||
COUNTIF(label="num_favorites") AS num_favorites,
|
||||
COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks,
|
||||
COUNTIF(label="num_profile_views") AS num_profile_views,
|
||||
FROM T1
|
||||
FULL JOIN T2
|
||||
USING (source_id, destination_id)
|
||||
LEFT JOIN `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` N
|
||||
USING (source_id, destination_id)
|
||||
WHERE N.source_id IS NULL AND N.destination_id IS NULL
|
||||
GROUP BY 1,2
|
||||
ORDER BY 3 DESC,4 DESC
|
||||
), T4 AS (
|
||||
SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, *
|
||||
FROM T3
|
||||
) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000;
|
||||
|
||||
SELECT ds FROM `twttr-recos-ml-prod.realgraph.candidates_for_training`
|
||||
WHERE ds = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)))
|
||||
LIMIT 1
|
Binary file not shown.
@ -1,4 +0,0 @@
|
||||
SELECT dateHour FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
|
||||
WHERE dateHour = (SELECT TIMESTAMP_ADD(TIMESTAMP_MILLIS($start_time$), INTERVAL 1 DAY))
|
||||
LIMIT 1
|
||||
|
Binary file not shown.
@ -1,67 +0,0 @@
|
||||
-- date_labels is 1 day after date_candidates (which is the current batch run's start date)
|
||||
DECLARE date_candidates, date_labels DATE;
|
||||
DECLARE positive_rate FLOAT64;
|
||||
SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
|
||||
SET date_labels = DATE_ADD(date_candidates, INTERVAL 1 DAY);
|
||||
|
||||
CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` AS
|
||||
SELECT
|
||||
0 AS source_id,
|
||||
1 AS destination_id,
|
||||
1 AS label,
|
||||
1 AS num_days,
|
||||
1 AS num_tweets,
|
||||
1 AS num_follows,
|
||||
1 AS num_favorites,
|
||||
1 AS num_tweet_clicks,
|
||||
1 AS num_profile_views,
|
||||
1 AS days_since_last_interaction,
|
||||
1 AS label_types,
|
||||
DATE("2023-01-08") AS ds;
|
||||
|
||||
-- delete any prior data to avoid double writing
|
||||
DELETE
|
||||
FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
|
||||
WHERE ds = date_candidates;
|
||||
|
||||
-- join labels with candidates with 1 day attribution delay and insert new segment
|
||||
INSERT INTO `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
|
||||
WITH label_positive AS (
|
||||
SELECT source_id, destination_id
|
||||
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
|
||||
WHERE DATE(dateHour)=date_labels
|
||||
), label_negative AS (
|
||||
SELECT source_id, destination_id
|
||||
FROM `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot`
|
||||
) SELECT
|
||||
F.source_id,
|
||||
F.destination_id,
|
||||
CASE WHEN P.source_id IS NULL THEN 0 ELSE 1 END AS label,
|
||||
num_days,
|
||||
num_tweets,
|
||||
num_follows,
|
||||
num_favorites,
|
||||
num_tweet_clicks,
|
||||
num_profile_views,
|
||||
days_since_last_interaction,
|
||||
label_types,
|
||||
date_candidates AS ds
|
||||
FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` F
|
||||
LEFT JOIN label_positive P USING(source_id, destination_id)
|
||||
LEFT JOIN label_negative N USING(source_id, destination_id)
|
||||
WHERE N.source_id IS NULL AND N.destination_id IS NULL
|
||||
AND F.ds=date_candidates
|
||||
;
|
||||
|
||||
-- get positive rate
|
||||
SET positive_rate =
|
||||
(SELECT SUM(label)/COUNT(label) AS pct_positive
|
||||
FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
|
||||
);
|
||||
|
||||
-- create training dataset with negative downsampling (should get ~50-50 split)
|
||||
-- this spans over the cumulative date range of the labeled candidates table.
|
||||
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.train$table_suffix$` AS
|
||||
SELECT * FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
|
||||
WHERE CASE WHEN label = 0 AND RAND() < positive_rate THEN true WHEN label = 1 AND RAND() < (1-positive_rate) THEN true ELSE false END
|
||||
;
|
Binary file not shown.
@ -1,27 +0,0 @@
|
||||
CREATE OR REPLACE MODEL `twttr-recos-ml-prod.realgraph.prod$table_suffix$`
|
||||
OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
|
||||
BOOSTER_TYPE = 'GBTREE',
|
||||
NUM_PARALLEL_TREE = 1,
|
||||
MAX_ITERATIONS = 20,
|
||||
TREE_METHOD = 'HIST',
|
||||
EARLY_STOP = TRUE,
|
||||
SUBSAMPLE = 0.01,
|
||||
INPUT_LABEL_COLS = ['label'],
|
||||
DATA_SPLIT_METHOD = 'CUSTOM',
|
||||
DATA_SPLIT_COL = 'if_eval')
|
||||
AS SELECT
|
||||
label,
|
||||
source_id,
|
||||
destination_id,
|
||||
num_days,
|
||||
num_tweets,
|
||||
num_follows,
|
||||
num_favorites,
|
||||
num_tweet_clicks,
|
||||
num_profile_views,
|
||||
days_since_last_interaction,
|
||||
label_types,
|
||||
-- partition train/test by source_id's
|
||||
IF(MOD(ABS(FARM_FINGERPRINT(CAST(source_id AS STRING))), 10) = 0, true, false) AS if_eval,
|
||||
FROM `twttr-recos-ml-prod.realgraph.train$table_suffix$`
|
||||
;
|
@ -1,25 +0,0 @@
|
||||
scala_library(
|
||||
name = "user_session_inj",
|
||||
sources = ["UserSessionInjection.scala"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/twitter/bijection:scrooge",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/thrift/com/twitter/user_session_store:thrift-scala",
|
||||
],
|
||||
)
|
||||
|
||||
scala_library(
|
||||
name = "edge_list_injection",
|
||||
sources = ["EdgeListInjection.scala"],
|
||||
platform = "java8",
|
||||
strict_deps = True,
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/twitter/bijection:scrooge",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat/format",
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
BIN
src/scala/com/twitter/interaction_graph/injection/BUILD.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/injection/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,14 +0,0 @@
|
||||
package com.twitter.interaction_graph.injection
|
||||
|
||||
import com.twitter.interaction_graph.thriftscala.EdgeList
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
|
||||
object EdgeListInjection {
|
||||
final val injection: KeyValInjection[Long, EdgeList] =
|
||||
KeyValInjection(
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(EdgeList)
|
||||
)
|
||||
}
|
Binary file not shown.
@ -1,14 +0,0 @@
|
||||
package com.twitter.interaction_graph.injection
|
||||
|
||||
import com.twitter.user_session_store.thriftscala.UserSession
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
|
||||
|
||||
object UserSessionInjection {
|
||||
final val injection: KeyValInjection[Long, UserSession] =
|
||||
KeyValInjection(
|
||||
Long2BigEndian,
|
||||
ScalaCompactThrift(UserSession)
|
||||
)
|
||||
}
|
BIN
src/scala/com/twitter/interaction_graph/scio/README.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/scio/README.docx
Normal file
Binary file not shown.
@ -1,7 +0,0 @@
|
||||
# Interaction Graph
|
||||
|
||||
This folder contains the code used in the offline pipeline for real graph v2.
|
||||
|
||||
The ETL jobs are contained in folders prefaced with `agg_*`, while the jobs powering the ml pipeline are in the ml folder.
|
||||
|
||||
Note that the jobs in the ml folder are mostly ETL jobs; the main training and scoring happens within BQML.
|
@ -1,62 +0,0 @@
|
||||
scala_library(
|
||||
name = "agg_address_book",
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":interaction_graph_agg_address_book_edge_snapshot-scala",
|
||||
":interaction_graph_agg_address_book_vertex_snapshot-scala",
|
||||
"3rdparty/jvm/com/twitter/storehaus:algebra",
|
||||
"addressbook/jobs/src/main/scala/com/twitter/addressbook/jobs/simplematches:simple_user_matches-scala",
|
||||
"beam-internal/src/main/scala/com/twitter/beam/io/dal",
|
||||
"beam-internal/src/main/scala/com/twitter/scio_internal/job",
|
||||
"beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
|
||||
"consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read",
|
||||
"src/scala/com/twitter/interaction_graph/scio/common",
|
||||
],
|
||||
)
|
||||
|
||||
jvm_binary(
|
||||
name = "interaction_graph_address_book_scio",
|
||||
main = "com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAddressBookJob",
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":agg_address_book",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_agg_address_book_edge_snapshot",
|
||||
description = "User-user directed edges with addressbook features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_agg_address_book_vertex_snapshot",
|
||||
description = "User vertex with addressbook features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
@ -1,34 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_address_book
|
||||
|
||||
import com.spotify.scio.ScioMetrics
|
||||
import org.apache.beam.sdk.metrics.Counter
|
||||
|
||||
trait InteractionGraphAddressBookCountersTrait {
|
||||
val Namespace = "Interaction Graph Address Book"
|
||||
|
||||
def emailFeatureInc(): Unit
|
||||
|
||||
def phoneFeatureInc(): Unit
|
||||
|
||||
def bothFeatureInc(): Unit
|
||||
}
|
||||
|
||||
/**
|
||||
* SCIO counters are used to gather run time statistics
|
||||
*/
|
||||
case object InteractionGraphAddressBookCounters extends InteractionGraphAddressBookCountersTrait {
|
||||
val emailFeatureCounter: Counter =
|
||||
ScioMetrics.counter(Namespace, "Email Feature")
|
||||
|
||||
val phoneFeatureCounter: Counter =
|
||||
ScioMetrics.counter(Namespace, "Phone Feature")
|
||||
|
||||
val bothFeatureCounter: Counter =
|
||||
ScioMetrics.counter(Namespace, "Both Feature")
|
||||
|
||||
override def emailFeatureInc(): Unit = emailFeatureCounter.inc()
|
||||
|
||||
override def phoneFeatureInc(): Unit = phoneFeatureCounter.inc()
|
||||
|
||||
override def bothFeatureInc(): Unit = bothFeatureCounter.inc()
|
||||
}
|
Binary file not shown.
@ -1,71 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_address_book
|
||||
|
||||
import com.spotify.scio.ScioContext
|
||||
import com.spotify.scio.values.SCollection
|
||||
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
|
||||
import com.twitter.beam.io.dal.DAL
|
||||
import com.twitter.beam.io.dal.DAL.DiskFormat
|
||||
import com.twitter.beam.io.dal.DAL.PathLayout
|
||||
import com.twitter.beam.io.dal.DAL.WriteOptions
|
||||
import com.twitter.beam.job.ServiceIdentifierOptions
|
||||
import com.twitter.scio_internal.job.ScioBeamJob
|
||||
import com.twitter.statebird.v2.thriftscala.Environment
|
||||
import com.twitter.interaction_graph.thriftscala.Edge
|
||||
import com.twitter.interaction_graph.thriftscala.Vertex
|
||||
import java.time.Instant
|
||||
import org.joda.time.Interval
|
||||
|
||||
object InteractionGraphAddressBookJob extends ScioBeamJob[InteractionGraphAddressBookOption] {
|
||||
override protected def configurePipeline(
|
||||
scioContext: ScioContext,
|
||||
pipelineOptions: InteractionGraphAddressBookOption
|
||||
): Unit = {
|
||||
@transient
|
||||
implicit lazy val sc: ScioContext = scioContext
|
||||
implicit lazy val dateInterval: Interval = pipelineOptions.interval
|
||||
implicit lazy val addressBookCounters: InteractionGraphAddressBookCountersTrait =
|
||||
InteractionGraphAddressBookCounters
|
||||
|
||||
val interactionGraphAddressBookSource = InteractionGraphAddressBookSource(pipelineOptions)
|
||||
|
||||
val addressBook: SCollection[UserMatchesRecord] =
|
||||
interactionGraphAddressBookSource.readSimpleUserMatches(
|
||||
dateInterval.withStart(dateInterval.getStart.minusDays(3))
|
||||
)
|
||||
val (vertex, edges) = InteractionGraphAddressBookUtil.process(addressBook)
|
||||
|
||||
val dalEnvironment: String = pipelineOptions
|
||||
.as(classOf[ServiceIdentifierOptions])
|
||||
.getEnvironment()
|
||||
val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) {
|
||||
pipelineOptions.getDALWriteEnvironment
|
||||
} else {
|
||||
dalEnvironment
|
||||
}
|
||||
|
||||
vertex.saveAsCustomOutput(
|
||||
"Write Vertex Records",
|
||||
DAL.writeSnapshot[Vertex](
|
||||
InteractionGraphAggAddressBookVertexSnapshotScalaDataset,
|
||||
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_vertex_daily"),
|
||||
Instant.ofEpochMilli(dateInterval.getEndMillis),
|
||||
DiskFormat.Parquet,
|
||||
Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption =
|
||||
WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 16.0).ceil.toInt))
|
||||
)
|
||||
)
|
||||
|
||||
edges.saveAsCustomOutput(
|
||||
"Write Edge Records",
|
||||
DAL.writeSnapshot[Edge](
|
||||
InteractionGraphAggAddressBookEdgeSnapshotScalaDataset,
|
||||
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_edge_daily"),
|
||||
Instant.ofEpochMilli(dateInterval.getEndMillis),
|
||||
DiskFormat.Parquet,
|
||||
Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,24 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_address_book
|
||||
|
||||
import com.twitter.beam.io.dal.DALOptions
|
||||
import com.twitter.beam.job.DateRangeOptions
|
||||
import org.apache.beam.sdk.options.Default
|
||||
import org.apache.beam.sdk.options.Description
|
||||
import org.apache.beam.sdk.options.Validation.Required
|
||||
|
||||
trait InteractionGraphAddressBookOption extends DALOptions with DateRangeOptions {
|
||||
@Required
|
||||
@Description("Output path for storing the final dataset")
|
||||
def getOutputPath: String
|
||||
def setOutputPath(value: String): Unit
|
||||
|
||||
@Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
|
||||
@Default.String("PROD")
|
||||
def getDALWriteEnvironment: String
|
||||
def setDALWriteEnvironment(value: String): Unit
|
||||
|
||||
@Description("Number of shards/partitions for saving the final dataset.")
|
||||
@Default.Integer(16)
|
||||
def getNumberOfShards: Integer
|
||||
def setNumberOfShards(value: Integer): Unit
|
||||
}
|
Binary file not shown.
@ -1,28 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_address_book
|
||||
|
||||
import com.spotify.scio.ScioContext
|
||||
import com.spotify.scio.values.SCollection
|
||||
import com.twitter.addressbook.jobs.simplematches.SimpleUserMatchesScalaDataset
|
||||
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
|
||||
import com.twitter.beam.job.ServiceIdentifierOptions
|
||||
import com.twitter.cde.scio.dal_read.SourceUtil
|
||||
import org.joda.time.Interval
|
||||
|
||||
case class InteractionGraphAddressBookSource(
|
||||
pipelineOptions: InteractionGraphAddressBookOption
|
||||
)(
|
||||
implicit sc: ScioContext,
|
||||
) {
|
||||
val dalEnvironment: String = pipelineOptions
|
||||
.as(classOf[ServiceIdentifierOptions])
|
||||
.getEnvironment()
|
||||
|
||||
def readSimpleUserMatches(
|
||||
dateInterval: Interval
|
||||
): SCollection[UserMatchesRecord] = {
|
||||
SourceUtil.readMostRecentSnapshotDALDataset[UserMatchesRecord](
|
||||
SimpleUserMatchesScalaDataset,
|
||||
dateInterval,
|
||||
dalEnvironment)
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,93 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_address_book
|
||||
|
||||
import com.spotify.scio.values.SCollection
|
||||
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
|
||||
import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil
|
||||
import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput
|
||||
import com.twitter.interaction_graph.thriftscala.Edge
|
||||
import com.twitter.interaction_graph.thriftscala.FeatureName
|
||||
import com.twitter.interaction_graph.thriftscala.Vertex
|
||||
|
||||
object InteractionGraphAddressBookUtil {
|
||||
val EMAIL = "email"
|
||||
val PHONE = "phone"
|
||||
val BOTH = "both"
|
||||
|
||||
val DefaultAge = 1
|
||||
val DegaultFeatureValue = 1.0
|
||||
|
||||
def process(
|
||||
addressBook: SCollection[UserMatchesRecord]
|
||||
)(
|
||||
implicit addressBookCounters: InteractionGraphAddressBookCountersTrait
|
||||
): (SCollection[Vertex], SCollection[Edge]) = {
|
||||
// First construct a data with (src, dst, name), where name can be "email", "phone", or "both"
|
||||
val addressBookTypes: SCollection[((Long, Long), String)] = addressBook.flatMap { record =>
|
||||
record.forwardMatches.toSeq.flatMap { matchDetails =>
|
||||
val matchedUsers = (record.userId, matchDetails.userId)
|
||||
(matchDetails.matchedByEmail, matchDetails.matchedByPhone) match {
|
||||
case (true, true) =>
|
||||
Seq((matchedUsers, EMAIL), (matchedUsers, PHONE), (matchedUsers, BOTH))
|
||||
case (true, false) => Seq((matchedUsers, EMAIL))
|
||||
case (false, true) => Seq((matchedUsers, PHONE))
|
||||
case _ => Seq.empty
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Then construct the input data for feature calculation
|
||||
val addressBookFeatureInput: SCollection[InteractionGraphRawInput] = addressBookTypes
|
||||
.map {
|
||||
case ((src, dst), name) =>
|
||||
if (src < dst)
|
||||
((src, dst, name), false)
|
||||
else
|
||||
((dst, src, name), true)
|
||||
}.groupByKey
|
||||
.flatMap {
|
||||
case ((src, dst, name), iterator) =>
|
||||
val isReversedValues = iterator.toSeq
|
||||
// check if (src, dst) is mutual follow
|
||||
val isMutualFollow = isReversedValues.size == 2
|
||||
// get correct srcId and dstId if there is no mutual follow and they are reversed
|
||||
val (srcId, dstId) = {
|
||||
if (!isMutualFollow && isReversedValues.head)
|
||||
(dst, src)
|
||||
else
|
||||
(src, dst)
|
||||
}
|
||||
// get the feature name and mutual follow name
|
||||
val (featureName, mfFeatureName) = name match {
|
||||
case EMAIL =>
|
||||
addressBookCounters.emailFeatureInc()
|
||||
(FeatureName.AddressBookEmail, FeatureName.AddressBookMutualEdgeEmail)
|
||||
case PHONE =>
|
||||
addressBookCounters.phoneFeatureInc()
|
||||
(FeatureName.AddressBookPhone, FeatureName.AddressBookMutualEdgePhone)
|
||||
case BOTH =>
|
||||
addressBookCounters.bothFeatureInc()
|
||||
(FeatureName.AddressBookInBoth, FeatureName.AddressBookMutualEdgeInBoth)
|
||||
}
|
||||
// construct the TypedPipe for feature calculation
|
||||
if (isMutualFollow) {
|
||||
Iterator(
|
||||
InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DegaultFeatureValue),
|
||||
InteractionGraphRawInput(dstId, srcId, featureName, DefaultAge, DegaultFeatureValue),
|
||||
InteractionGraphRawInput(
|
||||
srcId,
|
||||
dstId,
|
||||
mfFeatureName,
|
||||
DefaultAge,
|
||||
DegaultFeatureValue),
|
||||
InteractionGraphRawInput(dstId, srcId, mfFeatureName, DefaultAge, DegaultFeatureValue)
|
||||
)
|
||||
} else {
|
||||
Iterator(
|
||||
InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DegaultFeatureValue))
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate the Features
|
||||
FeatureGeneratorUtil.getFeatures(addressBookFeatureInput)
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,34 +0,0 @@
|
||||
## InteractionGraphAddressBook Dataflow Job
|
||||
|
||||
#### IntelliJ
|
||||
```
|
||||
./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
|
||||
```
|
||||
|
||||
#### Compile
|
||||
```
|
||||
./bazel build src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
|
||||
```
|
||||
|
||||
#### Build Jar
|
||||
```
|
||||
./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
|
||||
```
|
||||
|
||||
#### Run Scheduled Job
|
||||
```
|
||||
export PROJECTID=twttr-recos-ml-prod
|
||||
export REGION=us-central1
|
||||
export JOB_NAME=interaction-graph-address-book-dataflow
|
||||
|
||||
bin/d6w schedule \
|
||||
${PROJECTID}/${REGION}/${JOB_NAME} \
|
||||
src/scala/com/twitter/interaction_graph/scio/agg_address_book/config.d6w \
|
||||
--bind=profile.user_name=cassowary \
|
||||
--bind=profile.project=${PROJECTID} \
|
||||
--bind=profile.region=${REGION} \
|
||||
--bind=profile.job_name=${JOB_NAME} \
|
||||
--bind=profile.environment=prod \
|
||||
--bind=profile.date=2022-04-13 \
|
||||
--bind=profile.output_path=processed/interaction_graph_agg_address_book_dataflow
|
||||
```
|
@ -1,175 +0,0 @@
|
||||
scala_library(
|
||||
name = "agg_all",
|
||||
sources = ["*.scala"],
|
||||
compiler_option_sets = ["fatal_warnings"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":interaction_graph_history_aggregated_raw_edge_daily-scala",
|
||||
":interaction_graph_history_aggregated_vertex_daily-scala",
|
||||
":interaction_graph_aggregated_edge_daily-scala",
|
||||
":interaction_graph_aggregated_vertex_daily-scala",
|
||||
":interaction_graph_history_aggregated_edge_snapshot-scala",
|
||||
":interaction_graph_history_aggregated_vertex_snapshot-scala",
|
||||
":real_graph_features-scala",
|
||||
"beam-internal/src/main/scala/com/twitter/beam/io/dal",
|
||||
"beam-internal/src/main/scala/com/twitter/scio_internal/job",
|
||||
"beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
|
||||
"consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_edge_snapshot-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_vertex_snapshot-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_edge_daily-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_vertex_daily-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_vertex_daily-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_edge_snapshot-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_vertex_snapshot-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/common",
|
||||
"src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_in_scores-scala",
|
||||
"src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_oon_scores-scala",
|
||||
"src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala",
|
||||
"src/thrift/com/twitter/wtf/candidate:wtf-candidate-scala",
|
||||
"tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam",
|
||||
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_history_aggregated_raw_edge_daily",
|
||||
description = "User-user directed edges with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
|
||||
segment_type = "partitioned",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_history_aggregated_vertex_daily",
|
||||
description = "User vertex with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
|
||||
segment_type = "partitioned",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
jvm_binary(
|
||||
name = "interaction_graph_aggregation_job_scio",
|
||||
main = "com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationJob",
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
":agg_all",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_history_aggregated_edge_snapshot",
|
||||
description = "User-user directed edges with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_history_aggregated_vertex_snapshot",
|
||||
description = "User vertex with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_aggregated_edge_daily",
|
||||
description = "User-user directed edges with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
|
||||
segment_type = "partitioned",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "interaction_graph_aggregated_vertex_daily",
|
||||
description = "User vertex with all features",
|
||||
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
|
||||
segment_type = "partitioned",
|
||||
tags = ["bazel-compatible"],
|
||||
java_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
|
||||
],
|
||||
scala_dependencies = [
|
||||
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "real_graph_features",
|
||||
key_type = "Long",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "com.twitter.user_session_store.thriftscala.UserSession",
|
||||
scala_dependencies = [
|
||||
"src/scala/com/twitter/interaction_graph/injection:user_session_inj",
|
||||
],
|
||||
)
|
||||
|
||||
create_datasets(
|
||||
base_name = "home_light_ranker_top_k_real_graph_features",
|
||||
key_type = "Long",
|
||||
platform = "java8",
|
||||
role = "cassowary",
|
||||
scala_schema = "com.twitter.interaction_graph.injection.EdgeListInjection.injection",
|
||||
segment_type = "snapshot",
|
||||
tags = ["bazel-compatible"],
|
||||
val_type = "com.twitter.interaction_graph.thriftscala.EdgeList",
|
||||
scala_dependencies = [
|
||||
"src/scala/com/twitter/interaction_graph/injection:edge_list_injection",
|
||||
],
|
||||
)
|
BIN
src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx
Normal file
BIN
src/scala/com/twitter/interaction_graph/scio/agg_all/BUILD.docx
Normal file
Binary file not shown.
Binary file not shown.
@ -1,14 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_all
|
||||
|
||||
object InteractionGraphScoringConfig {
|
||||
|
||||
/**
|
||||
* This is alpha for a variant of the Exponentially weighted moving average, computed as:
|
||||
* ewma_{t+1} = x_{t+1} + (1-alpha) * ewma_t (ewma_1 = x_1, t > 0)
|
||||
* We choose alpha such that the half life of weights is 7 days.
|
||||
* Note that we don't down-weight x_{t+1} (unlike in EWMA) as we only want to decay actions
|
||||
* as they grow old, not compute the average value.
|
||||
*/
|
||||
val ALPHA = 1.0
|
||||
val ONE_MINUS_ALPHA = 0.955
|
||||
}
|
Binary file not shown.
@ -1,314 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_all
|
||||
|
||||
import com.google.cloud.bigquery.BigQueryOptions
|
||||
import com.google.cloud.bigquery.QueryJobConfiguration
|
||||
import com.spotify.scio.ScioContext
|
||||
import com.spotify.scio.ScioMetrics
|
||||
import com.spotify.scio.values.SCollection
|
||||
import com.twitter.beam.io.dal.DAL
|
||||
import com.twitter.beam.io.dal.DAL.DiskFormat
|
||||
import com.twitter.beam.io.dal.DAL.PathLayout
|
||||
import com.twitter.beam.io.dal.DAL.WriteOptions
|
||||
import com.twitter.beam.io.exception.DataNotFoundException
|
||||
import com.twitter.beam.job.ServiceIdentifierOptions
|
||||
import com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationTransform._
|
||||
import com.twitter.interaction_graph.scio.common.DateUtil
|
||||
import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil
|
||||
import com.twitter.interaction_graph.scio.common.UserUtil
|
||||
import com.twitter.interaction_graph.thriftscala.Edge
|
||||
import com.twitter.interaction_graph.thriftscala.Vertex
|
||||
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
||||
import com.twitter.scio_internal.job.ScioBeamJob
|
||||
import com.twitter.statebird.v2.thriftscala.Environment
|
||||
import com.twitter.user_session_store.thriftscala.UserSession
|
||||
import com.twitter.util.Duration
|
||||
import com.twitter.wtf.candidate.thriftscala.ScoredEdge
|
||||
import java.time.Instant
|
||||
import org.apache.avro.generic.GenericRecord
|
||||
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO
|
||||
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead
|
||||
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord
|
||||
import org.apache.beam.sdk.transforms.SerializableFunction
|
||||
import org.joda.time.Interval
|
||||
import scala.collection.JavaConverters._
|
||||
|
||||
object InteractionGraphAggregationJob extends ScioBeamJob[InteractionGraphAggregationOption] {
|
||||
|
||||
// to parse latest date from the BQ table we're reading from
|
||||
val parseDateRow = new SerializableFunction[SchemaAndRecord, String] {
|
||||
override def apply(input: SchemaAndRecord): String = {
|
||||
val genericRecord: GenericRecord = input.getRecord()
|
||||
genericRecord.get("ds").toString
|
||||
}
|
||||
}
|
||||
|
||||
// note that we're using the prob_explicit for real_graph_features (for Home)
|
||||
val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] {
|
||||
override def apply(record: SchemaAndRecord): ScoredEdge = {
|
||||
val genericRecord: GenericRecord = record.getRecord()
|
||||
ScoredEdge(
|
||||
genericRecord.get("source_id").asInstanceOf[Long],
|
||||
genericRecord.get("destination_id").asInstanceOf[Long],
|
||||
genericRecord.get("prob_explicit").asInstanceOf[Double],
|
||||
genericRecord.get("followed").asInstanceOf[Boolean],
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
override def runPipeline(
|
||||
sc: ScioContext,
|
||||
opts: InteractionGraphAggregationOption
|
||||
): Unit = {
|
||||
|
||||
val dateStr: String = opts.getDate().value.getStart.toString("yyyyMMdd")
|
||||
logger.info(s"dateStr $dateStr")
|
||||
val project: String = "twttr-recos-ml-prod"
|
||||
val datasetName: String = "realgraph"
|
||||
val bqTableName: String = "scores"
|
||||
val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
|
||||
|
||||
if (opts.getDALWriteEnvironment.toLowerCase == "prod") {
|
||||
val bqClient =
|
||||
BigQueryOptions.newBuilder.setProjectId(project).build.getService
|
||||
val query =
|
||||
s"""
|
||||
|SELECT total_rows
|
||||
|FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS`
|
||||
|WHERE partition_id ="$dateStr" AND
|
||||
|table_name="$bqTableName" AND total_rows > 0
|
||||
|""".stripMargin
|
||||
val queryConfig = QueryJobConfiguration.of(query)
|
||||
val results = bqClient.query(queryConfig).getValues.asScala.toSeq
|
||||
if (results.isEmpty || results.head.get(0).getLongValue == 0) {
|
||||
throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.")
|
||||
}
|
||||
}
|
||||
sc.run()
|
||||
}
|
||||
|
||||
override protected def configurePipeline(
|
||||
scioContext: ScioContext,
|
||||
pipelineOptions: InteractionGraphAggregationOption
|
||||
): Unit = {
|
||||
@transient
|
||||
implicit lazy val sc: ScioContext = scioContext
|
||||
implicit lazy val dateInterval: Interval = pipelineOptions.interval
|
||||
val yesterday = DateUtil.subtract(dateInterval, Duration.fromDays(1))
|
||||
|
||||
val dalEnvironment: String = pipelineOptions
|
||||
.as(classOf[ServiceIdentifierOptions])
|
||||
.getEnvironment()
|
||||
val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) {
|
||||
pipelineOptions.getDALWriteEnvironment
|
||||
} else {
|
||||
dalEnvironment
|
||||
}
|
||||
val dateStr: String = pipelineOptions.getDate().value.getStart.toString("yyyy-MM-dd")
|
||||
logger.info(s"dateStr $dateStr")
|
||||
val project: String = "twttr-recos-ml-prod"
|
||||
val datasetName: String = "realgraph"
|
||||
val bqTableName: String = "scores"
|
||||
val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
|
||||
|
||||
val scoreExport: SCollection[ScoredEdge] =
|
||||
sc.customInput(
|
||||
s"Read from BQ table $fullBqTableName",
|
||||
BigQueryIO
|
||||
.read(parseRow)
|
||||
.fromQuery(s"""SELECT source_id, destination_id, prob_explicit, followed
|
||||
|FROM `$project.$datasetName.$bqTableName`
|
||||
|WHERE ds = '$dateStr'""".stripMargin)
|
||||
.usingStandardSql()
|
||||
.withMethod(TypedRead.Method.DEFAULT)
|
||||
)
|
||||
|
||||
val source = InteractionGraphAggregationSource(pipelineOptions)
|
||||
|
||||
val (addressEdgeFeatures, addressVertexFeatures) = source.readAddressBookFeatures()
|
||||
|
||||
val (clientEventLogsEdgeFeatures, clientEventLogsVertexFeatures) =
|
||||
source.readClientEventLogsFeatures(dateInterval)
|
||||
|
||||
val (flockEdgeFeatures, flockVertexFeatures) = source.readFlockFeatures()
|
||||
|
||||
val (directInteractionsEdgeFeatures, directInteractionsVertexFeatures) =
|
||||
source.readDirectInteractionsFeatures(dateInterval)
|
||||
|
||||
val invalidUsers = UserUtil.getInvalidUsers(source.readFlatUsers())
|
||||
|
||||
val (prevAggEdge, prevAggVertex) = source.readAggregatedFeatures(yesterday)
|
||||
|
||||
val prevAggregatedVertex: SCollection[Vertex] =
|
||||
UserUtil
|
||||
.filterUsersByIdMapping[Vertex](
|
||||
prevAggVertex,
|
||||
invalidUsers,
|
||||
v => v.userId
|
||||
)
|
||||
|
||||
/** Remove status-based features (flock/ab) from current graph, because we only need the latest
|
||||
* This is to allow us to filter and roll-up a smaller dataset, to which we will still add
|
||||
* back the status-based features for the complete scoredAggregates (that other teams will read).
|
||||
*/
|
||||
val prevAggEdgeFiltered = prevAggEdge
|
||||
.filter { e =>
|
||||
e.sourceId != e.destinationId
|
||||
}
|
||||
.withName("filtering status-based edges")
|
||||
.flatMap(FeatureGeneratorUtil.removeStatusFeatures)
|
||||
val prevAggEdgeValid: SCollection[Edge] =
|
||||
UserUtil
|
||||
.filterUsersByMultipleIdMappings[Edge](
|
||||
prevAggEdgeFiltered,
|
||||
invalidUsers,
|
||||
Seq(e => e.sourceId, e => e.destinationId)
|
||||
)
|
||||
|
||||
val aggregatedActivityVertexDaily = UserUtil
|
||||
.filterUsersByIdMapping[Vertex](
|
||||
FeatureGeneratorUtil
|
||||
.combineVertexFeatures(
|
||||
clientEventLogsVertexFeatures ++
|
||||
directInteractionsVertexFeatures ++
|
||||
addressVertexFeatures ++
|
||||
flockVertexFeatures
|
||||
),
|
||||
invalidUsers,
|
||||
v => v.userId
|
||||
)
|
||||
|
||||
// we split up the roll-up of decayed counts between status vs activity/count-based features
|
||||
val aggregatedActivityEdgeDaily = FeatureGeneratorUtil
|
||||
.combineEdgeFeatures(clientEventLogsEdgeFeatures ++ directInteractionsEdgeFeatures)
|
||||
|
||||
// Vertex level, Add the decay sum for history and daily
|
||||
val aggregatedActivityVertex = FeatureGeneratorUtil
|
||||
.combineVertexFeaturesWithDecay(
|
||||
prevAggregatedVertex,
|
||||
aggregatedActivityVertexDaily,
|
||||
InteractionGraphScoringConfig.ONE_MINUS_ALPHA,
|
||||
InteractionGraphScoringConfig.ALPHA
|
||||
)
|
||||
|
||||
// Edge level, Add the decay sum for history and daily
|
||||
val aggregatedActivityEdge = FeatureGeneratorUtil
|
||||
.combineEdgeFeaturesWithDecay(
|
||||
prevAggEdgeValid,
|
||||
aggregatedActivityEdgeDaily,
|
||||
InteractionGraphScoringConfig.ONE_MINUS_ALPHA,
|
||||
InteractionGraphScoringConfig.ALPHA
|
||||
)
|
||||
.filter(FeatureGeneratorUtil.edgeWithFeatureOtherThanDwellTime)
|
||||
.withName("removing edges that only have dwell time features")
|
||||
|
||||
val edgeKeyedScores = scoreExport.keyBy { e => (e.sourceId, e.destinationId) }
|
||||
|
||||
val scoredAggregatedActivityEdge = aggregatedActivityEdge
|
||||
.keyBy { e => (e.sourceId, e.destinationId) }
|
||||
.withName("join with scores")
|
||||
.leftOuterJoin(edgeKeyedScores)
|
||||
.map {
|
||||
case (_, (e, scoredEdgeOpt)) =>
|
||||
val scoreOpt = scoredEdgeOpt.map(_.score)
|
||||
e.copy(weight = if (scoreOpt.nonEmpty) {
|
||||
ScioMetrics.counter("after joining edge with scores", "has score").inc()
|
||||
scoreOpt
|
||||
} else {
|
||||
ScioMetrics.counter("after joining edge with scores", "no score").inc()
|
||||
None
|
||||
})
|
||||
}
|
||||
|
||||
val combinedFeatures = FeatureGeneratorUtil
|
||||
.combineEdgeFeatures(aggregatedActivityEdge ++ addressEdgeFeatures ++ flockEdgeFeatures)
|
||||
.keyBy { e => (e.sourceId, e.destinationId) }
|
||||
|
||||
val aggregatedActivityScoredEdge =
|
||||
edgeKeyedScores
|
||||
.withName("join with combined edge features")
|
||||
.leftOuterJoin(combinedFeatures)
|
||||
.map {
|
||||
case (_, (scoredEdge, combinedFeaturesOpt)) =>
|
||||
if (combinedFeaturesOpt.exists(_.features.nonEmpty)) {
|
||||
ScioMetrics.counter("after joining scored edge with features", "has features").inc()
|
||||
Edge(
|
||||
sourceId = scoredEdge.sourceId,
|
||||
destinationId = scoredEdge.destinationId,
|
||||
weight = Some(scoredEdge.score),
|
||||
features = combinedFeaturesOpt.map(_.features).getOrElse(Nil)
|
||||
)
|
||||
} else {
|
||||
ScioMetrics.counter("after joining scored edge with features", "no features").inc()
|
||||
Edge(
|
||||
sourceId = scoredEdge.sourceId,
|
||||
destinationId = scoredEdge.destinationId,
|
||||
weight = Some(scoredEdge.score),
|
||||
features = Nil
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
val realGraphFeatures =
|
||||
getTopKTimelineFeatures(aggregatedActivityScoredEdge, pipelineOptions.getMaxDestinationIds)
|
||||
|
||||
aggregatedActivityVertex.saveAsCustomOutput(
|
||||
"Write History Aggregated Vertex Records",
|
||||
DAL.writeSnapshot[Vertex](
|
||||
dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset,
|
||||
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex"),
|
||||
endDate = Instant.ofEpochMilli(dateInterval.getEndMillis),
|
||||
diskFormat = DiskFormat.Parquet,
|
||||
environmentOverride = Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10))
|
||||
)
|
||||
)
|
||||
|
||||
scoredAggregatedActivityEdge.saveAsCustomOutput(
|
||||
"Write History Aggregated Edge Records",
|
||||
DAL.writeSnapshot[Edge](
|
||||
dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset,
|
||||
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_raw_edge"),
|
||||
endDate = Instant.ofEpochMilli(dateInterval.getEndMillis),
|
||||
diskFormat = DiskFormat.Parquet,
|
||||
environmentOverride = Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
|
||||
)
|
||||
)
|
||||
|
||||
aggregatedActivityVertexDaily.saveAsCustomOutput(
|
||||
"Write Daily Aggregated Vertex Records",
|
||||
DAL.write[Vertex](
|
||||
dataset = InteractionGraphAggregatedVertexDailyScalaDataset,
|
||||
pathLayout =
|
||||
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex_daily"),
|
||||
interval = dateInterval,
|
||||
diskFormat = DiskFormat.Parquet,
|
||||
environmentOverride = Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10))
|
||||
)
|
||||
)
|
||||
|
||||
aggregatedActivityEdgeDaily.saveAsCustomOutput(
|
||||
"Write Daily Aggregated Edge Records",
|
||||
DAL.write[Edge](
|
||||
dataset = InteractionGraphAggregatedEdgeDailyScalaDataset,
|
||||
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_edge_daily"),
|
||||
interval = dateInterval,
|
||||
diskFormat = DiskFormat.Parquet,
|
||||
environmentOverride = Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
|
||||
)
|
||||
)
|
||||
|
||||
realGraphFeatures.saveAsCustomOutput(
|
||||
"Write Timeline Real Graph Features",
|
||||
DAL.writeVersionedKeyVal[KeyVal[Long, UserSession]](
|
||||
dataset = RealGraphFeaturesScalaDataset,
|
||||
pathLayout =
|
||||
PathLayout.VersionedPath(pipelineOptions.getOutputPath + "/real_graph_features"),
|
||||
environmentOverride = Environment.valueOf(dalWriteEnvironment),
|
||||
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
|
||||
)
|
||||
)
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,36 +0,0 @@
|
||||
package com.twitter.interaction_graph.scio.agg_all
|
||||
|
||||
import com.twitter.beam.io.dal.DALOptions
|
||||
import com.twitter.beam.job.DateRangeOptions
|
||||
import org.apache.beam.sdk.options.Default
|
||||
import org.apache.beam.sdk.options.Description
|
||||
import org.apache.beam.sdk.options.Validation.Required
|
||||
|
||||
trait InteractionGraphAggregationOption extends DALOptions with DateRangeOptions {
|
||||
@Required
|
||||
@Description("Output path for storing the final dataset")
|
||||
def getOutputPath: String
|
||||
def setOutputPath(value: String): Unit
|
||||
|
||||
@Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
|
||||
@Default.String("PROD")
|
||||
def getDALWriteEnvironment: String
|
||||
def setDALWriteEnvironment(value: String): Unit
|
||||
|
||||
@Description("Number of shards/partitions for saving the final dataset.")
|
||||
@Default.Integer(16)
|
||||
def getNumberOfShards: Integer
|
||||
def setNumberOfShards(value: Integer): Unit
|
||||
|
||||
@Description("BQ Table name for reading scores from")
|
||||
def getBqTableName: String
|
||||
def setBqTableName(value: String): Unit
|
||||
|
||||
@Description("max destination ids that we will store for real graph features in TL")
|
||||
def getMaxDestinationIds: Integer
|
||||
def setMaxDestinationIds(value: Integer): Unit
|
||||
|
||||
@Description("true if getting scores from BQ instead of DAL-based dataset in GCS")
|
||||
def getScoresFromBQ: Boolean
|
||||
def setScoresFromBQ(value: Boolean): Unit
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
x
Reference in New Issue
Block a user