[docx] split commit for file 4800

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Ari Archer 2024-01-23 19:17:04 +02:00
parent 470dc00686
commit c4b4b821a3
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 15466 deletions


@ -1,18 +0,0 @@
python3_library(
name = "libs_py3",
sources = ["*.py"],
dependencies = [
"src/python/twitter/deepbird/io",
"twml:twml-nodeps",
],
)
python37_binary(
name = "score",
source = "score.py",
dependencies = [
":libs_py3",
"3rdparty/python/_closures/src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird/lolly:score",
"twml",
],
)


@ -1,23 +0,0 @@
# checkstyle: noqa
import tensorflow.compat.v1 as tf
from ..constants import EB_SCORE_IDX
# The rationale behind this logic is available at TQ-9678.
def get_lolly_logits(labels):
'''
:param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
:return: tf.Tensor of shape (batch size) with the extracted lolly logits.
'''
eb_lolly_scores = get_lolly_scores(labels)
inverse_eb_lolly_scores = tf.math.subtract(1.0, eb_lolly_scores)
lolly_activations = tf.math.subtract(tf.math.log(eb_lolly_scores), tf.math.log(inverse_eb_lolly_scores))
return lolly_activations
def get_lolly_scores(labels):
'''
:param labels: tf.Tensor of shape (batch size, num labels) with labels as specified by the feature config.
:return: tf.Tensor of shape (batch size) with the extracted lolly scores.
'''
logged_eb_lolly_scores = tf.reshape(labels[:, EB_SCORE_IDX], (-1, 1))
eb_lolly_scores = tf.truediv(logged_eb_lolly_scores, 100.0)
return eb_lolly_scores
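For intuition, here is a minimal standalone sketch of the same transformation on plain Python floats (illustrative only; the functions above operate on batched tf.Tensor values):

import math

def lolly_score_to_logit(logged_eb_lolly_score):
    # Logged Earlybird lolly scores are on a 0-100 scale, hence the division by 100 above.
    eb_lolly_score = logged_eb_lolly_score / 100.0
    # logit(p) = log(p) - log(1 - p), matching get_lolly_logits.
    return math.log(eb_lolly_score) - math.log(1.0 - eb_lolly_score)

print(lolly_score_to_logit(75.0))  # log(0.75 / 0.25) ~= 1.0986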


@ -1,145 +0,0 @@
import re
from twitter.deepbird.io.util import _get_feature_id
class Parser(object):
def parse(self, line):
match = re.search(self.pattern(), line)
if match:
return self._parse_match(match)
return None
def pattern(self):
raise NotImplementedError
def _parse_match(self, match):
raise NotImplementedError
class BiasParser(Parser):
'''
Parses the bias feature available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement bias -0.935945
:return: a RegEx that extracts feature weight.
'''
return r"\t(bias)\t([^\s]+)"
def _parse_match(self, match):
return float(match.group(2))
class BinaryFeatureParser(Parser):
'''
Parses binary features available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement encoded_tweet_features.is_user_spam_flag -0.181130
:return: a RegEx that extracts feature name and weight.
'''
return r"\t([\w\.]+)\t([^\s]+)"
def _parse_match(self, match):
return (match.group(1), float(match.group(2)))
class DiscretizedFeatureParser(Parser):
'''
Parses discretized features available in lolly model tsv files.
'''
def pattern(self):
'''
Matches lines like:
unified_engagement encoded_tweet_features.user_reputation.dz/dz_model=mdl/dz_range=1.000000e+00_2.000000e+00 0.031004
:return: a RegEx that extracts feature name, bin boundaries and weight.
'''
return r"([\w\.]+)\.dz\/dz_model=mdl\/dz_range=([^\s]+)\t([^\s]+)"
def _parse_match(self, match):
left_bin_side, right_bin_side = [float(number) for number in match.group(2).split("_")]
return (
match.group(1),
left_bin_side,
right_bin_side,
float(match.group(3))
)
class LollyModelFeaturesParser(Parser):
def __init__(self, bias_parser=BiasParser(), binary_feature_parser=BinaryFeatureParser(), discretized_feature_parser=DiscretizedFeatureParser()):
self._bias_parser = bias_parser
self._binary_feature_parser = binary_feature_parser
self._discretized_feature_parser = discretized_feature_parser
def parse(self, lolly_model_reader):
parsed_features = {
"bias": None,
"binary": {},
"discretized": {}
}
def process_line_fn(line):
bias_parser_result = self._bias_parser.parse(line)
if bias_parser_result:
parsed_features["bias"] = bias_parser_result
return
binary_feature_parser_result = self._binary_feature_parser.parse(line)
if binary_feature_parser_result:
name, value = binary_feature_parser_result
parsed_features["binary"][name] = value
return
discretized_feature_parser_result = self._discretized_feature_parser.parse(line)
if discretized_feature_parser_result:
name, left_bin, right_bin, weight = discretized_feature_parser_result
discretized_features = parsed_features["discretized"]
if name not in discretized_features:
discretized_features[name] = []
discretized_features[name].append((left_bin, right_bin, weight))
lolly_model_reader.read(process_line_fn)
return parsed_features
class DBv2DataExampleParser(Parser):
'''
Parses data records printed by the DBv2 train.py build_graph function.
Format: [[dbv2 logit]][[logged lolly logit]][[space separated feature ids]][[space separated feature values]]
'''
def __init__(self, lolly_model_reader, lolly_model_features_parser=LollyModelFeaturesParser()):
self.features = lolly_model_features_parser.parse(lolly_model_reader)
self.feature_name_by_dbv2_id = {}
for feature_name in list(self.features["binary"].keys()) + list(self.features["discretized"].keys()):
self.feature_name_by_dbv2_id[str(_get_feature_id(feature_name))] = feature_name
def pattern(self):
'''
:return: a RegEx that extracts dbv2 logit, logged lolly logit, feature ids and feature values.
'''
return r"\[\[([\w\.\-]+)\]\]\[\[([\w\.\-]+)\]\]\[\[([\w\.\- ]+)\]\]\[\[([\w\. ]+)\]\]"
def _parse_match(self, match):
feature_ids = match.group(3).split(" ")
feature_values = match.group(4).split(" ")
value_by_feature_name = {}
for index in range(len(feature_ids)):
feature_id = feature_ids[index]
if feature_id not in self.feature_name_by_dbv2_id:
print("Missing feature with id: " + str(feature_id))
continue
value_by_feature_name[self.feature_name_by_dbv2_id[feature_id]] = float(feature_values[index])
return value_by_feature_name


@ -1,8 +0,0 @@
class LollyModelReader(object):
def __init__(self, lolly_model_file_path):
self._lolly_model_file_path = lolly_model_file_path
def read(self, process_line_fn):
with open(self._lolly_model_file_path, "r") as file:
for line in file:
process_line_fn(line)


@ -1,13 +0,0 @@
import sys
from .parsers import DBv2DataExampleParser
from .reader import LollyModelReader
from .scorer import LollyModelScorer
if __name__ == "__main__":
lolly_model_reader = LollyModelReader(lolly_model_file_path=sys.argv[1])
lolly_model_scorer = LollyModelScorer(data_example_parser=DBv2DataExampleParser(lolly_model_reader))
score = lolly_model_scorer.score(data_example=sys.argv[2])
print(score)


@ -1,37 +0,0 @@
class LollyModelScorer(object):
def __init__(self, data_example_parser):
self._data_example_parser = data_example_parser
def score(self, data_example):
value_by_feature_name = self._data_example_parser.parse(data_example)
features = self._data_example_parser.features
return self._score(value_by_feature_name, features)
def _score(self, value_by_feature_name, features):
score = features["bias"]
score += self._score_binary_features(features["binary"], value_by_feature_name)
score += self._score_discretized_features(features["discretized"], value_by_feature_name)
return score
def _score_binary_features(self, binary_features, value_by_feature_name):
score = 0.0
for binary_feature_name, binary_feature_weight in binary_features.items():
if binary_feature_name in value_by_feature_name:
score += binary_feature_weight
return score
def _score_discretized_features(self, discretized_features, value_by_feature_name):
score = 0.0
for discretized_feature_name, buckets in discretized_features.items():
if discretized_feature_name in value_by_feature_name:
feature_value = value_by_feature_name[discretized_feature_name]
score += self._find_matching_bucket_weight(buckets, feature_value)
return score
def _find_matching_bucket_weight(self, buckets, feature_value):
for left_side, right_side, weight in buckets:
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
if feature_value >= left_side and feature_value < right_side:
return weight
raise LookupError("Couldn't find a matching bucket for the given feature value.")
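To make the additive scoring above concrete, here is a toy, hand-built illustration; the feature names come from the parser docstrings earlier, the weights and values are invented, and the dictionary layout mirrors what LollyModelFeaturesParser produces:

features = {
    "bias": -0.9,
    "binary": {"encoded_tweet_features.is_user_spam_flag": -0.18},
    "discretized": {
        # (left_bin, right_bin, weight) triples; bucket membership is [a, b)
        "encoded_tweet_features.user_reputation": [(0.0, 1.0, 0.01), (1.0, 2.0, 0.03)],
    },
}
value_by_feature_name = {
    "encoded_tweet_features.is_user_spam_flag": 1.0,
    "encoded_tweet_features.user_reputation": 1.5,
}
score = features["bias"]
for name, weight in features["binary"].items():
    if name in value_by_feature_name:
        score += weight
for name, buckets in features["discretized"].items():
    if name in value_by_feature_name:
        value = value_by_feature_name[name]
        score += next(w for left, right, w in buckets if left <= value < right)
print(score)  # -0.9 - 0.18 + 0.03 = -1.05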


@ -1,91 +0,0 @@
from .parsers import LollyModelFeaturesParser
class TFModelInitializerBuilder:
def __init__(self, model_features_parser=LollyModelFeaturesParser()):
self._model_features_parser = model_features_parser
def build(self, lolly_model_reader):
'''
:param lolly_model_reader: LollyModelReader instance
:return: tf_model_initializer dictionary of the following format:
{
"features": {
"bias": 0.0,
"binary": {
# (feature name : feature weight) pairs
"feature_name_1": 0.0,
...
"feature_nameN": 0.0
},
"discretized": {
# (feature name : index-aligned lists of bin_boundaries and weights)
"feature_name_1": {
"bin_boundaries": [1, ..., inf],
"weights": [0.0, ..., 0.0]
}
...
"feature_name_K": {
"bin_boundaries": [1, ..., inf],
"weights": [0.0, ..., 0.0]
}
}
}
}
'''
tf_model_initializer = {
"features": {}
}
features = self._model_features_parser.parse(lolly_model_reader)
tf_model_initializer["features"]["bias"] = features["bias"]
self._set_discretized_features(features["discretized"], tf_model_initializer)
self._dedup_binary_features(features["binary"], features["discretized"])
tf_model_initializer["features"]["binary"] = features["binary"]
return tf_model_initializer
def _set_discretized_features(self, discretized_features, tf_model_initializer):
if len(discretized_features) == 0:
return
num_bins = max([len(bins) for bins in discretized_features.values()])
bin_boundaries_and_weights = {}
for feature_name in discretized_features:
bin_boundaries_and_weights[feature_name] = self._extract_bin_boundaries_and_weights(
discretized_features[feature_name], num_bins)
tf_model_initializer["features"]["discretized"] = bin_boundaries_and_weights
def _dedup_binary_features(self, binary_features, discretized_features):
[binary_features.pop(feature_name) for feature_name in discretized_features]
def _extract_bin_boundaries_and_weights(self, discretized_feature_buckets, num_bins):
bin_boundary_weight_pairs = []
for bucket in discretized_feature_buckets:
bin_boundary_weight_pairs.append([bucket[0], bucket[2]])
# The default DBv2 HashingDiscretizer bin membership interval is (a, b]
#
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
#
# Thus, convert (a, b] to [a, b) by inverting the bin boundaries.
for bin_boundary_weight_pair in bin_boundary_weight_pairs:
if bin_boundary_weight_pair[0] < float("inf"):
bin_boundary_weight_pair[0] *= -1
while len(bin_boundary_weight_pairs) < num_bins:
bin_boundary_weight_pairs.append([float("inf"), float(0)])
bin_boundary_weight_pairs.sort(key=lambda bin_boundary_weight_pair: bin_boundary_weight_pair[0])
bin_boundaries, weights = list(zip(*bin_boundary_weight_pairs))
return {
"bin_boundaries": bin_boundaries,
"weights": weights
}
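As a tiny, self-contained check of the interval algebra behind that comment (the concrete numbers are invented): negating a value and its bucket boundaries turns the left-inclusive Lolly convention into the right-inclusive DBv2 one.

# Lolly convention: value v belongs to the bucket [a, b), i.e. a <= v < b.
a, b, v = 1.0, 2.0, 1.5
assert a <= v < b
# Negating both the boundaries and the value maps [a, b) onto (-b, -a],
# which is the right-inclusive (a, b] convention the DBv2 discretizer uses.
assert -b < -v <= -a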


@ -1,120 +0,0 @@
# checkstyle: noqa
import tensorflow.compat.v1 as tf
from collections import OrderedDict
from .constants import EB_SCORE_IDX
from .lolly.data_helpers import get_lolly_scores
import twml
def get_multi_binary_class_metric_fn(metrics, classes=None, class_dim=1):
"""
This function was copied from twml/metrics.py with the following adjustments:
- Override example weights with the ones set in graph_output.
- Tile labels in order to support per engagement metrics for both TF and Lolly scores.
- Add lolly_tf_score_MSE metric.
Note: All custom lines have a comment that starts with 'Added'
"""
# pylint: disable=invalid-name,dict-keys-not-iterating
if metrics is None:
# remove expensive metrics by default for faster eval
metrics = list(twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.keys())
metrics.remove('pr_curve')
def get_eval_metric_ops(graph_output, labels, weights):
"""
graph_output:
dict that is returned by build_graph given input features.
labels:
target labels associated to batch.
weights:
weights of the samples.
"""
# Added to support the example weights overriding.
weights = graph_output["weights"]
# Added to support per engagement metrics for both TF and Lolly scores.
labels = tf.tile(labels, [1, 2])
eval_metric_ops = OrderedDict()
preds = graph_output['output']
threshold = graph_output['threshold'] if 'threshold' in graph_output else 0.5
hard_preds = graph_output.get('hard_output')
if not hard_preds:
hard_preds = tf.greater_equal(preds, threshold)
shape = labels.get_shape()
# basic sanity check: multi_metric dimension must exist
assert len(shape) > class_dim, "Dimension specified by class_dim does not exist."
num_labels = shape[class_dim]
# If we are doing multi-class / multi-label metric, the number of classes / labels must
# be known at graph construction time. This dimension cannot have size None.
assert num_labels is not None, "The multi-metric dimension cannot be None."
assert classes is None or len(classes) == num_labels, (
"Number of classes must match the number of labels")
weights_shape = weights.get_shape() if weights is not None else None
if weights_shape is None:
num_weights = None
elif len(weights_shape) > 1:
num_weights = weights_shape[class_dim]
else:
num_weights = 1
for i in range(num_labels):
# add metrics to eval_metric_ops dict
for metric_name in metrics:
metric_name = metric_name.lower() # metric names are case-insensitive.
class_metric_name = metric_name + "_" + (classes[i] if classes is not None else str(i))
if class_metric_name in eval_metric_ops:
# avoid adding duplicate metrics.
continue
class_labels = tf.gather(labels, indices=[i], axis=class_dim)
class_preds = tf.gather(preds, indices=[i], axis=class_dim)
class_hard_preds = tf.gather(hard_preds, indices=[i], axis=class_dim)
if num_weights is None:
class_weights = None
elif num_weights == num_labels:
class_weights = tf.gather(weights, indices=[i], axis=class_dim)
elif num_weights == 1:
class_weights = weights
else:
raise ValueError("num_weights (%d) and num_labels (%d) do not match"
% (num_weights, num_labels))
metric_factory, requires_threshold = twml.metrics.SUPPORTED_BINARY_CLASS_METRICS.get(metric_name)
if metric_factory:
value_op, update_op = metric_factory(
labels=class_labels,
predictions=(class_hard_preds if requires_threshold else class_preds),
weights=class_weights, name=class_metric_name)
eval_metric_ops[class_metric_name] = (value_op, update_op)
else:
raise ValueError('Cannot find the metric named ' + metric_name)
# Added to compare TF and Lolly scores.
eval_metric_ops["lolly_tf_score_MSE"] = get_mse(graph_output["output"], labels)
return eval_metric_ops
return get_eval_metric_ops
def get_mse(predictions, labels):
lolly_scores = get_lolly_scores(labels)
tf_scores = predictions[:, EB_SCORE_IDX]
squared_lolly_tf_score_diff = tf.square(tf.subtract(tf_scores, lolly_scores))
value_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="value_op")
update_op = tf.reduce_mean(squared_lolly_tf_score_diff, name="update_op")
return value_op, update_op


@ -1,8 +0,0 @@
python3_library(
name = "libs_py3",
sources = ["*.py"],
dependencies = [
"src/python/twitter/deepbird/io",
"twml:twml-nodeps",
],
)


@ -1,62 +0,0 @@
from .hashing_utils import make_feature_id
from twml.contrib.layers.hashing_discretizer import HashingDiscretizer
import numpy as np
class TFModelDiscretizerBuilder(object):
def __init__(self, num_bits):
self.num_bits = num_bits
def build(self, tf_model_initializer):
'''
:param tf_model_initializer: dictionary of the following format:
{
"features": {
"bias": 0.0,
"binary": {
# (feature name : feature weight) pairs
"feature_name_1": 0.0,
...
"feature_nameN": 0.0
},
"discretized": {
# (feature name : index-aligned lists of bin_boundaries and weights)
"feature_name_1": {
"bin_boundaries": [1, ..., inf],
"weights": [0.0, ..., 0.0]
}
...
"feature_name_K": {
"bin_boundaries": [1, ..., inf],
"weights": [0.0, ..., 0.0]
}
}
}
}
:return: a HashingDiscretizer instance.
'''
discretized_features = tf_model_initializer["features"]["discretized"]
max_bins = 0
feature_ids = []
bin_vals = []
for feature_name in discretized_features:
bin_boundaries = discretized_features[feature_name]["bin_boundaries"]
feature_id = make_feature_id(feature_name, self.num_bits)
feature_ids.append(feature_id)
np_bin_boundaries = [np.float(bin_boundary) for bin_boundary in bin_boundaries]
bin_vals.append(np_bin_boundaries)
max_bins = max(max_bins, len(np_bin_boundaries))
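# In this pipeline, TFModelInitializerBuilder pads every discretized feature to the same
# number of bin boundaries, so the flatten() below produces len(feature_ids) * max_bins values.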
feature_ids_np = np.array(feature_ids)
bin_vals_np = np.array(bin_vals).flatten()
return HashingDiscretizer(
feature_ids=feature_ids_np,
bin_vals=bin_vals_np,
n_bin=max_bins,
out_bits=self.num_bits
)


@ -1,29 +0,0 @@
from twitter.deepbird.io.util import _get_feature_id
import numpy as np
def numpy_hashing_uniform(the_id, bin_idx, output_bits):
"""
integer_multiplicative_hashing
This is a reimplementation, for testing purposes, of the
C++ version found in hashing_discretizer_impl.cpp
"""
hashing_constant = 2654435761
N = 32
with np.errstate(over='ignore'):
the_id *= hashing_constant
the_id += bin_idx
the_id *= hashing_constant
the_id >>= N - output_bits
the_id &= (1 << output_bits) - 1
return the_id
def make_feature_id(name, num_bits):
feature_id = _get_feature_id(name)
return np.int64(limit_bits(feature_id, num_bits))
def limit_bits(value, num_bits):
return value & ((2 ** num_bits) - 1)


@ -1,34 +0,0 @@
from .hashing_utils import make_feature_id, numpy_hashing_uniform
import numpy as np
import tensorflow.compat.v1 as tf
import twml
class TFModelWeightsInitializerBuilder(object):
def __init__(self, num_bits):
self.num_bits = num_bits
def build(self, tf_model_initializer):
'''
:return: (bias_initializer, weight_initializer)
'''
initial_weights = np.zeros((2 ** self.num_bits, 1))
features = tf_model_initializer["features"]
self._set_binary_feature_weights(initial_weights, features["binary"])
self._set_discretized_feature_weights(initial_weights, features["discretized"])
return tf.constant_initializer(features["bias"]), twml.contrib.initializers.PartitionConstant(initial_weights)
def _set_binary_feature_weights(self, initial_weights, binary_features):
for feature_name, weight in binary_features.items():
feature_id = make_feature_id(feature_name, self.num_bits)
initial_weights[feature_id][0] = weight
def _set_discretized_feature_weights(self, initial_weights, discretized_features):
for feature_name, discretized_feature in discretized_features.items():
feature_id = make_feature_id(feature_name, self.num_bits)
for bin_idx, weight in enumerate(discretized_feature["weights"]):
final_bucket_id = numpy_hashing_uniform(feature_id, bin_idx, self.num_bits)
initial_weights[final_bucket_id][0] = weight


@ -1,212 +0,0 @@
# checkstyle: noqa
import tensorflow.compat.v1 as tf
from tensorflow.python.estimator.export.export import build_raw_serving_input_receiver_fn
from tensorflow.python.framework import dtypes
from tensorflow.python.ops import array_ops
import tensorflow_hub as hub
from datetime import datetime
from tensorflow.compat.v1 import logging
from twitter.deepbird.projects.timelines.configs import all_configs
from twml.trainers import DataRecordTrainer
from twml.contrib.calibrators.common_calibrators import build_percentile_discretizer_graph
from twml.contrib.calibrators.common_calibrators import calibrate_discretizer_and_export
from .metrics import get_multi_binary_class_metric_fn
from .constants import TARGET_LABEL_IDX, PREDICTED_CLASSES
from .example_weights import add_weight_arguments, make_weights_tensor
from .lolly.data_helpers import get_lolly_logits
from .lolly.tf_model_initializer_builder import TFModelInitializerBuilder
from .lolly.reader import LollyModelReader
from .tf_model.discretizer_builder import TFModelDiscretizerBuilder
from .tf_model.weights_initializer_builder import TFModelWeightsInitializerBuilder
import twml
def get_feature_values(features_values, params):
if params.lolly_model_tsv:
# The default DBv2 HashingDiscretizer bin membership interval is (a, b]
#
# The Earlybird Lolly prediction engine discretizer bin membership interval is [a, b)
#
# TFModelInitializerBuilder converts (a, b] to [a, b) by inverting the bin boundaries.
#
# Thus, invert the feature values, so that HashingDiscretizer can find the correct bucket.
return tf.multiply(features_values, -1.0)
else:
return features_values
def build_graph(features, label, mode, params, config=None):
weights = None
if "weights" in features:
weights = make_weights_tensor(features["weights"], label, params)
num_bits = params.input_size_bits
if mode == "infer":
indices = twml.limit_bits(features["input_sparse_tensor_indices"], num_bits)
dense_shape = tf.stack([features["input_sparse_tensor_shape"][0], 1 << num_bits])
sparse_tf = tf.SparseTensor(
indices=indices,
values=get_feature_values(features["input_sparse_tensor_values"], params),
dense_shape=dense_shape
)
else:
features["values"] = get_feature_values(features["values"], params)
sparse_tf = twml.util.convert_to_sparse(features, num_bits)
if params.lolly_model_tsv:
tf_model_initializer = TFModelInitializerBuilder().build(LollyModelReader(params.lolly_model_tsv))
bias_initializer, weight_initializer = TFModelWeightsInitializerBuilder(num_bits).build(tf_model_initializer)
discretizer = TFModelDiscretizerBuilder(num_bits).build(tf_model_initializer)
else:
discretizer = hub.Module(params.discretizer_save_dir)
bias_initializer, weight_initializer = None, None
input_sparse = discretizer(sparse_tf, signature="hashing_discretizer_calibrator")
logits = twml.layers.full_sparse(
inputs=input_sparse,
output_size=1,
bias_initializer=bias_initializer,
weight_initializer=weight_initializer,
use_sparse_grads=(mode == "train"),
use_binary_values=True,
name="full_sparse_1"
)
loss = None
if mode != "infer":
lolly_activations = get_lolly_logits(label)
if opt.print_data_examples:
logits = print_data_example(logits, lolly_activations, features)
if params.replicate_lolly:
loss = tf.reduce_mean(tf.math.squared_difference(logits, lolly_activations))
else:
batch_size = tf.shape(label)[0]
target_label = tf.reshape(tensor=label[:, TARGET_LABEL_IDX], shape=(batch_size, 1))
loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=target_label, logits=logits)
loss = twml.util.weighted_average(loss, weights)
num_labels = tf.shape(label)[1]
eb_scores = tf.tile(lolly_activations, [1, num_labels])
logits = tf.tile(logits, [1, num_labels])
logits = tf.concat([logits, eb_scores], axis=1)
output = tf.nn.sigmoid(logits)
return {"output": output, "loss": loss, "weights": weights}
def print_data_example(logits, lolly_activations, features):
return tf.Print(
logits,
[logits, lolly_activations, tf.reshape(features['keys'], (1, -1)), tf.reshape(tf.multiply(features['values'], -1.0), (1, -1))],
message="DATA EXAMPLE = ",
summarize=10000
)
def earlybird_output_fn(graph_output):
export_outputs = {
tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
tf.estimator.export.PredictOutput(
{"prediction": tf.identity(graph_output["output"], name="output_scores")}
)
}
return export_outputs
if __name__ == "__main__":
parser = DataRecordTrainer.add_parser_arguments()
parser = twml.contrib.calibrators.add_discretizer_arguments(parser)
parser.add_argument("--label", type=str, help="label for the engagement")
parser.add_argument("--model.use_existing_discretizer", action="store_true",
dest="model_use_existing_discretizer",
help="Load a pre-trained calibration or train a new one")
parser.add_argument("--input_size_bits", type=int)
parser.add_argument("--export_module_name", type=str, default="base_mlp", dest="export_module_name")
parser.add_argument("--feature_config", type=str)
parser.add_argument("--replicate_lolly", type=bool, default=False, dest="replicate_lolly",
help="Train a regression model with MSE loss and the logged Earlybird score as a label")
parser.add_argument("--lolly_model_tsv", type=str, required=False, dest="lolly_model_tsv",
help="Initialize with weights and discretizer bins available in the given Lolly model tsv file"
"No discretizer gets trained or loaded if set.")
parser.add_argument("--print_data_examples", type=bool, default=False, dest="print_data_examples",
help="Prints 'DATA EXAMPLE = [[tf logit]][[logged lolly logit]][[feature ids][feature values]]'")
add_weight_arguments(parser)
opt = parser.parse_args()
feature_config_module = all_configs.select_feature_config(opt.feature_config)
feature_config = feature_config_module.get_feature_config(data_spec_path=opt.data_spec, label=opt.label)
parse_fn = twml.parsers.get_sparse_parse_fn(
feature_config,
keep_fields=("ids", "keys", "values", "batch_size", "total_size", "codes"))
if not opt.lolly_model_tsv:
if opt.model_use_existing_discretizer:
logging.info("Skipping discretizer calibration [model.use_existing_discretizer=True]")
logging.info(f"Using calibration at {opt.discretizer_save_dir}")
else:
logging.info("Calibrating new discretizer [model.use_existing_discretizer=False]")
calibrator = twml.contrib.calibrators.HashingDiscretizerCalibrator(
opt.discretizer_num_bins,
opt.discretizer_output_size_bits
)
calibrate_discretizer_and_export(name="recap_earlybird_hashing_discretizer",
params=opt,
calibrator=calibrator,
build_graph_fn=build_percentile_discretizer_graph,
feature_config=feature_config)
trainer = DataRecordTrainer(
name="earlybird",
params=opt,
build_graph_fn=build_graph,
save_dir=opt.save_dir,
feature_config=feature_config,
metric_fn=get_multi_binary_class_metric_fn(
metrics=["roc_auc"],
classes=PREDICTED_CLASSES
),
warm_start_from=None
)
train_input_fn = trainer.get_train_input_fn(parse_fn=parse_fn)
eval_input_fn = trainer.get_eval_input_fn(parse_fn=parse_fn)
logging.info("Training and Evaluation ...")
trainingStartTime = datetime.now()
trainer.train_and_evaluate(train_input_fn=train_input_fn, eval_input_fn=eval_input_fn)
trainingEndTime = datetime.now()
logging.info("Training and Evaluation time: " + str(trainingEndTime - trainingStartTime))
if trainer._estimator.config.is_chief:
serving_input_in_earlybird = {
"input_sparse_tensor_indices": array_ops.placeholder(
name="input_sparse_tensor_indices",
shape=[None, 2],
dtype=dtypes.int64),
"input_sparse_tensor_values": array_ops.placeholder(
name="input_sparse_tensor_values",
shape=[None],
dtype=dtypes.float32),
"input_sparse_tensor_shape": array_ops.placeholder(
name="input_sparse_tensor_shape",
shape=[2],
dtype=dtypes.int64)
}
serving_input_receiver_fn = build_raw_serving_input_receiver_fn(serving_input_in_earlybird)
twml.contrib.export.export_fn.export_all_models(
trainer=trainer,
export_dir=opt.export_dir,
parse_fn=parse_fn,
serving_input_receiver_fn=serving_input_receiver_fn,
export_output_fn=earlybird_output_fn,
feature_spec=feature_config.get_feature_spec()
)
logging.info("The export model path is: " + opt.export_dir)


@ -1,91 +0,0 @@
JOB = ["job/**/*"]
scala_library(
name = "batch",
sources = ["**/*.scala"],
platform = "java8",
tags = [
"bazel-compatible",
"bazel-only",
],
dependencies = [
"3rdparty/jvm/cascading:cascading-core",
"3rdparty/jvm/cascading:cascading-hadoop",
"3rdparty/jvm/cascading:cascading-local",
"3rdparty/jvm/cascading:cascading-thrift",
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:args",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/scalding:parquet",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:client",
"graphstore/common:flock_follows-java",
"src/java/com/twitter/common_internal/util:date_util",
"src/java/com/twitter/twadoop/batch",
"src/java/com/twitter/twadoop/util/dbconfig",
"src/java/com/twitter/twadoop/util/yaml",
"src/protobuf/com/twitter/twadoop",
"src/scala/com/twitter/pluck",
"src/scala/com/twitter/pluck/source/combined_user_source",
"src/scala/com/twitter/pluck/source/jdbc",
"src/scala/com/twitter/scalding_internal/error_handling",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/scalding_internal/multiformat",
"src/scala/com/twitter/scalding_internal/source",
"src/scala/com/twitter/wtf/scalding/jobs/common:date_util",
"src/thrift/com/twitter/gizmoduck:user-thrift-java",
"src/thrift/com/twitter/twadoop/user/gen:gen-java",
"util/util-core:scala",
],
)
#pants.new build target for the old "dist"
hadoop_binary(
name = "graph-batch-deploy",
main = "com.twitter.scalding.Tool",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":tweepcred",
],
)
# Generated with `capesospy-v2 create_target tweepcred_job science/scalding/mesos/wtf/recos_platform_atla_proc.yaml`, config hash d63a47.
scalding_job(
name = "tweepcred_job",
main = "com.twitter.graph.batch.job.tweepcred.TweepcredBatchJob",
args = ["--weighted false --hadoop_config /etc/hadoop/hadoop-conf-proc-atla"],
config = [
("hadoop.combine-input", "true"),
("hadoop.map.jvm.total-memory", "3072m"),
("hadoop.queue", "cassowary.default"),
("hadoop.reduce.jvm.total-memory", "3072m"),
("hadoop.reducers", "1200"),
("hadoop.submitter.disk", "200000m"),
("hadoop.submitter.jvm.total-memory", "5120m"),
("submitter.tier", "preemptible"),
],
cron = "24,44,04 * * * *",
hadoop_cluster = "atla-proc",
platform = "java8",
role = "cassowary",
runtime_platform = "java8",
tags = [
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":tweepcred",
],
)

Binary file not shown.


@ -1,83 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource
import com.twitter.scalding._
/**
* Calculate tweepcred from the given pagerank file. If post_adjust is true,
* reduce pagerank for users with low followers compared to number of
* followings based on existing reputation code.
* Options:
* --input_pagerank: given pagerank
* --user_mass: user mass tsv file, generated by twadoop user_mass job
* --output_pagerank: where to put pagerank file
* --output_tweepcred: where to put tweepcred file
* optional arguments:
* --post_adjust: whether to do post adjust, default true
*
*/
class ExtractTweepcred(args: Args) extends Job(args) {
val POST_ADJUST = args.getOrElse("post_adjust", "true").toBoolean
val inputPagerank = getInputPagerank(args("input_pagerank"))
.map(() -> ('num_followers, 'num_followings)) { (u: Unit) =>
(0, 0)
}
val userInfo = TypedPipe
.from(MostRecentCombinedUserSnapshotSource)
.flatMap { combinedUser =>
val user = Option(combinedUser.user)
val userId = user.map(_.id).getOrElse(0L)
val userExtended = Option(combinedUser.user_extended)
val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0)
val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0)
if (userId == 0L || user.map(_.safety).exists(_.deactivated)) {
None
} else {
Some((userId, 0.0, numFollowers, numFollowings))
}
}
.toPipe[(Long, Double, Int, Int)]('src_id, 'mass_input, 'num_followers, 'num_followings)
val pagerankWithSuspended = (inputPagerank ++ userInfo)
.groupBy('src_id) {
_.max('mass_input)
.max('num_followers)
.max('num_followings)
}
pagerankWithSuspended
.discard('num_followers, 'num_followings)
.write(Tsv(args("output_pagerank")))
val adjustedPagerank =
if (POST_ADJUST) {
pagerankWithSuspended
.map(('mass_input, 'num_followers, 'num_followings) -> 'mass_input) {
input: (Double, Int, Int) =>
Reputation.adjustReputationsPostCalculation(input._1, input._2, input._3)
}
.normalize('mass_input)
} else {
pagerankWithSuspended
.discard('num_followers, 'num_followings)
}
val tweepcred = adjustedPagerank
.map('mass_input -> 'mass_input) { input: Double =>
Reputation.scaledReputation(input)
}
tweepcred.write(Tsv(args("output_tweepcred")))
tweepcred.write(Tsv(args("current_tweepcred")))
tweepcred.write(Tsv(args("today_tweepcred")))
def getInputPagerank(fileName: String) = {
Tsv(fileName).read
.mapTo((0, 1) -> ('src_id, 'mass_input)) { input: (Long, Double) =>
input
}
}
}


@ -1,275 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
import com.twitter.data.proto.Flock
import com.twitter.scalding._
import com.twitter.pluck.source._
import com.twitter.pluck.source.combined_user_source.MostRecentCombinedUserSnapshotSource
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.service.interactions.InteractionGraph
import graphstore.common.FlockFollowsJavaDataset
import java.util.TimeZone
/**
* Prepare the graph data for page rank calculation. Also generate the initial
* pagerank as the starting point. Afterwards, start WeightedPageRank job.
*
* Either read a tsv file for testing or read the following to build the graph
* flock edges Flock.Edge
* real graph input for weights InteractionGraph.Edge
*
* Options:
* --pwd: working directory, will generate the following files there
* numnodes: total number of nodes
* nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
* pagerank: the page rank file
* --user_mass: user mass tsv file, generated by twadoop user_mass job
* Optional arguments:
* --input: use the given tsv file instead of flock and real graph
* --weighted: do weighted pagerank, default false
* --flock_edges_only: restrict graph to flock edges, default true
* --input_pagerank: continue pagerank from this
*
* Plus the following options for WeightedPageRank and ExtractTweepcred:
* --output_pagerank: where to put pagerank file
* --output_tweepcred: where to put tweepcred file
* Optional:
* --maxiterations: how many iterations to run. Default is 20
* --jumpprob: probability of a random jump, default is 0.1
* --threshold: total difference before finishing early, default 0.001
* --post_adjust: whether to do post adjust, default true
*/
class PreparePageRankData(args: Args) extends Job(args) {
implicit val timeZone: TimeZone = DateOps.UTC
val PWD = args("pwd")
val WEIGHTED = args.getOrElse("weighted", "false").toBoolean
val FLOCK_EDGES_ONLY = args.getOrElse("flock_edges_only", "true").toBoolean
val ROW_TYPE_1 = 1
val ROW_TYPE_2 = 2
// graph data and user mass
val userMass = getUserMass
val nodesWithPrior = getGraphData(userMass)
val numNodes = nodesWithPrior.groupAll { _.size }
numNodes.write(Tsv(PWD + "/numnodes"))
dumpNodes(nodesWithPrior, PWD + "/nodes");
// initial pagerank to start computation
generateInitialPagerank(nodesWithPrior)
// continue with the calculation
override def next = {
Some(new WeightedPageRank(args))
}
/**
* read flock edges
*/
def getFlockEdges = {
DAL
.readMostRecentSnapshotNoOlderThan(FlockFollowsJavaDataset, Days(7))
.toTypedSource
.flatMapTo('src_id, 'dst_id) { edge: Flock.Edge =>
if (edge.getStateId() == Flock.State.Positive.getNumber()) {
Some((edge.getSourceId(), edge.getDestinationId()))
} else {
None
}
}
}
/**
* read real graph edges with weights
*/
def getRealGraphEdges = {
RealGraphEdgeSource()
.flatMapTo('src_id, 'dst_id, 'weight) { edge: InteractionGraph.Edge =>
if (edge.getSourceId() != edge.getDestinationId()) {
val srcId = edge.getSourceId()
val dstId = edge.getDestinationId()
val weight = edge.getWeight().toFloat
Some((srcId, dstId, weight))
} else {
None
}
}
}
/**
* combine real graph and flock. If flock_edges_only is true, only take the
* flock edges; otherwise edges are either from flock or from real graph.
* edges weights default to be 1, overwritten by weights from real graph
*/
def getFlockRealGraphEdges = {
val flock = getFlockEdges
if (WEIGHTED) {
val flockWithWeight = flock
.map(() -> ('weight, 'rowtype)) { (u: Unit) =>
(1.0f, ROW_TYPE_1)
}
val realGraph = getRealGraphEdges
.map(() -> 'rowtype) { (u: Unit) =>
(ROW_TYPE_2)
}
val combined = (flockWithWeight ++ realGraph)
.groupBy('src_id, 'dst_id) {
_.min('rowtype)
.max('weight) // take whichever is bigger
}
if (FLOCK_EDGES_ONLY) {
combined.filter('rowtype) { (rowtype: Int) =>
rowtype == ROW_TYPE_1
}
} else {
combined
}
} else {
flock.map(() -> ('weight)) { (u: Unit) =>
1.0f
}
}.project('src_id, 'dst_id, 'weight)
}
def getCsvEdges(fileName: String) = {
Tsv(fileName).read
.mapTo((0, 1, 2) -> ('src_id, 'dst_id, 'weight)) { input: (Long, Long, Float) =>
input
}
}
/*
* Compute user mass based on combined user
*/
def getUserMass =
TypedPipe
.from(MostRecentCombinedUserSnapshotSource)
.flatMap { user =>
UserMass.getUserMass(user)
}
.map { userMassInfo =>
(userMassInfo.userId, userMassInfo.mass)
}
.toPipe[(Long, Double)]('src_id_input, 'mass_prior)
.normalize('mass_prior)
/**
* Read either flock/real_graph or a given tsv file
* group by the source id, and output node data structure
* merge with the user_mass.
* return <'src_id, 'dst_ids, 'weights, 'mass_prior>
*
* make sure src_id is the same set as in user_mass, and dst_ids
* are subset of user_mass. eg flock has edges like 1->2,
* where both users 1 and 2 do not exist anymore
*/
def getGraphData(userMass: RichPipe) = {
val edges: RichPipe = args.optional("input") match {
case None => getFlockRealGraphEdges
case Some(input) => getCsvEdges(input)
}
// remove edges where dst_id is not in userMass
val filterByDst = userMass
.joinWithLarger('src_id_input -> 'dst_id, edges)
.discard('src_id_input, 'mass_prior)
// aggregate by the source id
val nodes = filterByDst
.groupBy('src_id) {
_.mapReduceMap(('dst_id, 'weight) -> ('dst_ids, 'weights)) /* map1 */ { a: (Long, Float) =>
(Vector(a._1), if (WEIGHTED) Vector(a._2) else Vector())
} /* reduce */ { (a: (Vector[Long], Vector[Float]), b: (Vector[Long], Vector[Float])) =>
{
(a._1 ++ b._1, a._2 ++ b._2)
}
} /* map2 */ { a: (Vector[Long], Vector[Float]) =>
a
}
}
.mapTo(
('src_id, 'dst_ids, 'weights) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) {
input: (Long, Vector[Long], Vector[Float]) =>
{
(input._1, input._2.toArray, input._3.toArray, 0.0, ROW_TYPE_1)
}
}
// get to the same schema
val userMassNodes = userMass
.mapTo(('src_id_input, 'mass_prior) -> ('src_id, 'dst_ids, 'weights, 'mass_prior, 'rowtype)) {
input: (Long, Double) =>
{
(input._1, Array[Long](), Array[Float](), input._2, ROW_TYPE_2)
}
}
// make src_id the same set as in userMass
(nodes ++ userMassNodes)
.groupBy('src_id) {
_.sortBy('rowtype)
.head('dst_ids, 'weights)
.last('mass_prior, 'rowtype)
}
.filter('rowtype) { input: Int =>
input == ROW_TYPE_2
}
}
/**
* generate the graph data output
*/
def dumpNodes(nodes: RichPipe, fileName: String) = {
mode match {
case Hdfs(_, conf) => nodes.write(SequenceFile(fileName))
case _ =>
nodes
.mapTo((0, 1, 2, 3) -> (0, 1, 2, 3)) { input: (Long, Array[Long], Array[Float], Double) =>
(input._1, input._2.mkString(","), input._3.mkString(","), input._4)
}
.write(Tsv(fileName))
}
}
/*
* output prior mass or copy the given mass file (merge, normalize)
* to be used as the starting point
*/
def generateInitialPagerank(nodes: RichPipe) = {
val prior = nodes
.project('src_id, 'mass_prior)
val combined = args.optional("input_pagerank") match {
case None => prior
case Some(fileName) => {
val massInput = Tsv(fileName).read
.mapTo((0, 1) -> ('src_id, 'mass_prior, 'rowtype)) { input: (Long, Double) =>
(input._1, input._2, ROW_TYPE_2)
}
val priorRow = prior
.map(() -> ('rowtype)) { (u: Unit) =>
ROW_TYPE_1
}
(priorRow ++ massInput)
.groupBy('src_id) {
_.sortBy('rowtype)
.last('mass_prior)
.head('rowtype)
}
// throw away extra nodes from input file
.filter('rowtype) { (rowtype: Int) =>
rowtype == ROW_TYPE_1
}
.discard('rowtype)
.normalize('mass_prior)
}
}
combined.write(Tsv(PWD + "/pagerank_0"))
}
}


@ -1,75 +0,0 @@
Tweepcred
Tweepcred is a social network analysis tool that calculates the influence of Twitter users based on their interactions with other users. The tool uses the PageRank algorithm to rank users based on their influence.
PageRank Algorithm
PageRank is a graph algorithm that was originally developed by Google to determine the importance of web pages in search results. The algorithm works by assigning a numerical score to each page based on the number and quality of other pages that link to it. The more links a page has from other high-quality pages, the higher its PageRank score.
In the Tweepcred project, the PageRank algorithm is used to determine the influence of Twitter users based on their interactions with other users. The graph is constructed by treating Twitter users as nodes, and their interactions (mentions, retweets, etc.) as edges. The PageRank score of a user represents their influence in the network.
Tweepcred PageRank Implementation
The implementation of the PageRank algorithm in Tweepcred is based on the Hadoop MapReduce framework. The algorithm is split into two stages: preparation and iteration.
The preparation stage involves constructing the graph of Twitter users and their interactions, and initializing each user's PageRank score to a default value. This stage is implemented in the PreparePageRankData class.
The iteration stage involves repeatedly calculating and updating the PageRank scores of each user until convergence is reached. This stage is implemented in the WeightedPageRank class, which is run multiple times until the algorithm converges.
The Tweepcred PageRank implementation also includes a number of optimizations to improve performance and reduce memory usage. These optimizations include block compression, lazy loading, and in-memory caching.
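For readers who want to see the update rule in miniature, here is a minimal, self-contained Python sketch of one unweighted PageRank iteration in the shape used here (distribute each node's mass over its out-edges, spread the mass lost at dangling nodes evenly, then mix in the random-jump prior). The graph, the prior, and ALPHA are illustrative; this is not the production Scalding job.

ALPHA = 0.1  # random-jump probability, the --jumpprob default in WeightedPageRank

def pagerank_iteration(out_edges, mass_prior, pagerank):
    # out_edges: node -> list of destination nodes; mass_prior and pagerank: node -> mass.
    next_mass = {node: 0.0 for node in out_edges}
    for node, dsts in out_edges.items():
        for dst in dsts:
            next_mass[dst] += pagerank[node] / len(dsts)
    # Mass that fell off dangling nodes (no out-edges) is redistributed evenly.
    dead_mass = (1.0 - sum(next_mass.values())) / len(out_edges)
    return {
        node: mass_prior[node] * ALPHA
        + dead_mass * (1.0 - ALPHA)
        + next_mass[node] * (1.0 - ALPHA)
        for node in out_edges
    }

out_edges = {1: [2, 3], 2: [3], 3: []}  # node 3 is dangling
mass_prior = {n: 1.0 / 3 for n in out_edges}
pagerank = dict(mass_prior)
for _ in range(20):
    pagerank = pagerank_iteration(out_edges, mass_prior, pagerank)
print(pagerank)  # masses still sum to 1.0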
========================================== TweepcredBatchJob.scala ==========================================
This is a Scala class that represents a batch job for computing the "tweepcred" (Twitter credibility) score for Twitter users using weighted or unweighted PageRank algorithm. The class extends the AnalyticsIterativeBatchJob class, which is part of the Scalding framework used for data processing on Hadoop.
The class defines various properties and methods that are used to configure and run the batch job. The args parameter represents the command-line arguments that are passed to the batch job, such as the --weighted flag that determines whether to use the weighted PageRank algorithm or not.
The run method overrides the run method of the base class and prints the batch statistics after the job has finished. The children method defines a list of child jobs that need to be executed as part of the batch job. The messageHeader method returns a string that represents the header of the batch job message.
========================================== ExtractTweepcred.scala ==========================================
This class is a Scalding job that calculates "tweepcred" from a given pagerank file. Tweepcred is a measure of reputation for Twitter users that takes into account the number of followers they have and the number of people they follow. If the optional argument post_adjust is set to true (default value), then the pagerank values are adjusted based on the user's follower-to-following ratio.
The class takes several command-line arguments specifying input and output files and options, and it uses the Scalding library to perform distributed data processing on the input files. It reads in the pagerank file and a user mass file, both in TSV format, and combines them to produce a new pagerank file with the adjusted values. The adjusted pagerank is then used to calculate tweepcred values, which are written to output files.
The code makes use of the MostRecentCombinedUserSnapshotSource class from the com.twitter.pluck.source.combined_user_source package to obtain user information from the user mass file. It also uses the Reputation class to perform the tweepcred calculations and adjustments.
========================================== UserMass.scala ==========================================
The UserMass class is a helper class used to calculate the "mass" of a user on Twitter, as defined by a certain algorithm. The mass score represents the user's reputation and is used in various applications, such as in determining which users should be recommended to follow or which users should have their content highlighted.
The getUserMass method of the UserMass class takes in a CombinedUser object, which contains information about a Twitter user, and returns an optional UserMassInfo object, which contains the user's ID and calculated mass score.
The algorithm used to calculate the mass score takes into account various factors such as the user's account age, number of followers and followings, device usage, and safety status (restricted, suspended, verified). The calculation involves adding and multiplying weight factors and adjusting the mass score based on a threshold for the number of friends and followers.
========================================== PreparePageRankData.scala ==========================================
The PreparePageRankData class prepares the graph data for the page rank calculation. It generates the initial pagerank and then starts the WeightedPageRank job. It has the following functionalities:
It reads the user mass TSV file generated by the twadoop user_mass job.
It reads the graph data, which is either a TSV file or a combination of flock edges and real graph inputs for weights.
It generates the initial pagerank as the starting point for the pagerank computation.
It writes the number of nodes to a TSV file and dumps the nodes to another TSV file.
It has several options like weighted, flock_edges_only, and input_pagerank to fine-tune the pagerank calculation.
It also has options for the WeightedPageRank and ExtractTweepcred jobs, like output_pagerank, output_tweepcred, maxiterations, jumpprob, threshold, and post_adjust.
The PreparePageRankData class has several helper functions like getFlockEdges, getRealGraphEdges, getFlockRealGraphEdges, and getCsvEdges that read the graph data from different sources like DAL, InteractionGraph, or CSV files. It also has the generateInitialPagerank function that generates the initial pagerank from the graph data.
========================================== WeightedPageRank.scala ==========================================
WeightedPageRank is a class that performs the weighted PageRank algorithm on a given graph.
The algorithm starts from a given PageRank value and performs one iteration, then tests for convergence. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job with the updated PageRank as input. If convergence has been reached, the algorithm starts the ExtractTweepcred job instead.
The class takes in several options, including the working directory, total number of nodes, nodes file, PageRank file, total difference, whether to perform weighted PageRank, the current iteration, maximum iterations to run, probability of a random jump, and whether to do post adjust.
The algorithm reads a nodes file that includes the source node ID, destination node IDs, weights, and mass prior. The algorithm also reads an input PageRank file that includes the source node ID and mass input. The algorithm then performs one iteration of the PageRank algorithm and writes the output PageRank to a file.
The algorithm tests for convergence by calculating the total difference between the input and output PageRank masses. If convergence has not been reached, the algorithm clones itself and starts the next PageRank job. If convergence has been reached, the algorithm starts the ExtractTweepcred job.
========================================== Reputation.scala ==========================================
This is a helper class called Reputation that contains methods for calculating a user's reputation score. The first method called scaledReputation takes a Double parameter raw which represents the user's page rank, and returns a Byte value that represents the user's reputation on a scale of 0 to 100. This method uses a formula that involves converting the logarithm of the page rank to a number between 0 and 100.
The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank.
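For reference, the 0-100 scaling described here can be reproduced numerically as follows (it mirrors the formula in Reputation.scala further down; the sample pagerank value is arbitrary):

import math

def scaled_reputation(raw):
    # Mirrors Reputation.scaledReputation: clamp 130 + 5.21 * ln(pagerank) to [0, 100].
    if raw == 0 or raw < 1.0e-20:
        return 0
    pos = round(130.0 + 5.21 * math.log(raw))
    return int(min(100, max(0, pos)))

print(scaled_reputation(1.0e-6))  # 130 + 5.21 * ln(1e-6) rounds to 58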


@ -1,50 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
/**
* helper class to calculate reputation, borrowed from repo reputations
*/
object Reputation {
/**
* convert pagerank to tweepcred between 0 and 100,
* take from repo reputations, util/Utils.scala
*/
def scaledReputation(raw: Double): Byte = {
if (raw == 0 || (raw < 1.0e-20)) {
0
} else {
// convert log(pagerank) to a number between 0 and 100
// the two parameters are from a linear fit by converting
// max pagerank -> 95
// min pagerank -> 15
val e: Double = 130d + 5.21 * scala.math.log(raw) // log to the base e
val pos = scala.math.rint(e)
val v = if (pos > 100) 100.0 else if (pos < 0) 0.0 else pos
v.toByte
}
}
// these constants are taken from repo reputations, config/production.conf
private val threshAbsNumFriendsReps = 2500
private val constantDivisionFactorGt_threshFriendsToFollowersRatioReps = 3.0
private val threshFriendsToFollowersRatioUMass = 0.6
private val maxDivFactorReps = 50
/**
* reduce pagerank of users with low followers but high followings
*/
def adjustReputationsPostCalculation(mass: Double, numFollowers: Int, numFollowings: Int) = {
if (numFollowings > threshAbsNumFriendsReps) {
val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers)
val divFactor =
scala.math.exp(
constantDivisionFactorGt_threshFriendsToFollowersRatioReps *
(friendsToFollowersRatio - threshFriendsToFollowersRatioUMass) *
scala.math.log(scala.math.log(numFollowings))
)
mass / ((divFactor min maxDivFactorReps) max 1.0)
} else {
mass
}
}
}


@ -1,64 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
import com.twitter.scalding._
import com.twitter.scalding_internal.job._
import com.twitter.scalding_internal.job.analytics_batch._
/**
* Register the beginning of the tweepcred job in analytic batch table
*
* Options:
* --weighted: do weighted pagerank
* --hadoop_config: /etc/hadoop/hadoop-conf-proc-atla
*
*/
class TweepcredBatchJob(args: Args) extends AnalyticsIterativeBatchJob(args) {
def WEIGHTED = args("weighted").toBoolean
override def timeout = Hours(36)
override def hasFlow = false
def descriptionSuffix = " weighted=" + args("weighted")
override def batchIncrement = Hours(24)
override def firstTime = RichDate("2015-10-02")
override def batchDescription = classOf[TweepcredBatchJob].getCanonicalName + descriptionSuffix
override def run = {
val success = super.run
println("Batch Stat: " + messageHeader + " " + jobStat.get.toString)
success
}
def startTime = dateRange.start
def dateString = startTime.toString("yyyy/MM/dd")
override def children = {
val BASEDIR = "/user/cassowary/tweepcred/"
val baseDir = BASEDIR + (if (WEIGHTED) "weighted" else "unweighted") + "/daily/"
val tmpDir = baseDir + "tmp"
val outputDir = baseDir + dateString
val pageRankDir = outputDir + "/finalmass"
val tweepcredDir = outputDir + "/finaltweepcred"
val yesterdayStr = (startTime - Days(1)).toString("yyyy/MM/dd")
val yestPageRankDir = baseDir + yesterdayStr + "/finalmass"
val TWEEPCRED = "/tweepcred"
val curRep = (if (WEIGHTED) baseDir else BASEDIR) + "current"
val todayRep = (if (WEIGHTED) baseDir else BASEDIR) + dateString
val newArgs = args + ("pwd", Some(tmpDir)) +
("output_pagerank", Some(pageRankDir)) +
("output_tweepcred", Some(tweepcredDir)) +
("input_pagerank", Some(yestPageRankDir)) +
("current_tweepcred", Some(curRep + TWEEPCRED)) +
("today_tweepcred", Some(todayRep + TWEEPCRED))
val prJob = new PreparePageRankData(newArgs)
List(prJob)
}
private def messageHeader = {
val dateString = dateRange.start.toString("yyyy/MM/dd")
classOf[TweepcredBatchJob].getSimpleName +
(if (WEIGHTED) " weighted " else " unweighted ") + dateString
}
}


@ -1,69 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
import com.twitter.twadoop.user.gen.CombinedUser
import com.twitter.util.Time
import com.twitter.wtf.scalding.jobs.common.DateUtil
case class UserMassInfo(userId: Long, mass: Double)
/**
* helper class to calculate user mass, borrowed from repo reputations
*/
object UserMass {
private val currentTimestamp = Time.now.inMilliseconds
private val constantDivisionFactorGt_threshFriendsToFollowersRatioUMass = 5.0
private val threshAbsNumFriendsUMass = 500
private val threshFriendsToFollowersRatioUMass = 0.6
private val deviceWeightAdditive = 0.5
private val ageWeightAdditive = 0.2
private val restrictedWeightMultiplicative = 0.1
def getUserMass(combinedUser: CombinedUser): Option[UserMassInfo] = {
val user = Option(combinedUser.user)
val userId = user.map(_.id).getOrElse(0L)
val userExtended = Option(combinedUser.user_extended)
val age = user.map(_.created_at_msec).map(DateUtil.diffDays(_, currentTimestamp)).getOrElse(0)
val isRestricted = user.map(_.safety).exists(_.restricted)
val isSuspended = user.map(_.safety).exists(_.suspended)
val isVerified = user.map(_.safety).exists(_.verified)
val hasValidDevice = user.flatMap(u => Option(u.devices)).exists(_.isSetMessaging_devices)
val numFollowers = userExtended.flatMap(u => Option(u.followers)).map(_.toInt).getOrElse(0)
val numFollowings = userExtended.flatMap(u => Option(u.followings)).map(_.toInt).getOrElse(0)
if (userId == 0L || user.map(_.safety).exists(_.deactivated)) {
None
} else {
val mass =
if (isSuspended)
0
else if (isVerified)
100
else {
var score = deviceWeightAdditive * 0.1 +
(if (hasValidDevice) deviceWeightAdditive else 0)
val normalizedAge = if (age > 30) 1.0 else (1.0 min scala.math.log(1.0 + age / 15.0))
score *= normalizedAge
if (score < 0.01) score = 0.01
if (isRestricted) score *= restrictedWeightMultiplicative
score = (score min 1.0) max 0
score *= 100
score
}
val friendsToFollowersRatio = (1.0 + numFollowings) / (1.0 + numFollowers)
val adjustedMass =
if (numFollowings > threshAbsNumFriendsUMass &&
friendsToFollowersRatio > threshFriendsToFollowersRatioUMass) {
mass / scala.math.exp(
constantDivisionFactorGt_threshFriendsToFollowersRatioUMass *
(friendsToFollowersRatio - threshFriendsToFollowersRatioUMass)
)
} else {
mass
}
Some(UserMassInfo(userId, adjustedMass))
}
}
}


@ -1,235 +0,0 @@
package com.twitter.graph.batch.job.tweepcred
import com.twitter.scalding._
/**
* weighted page rank for the given graph, start from the given pagerank,
* perform one iteration, test for convergence, if not yet, clone itself
* and start the next page rank job with updated pagerank as input;
* if converged, start ExtractTweepcred job instead
*
* Options:
* --pwd: working directory, will read/generate the following files there
* numnodes: total number of nodes
* nodes: nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
* pagerank: the page rank file eg pagerank_0, pagerank_1 etc
* totaldiff: the current max pagerank delta
* Optional arguments:
* --weighted: do weighted pagerank, default false
* --curiteration: what is the current iteration, default 0
* --maxiterations: how many iterations to run. Default is 20
* --jumpprob: probability of a random jump, default is 0.1
* --threshold: total difference before finishing early, default 0.001
*
* plus the following options for ExtractTweepcred:
* --user_mass: user mass tsv file, generated by twadoop user_mass job
* --output_pagerank: where to put pagerank file
* --output_tweepcred: where to put tweepcred file
* Optional:
* --post_adjust: whether to do post adjust, default true
*
*/
class WeightedPageRank(args: Args) extends Job(args) {
val ROW_TYPE_1 = 1
val ROW_TYPE_2 = 2
val PWD = args("pwd")
val ALPHA = args.getOrElse("jumpprob", "0.1").toDouble
val WEIGHTED = args.getOrElse("weighted", "false").toBoolean
val THRESHOLD = args.getOrElse("threshold", "0.001").toDouble
val MAXITERATIONS = args.getOrElse("maxiterations", "20").toInt
val CURITERATION = args.getOrElse("curiteration", "0").toInt
// 'size
val numNodes = getNumNodes(PWD + "/numnodes")
// 'src_id, 'dst_ids, 'weights, 'mass_prior
val nodes = getNodes(PWD + "/nodes")
// 'src_id_input, 'mass_input
val inputPagerank = getInputPagerank(PWD + "/pagerank_" + CURITERATION)
// one iteration of pagerank
val outputPagerank = doPageRank(nodes, inputPagerank)
val outputFileName = PWD + "/pagerank_" + (CURITERATION + 1)
outputPagerank
.project('src_id, 'mass_n)
.write(Tsv(outputFileName))
// detect convergence
val totalDiff = outputPagerank
.mapTo(('mass_input, 'mass_n) -> 'mass_diff) { args: (Double, Double) =>
scala.math.abs(args._1 - args._2)
}
.groupAll { _.sum[Double]('mass_diff) }
.write(Tsv(PWD + "/totaldiff"))
/**
* test convergence, if not yet, kick off the next iteration
*/
override def next = {
// the max diff generated above
val totalDiff = Tsv(PWD + "/totaldiff").readAtSubmitter[Double].head
if (CURITERATION < MAXITERATIONS - 1 && totalDiff > THRESHOLD) {
val newArgs = args + ("curiteration", Some((CURITERATION + 1).toString))
Some(clone(newArgs))
} else {
val newArgs = args + ("input_pagerank", Some(outputFileName))
Some(new ExtractTweepcred(newArgs))
}
}
def getInputPagerank(fileName: String) = {
Tsv(fileName).read
.mapTo((0, 1) -> ('src_id_input, 'mass_input)) { input: (Long, Double) =>
input
}
}
/**
* read the pregenerated nodes file <'src_id, 'dst_ids, 'weights, 'mass_prior>
*/
def getNodes(fileName: String) = {
mode match {
case Hdfs(_, conf) => {
SequenceFile(fileName).read
.mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
input: (Long, Array[Long], Array[Float], Double) =>
input
}
}
case _ => {
Tsv(fileName).read
.mapTo((0, 1, 2, 3) -> ('src_id, 'dst_ids, 'weights, 'mass_prior)) {
input: (Long, String, String, Double) =>
{
(
input._1,
// convert string to int array
if (input._2 != null && input._2.length > 0) {
input._2.split(",").map { _.toLong }
} else {
Array[Long]()
},
// convert string to float array
if (input._3 != null && input._3.length > 0) {
input._3.split(",").map { _.toFloat }
} else {
Array[Float]()
},
input._4
)
}
}
}
}
}
/**
* the total number of nodes, single line file
*/
def getNumNodes(fileName: String) = {
Tsv(fileName).read
.mapTo(0 -> 'size) { input: Long =>
input
}
}
/**
* one iteration of pagerank
* inputPagerank: <'src_id_input, 'mass_input>
* return <'src_id, 'mass_n, 'mass_input>
*
* Here is a highlevel view of the unweighted algorithm:
* let
* N: number of nodes
* inputPagerank(N_i): prob of walking to node i,
* d(N_j): N_j's out degree
* then
* pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) / d_j)
* deadPagerank = (1 - \sum_{i} pagerankNext(N_i)) / N
* randomPagerank(N_i) = userMass(N_i) * ALPHA + deadPagerank * (1-ALPHA)
* pagerankOutput(N_i) = randomPagerank(N_i) + pagerankNext(N_i) * (1-ALPHA)
*
* For weighted algorithm:
* let
* w(N_j, N_i): weight from N_j to N_i
* tw(N_j): N_j's total out weights
* then
* pagerankNext(N_i) = (\sum_{j points to i} inputPagerank(N_j) * w(N_j, N_i) / tw(N_j))
*
*/
def doPageRank(nodeRows: RichPipe, inputPagerank: RichPipe): RichPipe = {
// 'src_id, 'dst_ids, 'weights, 'mass_prior, 'mass_input
val nodeJoined = nodeRows
.joinWithSmaller('src_id -> 'src_id_input, inputPagerank)
.discard('src_id_input)
// 'src_id, 'mass_n
val pagerankNext = nodeJoined
.flatMapTo(('dst_ids, 'weights, 'mass_input) -> ('src_id, 'mass_n)) {
args: (Array[Long], Array[Float], Double) =>
{
if (args._1.length > 0) {
if (WEIGHTED) {
// weighted distribution
val total: Double = args._2.sum
(args._1 zip args._2).map { idWeight: (Long, Float) =>
(idWeight._1, args._3 * idWeight._2 / total)
}
} else {
// equal distribution
val dist: Double = args._3 / args._1.length
args._1.map { id: Long =>
(id, dist)
}
}
} else {
// dangling node: points to no other nodes
Nil
}
}
}
.groupBy('src_id) {
_.sum[Double]('mass_n)
}
// 'sum_mass
val sumPagerankNext = pagerankNext.groupAll { _.sum[Double]('mass_n -> 'sum_mass) }
// 'deadMass
// single row jobs
// the dead page rank equally distributed to every node
val deadPagerank = sumPagerankNext
.crossWithTiny(numNodes)
.map(('sum_mass, 'size) -> 'deadMass) { input: (Double, Long) =>
(1.0 - input._1) / input._2
}
.discard('size, 'sum_mass)
// 'src_id_r, 'mass_n_r
// random jump probability plus dead page rank
val randomPagerank = nodeJoined
.crossWithTiny(deadPagerank)
.mapTo(('src_id, 'mass_prior, 'deadMass, 'mass_input) -> ('src_id, 'mass_n, 'mass_input)) {
ranks: (Long, Double, Double, Double) =>
(ranks._1, ranks._2 * ALPHA + ranks._3 * (1 - ALPHA), ranks._4)
}
// 'src_id, 'mass_n
// scale next page rank to 1-ALPHA
val pagerankNextScaled = pagerankNext
.map('mass_n -> ('mass_n, 'mass_input)) { m: Double =>
((1 - ALPHA) * m, 0.0)
}
// 'src_id, 'mass_n, 'mass_input
// random probability + next probability
(randomPagerank ++ pagerankNextScaled)
.groupBy('src_id) {
_.sum[Double]('mass_input) // keep the input pagerank
.sum[Double]('mass_n) // take the sum
}
}
}

Binary file not shown.

View File

@ -1,19 +0,0 @@
## Real Graph (bqe)
This project builds a machine learning model using a gradient boosted tree classifier to predict the likelihood of a Twitter user interacting with another user.
The algorithm works by first creating a labeled dataset of user interactions from a graph of Twitter users. This graph is represented in a BigQuery table where each row represents a directed edge between two users, along with various features such as the number of tweets, follows, favorites, and other metrics related to user behavior.
To create the labeled dataset, the algorithm first selects a set of candidate interactions by identifying all edges that were active during a certain time period. It then joins this candidate set with a set of labeled interactions that occurred one day after the candidate period. Positive interactions are labeled as "1" and negative interactions are labeled as "0". The resulting labeled dataset is then used to train a boosted tree classifier model.
The model is trained using the labeled dataset and various hyperparameters, including the maximum number of iterations and the subsample rate. The algorithm splits the labeled dataset into training and testing sets based on the source user's ID, using a custom data split method.
Once the model is trained, it can be used to generate a score estimating the probability of a user interacting with another user.
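As a rough illustration of the custom split on the source user's ID, here is a minimal Scala sketch; the production query (train_model.sql later in this commit) buckets on FARM_FINGERPRINT(source_id) modulo 10, while MurmurHash3 and the object/function names below are stand-ins.
```
import scala.util.hashing.MurmurHash3

// Minimal sketch of a source-id-based train/eval split (names are hypothetical).
// MurmurHash3 stands in for BigQuery's FARM_FINGERPRINT; the real job hashes in SQL.
object SourceIdSplitSketch {
  // A user lands in the eval set when their hash bucket (out of 10) is 0,
  // so all edges sharing a source_id end up on the same side of the split.
  def isEval(sourceId: Long, buckets: Int = 10): Boolean =
    java.lang.Math.floorMod(MurmurHash3.stringHash(sourceId.toString), buckets) == 0

  def main(args: Array[String]): Unit =
    Seq(1L, 2L, 42L, 1337L).foreach { id =>
      println(s"source_id=$id eval=${isEval(id)}")
    }
}
```
Splitting by source_id keeps all of a user's candidate edges on the same side of the split, which avoids leakage between the training and evaluation sets.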
## Real Graph (scio)
This project aggregates the number of interactions between pairs of users on Twitter. On a daily basis, multiple dataflow jobs perform this aggregation, covering public engagements like favorites, retweets, follows, etc., as well as private engagements like profile views, tweet clicks, and whether or not a user has another user in their address book (provided the user has opted in to share their address book).
After the daily aggregation of interactions, a rollup job combines yesterday's rollup with today's interactions. The rollup job outputs several results, including the daily count of interactions per interaction type between a pair of users, the daily incoming interactions made on a user per interaction type, the rollup aggregation of interactions as a decayed sum between a pair of users, and the rollup aggregation of incoming interactions made on a user.
Finally, the rollup job outputs the ML predicted interaction score between the pair of users alongside the rollup aggregation of interactions as a decayed sum between them.
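A minimal sketch of that decayed-sum rollup, assuming yesterday's running sum is multiplied by a per-day retention factor before today's count is added; the case class and values are illustrative, and the 0.955 factor simply mirrors the ONE_MINUS_ALPHA constant used by the aggregation job elsewhere in this commit.
```
case class EdgeCount(sourceId: Long, destinationId: Long, decayedSum: Double)

object DecayedRollupSketch {
  // Per-day retention factor; 0.955 mirrors ONE_MINUS_ALPHA in this commit's
  // InteractionGraphScoringConfig, but treat it as illustrative here.
  val Decay = 0.955

  // Yesterday's rollup is decayed, then today's raw count is added on top.
  def rollup(previous: Option[EdgeCount], todayCount: Double, src: Long, dst: Long): EdgeCount = {
    val carried = previous.map(_.decayedSum * Decay).getOrElse(0.0)
    EdgeCount(src, dst, carried + todayCount)
  }

  def main(args: Array[String]): Unit = {
    val day1 = rollup(None, todayCount = 3.0, src = 1L, dst = 2L)
    val day2 = rollup(Some(day1), todayCount = 1.0, src = 1L, dst = 2L)
    println(day2) // EdgeCount(1,2,3.865): 3.0 * 0.955 + 1.0
  }
}
```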

View File

@ -1,58 +0,0 @@
# Scoring
This folder contains the SQL files that we'll use for scoring the real graph edges in BQ. There are 4 steps:
- Check that our models are in place. The feature importance query should return 20 rows in total: 10 rows per model, 1 for each feature.
- Follow graph feature generation. This ensures that we have features for all users, regardless of whether they have had any recent activity.
- Candidate generation. This query combines the candidates from the follow graph and the activity graph, and the features from both.
- Scoring. This query scores with 2 of our prod models and saves the scores to a table, with an additional field that distinguishes whether an edge is in- or out-of-network.
## Instructions
To deploy the job, create a zip file, upload it to packer, and then schedule it with aurora.
```
zip -jr real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring && \
packer add_version --cluster=atla cassowary real_graph_scoring real_graph_scoring.zip
aurora cron schedule atla/cassowary/prod/real_graph_scoring src/scala/com/twitter/interaction_graph/bqe/scoring/scoring.aurora && \
aurora cron start atla/cassowary/prod/real_graph_scoring
```
# candidates.sql
This BigQuery (BQ) query does the following:
1. Declares two variables, date_start and date_end, which are both of type DATE.
2. Sets the date_end variable to the maximum partition ID of the interaction_graph_labels_daily table, using the PARSE_DATE() function to convert the partition ID to a date format.
3. Sets the date_start variable to 30 days prior to the date_end variable, using the DATE_SUB() function.
4. Creates a new table called candidates in the realgraph dataset, partitioned by ds.
5. The query uses three common table expressions (T1, T2, and T3) to join data from two tables (interaction_graph_labels_daily and tweeting_follows) to generate a table containing candidate information and features.
6. The table T3 is the result of a full outer join between T1 and T2, grouping by source_id and destination_id, and aggregating values such as num_tweets, label_types, and the counts of different types of labels (e.g. num_follows, num_favorites, etc.).
7. The T4 table ranks each source_id by the number of num_days and num_tweets, and selects the top 2000 rows for each source_id.
8. Finally, the query selects all columns from the T4 table and appends the date_end variable as a new column named ds.
Overall, the query generates a table of candidates and their associated features for a particular date range, using data from two tables in the twttr-bq-cassowary-prod and twttr-recos-ml-prod projects.
# follow_graph_features.sql
This BigQuery script creates a table twttr-recos-ml-prod.realgraph.tweeting_follows that includes features for Twitter user interactions, specifically tweet counts and follows.
First, it sets two variables date_latest_tweet and date_latest_follows to the most recent dates available in two separate tables: twttr-bq-tweetsource-pub-prod.user.public_tweets and twttr-recos-ml-prod.user_events.valid_user_follows, respectively.
Then, it creates the tweet_count and all_follows CTEs.
The tweet_count CTE counts the number of tweets made by each user within the last 3 days prior to date_latest_tweet.
The all_follows CTE retrieves all the follows from the valid_user_follows table that happened on date_latest_follows and left joins it with the tweet_count CTE. It also adds a row number that partitions by the source user ID and orders by the number of tweets in descending order. The final output is filtered to keep only the top 2000 follows per user based on the row number.
The final SELECT statement combines the all_follows CTE with the date_latest_tweet variable and inserts the results into the twttr-recos-ml-prod.realgraph.tweeting_follows table partitioned by date.
# scoring.sql
This BQ code performs operations on a BigQuery table called twttr-recos-ml-prod.realgraph.scores. Here is a step-by-step breakdown of what the code does:
1. Declares two variables, date_end and date_latest_follows, and sets their values based on the latest partitions in the twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS and twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS tables (filtered to the relevant table names), respectively. The PARSE_DATE() function is used to convert the partition IDs to date format.
2. Deletes rows from the twttr-recos-ml-prod.realgraph.scores table where the value of the ds column is equal to date_end.
3. Inserts rows into the twttr-recos-ml-prod.realgraph.scores table based on a query that generates predicted scores for pairs of user IDs using two machine learning models. Specifically, the query uses the ML.PREDICT() function to apply two machine learning models (twttr-recos-ml-prod.realgraph.prod and twttr-recos-ml-prod.realgraph.prod_explicit) to the twttr-recos-ml-prod.realgraph.candidates table. The resulting predicted scores are joined with the twttr-recos-ml-prod.realgraph.tweeting_follows table, which contains information about the number of tweets made by users and their follow relationships, using a full outer join. The final result includes columns for the source ID, destination ID, predicted score (prob), explicit predicted score (prob_explicit), a binary variable indicating whether the destination ID is followed by the source ID (followed), and the value of date_end for the ds column. If there is no match in the predicted_scores table for a given pair of user IDs, the COALESCE() function is used to return the corresponding values from the tweeting_follows table, with default values of 0.0 for the predicted scores.

View File

@ -1,42 +0,0 @@
DECLARE date_start, date_end DATE;
SET date_end = (
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily"
);
SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY);
-- all candidates and their features
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates`
PARTITION BY ds
AS
WITH T1 AS (
SELECT source_id, destination_id, label, dateHour
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
LEFT JOIN UNNEST(labels) AS label
WHERE DATE(dateHour) BETWEEN date_start AND date_end
), T2 AS (
SELECT source_id, destination_id, num_tweets
FROM `twttr-recos-ml-prod.realgraph.tweeting_follows`
), T3 AS (
SELECT
COALESCE(T1.source_id, T2.source_id) AS source_id,
COALESCE(T1.destination_id, T2.destination_id) AS destination_id,
COUNT(DISTINCT(T1.dateHour)) AS num_days,
MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same
COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction,
COUNT(DISTINCT(label)) AS label_types,
COUNTIF(label="num_follows") AS num_follows,
COUNTIF(label="num_favorites") AS num_favorites,
COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks,
COUNTIF(label="num_profile_views") AS num_profile_views,
FROM T1
FULL JOIN T2
USING (source_id, destination_id)
GROUP BY 1,2
ORDER BY 3 DESC,4 DESC
), T4 AS (
SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, *
FROM T3
) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000

View File

@ -1,5 +0,0 @@
(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod`)
ORDER BY importance_gain DESC)
UNION ALL
(SELECT * FROM ML.FEATURE_IMPORTANCE(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`)
ORDER BY importance_gain DESC)

View File

@ -1,28 +0,0 @@
DECLARE date_latest_tweet, date_latest_follows DATE;
SET date_latest_tweet = (
SELECT PARSE_DATE('%Y%m%d', SUBSTRING(MAX(partition_id), 1, 8)) AS partition_id
FROM `twttr-bq-tweetsource-pub-prod.user.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="public_tweets");
SET date_latest_follows = (
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows");
-- tweet count candidate features
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.tweeting_follows`
PARTITION BY ds
AS
WITH tweet_count AS (
SELECT userId, COUNT(userId) AS num_tweets
FROM `twttr-bq-tweetsource-pub-prod.user.public_tweets`
WHERE DATE(ts) BETWEEN DATE_SUB(date_latest_tweet, INTERVAL 3 DAY) AND date_latest_tweet
GROUP BY 1
), all_follows AS (
SELECT F.sourceId AS source_id, F.destinationId AS destination_id, COALESCE(T.num_tweets,0) AS num_tweets,
ROW_NUMBER() OVER (PARTITION BY F.sourceId ORDER BY T.num_tweets DESC) AS rn
FROM `twttr-recos-ml-prod.user_events.valid_user_follows` F
LEFT JOIN tweet_count T
ON F.destinationId=T.userId
WHERE DATE(F._PARTITIONTIME) = date_latest_follows
) SELECT *, date_latest_tweet AS ds FROM all_follows WHERE rn <= 2000
;

View File

@ -1,52 +0,0 @@
DECLARE date_end, date_latest_follows DATE;
SET date_end = (
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
FROM `twttr-bq-cassowary-prod.user.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="interaction_graph_labels_daily"
);
SET date_latest_follows = (
SELECT PARSE_DATE('%Y%m%d', MAX(partition_id)) AS partition_id
FROM `twttr-recos-ml-prod.user_events.INFORMATION_SCHEMA.PARTITIONS`
WHERE partition_id IS NOT NULL AND partition_id != '__NULL__' AND table_name="valid_user_follows");
DELETE
FROM `twttr-recos-ml-prod.realgraph.scores`
WHERE ds = date_end;
-- score candidates (59m)
INSERT INTO `twttr-recos-ml-prod.realgraph.scores`
WITH predicted_scores AS (
SELECT
source_id,
destination_id,
p1.prob AS prob,
p2.prob AS prob_explicit
FROM ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod`,
(
SELECT
*
FROM
`twttr-recos-ml-prod.realgraph.candidates` ) ) S1
CROSS JOIN UNNEST(S1.predicted_label_probs) AS p1
JOIN ML.PREDICT(MODEL `twttr-recos-ml-prod.realgraph.prod_explicit`,
(
SELECT
*
FROM
`twttr-recos-ml-prod.realgraph.candidates` ) ) S2
USING (source_id, destination_id)
CROSS JOIN UNNEST(S2.predicted_label_probs) AS p2
WHERE p1.label=1 AND p2.label=1
)
SELECT
COALESCE(predicted_scores.source_id, tweeting_follows.source_id) AS source_id,
COALESCE(predicted_scores.destination_id, tweeting_follows.destination_id) AS destination_id,
COALESCE(prob, 0.0) AS prob,
COALESCE(prob_explicit, 0.0) AS prob_explicit,
(tweeting_follows.source_id IS NOT NULL) AND (tweeting_follows.destination_id IS NOT NULL) AS followed,
date_end AS ds
FROM
predicted_scores
FULL JOIN
`twttr-recos-ml-prod.realgraph.tweeting_follows` tweeting_follows
USING (source_id, destination_id)

View File

@ -1,60 +0,0 @@
# Training
This folder contains the SQL files that we'll use for training the prod real graph models:
- prod (predicts any interactions the next day)
- prod_explicit (predicts any explicit interactions the next day)
We have 3 steps that take place:
- Candidate generation + feature hydration. This query samples 1% of edges from the `twttr-recos-ml-prod.realgraph.candidates` table, which is already produced daily, and saves them to `twttr-recos-ml-prod.realgraph.candidates_sampled`. We save each day's data according to the statebird batch run date and hence require checks to make sure that the data exists to begin with.
- Label candidates. We join day T's candidates with day T+1's labels while filtering out any negative interactions to get our labeled dataset. We append an additional day's worth of segments for each day. We finally generate the training dataset, which uses all days' labeled data for training, performing negative downsampling to get a roughly 50-50 split of positive to negative labels (see the sketch after this list).
- Training. We use BQML for training our XGBoost models.
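Here is a small sketch of the negative downsampling mentioned above, following the rule in labeled_candidates.sql: with positive rate p = P / (P + N), negatives are kept with probability p and positives with probability 1 - p, so both classes retain about P*N / (P + N) rows in expectation. The object name and counts below are synthetic.
```
import scala.util.Random

object DownsampleSketch {
  // Keep negatives with probability p (the positive rate) and positives with
  // probability 1 - p; both classes then keep ~P*N/(P+N) rows in expectation.
  def keep(label: Int, positiveRate: Double, rng: Random): Boolean =
    if (label == 0) rng.nextDouble() < positiveRate
    else rng.nextDouble() < (1.0 - positiveRate)

  def main(args: Array[String]): Unit = {
    val rng = new Random(42)
    val labels = Seq.fill(9000)(0) ++ Seq.fill(1000)(1) // synthetic: 10% positives
    val positiveRate = labels.sum.toDouble / labels.size
    val kept = labels.filter(keep(_, positiveRate, rng))
    println(s"kept=${kept.size} positives=${kept.sum}") // roughly a 50-50 split
  }
}
```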
## Instructions
To deploy the job, create a zip file, upload it to packer, and then schedule it with aurora.
```
zip -jr real_graph_training src/scala/com/twitter/interaction_graph/bqe/training && \
packer add_version --cluster=atla cassowary real_graph_training real_graph_training.zip
aurora cron schedule atla/cassowary/prod/real_graph_training src/scala/com/twitter/interaction_graph/bqe/training/training.aurora && \
aurora cron start atla/cassowary/prod/real_graph_training
```
# candidates.sql
1. Sets the value of the variable date_candidates to the date of the latest partition of the candidates_for_training table.
2. Creates the candidates_sampled table if it does not already exist, initially populated with 100 rows from the candidates_for_training table.
3. Deletes any existing rows from the candidates_sampled table where the ds column matches the date_candidates value, to avoid double-writing.
4. Inserts a sample of rows into the candidates_sampled table from the candidates_for_training table, where the modulo of the absolute value of the FARM_FINGERPRINT of the concatenation of source_id and destination_id is equal to the value of the $mod_remainder$ variable, and where the ds column matches the date_candidates value.
# check_candidates_exist.sql
This BigQuery script prepares a table of candidates for training a machine learning model. It does the following:
1. Declares two variables, date_start and date_end, that are 30 days apart; date_end is set to the value of the $start_time$ parameter (a Unix timestamp in milliseconds).
2. Creates a table candidates_for_training that is partitioned by ds (date) and populated with data from several other tables in the database. It joins information from tables of user interactions, tweeting, and interaction graph aggregates, filters out negative edge snapshots, calculates some statistics and aggregates them by source_id and destination_id. Then, it ranks each source_id by the number of days and tweets, selects top 2000, and adds date_end as a new column ds.
3. Finally, it selects the ds column from candidates_for_training where ds equals date_end.
Overall, this script prepares a table of up to 2000 candidate pairs per source user, with interaction statistics and label-type counts, which can be used to train a machine learning model for recommendation purposes.
# labeled_candidates.sql
This BQ script does the following:
1. Defines two variables date_candidates and date_labels as dates based on the $start_time$ parameter.
2. Creates the table twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ if it does not already exist, seeded with a single placeholder row of default values.
3. Deletes any prior data in the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table for the current date_candidates.
4. Joins the twttr-recos-ml-prod.realgraph.candidates_sampled table with the twttr-bq-cassowary-prod.user.interaction_graph_labels_daily table and the twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot table. It assigns a label of 1 for positive interactions and 0 for negative interactions, and selects only the rows where there is no negative interaction.
5. Inserts the joined data into the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table.
6. Calculates the positive rate by counting the number of positive labels and dividing it by the total number of labels.
7. Creates a new table twttr-recos-ml-prod.realgraph.train$table_suffix$ by sampling from the twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$ table, with a downsampling of negative examples to balance the number of positive and negative examples, based on the positive rate calculated in step 6.
The resulting twttr-recos-ml-prod.realgraph.train$table_suffix$ table is used as a training dataset for a machine learning model.
# train_model.sql
This BQ command creates or replaces a machine learning model called twttr-recos-ml-prod.realgraph.prod$table_suffix$. The model is a boosted tree classifier, which is used for binary classification problems.
The options provided in the command configure the specific settings for the model, such as the number of parallel trees, the maximum number of iterations, and the data split method. The DATA_SPLIT_METHOD parameter is set to CUSTOM, and DATA_SPLIT_COL is set to if_eval, which means the data will be split into training and evaluation sets based on the if_eval column. The IF function is used to assign a boolean value of true or false to if_eval based on the modulo operation performed on source_id.
The SELECT statement specifies the input data for the model. The columns selected include label (the target variable to be predicted), as well as various features such as num_days, num_tweets, and num_follows that are used to predict the target variable.

View File

@ -1,18 +0,0 @@
-- get latest partition of candidates with data
DECLARE date_candidates DATE;
SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.candidates_sampled` AS
SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training` LIMIT 100;
-- remove previous output snapshot (if exists) to avoid double-writing
DELETE
FROM `twttr-recos-ml-prod.realgraph.candidates_sampled`
WHERE ds = date_candidates;
-- sample from candidates table instead of recomputing features
INSERT INTO `twttr-recos-ml-prod.realgraph.candidates_sampled`
SELECT * FROM `twttr-recos-ml-prod.realgraph.candidates_for_training`
WHERE MOD(ABS(FARM_FINGERPRINT(CONCAT(source_id, '_', destination_id))), 100) = $mod_remainder$
AND ds = date_candidates;

View File

@ -1,43 +0,0 @@
DECLARE date_start, date_end DATE;
SET date_end = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
SET date_start = DATE_SUB(date_end, INTERVAL 30 DAY);
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.candidates_for_training`
PARTITION BY ds
AS
WITH T1 AS (
SELECT source_id, destination_id, label, dateHour
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
LEFT JOIN UNNEST(labels) AS label
WHERE DATE(dateHour) BETWEEN date_start AND date_end
), T2 AS (
SELECT source_id, destination_id, num_tweets
FROM `twttr-recos-ml-prod.realgraph.tweeting_follows`
), T3 AS (
SELECT
COALESCE(T1.source_id, T2.source_id) AS source_id,
COALESCE(T1.destination_id, T2.destination_id) AS destination_id,
COUNT(DISTINCT(T1.dateHour)) AS num_days,
MIN(COALESCE(num_tweets,0)) AS num_tweets, -- all rows' num_tweets should be the same
COALESCE(DATE_DIFF(date_end, DATE(MAX(T1.dateHour)), DAY),30) AS days_since_last_interaction,
COUNT(DISTINCT(label)) AS label_types,
COUNTIF(label="num_follows") AS num_follows,
COUNTIF(label="num_favorites") AS num_favorites,
COUNTIF(label="num_tweet_clicks") AS num_tweet_clicks,
COUNTIF(label="num_profile_views") AS num_profile_views,
FROM T1
FULL JOIN T2
USING (source_id, destination_id)
LEFT JOIN `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot` N
USING (source_id, destination_id)
WHERE N.source_id IS NULL AND N.destination_id IS NULL
GROUP BY 1,2
ORDER BY 3 DESC,4 DESC
), T4 AS (
SELECT RANK() OVER (PARTITION BY source_id ORDER BY num_days DESC, num_tweets DESC) AS rn, *
FROM T3
) SELECT *, date_end AS ds FROM T4 WHERE rn <= 2000;
SELECT ds FROM `twttr-recos-ml-prod.realgraph.candidates_for_training`
WHERE ds = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)))
LIMIT 1

View File

@ -1,4 +0,0 @@
SELECT dateHour FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
WHERE dateHour = (SELECT TIMESTAMP_ADD(TIMESTAMP_MILLIS($start_time$), INTERVAL 1 DAY))
LIMIT 1

View File

@ -1,67 +0,0 @@
-- date_labels is 1 day after date_candidates (which is the current batch run's start date)
DECLARE date_candidates, date_labels DATE;
DECLARE positive_rate FLOAT64;
SET date_candidates = (SELECT DATE(TIMESTAMP_MILLIS($start_time$)));
SET date_labels = DATE_ADD(date_candidates, INTERVAL 1 DAY);
CREATE TABLE IF NOT EXISTS `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$` AS
SELECT
0 AS source_id,
1 AS destination_id,
1 AS label,
1 AS num_days,
1 AS num_tweets,
1 AS num_follows,
1 AS num_favorites,
1 AS num_tweet_clicks,
1 AS num_profile_views,
1 AS days_since_last_interaction,
1 AS label_types,
DATE("2023-01-08") AS ds;
-- delete any prior data to avoid double writing
DELETE
FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
WHERE ds = date_candidates;
-- join labels with candidates with 1 day attribution delay and insert new segment
INSERT INTO `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
WITH label_positive AS (
SELECT source_id, destination_id
FROM `twttr-bq-cassowary-prod.user.interaction_graph_labels_daily`
WHERE DATE(dateHour)=date_labels
), label_negative AS (
SELECT source_id, destination_id
FROM `twttr-bq-cassowary-prod.user.interaction_graph_agg_negative_edge_snapshot`
) SELECT
F.source_id,
F.destination_id,
CASE WHEN P.source_id IS NULL THEN 0 ELSE 1 END AS label,
num_days,
num_tweets,
num_follows,
num_favorites,
num_tweet_clicks,
num_profile_views,
days_since_last_interaction,
label_types,
date_candidates AS ds
FROM `twttr-recos-ml-prod.realgraph.candidates_sampled` F
LEFT JOIN label_positive P USING(source_id, destination_id)
LEFT JOIN label_negative N USING(source_id, destination_id)
WHERE N.source_id IS NULL AND N.destination_id IS NULL
AND F.ds=date_candidates
;
-- get positive rate
SET positive_rate =
(SELECT SUM(label)/COUNT(label) AS pct_positive
FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
);
-- create training dataset with negative downsampling (should get ~50-50 split)
-- this spans over the cumulative date range of the labeled candidates table.
CREATE OR REPLACE TABLE `twttr-recos-ml-prod.realgraph.train$table_suffix$` AS
SELECT * FROM `twttr-recos-ml-prod.realgraph.labeled_candidates$table_suffix$`
WHERE CASE WHEN label = 0 AND RAND() < positive_rate THEN true WHEN label = 1 AND RAND() < (1-positive_rate) THEN true ELSE false END
;

View File

@ -1,27 +0,0 @@
CREATE OR REPLACE MODEL `twttr-recos-ml-prod.realgraph.prod$table_suffix$`
OPTIONS(MODEL_TYPE='BOOSTED_TREE_CLASSIFIER',
BOOSTER_TYPE = 'GBTREE',
NUM_PARALLEL_TREE = 1,
MAX_ITERATIONS = 20,
TREE_METHOD = 'HIST',
EARLY_STOP = TRUE,
SUBSAMPLE = 0.01,
INPUT_LABEL_COLS = ['label'],
DATA_SPLIT_METHOD = 'CUSTOM',
DATA_SPLIT_COL = 'if_eval')
AS SELECT
label,
source_id,
destination_id,
num_days,
num_tweets,
num_follows,
num_favorites,
num_tweet_clicks,
num_profile_views,
days_since_last_interaction,
label_types,
-- partition train/test by source_id's
IF(MOD(ABS(FARM_FINGERPRINT(CAST(source_id AS STRING))), 10) = 0, true, false) AS if_eval,
FROM `twttr-recos-ml-prod.realgraph.train$table_suffix$`
;

View File

@ -1,25 +0,0 @@
scala_library(
name = "user_session_inj",
sources = ["UserSessionInjection.scala"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/bijection:scrooge",
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/thrift/com/twitter/user_session_store:thrift-scala",
],
)
scala_library(
name = "edge_list_injection",
sources = ["EdgeListInjection.scala"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/bijection:scrooge",
"src/scala/com/twitter/scalding_internal/multiformat/format",
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)

View File

@ -1,14 +0,0 @@
package com.twitter.interaction_graph.injection
import com.twitter.interaction_graph.thriftscala.EdgeList
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
object EdgeListInjection {
final val injection: KeyValInjection[Long, EdgeList] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(EdgeList)
)
}

View File

@ -1,14 +0,0 @@
package com.twitter.interaction_graph.injection
import com.twitter.user_session_store.thriftscala.UserSession
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.ScalaCompactThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian
object UserSessionInjection {
final val injection: KeyValInjection[Long, UserSession] =
KeyValInjection(
Long2BigEndian,
ScalaCompactThrift(UserSession)
)
}

View File

@ -1,7 +0,0 @@
# Interaction Graph
This folder contains the code used in the offline pipeline for real graph v2.
The ETL jobs are contained in folders prefixed with `agg_*`, while the jobs powering the ML pipeline are in the `ml` folder.
Note that the jobs in the ml folder are mostly ETL jobs; the main training and scoring happens within BQML.

View File

@ -1,62 +0,0 @@
scala_library(
name = "agg_address_book",
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":interaction_graph_agg_address_book_edge_snapshot-scala",
":interaction_graph_agg_address_book_vertex_snapshot-scala",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"addressbook/jobs/src/main/scala/com/twitter/addressbook/jobs/simplematches:simple_user_matches-scala",
"beam-internal/src/main/scala/com/twitter/beam/io/dal",
"beam-internal/src/main/scala/com/twitter/scio_internal/job",
"beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
"consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read",
"src/scala/com/twitter/interaction_graph/scio/common",
],
)
jvm_binary(
name = "interaction_graph_address_book_scio",
main = "com.twitter.interaction_graph.scio.agg_address_book.InteractionGraphAddressBookJob",
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":agg_address_book",
],
)
create_datasets(
base_name = "interaction_graph_agg_address_book_edge_snapshot",
description = "User-user directed edges with addressbook features",
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "interaction_graph_agg_address_book_vertex_snapshot",
description = "User vertex with addressbook features",
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)

View File

@ -1,34 +0,0 @@
package com.twitter.interaction_graph.scio.agg_address_book
import com.spotify.scio.ScioMetrics
import org.apache.beam.sdk.metrics.Counter
trait InteractionGraphAddressBookCountersTrait {
val Namespace = "Interaction Graph Address Book"
def emailFeatureInc(): Unit
def phoneFeatureInc(): Unit
def bothFeatureInc(): Unit
}
/**
* SCIO counters are used to gather run time statistics
*/
case object InteractionGraphAddressBookCounters extends InteractionGraphAddressBookCountersTrait {
val emailFeatureCounter: Counter =
ScioMetrics.counter(Namespace, "Email Feature")
val phoneFeatureCounter: Counter =
ScioMetrics.counter(Namespace, "Phone Feature")
val bothFeatureCounter: Counter =
ScioMetrics.counter(Namespace, "Both Feature")
override def emailFeatureInc(): Unit = emailFeatureCounter.inc()
override def phoneFeatureInc(): Unit = phoneFeatureCounter.inc()
override def bothFeatureInc(): Unit = bothFeatureCounter.inc()
}

View File

@ -1,71 +0,0 @@
package com.twitter.interaction_graph.scio.agg_address_book
import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
import com.twitter.beam.io.dal.DAL
import com.twitter.beam.io.dal.DAL.DiskFormat
import com.twitter.beam.io.dal.DAL.PathLayout
import com.twitter.beam.io.dal.DAL.WriteOptions
import com.twitter.beam.job.ServiceIdentifierOptions
import com.twitter.scio_internal.job.ScioBeamJob
import com.twitter.statebird.v2.thriftscala.Environment
import com.twitter.interaction_graph.thriftscala.Edge
import com.twitter.interaction_graph.thriftscala.Vertex
import java.time.Instant
import org.joda.time.Interval
object InteractionGraphAddressBookJob extends ScioBeamJob[InteractionGraphAddressBookOption] {
override protected def configurePipeline(
scioContext: ScioContext,
pipelineOptions: InteractionGraphAddressBookOption
): Unit = {
@transient
implicit lazy val sc: ScioContext = scioContext
implicit lazy val dateInterval: Interval = pipelineOptions.interval
implicit lazy val addressBookCounters: InteractionGraphAddressBookCountersTrait =
InteractionGraphAddressBookCounters
val interactionGraphAddressBookSource = InteractionGraphAddressBookSource(pipelineOptions)
val addressBook: SCollection[UserMatchesRecord] =
interactionGraphAddressBookSource.readSimpleUserMatches(
dateInterval.withStart(dateInterval.getStart.minusDays(3))
)
val (vertex, edges) = InteractionGraphAddressBookUtil.process(addressBook)
val dalEnvironment: String = pipelineOptions
.as(classOf[ServiceIdentifierOptions])
.getEnvironment()
val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) {
pipelineOptions.getDALWriteEnvironment
} else {
dalEnvironment
}
vertex.saveAsCustomOutput(
"Write Vertex Records",
DAL.writeSnapshot[Vertex](
InteractionGraphAggAddressBookVertexSnapshotScalaDataset,
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_vertex_daily"),
Instant.ofEpochMilli(dateInterval.getEndMillis),
DiskFormat.Parquet,
Environment.valueOf(dalWriteEnvironment),
writeOption =
WriteOptions(numOfShards = Some((pipelineOptions.getNumberOfShards / 16.0).ceil.toInt))
)
)
edges.saveAsCustomOutput(
"Write Edge Records",
DAL.writeSnapshot[Edge](
InteractionGraphAggAddressBookEdgeSnapshotScalaDataset,
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/address_book_edge_daily"),
Instant.ofEpochMilli(dateInterval.getEndMillis),
DiskFormat.Parquet,
Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
)
)
}
}

View File

@ -1,24 +0,0 @@
package com.twitter.interaction_graph.scio.agg_address_book
import com.twitter.beam.io.dal.DALOptions
import com.twitter.beam.job.DateRangeOptions
import org.apache.beam.sdk.options.Default
import org.apache.beam.sdk.options.Description
import org.apache.beam.sdk.options.Validation.Required
trait InteractionGraphAddressBookOption extends DALOptions with DateRangeOptions {
@Required
@Description("Output path for storing the final dataset")
def getOutputPath: String
def setOutputPath(value: String): Unit
@Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
@Default.String("PROD")
def getDALWriteEnvironment: String
def setDALWriteEnvironment(value: String): Unit
@Description("Number of shards/partitions for saving the final dataset.")
@Default.Integer(16)
def getNumberOfShards: Integer
def setNumberOfShards(value: Integer): Unit
}

View File

@ -1,28 +0,0 @@
package com.twitter.interaction_graph.scio.agg_address_book
import com.spotify.scio.ScioContext
import com.spotify.scio.values.SCollection
import com.twitter.addressbook.jobs.simplematches.SimpleUserMatchesScalaDataset
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
import com.twitter.beam.job.ServiceIdentifierOptions
import com.twitter.cde.scio.dal_read.SourceUtil
import org.joda.time.Interval
case class InteractionGraphAddressBookSource(
pipelineOptions: InteractionGraphAddressBookOption
)(
implicit sc: ScioContext,
) {
val dalEnvironment: String = pipelineOptions
.as(classOf[ServiceIdentifierOptions])
.getEnvironment()
def readSimpleUserMatches(
dateInterval: Interval
): SCollection[UserMatchesRecord] = {
SourceUtil.readMostRecentSnapshotDALDataset[UserMatchesRecord](
SimpleUserMatchesScalaDataset,
dateInterval,
dalEnvironment)
}
}

View File

@ -1,93 +0,0 @@
package com.twitter.interaction_graph.scio.agg_address_book
import com.spotify.scio.values.SCollection
import com.twitter.addressbook.matches.thriftscala.UserMatchesRecord
import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil
import com.twitter.interaction_graph.scio.common.InteractionGraphRawInput
import com.twitter.interaction_graph.thriftscala.Edge
import com.twitter.interaction_graph.thriftscala.FeatureName
import com.twitter.interaction_graph.thriftscala.Vertex
object InteractionGraphAddressBookUtil {
val EMAIL = "email"
val PHONE = "phone"
val BOTH = "both"
val DefaultAge = 1
val DefaultFeatureValue = 1.0
def process(
addressBook: SCollection[UserMatchesRecord]
)(
implicit addressBookCounters: InteractionGraphAddressBookCountersTrait
): (SCollection[Vertex], SCollection[Edge]) = {
// First construct data keyed by (src, dst, name), where name can be "email", "phone", or "both"
val addressBookTypes: SCollection[((Long, Long), String)] = addressBook.flatMap { record =>
record.forwardMatches.toSeq.flatMap { matchDetails =>
val matchedUsers = (record.userId, matchDetails.userId)
(matchDetails.matchedByEmail, matchDetails.matchedByPhone) match {
case (true, true) =>
Seq((matchedUsers, EMAIL), (matchedUsers, PHONE), (matchedUsers, BOTH))
case (true, false) => Seq((matchedUsers, EMAIL))
case (false, true) => Seq((matchedUsers, PHONE))
case _ => Seq.empty
}
}
}
// Then construct the input data for feature calculation
val addressBookFeatureInput: SCollection[InteractionGraphRawInput] = addressBookTypes
.map {
case ((src, dst), name) =>
if (src < dst)
((src, dst, name), false)
else
((dst, src, name), true)
}.groupByKey
.flatMap {
case ((src, dst, name), iterator) =>
val isReversedValues = iterator.toSeq
// check if (src, dst) is mutual follow
val isMutualFollow = isReversedValues.size == 2
// get correct srcId and dstId if there is no mutual follow and they are reversed
val (srcId, dstId) = {
if (!isMutualFollow && isReversedValues.head)
(dst, src)
else
(src, dst)
}
// get the feature name and mutual follow name
val (featureName, mfFeatureName) = name match {
case EMAIL =>
addressBookCounters.emailFeatureInc()
(FeatureName.AddressBookEmail, FeatureName.AddressBookMutualEdgeEmail)
case PHONE =>
addressBookCounters.phoneFeatureInc()
(FeatureName.AddressBookPhone, FeatureName.AddressBookMutualEdgePhone)
case BOTH =>
addressBookCounters.bothFeatureInc()
(FeatureName.AddressBookInBoth, FeatureName.AddressBookMutualEdgeInBoth)
}
// construct the raw inputs for feature calculation
if (isMutualFollow) {
Iterator(
InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue),
InteractionGraphRawInput(dstId, srcId, featureName, DefaultAge, DefaultFeatureValue),
InteractionGraphRawInput(
srcId,
dstId,
mfFeatureName,
DefaultAge,
DefaultFeatureValue),
InteractionGraphRawInput(dstId, srcId, mfFeatureName, DefaultAge, DefaultFeatureValue)
)
} else {
Iterator(
InteractionGraphRawInput(srcId, dstId, featureName, DefaultAge, DefaultFeatureValue))
}
}
// Calculate the Features
FeatureGeneratorUtil.getFeatures(addressBookFeatureInput)
}
}

View File

@ -1,34 +0,0 @@
## InteractionGraphAddressBook Dataflow Job
#### IntelliJ
```
./bazel idea src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
```
#### Compile
```
./bazel build src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
```
#### Build Jar
```
./bazel bundle src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_address_book_scio
```
#### Run Scheduled Job
```
export PROJECTID=twttr-recos-ml-prod
export REGION=us-central1
export JOB_NAME=interaction-graph-address-book-dataflow
bin/d6w schedule \
${PROJECTID}/${REGION}/${JOB_NAME} \
src/scala/com/twitter/interaction_graph/scio/agg_address_book/config.d6w \
--bind=profile.user_name=cassowary \
--bind=profile.project=${PROJECTID} \
--bind=profile.region=${REGION} \
--bind=profile.job_name=${JOB_NAME} \
--bind=profile.environment=prod \
--bind=profile.date=2022-04-13 \
--bind=profile.output_path=processed/interaction_graph_agg_address_book_dataflow
```

View File

@ -1,175 +0,0 @@
scala_library(
name = "agg_all",
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":interaction_graph_history_aggregated_raw_edge_daily-scala",
":interaction_graph_history_aggregated_vertex_daily-scala",
":interaction_graph_aggregated_edge_daily-scala",
":interaction_graph_aggregated_vertex_daily-scala",
":interaction_graph_history_aggregated_edge_snapshot-scala",
":interaction_graph_history_aggregated_vertex_snapshot-scala",
":real_graph_features-scala",
"beam-internal/src/main/scala/com/twitter/beam/io/dal",
"beam-internal/src/main/scala/com/twitter/scio_internal/job",
"beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow",
"consumer-data-tools/src/main/scala/com/twitter/cde/scio/dal_read",
"src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_edge_snapshot-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_address_book:interaction_graph_agg_address_book_vertex_snapshot-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_edge_daily-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_client_event_logs:interaction_graph_agg_client_event_logs_vertex_daily-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_edge_daily-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_direct_interactions:interaction_graph_agg_direct_interactions_vertex_daily-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_edge_snapshot-scala",
"src/scala/com/twitter/interaction_graph/scio/agg_flock:interaction_graph_agg_flock_vertex_snapshot-scala",
"src/scala/com/twitter/interaction_graph/scio/common",
"src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_in_scores-scala",
"src/scala/com/twitter/interaction_graph/scio/ml/scores:real_graph_oon_scores-scala",
"src/scala/com/twitter/wtf/dataflow/user_events:valid_user_follows-scala",
"src/thrift/com/twitter/wtf/candidate:wtf-candidate-scala",
"tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam",
"usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala",
],
)
create_datasets(
base_name = "interaction_graph_history_aggregated_raw_edge_daily",
description = "User-user directed edges with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
segment_type = "partitioned",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "interaction_graph_history_aggregated_vertex_daily",
description = "User vertex with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
segment_type = "partitioned",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
jvm_binary(
name = "interaction_graph_aggregation_job_scio",
main = "com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationJob",
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
":agg_all",
],
)
create_datasets(
base_name = "interaction_graph_history_aggregated_edge_snapshot",
description = "User-user directed edges with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "interaction_graph_history_aggregated_vertex_snapshot",
description = "User vertex with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
segment_type = "snapshot",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "interaction_graph_aggregated_edge_daily",
description = "User-user directed edges with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Edge",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Edge",
segment_type = "partitioned",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "interaction_graph_aggregated_vertex_daily",
description = "User vertex with all features",
java_schema = "com.twitter.interaction_graph.thriftjava.Vertex",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.thriftscala.Vertex",
segment_type = "partitioned",
tags = ["bazel-compatible"],
java_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-java",
],
scala_dependencies = [
"src/thrift/com/twitter/interaction_graph:interaction_graph-scala",
],
)
create_datasets(
base_name = "real_graph_features",
key_type = "Long",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.injection.UserSessionInjection.injection",
segment_type = "snapshot",
tags = ["bazel-compatible"],
val_type = "com.twitter.user_session_store.thriftscala.UserSession",
scala_dependencies = [
"src/scala/com/twitter/interaction_graph/injection:user_session_inj",
],
)
create_datasets(
base_name = "home_light_ranker_top_k_real_graph_features",
key_type = "Long",
platform = "java8",
role = "cassowary",
scala_schema = "com.twitter.interaction_graph.injection.EdgeListInjection.injection",
segment_type = "snapshot",
tags = ["bazel-compatible"],
val_type = "com.twitter.interaction_graph.thriftscala.EdgeList",
scala_dependencies = [
"src/scala/com/twitter/interaction_graph/injection:edge_list_injection",
],
)

View File

@ -1,14 +0,0 @@
package com.twitter.interaction_graph.scio.agg_all
object InteractionGraphScoringConfig {
/**
* This is alpha for a variant of the Exponentially weighted moving average, computed as:
* ewma_{t+1} = x_{t+1} + (1-alpha) * ewma_t (ewma_1 = x_1, t > 0)
* We choose alpha such that the half life of weights is 7 days.
* Note that we don't down-weight x_{t+1} (unlike in EWMA) as we only want to decay actions
* as they grow old, not compute the average value.
*/
val ALPHA = 1.0
val ONE_MINUS_ALPHA = 0.955
}

View File

@ -1,314 +0,0 @@
package com.twitter.interaction_graph.scio.agg_all
import com.google.cloud.bigquery.BigQueryOptions
import com.google.cloud.bigquery.QueryJobConfiguration
import com.spotify.scio.ScioContext
import com.spotify.scio.ScioMetrics
import com.spotify.scio.values.SCollection
import com.twitter.beam.io.dal.DAL
import com.twitter.beam.io.dal.DAL.DiskFormat
import com.twitter.beam.io.dal.DAL.PathLayout
import com.twitter.beam.io.dal.DAL.WriteOptions
import com.twitter.beam.io.exception.DataNotFoundException
import com.twitter.beam.job.ServiceIdentifierOptions
import com.twitter.interaction_graph.scio.agg_all.InteractionGraphAggregationTransform._
import com.twitter.interaction_graph.scio.common.DateUtil
import com.twitter.interaction_graph.scio.common.FeatureGeneratorUtil
import com.twitter.interaction_graph.scio.common.UserUtil
import com.twitter.interaction_graph.thriftscala.Edge
import com.twitter.interaction_graph.thriftscala.Vertex
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.scio_internal.job.ScioBeamJob
import com.twitter.statebird.v2.thriftscala.Environment
import com.twitter.user_session_store.thriftscala.UserSession
import com.twitter.util.Duration
import com.twitter.wtf.candidate.thriftscala.ScoredEdge
import java.time.Instant
import org.apache.avro.generic.GenericRecord
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord
import org.apache.beam.sdk.transforms.SerializableFunction
import org.joda.time.Interval
import scala.collection.JavaConverters._
object InteractionGraphAggregationJob extends ScioBeamJob[InteractionGraphAggregationOption] {
// to parse latest date from the BQ table we're reading from
val parseDateRow = new SerializableFunction[SchemaAndRecord, String] {
override def apply(input: SchemaAndRecord): String = {
val genericRecord: GenericRecord = input.getRecord()
genericRecord.get("ds").toString
}
}
// note that we're using the prob_explicit for real_graph_features (for Home)
val parseRow = new SerializableFunction[SchemaAndRecord, ScoredEdge] {
override def apply(record: SchemaAndRecord): ScoredEdge = {
val genericRecord: GenericRecord = record.getRecord()
ScoredEdge(
genericRecord.get("source_id").asInstanceOf[Long],
genericRecord.get("destination_id").asInstanceOf[Long],
genericRecord.get("prob_explicit").asInstanceOf[Double],
genericRecord.get("followed").asInstanceOf[Boolean],
)
}
}
override def runPipeline(
sc: ScioContext,
opts: InteractionGraphAggregationOption
): Unit = {
val dateStr: String = opts.getDate().value.getStart.toString("yyyyMMdd")
logger.info(s"dateStr $dateStr")
val project: String = "twttr-recos-ml-prod"
val datasetName: String = "realgraph"
val bqTableName: String = "scores"
val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
if (opts.getDALWriteEnvironment.toLowerCase == "prod") {
val bqClient =
BigQueryOptions.newBuilder.setProjectId(project).build.getService
val query =
s"""
|SELECT total_rows
|FROM `$project.$datasetName.INFORMATION_SCHEMA.PARTITIONS`
|WHERE partition_id ="$dateStr" AND
|table_name="$bqTableName" AND total_rows > 0
|""".stripMargin
val queryConfig = QueryJobConfiguration.of(query)
val results = bqClient.query(queryConfig).getValues.asScala.toSeq
if (results.isEmpty || results.head.get(0).getLongValue == 0) {
throw new DataNotFoundException(s"$dateStr not present in $fullBqTableName.")
}
}
sc.run()
}
override protected def configurePipeline(
scioContext: ScioContext,
pipelineOptions: InteractionGraphAggregationOption
): Unit = {
@transient
implicit lazy val sc: ScioContext = scioContext
implicit lazy val dateInterval: Interval = pipelineOptions.interval
val yesterday = DateUtil.subtract(dateInterval, Duration.fromDays(1))
val dalEnvironment: String = pipelineOptions
.as(classOf[ServiceIdentifierOptions])
.getEnvironment()
val dalWriteEnvironment = if (pipelineOptions.getDALWriteEnvironment != null) {
pipelineOptions.getDALWriteEnvironment
} else {
dalEnvironment
}
val dateStr: String = pipelineOptions.getDate().value.getStart.toString("yyyy-MM-dd")
logger.info(s"dateStr $dateStr")
val project: String = "twttr-recos-ml-prod"
val datasetName: String = "realgraph"
val bqTableName: String = "scores"
val fullBqTableName: String = s"$project:$datasetName.$bqTableName"
val scoreExport: SCollection[ScoredEdge] =
sc.customInput(
s"Read from BQ table $fullBqTableName",
BigQueryIO
.read(parseRow)
.fromQuery(s"""SELECT source_id, destination_id, prob_explicit, followed
|FROM `$project.$datasetName.$bqTableName`
|WHERE ds = '$dateStr'""".stripMargin)
.usingStandardSql()
.withMethod(TypedRead.Method.DEFAULT)
)
val source = InteractionGraphAggregationSource(pipelineOptions)
val (addressEdgeFeatures, addressVertexFeatures) = source.readAddressBookFeatures()
val (clientEventLogsEdgeFeatures, clientEventLogsVertexFeatures) =
source.readClientEventLogsFeatures(dateInterval)
val (flockEdgeFeatures, flockVertexFeatures) = source.readFlockFeatures()
val (directInteractionsEdgeFeatures, directInteractionsVertexFeatures) =
source.readDirectInteractionsFeatures(dateInterval)
val invalidUsers = UserUtil.getInvalidUsers(source.readFlatUsers())
val (prevAggEdge, prevAggVertex) = source.readAggregatedFeatures(yesterday)
val prevAggregatedVertex: SCollection[Vertex] =
UserUtil
.filterUsersByIdMapping[Vertex](
prevAggVertex,
invalidUsers,
v => v.userId
)
/** Remove status-based features (flock/ab) from current graph, because we only need the latest
* This is to allow us to filter and roll-up a smaller dataset, to which we will still add
* back the status-based features for the complete scoredAggregates (that other teams will read).
*/
val prevAggEdgeFiltered = prevAggEdge
.filter { e =>
e.sourceId != e.destinationId
}
.withName("filtering status-based edges")
.flatMap(FeatureGeneratorUtil.removeStatusFeatures)
val prevAggEdgeValid: SCollection[Edge] =
UserUtil
.filterUsersByMultipleIdMappings[Edge](
prevAggEdgeFiltered,
invalidUsers,
Seq(e => e.sourceId, e => e.destinationId)
)
val aggregatedActivityVertexDaily = UserUtil
.filterUsersByIdMapping[Vertex](
FeatureGeneratorUtil
.combineVertexFeatures(
clientEventLogsVertexFeatures ++
directInteractionsVertexFeatures ++
addressVertexFeatures ++
flockVertexFeatures
),
invalidUsers,
v => v.userId
)
// we split up the roll-up of decayed counts between status vs activity/count-based features
val aggregatedActivityEdgeDaily = FeatureGeneratorUtil
.combineEdgeFeatures(clientEventLogsEdgeFeatures ++ directInteractionsEdgeFeatures)
    // Vertex level: add the decayed sums of the history and daily features.
val aggregatedActivityVertex = FeatureGeneratorUtil
.combineVertexFeaturesWithDecay(
prevAggregatedVertex,
aggregatedActivityVertexDaily,
InteractionGraphScoringConfig.ONE_MINUS_ALPHA,
InteractionGraphScoringConfig.ALPHA
)
    // Edge level: add the decayed sums of the history and daily features.
val aggregatedActivityEdge = FeatureGeneratorUtil
.combineEdgeFeaturesWithDecay(
prevAggEdgeValid,
aggregatedActivityEdgeDaily,
InteractionGraphScoringConfig.ONE_MINUS_ALPHA,
InteractionGraphScoringConfig.ALPHA
)
.filter(FeatureGeneratorUtil.edgeWithFeatureOtherThanDwellTime)
.withName("removing edges that only have dwell time features")
val edgeKeyedScores = scoreExport.keyBy { e => (e.sourceId, e.destinationId) }
val scoredAggregatedActivityEdge = aggregatedActivityEdge
.keyBy { e => (e.sourceId, e.destinationId) }
.withName("join with scores")
.leftOuterJoin(edgeKeyedScores)
.map {
case (_, (e, scoredEdgeOpt)) =>
val scoreOpt = scoredEdgeOpt.map(_.score)
e.copy(weight = if (scoreOpt.nonEmpty) {
ScioMetrics.counter("after joining edge with scores", "has score").inc()
scoreOpt
} else {
ScioMetrics.counter("after joining edge with scores", "no score").inc()
None
})
}
val combinedFeatures = FeatureGeneratorUtil
.combineEdgeFeatures(aggregatedActivityEdge ++ addressEdgeFeatures ++ flockEdgeFeatures)
.keyBy { e => (e.sourceId, e.destinationId) }
val aggregatedActivityScoredEdge =
edgeKeyedScores
.withName("join with combined edge features")
.leftOuterJoin(combinedFeatures)
.map {
case (_, (scoredEdge, combinedFeaturesOpt)) =>
if (combinedFeaturesOpt.exists(_.features.nonEmpty)) {
ScioMetrics.counter("after joining scored edge with features", "has features").inc()
Edge(
sourceId = scoredEdge.sourceId,
destinationId = scoredEdge.destinationId,
weight = Some(scoredEdge.score),
features = combinedFeaturesOpt.map(_.features).getOrElse(Nil)
)
} else {
ScioMetrics.counter("after joining scored edge with features", "no features").inc()
Edge(
sourceId = scoredEdge.sourceId,
destinationId = scoredEdge.destinationId,
weight = Some(scoredEdge.score),
features = Nil
)
}
}
val realGraphFeatures =
getTopKTimelineFeatures(aggregatedActivityScoredEdge, pipelineOptions.getMaxDestinationIds)
aggregatedActivityVertex.saveAsCustomOutput(
"Write History Aggregated Vertex Records",
DAL.writeSnapshot[Vertex](
dataset = InteractionGraphHistoryAggregatedVertexSnapshotScalaDataset,
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex"),
endDate = Instant.ofEpochMilli(dateInterval.getEndMillis),
diskFormat = DiskFormat.Parquet,
environmentOverride = Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10))
)
)
scoredAggregatedActivityEdge.saveAsCustomOutput(
"Write History Aggregated Edge Records",
DAL.writeSnapshot[Edge](
dataset = InteractionGraphHistoryAggregatedEdgeSnapshotScalaDataset,
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_raw_edge"),
endDate = Instant.ofEpochMilli(dateInterval.getEndMillis),
diskFormat = DiskFormat.Parquet,
environmentOverride = Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
)
)
aggregatedActivityVertexDaily.saveAsCustomOutput(
"Write Daily Aggregated Vertex Records",
DAL.write[Vertex](
dataset = InteractionGraphAggregatedVertexDailyScalaDataset,
pathLayout =
PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_vertex_daily"),
interval = dateInterval,
diskFormat = DiskFormat.Parquet,
environmentOverride = Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards / 10))
)
)
aggregatedActivityEdgeDaily.saveAsCustomOutput(
"Write Daily Aggregated Edge Records",
DAL.write[Edge](
dataset = InteractionGraphAggregatedEdgeDailyScalaDataset,
pathLayout = PathLayout.DailyPath(pipelineOptions.getOutputPath + "/aggregated_edge_daily"),
interval = dateInterval,
diskFormat = DiskFormat.Parquet,
environmentOverride = Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
)
)
realGraphFeatures.saveAsCustomOutput(
"Write Timeline Real Graph Features",
DAL.writeVersionedKeyVal[KeyVal[Long, UserSession]](
dataset = RealGraphFeaturesScalaDataset,
pathLayout =
PathLayout.VersionedPath(pipelineOptions.getOutputPath + "/real_graph_features"),
environmentOverride = Environment.valueOf(dalWriteEnvironment),
writeOption = WriteOptions(numOfShards = Some(pipelineOptions.getNumberOfShards))
)
)
}
}
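The history roll-up above folds yesterday's aggregates into today's dailies via FeatureGeneratorUtil.combineVertexFeaturesWithDecay / combineEdgeFeaturesWithDecay, parameterized by InteractionGraphScoringConfig.ALPHA and ONE_MINUS_ALPHA. Those utilities are not part of this diff, so the snippet below is only a minimal sketch of the assumed update rule: a standard exponentially weighted sum in which the historical total is discounted by ONE_MINUS_ALPHA and the daily count enters with weight ALPHA. The object name, constant values, and sample counts are all illustrative, not taken from the real implementation.

// Minimal sketch of the assumed decayed roll-up; not the actual FeatureGeneratorUtil code.
object DecayedRollupSketch {
  // Placeholder constants standing in for InteractionGraphScoringConfig.ALPHA / ONE_MINUS_ALPHA.
  val Alpha: Double = 0.2
  val OneMinusAlpha: Double = 1.0 - Alpha

  // One decayed update: the running total is discounted and today's count is blended in.
  def decayedSum(history: Double, daily: Double): Double =
    OneMinusAlpha * history + Alpha * daily

  def main(args: Array[String]): Unit = {
    // Three consecutive days of counts for a single (source, destination) edge feature.
    val dailyCounts = Seq(5.0, 0.0, 2.0)
    val rolledUp = dailyCounts.foldLeft(0.0)(decayedSum)
    println(f"decayed count after 3 days: $rolledUp%.4f") // inactive days shrink the total
  }
}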

View File

@ -1,36 +0,0 @@
package com.twitter.interaction_graph.scio.agg_all
import com.twitter.beam.io.dal.DALOptions
import com.twitter.beam.job.DateRangeOptions
import org.apache.beam.sdk.options.Default
import org.apache.beam.sdk.options.Description
import org.apache.beam.sdk.options.Validation.Required
trait InteractionGraphAggregationOption extends DALOptions with DateRangeOptions {
@Required
@Description("Output path for storing the final dataset")
def getOutputPath: String
def setOutputPath(value: String): Unit
@Description("Indicates DAL write environment. Can be set to dev/stg during local validation")
@Default.String("PROD")
def getDALWriteEnvironment: String
def setDALWriteEnvironment(value: String): Unit
@Description("Number of shards/partitions for saving the final dataset.")
@Default.Integer(16)
def getNumberOfShards: Integer
def setNumberOfShards(value: Integer): Unit
@Description("BQ Table name for reading scores from")
def getBqTableName: String
def setBqTableName(value: String): Unit
@Description("max destination ids that we will store for real graph features in TL")
def getMaxDestinationIds: Integer
def setMaxDestinationIds(value: Integer): Unit
@Description("true if getting scores from BQ instead of DAL-based dataset in GCS")
def getScoresFromBQ: Boolean
def setScoresFromBQ(value: Boolean): Unit
}
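Since the trait ultimately extends Beam PipelineOptions (through DALOptions and DateRangeOptions), its getters correspond to command-line flags under Beam's standard PipelineOptionsFactory parsing. The sketch below is illustrative only: the flag values are placeholders, validation is skipped, and the additional flags required by the parent interfaces (such as the date range) are omitted.

// Illustrative only: placeholder flag values, no validation, parent-interface flags omitted.
import org.apache.beam.sdk.options.PipelineOptionsFactory

object InteractionGraphAggregationOptionExample {
  def main(args: Array[String]): Unit = {
    val sampleArgs = Array(
      "--outputPath=gs://example-bucket/interaction_graph/agg_all", // placeholder path
      "--numberOfShards=16",
      "--bqTableName=scores",
      "--maxDestinationIds=500",
      "--scoresFromBQ=true"
    )
    // Register the interface so unknown-flag checks recognize these properties, then bind.
    PipelineOptionsFactory.register(classOf[InteractionGraphAggregationOption])
    val opts = PipelineOptionsFactory
      .fromArgs(sampleArgs: _*)
      .as(classOf[InteractionGraphAggregationOption])
    println(s"outputPath=${opts.getOutputPath} shards=${opts.getNumberOfShards}")
  }
}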

Some files were not shown because too many files have changed in this diff.