diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.docx new file mode 100644 index 000000000..fc15d9491 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.sql b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.sql deleted file mode 100644 index 56b0f73a8..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql/user_video_tweet_fav_engagement_generation.sql +++ /dev/null @@ -1,69 +0,0 @@ -WITH - vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date, - ), - - -- Get raw user-tweet interaction events from UUA (We will use fav engagements here) - raw_engagements AS ( - SELECT - userIdentifier.userId AS userId, - eventMetadata.sourceTimestampMs AS tsMillis, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN {CONTRIBUTING_ACTION_TWEET_ID_COLUMN} - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN {UNDO_ACTION_TWEET_ID_COLUMN} - END AS tweetId, - CASE - WHEN actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) THEN 1 - WHEN actionType IN ({UNDO_ACTION_TYPES_STR}) THEN -1 - END AS doOrUndo - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(vars.start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(vars.end_date) - AND (actionType IN ({CONTRIBUTING_ACTION_TYPES_STR}) - OR actionType IN ({UNDO_ACTION_TYPES_STR})) - ), - - -- Get video tweet ids - video_tweet_ids AS ( - WITH vars AS ( - SELECT - TIMESTAMP("{START_TIME}") AS start_date, - TIMESTAMP("{END_TIME}") AS end_date - ), - - -- Get raw user-tweet interaction events from UUA - video_view_engagements AS ( - SELECT item.tweetInfo.actionTweetId AS tweetId - FROM `twttr-bql-unified-prod.unified_user_actions_engagements.streaming_unified_user_actions_engagements`, vars - WHERE (DATE(dateHour) >= DATE(vars.start_date) AND DATE(dateHour) <= DATE(vars.end_date)) - AND eventMetadata.sourceTimestampMs >= UNIX_MILLIS(start_date) - AND eventMetadata.sourceTimestampMs <= UNIX_MILLIS(end_date) - AND (actionType IN ("ClientTweetVideoPlayback50") - OR actionType IN ("ClientTweetVideoPlayback95")) - ) - - SELECT DISTINCT(tweetId) - FROM video_view_engagements - ), - - -- Join video tweet ids - video_tweets_engagements AS ( - SELECT raw_engagements.* - FROM raw_engagements JOIN video_tweet_ids USING(tweetId) - ), - - -- Group by userId and tweetId - user_tweet_engagement_pairs AS ( - SELECT userId, tweetId, ARRAY_AGG(STRUCT(doOrUndo, tsMillis) ORDER BY tsMillis DESC LIMIT 1) AS details, COUNT(*) AS cnt - FROM video_tweets_engagements - GROUP BY userId, tweetId - ) - --- Remove undo events -SELECT userId, tweetId, CAST(dt.tsMillis AS FLOAT64) AS tsMillis -FROM user_tweet_engagement_pairs, vars -CROSS JOIN UNNEST(details) AS dt -WHERE dt.doOrUndo = 1 diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD deleted file mode 100644 index 43135fdf9..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD +++ /dev/null @@ -1,110 +0,0 @@ -scala_library( - name = "bq_generation", - sources = [ - "**/*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/job", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_interested_in_20M_145K_2020-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_0_EL_15-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_15-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_50-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_8_EL_50-scala", - "src/scala/com/twitter/simclusters_v2/hdfs_sources:offline_tweet_recommendations_from_mts_consumer_embeddings-scala", - "src/scala/com/twitter/simclusters_v2/scio/bq_generation/common", - "src/scala/com/twitter/simclusters_v2/scio/bq_generation/sql", - "src/scala/com/twitter/wtf/beam/bq_embedding_export:bq_embedding_export_lib", - "tcdc/bq_blaster/src/main/scala/com/twitter/tcdc/bqblaster/beam", - ], -) - -jvm_binary( - name = "iikf-tweets-ann-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020TweetsANNBQAdhocJob", - platform = "java8", - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-hl-8-el-50-tweets-ann-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020Hl8El50TweetsANNBQAdhocJob", - platform = "java8", - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020TweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-hl-0-el-15-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020Hl0El15TweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-hl-2-el-15-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020Hl2El15TweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-hl-2-el-50-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020Hl2El50TweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "iikf-hl-8-el-50-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.IIKF2020Hl8El50TweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "mts-consumer-embeddings-tweets-ann-adhoc-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.MTSConsumerEmbeddingsTweetsANNBQAdhocJob", - platform = "java8", - dependencies = [ - ":bq_generation", - ], -) - -jvm_binary( - name = "mts-consumer-embeddings-tweets-ann-batch-job", - main = "com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.MTSConsumerEmbeddingsTweetsANNBQBatchJob", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":bq_generation", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD.docx new file mode 100644 index 000000000..6592db693 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.docx new file mode 100644 index 000000000..083f5a2c4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.scala deleted file mode 100644 index 9046768bb..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/Config.scala +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation.tweets_ann - -object Config { - /* - * Common root path - */ - val RootMHPath: String = "manhattan_sequence_files/offline_sann/" - val RootThriftPath: String = "processed/offline_sann/" - val AdhocRootPath = "adhoc/offline_sann/" - - /* - * Variables for MH output path - */ - val IIKFANNOutputPath: String = "tweets_ann/iikf" - val IIKFHL0EL15ANNOutputPath: String = "tweets_ann/iikf_hl_0_el_15" - val IIKFHL2EL15ANNOutputPath: String = "tweets_ann/iikf_hl_2_el_15" - val IIKFHL2EL50ANNOutputPath: String = "tweets_ann/iikf_hl_2_el_50" - val IIKFHL8EL50ANNOutputPath: String = "tweets_ann/iikf_hl_8_el_50" - val MTSConsumerEmbeddingsANNOutputPath: String = "tweets_ann/mts_consumer_embeddings" - - /* - * Variables for tweet embeddings generation - */ - val SimClustersTweetEmbeddingsGenerationHalfLife: Int = 28800000 // 8hrs in ms - val SimClustersTweetEmbeddingsGenerationEmbeddingLength: Int = 15 - - /* - * Variables for ANN - */ - val SimClustersANNTopNClustersPerSourceEmbedding: Int = 20 - val SimClustersANNTopMTweetsPerCluster: Int = 50 - val SimClustersANNTopKTweetsPerUserRequest: Int = 200 -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README deleted file mode 100644 index 7947963af..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README +++ /dev/null @@ -1,95 +0,0 @@ -To run iikf-tweets-ann-adhoc-job (adhoc): -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/iikf-tweets-ann-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.d6w \ - --jar dist/iikf-tweets-ann-adhoc-job.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-03-28" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-tweets-ann-adhoc-job" --ignore-existing - -To run iikf-hl-8-el-50-tweets-ann-adhoc-job (adhoc): -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/iikf-hl-8-el-50-tweets-ann-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.d6w \ - --jar dist/iikf-hl-8-el-50-tweets-ann-adhoc-job.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-03-28" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-hl-8-el-50-tweets-ann-adhoc-job" --ignore-existing - -To run mts-consumer-embeddings-tweets-ann-adhoc-job (adhoc) -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/mts-consumer-embeddings-tweets-ann-adhoc-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.d6w \ - --jar dist/mts-consumer-embeddings-tweets-ann-adhoc-job.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=your_ldap \ - --bind=profile.date="2022-03-28" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="mts-consumer-embeddings-tweets-ann-adhoc-job" --ignore-existing - - -To schedule iikf-tweets-ann-batch-job (batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/iikf-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-tweets-ann-batch-job" - -To schedule iikf-hl-0-el-15-tweets-ann-batch-job (batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/iikf-hl-0-el-15-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-hl-0-el-15-tweets-ann-batch-job" - -To schedule iikf-hl-2-el-15-tweets-ann-batch-job (batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/iikf-hl-2-el-15-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-hl-2-el-15-tweets-ann-batch-job" - -To schedule iikf-hl-2-el-50-tweets-ann-batch-job (batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/iikf-hl-2-el-50-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-hl-2-el-50-tweets-ann-batch-job" - -To schedule iikf-hl-8-el-50-tweets-ann-batch-job (batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/iikf-hl-8-el-50-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="iikf-hl-8-el-50-tweets-ann-batch-job" - -To schedule mts-consumer-embeddings-tweets-ann-batch-job(batch) -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/mts-consumer-embeddings-tweets-ann-batch-job \ - src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=cassowary \ - --bind=profile.date="2022-03-26" \ - --bind=profile.machine="n2-highmem-4" \ - --bind=profile.job_name="mts-consumer-embeddings-tweets-ann-batch-job" - - diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README.docx new file mode 100644 index 000000000..67919c3be Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.docx new file mode 100644 index 000000000..002b8808e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.scala deleted file mode 100644 index 23663ab9a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNFromBQ.scala +++ /dev/null @@ -1,120 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation -package tweets_ann - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.simclusters_v2.thriftscala.CandidateTweet -import com.twitter.wtf.beam.bq_embedding_export.BQQueryUtils -import org.apache.avro.generic.GenericData -import org.apache.avro.generic.GenericRecord -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord -import org.apache.beam.sdk.transforms.SerializableFunction -import org.joda.time.DateTime -import scala.collection.mutable.ListBuffer - -object TweetsANNFromBQ { - // Default ANN config variables - val topNClustersPerSourceEmbedding = Config.SimClustersANNTopNClustersPerSourceEmbedding - val topMTweetsPerCluster = Config.SimClustersANNTopMTweetsPerCluster - val topKTweetsPerUserRequest = Config.SimClustersANNTopKTweetsPerUserRequest - - // SQL file paths - val tweetsANNSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/tweets_ann.sql" - val tweetsEmbeddingGenerationSQLPath = - s"/com/twitter/simclusters_v2/scio/bq_generation/sql/tweet_embeddings_generation.sql" - - // Function that parses the GenericRecord results we read from BQ - val parseUserToTweetRecommendationsFunc = - new SerializableFunction[SchemaAndRecord, UserToTweetRecommendations] { - override def apply(record: SchemaAndRecord): UserToTweetRecommendations = { - val genericRecord: GenericRecord = record.getRecord() - UserToTweetRecommendations( - userId = genericRecord.get("userId").toString.toLong, - tweetCandidates = parseTweetIdColumn(genericRecord, "tweets"), - ) - } - } - - // Parse tweetId candidates column - def parseTweetIdColumn( - genericRecord: GenericRecord, - columnName: String - ): List[CandidateTweet] = { - val tweetIds: GenericData.Array[GenericRecord] = - genericRecord.get(columnName).asInstanceOf[GenericData.Array[GenericRecord]] - val results: ListBuffer[CandidateTweet] = new ListBuffer[CandidateTweet]() - tweetIds.forEach((sc: GenericRecord) => { - results += CandidateTweet( - tweetId = sc.get("tweetId").toString.toLong, - score = Some(sc.get("logCosineSimilarityScore").toString.toDouble) - ) - }) - results.toList - } - - def getTweetEmbeddingsSQL( - queryDate: DateTime, - consumerEmbeddingsSQL: String, - tweetEmbeddingsSQLPath: String, - tweetEmbeddingsHalfLife: Int, - tweetEmbeddingsLength: Int - ): String = { - // We read one day of fav events to construct our tweet embeddings - val templateVariables = - Map( - "CONSUMER_EMBEDDINGS_SQL" -> consumerEmbeddingsSQL, - "QUERY_DATE" -> queryDate.toString(), - "START_TIME" -> queryDate.minusDays(1).toString(), - "END_TIME" -> queryDate.toString(), - "MIN_SCORE_THRESHOLD" -> 0.0.toString, - "HALF_LIFE" -> tweetEmbeddingsHalfLife.toString, - "TWEET_EMBEDDING_LENGTH" -> tweetEmbeddingsLength.toString, - "NO_OLDER_TWEETS_THAN_DATE" -> queryDate.minusDays(1).toString(), - ) - BQQueryUtils.getBQQueryFromSqlFile(tweetEmbeddingsSQLPath, templateVariables) - } - - def getTweetRecommendationsBQ( - sc: ScioContext, - queryTimestamp: DateTime, - consumerEmbeddingsSQL: String, - tweetEmbeddingsHalfLife: Int, - tweetEmbeddingsLength: Int - ): SCollection[UserToTweetRecommendations] = { - // Get the tweet embeddings SQL string based on the provided consumerEmbeddingsSQL - val tweetEmbeddingsSQL = - getTweetEmbeddingsSQL( - queryTimestamp, - consumerEmbeddingsSQL, - tweetsEmbeddingGenerationSQLPath, - tweetEmbeddingsHalfLife, - tweetEmbeddingsLength - ) - - // Define template variables which we would like to be replaced in the corresponding sql file - val templateVariables = - Map( - "CONSUMER_EMBEDDINGS_SQL" -> consumerEmbeddingsSQL, - "TWEET_EMBEDDINGS_SQL" -> tweetEmbeddingsSQL, - "TOP_N_CLUSTER_PER_SOURCE_EMBEDDING" -> topNClustersPerSourceEmbedding.toString, - "TOP_M_TWEETS_PER_CLUSTER" -> topMTweetsPerCluster.toString, - "TOP_K_TWEETS_PER_USER_REQUEST" -> topKTweetsPerUserRequest.toString - ) - val query = BQQueryUtils.getBQQueryFromSqlFile(tweetsANNSQLPath, templateVariables) - - // Run SimClusters ANN on BQ and parse the results - sc.customInput( - s"SimClusters BQ ANN", - BigQueryIO - .read(parseUserToTweetRecommendationsFunc) - .fromQuery(query) - .usingStandardSql() - ) - } - - case class UserToTweetRecommendations( - userId: Long, - tweetCandidates: List[CandidateTweet]) -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.docx new file mode 100644 index 000000000..39d74cceb Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.scala b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.scala deleted file mode 100644 index 81a89f3ff..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/TweetsANNJob.scala +++ /dev/null @@ -1,297 +0,0 @@ -package com.twitter.simclusters_v2.scio.bq_generation -package tweets_ann - -import com.google.api.services.bigquery.model.TimePartitioning -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.conversions.DurationOps.richDurationFromInt -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getMTSConsumerEmbeddingsFav90P20MSQL -import com.twitter.simclusters_v2.scio.bq_generation.common.BQGenerationUtil.getInterestedIn2020SQL -import com.twitter.simclusters_v2.scio.bq_generation.tweets_ann.TweetsANNFromBQ.getTweetRecommendationsBQ -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromInterestedIn20M145K2020ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl0El15ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl2El15ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl2El50ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl8El50ScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.OfflineTweetRecommendationsFromMtsConsumerEmbeddingsScalaDataset -import com.twitter.simclusters_v2.scio.bq_generation.common.BQTableDetails -import com.twitter.simclusters_v2.thriftscala.CandidateTweets -import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList -import com.twitter.tcdc.bqblaster.beam.syntax.BigQueryIOHelpers -import com.twitter.tcdc.bqblaster.beam.BQBlasterIO.AvroConverter -import com.twitter.tcdc.bqblaster.core.avro.TypedProjection -import com.twitter.tcdc.bqblaster.core.transform.RootTransform -import java.time.Instant -import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO -import org.joda.time.DateTime - -trait TweetsANNJob extends ScioBeamJob[DateRangeOptions] { - // Configs to set for different type of embeddings and jobs - val isAdhoc: Boolean - val getConsumerEmbeddingsSQLFunc: (DateTime, Int) => String - val outputTable: BQTableDetails - val keyValDatasetOutputPath: String - val tweetRecommentationsSnapshotDataset: KeyValDALDataset[KeyVal[Long, CandidateTweetsList]] - val tweetEmbeddingsGenerationHalfLife: Int = Config.SimClustersTweetEmbeddingsGenerationHalfLife - val tweetEmbeddingsGenerationEmbeddingLength: Int = - Config.SimClustersTweetEmbeddingsGenerationEmbeddingLength - - // Base configs - val projectId = "twttr-recos-ml-prod" - val environment: DAL.Env = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - // The time when the job is scheduled - val queryTimestamp = opts.interval.getEnd - - // Read consumer embeddings SQL - val consumerEmbeddingsSQL = getConsumerEmbeddingsSQLFunc(queryTimestamp, 14) - - // Generate tweet embeddings and tweet ANN results - val tweetRecommendations = - getTweetRecommendationsBQ( - sc, - queryTimestamp, - consumerEmbeddingsSQL, - tweetEmbeddingsGenerationHalfLife, - tweetEmbeddingsGenerationEmbeddingLength - ) - - // Setup BQ writer - val ingestionTime = opts.getDate().value.getEnd.toDate - val bqFieldsTransform = RootTransform - .Builder() - .withPrependedFields("ingestionTime" -> TypedProjection.fromConstant(ingestionTime)) - val timePartitioning = new TimePartitioning() - .setType("HOUR").setField("ingestionTime").setExpirationMs(3.days.inMilliseconds) - val bqWriter = BigQueryIO - .write[CandidateTweets] - .to(outputTable.toString) - .withExtendedErrorInfo() - .withTimePartitioning(timePartitioning) - .withLoadJobProjectId(projectId) - .withThriftSupport(bqFieldsTransform.build(), AvroConverter.Legacy) - .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED) - .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND) - - // Save Tweet ANN results to BQ - tweetRecommendations - .map { userToTweetRecommendations => - { - CandidateTweets( - targetUserId = userToTweetRecommendations.userId, - recommendedTweets = userToTweetRecommendations.tweetCandidates) - } - } - .saveAsCustomOutput(s"WriteToBQTable - ${outputTable}", bqWriter) - - // Save Tweet ANN results as KeyValSnapshotDataset - tweetRecommendations - .map { userToTweetRecommendations => - KeyVal( - userToTweetRecommendations.userId, - CandidateTweetsList(userToTweetRecommendations.tweetCandidates)) - }.saveAsCustomOutput( - name = "WriteTweetRecommendationsToKeyValDataset", - DAL.writeVersionedKeyVal( - tweetRecommentationsSnapshotDataset, - PathLayout.VersionedPath(prefix = - ((if (!isAdhoc) - Config.RootMHPath - else - Config.AdhocRootPath) - + keyValDatasetOutputPath)), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = environment, - ) - ) - } - -} - -/** - * Scio job for adhoc run for tweet recommendations from IIKF 2020 - */ -object IIKF2020TweetsANNBQAdhocJob extends TweetsANNJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-recos-ml-prod", - "multi_type_simclusters", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_adhoc") - override val keyValDatasetOutputPath = Config.IIKFANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020ScalaDataset -} - -/** - * Scio job for adhoc run for tweet recommendations from IIKF 2020 with - * - Half life = 8hrs - * - Embedding Length = 50 - */ -object IIKF2020Hl8El50TweetsANNBQAdhocJob extends TweetsANNJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-recos-ml-prod", - "multi_type_simclusters", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_8_EL_50_adhoc") - override val keyValDatasetOutputPath = Config.IIKFHL8EL50ANNOutputPath - override val tweetEmbeddingsGenerationEmbeddingLength: Int = 50 - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = { - OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl8El50ScalaDataset - } -} - -/** - * Scio job for adhoc run for tweet recommendations from MTS Consumer Embeddings - */ -object MTSConsumerEmbeddingsTweetsANNBQAdhocJob extends TweetsANNJob { - override val isAdhoc = true - override val getConsumerEmbeddingsSQLFunc = getMTSConsumerEmbeddingsFav90P20MSQL - override val outputTable = BQTableDetails( - "twttr-recos-ml-prod", - "multi_type_simclusters", - "offline_tweet_recommendations_from_mts_consumer_embeddings_adhoc") - override val keyValDatasetOutputPath = Config.MTSConsumerEmbeddingsANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromMtsConsumerEmbeddingsScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from IIKF 2020 -The schedule cmd needs to be run only if there is any change in the config - */ -object IIKF2020TweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020") - override val keyValDatasetOutputPath = Config.IIKFANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020ScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from IIKF 2020 with parameter setup: - - Half Life: None, no decay, direct sum - - Embedding Length: 15 -The schedule cmd needs to be run only if there is any change in the config - */ -object IIKF2020Hl0El15TweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_0_EL_15") - override val keyValDatasetOutputPath = Config.IIKFHL0EL15ANNOutputPath - override val tweetEmbeddingsGenerationHalfLife: Int = -1 - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl0El15ScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from IIKF 2020 with parameter setup: - - Half Life: 2hrs - - Embedding Length: 15 -The schedule cmd needs to be run only if there is any change in the config - */ -object IIKF2020Hl2El15TweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_15") - override val keyValDatasetOutputPath = Config.IIKFHL2EL15ANNOutputPath - override val tweetEmbeddingsGenerationHalfLife: Int = 7200000 // 2hrs in ms - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl2El15ScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from IIKF 2020 with parameter setup: - - Half Life: 2hrs - - Embedding Length: 50 -The schedule cmd needs to be run only if there is any change in the config - */ -object IIKF2020Hl2El50TweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_2_EL_50") - override val keyValDatasetOutputPath = Config.IIKFHL2EL50ANNOutputPath - override val tweetEmbeddingsGenerationHalfLife: Int = 7200000 // 2hrs in ms - override val tweetEmbeddingsGenerationEmbeddingLength: Int = 50 - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl2El50ScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from IIKF 2020 with parameter setup: - - Half Life: 8hrs - - Embedding Length: 50 -The schedule cmd needs to be run only if there is any change in the config - */ -object IIKF2020Hl8El50TweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getInterestedIn2020SQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_interested_in_20M_145K_2020_HL_8_EL_50") - override val keyValDatasetOutputPath = Config.IIKFHL8EL50ANNOutputPath - override val tweetEmbeddingsGenerationEmbeddingLength: Int = 50 - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromInterestedIn20M145K2020Hl8El50ScalaDataset -} - -/** -Scio job for batch run for tweet recommendations from MTS Consumer Embeddings -The schedule cmd needs to be run only if there is any change in the config - */ -object MTSConsumerEmbeddingsTweetsANNBQBatchJob extends TweetsANNJob { - override val isAdhoc = false - override val getConsumerEmbeddingsSQLFunc = getMTSConsumerEmbeddingsFav90P20MSQL - override val outputTable = BQTableDetails( - "twttr-bq-cassowary-prod", - "user", - "offline_tweet_recommendations_from_mts_consumer_embeddings") - override val keyValDatasetOutputPath = Config.MTSConsumerEmbeddingsANNOutputPath - override val tweetRecommentationsSnapshotDataset: KeyValDALDataset[ - KeyVal[Long, CandidateTweetsList] - ] = - OfflineTweetRecommendationsFromMtsConsumerEmbeddingsScalaDataset -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.d6w deleted file mode 100644 index b86af2653..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-hl-0-el-15-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-hl-0-el-15-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.docx new file mode 100644 index 000000000..39c6d2a75 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-0-el-15-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.d6w deleted file mode 100644 index 55a9b5382..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-hl-2-el-15-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-hl-2-el-15-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.docx new file mode 100644 index 000000000..79456d69b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-15-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.d6w deleted file mode 100644 index 6fdd1c2f2..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-hl-2-el-50-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-hl-2-el-50-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.docx new file mode 100644 index 000000000..0be822f56 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-2-el-50-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.d6w deleted file mode 100644 index beb0dbc93..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-hl-8-el-50-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-hl-8-el-50-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.docx new file mode 100644 index 000000000..34da366e9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-adhoc-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.d6w deleted file mode 100644 index beb0dbc93..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-hl-8-el-50-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-hl-8-el-50-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.docx new file mode 100644 index 000000000..34da366e9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-hl-8-el-50-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.d6w deleted file mode 100644 index 6cc067816..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.d6w +++ /dev/null @@ -1,34 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-tweets-ann-adhoc-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-tweets-ann-adhoc-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT2H', - first_time='{{profile.date}}', - ), - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.docx new file mode 100644 index 000000000..91d944343 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-adhoc-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.d6w deleted file mode 100644 index 065a83eec..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'iikf-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:iikf-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.docx new file mode 100644 index 000000000..29c103027 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/iikf-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.d6w deleted file mode 100644 index c7f921708..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.d6w +++ /dev/null @@ -1,34 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - job_name = Default(String, 'mts-consumer-embeddings-tweets-ann-adhoc-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:mts-consumer-embeddings-tweets-ann-adhoc-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT2H', - first_time='{{profile.date}}', - ), - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.docx new file mode 100644 index 000000000..5d3384630 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-adhoc-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.d6w b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.d6w deleted file mode 100644 index d87e68e9f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'prod') - job_name = Default(String, 'mts-consumer-embeddings-tweets-ann-batch-job') - machine = Default(String, 'n2-highmem-4') - -job = Job( - name='{{profile.job_name}}', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "date": '{{profile.date}}' - }, - service_identifier='twtr:svc:{{profile.user_name}}:{{profile.job_name}}:{{profile.environment}}:{{profile.cluster}}', - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - environment='prod', - build_target='src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann:mts-consumer-embeddings-tweets-ann-batch-job', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT4H', - first_time='{{profile.date}}', - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT24H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.docx b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.docx new file mode 100644 index 000000000..c199ce187 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/bq_generation/tweets_ann/mts-consumer-embeddings-tweets-ann-batch-job.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/common/BUILD b/src/scala/com/twitter/simclusters_v2/scio/common/BUILD deleted file mode 100644 index 1ad664680..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/common/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "flockdb-tools/datasets/flock:flock-blocks-edges-scala", - "flockdb-tools/datasets/flock:flock-follows-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-abuse-edges-scala", - "flockdb-tools/datasets/flock:flock-report-as-spam-edges-scala", - "iesource/processing/events/src/main/scala/com/twitter/iesource/processing/events/batch:server_engagements-scala", - "src/scala/com/twitter/simclusters_v2/scalding", - "src/thrift/com/twitter/twadoop/user/gen:gen-scala", - "tweetsource/public_tweets/src/main/scala/com/twitter/tweetsource/public_tweets:public_tweets-scala", - "usersource/snapshot/src/main/scala/com/twitter/usersource/snapshot/flat:usersource_flat-scala", - "usersource/snapshot/src/main/thrift/com/twitter/usersource/snapshot/flat:flat-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/common/BUILD.docx new file mode 100644 index 000000000..c62ea5776 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/common/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.docx b/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.docx new file mode 100644 index 000000000..ed5dcfd8c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.scala b/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.scala deleted file mode 100644 index ed9e1aa2d..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/common/ExternalDataSources.scala +++ /dev/null @@ -1,301 +0,0 @@ -package com.twitter.simclusters_v2.scio.common - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.common.util.Clock -import com.twitter.common_header.thriftscala.CommonHeader -import com.twitter.common_header.thriftscala.IdType -import com.twitter.common_header.thriftscala.VersionedCommonHeader -import com.twitter.frigate.data_pipeline.magicrecs.magicrecs_notifications_lite.thriftscala.MagicRecsNotificationLite -import com.twitter.frigate.data_pipeline.scalding.magicrecs.magicrecs_notification_lite.MagicrecsNotificationLite1DayLagScalaDataset -import com.twitter.iesource.thriftscala.InteractionEvent -import com.twitter.iesource.thriftscala.InteractionTargetType -import com.twitter.interests_ds.jobs.interests_service.UserTopicRelationSnapshotScalaDataset -import com.twitter.interests.thriftscala.InterestRelationType -import com.twitter.interests.thriftscala.UserInterestsRelationSnapshot -import com.twitter.penguin.scalding.datasets.PenguinUserLanguagesScalaDataset -import com.twitter.search.adaptive.scribing.thriftscala.AdaptiveSearchScribeLog -import com.twitter.simclusters_v2.hdfs_sources.UserUserFavGraphScalaDataset -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources.ValidFlockEdgeStateId -import com.twitter.simclusters_v2.scalding.embedding.common.ExternalDataSources.getStandardLanguageCode -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import flockdb_tools.datasets.flock.FlockBlocksEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockFollowsEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsAbuseEdgesScalaDataset -import flockdb_tools.datasets.flock.FlockReportAsSpamEdgesScalaDataset -import org.joda.time.Interval -import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights -import com.twitter.usersource.snapshot.combined.UsersourceScalaDataset -import com.twitter.usersource.snapshot.flat.UsersourceFlatScalaDataset -import com.twitter.util.Duration -import twadoop_config.configuration.log_categories.group.search.AdaptiveSearchScalaDataset - -object ExternalDataSources { - def userSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[CombinedUser] = { - sc.customInput( - "ReadUserSource", - DAL - .readMostRecentSnapshotNoOlderThan( - UsersourceScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ) - } - - def userCountrySource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, String)] = { - sc.customInput( - "ReadUserCountrySource", - DAL - .readMostRecentSnapshotNoOlderThan( - UsersourceFlatScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod, - ) - ).flatMap { flatUser => - for { - userId <- flatUser.id - country <- flatUser.accountCountryCode - } yield { - (userId, country.toUpperCase) - } - }.distinct - } - - def userUserFavSource( - noOlderThan: Duration = Duration.fromDays(14) - )( - implicit sc: ScioContext - ): SCollection[EdgeWithDecayedWeights] = { - sc.customInput( - "ReadUserUserFavSource", - DAL - .readMostRecentSnapshotNoOlderThan( - UserUserFavGraphScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ) - } - - def inferredUserConsumedLanguageSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Seq[(String, Double)])] = { - sc.customInput( - "ReadInferredUserConsumedLanguageSource", - DAL - .readMostRecentSnapshotNoOlderThan( - PenguinUserLanguagesScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ).map { kv => - val consumed = kv.value.consumed - .collect { - case scoredString if scoredString.weight > 0.001 => //throw away 5% outliers - (getStandardLanguageCode(scoredString.item), scoredString.weight) - }.collect { - case (Some(language), score) => (language, score) - } - (kv.key, consumed) - } - } - - def flockBlockSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Long)] = { - sc.customInput( - "ReadFlockBlock", - DAL.readMostRecentSnapshotNoOlderThan( - FlockBlocksEdgesScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod)) - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockFollowSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Long)] = { - sc.customInput( - "ReadFlockFollow", - DAL - .readMostRecentSnapshotNoOlderThan( - FlockFollowsEdgesScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod)) - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockReportAsAbuseSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Long)] = { - sc.customInput( - "ReadFlockReportAsAbuseJava", - DAL - .readMostRecentSnapshotNoOlderThan( - FlockReportAsAbuseEdgesScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod) - ) - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def flockReportAsSpamSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Long)] = { - sc.customInput( - "ReadFlockReportAsSpam", - DAL - .readMostRecentSnapshotNoOlderThan( - FlockReportAsSpamEdgesScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod)) - .collect { - case edge if edge.state == ValidFlockEdgeStateId => - (edge.sourceId, edge.destinationId) - } - } - - def ieSourceTweetEngagementsSource( - interval: Interval - )( - implicit sc: ScioContext - ): SCollection[InteractionEvent] = { - sc.customInput( - "ReadIeSourceTweetEngagementsSource", - DAL - .read( - com.twitter.iesource.processing.events.batch.ServerEngagementsScalaDataset, - interval, - DAL.Environment.Prod, - ) - ).filter { event => - // filter out logged out users because their favorites are less reliable - event.engagingUserId > 0L && event.targetType == InteractionTargetType.Tweet - } - } - - def topicFollowGraphSource( - noOlderThan: Duration = Duration.fromDays(7) - )( - implicit sc: ScioContext - ): SCollection[(Long, Long)] = { - // The implementation here is slightly different than the topicFollowGraphSource function in - // src/scala/com/twitter/simclusters_v2/scalding/embedding/common/ExternalDataSources.scala - // We don't do an additional hashJoin on uttFollowableEntitiesSource. - sc.customInput( - "ReadTopicFollowGraphSource", - DAL - .readMostRecentSnapshotNoOlderThan( - UserTopicRelationSnapshotScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ).collect { - case userInterestsRelationSnapshot: UserInterestsRelationSnapshot - if userInterestsRelationSnapshot.interestType == "UTT" && - userInterestsRelationSnapshot.relation == InterestRelationType.Followed => - (userInterestsRelationSnapshot.interestId, userInterestsRelationSnapshot.userId) - } - } - - def magicRecsNotficationOpenOrClickEventsSource( - interval: Interval - )( - implicit sc: ScioContext - ): SCollection[MagicRecsNotificationLite] = { - sc.customInput( - "ReadMagicRecsNotficationOpenOrClickEventsSource", - DAL - .read(MagicrecsNotificationLite1DayLagScalaDataset, interval, DAL.Environment.Prod)) - .filter { entry => - // keep entries with a valid userId and tweetId, opened or clicked timestamp defined - val userIdExists = entry.targetUserId.isDefined - val tweetIdExists = entry.tweetId.isDefined - val openOrClickExists = - entry.openTimestampMs.isDefined || entry.ntabClickTimestampMs.isDefined - userIdExists && tweetIdExists && openOrClickExists - } - } - - def adaptiveSearchScribeLogsSource( - interval: Interval - )( - implicit sc: ScioContext - ): SCollection[(Long, String)] = { - sc.customInput( - "ReadAdaptiveSearchScribeLogsSource", - DAL - .read(AdaptiveSearchScalaDataset, interval, DAL.Environment.Prod)) - .flatMap({ scribeLog: AdaptiveSearchScribeLog => - for { - userId <- userIdFromBlenderAdaptiveScribeLog(scribeLog) - // filter out logged out search queries - if userId != 0 - queryString <- scribeLog.requestLog.flatMap(_.request).flatMap(_.rawQuery) - } yield { - (userId, Set(queryString)) - } - }) - // if a user searches for the same query multiple times, there could be duplicates. - // De-dup them to get the distinct queries searched by a user - .sumByKey - .flatMap { - case (userId, distinctQuerySet) => - distinctQuerySet.map { query => - (userId, query) - } - } - } - - private def userIdFromBlenderAdaptiveScribeLog( - blenderAdaptiveLog: AdaptiveSearchScribeLog - ): Option[Long] = { - blenderAdaptiveLog.versionedCommonHeader match { - case VersionedCommonHeader.CommonHeader(CommonHeader.ServerHeader(serverHeader)) => - serverHeader.requestInfo match { - case Some(requestInfo) => requestInfo.ids.get(IdType.UserId).map(_.toLong) - case _ => None - } - case _ => None - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.docx new file mode 100644 index 000000000..d7546e65d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.scala deleted file mode 100644 index 34f9b5f61..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioApp.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph - -/** -Build: -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph:assemble-multi-type-graph-scio-adhoc-app - -To kick off an adhoc run: -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/assemble-multi-type-graph-scio-adhoc-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.d6w \ - --jar dist/assemble-multi-type-graph-scio-adhoc-app.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=${USER} \ - --bind=profile.date="2021-11-04" \ - --bind=profile.machine="n2-highmem-16" - */ - -object AssembleMultiTypeGraphScioAdhocApp extends AssembleMultiTypeGraphScioBaseApp { - override val isAdhoc: Boolean = true - override val rootMHPath: String = Config.AdhocRootPath - override val rootThriftPath: String = Config.AdhocRootPath -} - -/** -To deploy the job: - -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/assemble-multi-type-graph-scio-batch-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=recos-platform \ - --bind=profile.date="2021-11-04" \ - --bind=profile.machine="n2-highmem-16" - */ -object AssembleMultiTypeGraphScioBatchApp extends AssembleMultiTypeGraphScioBaseApp { - override val isAdhoc: Boolean = false - override val rootMHPath: String = Config.RootMHPath - override val rootThriftPath: String = Config.RootThriftPath -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.docx new file mode 100644 index 000000000..5b346ccf4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.scala deleted file mode 100644 index 18325e2fc..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraphScioBaseApp.scala +++ /dev/null @@ -1,574 +0,0 @@ -package com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph - -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.frigate.data_pipeline.magicrecs.magicrecs_notifications_lite.thriftscala.MagicRecsNotificationLite -import com.twitter.iesource.thriftscala.InteractionEvent -import com.twitter.iesource.thriftscala.InteractionType -import com.twitter.iesource.thriftscala.ReferenceTweet -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.common.Country -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.common.TopicId -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.hdfs_sources.MultiTypeGraphForTopKRightNodesThriftScioScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.TopKRightNounsScioScalaDataset -import com.twitter.simclusters_v2.hdfs_sources.TruncatedMultiTypeGraphScioScalaDataset -import com.twitter.simclusters_v2.scio.common.ExternalDataSources -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.GlobalDefaultMinFrequencyOfRightNodeType -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.HalfLifeInDaysForFavScore -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.NumTopNounsForUnknownRightNodeType -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.SampledEmployeeIds -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.TopKConfig -import com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.Config.TopKRightNounsForMHDump -import com.twitter.simclusters_v2.scio.multi_type_graph.common.MultiTypeGraphUtil -import com.twitter.simclusters_v2.thriftscala.EdgeWithDecayedWeights -import com.twitter.simclusters_v2.thriftscala.LeftNode -import com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge -import com.twitter.simclusters_v2.thriftscala.Noun -import com.twitter.simclusters_v2.thriftscala.NounWithFrequency -import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.RightNodeType -import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct -import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeight -import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList -import com.twitter.twadoop.user.gen.thriftscala.CombinedUser -import com.twitter.util.Duration -import java.time.Instant -import org.joda.time.Interval - -/** - * Scio version of - * src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.scala - */ -trait AssembleMultiTypeGraphScioBaseApp extends ScioBeamJob[DateRangeOptions] { - // Provides an implicit binary thrift scrooge coder by default. - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - - val isAdhoc: Boolean - val rootMHPath: String - val rootThriftPath: String - - val truncatedMultiTypeGraphMHOutputDir: String = - Config.truncatedMultiTypeGraphMHOutputDir - val truncatedMultiTypeGraphThriftOutputDir: String = - Config.truncatedMultiTypeGraphThriftOutputDir - val topKRightNounsMHOutputDir: String = Config.topKRightNounsMHOutputDir - val topKRightNounsOutputDir: String = Config.topKRightNounsOutputDir - - val fullMultiTypeGraphThriftOutputDir: String = - Config.fullMultiTypeGraphThriftOutputDir - val truncatedMultiTypeGraphKeyValDataset: KeyValDALDataset[ - KeyVal[LeftNode, RightNodeWithEdgeWeightList] - ] = TruncatedMultiTypeGraphScioScalaDataset - val topKRightNounsKeyValDataset: KeyValDALDataset[ - KeyVal[RightNodeTypeStruct, NounWithFrequencyList] - ] = TopKRightNounsScioScalaDataset - val topKRightNounsMHKeyValDataset: KeyValDALDataset[ - KeyVal[RightNodeTypeStruct, NounWithFrequencyList] - ] = TopKRightNounsMhScioScalaDataset - val fullMultiTypeGraphSnapshotDataset: SnapshotDALDataset[MultiTypeGraphEdge] = - FullMultiTypeGraphScioScalaDataset - val multiTypeGraphTopKForRightNodesSnapshotDataset: SnapshotDALDataset[ - MultiTypeGraphEdge - ] = - MultiTypeGraphForTopKRightNodesThriftScioScalaDataset - - def getValidUsers( - input: SCollection[CombinedUser] - ): SCollection[UserId] = { - input - .flatMap { u => - for { - user <- u.user - if user.id != 0 - safety <- user.safety - if !(safety.suspended || safety.deactivated) - } yield { - user.id - } - } - } - - def filterInvalidUsers( - flockEdges: SCollection[(UserId, UserId)], - validUsers: SCollection[UserId] - ): SCollection[(UserId, UserId)] = { - val validUsersWithValues = validUsers.map(userId => (userId, ())) - flockEdges - .join(validUsersWithValues) - .map { - case (srcId, (destId, _)) => - (destId, srcId) - } - .join(validUsersWithValues) - .map { - case (destId, (srcId, _)) => - (srcId, destId) - } - } - - def getFavEdges( - input: SCollection[EdgeWithDecayedWeights], - halfLifeInDaysForFavScore: Int, - ): SCollection[(Long, Long, Double)] = { - input - .flatMap { edge => - if (edge.weights.halfLifeInDaysToDecayedSums.contains(halfLifeInDaysForFavScore)) { - Some( - ( - edge.sourceId, - edge.destinationId, - edge.weights.halfLifeInDaysToDecayedSums(halfLifeInDaysForFavScore))) - } else { - None - } - } - } - - def leftRightTuple( - leftNodeUserId: UserId, - rightNodeType: RightNodeType, - rightNoun: Noun, - weight: Double = 1.0 - ): (LeftNode, RightNodeWithEdgeWeight) = { - ( - LeftNode.UserId(leftNodeUserId), - RightNodeWithEdgeWeight( - rightNode = RightNode(rightNodeType = rightNodeType, noun = rightNoun), - weight = weight)) - } - - def getUserFavGraph( - userUserFavEdges: SCollection[(UserId, UserId, Double)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userUserFavEdges.map { - case (srcId, destId, edgeWt) => - leftRightTuple(srcId, RightNodeType.FavUser, Noun.UserId(destId), edgeWt) - } - } - - def getUserFollowGraph( - userUserFollowEdges: SCollection[(UserId, UserId)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userUserFollowEdges.map { - case (srcId, destId) => - leftRightTuple(srcId, RightNodeType.FollowUser, Noun.UserId(destId), 1.0) - } - } - - def getUserBlockGraph( - userUserBlockEdges: SCollection[(UserId, UserId)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userUserBlockEdges.map { - case (srcId, destId) => - leftRightTuple(srcId, RightNodeType.BlockUser, Noun.UserId(destId), 1.0) - } - } - - def getUserAbuseReportGraph( - userUserAbuseReportEdges: SCollection[(UserId, UserId)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userUserAbuseReportEdges.map { - case (srcId, destId) => - leftRightTuple(srcId, RightNodeType.AbuseReportUser, Noun.UserId(destId), 1.0) - } - } - - def getUserSpamReportGraph( - userUserSpamReportEdges: SCollection[(UserId, UserId)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userUserSpamReportEdges.map { - case (srcId, destId) => - leftRightTuple(srcId, RightNodeType.SpamReportUser, Noun.UserId(destId), 1.0) - } - } - - def getUserTopicFollowGraph( - topicUserFollowedByEdges: SCollection[(TopicId, UserId)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - topicUserFollowedByEdges.map { - case (topicId, userId) => - leftRightTuple(userId, RightNodeType.FollowTopic, Noun.TopicId(topicId), 1.0) - } - } - - def getUserSignUpCountryGraph( - userSignUpCountryEdges: SCollection[(UserId, Country)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userSignUpCountryEdges.map { - case (userId, country) => - leftRightTuple(userId, RightNodeType.SignUpCountry, Noun.Country(country), 1.0) - } - } - - def getMagicRecsNotifOpenOrClickTweetsGraph( - userMRNotifOpenOrClickEvents: SCollection[MagicRecsNotificationLite] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userMRNotifOpenOrClickEvents.flatMap { entry => - for { - userId <- entry.targetUserId - tweetId <- entry.tweetId - } yield { - leftRightTuple(userId, RightNodeType.NotifOpenOrClickTweet, Noun.TweetId(tweetId), 1.0) - } - } - } - - def getUserConsumedLanguagesGraph( - userConsumedLanguageEdges: SCollection[(UserId, Seq[(Language, Double)])] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userConsumedLanguageEdges.flatMap { - case (userId, langWithWeights) => - langWithWeights.map { - case (lang, weight) => - leftRightTuple(userId, RightNodeType.ConsumedLanguage, Noun.Language(lang), 1.0) - } - } - } - - def getSearchGraph( - userSearchQueryEdges: SCollection[(UserId, String)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - userSearchQueryEdges.map { - case (userId, query) => - leftRightTuple(userId, RightNodeType.SearchQuery, Noun.Query(query), 1.0) - } - } - - def getUserTweetInteractionGraph( - tweetInteractionEvents: SCollection[InteractionEvent], - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - val userTweetInteractionsByType: SCollection[((UserId, TweetId), RightNodeType)] = - tweetInteractionEvents - .flatMap { event => - val referenceTweet: Option[ReferenceTweet] = event.referenceTweet - val targetId: Long = event.targetId - val userId: Long = event.engagingUserId - - // To find the id of the tweet that was interacted with - // For likes, this is the targetId; for retweet or reply, it is the referenceTweet's id - // One thing to note is that for likes, referenceTweet is empty - val (tweetIdOpt, rightNodeTypeOpt) = { - event.interactionType match { - case Some(InteractionType.Favorite) => - // Only allow favorites on original tweets, not retweets, to avoid double-counting - // because we have retweet-type tweets in the data source as well - ( - if (referenceTweet.isEmpty) { - Some(targetId) - } else None, - Some(RightNodeType.FavTweet)) - case Some(InteractionType.Reply) => - (referenceTweet.map(_.tweetId), Some(RightNodeType.ReplyTweet)) - case Some(InteractionType.Retweet) => - (referenceTweet.map(_.tweetId), Some(RightNodeType.RetweetTweet)) - case _ => (None, None) - } - } - for { - tweetId <- tweetIdOpt - rightNodeType <- rightNodeTypeOpt - } yield { - ((userId, tweetId), rightNodeType) - } - } - - userTweetInteractionsByType - .mapValues(Set(_)) - .sumByKey - .flatMap { - case ((userId, tweetId), rightNodeTypeSet) => - rightNodeTypeSet.map { rightNodeType => - leftRightTuple(userId, rightNodeType, Noun.TweetId(tweetId), 1.0) - } - } - } - - def getTopKRightNounsWithFrequencies( - fullGraph: SCollection[(LeftNode, RightNodeWithEdgeWeight)], - topKConfig: Map[RightNodeType, Int], - minFrequency: Int, - ): SCollection[(RightNodeType, Seq[(Noun, Double)])] = { - val maxAcrossRightNounType: Int = topKConfig.valuesIterator.max - - fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - (rightNodeWithWeight.rightNode, 1.0) - } - .sumByKey - .filter(_._2 >= minFrequency) - .map { - case (rightNode, freq) => - (rightNode.rightNodeType, (rightNode.noun, freq)) - } - .topByKey(maxAcrossRightNounType)(Ordering.by(_._2)) - .map { - case (rightNodeType, nounsListWithFreq) => - val truncatedList = nounsListWithFreq.toSeq - .sortBy(-_._2) - .take(topKConfig.getOrElse(rightNodeType, NumTopNounsForUnknownRightNodeType)) - (rightNodeType, truncatedList) - } - } - - def getTruncatedGraph( - fullGraph: SCollection[(LeftNode, RightNodeWithEdgeWeight)], - topKWithFrequency: SCollection[(RightNodeType, Seq[(Noun, Double)])] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - val topNouns = topKWithFrequency - .flatMap { - case (rightNodeType, nounsList) => - nounsList - .map { - case (nounVal, aggregatedFrequency) => - RightNode(rightNodeType, nounVal) - } - }.map(nouns => (nouns, ())) - - fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - (rightNodeWithWeight.rightNode, (leftNode, rightNodeWithWeight)) - } - .hashJoin(topNouns) - .map { - case (rightNode, ((left, rightNodeWithWeight), _)) => - (left, rightNodeWithWeight) - } - } - - def buildEmployeeGraph( - graph: SCollection[(LeftNode, RightNodeWithEdgeWeight)] - ): SCollection[(LeftNode, RightNodeWithEdgeWeight)] = { - val employeeIds = SampledEmployeeIds - graph - .collect { - case (LeftNode.UserId(userId), rightNodeWithWeight) if employeeIds.contains(userId) => - (LeftNode.UserId(userId), rightNodeWithWeight) - } - } - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - // Define the implicit ScioContext to read datasets from ExternalDataSources - implicit def scioContext: ScioContext = sc - - // DAL.Environment variable for WriteExecs - val dalEnv = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - // Define date intervals - val interval_7days = - new Interval(opts.interval.getEnd.minusWeeks(1), opts.interval.getEnd.minusMillis(1)) - val interval_14days = - new Interval(opts.interval.getEnd.minusWeeks(2), opts.interval.getEnd.minusMillis(1)) - - /* - * Dataset read operations - */ - // Get list of valid UserIds - to filter out deactivated or suspended user accounts - val validUsers = getValidUsers(ExternalDataSources.userSource(Duration.fromDays(7))) - - // ieSource tweet engagements data for tweet favs, replies, retweets - from last 14 days - val tweetSource = ExternalDataSources.ieSourceTweetEngagementsSource(interval_14days) - - // Read TFlock datasets - val flockFollowSource = ExternalDataSources.flockFollowSource(Duration.fromDays(7)) - val flockBlockSource = ExternalDataSources.flockBlockSource(Duration.fromDays(7)) - val flockReportAsAbuseSource = - ExternalDataSources.flockReportAsAbuseSource(Duration.fromDays(7)) - val flockReportAsSpamSource = - ExternalDataSources.flockReportAsSpamSource(Duration.fromDays(7)) - - // user-user fav edges - val userUserFavSource = ExternalDataSources.userUserFavSource(Duration.fromDays(14)) - val userUserFavEdges = getFavEdges(userUserFavSource, HalfLifeInDaysForFavScore) - - // user-user follow edges - val userUserFollowEdges = filterInvalidUsers(flockFollowSource, validUsers) - - // user-user block edges - val userUserBlockEdges = filterInvalidUsers(flockBlockSource, validUsers) - - // user-user abuse report edges - val userUserAbuseReportEdges = filterInvalidUsers(flockReportAsAbuseSource, validUsers) - - // user-user spam report edges - val userUserSpamReportEdges = filterInvalidUsers(flockReportAsSpamSource, validUsers) - - // user-signup country edges - val userSignUpCountryEdges = ExternalDataSources - .userCountrySource(Duration.fromDays(7)) - - // user-consumed language edges - val userConsumedLanguageEdges = - ExternalDataSources.inferredUserConsumedLanguageSource(Duration.fromDays(7)) - - // user-topic follow edges - val topicUserFollowedByEdges = - ExternalDataSources.topicFollowGraphSource(Duration.fromDays(7)) - - // user-MRNotifOpenOrClick events from last 7 days - val userMRNotifOpenOrClickEvents = - ExternalDataSources.magicRecsNotficationOpenOrClickEventsSource(interval_7days) - - // user-searchQuery strings from last 7 days - val userSearchQueryEdges = - ExternalDataSources.adaptiveSearchScribeLogsSource(interval_7days) - - /* - * Generate the full graph - */ - val fullGraph = - getUserTweetInteractionGraph(tweetSource) ++ - getUserFavGraph(userUserFavEdges) ++ - getUserFollowGraph(userUserFollowEdges) ++ - getUserBlockGraph(userUserBlockEdges) ++ - getUserAbuseReportGraph(userUserAbuseReportEdges) ++ - getUserSpamReportGraph(userUserSpamReportEdges) ++ - getUserSignUpCountryGraph(userSignUpCountryEdges) ++ - getUserConsumedLanguagesGraph(userConsumedLanguageEdges) ++ - getUserTopicFollowGraph(topicUserFollowedByEdges) ++ - getMagicRecsNotifOpenOrClickTweetsGraph(userMRNotifOpenOrClickEvents) ++ - getSearchGraph(userSearchQueryEdges) - - // Get Top K RightNodes - val topKRightNodes: SCollection[(RightNodeType, Seq[(Noun, Double)])] = - getTopKRightNounsWithFrequencies( - fullGraph, - TopKConfig, - GlobalDefaultMinFrequencyOfRightNodeType) - - // key transformation - topK nouns, keyed by the RightNodeNounType - val topKNounsKeyedByType: SCollection[(RightNodeTypeStruct, NounWithFrequencyList)] = - topKRightNodes - .map { - case (rightNodeType, rightNounsWithScoresList) => - val nounsListWithFrequency: Seq[NounWithFrequency] = rightNounsWithScoresList - .map { - case (noun, aggregatedFrequency) => - NounWithFrequency(noun, aggregatedFrequency) - } - (RightNodeTypeStruct(rightNodeType), NounWithFrequencyList(nounsListWithFrequency)) - } - - // Get Truncated graph based on the top K RightNodes - val truncatedGraph: SCollection[(LeftNode, RightNodeWithEdgeWeight)] = - getTruncatedGraph(fullGraph, topKRightNodes) - - // key transformations - truncated graph, keyed by LeftNode - // Note: By wrapping and unwrapping with the LeftNode.UserId, we don't have to deal - // with defining our own customer ordering for LeftNode type - val truncatedGraphKeyedBySrc: SCollection[(LeftNode, RightNodeWithEdgeWeightList)] = - truncatedGraph - .collect { - case (LeftNode.UserId(userId), rightNodeWithWeight) => - userId -> List(rightNodeWithWeight) - } - .sumByKey - .map { - case (userId, rightNodeWithWeightList) => - (LeftNode.UserId(userId), RightNodeWithEdgeWeightList(rightNodeWithWeightList)) - } - - // WriteExecs - // Write TopK RightNodes to DAL - save all the top K nodes for the clustering step - topKNounsKeyedByType - .map { - case (engagementType, rightList) => - KeyVal(engagementType, rightList) - } - .saveAsCustomOutput( - name = "WriteTopKNouns", - DAL.writeVersionedKeyVal( - topKRightNounsKeyValDataset, - PathLayout.VersionedPath(prefix = - rootMHPath + topKRightNounsOutputDir), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = dalEnv, - ) - ) - - // Write TopK RightNodes to DAL - only take TopKRightNounsForMHDump RightNodes for MH dump - topKNounsKeyedByType - .map { - case (engagementType, rightList) => - val rightListMH = - NounWithFrequencyList(rightList.nounWithFrequencyList.take(TopKRightNounsForMHDump)) - KeyVal(engagementType, rightListMH) - } - .saveAsCustomOutput( - name = "WriteTopKNounsToMHForDebugger", - DAL.writeVersionedKeyVal( - topKRightNounsMHKeyValDataset, - PathLayout.VersionedPath(prefix = - rootMHPath + topKRightNounsMHOutputDir), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = dalEnv, - ) - ) - - // Write truncated graph (MultiTypeGraphTopKForRightNodes) to DAL in KeyVal format - truncatedGraphKeyedBySrc - .map { - case (leftNode, rightNodeWithWeightList) => - KeyVal(leftNode, rightNodeWithWeightList) - }.saveAsCustomOutput( - name = "WriteTruncatedMultiTypeGraph", - DAL.writeVersionedKeyVal( - truncatedMultiTypeGraphKeyValDataset, - PathLayout.VersionedPath(prefix = - rootMHPath + truncatedMultiTypeGraphMHOutputDir), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = dalEnv, - ) - ) - - // Write truncated graph (MultiTypeGraphTopKForRightNodes) to DAL in thrift format - truncatedGraph - .map { - case (leftNode, rightNodeWithWeight) => - MultiTypeGraphEdge(leftNode, rightNodeWithWeight) - }.saveAsCustomOutput( - name = "WriteTruncatedMultiTypeGraphThrift", - DAL.writeSnapshot( - multiTypeGraphTopKForRightNodesSnapshotDataset, - PathLayout.FixedPath(rootThriftPath + truncatedMultiTypeGraphThriftOutputDir), - Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - DiskFormat.Thrift(), - environmentOverride = dalEnv - ) - ) - - // Write full graph to DAL - fullGraph - .map { - case (leftNode, rightNodeWithWeight) => - MultiTypeGraphEdge(leftNode, rightNodeWithWeight) - } - .saveAsCustomOutput( - name = "WriteFullMultiTypeGraph", - DAL.writeSnapshot( - fullMultiTypeGraphSnapshotDataset, - PathLayout.FixedPath(rootThriftPath + fullMultiTypeGraphThriftOutputDir), - Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - DiskFormat.Thrift(), - environmentOverride = dalEnv - ) - ) - - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD deleted file mode 100644 index 4ad3bfb53..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD +++ /dev/null @@ -1,73 +0,0 @@ -scala_library( - name = "assemble-multi-type-graph-scio-lib", - sources = [ - "*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":full_multi_type_graph_scio-scala", - ":top_k_right_nouns_mh_scio-scala", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/beam/io/manhattan", - "beam-internal/src/main/scala/com/twitter/beam/job", - "beam-internal/src/main/scala/com/twitter/beam/transform", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding/multi_type_graph/assemble_multi_type_graph", - "src/scala/com/twitter/simclusters_v2/scio/common", - "src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common", - ], -) - -jvm_binary( - name = "assemble-multi-type-graph-scio-adhoc-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.AssembleMultiTypeGraphScioAdhocApp", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":assemble-multi-type-graph-scio-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -jvm_binary( - name = "assemble-multi-type-graph-scio-batch-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph.AssembleMultiTypeGraphScioBatchApp", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":assemble-multi-type-graph-scio-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -create_datasets( - base_name = "full_multi_type_graph_scio", - java_schema = "com.twitter.simclusters_v2.thriftjava.MultiTypeGraphEdge", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.MultiTypeGraphEdge", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "top_k_right_nouns_mh_scio", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.topKRightNounListInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList", - scala_dependencies = [ - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD.docx new file mode 100644 index 000000000..19bf86785 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.docx new file mode 100644 index 000000000..5af397f2c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.scala deleted file mode 100644 index 337789ca1..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/Config.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.simclusters_v2.scio.multi_type_graph.assemble_multi_type_graph - -import com.twitter.simclusters_v2.thriftscala.RightNodeType - -object Config { - val RootMHPath: String = "manhattan_sequence_files/multi_type_graph/" - val RootThriftPath: String = "processed/multi_type_graph/" - val AdhocRootPath = "adhoc/multi_type_graph/" - val truncatedMultiTypeGraphMHOutputDir: String = "truncated_graph_mh" - val truncatedMultiTypeGraphThriftOutputDir: String = "truncated_graph_thrift" - val topKRightNounsMHOutputDir: String = "top_k_right_nouns_mh" - val topKRightNounsOutputDir: String = "top_k_right_nouns" - val fullMultiTypeGraphThriftOutputDir: String = "full_graph_thrift" - val HalfLifeInDaysForFavScore = 100 - val NumTopNounsForUnknownRightNodeType = 20 - val GlobalDefaultMinFrequencyOfRightNodeType = 100 - val TopKRightNounsForMHDump = 1000 - - // the topK most frequent nouns for each engagement type - val TopKConfig: Map[RightNodeType, Int] = Map( - RightNodeType.FollowUser -> 10000000, // 10M, current simclusters_v2 has this value set to 20M, providing this the most weight - RightNodeType.FavUser -> 5000000, - RightNodeType.BlockUser -> 1000000, - RightNodeType.AbuseReportUser -> 1000000, - RightNodeType.SpamReportUser -> 1000000, - RightNodeType.FollowTopic -> 5000, - RightNodeType.SignUpCountry -> 200, - RightNodeType.ConsumedLanguage -> 50, - RightNodeType.FavTweet -> 500000, - RightNodeType.ReplyTweet -> 500000, - RightNodeType.RetweetTweet -> 500000, - RightNodeType.NotifOpenOrClickTweet -> 500000, - RightNodeType.SearchQuery -> 500000 - ) - val SampledEmployeeIds: Set[Long] = - Set() -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.docx new file mode 100644 index 000000000..7ba9ca173 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.md b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.md deleted file mode 100644 index f258c9683..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/README.md +++ /dev/null @@ -1,49 +0,0 @@ -# Pre-requisites - -## Tutorial -Follow the tutorial Batch Job on Dataflow Quickstart on how to run a simple batch job on Dataflow. - -## GCP setup - -Ensure `gcloud` CLI is installed and `application_default_credentials.json` has been generated. - -## Data access - -If you want to run an adhoc job with your ldap, you will need access to multiple LDAP groups to read the datasets. - -# Running the job - -### Running an adhoc job - -```bash -export GCP_PROJECT_NAME='twttr-recos-ml-prod' - -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph:assemble-multi-type-graph-scio-adhoc-app - -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/assemble-multi-type-graph-scio-adhoc-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.d6w \ - --jar dist/assemble-multi-type-graph-scio-adho-app.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=${USER} \ - --bind=profile.date="2021-11-04" \ - --bind=profile.machine="n2-highmem-16" -``` - -### Scheduling the job on Workflow - -Scheduling a job will require a service account as `recos-platform`. -Remember this account will need permissions to read all the required dataset. - -```bash -export SERVICE_ACCOUNT='recos-platform' -export GCP_PROJECT_NAME='twttr-recos-ml-prod' - -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/assemble-multi-type-graph-scio-batch-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name="recos-platform" \ - --bind=profile.date="2021-11-04" \ - --bind=profile.machine="n2-highmem-16" -``` diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.d6w deleted file mode 100644 index 835c48e71..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.d6w +++ /dev/null @@ -1,36 +0,0 @@ -# See -# Checkout the README to see how to deploy the job - -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - machine= Default(String, 'n2-highmem-16') - -job = Job( - name='assemble-multi-type-graph-scio-adhoc-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD') - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph:assemble-multi-type-graph-scio-adhoc-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT1H', - first_time='{{profile.date}}' - ) - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.docx new file mode 100644 index 000000000..24e6c2d0f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-adhoc.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.d6w deleted file mode 100644 index 4734e9c0f..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.d6w +++ /dev/null @@ -1,41 +0,0 @@ -# See -# Checkout the README to see how to deploy the job - -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'prod') - machine= Default(String, 'n2-highmem-16') - -job = Job( - name='assemble-multi-type-graph-scio-batch-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD') - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph:assemble-multi-type-graph-scio-batch-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - environment='prod', - statebird_config=StatebirdConfig( - batch_width='P1W', - first_time='{{profile.date}}' - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT18H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.docx new file mode 100644 index 000000000..b6a8bb609 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/assemble_multi_type_graph/assemble-multi-type-graph-scio-batch.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD deleted file mode 100644 index d8ca4cd90..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -scala_library( - sources = [ - "*.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scalding", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD.docx new file mode 100644 index 000000000..4d314e6ba Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.docx new file mode 100644 index 000000000..2860a3c21 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.scala deleted file mode 100644 index 4a5cd67de..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common/MultiTypeGraphUtil.scala +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.common - -import com.spotify.scio.ScioContext -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.common.util.Clock -import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.TruncatedMultiTypeGraphScioScalaDataset -import com.twitter.simclusters_v2.thriftscala.LeftNode -import com.twitter.simclusters_v2.thriftscala.Noun -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.RightNodeType -import com.twitter.util.Duration - -object MultiTypeGraphUtil { - val RootMHPath: String = "manhattan_sequence_files/multi_type_graph/" - val RootThriftPath: String = "processed/multi_type_graph/" - val AdhocRootPath = "adhoc/multi_type_graph/" - - val nounOrdering: Ordering[Noun] = new Ordering[Noun] { - // We define an ordering for each noun type as specified in simclusters_v2/multi_type_graph.thrift - // Please make sure we don't remove anything here that's still a part of the union Noun thrift and - // vice versa, if we add a new noun type to thrift, an ordering for it needs to added here as well. - def nounTypeOrder(noun: Noun): Int = noun match { - case _: Noun.UserId => 0 - case _: Noun.Country => 1 - case _: Noun.Language => 2 - case _: Noun.Query => 3 - case _: Noun.TopicId => 4 - case _: Noun.TweetId => 5 - } - - override def compare(x: Noun, y: Noun): Int = nounTypeOrder(x) compare nounTypeOrder(y) - } - - val rightNodeTypeOrdering: Ordering[RightNodeType] = ordSer[RightNodeType] - - val rightNodeOrdering: Ordering[RightNode] = - new Ordering[RightNode] { - override def compare(x: RightNode, y: RightNode): Int = { - Ordering - .Tuple2(rightNodeTypeOrdering, nounOrdering) - .compare((x.rightNodeType, x.noun), (y.rightNodeType, y.noun)) - } - } - - def getTruncatedMultiTypeGraph( - noOlderThan: Duration = Duration.fromDays(14) - )( - implicit sc: ScioContext - ): SCollection[(Long, RightNode, Double)] = { - sc.customInput( - "ReadTruncatedMultiTypeGraph", - DAL - .readMostRecentSnapshotNoOlderThan( - TruncatedMultiTypeGraphScioScalaDataset, - noOlderThan, - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ).flatMap { - case KeyVal(LeftNode.UserId(userId), rightNodesList) => - rightNodesList.rightNodeWithEdgeWeightList.map(rightNodeWithWeight => - (userId, rightNodeWithWeight.rightNode, rightNodeWithWeight.weight)) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD deleted file mode 100644 index fa06b6d7a..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD +++ /dev/null @@ -1,92 +0,0 @@ -scala_library( - name = "multi-type-graph-scio-sims-lib", - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":right_node_cosine_similarity_scio_adhoc-scala", - ":right_node_sim_hash_scio_adhoc-scala", - "3rdparty/jvm/com/twitter/bijection:scrooge", - "beam-internal/src/main/scala/com/twitter/beam/io/dal", - "beam-internal/src/main/scala/com/twitter/beam/io/manhattan", - "beam-internal/src/main/scala/com/twitter/beam/job", - "beam-internal/src/main/scala/com/twitter/beam/transform", - "beam-internal/src/main/scala/com/twitter/scio_internal/runner/dataflow", - "src/scala/com/twitter/simclusters_v2/hdfs_sources", - "src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/common", - "src/scala/com/twitter/wtf/dataflow/cosine_similarity/common", - ], -) - -jvm_binary( - name = "multi-type-graph-sim-hash-scio-adhoc-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.multi_type_graph_sims.RightNodeSimHashScioAdhocApp", - platform = "java8", - dependencies = [ - ":multi-type-graph-scio-sims-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -jvm_binary( - name = "multi-type-graph-sim-hash-scio-batch-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.multi_type_graph_sims.RightNodeSimHashScioBatchApp", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":multi-type-graph-scio-sims-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -jvm_binary( - name = "multi-type-graph-cosine-similarity-scio-adhoc-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.multi_type_graph_sims.RightNodeCosineSimilarityScioAdhocApp", - platform = "java8", - dependencies = [ - ":multi-type-graph-scio-sims-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -jvm_binary( - name = "multi-type-graph-cosine-similarity-scio-batch-app", - main = "com.twitter.simclusters_v2.scio.multi_type_graph.multi_type_graph_sims.RightNodeCosineSimilarityScioBatchApp", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":multi-type-graph-scio-sims-lib", - "beam-internal/src/main/scala/com/twitter/beam/runner/dataflow", - ], -) - -create_datasets( - base_name = "right_node_sim_hash_scio_adhoc", - java_schema = "com.twitter.simclusters_v2.thriftjava.RightNodeSimHashSketch", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.thriftscala.RightNodeSimHashSketch", - segment_type = "snapshot", - tags = ["bazel-compatible"], - java_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-java", - ], - scala_dependencies = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) - -create_datasets( - base_name = "right_node_cosine_similarity_scio_adhoc", - key_type = "com.twitter.simclusters_v2.thriftscala.RightNode", - platform = "java8", - role = "cassowary", - scala_schema = "com.twitter.simclusters_v2.hdfs_sources.injections.MultiTypeGraphInjections.similarRightNodesInjection", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "com.twitter.simclusters_v2.thriftscala.SimilarRightNodes", - scala_dependencies = [ - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/simclusters_v2/hdfs_sources/injections", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD.docx new file mode 100644 index 000000000..33940a6d4 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.docx new file mode 100644 index 000000000..cdb8c7d1f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.scala deleted file mode 100644 index de0dc39c0..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/Config.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.multi_type_graph_sims - -object Config { - // Config settings for RightNodeSimHashScioBaseApp job - // Number of hashes to generate in the sketch - val numHashes: Int = 8192 // each is a bit, so this results in 1KB uncompressed sketch/user - // Reduce skew by letting each reducers process a limited number of followers/user - val maxNumNeighborsPerReducers: Int = 300000 - val simsHashJobOutputDirectory: String = "right_node/sims/sim_hash" - - // Config settings for RightNodeCosineSimilarityScioBaseApp job - val numSims: Int = 500 - val minCosineSimilarityThreshold: Double = 0.01 - val maxOutDegree: Int = 10000 - val cosineSimJobOutputDirectory = "right_node/sims/cosine_similarity" - -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.docx new file mode 100644 index 000000000..328483e5a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.scala deleted file mode 100644 index 6c064be9b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioApp.scala +++ /dev/null @@ -1,55 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.multi_type_graph_sims - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.simclusters_v2.hdfs_sources.RightNodeCosineSimilarityScioScalaDataset -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes -import com.twitter.wtf.scalding.jobs.cosine_similarity.common.ApproximateMatrixSelfTransposeMultiplicationJob - -/** -Build: -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-cosine-similarity-scio-adhoc-app - -To kick off an adhoc run: -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/multi-type-graph-cosine-similarity-scio-adhoc-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.d6w \ - --jar dist/multi-type-graph-cosine-similarity-scio-adhoc-app.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=${USER} \ - --bind=profile.date="2022-01-16" \ - --bind=profile.machine="n2d-highmem-16" --ignore-existing - */ - -object RightNodeCosineSimilarityScioAdhocApp extends RightNodeCosineSimilarityScioBaseApp { - override val isAdhoc = true - override val cosineSimKeyValSnapshotDataset: KeyValDALDataset[ - KeyVal[RightNode, SimilarRightNodes] - ] = - RightNodeCosineSimilarityScioAdhocScalaDataset - override val filterCandidateSimilarityPair: (Double, Double, Double) => Boolean = - ApproximateMatrixSelfTransposeMultiplicationJob.filterCandidateSimilarityPair -} - -/** -To deploy the job: - -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/multi-type-graph-cosine-similarity-scio-batch-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=recos-platform \ - --bind=profile.date="2021-12-01" \ - --bind=profile.machine="n2d-highmem-16" - */ -object RightNodeCosineSimilarityScioBatchApp extends RightNodeCosineSimilarityScioBaseApp { - override val isAdhoc = false - override val cosineSimKeyValSnapshotDataset: KeyValDALDataset[ - KeyVal[RightNode, SimilarRightNodes] - ] = - RightNodeCosineSimilarityScioScalaDataset - override val filterCandidateSimilarityPair: (Double, Double, Double) => Boolean = - ApproximateMatrixSelfTransposeMultiplicationJob.filterCandidateSimilarityPair -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.docx new file mode 100644 index 000000000..7ef5d4b7b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.scala deleted file mode 100644 index 963178f7b..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeCosineSimilarityScioBaseApp.scala +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.multi_type_graph_sims - -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.common.util.Clock -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.hdfs_sources.RightNodeSimHashScioScalaDataset -import com.twitter.simclusters_v2.scio.multi_type_graph.common.MultiTypeGraphUtil -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.util.Duration -import com.twitter.wtf.dataflow.cosine_similarity.ApproximateMatrixSelfTransposeMultiplicationJob -import java.time.Instant - -trait RightNodeCosineSimilarityScioBaseApp - extends ScioBeamJob[DateRangeOptions] - with ApproximateMatrixSelfTransposeMultiplicationJob[RightNode] { - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - override val ordering: Ordering[RightNode] = MultiTypeGraphUtil.rightNodeOrdering - - val isAdhoc: Boolean - val cosineSimKeyValSnapshotDataset: KeyValDALDataset[KeyVal[RightNode, SimilarRightNodes]] - val rightNodeSimHashSnapshotDataset: SnapshotDALDataset[RightNodeSimHashSketch] = - RightNodeSimHashScioScalaDataset - val cosineSimJobOutputDirectory: String = Config.cosineSimJobOutputDirectory - - override def graph( - implicit sc: ScioContext, - coder: Coder[RightNode] - ): SCollection[(Long, RightNode, Double)] = - MultiTypeGraphUtil.getTruncatedMultiTypeGraph(noOlderThan = Duration.fromDays(14)) - - override def simHashSketches( - implicit sc: ScioContext, - coder: Coder[RightNode] - ): SCollection[(RightNode, Array[Byte])] = { - sc.customInput( - "ReadSimHashSketches", - DAL - .readMostRecentSnapshotNoOlderThan( - rightNodeSimHashSnapshotDataset, - Duration.fromDays(14), - Clock.SYSTEM_CLOCK, - DAL.Environment.Prod - ) - ).map { sketch => - sketch.rightNode -> sketch.simHashOfEngagers.toArray - } - } - - override def configurePipeline( - sc: ScioContext, - opts: DateRangeOptions - ): Unit = { - implicit def scioContext: ScioContext = sc - // DAL.Environment variable for WriteExecs - val dalEnv = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - val topKRightNodes: SCollection[(RightNode, SimilarRightNodes)] = topK - .collect { - case (rightNode, simRightNodes) => - val sims = simRightNodes.collect { - case (simRightNode, score) => SimilarRightNode(simRightNode, score) - } - (rightNode, SimilarRightNodes(sims)) - } - - topKRightNodes - .map { - case (rightNode, sims) => KeyVal(rightNode, sims) - }.saveAsCustomOutput( - name = "WriteRightNodeCosineSimilarityDataset", - DAL.writeVersionedKeyVal( - cosineSimKeyValSnapshotDataset, - PathLayout.VersionedPath(prefix = - ((if (!isAdhoc) - MultiTypeGraphUtil.RootMHPath - else - MultiTypeGraphUtil.AdhocRootPath) - + Config.cosineSimJobOutputDirectory)), - instant = Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - environmentOverride = dalEnv, - ) - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.docx new file mode 100644 index 000000000..da9a70e5c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.scala deleted file mode 100644 index f485b52ce..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioApp.scala +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.multi_type_graph_sims - -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.simclusters_v2.hdfs_sources.RightNodeSimHashScioScalaDataset -import com.twitter.simclusters_v2.thriftscala.RightNodeSimHashSketch - -/** -Build: -./bazel bundle src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-sim-hash-scio-adhoc-app - -To kick off an adhoc run: -bin/d6w create \ - ${GCP_PROJECT_NAME}/us-central1/multi-type-graph-sim-hash-scio-adhoc-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.d6w \ - --jar dist/multi-type-graph-sim-hash-scio-adhoc-app.jar \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=${USER} \ - --bind=profile.date="2021-12-01" \ - --bind=profile.machine="n2d-highmem-16" --ignore-existing - */ -object RightNodeSimHashScioAdhocApp extends RightNodeSimHashScioBaseApp { - override val isAdhoc: Boolean = true - override val rightNodeSimHashSnapshotDataset: SnapshotDALDataset[RightNodeSimHashSketch] = - RightNodeSimHashScioAdhocScalaDataset -} - -/** -To deploy the job: - -bin/d6w schedule \ - ${GCP_PROJECT_NAME}/us-central1/multi-type-graph-sim-hash-scio-batch-app \ - src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.d6w \ - --bind=profile.project=${GCP_PROJECT_NAME} \ - --bind=profile.user_name=recos-platform \ - --bind=profile.date="2021-12-01" \ - --bind=profile.machine="n2d-highmem-16" - */ -object RightNodeSimHashScioBatchApp extends RightNodeSimHashScioBaseApp { - override val isAdhoc: Boolean = false - override val rightNodeSimHashSnapshotDataset: SnapshotDALDataset[RightNodeSimHashSketch] = - RightNodeSimHashScioScalaDataset -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.docx new file mode 100644 index 000000000..0e768c660 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.scala b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.scala deleted file mode 100644 index e17fe5a15..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/RightNodeSimHashScioBaseApp.scala +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.simclusters_v2.scio -package multi_type_graph.multi_type_graph_sims - -import com.spotify.scio.ScioContext -import com.spotify.scio.coders.Coder -import com.spotify.scio.values.SCollection -import com.twitter.beam.io.dal.DAL -import com.twitter.beam.io.fs.multiformat.DiskFormat -import com.twitter.beam.io.fs.multiformat.PathLayout -import com.twitter.beam.job.DateRangeOptions -import com.twitter.dal.client.dataset.SnapshotDALDataset -import com.twitter.scio_internal.coders.ThriftStructLazyBinaryScroogeCoder -import com.twitter.scio_internal.job.ScioBeamJob -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.scio.multi_type_graph.common.MultiTypeGraphUtil -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.RightNodeSimHashSketch -import com.twitter.util.Duration -import com.twitter.wtf.dataflow.cosine_similarity.SimHashJob -import java.time.Instant - -trait RightNodeSimHashScioBaseApp extends ScioBeamJob[DateRangeOptions] with SimHashJob[RightNode] { - override implicit def scroogeCoder[T <: ThriftStruct: Manifest]: Coder[T] = - ThriftStructLazyBinaryScroogeCoder.scroogeCoder - override val ordering: Ordering[RightNode] = MultiTypeGraphUtil.rightNodeOrdering - - val isAdhoc: Boolean - val rightNodeSimHashSnapshotDataset: SnapshotDALDataset[RightNodeSimHashSketch] - val simsHashJobOutputDirectory: String = Config.simsHashJobOutputDirectory - - override def graph( - implicit sc: ScioContext, - ): SCollection[(Long, RightNode, Double)] = - MultiTypeGraphUtil.getTruncatedMultiTypeGraph(noOlderThan = Duration.fromDays(14)) - - override def configurePipeline(sc: ScioContext, opts: DateRangeOptions): Unit = { - implicit def scioContext: ScioContext = sc - - // DAL.Environment variable for WriteExecs - val dalEnv = if (isAdhoc) DAL.Environment.Dev else DAL.Environment.Prod - - val sketches = computeSimHashSketchesForWeightedGraph(graph) - .map { - case (rightNode, sketch, norm) => RightNodeSimHashSketch(rightNode, sketch, norm) - } - - // Write SimHashSketches to DAL - sketches - .saveAsCustomOutput( - name = "WriteSimHashSketches", - DAL.writeSnapshot( - rightNodeSimHashSnapshotDataset, - PathLayout.FixedPath( - ((if (!isAdhoc) - MultiTypeGraphUtil.RootThriftPath - else - MultiTypeGraphUtil.AdhocRootPath) - + simsHashJobOutputDirectory)), - Instant.ofEpochMilli(opts.interval.getEndMillis - 1L), - DiskFormat.Thrift(), - environmentOverride = dalEnv - ) - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.d6w deleted file mode 100644 index 2bdc591cf..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.d6w +++ /dev/null @@ -1,33 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - machine = Default(String, 'n2d-highmem-16') - -job = Job( - name='multi-type-graph-cosine-similarity-scio-adhoc-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-cosine-similarity-scio-adhoc-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT1H', - first_time='{{profile.date}}' - ) - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.docx new file mode 100644 index 000000000..8959f84d6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-adhoc.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.d6w deleted file mode 100644 index b88bcd094..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.d6w +++ /dev/null @@ -1,39 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'prod') - machine = Default(String, 'n2d-highmem-16') - -job = Job( - name='multi-type-graph-cosine-similarity-scio-batch-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-cosine-similarity-scio-batch-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - environment='prod', - statebird_config=StatebirdConfig( - batch_width='P1W', - first_time='{{profile.date}}' - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT50H' - ) -) - -jobs=[job] - diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.docx new file mode 100644 index 000000000..32c5adf01 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/cosine-similarity-scio-batch.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.d6w deleted file mode 100644 index ee653aabd..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.d6w +++ /dev/null @@ -1,33 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'dev') - machine = Default(String, 'n2d-highmem-16') - -job = Job( - name='multi-type-graph-sim-hash-scio-adhoc-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-sim-hash-scio-adhoc-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - statebird_config=StatebirdConfig( - batch_width='PT1H', - first_time='{{profile.date}}' - ) - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.docx new file mode 100644 index 000000000..2f631eee5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-adhoc.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.d6w b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.d6w deleted file mode 100644 index ff6a7b84c..000000000 --- a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.d6w +++ /dev/null @@ -1,38 +0,0 @@ -class Profile(Struct): - project = Required(String) - date = Required(String) - environment = Default(String, 'prod') - machine = Default(String, 'n2d-highmem-16') - -job = Job( - name='multi-type-graph-sim-hash-scio-batch-app', - project='{{profile.project}}', - staging_bucket='{{profile.project}}', - service_account='{{profile.user_name}}-shdw@twttr-dp-svc-accounts.iam.gserviceaccount.com', - region='us-central1', - worker_config=WorkerConfig( - num_workers=2, - worker_machine_type='{{profile.machine}}', - worker_disk_type=WorkerDiskType('HDD'), - ), - extra_args={ - "environment": '{{profile.environment}}', - "date": Quote('{{profile.date}}'), - }, - deployment_config=BatchDeploymentConfig( - role='{{profile.user_name}}', - build_target='src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims:multi-type-graph-sim-hash-scio-batch-app', - gcp_deployment_credentials='/var/lib/tss/keys/{{profile.user_name}}/cloud/gcp/dp/shadow.json', - environment='prod', - statebird_config=StatebirdConfig( - batch_width='P1W', - first_time='{{profile.date}}' - ), - workflow_config=WorkflowConfig( - play=True, - ), - timeout='PT20H' - ) -) - -jobs=[job] diff --git a/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.docx b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.docx new file mode 100644 index 000000000..120682b89 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/scio/multi_type_graph/multi_type_graph_sims/sim-hash-scio-batch.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.docx b/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.docx new file mode 100644 index 000000000..7e98009f7 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.scala b/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.scala deleted file mode 100644 index 31734f226..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/AggregatedScoreStore.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId, Score => ThriftScore} -import com.twitter.storehaus.ReadableStore - -/** - * A wrapper class, used to aggregate the scores calculated by other score stores. It relies on the - * results of other ScoreStores registered in the ScoreFacadeStore. - */ -trait AggregatedScoreStore extends ReadableStore[ThriftScoreId, ThriftScore] { - - // The underlyingScoreStore relies on [[ScoreFacadeStore]] to finish the dependency injection. - protected var scoreFacadeStore: ReadableStore[ThriftScoreId, ThriftScore] = ReadableStore.empty - - /** - * When registering this store in a ScoreFacadeStore, the facade store calls this function to - * provide references to other score stores. - */ - private[score] def set(facadeStore: ReadableStore[ThriftScoreId, ThriftScore]): Unit = { - this.synchronized { - scoreFacadeStore = facadeStore - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/score/BUILD b/src/scala/com/twitter/simclusters_v2/score/BUILD deleted file mode 100644 index 13e8c07f6..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "finagle/finagle-stats", - "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", - "src/scala/com/twitter/simclusters_v2/stores", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/score/BUILD.docx b/src/scala/com/twitter/simclusters_v2/score/BUILD.docx new file mode 100644 index 000000000..12ead8c79 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/Score.docx b/src/scala/com/twitter/simclusters_v2/score/Score.docx new file mode 100644 index 000000000..5b50c192c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/Score.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/Score.scala b/src/scala/com/twitter/simclusters_v2/score/Score.scala deleted file mode 100644 index c12acf97e..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/Score.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore} - -/** - * A uniform value type for all kinds of Calculation Score. - **/ -case class Score(score: Double) { - - implicit lazy val toThrift: ThriftScore = { - ThriftScore(score) - } -} - -object Score { - - /** - * Only support Double Type Thrift score - */ - implicit val fromThriftScore: ThriftScore => Score = { thriftScore => Score(thriftScore.score) } - -} diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.docx b/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.docx new file mode 100644 index 000000000..1e0c863f3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.scala b/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.scala deleted file mode 100644 index ac084e737..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/ScoreFacadeStore.scala +++ /dev/null @@ -1,103 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.finagle.stats.BroadcastStatsReceiver -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm -import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId} -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore} -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * Provide a uniform access layer for all kind of Score. - * @param readableStores readable stores indexed by the ScoringAlgorithm they implement - */ -class ScoreFacadeStore private ( - stores: Map[ScoringAlgorithm, ReadableStore[ThriftScoreId, ThriftScore]]) - extends ReadableStore[ThriftScoreId, ThriftScore] { - - override def get(k: ThriftScoreId): Future[Option[ThriftScore]] = { - findStore(k).get(k) - } - - // Override the multiGet for better batch performance. - override def multiGet[K1 <: ThriftScoreId](ks: Set[K1]): Map[K1, Future[Option[ThriftScore]]] = { - if (ks.isEmpty) { - Map.empty - } else { - val head = ks.head - val notSameType = ks.exists(k => k.algorithm != head.algorithm) - if (!notSameType) { - findStore(head).multiGet(ks) - } else { - // Generate a large amount temp objects. - // For better performance, avoid querying the multiGet with more than one kind of embedding - ks.groupBy(id => id.algorithm).flatMap { - case (_, ks) => - findStore(ks.head).multiGet(ks) - } - } - } - } - - // If not store mapping, fast return a IllegalArgumentException. - private def findStore(id: ThriftScoreId): ReadableStore[ThriftScoreId, ThriftScore] = { - stores.get(id.algorithm) match { - case Some(store) => store - case None => - throw new IllegalArgumentException(s"The Scoring Algorithm ${id.algorithm} doesn't exist.") - } - } - -} - -object ScoreFacadeStore { - /* - Build a ScoreFacadeStore which exposes stats for all requests (under "all") and per scoring algorithm: - - score_facade_store/all/ - score_facade_store// - - Stores in aggregatedStores may reference stores in readableStores. An instance of ScoreFacadeStore - is passed to them after instantiation. - */ - def buildWithMetrics( - readableStores: Map[ScoringAlgorithm, ReadableStore[ThriftScoreId, ThriftScore]], - aggregatedStores: Map[ScoringAlgorithm, AggregatedScoreStore], - statsReceiver: StatsReceiver - ) = { - val scopedStatsReceiver = statsReceiver.scope("score_facade_store") - - def wrapStore( - scoringAlgorithm: ScoringAlgorithm, - store: ReadableStore[ThriftScoreId, ThriftScore] - ): ReadableStore[ThriftScoreId, ThriftScore] = { - val sr = BroadcastStatsReceiver( - Seq( - scopedStatsReceiver.scope("all"), - scopedStatsReceiver.scope(scoringAlgorithm.name) - )) - ObservedReadableStore(store)(sr) - } - - val stores = (readableStores ++ aggregatedStores).map { - case (algo, store) => algo -> wrapStore(algo, store) - } - val store = new ScoreFacadeStore(stores = stores) - - /* - AggregatedScores aggregate scores from multiple non-aggregated stores. They access these via the - ScoreFacadeStore itself, and therefore must be passed an instance of it after it has been - constructed. - */ - assert( - readableStores.keySet.forall(algorithm => !aggregatedStores.keySet.contains(algorithm)), - "Keys for stores are disjoint") - - aggregatedStores.values.foreach(_.set(store)) - - store - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreId.docx b/src/scala/com/twitter/simclusters_v2/score/ScoreId.docx new file mode 100644 index 000000000..7dd83b6c8 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/ScoreId.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreId.scala b/src/scala/com/twitter/simclusters_v2/score/ScoreId.scala deleted file mode 100644 index da045ecda..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/ScoreId.scala +++ /dev/null @@ -1,129 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.common.SimClustersEmbeddingId._ -import com.twitter.simclusters_v2.thriftscala.{ - InternalId, - ScoreInternalId, - ScoringAlgorithm, - SimClustersEmbeddingId, - GenericPairScoreId => ThriftGenericPairScoreId, - ScoreId => ThriftScoreId, - SimClustersEmbeddingPairScoreId => ThriftSimClustersEmbeddingPairScoreId -} - -/** - * A uniform Identifier type for all kinds of Calculation Score. - **/ -trait ScoreId { - - def algorithm: ScoringAlgorithm - - /** - * Convert to a Thrift object. Throw a exception if the operation is not override. - */ - implicit def toThrift: ThriftScoreId = - throw new UnsupportedOperationException(s"ScoreId $this doesn't support Thrift format") -} - -object ScoreId { - - implicit val fromThriftScoreId: ThriftScoreId => ScoreId = { - case scoreId @ ThriftScoreId(_, ScoreInternalId.GenericPairScoreId(_)) => - PairScoreId.fromThriftScoreId(scoreId) - case scoreId @ ThriftScoreId(_, ScoreInternalId.SimClustersEmbeddingPairScoreId(_)) => - SimClustersEmbeddingPairScoreId.fromThriftScoreId(scoreId) - } - -} - -/** - * Generic Internal pairwise id. Support all the subtypes in InternalId, which includes TweetId, - * UserId, EntityId and more combination ids. - **/ -trait PairScoreId extends ScoreId { - - def id1: InternalId - def id2: InternalId - - override implicit lazy val toThrift: ThriftScoreId = { - ThriftScoreId( - algorithm, - ScoreInternalId.GenericPairScoreId(ThriftGenericPairScoreId(id1, id2)) - ) - } -} - -object PairScoreId { - - // The default PairScoreId assume id1 <= id2. It used to increase the cache hit rate. - def apply(algorithm: ScoringAlgorithm, id1: InternalId, id2: InternalId): PairScoreId = { - if (internalIdOrdering.lteq(id1, id2)) { - DefaultPairScoreId(algorithm, id1, id2) - } else { - DefaultPairScoreId(algorithm, id2, id1) - } - } - - private case class DefaultPairScoreId( - algorithm: ScoringAlgorithm, - id1: InternalId, - id2: InternalId) - extends PairScoreId - - implicit val fromThriftScoreId: ThriftScoreId => PairScoreId = { - case ThriftScoreId(algorithm, ScoreInternalId.GenericPairScoreId(pairScoreId)) => - DefaultPairScoreId(algorithm, pairScoreId.id1, pairScoreId.id2) - case ThriftScoreId(algorithm, ScoreInternalId.SimClustersEmbeddingPairScoreId(pairScoreId)) => - SimClustersEmbeddingPairScoreId(algorithm, pairScoreId.id1, pairScoreId.id2) - } - -} - -/** - * ScoreId for a pair of SimClustersEmbedding. - * Used for dot product, cosine similarity and other basic embedding operations. - */ -trait SimClustersEmbeddingPairScoreId extends PairScoreId { - def embeddingId1: SimClustersEmbeddingId - - def embeddingId2: SimClustersEmbeddingId - - override def id1: InternalId = embeddingId1.internalId - - override def id2: InternalId = embeddingId2.internalId - - override implicit lazy val toThrift: ThriftScoreId = { - ThriftScoreId( - algorithm, - ScoreInternalId.SimClustersEmbeddingPairScoreId( - ThriftSimClustersEmbeddingPairScoreId(embeddingId1, embeddingId2)) - ) - } -} - -object SimClustersEmbeddingPairScoreId { - - // The default PairScoreId assume id1 <= id2. It used to increase the cache hit rate. - def apply( - algorithm: ScoringAlgorithm, - id1: SimClustersEmbeddingId, - id2: SimClustersEmbeddingId - ): SimClustersEmbeddingPairScoreId = { - if (simClustersEmbeddingIdOrdering.lteq(id1, id2)) { - DefaultSimClustersEmbeddingPairScoreId(algorithm, id1, id2) - } else { - DefaultSimClustersEmbeddingPairScoreId(algorithm, id2, id1) - } - } - - private case class DefaultSimClustersEmbeddingPairScoreId( - algorithm: ScoringAlgorithm, - embeddingId1: SimClustersEmbeddingId, - embeddingId2: SimClustersEmbeddingId) - extends SimClustersEmbeddingPairScoreId - - implicit val fromThriftScoreId: ThriftScoreId => SimClustersEmbeddingPairScoreId = { - case ThriftScoreId(algorithm, ScoreInternalId.SimClustersEmbeddingPairScoreId(pairScoreId)) => - SimClustersEmbeddingPairScoreId(algorithm, pairScoreId.id1, pairScoreId.id2) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreStore.docx b/src/scala/com/twitter/simclusters_v2/score/ScoreStore.docx new file mode 100644 index 000000000..285dad21f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/ScoreStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/ScoreStore.scala b/src/scala/com/twitter/simclusters_v2/score/ScoreStore.scala deleted file mode 100644 index 3aea91e1a..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/ScoreStore.scala +++ /dev/null @@ -1,72 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore, ScoreId => ThriftScoreId} -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * A Score Store is a readableStore with ScoreId as Key and Score as the Value. - * It also needs to include the algorithm type. - * A algorithm type should only be used by one Score Store in the application. - */ -trait ScoreStore[K <: ScoreId] extends ReadableStore[K, Score] { - - def fromThriftScoreId: ThriftScoreId => K - - // Convert to a Thrift version. - def toThriftStore: ReadableStore[ThriftScoreId, ThriftScore] = { - this - .composeKeyMapping[ThriftScoreId](fromThriftScoreId) - .mapValues(_.toThrift) - } -} - -/** - * A generic Pairwise Score store. - * Requires provide both left and right side feature hydration. - */ -trait PairScoreStore[K <: PairScoreId, K1, K2, V1, V2] extends ScoreStore[K] { - - def compositeKey1: K => K1 - def compositeKey2: K => K2 - - // Left side feature hydration - def underlyingStore1: ReadableStore[K1, V1] - - // Right side feature hydration - def underlyingStore2: ReadableStore[K2, V2] - - def score: (V1, V2) => Future[Option[Double]] - - override def get(k: K): Future[Option[Score]] = { - for { - vs <- - Future.join(underlyingStore1.get(compositeKey1(k)), underlyingStore2.get(compositeKey2(k))) - v <- vs match { - case (Some(v1), Some(v2)) => - score(v1, v2) - case _ => - Future.None - } - } yield { - v.map(buildScore) - } - } - - override def multiGet[KK <: K](ks: Set[KK]): Map[KK, Future[Option[Score]]] = { - - val v1Map = underlyingStore1.multiGet(ks.map { k => compositeKey1(k) }) - val v2Map = underlyingStore2.multiGet(ks.map { k => compositeKey2(k) }) - - ks.map { k => - k -> Future.join(v1Map(compositeKey1(k)), v2Map(compositeKey2(k))).flatMap { - case (Some(v1), Some(v2)) => - score(v1, v2).map(_.map(buildScore)) - case _ => - Future.value(None) - } - }.toMap - } - - private def buildScore(v: Double): Score = Score(v) -} diff --git a/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.docx b/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.docx new file mode 100644 index 000000000..19805c507 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.scala b/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.scala deleted file mode 100644 index ef0143711..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/SimClustersEmbeddingPairScoreStore.scala +++ /dev/null @@ -1,201 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbeddingId, ScoreId => ThriftScoreId} -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -object SimClustersEmbeddingPairScoreStore { - - /** - * Internal Instance of a SimClusters Embedding based Pair Score store. - */ - private case class SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - score: (SimClustersEmbedding, SimClustersEmbedding) => Future[Option[Double]]) - extends PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] { - - override val compositeKey1: SimClustersEmbeddingPairScoreId => SimClustersEmbeddingId = - _.embeddingId1 - override val compositeKey2: SimClustersEmbeddingPairScoreId => SimClustersEmbeddingId = - _.embeddingId2 - - override def underlyingStore1: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = - simClustersEmbeddingStore - - override def underlyingStore2: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = - simClustersEmbeddingStore - - override def fromThriftScoreId: ThriftScoreId => SimClustersEmbeddingPairScoreId = - SimClustersEmbeddingPairScoreId.fromThriftScoreId - } - - def buildDotProductStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def dotProduct: (SimClustersEmbedding, SimClustersEmbedding) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.dotProduct(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - dotProduct - ) - } - - def buildCosineSimilarityStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def cosineSimilarity: (SimClustersEmbedding, SimClustersEmbedding) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.cosineSimilarity(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - cosineSimilarity - ) - } - - def buildLogCosineSimilarityStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def logNormCosineSimilarity: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.logNormCosineSimilarity(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - logNormCosineSimilarity - ) - } - - def buildExpScaledCosineSimilarityStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def expScaledCosineSimilarity: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.expScaledCosineSimilarity(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - expScaledCosineSimilarity - ) - } - - def buildJaccardSimilarityStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def jaccardSimilarity: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.jaccardSimilarity(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - jaccardSimilarity - ) - } - - def buildEuclideanDistanceStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def euclideanDistance: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.euclideanDistance(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - euclideanDistance - ) - } - - def buildManhattanDistanceStore( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): PairScoreStore[ - SimClustersEmbeddingPairScoreId, - SimClustersEmbeddingId, - SimClustersEmbeddingId, - SimClustersEmbedding, - SimClustersEmbedding - ] = { - - def manhattanDistance: ( - SimClustersEmbedding, - SimClustersEmbedding - ) => Future[Option[Double]] = { - case (embedding1, embedding2) => - Future.value(Some(embedding1.manhattanDistance(embedding2))) - } - - SimClustersEmbeddingInternalPairScoreStore( - simClustersEmbeddingStore, - manhattanDistance - ) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.docx b/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.docx new file mode 100644 index 000000000..94cb69461 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.scala b/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.scala deleted file mode 100644 index 8c1552c95..000000000 --- a/src/scala/com/twitter/simclusters_v2/score/WeightedSumAggregatedScoreStore.scala +++ /dev/null @@ -1,84 +0,0 @@ -package com.twitter.simclusters_v2.score - -import com.twitter.simclusters_v2.score.WeightedSumAggregatedScoreStore.WeightedSumAggregatedScoreParameter -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - GenericPairScoreId, - ModelVersion, - ScoreInternalId, - ScoringAlgorithm, - SimClustersEmbeddingId, - Score => ThriftScore, - ScoreId => ThriftScoreId, - SimClustersEmbeddingPairScoreId => ThriftSimClustersEmbeddingPairScoreId -} -import com.twitter.util.Future - -/** - * A generic store wrapper to aggregate the scores of N underlying stores in a weighted fashion. - * - */ -case class WeightedSumAggregatedScoreStore(parameters: Seq[WeightedSumAggregatedScoreParameter]) - extends AggregatedScoreStore { - - override def get(k: ThriftScoreId): Future[Option[ThriftScore]] = { - val underlyingScores = parameters.map { parameter => - scoreFacadeStore - .get(ThriftScoreId(parameter.scoreAlgorithm, parameter.idTransform(k.internalId))) - .map(_.map(s => parameter.scoreTransform(s.score) * parameter.weight)) - } - Future.collect(underlyingScores).map { scores => - if (scores.exists(_.nonEmpty)) { - val newScore = scores.foldLeft(0.0) { - case (sum, maybeScore) => - sum + maybeScore.getOrElse(0.0) - } - Some(ThriftScore(score = newScore)) - } else { - // Return None if all of the underlying score is None. - None - } - } - } -} - -object WeightedSumAggregatedScoreStore { - - /** - * The parameter of WeightedSumAggregatedScoreStore. Create 0 to N parameters for a WeightedSum - * AggregatedScore Store. Please evaluate the performance before productionization any new score. - * - * @param scoreAlgorithm the underlying score algorithm name - * @param weight contribution to weighted sum of this sub-score - * @param idTransform transform the source ScoreInternalId to underlying score InternalId. - * @param scoreTransform function to apply to sub-score before adding to weighted sum - */ - case class WeightedSumAggregatedScoreParameter( - scoreAlgorithm: ScoringAlgorithm, - weight: Double, - idTransform: ScoreInternalId => ScoreInternalId, - scoreTransform: Double => Double = identityScoreTransform) - - val SameTypeScoreInternalIdTransform: ScoreInternalId => ScoreInternalId = { id => id } - val identityScoreTransform: Double => Double = { score => score } - - // Convert Generic Internal Id to a SimClustersEmbeddingId - def genericPairScoreIdToSimClustersEmbeddingPairScoreId( - embeddingType1: EmbeddingType, - embeddingType2: EmbeddingType, - modelVersion: ModelVersion - ): ScoreInternalId => ScoreInternalId = { - case id: ScoreInternalId.GenericPairScoreId => - ScoreInternalId.SimClustersEmbeddingPairScoreId( - ThriftSimClustersEmbeddingPairScoreId( - SimClustersEmbeddingId(embeddingType1, modelVersion, id.genericPairScoreId.id1), - SimClustersEmbeddingId(embeddingType2, modelVersion, id.genericPairScoreId.id2) - )) - } - - val simClustersEmbeddingPairScoreIdToGenericPairScoreId: ScoreInternalId => ScoreInternalId = { - case ScoreInternalId.SimClustersEmbeddingPairScoreId(simClustersId) => - ScoreInternalId.GenericPairScoreId( - GenericPairScoreId(simClustersId.id1.internalId, simClustersId.id2.internalId)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/BUILD b/src/scala/com/twitter/simclusters_v2/stores/BUILD deleted file mode 100644 index 11bc8e7e6..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/storehaus:core", - "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/storehaus_internal/manhattan", - "src/scala/com/twitter/storehaus_internal/util", - "src/scala/com/twitter/wtf/scalding/jobs/injection", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "storage/clients/manhattan/client/src/main/scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/stores/BUILD.docx b/src/scala/com/twitter/simclusters_v2/stores/BUILD.docx new file mode 100644 index 000000000..e09e8bf74 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.docx new file mode 100644 index 000000000..ca2816877 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.scala deleted file mode 100644 index e461e1ed2..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/LanguageFilteredLocaleEntityEmbeddingStore.scala +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.simclusters_v2.stores - -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.ClusterDetails -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * Transfer a Entity SimClustersEmbedding to a language filtered embedding. - * The new embedding only contains clusters whose main language is the same as the language field in - * the SimClustersEmbeddingId. - * - * This store is special designed for Topic Tweet and Topic Follow Prompt. - * Only support new Ids whose internalId is LocaleEntityId. - */ -@deprecated -case class LanguageFilteredLocaleEntityEmbeddingStore( - underlyingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - clusterDetailsStore: ReadableStore[(ModelVersion, ClusterId), ClusterDetails], - composeKeyMapping: SimClustersEmbeddingId => SimClustersEmbeddingId) - extends ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] { - - import LanguageFilteredLocaleEntityEmbeddingStore._ - - override def get(k: SimClustersEmbeddingId): Future[Option[SimClustersEmbedding]] = { - for { - maybeEmbedding <- underlyingStore.get(composeKeyMapping(k)) - maybeFilteredEmbedding <- maybeEmbedding match { - case Some(embedding) => - embeddingsLanguageFilter(k, embedding).map(Some(_)) - case None => - Future.None - } - } yield maybeFilteredEmbedding - } - - private def embeddingsLanguageFilter( - sourceEmbeddingId: SimClustersEmbeddingId, - simClustersEmbedding: SimClustersEmbedding - ): Future[SimClustersEmbedding] = { - val language = getLanguage(sourceEmbeddingId) - val modelVersion = sourceEmbeddingId.modelVersion - - val clusterDetailKeys = simClustersEmbedding.sortedClusterIds.map { clusterId => - (modelVersion, clusterId) - }.toSet - - Future - .collect { - clusterDetailsStore.multiGet(clusterDetailKeys) - }.map { clusterDetailsMap => - simClustersEmbedding.embedding.filter { - case (clusterId, _) => - isDominantLanguage( - language, - clusterDetailsMap.getOrElse((modelVersion, clusterId), None)) - } - }.map(SimClustersEmbedding(_)) - } - - private def isDominantLanguage( - requestLang: String, - clusterDetails: Option[ClusterDetails] - ): Boolean = - clusterDetails match { - case Some(details) => - val dominantLanguage = - details.languageToFractionDeviceLanguage.map { langMap => - langMap.maxBy { - case (_, score) => score - }._1 - } - - dominantLanguage.exists(_.equalsIgnoreCase(requestLang)) - case _ => true - } - -} - -object LanguageFilteredLocaleEntityEmbeddingStore { - - def getLanguage(simClustersEmbeddingId: SimClustersEmbeddingId): String = { - simClustersEmbeddingId match { - case SimClustersEmbeddingId(_, _, InternalId.LocaleEntityId(localeEntityId)) => - localeEntityId.language - case _ => - throw new IllegalArgumentException( - s"The Id $simClustersEmbeddingId doesn't contain Locale info") - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.docx b/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.docx new file mode 100644 index 000000000..2a03d6cc9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.scala b/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.scala deleted file mode 100644 index 656a61696..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/MultiTypeGraphStore.scala +++ /dev/null @@ -1,287 +0,0 @@ -package com.twitter.simclusters_v2.stores -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.common.Language -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.LeftNode -import com.twitter.simclusters_v2.thriftscala.NounWithFrequencyList -import com.twitter.simclusters_v2.thriftscala.RightNode -import com.twitter.simclusters_v2.thriftscala.RightNodeTypeStruct -import com.twitter.simclusters_v2.thriftscala.RightNodeWithEdgeWeightList -import com.twitter.simclusters_v2.thriftscala.SimilarRightNodes -import com.twitter.simclusters_v2.thriftscala.CandidateTweetsList -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.Apollo -import com.twitter.storehaus_internal.manhattan.ManhattanRO -import com.twitter.storehaus_internal.manhattan.ManhattanROConfig -import com.twitter.storehaus_internal.util.ApplicationID -import com.twitter.storehaus_internal.util.DatasetName -import com.twitter.storehaus_internal.util.HDFSPath -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Long2BigEndian -import com.twitter.simclusters_v2.thriftscala.FullClusterId -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores - -object MultiTypeGraphStore { - - implicit val leftNodesInject: Injection[LeftNode, Array[Byte]] = - CompactScalaCodec(LeftNode) - implicit val truncatedMultiTypeGraphInject: Injection[RightNodeWithEdgeWeightList, Array[Byte]] = - CompactScalaCodec(RightNodeWithEdgeWeightList) - implicit val topKNounsListInject: Injection[NounWithFrequencyList, Array[Byte]] = - CompactScalaCodec(NounWithFrequencyList) - implicit val rightNodesStructInject: Injection[RightNodeTypeStruct, Array[Byte]] = - CompactScalaCodec(RightNodeTypeStruct) - implicit val similarRightNodesStructInject: Injection[SimilarRightNodes, Array[Byte]] = - CompactScalaCodec(SimilarRightNodes) - implicit val rightNodesInject: Injection[RightNode, Array[Byte]] = - CompactScalaCodec(RightNode) - implicit val tweetCandidatesInject: Injection[CandidateTweetsList, Array[Byte]] = - CompactScalaCodec(CandidateTweetsList) - implicit val fullClusterIdInject: Injection[FullClusterId, Array[Byte]] = - CompactScalaCodec(FullClusterId) - implicit val topKTweetsWithScoresInject: Injection[TopKTweetsWithScores, Array[Byte]] = - CompactScalaCodec(TopKTweetsWithScores) - implicit val clustersUserIsInterestedInInjection: Injection[ClustersUserIsInterestedIn, Array[ - Byte - ]] = - CompactScalaCodec(ClustersUserIsInterestedIn) - - private val appId = "multi_type_simclusters" - - def getTruncatedMultiTypeGraphRightNodesForUser( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[LeftNode, RightNodeWithEdgeWeightList] = { - ManhattanRO.getReadableStoreWithMtls[LeftNode, RightNodeWithEdgeWeightList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("mts_user_truncated_graph"), - Apollo - ), - mhMtlsParams - ) - } - - def getTopKNounsForRightNodeType( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[RightNodeTypeStruct, NounWithFrequencyList] = { - ManhattanRO.getReadableStoreWithMtls[RightNodeTypeStruct, NounWithFrequencyList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("mts_topk_frequent_nouns"), - Apollo - ), - mhMtlsParams - ) - } - - def getTopKSimilarRightNodes( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[RightNode, SimilarRightNodes] = { - ManhattanRO.getReadableStoreWithMtls[RightNode, SimilarRightNodes]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("mts_topk_similar_right_nodes_scio"), - Apollo - ), - mhMtlsParams - ) - } - - def getOfflineTweetMTSCandidateStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, CandidateTweetsList] = { - ManhattanRO.getReadableStoreWithMtls[Long, CandidateTweetsList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("offline_tweet_recommendations_from_mts_consumer_embeddings"), - Apollo - ), - mhMtlsParams - ) - } - - def getOfflineTweet2020CandidateStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, CandidateTweetsList] = { - ManhattanRO.getReadableStoreWithMtls[Long, CandidateTweetsList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("offline_tweet_recommendations_from_interestedin_2020"), - Apollo - ), - mhMtlsParams - ) - } - - def getVideoViewBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("video_view_based_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getRetweetBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("retweet_based_simclusters_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getReplyBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("reply_based_simclusters_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getPushOpenBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("push_open_based_simclusters_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getAdsFavBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("ads_fav_based_simclusters_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getAdsFavClickBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("ads_fav_click_based_simclusters_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getFTRPop1000BasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("ftr_pop1000_rank_decay_1_1_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getFTRPop10000BasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("ftr_pop10000_rank_decay_1_1_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getOONFTRPop1000BasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("oon_ftr_pop1000_rnkdecay_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getOfflineLogFavBasedTweetBasedClusterTopKTweets( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[FullClusterId, TopKTweetsWithScores] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("decayed_sum_cluster_to_tweet_index"), - Apollo - ), - mhMtlsParams - ) - } - - def getGlobalSimClustersLanguageEmbeddings( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Language, ClustersUserIsInterestedIn] = { - ManhattanRO - .getReadableStoreWithMtls[Language, ClustersUserIsInterestedIn]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("global_simclusters_language_embeddings"), - Apollo - ), - mhMtlsParams - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.docx new file mode 100644 index 000000000..057d5c38a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.scala deleted file mode 100644 index 62785e205..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/SimClustersEmbeddingStore.scala +++ /dev/null @@ -1,120 +0,0 @@ -package com.twitter.simclusters_v2.stores - -import com.twitter.decider.Decider -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.hermit.store.common.DeciderableReadableStore -import com.twitter.servo.decider.DeciderKeyEnum -import com.twitter.simclusters_v2.common.DeciderGateBuilderWithIdHashing -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * Facade of all SimClusters Embedding Store. - * Provide a uniform access layer for all kind of SimClusters Embedding. - */ -case class SimClustersEmbeddingStore( - stores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ]) extends ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] { - - private val lookupStores = - stores - .groupBy(_._1._1).mapValues(_.map { - case ((_, modelVersion), store) => - modelVersion -> store - }) - - override def get(k: SimClustersEmbeddingId): Future[Option[SimClustersEmbedding]] = { - findStore(k) match { - case Some(store) => store.get(k) - case None => Future.None - } - } - - // Override the multiGet for better batch performance. - override def multiGet[K1 <: SimClustersEmbeddingId]( - ks: Set[K1] - ): Map[K1, Future[Option[SimClustersEmbedding]]] = { - if (ks.isEmpty) { - Map.empty - } else { - val head = ks.head - val notSameType = - ks.exists(k => k.embeddingType != head.embeddingType || k.modelVersion != head.modelVersion) - if (!notSameType) { - findStore(head) match { - case Some(store) => store.multiGet(ks) - case None => ks.map(_ -> Future.None).toMap - } - } else { - // Generate a large amount temp objects. - // For better performance, avoid querying the multiGet with more than one kind of embedding - ks.groupBy(id => (id.embeddingType, id.modelVersion)).flatMap { - case ((_, _), ks) => - findStore(ks.head) match { - case Some(store) => store.multiGet(ks) - case None => ks.map(_ -> Future.None).toMap - } - } - } - } - } - - private def findStore( - id: SimClustersEmbeddingId - ): Option[ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]] = { - lookupStores.get(id.embeddingType).flatMap(_.get(id.modelVersion)) - } - -} - -object SimClustersEmbeddingStore { - /* - Build a SimClustersEmbeddingStore which wraps all stores in DeciderableReadableStore - */ - def buildWithDecider( - underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ], - decider: Decider, - statsReceiver: StatsReceiver - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - // To allow for lazy adding of decider config to enable / disable stores, if a value is not found - // fall back on returning true (equivalent to availability of 10000) - // This overrides default availability of 0 when not decider value is not found - val deciderGateBuilder = new DeciderGateBuilderWithIdHashing(decider.orElse(Decider.True)) - - val deciderKeyEnum = new DeciderKeyEnum { - underlyingStores.keySet.map(key => Value(s"enable_${key._1.name}_${key._2.name}")) - } - - def wrapStore( - embeddingType: EmbeddingType, - modelVersion: ModelVersion, - store: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val gate = deciderGateBuilder.idGateWithHashing[SimClustersEmbeddingId]( - deciderKeyEnum.withName(s"enable_${embeddingType.name}_${modelVersion.name}")) - - DeciderableReadableStore( - underlying = store, - gate = gate, - statsReceiver = statsReceiver.scope(embeddingType.name, modelVersion.name) - ) - } - - val stores = underlyingStores.map { - case ((embeddingType, modelVersion), store) => - (embeddingType, modelVersion) -> wrapStore(embeddingType, modelVersion, store) - } - - new SimClustersEmbeddingStore(stores = stores) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.docx new file mode 100644 index 000000000..741dbca6f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.scala deleted file mode 100644 index 0a520439e..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/SimClustersMultiEmbeddingStore.scala +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.simclusters_v2.stores - -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.SimClustersMultiEmbeddingId._ -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersMultiEmbedding, - SimClustersEmbeddingId, - SimClustersMultiEmbeddingId -} -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * The helper methods for SimClusters Multi-Embedding based ReadableStore - */ -object SimClustersMultiEmbeddingStore { - - /** - * Only support the Values based Multi-embedding transformation. - */ - case class SimClustersMultiEmbeddingWrapperStore( - sourceStore: ReadableStore[SimClustersMultiEmbeddingId, SimClustersMultiEmbedding]) - extends ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] { - - override def get(k: SimClustersEmbeddingId): Future[Option[SimClustersEmbedding]] = { - sourceStore.get(toMultiEmbeddingId(k)).map(_.map(toSimClustersEmbedding(k, _))) - } - - // Override the multiGet for better batch performance. - override def multiGet[K1 <: SimClustersEmbeddingId]( - ks: Set[K1] - ): Map[K1, Future[Option[SimClustersEmbedding]]] = { - if (ks.isEmpty) { - Map.empty - } else { - // Aggregate multiple get requests by MultiEmbeddingId - val multiEmbeddingIds = ks.map { k => - k -> toMultiEmbeddingId(k) - }.toMap - - val multiEmbeddings = sourceStore.multiGet(multiEmbeddingIds.values.toSet) - ks.map { k => - k -> multiEmbeddings(multiEmbeddingIds(k)).map(_.map(toSimClustersEmbedding(k, _))) - }.toMap - } - } - - private def toSimClustersEmbedding( - id: SimClustersEmbeddingId, - multiEmbedding: SimClustersMultiEmbedding - ): SimClustersEmbedding = { - multiEmbedding match { - case SimClustersMultiEmbedding.Values(values) => - val subId = toSubId(id) - if (subId >= values.embeddings.size) { - throw new IllegalArgumentException( - s"SimClustersMultiEmbeddingId $id is over the size of ${values.embeddings.size}") - } else { - values.embeddings(subId).embedding - } - case _ => - throw new IllegalArgumentException( - s"Invalid SimClustersMultiEmbedding $id, $multiEmbedding") - } - } - } - - def toSimClustersEmbeddingStore( - sourceStore: ReadableStore[SimClustersMultiEmbeddingId, SimClustersMultiEmbedding] - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - SimClustersMultiEmbeddingWrapperStore(sourceStore) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.docx b/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.docx new file mode 100644 index 000000000..c04e616d7 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.scala b/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.scala deleted file mode 100644 index c733ed157..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/TopicTopProducersStore.scala +++ /dev/null @@ -1,87 +0,0 @@ -package com.twitter.simclusters_v2.stores - -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.recos.entities.thriftscala.{SemanticCoreEntityWithLocale, UserScoreList} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.{Athena, ManhattanRO, ManhattanROConfig} -import com.twitter.storehaus_internal.util.{ApplicationID, DatasetName, HDFSPath} - -object TopicTopProducersStore { - val appIdDevel = "recos_platform_dev" - val v2DatasetNameDevel = "topic_producers_em" - val v3DatasetNameDevel = "topic_producers_agg" - val v4DatasetNameDevel = "topic_producers_em_erg" - - val appIdProd = "simclusters_v2" - val v1DatasetNameProd = "top_producers_for_topic_from_topic_follow_graph" - val v2DatasetNameProd = "top_producers_for_topic_em" - - implicit val keyInj = CompactScalaCodec(SemanticCoreEntityWithLocale) - implicit val valInj = CompactScalaCodec(UserScoreList) - - def getTopicTopProducerStoreV1Prod( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[SemanticCoreEntityWithLocale, UserScoreList] = - ManhattanRO.getReadableStoreWithMtls[SemanticCoreEntityWithLocale, UserScoreList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appIdProd), - DatasetName(v1DatasetNameProd), - Athena - ), - mhMtlsParams - ) - - def getTopicTopProducerStoreV2Devel( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[SemanticCoreEntityWithLocale, UserScoreList] = - ManhattanRO.getReadableStoreWithMtls[SemanticCoreEntityWithLocale, UserScoreList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appIdDevel), - DatasetName(v2DatasetNameDevel), - Athena - ), - mhMtlsParams - ) - - def getTopicTopProducerStoreV2Prod( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[SemanticCoreEntityWithLocale, UserScoreList] = - ManhattanRO.getReadableStoreWithMtls[SemanticCoreEntityWithLocale, UserScoreList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appIdProd), - DatasetName(v2DatasetNameProd), - Athena - ), - mhMtlsParams - ) - - def getTopicTopProducerStoreV3Devel( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[SemanticCoreEntityWithLocale, UserScoreList] = - ManhattanRO.getReadableStoreWithMtls[SemanticCoreEntityWithLocale, UserScoreList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appIdDevel), - DatasetName(v3DatasetNameDevel), - Athena - ), - mhMtlsParams - ) - - def getTopicTopProducerStoreV4Devel( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[SemanticCoreEntityWithLocale, UserScoreList] = - ManhattanRO.getReadableStoreWithMtls[SemanticCoreEntityWithLocale, UserScoreList]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appIdDevel), - DatasetName(v4DatasetNameDevel), - Athena - ), - mhMtlsParams - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.docx b/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.docx new file mode 100644 index 000000000..7e55b3371 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.scala b/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.scala deleted file mode 100644 index 471d4bf2b..000000000 --- a/src/scala/com/twitter/simclusters_v2/stores/WtfMbcgStore.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.simclusters_v2.stores - -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.{ - Long2BigEndian, - ScalaBinaryThrift -} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.{Apollo, ManhattanRO, ManhattanROConfig} -import com.twitter.storehaus_internal.util.{ApplicationID, DatasetName, HDFSPath} -import com.twitter.wtf.candidate.thriftscala.CandidateSeq - -object WtfMbcgStore { - - val appId = "recos_platform_apollo" - - implicit val keyInj = Long2BigEndian - implicit val valInj = ScalaBinaryThrift(CandidateSeq) - - def getWtfMbcgStore( - mhMtlsParams: ManhattanKVClientMtlsParams, - datasetName: String - ): ReadableStore[Long, CandidateSeq] = { - ManhattanRO.getReadableStoreWithMtls[Long, CandidateSeq]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName(datasetName), - Apollo - ), - mhMtlsParams - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/BUILD b/src/scala/com/twitter/simclusters_v2/summingbird/BUILD deleted file mode 100644 index f01857d26..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/BUILD +++ /dev/null @@ -1,118 +0,0 @@ -scala_library( - name = "common", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/summingbird/common", - ], -) - -scala_library( - name = "stores", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/summingbird/stores", - ], -) - -scala_library( - name = "webservice", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/simclusters_v2/summingbird/webservice", - "twitter-server/slf4j-jdk14/src/main/scala/com/twitter/server/logging", - ], -) - -heron_binary( - name = "tweet-simclusters-storm-binary", - main = "com.twitter.simclusters_v2.summingbird.storm.TweetJobRunner", - platform = "java8", - runtime_platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":common", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "src/scala/com/twitter/simclusters_v2/summingbird/storm", - ], -) - -jvm_app( - name = "tweet-simclusters-storm-job", - binary = ":tweet-simclusters-storm-binary", - bundles = [ - bundle( - fileset = ["config/jaas.conf"], - ), - ], - tags = ["bazel-compatible"], -) - -heron_binary( - name = "persistent-tweet-simclusters-storm-binary", - main = "com.twitter.simclusters_v2.summingbird.storm.PersistentTweetJobRunner", - platform = "java8", - runtime_platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":common", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "src/scala/com/twitter/simclusters_v2/summingbird/storm", - ], -) - -jvm_app( - name = "persistent-tweet-simclusters-storm-job", - binary = ":persistent-tweet-simclusters-storm-binary", - bundles = [ - bundle( - fileset = ["config/jaas.conf"], - ), - ], - tags = ["bazel-compatible"], -) - -heron_binary( - name = "multi-model-tweet-simclusters-storm-binary", - main = "com.twitter.simclusters_v2.summingbird.storm.MultiModelTweetJobRunner", - platform = "java8", - runtime_platform = "java8", - dependencies = [ - ":common", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - "src/scala/com/twitter/simclusters_v2/summingbird/storm", - ], -) - -jvm_app( - name = "multi-model-tweet-simclusters-storm-job", - binary = ":multi-model-tweet-simclusters-storm-binary", - bundles = [ - bundle( - fileset = ["config/jaas.conf"], - ), - ], -) - -jvm_binary( - name = "repl", - basename = "repl-simclusters_v2", - main = "scala.tools.nsc.MainGenericRunner", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":common", - "3rdparty/jvm/org/scala-lang:scala-compiler", - ], -) - -target( - dependencies = [ - ":common", - ":repl", - ":stores", - ":webservice", - "src/scala/com/twitter/simclusters_v2/summingbird/storm", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/BUILD.docx b/src/scala/com/twitter/simclusters_v2/summingbird/BUILD.docx new file mode 100644 index 000000000..62a51e48d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/README.docx b/src/scala/com/twitter/simclusters_v2/summingbird/README.docx new file mode 100644 index 000000000..88f7e4d24 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/README.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/README.md b/src/scala/com/twitter/simclusters_v2/summingbird/README.md deleted file mode 100644 index 026df3a26..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/README.md +++ /dev/null @@ -1,4 +0,0 @@ -Simclusters v2 Online Tweet Embedding Pipeline -============================================== - -The Heron jobs generate the tweet embedding and index of tweets for SimClusters, as well as persistenting the tweet embeddings from MemCache into Manhattan. \ No newline at end of file diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD b/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD deleted file mode 100644 index 0912b12fe..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD +++ /dev/null @@ -1,62 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/bijection:core", - "3rdparty/jvm/com/twitter/bijection:util", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/summingbird:client", - "cuad/projects/ner/client", - "cuad/projects/ner/thrift/src/main/thrift:thrift-scala", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "src/scala/com/twitter/algebird_internal/injection", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/storehaus_internal/manhattan", - "src/scala/com/twitter/storehaus_internal/manhattan/config", - "src/scala/com/twitter/storehaus_internal/memcache", - "src/scala/com/twitter/storehaus_internal/memcache/config", - "src/scala/com/twitter/storehaus_internal/offline", - "src/scala/com/twitter/storehaus_internal/online", - "src/scala/com/twitter/storehaus_internal/util", - "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", - "src/scala/com/twitter/summingbird_internal/runner/store_config", - "src/scala/com/twitter/taxi/util/text", - "src/scala/com/twitter/wtf/summingbird/sources/common", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/timelineservice/server/internal:thrift-scala", - "src/thrift/com/twitter/tweetypie:tweet-scala", - "src/thrift/com/twitter/wtf/interest:interest-thrift-scala", - "stitch/stitch-core", - "stitch/stitch-storehaus/src/main/scala", - ], -) - -## smaller build target for external usage -scala_library( - name = "util", - sources = [ - "Configs.scala", - "Implicits.scala", - "ModelVersionProfile.scala", - "Monoids.scala", - "ThriftDecayedValueMonoid.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/bijection:core", - "3rdparty/jvm/com/twitter/bijection:util", - "3rdparty/src/jvm/com/twitter/summingbird:batch", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "src/scala/com/twitter/algebird_internal/injection", - "src/scala/com/twitter/simclusters_v2/common", - "src/thrift/com/twitter/recos/entities:entities-thrift-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/tweetypie:tweet-scala", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD.docx new file mode 100644 index 000000000..49ae531aa Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.docx new file mode 100644 index 000000000..0ca19d56a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.scala deleted file mode 100644 index d288ad692..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/ClientConfigs.scala +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.thrift.ClientId -import com.twitter.storehaus_internal.memcache.ConnectionConfig -import com.twitter.storehaus_internal.memcache.MemcacheConfig -import com.twitter.storehaus_internal.util.KeyPrefix -import com.twitter.storehaus_internal.util.TTL -import com.twitter.strato.client.Strato -import com.twitter.strato.client.{Client => StratoClient} - -object ClientConfigs { - - com.twitter.server.Init() // necessary in order to use WilyNS path - - final lazy val simClustersCoreAltCachePath = - "/srv#/prod/local/cache/simclusters_core_alt" - - final lazy val simClustersCoreAltLightCachePath = - "/srv#/prod/local/cache/simclusters_core_alt_light" - - final lazy val develSimClustersCoreCachePath = - "/srv#/test/local/cache/twemcache_simclusters_core" - - final lazy val develSimClustersCoreLightCachePath = - "/srv#/test/local/cache/twemcache_simclusters_core_light" - - final lazy val logFavBasedTweet20M145K2020StratoPath = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145K2020Persistent" - - final lazy val logFavBasedTweet20M145K2020UncachedStratoPath = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145K2020-UNCACHED" - - final lazy val develLogFavBasedTweet20M145K2020StratoPath = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145K2020Devel" - - final lazy val entityClusterScoreMemcacheConfig: (String, ServiceIdentifier) => MemcacheConfig = { - (path: String, serviceIdentifier: ServiceIdentifier) => - new MemcacheConfig { - val connectionConfig: ConnectionConfig = ConnectionConfig(path, serviceIdentifier = serviceIdentifier) - override val keyPrefix: KeyPrefix = KeyPrefix(s"ecs_") - override val ttl: TTL = TTL(8.hours) - } - } - - // note: this should in dedicated cache for tweet - final lazy val tweetTopKClustersMemcacheConfig: (String, ServiceIdentifier) => MemcacheConfig = { - (path: String, serviceIdentifier: ServiceIdentifier) => - new MemcacheConfig { - val connectionConfig: ConnectionConfig = - ConnectionConfig(path, serviceIdentifier = serviceIdentifier) - override val keyPrefix: KeyPrefix = KeyPrefix(s"etk_") - override val ttl: TTL = TTL(2.days) - } - } - - // note: this should in dedicated cache for tweet - final lazy val clusterTopTweetsMemcacheConfig: (String, ServiceIdentifier) => MemcacheConfig = { - (path: String, serviceIdentifier: ServiceIdentifier) => - new MemcacheConfig { - val connectionConfig: ConnectionConfig = - ConnectionConfig(path, serviceIdentifier = serviceIdentifier) - override val keyPrefix: KeyPrefix = KeyPrefix(s"ctkt_") - override val ttl: TTL = TTL(8.hours) - } - } - - final lazy val stratoClient: ServiceIdentifier => StratoClient = { serviceIdentifier => - Strato.client - .withRequestTimeout(2.seconds) - .withMutualTls(serviceIdentifier) - .build() - } - - // thrift client id - private final lazy val thriftClientId: String => ClientId = { env: String => - ClientId(s"simclusters_v2_summingbird.$env") - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.docx new file mode 100644 index 000000000..ed74754b1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.scala deleted file mode 100644 index d769330f0..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/Configs.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.conversions.DurationOps._ -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.util.Duration - -object Configs { - - final val role = "cassowary" - - final val ZoneAtla: String = "atla" - - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersion20M145KDec11: String = "20M_145K_dec11" - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersion20M145KUpdated: String = "20M_145K_updated" - final val ModelVersion20M145K2020: String = "20M_145K_2020" - - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ModelVersionMap: Map[String, ModelVersion] = Map( - ModelVersion20M145KDec11 -> ModelVersion.Model20m145kDec11, - ModelVersion20M145KUpdated -> ModelVersion.Model20m145kUpdated, - ModelVersion20M145K2020 -> ModelVersion.Model20m145k2020 - ) - - final val favScoreThresholdForUserInterest: String => Double = { - case ModelVersion20M145KDec11 => 0.15 - case ModelVersion20M145KUpdated => 1.0 - case ModelVersion20M145K2020 => 0.3 - case modelVersionStr => throw new Exception(s"$modelVersionStr is not a valid model") - } - - @deprecated("Use 'common/ModelVersions'", "2019-09-04") - final val ReversedModelVersionMap = ModelVersionMap.map(_.swap) - - final val batchesToKeep: Int = 1 - - final val HalfLife: Duration = 8.hours - final val HalfLifeInMs: Long = HalfLife.inMilliseconds - - final val topKTweetsPerCluster: Int = 1600 - - final val topKClustersPerEntity: Int = 50 - - // the config used in offline job only - final val topKClustersPerTweet: Int = 400 - - // minimum score to save clusterIds in entityTopKClusters cache - // entity includes entities other than tweetId. - final val scoreThresholdForEntityTopKClustersCache: Double = 0.02 - - // minimum score to save clusterIds in tweetTopKClusters cache - final val scoreThresholdForTweetTopKClustersCache: Double = 0.02 - - // minimum score to save tweetIds in clusterTopKTweets cache - final val scoreThresholdForClusterTopKTweetsCache: Double = 0.001 - - // minimum score to save entities in clusterTopKEntities cache - final val scoreThresholdForClusterTopKEntitiesCache: Double = 0.001 - - final val MinFavoriteCount = 8 - - final val OldestTweetInLightIndexInMillis = 1.hours.inMillis - - final val OldestTweetFavEventTimeInMillis = 3.days.inMillis - - final val FirstUpdateValue = 1 - - final val TempUpdateValue = -1 -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.docx new file mode 100644 index 000000000..c11ec0bfd Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.scala deleted file mode 100644 index 4e4bbd7e7..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/EntityUtil.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.cuad.ner.thriftscala.WholeEntityType -import com.twitter.simclusters_v2.summingbird.common.Implicits.thriftDecayedValueMonoid -import com.twitter.simclusters_v2.thriftscala.{Scores, SimClusterEntity, TweetTextEntity} -import scala.collection.Map - -private[summingbird] object EntityUtil { - - def updateScoreWithLatestTimestamp[K]( - scoresMapOption: Option[Map[K, Scores]], - timeInMs: Long - ): Option[Map[K, Scores]] = { - scoresMapOption map { scoresMap => - scoresMap.mapValues(score => updateScoreWithLatestTimestamp(score, timeInMs)) - } - } - - def updateScoreWithLatestTimestamp(score: Scores, timeInMs: Long): Scores = { - score.copy( - favClusterNormalized8HrHalfLifeScore = score.favClusterNormalized8HrHalfLifeScore.map { - decayedValue => thriftDecayedValueMonoid.decayToTimestamp(decayedValue, timeInMs) - }, - followClusterNormalized8HrHalfLifeScore = score.followClusterNormalized8HrHalfLifeScore.map { - decayedValue => thriftDecayedValueMonoid.decayToTimestamp(decayedValue, timeInMs) - } - ) - } - - def entityToString(entity: SimClusterEntity): String = { - entity match { - case SimClusterEntity.TweetId(id) => s"t_id:$id" - case SimClusterEntity.SpaceId(id) => s"space_id:$id" - case SimClusterEntity.TweetEntity(textEntity) => - textEntity match { - case TweetTextEntity.Hashtag(str) => s"$str[h_tag]" - case TweetTextEntity.Penguin(penguin) => - s"${penguin.textEntity}[penguin]" - case TweetTextEntity.Ner(ner) => - s"${ner.textEntity}[ner_${WholeEntityType(ner.wholeEntityType)}]" - case TweetTextEntity.SemanticCore(semanticCore) => - s"[sc:${semanticCore.entityId}]" - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.docx new file mode 100644 index 000000000..0291c1a9c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.scala deleted file mode 100644 index 79235573f..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/Implicits.scala +++ /dev/null @@ -1,140 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.algebird.DecayedValueMonoid -import com.twitter.algebird.Monoid -import com.twitter.algebird_internal.injection.AlgebirdImplicits -import com.twitter.algebird_internal.thriftscala.{DecayedValue => ThriftDecayedValue} -import com.twitter.bijection.Bufferable -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.summingbird.common.Monoids.ClustersWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.MultiModelClustersWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.MultiModelPersistentSimClustersEmbeddingLongestL2NormMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.MultiModelPersistentSimClustersEmbeddingMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.MultiModelTopKTweetsWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingLongestL2NormMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.ScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.TopKClustersWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.Monoids.TopKTweetsWithScoresMonoid -import com.twitter.simclusters_v2.thriftscala.FullClusterIdBucket -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.summingbird.batch.Batcher -import com.twitter.tweetypie.thriftscala.StatusCounts - -object Implicits { - - // -------------------- Monoids -------------------- // - implicit val decayedValueMonoid: DecayedValueMonoid = DecayedValueMonoid(0.0) - - implicit val thriftDecayedValueMonoid: ThriftDecayedValueMonoid = - new ThriftDecayedValueMonoid(Configs.HalfLifeInMs)(decayedValueMonoid) - - implicit val scoresMonoid: ScoresMonoid = new Monoids.ScoresMonoid() - - implicit val clustersWithScoreMonoid: ClustersWithScoresMonoid = - new Monoids.ClustersWithScoresMonoid()(scoresMonoid) - - implicit val multiModelClustersWithScoresMonoid: Monoid[MultiModelClustersWithScores] = - new MultiModelClustersWithScoresMonoid() - - implicit val topKClustersWithScoresMonoid: Monoid[TopKClustersWithScores] = - new TopKClustersWithScoresMonoid( - Configs.topKClustersPerEntity, - Configs.scoreThresholdForEntityTopKClustersCache - )(thriftDecayedValueMonoid) - - implicit val topKTweetsWithScoresMonoid: Monoid[TopKTweetsWithScores] = - new TopKTweetsWithScoresMonoid( - Configs.topKTweetsPerCluster, - Configs.scoreThresholdForClusterTopKTweetsCache, - Configs.OldestTweetFavEventTimeInMillis - )(thriftDecayedValueMonoid) - - implicit val topKTweetsWithScoresLightMonoid: Monoid[TopKTweetsWithScores] = - new TopKTweetsWithScoresMonoid( - Configs.topKTweetsPerCluster, - Configs.scoreThresholdForClusterTopKTweetsCache, - Configs.OldestTweetInLightIndexInMillis - )(thriftDecayedValueMonoid) - - implicit val MultiModeltopKTweetsWithScoresMonoid: Monoid[MultiModelTopKTweetsWithScores] = - new MultiModelTopKTweetsWithScoresMonoid( - )(thriftDecayedValueMonoid) - - implicit val persistentSimClustersEmbeddingMonoid: Monoid[PersistentSimClustersEmbedding] = - new PersistentSimClustersEmbeddingMonoid() - - implicit val persistentSimClustersEmbeddingLongestL2NormMonoid: Monoid[ - PersistentSimClustersEmbedding - ] = - new PersistentSimClustersEmbeddingLongestL2NormMonoid() - - implicit val multiModelPersistentSimClustersEmbeddingMonoid: Monoid[ - MultiModelPersistentSimClustersEmbedding - ] = - new MultiModelPersistentSimClustersEmbeddingMonoid() - - implicit val multiModelPersistentSimClustersEmbeddingLongestL2NormMonoid: Monoid[ - MultiModelPersistentSimClustersEmbedding - ] = new MultiModelPersistentSimClustersEmbeddingLongestL2NormMonoid() - - // -------------------- Codecs -------------------- // - implicit val longIntPairCodec: Injection[(Long, Int), Array[Byte]] = - Bufferable.injectionOf[(Long, Int)] - - implicit val simClusterEntityCodec: Injection[SimClusterEntity, Array[Byte]] = - CompactScalaCodec(SimClusterEntity) - - implicit val fullClusterIdBucket: Injection[FullClusterIdBucket, Array[Byte]] = - CompactScalaCodec(FullClusterIdBucket) - - implicit val clustersWithScoresCodec: Injection[ClustersWithScores, Array[Byte]] = - CompactScalaCodec(ClustersWithScores) - - implicit val topKClustersKeyCodec: Injection[EntityWithVersion, Array[Byte]] = - CompactScalaCodec(EntityWithVersion) - - implicit val topKClustersWithScoresCodec: Injection[TopKClustersWithScores, Array[Byte]] = - CompactScalaCodec(TopKClustersWithScores) - - implicit val fullClusterIdCodec: Injection[FullClusterId, Array[Byte]] = - CompactScalaCodec(FullClusterId) - - implicit val topKEntitiesWithScoresCodec: Injection[TopKEntitiesWithScores, Array[Byte]] = - CompactScalaCodec(TopKEntitiesWithScores) - - implicit val topKTweetsWithScoresCodec: Injection[TopKTweetsWithScores, Array[Byte]] = - CompactScalaCodec(TopKTweetsWithScores) - - implicit val pairedArrayBytesCodec: Injection[(Array[Byte], Array[Byte]), Array[Byte]] = - Bufferable.injectionOf[(Array[Byte], Array[Byte])] - - implicit val entityWithClusterInjection: Injection[(SimClusterEntity, FullClusterIdBucket), Array[ - Byte - ]] = - Injection - .connect[(SimClusterEntity, FullClusterIdBucket), (Array[Byte], Array[Byte]), Array[Byte]] - - implicit val topKClustersCodec: Injection[TopKClusters, Array[Byte]] = - CompactScalaCodec(TopKClusters) - - implicit val topKTweetsCodec: Injection[TopKTweets, Array[Byte]] = - CompactScalaCodec(TopKTweets) - - implicit val simClustersEmbeddingCodec: Injection[SimClustersEmbedding, Array[Byte]] = - CompactScalaCodec(SimClustersEmbedding) - - implicit val persistentSimClustersEmbeddingCodec: Injection[PersistentSimClustersEmbedding, Array[ - Byte - ]] = - CompactScalaCodec(PersistentSimClustersEmbedding) - - implicit val statusCountsCodec: Injection[StatusCounts, Array[Byte]] = - CompactScalaCodec(StatusCounts) - - implicit val thriftDecayedValueCodec: Injection[ThriftDecayedValue, Array[Byte]] = - AlgebirdImplicits.decayedValueCodec - - implicit val batcher: Batcher = Batcher.unit -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.docx new file mode 100644 index 000000000..175c21123 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.scala deleted file mode 100644 index ad2c56386..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/ModelVersionProfile.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.util.Duration -import com.twitter.conversions.DurationOps._ -import com.twitter.simclusters_v2.thriftscala.ModelVersion - -case class ModelVersionProfile( - modelVersion: ModelVersion, - usingLogFavScore: Boolean, - // redundant in the current models because the above parameter does the same currently. - coreEmbeddingType: EmbeddingType, - favScoreThresholdForUserInterest: Double, - // these values are shared between all profiles so lets set up defaults - halfLife: Duration = 8.hours, - scoreThresholdForEntityTopKClustersCache: Double = 0.2, - scoreThresholdForTweetTopKClustersCache: Double = 0.02, - scoreThresholdForClusterTopKTweetsCache: Double = 0.001, - scoreThresholdForClusterTopKEntitiesCache: Double = 0.001) - -object ModelVersionProfiles { - final val ModelVersion20M145KUpdated = ModelVersionProfile( - ModelVersion.Model20m145kUpdated, - usingLogFavScore = true, - coreEmbeddingType = EmbeddingType.LogFavBasedTweet, - favScoreThresholdForUserInterest = 1.0 - ) - - final val ModelVersion20M145K2020 = ModelVersionProfile( - ModelVersion.Model20m145k2020, - usingLogFavScore = true, - coreEmbeddingType = EmbeddingType.LogFavBasedTweet, - favScoreThresholdForUserInterest = 0.3 - ) - - final val ModelVersionProfiles: Map[ModelVersion, ModelVersionProfile] = Map( - ModelVersion.Model20m145kUpdated -> ModelVersion20M145KUpdated, - ModelVersion.Model20m145k2020 -> ModelVersion20M145K2020 - ) -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.docx new file mode 100644 index 000000000..c9760504d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.scala deleted file mode 100644 index 34dd27586..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/Monoids.scala +++ /dev/null @@ -1,478 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.algebird.DecayedValue -import com.twitter.algebird.Monoid -import com.twitter.algebird.OptionMonoid -import com.twitter.algebird.ScMapMonoid -import com.twitter.algebird_internal.thriftscala.{DecayedValue => ThriftDecayedValue} -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.ClustersWithScores -import com.twitter.simclusters_v2.thriftscala.MultiModelClustersWithScores -import com.twitter.simclusters_v2.thriftscala.MultiModelTopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.MultiModelPersistentSimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.Scores -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingMetadata -import com.twitter.simclusters_v2.thriftscala.TopKClustersWithScores -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.snowflake.id.SnowflakeId -import scala.collection.mutable - -/** - * Contains various monoids used in the EntityJob - */ -object Monoids { - - class ScoresMonoid(implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) - extends Monoid[Scores] { - - private val optionalThriftDecayedValueMonoid = - new OptionMonoid[ThriftDecayedValue]() - - override val zero: Scores = Scores() - - override def plus(x: Scores, y: Scores): Scores = { - Scores( - optionalThriftDecayedValueMonoid.plus( - x.favClusterNormalized8HrHalfLifeScore, - y.favClusterNormalized8HrHalfLifeScore - ), - optionalThriftDecayedValueMonoid.plus( - x.followClusterNormalized8HrHalfLifeScore, - y.followClusterNormalized8HrHalfLifeScore - ) - ) - } - } - - class ClustersWithScoresMonoid(implicit scoresMonoid: ScoresMonoid) - extends Monoid[ClustersWithScores] { - - private val optionMapMonoid = - new OptionMonoid[collection.Map[Int, Scores]]()(new ScMapMonoid[Int, Scores]()) - - override val zero: ClustersWithScores = ClustersWithScores() - - override def plus(x: ClustersWithScores, y: ClustersWithScores): ClustersWithScores = { - ClustersWithScores( - optionMapMonoid.plus(x.clustersToScore, y.clustersToScore) - ) - } - } - - class MultiModelClustersWithScoresMonoid(implicit scoresMonoid: ScoresMonoid) - extends Monoid[MultiModelClustersWithScores] { - - override val zero: MultiModelClustersWithScores = MultiModelClustersWithScores() - - override def plus( - x: MultiModelClustersWithScores, - y: MultiModelClustersWithScores - ): MultiModelClustersWithScores = { - // We reuse the logic from the Monoid for the Value here - val clustersWithScoreMonoid = Implicits.clustersWithScoreMonoid - - MultiModelClustersWithScores( - MultiModelUtils.mergeTwoMultiModelMaps( - x.multiModelClustersWithScores, - y.multiModelClustersWithScores, - clustersWithScoreMonoid)) - } - } - - class TopKClustersWithScoresMonoid( - topK: Int, - threshold: Double - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) - extends Monoid[TopKClustersWithScores] { - - override val zero: TopKClustersWithScores = TopKClustersWithScores() - - override def plus( - x: TopKClustersWithScores, - y: TopKClustersWithScores - ): TopKClustersWithScores = { - - val mergedFavMap = TopKScoresUtils - .mergeTwoTopKMapWithDecayedValues( - x.topClustersByFavClusterNormalizedScore - .map(_.mapValues( - _.favClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - y.topClustersByFavClusterNormalizedScore - .map(_.mapValues( - _.favClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - topK, - threshold - ).map(_.mapValues(decayedValue => - Scores(favClusterNormalized8HrHalfLifeScore = Some(decayedValue)))) - - val mergedFollowMap = TopKScoresUtils - .mergeTwoTopKMapWithDecayedValues( - x.topClustersByFollowClusterNormalizedScore - .map(_.mapValues( - _.followClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - y.topClustersByFollowClusterNormalizedScore - .map(_.mapValues( - _.followClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - topK, - threshold - ).map(_.mapValues(decayedValue => - Scores(followClusterNormalized8HrHalfLifeScore = Some(decayedValue)))) - - TopKClustersWithScores( - mergedFavMap, - mergedFollowMap - ) - } - } - class TopKTweetsWithScoresMonoid( - topK: Int, - threshold: Double, - tweetAgeThreshold: Long - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) - extends Monoid[TopKTweetsWithScores] { - - override val zero: TopKTweetsWithScores = TopKTweetsWithScores() - - override def plus(x: TopKTweetsWithScores, y: TopKTweetsWithScores): TopKTweetsWithScores = { - val oldestTweetId = SnowflakeId.firstIdFor(System.currentTimeMillis() - tweetAgeThreshold) - - val mergedFavMap = TopKScoresUtils - .mergeTwoTopKMapWithDecayedValues( - x.topTweetsByFavClusterNormalizedScore - .map(_.mapValues( - _.favClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - y.topTweetsByFavClusterNormalizedScore - .map(_.mapValues( - _.favClusterNormalized8HrHalfLifeScore.getOrElse(thriftDecayedValueMonoid.zero))), - topK, - threshold - ).map(_.filter(_._1 >= oldestTweetId).mapValues(decayedValue => - Scores(favClusterNormalized8HrHalfLifeScore = Some(decayedValue)))) - - TopKTweetsWithScores(mergedFavMap, None) - } - } - - class MultiModelTopKTweetsWithScoresMonoid( - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) - extends Monoid[MultiModelTopKTweetsWithScores] { - override val zero: MultiModelTopKTweetsWithScores = MultiModelTopKTweetsWithScores() - - override def plus( - x: MultiModelTopKTweetsWithScores, - y: MultiModelTopKTweetsWithScores - ): MultiModelTopKTweetsWithScores = { - // We reuse the logic from the Monoid for the Value here - val topKTweetsWithScoresMonoid = Implicits.topKTweetsWithScoresMonoid - - MultiModelTopKTweetsWithScores( - MultiModelUtils.mergeTwoMultiModelMaps( - x.multiModelTopKTweetsWithScores, - y.multiModelTopKTweetsWithScores, - topKTweetsWithScoresMonoid)) - } - - } - - /** - * Merge two PersistentSimClustersEmbedding. The latest embedding overwrite the old embedding. - * The new count equals to the sum of the count. - */ - class PersistentSimClustersEmbeddingMonoid extends Monoid[PersistentSimClustersEmbedding] { - - override val zero: PersistentSimClustersEmbedding = PersistentSimClustersEmbedding( - ThriftSimClustersEmbedding(), - SimClustersEmbeddingMetadata() - ) - - private val optionLongMonoid = new OptionMonoid[Long]() - - override def plus( - x: PersistentSimClustersEmbedding, - y: PersistentSimClustersEmbedding - ): PersistentSimClustersEmbedding = { - val latest = - if (x.metadata.updatedAtMs.getOrElse(0L) > y.metadata.updatedAtMs.getOrElse(0L)) x else y - latest.copy( - metadata = latest.metadata.copy( - updatedCount = optionLongMonoid.plus(x.metadata.updatedCount, y.metadata.updatedCount))) - } - } - - class MultiModelPersistentSimClustersEmbeddingMonoid - extends Monoid[MultiModelPersistentSimClustersEmbedding] { - - override val zero: MultiModelPersistentSimClustersEmbedding = - MultiModelPersistentSimClustersEmbedding(Map[ModelVersion, PersistentSimClustersEmbedding]()) - - override def plus( - x: MultiModelPersistentSimClustersEmbedding, - y: MultiModelPersistentSimClustersEmbedding - ): MultiModelPersistentSimClustersEmbedding = { - val monoid = Implicits.persistentSimClustersEmbeddingMonoid - - // PersistentSimClustersEmbeddings is the only required thrift object so we need to wrap it - // in Some - MultiModelUtils.mergeTwoMultiModelMaps( - Some(x.multiModelPersistentSimClustersEmbedding), - Some(y.multiModelPersistentSimClustersEmbedding), - monoid) match { - // clean up the empty embeddings - case Some(res) => - MultiModelPersistentSimClustersEmbedding(res.flatMap { - // in some cases the list of SimClustersScore is empty, so we want to remove the - // modelVersion from the list of Models for the embedding - case (modelVersion, persistentSimClustersEmbedding) => - persistentSimClustersEmbedding.embedding.embedding match { - case embedding if embedding.nonEmpty => - Map(modelVersion -> persistentSimClustersEmbedding) - case _ => - None - } - }) - case _ => zero - } - } - } - - /** - * Merge two PersistentSimClustersEmbeddings. The embedding with the longest l2 norm overwrites - * the other embedding. The new count equals to the sum of the count. - */ - class PersistentSimClustersEmbeddingLongestL2NormMonoid - extends Monoid[PersistentSimClustersEmbedding] { - - override val zero: PersistentSimClustersEmbedding = PersistentSimClustersEmbedding( - ThriftSimClustersEmbedding(), - SimClustersEmbeddingMetadata() - ) - - override def plus( - x: PersistentSimClustersEmbedding, - y: PersistentSimClustersEmbedding - ): PersistentSimClustersEmbedding = { - if (SimClustersEmbedding(x.embedding).l2norm >= SimClustersEmbedding(y.embedding).l2norm) x - else y - } - } - - class MultiModelPersistentSimClustersEmbeddingLongestL2NormMonoid - extends Monoid[MultiModelPersistentSimClustersEmbedding] { - - override val zero: MultiModelPersistentSimClustersEmbedding = - MultiModelPersistentSimClustersEmbedding(Map[ModelVersion, PersistentSimClustersEmbedding]()) - - override def plus( - x: MultiModelPersistentSimClustersEmbedding, - y: MultiModelPersistentSimClustersEmbedding - ): MultiModelPersistentSimClustersEmbedding = { - val monoid = Implicits.persistentSimClustersEmbeddingLongestL2NormMonoid - - MultiModelUtils.mergeTwoMultiModelMaps( - Some(x.multiModelPersistentSimClustersEmbedding), - Some(y.multiModelPersistentSimClustersEmbedding), - monoid) match { - // clean up empty embeddings - case Some(res) => - MultiModelPersistentSimClustersEmbedding(res.flatMap { - case (modelVersion, persistentSimClustersEmbedding) => - // in some cases the list of SimClustersScore is empty, so we want to remove the - // modelVersion from the list of Models for the embedding - persistentSimClustersEmbedding.embedding.embedding match { - case embedding if embedding.nonEmpty => - Map(modelVersion -> persistentSimClustersEmbedding) - case _ => - None - } - }) - case _ => zero - } - } - } - - object TopKScoresUtils { - - /** - * Function for merging TopK scores with decayed values. - * - * This is for use with topk scores where all scores are updated at the same time (i.e. most - * time-decayed embedding aggregations). Rather than storing individual scores as algebird.DecayedValue - * and replicating time information for every key, we can store a single timestamp for the entire - * embedding and replicate the decay logic when processing each score. - * - * This should replicate the behaviour of `mergeTwoTopKMapWithDecayedValues` - * - * The logic is: - * - Determine the most recent update and build a DecayedValue for it (decayedValueForLatestTime) - * - For each (cluster, score), decay the score relative to the time of the most-recently updated embedding - * - This is a no-op for scores from the most recently-updated embedding, and will scale scores - * for the older embedding. - * - Drop any (cluster, score) which are below the `threshold` score - * - If both input embeddings contribute a score for the same cluster, keep the one with the largest score (after scaling) - * - Sort (cluster, score) by score and keep the `topK` - * - */ - def mergeClusterScoresWithUpdateTimes[Key]( - x: Seq[(Key, Double)], - xUpdatedAtMs: Long, - y: Seq[(Key, Double)], - yUpdatedAtMs: Long, - halfLifeMs: Long, - topK: Int, - threshold: Double - ): Seq[(Key, Double)] = { - val latestUpdate = math.max(xUpdatedAtMs, yUpdatedAtMs) - val decayedValueForLatestTime = DecayedValue.build(0.0, latestUpdate, halfLifeMs) - - val merged = mutable.HashMap[Key, Double]() - - x.foreach { - case (key, score) => - val decayedScore = Implicits.decayedValueMonoid - .plus( - DecayedValue.build(score, xUpdatedAtMs, halfLifeMs), - decayedValueForLatestTime - ).value - if (decayedScore > threshold) - merged += key -> decayedScore - } - - y.foreach { - case (key, score) => - val decayedScore = Implicits.decayedValueMonoid - .plus( - DecayedValue.build(score, yUpdatedAtMs, halfLifeMs), - decayedValueForLatestTime - ).value - if (decayedScore > threshold) - merged.get(key) match { - case Some(existingValue) => - if (decayedScore > existingValue) - merged += key -> decayedScore - case None => - merged += key -> decayedScore - } - } - - merged.toSeq - .sortBy(-_._2) - .take(topK) - } - - /** - * Function for merging to TopK map with decayed values. - * - * First of all, all the values will be decayed to the latest scaled timestamp to be comparable. - * - * If the same key appears at both a and b, the one with larger scaled time (or larger value when - * their scaled times are same) will be taken. The values smaller than the threshold will be dropped. - * - * After merging, if the size is larger than TopK, only scores with topK largest value will be kept. - */ - def mergeTwoTopKMapWithDecayedValues[T]( - a: Option[collection.Map[T, ThriftDecayedValue]], - b: Option[collection.Map[T, ThriftDecayedValue]], - topK: Int, - threshold: Double - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid - ): Option[collection.Map[T, ThriftDecayedValue]] = { - - if (a.isEmpty || a.exists(_.isEmpty)) { - return b - } - - if (b.isEmpty || b.exists(_.isEmpty)) { - return a - } - - val latestScaledTime = (a.get.view ++ b.get.view).map { - case (_, scores) => - scores.scaledTime - }.max - - val decayedValueWithLatestScaledTime = ThriftDecayedValue(0.0, latestScaledTime) - - val merged = mutable.HashMap[T, ThriftDecayedValue]() - - a.foreach { - _.foreach { - case (k, v) => - // decay the value to latest scaled time - val decayedScores = thriftDecayedValueMonoid - .plus(v, decayedValueWithLatestScaledTime) - - // only merge if the value is larger than the threshold - if (decayedScores.value > threshold) { - merged += k -> decayedScores - } - } - } - - b.foreach { - _.foreach { - case (k, v) => - val decayedScores = thriftDecayedValueMonoid - .plus(v, decayedValueWithLatestScaledTime) - - // only merge if the value is larger than the threshold - if (decayedScores.value > threshold) { - if (!merged.contains(k)) { - merged += k -> decayedScores - } else { - // only update if the value is larger than the one already merged - if (decayedScores.value > merged(k).value) { - merged.update(k, decayedScores) - } - } - } - } - } - - // add some buffer size (~ 0.2 * topK) to avoid sorting and taking too frequently - if (merged.size > topK * 1.2) { - Some( - merged.toSeq - .sortBy { case (_, scores) => scores.value * -1 } - .take(topK) - .toMap - ) - } else { - Some(merged) - } - } - } - - object MultiModelUtils { - - /** - * In order to reduce complexity we use the Monoid for the value to plus two MultiModel maps - */ - def mergeTwoMultiModelMaps[T]( - a: Option[collection.Map[ModelVersion, T]], - b: Option[collection.Map[ModelVersion, T]], - monoid: Monoid[T] - ): Option[collection.Map[ModelVersion, T]] = { - (a, b) match { - case (Some(_), None) => a - case (None, Some(_)) => b - case (Some(aa), Some(bb)) => - val res = ModelVersionProfiles.ModelVersionProfiles.foldLeft(Map[ModelVersion, T]()) { - (map, model) => - map + (model._1 -> monoid.plus( - aa.getOrElse(model._1, monoid.zero), - bb.getOrElse(model._1, monoid.zero) - )) - } - Some(res) - case _ => None - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.docx new file mode 100644 index 000000000..8058ab5f9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.scala deleted file mode 100644 index 4379eccb9..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersEmbeddingWithMetadataMonoid.scala +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.algebird.{Monoid, OptionMonoid} -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.summingbird.common.Monoids.TopKScoresUtils -import com.twitter.simclusters_v2.thriftscala.{ - SimClustersEmbeddingMetadata, - SimClustersEmbeddingWithMetadata, - SimClustersEmbedding => ThriftSimClustersEmbedding -} - -/** - * Decayed aggregation of embeddings. - * - * When merging 2 embeddings, the older embedding's scores are scaled by time. If a cluster is - * present in both embeddings, the highest score (after scaling) is used in the result. - * - * @halfLifeMs - defines how quickly a score decays - * @topK - only the topk clusters with the highest scores are retained in the result - * @threshold - any clusters with weights below threshold are excluded from the result - */ -class SimClustersEmbeddingWithMetadataMonoid( - halfLifeMs: Long, - topK: Int, - threshold: Double) - extends Monoid[SimClustersEmbeddingWithMetadata] { - - override val zero: SimClustersEmbeddingWithMetadata = SimClustersEmbeddingWithMetadata( - ThriftSimClustersEmbedding(), - SimClustersEmbeddingMetadata() - ) - - private val optionLongMonoid = new OptionMonoid[Long]() - private val optionMaxMonoid = - new OptionMonoid[Long]()(com.twitter.algebird.Max.maxSemigroup[Long]) - - override def plus( - x: SimClustersEmbeddingWithMetadata, - y: SimClustersEmbeddingWithMetadata - ): SimClustersEmbeddingWithMetadata = { - - val mergedClusterScores = TopKScoresUtils.mergeClusterScoresWithUpdateTimes( - x = SimClustersEmbedding(x.embedding).embedding, - xUpdatedAtMs = x.metadata.updatedAtMs.getOrElse(0), - y = SimClustersEmbedding(y.embedding).embedding, - yUpdatedAtMs = y.metadata.updatedAtMs.getOrElse(0), - halfLifeMs = halfLifeMs, - topK = topK, - threshold = threshold - ) - SimClustersEmbeddingWithMetadata( - embedding = SimClustersEmbedding(mergedClusterScores).toThrift, - metadata = SimClustersEmbeddingMetadata( - updatedAtMs = optionMaxMonoid.plus(x.metadata.updatedAtMs, y.metadata.updatedAtMs), - updatedCount = optionLongMonoid.plus(x.metadata.updatedCount, y.metadata.updatedCount) - ) - ) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.docx new file mode 100644 index 000000000..d3f3bb507 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.scala deleted file mode 100644 index fff4bb851..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersHashUtil.scala +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -/** - * Provides int to int hash function. Used to batch clusterIds together. - */ -object SimClustersHashUtil { - def clusterIdToBucket(clusterId: Int): Int = { - clusterId % numBuckets - } - - val numBuckets: Int = 200 - - val getAllBuckets: Seq[Int] = 0.until(numBuckets) -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.docx new file mode 100644 index 000000000..b3b825f0a Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.scala deleted file mode 100644 index 4cd7ff14b..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersInterestedInUtil.scala +++ /dev/null @@ -1,72 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.thriftscala.{ - ClustersUserIsInterestedIn, - ClustersWithScores, - Scores -} - -object SimClustersInterestedInUtil { - - private final val EmptyClustersWithScores = ClustersWithScores() - - case class InterestedInScores( - favScore: Double, - clusterNormalizedFavScore: Double, - clusterNormalizedFollowScore: Double, - clusterNormalizedLogFavScore: Double) - - def topClustersWithScores( - userInterests: ClustersUserIsInterestedIn - ): Seq[(ClusterId, InterestedInScores)] = { - userInterests.clusterIdToScores.toSeq.map { - case (clusterId, scores) => - val favScore = scores.favScore.getOrElse(0.0) - val normalizedFavScore = scores.favScoreClusterNormalizedOnly.getOrElse(0.0) - val normalizedFollowScore = scores.followScoreClusterNormalizedOnly.getOrElse(0.0) - val normalizedLogFavScore = scores.logFavScoreClusterNormalizedOnly.getOrElse(0.0) - - ( - clusterId, - InterestedInScores( - favScore, - normalizedFavScore, - normalizedFollowScore, - normalizedLogFavScore)) - } - } - - def buildClusterWithScores( - clusterScores: Seq[(ClusterId, InterestedInScores)], - timeInMs: Double, - favScoreThresholdForUserInterest: Double - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid - ): ClustersWithScores = { - val scoresMap = clusterScores.collect { - case ( - clusterId, - InterestedInScores( - favScore, - _, - _, - clusterNormalizedLogFavScore)) - // NOTE: the threshold is on favScore, and the computation is on normalizedFavScore - // This threshold reduces the number of unique keys in the cache by 80%, - // based on offline analysis - if favScore >= favScoreThresholdForUserInterest => - - val favClusterNormalized8HrHalfLifeScoreOpt = - Some(thriftDecayedValueMonoid.build(clusterNormalizedLogFavScore, timeInMs)) - - clusterId -> Scores(favClusterNormalized8HrHalfLifeScore = favClusterNormalized8HrHalfLifeScoreOpt) - }.toMap - - if (scoresMap.nonEmpty) { - ClustersWithScores(Some(scoresMap)) - } else { - EmptyClustersWithScores - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.docx new file mode 100644 index 000000000..340fe868f Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.scala deleted file mode 100644 index ee58bbd67..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/SimClustersProfile.scala +++ /dev/null @@ -1,212 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.summingbird.common.ClientConfigs._ -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.AltSetting.AltSetting -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.Environment.Environment -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.JobType.JobType -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.AltSetting -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.JobType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.ModelVersion - -sealed trait SimClustersProfile { - val env: Environment - val alt: AltSetting - val modelVersionStr: String - - lazy val modelVersion: ModelVersion = modelVersionStr -} - -sealed trait SimClustersJobProfile extends SimClustersProfile { - - val jobType: JobType - - final lazy val jobName: String = { - alt match { - case AltSetting.Alt => - s"simclusters_v2_${jobType}_alt_job_$env" - case AltSetting.Esc => - s"simclusters_v2_${jobType}_esc_job_$env" - case _ => - s"simclusters_v2_${jobType}_job_$env" - } - } - - // Build the serviceIdentifier by jobType, env and zone(dc) - final lazy val serviceIdentifier: String => ServiceIdentifier = { zone => - ServiceIdentifier(Configs.role, s"summingbird_$jobName", env.toString, zone) - } - - final lazy val favScoreThresholdForUserInterest: Double = - Configs.favScoreThresholdForUserInterest(modelVersionStr) - - lazy val timelineEventSourceSubscriberId: String = { - val jobTypeStr = jobType match { - case JobType.MultiModelTweet => "multi_model_tweet_" - case JobType.PersistentTweet => "persistent_tweet_" - case JobType.Tweet => "" - } - - val prefix = alt match { - case AltSetting.Alt => - "alt_" - case AltSetting.Esc => - "esc_" - case _ => - "" - } - - s"simclusters_v2_${jobTypeStr}summingbird_$prefix$env" - } - -} - -object SimClustersProfile { - - object JobType extends Enumeration { - type JobType = Value - val Tweet: JobType = Value("tweet") - val PersistentTweet: JobType = Value("persistent_tweet") - val MultiModelTweet: JobType = Value("multimodel_tweet") - } - - object Environment extends Enumeration { - type Environment = Value - val Prod: Environment = Value("prod") - val Devel: Environment = Value("devel") - - def apply(setting: String): Environment = { - if (setting == Prod.toString) { - Prod - } else { - Devel - } - } - } - - object AltSetting extends Enumeration { - type AltSetting = Value - val Normal: AltSetting = Value("normal") - val Alt: AltSetting = Value("alt") - val Esc: AltSetting = Value("esc") - - def apply(setting: String): AltSetting = { - - setting match { - case "alt" => Alt - case "esc" => Esc - case _ => Normal - } - } - } - - case class SimClustersTweetProfile( - env: Environment, - alt: AltSetting, - modelVersionStr: String, - entityClusterScorePath: String, - tweetTopKClustersPath: String, - clusterTopKTweetsPath: String, - coreEmbeddingType: EmbeddingType, - clusterTopKTweetsLightPath: Option[String] = None) - extends SimClustersJobProfile { - - final val jobType: JobType = JobType.Tweet - } - - case class PersistentTweetProfile( - env: Environment, - alt: AltSetting, - modelVersionStr: String, - persistentTweetStratoPath: String, - coreEmbeddingType: EmbeddingType) - extends SimClustersJobProfile { - final val jobType: JobType = JobType.PersistentTweet - } - - final val AltProdTweetJobProfile = SimClustersTweetProfile( - env = Environment.Prod, - alt = AltSetting.Alt, - modelVersionStr = Model20M145K2020, - entityClusterScorePath = simClustersCoreAltCachePath, - tweetTopKClustersPath = simClustersCoreAltCachePath, - clusterTopKTweetsPath = simClustersCoreAltCachePath, - clusterTopKTweetsLightPath = Some(simClustersCoreAltLightCachePath), - coreEmbeddingType = EmbeddingType.LogFavBasedTweet - ) - - final val AltDevelTweetJobProfile = SimClustersTweetProfile( - env = Environment.Devel, - alt = AltSetting.Alt, - modelVersionStr = Model20M145K2020, - // using the same devel cache with job - entityClusterScorePath = develSimClustersCoreCachePath, - tweetTopKClustersPath = develSimClustersCoreCachePath, - clusterTopKTweetsPath = develSimClustersCoreCachePath, - clusterTopKTweetsLightPath = Some(develSimClustersCoreLightCachePath), - coreEmbeddingType = EmbeddingType.LogFavBasedTweet, - ) - - final val ProdPersistentTweetProfile = PersistentTweetProfile( - env = Environment.Prod, - alt = AltSetting.Normal, - modelVersionStr = Model20M145K2020, - // This profile is used by the persistent tweet embedding job to update the embedding. We - // use the uncached column to avoid reading stale data - persistentTweetStratoPath = logFavBasedTweet20M145K2020UncachedStratoPath, - coreEmbeddingType = EmbeddingType.LogFavBasedTweet - ) - - final val DevelPersistentTweetProfile = PersistentTweetProfile( - env = Environment.Devel, - alt = AltSetting.Normal, - modelVersionStr = Model20M145K2020, - persistentTweetStratoPath = develLogFavBasedTweet20M145K2020StratoPath, - coreEmbeddingType = EmbeddingType.LogFavBasedTweet - ) - - def fetchTweetJobProfile( - env: Environment, - alt: AltSetting = AltSetting.Normal - ): SimClustersTweetProfile = { - (env, alt) match { - case (Environment.Prod, AltSetting.Alt) => AltProdTweetJobProfile - case (Environment.Devel, AltSetting.Alt) => AltDevelTweetJobProfile - case _ => throw new IllegalArgumentException("Invalid env or alt setting") - } - } - - def fetchPersistentJobProfile( - env: Environment, - alt: AltSetting = AltSetting.Normal - ): PersistentTweetProfile = { - (env, alt) match { - case (Environment.Prod, AltSetting.Normal) => ProdPersistentTweetProfile - case (Environment.Devel, AltSetting.Normal) => DevelPersistentTweetProfile - case _ => throw new IllegalArgumentException("Invalid env or alt setting") - } - } - - /** - * For short term, fav based tweet embedding and log fav based tweets embedding exists at the - * same time. We want to move to log fav based tweet embedding eventually. - * Follow based tweet embeddings exists in both environment. - * A uniform tweet embedding API is the future to replace the existing use case. - */ - final lazy val tweetJobProfileMap: Environment => Map[ - (EmbeddingType, String), - SimClustersTweetProfile - ] = { - case Environment.Prod => - Map( - (EmbeddingType.LogFavBasedTweet, Model20M145K2020) -> AltProdTweetJobProfile - ) - case Environment.Devel => - Map( - (EmbeddingType.LogFavBasedTweet, Model20M145K2020) -> AltDevelTweetJobProfile - ) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.docx new file mode 100644 index 000000000..1d9dab2d6 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.scala deleted file mode 100644 index 78a34fef2..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/StatsUtil.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.summingbird.{Counter, Group, Name, Platform, Producer} -import com.twitter.summingbird.option.JobId - -object StatsUtil { - - // for adding stats in Producer. - // this enables us to add new stats by just calling producer.observer("name") - implicit class EnrichedProducer[P <: Platform[P], T]( - producer: Producer[P, T] - )( - implicit jobId: JobId) { - def observe(counter: String): Producer[P, T] = { - val stat = Counter(Group(jobId.get), Name(counter)) - producer.map { v => - stat.incr() - v - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.docx new file mode 100644 index 000000000..cb045d207 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.scala deleted file mode 100644 index e10718162..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/SummerWithSumValues.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.algebird.Monoid -import com.twitter.summingbird._ - -object SummerWithSumValues { - /* - A common pattern in heron is to use .sumByKeys to aggregate a value in a store, and then continue - processing with the aggregated value. Unfortunately, .sumByKeys returns the existing value from the - store and the delta separately, leaving you to manually combine them. - - Example without sumValues: - - someKeyedProducer - .sumByKeys(score)(monoid) - .map { - case (key, (existingValueOpt, delta)) => - // if you want the value that was actually written to the store, you have to combine - // existingValueOpt and delta yourself - } - - Example with sumValues: - - someKeyedProducer - .sumByKeys(score)(monoid) - .sumValues(monoid) - .map { - case (key, value) => - // `value` is the same as what was written to the store - } - */ - implicit class SummerWithSumValues[P <: Platform[P], K, V]( - summer: Summer[P, K, V]) { - def sumValues(monoid: Monoid[V]): KeyedProducer[P, K, V] = - summer.mapValues { - case (Some(oldV), deltaV) => monoid.plus(oldV, deltaV) - case (None, deltaV) => deltaV - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.docx new file mode 100644 index 000000000..b39ed42d1 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.scala deleted file mode 100644 index af490fc9d..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/ThriftDecayedValueMonoid.scala +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.algebird.DecayedValue -import com.twitter.algebird.DecayedValueMonoid -import com.twitter.algebird.Monoid -import com.twitter.algebird_internal.injection.DecayedValueImplicits -import com.twitter.algebird_internal.thriftscala.{DecayedValue => ThriftDecayedValue} - -/** - * Monoid for ThriftDecayedValue - */ -class ThriftDecayedValueMonoid(halfLifeInMs: Long)(implicit decayedValueMonoid: DecayedValueMonoid) - extends Monoid[ThriftDecayedValue] { - - override val zero: ThriftDecayedValue = DecayedValueImplicits.toThrift(decayedValueMonoid.zero) - - override def plus(x: ThriftDecayedValue, y: ThriftDecayedValue): ThriftDecayedValue = { - DecayedValueImplicits.toThrift( - decayedValueMonoid - .plus(DecayedValueImplicits.toThrift.invert(x), DecayedValueImplicits.toThrift.invert(y)) - ) - } - - def build(value: Double, timeInMs: Double): ThriftDecayedValue = { - DecayedValueImplicits.toThrift( - DecayedValue.build(value, timeInMs, halfLifeInMs) - ) - } - - /** - * decay to a timestamp; note that timestamp should be in Ms, and do not use scaledTime! - */ - def decayToTimestamp( - thriftDecayedValue: ThriftDecayedValue, - timestampInMs: Double - ): ThriftDecayedValue = { - this.plus(thriftDecayedValue, this.build(0.0, timestampInMs)) - } -} - -object ThriftDecayedValueMonoid { - // add the implicit class so that a decayed value can direct call .plus, .decayedValueOfTime and - // so on. - implicit class EnrichedThriftDecayedValue( - thriftDecayedValue: ThriftDecayedValue - )( - implicit thriftDecayedValueMonoid: ThriftDecayedValueMonoid) { - def plus(other: ThriftDecayedValue): ThriftDecayedValue = { - thriftDecayedValueMonoid.plus(thriftDecayedValue, other) - } - - // decay to a timestamp; note that timestamp should be in Ms - def decayToTimestamp(timeInMs: Double): ThriftDecayedValue = { - thriftDecayedValueMonoid.decayToTimestamp(thriftDecayedValue, timeInMs) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.docx b/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.docx new file mode 100644 index 000000000..f0bbf210c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.scala b/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.scala deleted file mode 100644 index bd6a81baa..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/common/TweetEntityExtractor.scala +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.common - -import com.twitter.recos.entities.thriftscala.NamedEntity -import com.twitter.simclusters_v2.thriftscala.{ - NerKey, - PenguinKey, - SimClusterEntity, - TweetTextEntity -} -import com.twitter.taxi.util.text.{TweetFeatureExtractor, TweetTextFeatures} -import com.twitter.tweetypie.thriftscala.Tweet - -object TweetEntityExtractor { - - private val MaxHashtagsPerTweet: Int = 4 - - private val MaxNersPerTweet: Int = 4 - - private val MaxPenguinsPerTweet: Int = 4 - - private val tweetFeatureExtractor: TweetFeatureExtractor = TweetFeatureExtractor.Default - - private def extractTweetTextFeatures( - text: String, - languageCode: Option[String] - ): TweetTextFeatures = { - if (languageCode.isDefined) { - tweetFeatureExtractor.extract(text, languageCode.get) - } else { - tweetFeatureExtractor.extract(text) - } - } - - def extractEntitiesFromText( - tweet: Option[Tweet], - nerEntitiesOpt: Option[Seq[NamedEntity]] - ): Seq[SimClusterEntity.TweetEntity] = { - - val hashtagEntities = tweet - .flatMap(_.hashtags.map(_.map(_.text))).getOrElse(Nil) - .map { hashtag => TweetTextEntity.Hashtag(hashtag.toLowerCase) }.take(MaxHashtagsPerTweet) - - val nerEntities = nerEntitiesOpt - .getOrElse(Nil).map { namedEntity => - TweetTextEntity - .Ner(NerKey(namedEntity.namedEntity.toLowerCase, namedEntity.entityType.getValue)) - }.take(MaxNersPerTweet) - - val nerEntitySet = nerEntities.map(_.ner.textEntity).toSet - - val penguinEntities = - extractTweetTextFeatures( - tweet.flatMap(_.coreData.map(_.text)).getOrElse(""), - tweet.flatMap(_.language.map(_.language)) - ).phrases - .map(_.normalizedOrOriginal) - .filter { s => - s.charAt(0) != '#' && !nerEntitySet.contains(s) // not included in hashtags and NER - } - .map { penguinStr => TweetTextEntity.Penguin(PenguinKey(penguinStr.toLowerCase)) }.take( - MaxPenguinsPerTweet) - - (hashtagEntities ++ penguinEntities ++ nerEntities).map(e => SimClusterEntity.TweetEntity(e)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.docx new file mode 100644 index 000000000..8460f16db Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.scala deleted file mode 100644 index 0eec17b81..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ApeTopicEmbeddingStore.scala +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TopicId -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.Client - -object ApeTopicEmbeddingStore { - - private val logFavBasedAPEColumn20M145K2020 = - "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020" - - private def getStore( - stratoClient: Client, - column: String - ): ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding] = { - StratoStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](stratoClient, column) - } - - def getFavBasedLocaleEntityEmbedding2020Store( - stratoClient: Client, - ): ReadableStore[TopicId, SimClustersEmbedding] = { - - getStore(stratoClient, logFavBasedAPEColumn20M145K2020) - .composeKeyMapping[TopicId] { topicId => - SimClustersEmbeddingId( - EmbeddingType.LogFavBasedKgoApeTopic, - ModelVersions.Model20M145K2020, - InternalId.TopicId(topicId) - ) - } - .mapValues(SimClustersEmbedding(_)) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD b/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD deleted file mode 100644 index 9e78da7c4..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/bijection:core", - "3rdparty/jvm/com/twitter/bijection:util", - "3rdparty/jvm/com/twitter/storehaus:core", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/store/strato", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/simclustersann/multicluster", - "src/scala/com/twitter/algebird_internal/injection", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/simclusters_v2/summingbird/common", - "src/scala/com/twitter/storehaus_internal/manhattan", - "src/scala/com/twitter/storehaus_internal/manhattan/config", - "src/scala/com/twitter/storehaus_internal/memcache", - "src/scala/com/twitter/storehaus_internal/memcache/config", - "src/scala/com/twitter/storehaus_internal/offline", - "src/scala/com/twitter/storehaus_internal/online", - "src/scala/com/twitter/storehaus_internal/util", - "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", - "src/scala/com/twitter/summingbird_internal/runner/store_config", - "src/scala/com/twitter/wtf/summingbird/sources/common", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/timelineservice/server/internal:thrift-scala", - "src/thrift/com/twitter/wtf/interest:interest-thrift-scala", - "src/thrift/com/twitter/wtf/utt:utt-scala", - "strato/src/main/scala/com/twitter/strato/client", - "strato/src/main/scala/com/twitter/strato/mh", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD.docx new file mode 100644 index 000000000..d1f942939 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.docx new file mode 100644 index 000000000..4f86a90f3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.scala deleted file mode 100644 index a553e7ff8..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ClusterDetailsReadableStore.scala +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.{Bufferable, Injection} -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.thriftscala.ClusterDetails -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.{Athena, ManhattanRO, ManhattanROConfig} -import com.twitter.storehaus_internal.util.{ApplicationID, DatasetName, HDFSPath} -import com.twitter.util.{Future, Memoize} - -object ClusterDetailsReadableStore { - - val modelVersionToDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145KDec11 -> "simclusters_v2_cluster_details", - ModelVersions.Model20M145KUpdated -> "simclusters_v2_cluster_details_20m_145k_updated", - ModelVersions.Model20M145K2020 -> "simclusters_v2_cluster_details_20m_145k_2020" - ) - - val knownModelVersions: String = modelVersionToDatasetMap.keys.mkString(",") - - private val clusterDetailsStores = - Memoize[(ManhattanKVClientMtlsParams, String), ReadableStore[(String, Int), ClusterDetails]] { - case (mhMtlsParams: ManhattanKVClientMtlsParams, datasetName: String) => - getForDatasetName(mhMtlsParams, datasetName) - } - - def getForDatasetName( - mhMtlsParams: ManhattanKVClientMtlsParams, - datasetName: String - ): ReadableStore[(String, Int), ClusterDetails] = { - implicit val keyInjection: Injection[(String, Int), Array[Byte]] = - Bufferable.injectionOf[(String, Int)] - implicit val valueInjection: Injection[ClusterDetails, Array[Byte]] = - CompactScalaCodec(ClusterDetails) - - ManhattanRO.getReadableStoreWithMtls[(String, Int), ClusterDetails]( - ManhattanROConfig( - HDFSPath(""), // not needed - ApplicationID("simclusters_v2"), - DatasetName(datasetName), // this should be correct - Athena - ), - mhMtlsParams - ) - } - - def apply( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[(String, Int), ClusterDetails] = { - new ReadableStore[(String, Int), ClusterDetails] { - override def get(modelVersionAndClusterId: (String, Int)): Future[Option[ClusterDetails]] = { - val (modelVersion, _) = modelVersionAndClusterId - modelVersionToDatasetMap.get(modelVersion) match { - case Some(datasetName) => - clusterDetailsStores((mhMtlsParams, datasetName)).get(modelVersionAndClusterId) - case None => - Future.exception( - new IllegalArgumentException( - "Unknown model version " + modelVersion + ". Known modelVersions: " + knownModelVersions) - ) - } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.docx new file mode 100644 index 000000000..0cc5f6eae Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.scala deleted file mode 100644 index b25687f4e..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/EntityClusterScoreReadableStore.scala +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.simclusters_v2.summingbird.common.Implicits.clustersWithScoreMonoid -import com.twitter.simclusters_v2.summingbird.common.Implicits.clustersWithScoresCodec -import com.twitter.storehaus.algebra.MergeableStore -import com.twitter.simclusters_v2.summingbird.common.ClientConfigs -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.thriftscala.ClustersWithScores -import com.twitter.simclusters_v2.thriftscala.FullClusterIdBucket -import com.twitter.simclusters_v2.thriftscala.MultiModelClustersWithScores -import com.twitter.simclusters_v2.thriftscala.SimClusterEntity -import com.twitter.storehaus.Store -import com.twitter.storehaus_internal.memcache.Memcache -import com.twitter.strato.client.Client -import com.twitter.summingbird.batch.BatchID -import com.twitter.summingbird_internal.bijection.BatchPairImplicits -import com.twitter.util.Future -import com.twitter.strato.thrift.ScroogeConvImplicits._ - -object EntityClusterScoreReadableStore { - - private[simclusters_v2] final lazy val onlineMergeableStore: ( - String, - ServiceIdentifier - ) => MergeableStore[ - ((SimClusterEntity, FullClusterIdBucket), BatchID), - ClustersWithScores - ] = { (path: String, serviceIdentifier: ServiceIdentifier) => - Memcache - .getMemcacheStore[((SimClusterEntity, FullClusterIdBucket), BatchID), ClustersWithScores]( - ClientConfigs.entityClusterScoreMemcacheConfig(path, serviceIdentifier) - )( - BatchPairImplicits.keyInjection[(SimClusterEntity, FullClusterIdBucket)]( - Implicits.entityWithClusterInjection - ), - clustersWithScoresCodec, - clustersWithScoreMonoid - ) - } - -} - -object MultiModelEntityClusterScoreReadableStore { - - private[simclusters_v2] def MultiModelEntityClusterScoreReadableStore( - stratoClient: Client, - column: String - ): Store[EntityClusterId, MultiModelClustersWithScores] = { - StratoStore - .withUnitView[(SimClusterEntity, Int), MultiModelClustersWithScores](stratoClient, column) - .composeKeyMapping(_.toTuple) - } - - case class EntityClusterId( - simClusterEntity: SimClusterEntity, - clusterIdBucket: Int) { - lazy val toTuple: (SimClusterEntity, Int) = - (simClusterEntity, clusterIdBucket) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.docx new file mode 100644 index 000000000..e65b0a2cf Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.scala deleted file mode 100644 index ba9af7f00..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ManhattanFromStratoStore.scala +++ /dev/null @@ -1,108 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.Injection -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.io.Buf -import com.twitter.scrooge.ThriftStruct -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore.Timestamp -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.storage.client.manhattan.kv.Guarantee -import com.twitter.storage.client.manhattan.kv.ManhattanKVClient -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpointBuilder -import com.twitter.storage.client.manhattan.kv.impl.FullBufKey -import com.twitter.storage.client.manhattan.kv.impl.ValueDescriptor -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan_kv.ManhattanEndpointStore -import com.twitter.strato.catalog.Version -import com.twitter.strato.config.MValEncoding -import com.twitter.strato.config.NativeEncoding -import com.twitter.strato.config.PkeyLkey2 -import com.twitter.strato.data.Conv -import com.twitter.strato.data.Type -import com.twitter.strato.mh.ManhattanInjections -import com.twitter.strato.thrift.ScroogeConv -import com.twitter.strato.thrift.ScroogeConvImplicits._ - -object ManhattanFromStratoStore { - /* This enables reading from a MH store where the data is written by Strato. Strato uses a unique - encoding (Conv) which needs to be reconstructed for each MH store based on the type of data that - is written to it. Once that encoding is generated on start-up, we can read from the store like - any other ReadableStore. - */ - def createPersistentTweetStore( - dataset: String, - mhMtlsParams: ManhattanKVClientMtlsParams, - statsReceiver: StatsReceiver = NullStatsReceiver - ): ReadableStore[(TweetId, Timestamp), PersistentSimClustersEmbedding] = { - val appId = "simclusters_embeddings_prod" - val dest = "/s/manhattan/omega.native-thrift" - - val endpoint = createMhEndpoint( - appId = appId, - dest = dest, - mhMtlsParams = mhMtlsParams, - statsReceiver = statsReceiver) - - val ( - keyInj: Injection[(TweetId, Timestamp), FullBufKey], - valueDesc: ValueDescriptor.EmptyValue[PersistentSimClustersEmbedding]) = - injectionsFromPkeyLkeyValueStruct[TweetId, Timestamp, PersistentSimClustersEmbedding]( - dataset = dataset, - pkType = Type.Long, - lkType = Type.Long) - - ManhattanEndpointStore - .readable[(TweetId, Timestamp), PersistentSimClustersEmbedding, FullBufKey]( - endpoint = endpoint, - keyDescBuilder = keyInj, - emptyValDesc = valueDesc) - } - - private def createMhEndpoint( - appId: String, - dest: String, - mhMtlsParams: ManhattanKVClientMtlsParams, - statsReceiver: StatsReceiver = NullStatsReceiver - ) = { - val mhc = ManhattanKVClient.memoizedByDest( - appId = appId, - dest = dest, - mtlsParams = mhMtlsParams - ) - - ManhattanKVEndpointBuilder(mhc) - .defaultGuarantee(Guarantee.SoftDcReadMyWrites) - .statsReceiver(statsReceiver) - .build() - } - - private def injectionsFromPkeyLkeyValueStruct[PK: Conv, LK: Conv, V <: ThriftStruct: Manifest]( - dataset: String, - pkType: Type, - lkType: Type - ): (Injection[(PK, LK), FullBufKey], ValueDescriptor.EmptyValue[V]) = { - // Strato uses a unique encoding (Conv) so we need to rebuild that based on the pkey, lkey and - // value type before converting it to the Manhattan injections for key -> FullBufKey and - // value -> Buf - val valueConv: Conv[V] = ScroogeConv.fromStruct[V] - - val mhEncodingMapping = PkeyLkey2( - pkey = pkType, - lkey = lkType, - value = valueConv.t, - pkeyEncoding = NativeEncoding, - lkeyEncoding = NativeEncoding, - valueEncoding = MValEncoding() - ) - - val (keyInj: Injection[(PK, LK), FullBufKey], valueInj: Injection[V, Buf], _, _) = - ManhattanInjections.fromPkeyLkey[PK, LK, V](mhEncodingMapping, dataset, Version.Default) - - val valDesc: ValueDescriptor.EmptyValue[V] = ValueDescriptor.EmptyValue(valueInj) - - (keyInj, valDesc) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.docx new file mode 100644 index 000000000..5a2228b9c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.scala deleted file mode 100644 index ab9c06240..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/PersistentTweetEmbeddingStore.scala +++ /dev/null @@ -1,104 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.store.strato.StratoFetchableStore -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.SimClustersEmbedding._ -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus.Store -import com.twitter.strato.catalog.Scan.Slice -import com.twitter.strato.client.Client -import com.twitter.strato.thrift.ScroogeConvImplicits._ - -object PersistentTweetEmbeddingStore { - - val LogFavBasedColumn = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145KUpdatedPersistent" - val LogFavBasedColumn20m145k2020 = - "recommendations/simclusters_v2/embeddings/logFavBasedTweet20M145K2020Persistent" - - val LogFavBased20m145k2020Dataset = "log_fav_based_tweet_20m_145k_2020_embeddings" - val LogFavBased20m145kUpdatedDataset = "log_fav_based_tweet_20m_145k_updated_embeddings" - - val DefaultMaxLength = 15 - - def mostRecentTweetEmbeddingStore( - stratoClient: Client, - column: String, - maxLength: Int = DefaultMaxLength - ): ReadableStore[TweetId, SimClustersEmbedding] = { - StratoFetchableStore - .withUnitView[(TweetId, Timestamp), PersistentSimClustersEmbedding](stratoClient, column) - .composeKeyMapping[TweetId]((_, LatestEmbeddingVersion)) - .mapValues(_.embedding.truncate(maxLength)) - } - - def longestL2NormTweetEmbeddingStore( - stratoClient: Client, - column: String - ): ReadableStore[TweetId, SimClustersEmbedding] = - StratoFetchableStore - .withUnitView[(TweetId, Timestamp), PersistentSimClustersEmbedding](stratoClient, column) - .composeKeyMapping[TweetId]((_, LongestL2EmbeddingVersion)) - .mapValues(_.embedding) - - def mostRecentTweetEmbeddingStoreManhattan( - mhMtlsParams: ManhattanKVClientMtlsParams, - dataset: String, - statsReceiver: StatsReceiver, - maxLength: Int = DefaultMaxLength - ): ReadableStore[TweetId, SimClustersEmbedding] = - ManhattanFromStratoStore - .createPersistentTweetStore( - dataset = dataset, - mhMtlsParams = mhMtlsParams, - statsReceiver = statsReceiver - ).composeKeyMapping[TweetId]((_, LatestEmbeddingVersion)) - .mapValues[SimClustersEmbedding](_.embedding.truncate(maxLength)) - - def longestL2NormTweetEmbeddingStoreManhattan( - mhMtlsParams: ManhattanKVClientMtlsParams, - dataset: String, - statsReceiver: StatsReceiver, - maxLength: Int = 50 - ): ReadableStore[TweetId, SimClustersEmbedding] = - ManhattanFromStratoStore - .createPersistentTweetStore( - dataset = dataset, - mhMtlsParams = mhMtlsParams, - statsReceiver = statsReceiver - ).composeKeyMapping[TweetId]((_, LongestL2EmbeddingVersion)) - .mapValues[SimClustersEmbedding](_.embedding.truncate(maxLength)) - - /** - * The writeable store for Persistent Tweet embedding. Only available in SimClusters package. - */ - private[simclusters_v2] def persistentTweetEmbeddingStore( - stratoClient: Client, - column: String - ): Store[PersistentTweetEmbeddingId, PersistentSimClustersEmbedding] = { - StratoStore - .withUnitView[(TweetId, Timestamp), PersistentSimClustersEmbedding](stratoClient, column) - .composeKeyMapping(_.toTuple) - } - - type Timestamp = Long - - case class PersistentTweetEmbeddingId( - tweetId: TweetId, - timestampInMs: Timestamp = LatestEmbeddingVersion) { - lazy val toTuple: (TweetId, Timestamp) = (tweetId, timestampInMs) - } - - // Special version - reserved for the latest version of the embedding - private[summingbird] val LatestEmbeddingVersion = 0L - // Special version - reserved for the embedding with the longest L2 norm - private[summingbird] val LongestL2EmbeddingVersion = 1L - - // The tweet embedding store keeps at most 20 LKeys - private[stores] val DefaultSlice = Slice[Long](from = None, to = None, limit = None) -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.docx new file mode 100644 index 000000000..7e50b8a3d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.scala deleted file mode 100644 index e978aa9f9..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/ProducerClusterEmbeddingReadableStores.scala +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.thriftscala.PersistedFullClusterId -import com.twitter.simclusters_v2.thriftscala.TopProducersWithScore -import com.twitter.simclusters_v2.thriftscala.TopSimClustersWithScore -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.Athena -import com.twitter.storehaus_internal.manhattan.ManhattanRO -import com.twitter.storehaus_internal.manhattan.ManhattanROConfig -import com.twitter.storehaus_internal.util.ApplicationID -import com.twitter.storehaus_internal.util.DatasetName -import com.twitter.storehaus_internal.util.HDFSPath - -object ProducerClusterEmbeddingReadableStores { - - implicit val longInject: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val clusterInject: Injection[TopSimClustersWithScore, Array[Byte]] = - CompactScalaCodec(TopSimClustersWithScore) - implicit val producerInject: Injection[TopProducersWithScore, Array[Byte]] = - CompactScalaCodec(TopProducersWithScore) - implicit val clusterIdInject: Injection[PersistedFullClusterId, Array[Byte]] = - CompactScalaCodec(PersistedFullClusterId) - - private val appId = "simclusters_v2" - - def getSimClusterEmbeddingTopKProducersStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[PersistedFullClusterId, TopProducersWithScore] = { - ManhattanRO.getReadableStoreWithMtls[PersistedFullClusterId, TopProducersWithScore]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("simcluster_embedding_top_k_producers_by_fav_score_20m_145k_updated"), - Athena - ), - mhMtlsParams - ) - } - - def getProducerTopKSimClustersEmbeddingsStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, TopSimClustersWithScore] = { - val datasetName = "producer_top_k_simcluster_embeddings_by_fav_score_20m_145k_updated" - ManhattanRO.getReadableStoreWithMtls[Long, TopSimClustersWithScore]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName(datasetName), - Athena - ), - mhMtlsParams - ) - } - - def getProducerTopKSimClusters2020EmbeddingsStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, TopSimClustersWithScore] = { - val datasetName = "producer_top_k_simcluster_embeddings_by_fav_score_20m_145k_2020" - ManhattanRO.getReadableStoreWithMtls[Long, TopSimClustersWithScore]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName(datasetName), - Athena - ), - mhMtlsParams - ) - } - - def getSimClusterEmbeddingTopKProducersByFollowStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[PersistedFullClusterId, TopProducersWithScore] = { - ManhattanRO.getReadableStoreWithMtls[PersistedFullClusterId, TopProducersWithScore]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("simcluster_embedding_top_k_producers_by_follow_score_20m_145k_updated"), - Athena - ), - mhMtlsParams - ) - } - - def getProducerTopKSimClustersEmbeddingsByFollowStore( - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, TopSimClustersWithScore] = { - ManhattanRO.getReadableStoreWithMtls[Long, TopSimClustersWithScore]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(appId), - DatasetName("producer_top_k_simcluster_embeddings_by_follow_score_20m_145k_2020"), - Athena - ), - mhMtlsParams - ) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.docx new file mode 100644 index 000000000..e0a108a71 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.scala deleted file mode 100644 index ccdea937c..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SemanticCoreEntityEmbeddingStore.scala +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - LocaleEntityId, - SimClustersEmbeddingId, - SimClustersEmbedding => ThriftSimClustersEmbedding -} -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.Client -import com.twitter.strato.thrift.ScroogeConvImplicits._ -import com.twitter.simclusters_v2.common.SimClustersEmbedding - -/** - * entity -> List< cluster > - */ -object SemanticCoreEntityEmbeddingStore { - - private val column = - "recommendations/simclusters_v2/embeddings/semanticCoreEntityPerLanguageEmbeddings20M145KUpdated" - - /** - * Default store, wrapped in generic data types. Use this if you know the underlying key struct. - */ - private def getDefaultStore( - stratoClient: Client - ): ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding] = { - StratoStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](stratoClient, column) - } - - def getFavBasedLocaleEntityEmbeddingStore( - stratoClient: Client - ): ReadableStore[LocaleEntityId, SimClustersEmbedding] = { - getDefaultStore(stratoClient) - .composeKeyMapping[LocaleEntityId] { entityId => - SimClustersEmbeddingId( - EmbeddingType.FavBasedSematicCoreEntity, - ModelVersions.Model20M145KUpdated, - InternalId.LocaleEntityId(entityId) - ) - } - .mapValues(SimClustersEmbedding(_)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.docx new file mode 100644 index 000000000..ea71e371e Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.scala deleted file mode 100644 index 63c1e772c..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/SimClustersManhattanReadableStoreForReadWriteDataset.scala +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storage.client.manhattan.kv.ManhattanKVClient -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storage.client.manhattan.kv.ManhattanKVEndpointBuilder -import com.twitter.storage.client.manhattan.kv.impl.Component -import com.twitter.storage.client.manhattan.kv.impl.DescriptorP1L0 -import com.twitter.storage.client.manhattan.kv.impl.KeyDescriptor -import com.twitter.storage.client.manhattan.kv.impl.ValueDescriptor -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.ManhattanCluster -import com.twitter.storehaus_internal.manhattan.Adama -import com.twitter.storage.client.manhattan.bijections.Bijections.BinaryScalaInjection -import com.twitter.storage.client.manhattan.kv.Guarantee -import com.twitter.conversions.DurationOps._ -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.stitch.Stitch -import com.twitter.storage.client.manhattan.bijections.Bijections.LongInjection -import com.twitter.util.Future - -/** - * Manhattan Readable Store to fetch simcluster embedding from a read-write dataset. - * Only read operations are allowed through this store. - * @param appId The "application id" - * @param datasetName The MH dataset name. - * @param label The human readable label for the finagle thrift client - * @param mtlsParams Client service identifier to use to authenticate with Manhattan service - * @param manhattanCluster Manhattan RW cluster - **/ -class SimClustersManhattanReadableStoreForReadWriteDataset( - appId: String, - datasetName: String, - label: String, - mtlsParams: ManhattanKVClientMtlsParams, - manhattanCluster: ManhattanCluster = Adama) - extends ReadableStore[SimClustersEmbeddingId, ClustersUserIsInterestedIn] { - /* - Setting up a new builder to read from Manhattan RW dataset. This is specifically required for - BeT project where we update the MH RW dataset (every 2 hours) using cloud shuttle service. - */ - val destName = manhattanCluster.wilyName - val endPoint = ManhattanKVEndpointBuilder(ManhattanKVClient(appId, destName, mtlsParams, label)) - .defaultGuarantee(Guarantee.SoftDcReadMyWrites) - .build() - - val keyDesc = KeyDescriptor(Component(LongInjection), Component()).withDataset(datasetName) - val valueDesc = ValueDescriptor(BinaryScalaInjection(ClustersUserIsInterestedIn)) - - override def get( - embeddingId: SimClustersEmbeddingId - ): Future[Option[ClustersUserIsInterestedIn]] = { - embeddingId match { - case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) => - val populatedKey: DescriptorP1L0.FullKey[Long] = keyDesc.withPkey(userId) - // returns result - val mhValue = Stitch.run(endPoint.get(populatedKey, valueDesc)) - mhValue.map { - case Some(x) => Option(x.contents) - case _ => None - } - case _ => Future.None - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.docx new file mode 100644 index 000000000..b9bf2591c Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.scala deleted file mode 100644 index 1332c573a..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TfgTopicEmbeddingsStore.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TopicId -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.Client -import com.twitter.strato.thrift.ScroogeConvImplicits._ -import com.twitter.simclusters_v2.common.SimClustersEmbedding - -/** - * TopicId -> List< cluster> - */ -object TfgTopicEmbeddingsStore { - - private val favBasedColumn20M145K2020 = - "recommendations/simclusters_v2/embeddings/favBasedTFGTopic20M145K2020" - - private def getStore( - stratoClient: Client, - column: String - ): ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding] = { - StratoStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](stratoClient, column) - } - - def getFavBasedLocaleEntityEmbedding2020Store( - stratoClient: Client, - ): ReadableStore[TopicId, SimClustersEmbedding] = { - - getStore(stratoClient, favBasedColumn20M145K2020) - .composeKeyMapping[TopicId] { topicId => - SimClustersEmbeddingId( - EmbeddingType.FavTfgTopic, - ModelVersions.Model20M145K2020, - InternalId.TopicId(topicId) - ) - } - .mapValues(SimClustersEmbedding(_)) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.docx new file mode 100644 index 000000000..1accdcfa0 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.scala deleted file mode 100644 index baa3fa2a1..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForEntityReadableStore.scala +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.simclusters_v2.summingbird.common.EntityUtil -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future -import com.twitter.util.Time - -case class TopKClustersForEntityReadableStore( - underlyingStore: ReadableStore[EntityWithVersion, TopKClustersWithScores]) - extends ReadableStore[EntityWithVersion, TopKClustersWithScores] { - - override def multiGet[K1 <: EntityWithVersion]( - ks: Set[K1] - ): Map[K1, Future[Option[TopKClustersWithScores]]] = { - val nowInMs = Time.now.inMilliseconds - underlyingStore - .multiGet(ks) - .mapValues { resFuture => - resFuture.map { resOpt => - resOpt.map { clustersWithScores => - clustersWithScores.copy( - topClustersByFavClusterNormalizedScore = EntityUtil.updateScoreWithLatestTimestamp( - clustersWithScores.topClustersByFavClusterNormalizedScore, - nowInMs - ), - topClustersByFollowClusterNormalizedScore = EntityUtil.updateScoreWithLatestTimestamp( - clustersWithScores.topClustersByFollowClusterNormalizedScore, - nowInMs - ) - ) - } - } - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.docx new file mode 100644 index 000000000..765e4be18 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.scala deleted file mode 100644 index f2381a2a5..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKClustersForTweetReadableStore.scala +++ /dev/null @@ -1,176 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.summingbird.common.Implicits.batcher -import com.twitter.simclusters_v2.summingbird.common.Implicits.topKClustersWithScoresCodec -import com.twitter.simclusters_v2.summingbird.common.Implicits.topKClustersWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.Environment -import com.twitter.simclusters_v2.summingbird.common.ClientConfigs -import com.twitter.simclusters_v2.summingbird.common.Configs -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus.algebra.MergeableStore -import com.twitter.storehaus_internal.memcache.Memcache -import com.twitter.summingbird.batch.BatchID -import com.twitter.summingbird.store.ClientStore -import com.twitter.summingbird_internal.bijection.BatchPairImplicits -import com.twitter.util.Duration -import com.twitter.util.Future - -object TopKClustersForTweetReadableStore { - - private[summingbird] final lazy val onlineMergeableStore: ( - String, - ServiceIdentifier - ) => MergeableStore[(EntityWithVersion, BatchID), TopKClustersWithScores] = { - (storePath: String, serviceIdentifier: ServiceIdentifier) => - Memcache.getMemcacheStore[(EntityWithVersion, BatchID), TopKClustersWithScores]( - ClientConfigs.tweetTopKClustersMemcacheConfig(storePath, serviceIdentifier) - )( - BatchPairImplicits.keyInjection[EntityWithVersion](Implicits.topKClustersKeyCodec), - topKClustersWithScoresCodec, - topKClustersWithScoresMonoid - ) - } - - final lazy val defaultStore: ( - String, - ServiceIdentifier - ) => ReadableStore[EntityWithVersion, TopKClustersWithScores] = { - (storePath: String, serviceIdentifier: ServiceIdentifier) => - // note that DefaultTopKClustersForEntityReadableStore is reused here because they share the - // same structure - TopKClustersForEntityReadableStore( - ClientStore(this.onlineMergeableStore(storePath, serviceIdentifier), Configs.batchesToKeep)) - } -} - -case class TweetKey( - tweetId: Long, - modelVersion: String, - embeddingType: EmbeddingType = EmbeddingType.FavBasedTweet, - halfLife: Duration = Configs.HalfLife) { - - lazy val modelVersionThrift: ModelVersion = ModelVersions.toModelVersion(modelVersion) - - lazy val simClustersEmbeddingId: SimClustersEmbeddingId = - SimClustersEmbeddingId(embeddingType, modelVersionThrift, InternalId.TweetId(tweetId)) -} - -object TweetKey { - - def apply(simClustersEmbeddingId: SimClustersEmbeddingId): TweetKey = { - simClustersEmbeddingId match { - case SimClustersEmbeddingId(embeddingType, modelVersion, InternalId.TweetId(tweetId)) => - TweetKey(tweetId, ModelVersions.toKnownForModelVersion(modelVersion), embeddingType) - case id => - throw new IllegalArgumentException(s"Invalid $id for TweetKey") - } - } - -} - -case class TopKClustersForTweetKeyReadableStore( - proxyMap: Map[(EmbeddingType, String), ReadableStore[EntityWithVersion, TopKClustersWithScores]], - halfLifeDuration: Duration, - topKClustersWithScoresToSeq: TopKClustersWithScores => Seq[(Int, Double)], - maxResult: Option[Int] = None) - extends ReadableStore[TweetKey, Seq[(Int, Double)]] { - - private val modifiedProxyMap = proxyMap.map { - case ((embeddingType, modelVersion), proxy) => - (embeddingType, modelVersion) -> proxy.composeKeyMapping { key: TweetKey => - EntityWithVersion( - SimClusterEntity.TweetId(key.tweetId), - // Fast fail if the model version is invalid. - ModelVersions.toModelVersion(modelVersion)) - } - } - - override def multiGet[K1 <: TweetKey]( - keys: Set[K1] - ): Map[K1, Future[Option[Seq[(Int, Double)]]]] = { - val (validKeys, invalidKeys) = keys.partition { tweetKey => - proxyMap.contains((tweetKey.embeddingType, tweetKey.modelVersion)) && - halfLifeDuration.inMilliseconds == Configs.HalfLifeInMs - } - - val resultsFuture = validKeys.groupBy(key => (key.embeddingType, key.modelVersion)).flatMap { - case (typeModelTuple, subKeys) => - modifiedProxyMap(typeModelTuple).multiGet(subKeys) - } - - resultsFuture.mapValues { topKClustersWithScoresFut => - for (topKClustersWithScoresOpt <- topKClustersWithScoresFut) yield { - for { - topKClustersWithScores <- topKClustersWithScoresOpt - } yield { - val results = topKClustersWithScoresToSeq(topKClustersWithScores) - maxResult match { - case Some(max) => - results.take(max) - case None => - results - } - } - } - } ++ invalidKeys.map { key => (key, Future.None) }.toMap - } -} - -object TopKClustersForTweetKeyReadableStore { - // Use Prod cache by default - def defaultProxyMap( - serviceIdentifier: ServiceIdentifier - ): Map[(EmbeddingType, String), ReadableStore[EntityWithVersion, TopKClustersWithScores]] = - SimClustersProfile.tweetJobProfileMap(Environment.Prod).mapValues { profile => - TopKClustersForTweetReadableStore - .defaultStore(profile.clusterTopKTweetsPath, serviceIdentifier) - } - val defaultHalfLife: Duration = Duration.fromMilliseconds(Configs.HalfLifeInMs) - - def defaultStore( - serviceIdentifier: ServiceIdentifier - ): ReadableStore[TweetKey, Seq[(Int, Double)]] = - TopKClustersForTweetKeyReadableStore( - defaultProxyMap(serviceIdentifier), - defaultHalfLife, - getTopClustersWithScoresByFavClusterNormalizedScore - ) - - def overrideLimitDefaultStore( - maxResult: Int, - serviceIdentifier: ServiceIdentifier - ): ReadableStore[TweetKey, Seq[(Int, Double)]] = { - TopKClustersForTweetKeyReadableStore( - defaultProxyMap(serviceIdentifier), - defaultHalfLife, - getTopClustersWithScoresByFavClusterNormalizedScore, - Some(maxResult) - ) - } - - private def getTopClustersWithScoresByFavClusterNormalizedScore( - topKClustersWithScores: TopKClustersWithScores - ): Seq[(Int, Double)] = { - { - for { - clusterIdWIthScores <- topKClustersWithScores.topClustersByFavClusterNormalizedScore - } yield { - ( - for { - (clusterId, scores) <- clusterIdWIthScores - favClusterNormalized8HrHalfLifeScore <- scores.favClusterNormalized8HrHalfLifeScore - if favClusterNormalized8HrHalfLifeScore.value > 0.0 - } yield { - clusterId -> favClusterNormalized8HrHalfLifeScore.value - } - ).toSeq.sortBy(-_._2) - } - }.getOrElse(Nil) - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.docx new file mode 100644 index 000000000..420cccb92 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.scala deleted file mode 100644 index 39284424f..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TopKTweetsForClusterReadableStore.scala +++ /dev/null @@ -1,298 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.frigate.common.store.strato.StratoStore -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterTweetIndexStoreConfig -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.common.ClientConfigs -import com.twitter.simclusters_v2.summingbird.common.Configs -import com.twitter.simclusters_v2.summingbird.common.EntityUtil -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.summingbird.common.Implicits.batcher -import com.twitter.simclusters_v2.summingbird.common.Implicits.topKTweetsWithScoresCodec -import com.twitter.simclusters_v2.summingbird.common.Implicits.topKTweetsWithScoresMonoid -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.Environment -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.FullClusterId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.MultiModelTopKTweetsWithScores -import com.twitter.simclusters_v2.thriftscala.TopKTweetsWithScores -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus.Store -import com.twitter.storehaus.algebra.MergeableStore -import com.twitter.storehaus_internal.manhattan.ManhattanRO -import com.twitter.storehaus_internal.manhattan.ManhattanROConfig -import com.twitter.storehaus_internal.memcache.Memcache -import com.twitter.storehaus_internal.util.ApplicationID -import com.twitter.storehaus_internal.util.DatasetName -import com.twitter.storehaus_internal.util.HDFSPath -import com.twitter.strato.client.Client -import com.twitter.strato.thrift.ScroogeConvImplicits._ -import com.twitter.summingbird.batch.BatchID -import com.twitter.summingbird.store.ClientStore -import com.twitter.summingbird_internal.bijection.BatchPairImplicits -import com.twitter.util.Duration -import com.twitter.util.Future -import com.twitter.util.Time - -/** - * Comparing to underlyingStore, this store decays all the values to current timestamp - */ -case class TopKTweetsForClusterReadableStore( - underlyingStore: ReadableStore[FullClusterId, TopKTweetsWithScores]) - extends ReadableStore[FullClusterId, TopKTweetsWithScores] { - - override def multiGet[K1 <: FullClusterId]( - ks: Set[K1] - ): Map[K1, Future[Option[TopKTweetsWithScores]]] = { - val nowInMs = Time.now.inMilliseconds - underlyingStore - .multiGet(ks) - .mapValues { resFuture => - resFuture.map { resOpt => - resOpt.map { tweetsWithScores => - tweetsWithScores.copy( - topTweetsByFavClusterNormalizedScore = EntityUtil.updateScoreWithLatestTimestamp( - tweetsWithScores.topTweetsByFavClusterNormalizedScore, - nowInMs), - topTweetsByFollowClusterNormalizedScore = EntityUtil.updateScoreWithLatestTimestamp( - tweetsWithScores.topTweetsByFollowClusterNormalizedScore, - nowInMs) - ) - } - } - } - } -} - -object TopKTweetsForClusterReadableStore { - - private[summingbird] final lazy val onlineMergeableStore: ( - String, - ServiceIdentifier - ) => MergeableStore[(FullClusterId, BatchID), TopKTweetsWithScores] = { - (storePath: String, serviceIdentifier: ServiceIdentifier) => - Memcache.getMemcacheStore[(FullClusterId, BatchID), TopKTweetsWithScores]( - ClientConfigs.clusterTopTweetsMemcacheConfig(storePath, serviceIdentifier) - )( - BatchPairImplicits.keyInjection[FullClusterId](Implicits.fullClusterIdCodec), - topKTweetsWithScoresCodec, - topKTweetsWithScoresMonoid - ) - } - - final lazy val defaultStore: ( - String, - ServiceIdentifier - ) => ReadableStore[FullClusterId, TopKTweetsWithScores] = { - (storePath: String, serviceIdentifier: ServiceIdentifier) => - TopKTweetsForClusterReadableStore( - ClientStore( - TopKTweetsForClusterReadableStore.onlineMergeableStore(storePath, serviceIdentifier), - Configs.batchesToKeep - )) - } -} - -object MultiModelTopKTweetsForClusterReadableStore { - - private[simclusters_v2] def MultiModelTopKTweetsForClusterReadableStore( - stratoClient: Client, - column: String - ): Store[Int, MultiModelTopKTweetsWithScores] = { - StratoStore - .withUnitView[Int, MultiModelTopKTweetsWithScores](stratoClient, column) - } -} - -case class ClusterKey( - clusterId: ClusterId, - modelVersion: String, - embeddingType: EmbeddingType = EmbeddingType.FavBasedTweet, - halfLife: Duration = Configs.HalfLife) { - lazy val modelVersionThrift: ModelVersion = ModelVersions.toModelVersion(modelVersion) -} - -case class TopKTweetsForClusterKeyReadableStore( - proxyMap: Map[(EmbeddingType, String), ReadableStore[FullClusterId, TopKTweetsWithScores]], - halfLife: Duration, - topKTweetsWithScoresToSeq: TopKTweetsWithScores => Seq[(Long, Double)], - maxResult: Option[Int] = None) - extends ReadableStore[ClusterKey, Seq[(Long, Double)]] { - - private val modifiedProxyMap = proxyMap.map { - case (typeModelTuple, proxy) => - typeModelTuple -> proxy.composeKeyMapping { key: ClusterKey => - FullClusterId(ModelVersions.toModelVersion(typeModelTuple._2), key.clusterId) - } - } - - override def multiGet[K1 <: ClusterKey]( - keys: Set[K1] - ): Map[K1, Future[Option[Seq[(Long, Double)]]]] = { - val (validKeys, invalidKeys) = keys.partition { clusterKey => - proxyMap.contains( - (clusterKey.embeddingType, clusterKey.modelVersion)) && clusterKey.halfLife == halfLife - } - - val resultsFuture = validKeys.groupBy(key => (key.embeddingType, key.modelVersion)).flatMap { - case (typeModelTuple, subKeys) => - modifiedProxyMap(typeModelTuple).multiGet(subKeys) - } - - resultsFuture.mapValues { topKTweetsWithScoresFut => - for (topKTweetsWithScoresOpt <- topKTweetsWithScoresFut) yield { - for { - topKTweetsWithScores <- topKTweetsWithScoresOpt - } yield { - val results = topKTweetsWithScoresToSeq(topKTweetsWithScores) - maxResult match { - case Some(max) => - results.take(max) - case None => - results - } - } - } - } ++ invalidKeys.map { key => (key, Future.None) }.toMap - } -} - -object TopKTweetsForClusterKeyReadableStore { - implicit val fullClusterIdInjection: Injection[FullClusterId, Array[Byte]] = - CompactScalaCodec(FullClusterId) - - // Use Prod cache by default - def defaultProxyMap( - serviceIdentifier: ServiceIdentifier, - ): Map[(EmbeddingType, String), ReadableStore[FullClusterId, TopKTweetsWithScores]] = - SimClustersProfile.tweetJobProfileMap(Environment.Prod).mapValues { profile => - TopKTweetsForClusterReadableStore - .defaultStore(profile.clusterTopKTweetsPath, serviceIdentifier) - } - val defaultHalfLife: Duration = Configs.HalfLife - - def defaultStore( - serviceIdentifier: ServiceIdentifier - ): ReadableStore[ClusterKey, Seq[(Long, Double)]] = - TopKTweetsForClusterKeyReadableStore( - defaultProxyMap(serviceIdentifier), - defaultHalfLife, - getTopTweetsWithScoresByFavClusterNormalizedScore - ) - - def storeUsingFollowClusterNormalizedScore( - serviceIdentifier: ServiceIdentifier - ): ReadableStore[ClusterKey, Seq[(Long, Double)]] = - TopKTweetsForClusterKeyReadableStore( - defaultProxyMap(serviceIdentifier), - defaultHalfLife, - getTopTweetsWithScoresByFollowClusterNormalizedScore - ) - - def overrideLimitDefaultStore( - maxResult: Int, - serviceIdentifier: ServiceIdentifier, - ): ReadableStore[ClusterKey, Seq[(Long, Double)]] = { - TopKTweetsForClusterKeyReadableStore( - defaultProxyMap(serviceIdentifier), - defaultHalfLife, - getTopTweetsWithScoresByFavClusterNormalizedScore, - Some(maxResult) - ) - } - - private def getTopTweetsWithScoresByFavClusterNormalizedScore( - topKTweets: TopKTweetsWithScores - ): Seq[(Long, Double)] = { - { - for { - tweetIdWithScores <- topKTweets.topTweetsByFavClusterNormalizedScore - } yield { - ( - for { - (tweetId, scores) <- tweetIdWithScores - favClusterNormalized8HrHalfLifeScore <- scores.favClusterNormalized8HrHalfLifeScore - if favClusterNormalized8HrHalfLifeScore.value > 0.0 - } yield { - tweetId -> favClusterNormalized8HrHalfLifeScore.value - } - ).toSeq.sortBy(-_._2) - } - }.getOrElse(Nil) - } - - private def getTopTweetsWithScoresByFollowClusterNormalizedScore( - topKTweets: TopKTweetsWithScores - ): Seq[(Long, Double)] = { - { - for { - tweetIdWithScores <- topKTweets.topTweetsByFollowClusterNormalizedScore - } yield { - ( - for { - (tweetId, scores) <- tweetIdWithScores - followClusterNormalized8HrHalfLifeScore <- - scores.followClusterNormalized8HrHalfLifeScore - if followClusterNormalized8HrHalfLifeScore.value > 0.0 - } yield { - tweetId -> followClusterNormalized8HrHalfLifeScore.value - } - ).toSeq.sortBy(-_._2) - } - }.getOrElse(Nil) - } - - def getClusterToTopKTweetsStoreFromManhattanRO( - maxResults: Int, - manhattanConfig: ClusterTweetIndexStoreConfig.Manhattan, - serviceIdentifier: ServiceIdentifier, - ): ReadableStore[ClusterKey, Seq[(TweetId, Double)]] = { - ManhattanRO - .getReadableStoreWithMtls[FullClusterId, TopKTweetsWithScores]( - ManhattanROConfig( - HDFSPath(""), - ApplicationID(manhattanConfig.applicationID), - DatasetName(manhattanConfig.datasetName), - manhattanConfig.manhattanCluster - ), - ManhattanKVClientMtlsParams(serviceIdentifier) - ).composeKeyMapping[ClusterKey] { clusterKey => - FullClusterId( - modelVersion = ModelVersions.toModelVersion(clusterKey.modelVersion), - clusterId = clusterKey.clusterId - ) - }.mapValues { topKTweetsWithScores => - // Only return maxResults tweets for each cluster Id - getTopTweetsWithScoresByFavClusterNormalizedScore(topKTweetsWithScores).take(maxResults) - } - } - - def getClusterToTopKTweetsStoreFromMemCache( - maxResults: Int, - memCacheConfig: ClusterTweetIndexStoreConfig.Memcached, - serviceIdentifier: ServiceIdentifier, - ): ReadableStore[ClusterKey, Seq[(TweetId, Double)]] = { - TopKTweetsForClusterReadableStore( - ClientStore( - TopKTweetsForClusterReadableStore - .onlineMergeableStore(memCacheConfig.memcachedDest, serviceIdentifier), - Configs.batchesToKeep - )) - .composeKeyMapping[ClusterKey] { clusterKey => - FullClusterId( - modelVersion = ModelVersions.toModelVersion(clusterKey.modelVersion), - clusterId = clusterKey.clusterId - ) - }.mapValues { topKTweetsWithScores => - // Only return maxResults tweets for each cluster Id - getTopTweetsWithScoresByFavClusterNormalizedScore(topKTweetsWithScores).take(maxResults) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.docx new file mode 100644 index 000000000..33ed08de5 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.scala deleted file mode 100644 index ce7ee2409..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/TweetStatusCountsStore.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.frigate.common.store.strato.StratoFetchableStore -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.Client -import com.twitter.strato.thrift.ScroogeConvImplicits._ -import com.twitter.tweetypie.thriftscala.{GetTweetOptions, StatusCounts, Tweet} - -object TweetStatusCountsStore { - - def tweetStatusCountsStore( - stratoClient: Client, - column: String - ): ReadableStore[TweetId, StatusCounts] = { - StratoFetchableStore - .withView[TweetId, GetTweetOptions, Tweet](stratoClient, column, getTweetOptions) - .mapValues(_.counts.getOrElse(emptyStatusCount)) - } - - private val emptyStatusCount = StatusCounts() - - private val getTweetOptions = - GetTweetOptions( - includeRetweetCount = true, - includeReplyCount = true, - includeFavoriteCount = true, - includeQuoteCount = true) -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.docx new file mode 100644 index 000000000..2e56c7774 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.scala deleted file mode 100644 index e318c9185..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserInterestedInReadableStore.scala +++ /dev/null @@ -1,263 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.UserId -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.ManhattanCluster -import com.twitter.storehaus_internal.manhattan.Athena -import com.twitter.storehaus_internal.manhattan.ManhattanRO -import com.twitter.storehaus_internal.manhattan.ManhattanROConfig -import com.twitter.storehaus_internal.manhattan.Nash -import com.twitter.storehaus_internal.util.ApplicationID -import com.twitter.storehaus_internal.util.DatasetName -import com.twitter.storehaus_internal.util.HDFSPath - -object UserInterestedInReadableStore { - - // Clusters whose size is greater than this will not be considered. This is how the using UTEG - // experiment was run (because it could not process such clusters), and we don't have such a - // restriction for the Summingbird/Memcache implementation, but noticing that we aren't scoring - // tweets correctly in the big clusters. The fix for this seems a little involved, so for now - // let's just exclude such clusters. - val MaxClusterSizeForUserInterestedInDataset: Int = 5e6.toInt - - val modelVersionToDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145KDec11 -> "simclusters_v2_interested_in", - ModelVersions.Model20M145KUpdated -> "simclusters_v2_interested_in_20m_145k_updated", - ModelVersions.Model20M145K2020 -> "simclusters_v2_interested_in_20m_145k_2020" - ) - - // Producer embedding based User InterestedIn. - val modelVersionToDenserDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145KUpdated -> "simclusters_v2_interested_in_from_producer_embeddings_model20m145kupdated" - ) - - val modelVersionToIIAPEDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145K2020 -> "simclusters_v2_interested_in_from_ape_20m145k2020" - ) - - val modelVersionToIIKFLiteDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145K2020 -> "simclusters_v2_interested_in_lite_20m_145k_2020" - ) - - val modelVersionToNextInterestedInDatasetMap: Map[String, String] = Map( - ModelVersions.Model20M145K2020 -> "bet_consumer_embedding_v2" - ) - - val defaultModelVersion: String = ModelVersions.Model20M145KUpdated - val knownModelVersions: String = modelVersionToDatasetMap.keys.mkString(",") - - def defaultStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - modelVersion: String = defaultModelVersion - ): ReadableStore[UserId, ClustersUserIsInterestedIn] = { - if (!modelVersionToDatasetMap.contains(modelVersion)) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - this.getStore("simclusters_v2", mhMtlsParams, modelVersionToDatasetMap(modelVersion)) - } - - def defaultSimClustersEmbeddingStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - defaultStoreWithMtls(mhMtlsParams, ModelVersions.toKnownForModelVersion(modelVersion)) - .composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) - if theEmbeddingType == embeddingType && theModelVersion == modelVersion => - userId - }.mapValues( - toSimClustersEmbedding(_, embeddingType, Some(MaxClusterSizeForUserInterestedInDataset))) - } - - def defaultIIKFLiteStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - modelVersion: String = defaultModelVersion - ): ReadableStore[Long, ClustersUserIsInterestedIn] = { - if (!modelVersionToIIKFLiteDatasetMap.contains(modelVersion)) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - getStore("simclusters_v2", mhMtlsParams, modelVersionToIIKFLiteDatasetMap(modelVersion)) - } - - def defaultIIPEStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - modelVersion: String = defaultModelVersion - ): ReadableStore[Long, ClustersUserIsInterestedIn] = { - if (!modelVersionToDatasetMap.contains(modelVersion)) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - getStore("simclusters_v2", mhMtlsParams, modelVersionToDenserDatasetMap(modelVersion)) - } - - def defaultIIAPEStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - modelVersion: String = defaultModelVersion - ): ReadableStore[Long, ClustersUserIsInterestedIn] = { - if (!modelVersionToDatasetMap.contains(modelVersion)) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - getStore("simclusters_v2", mhMtlsParams, modelVersionToIIAPEDatasetMap(modelVersion)) - } - - def defaultIIPESimClustersEmbeddingStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - defaultIIPEStoreWithMtls(mhMtlsParams, ModelVersions.toKnownForModelVersion(modelVersion)) - .composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) - if theEmbeddingType == embeddingType && theModelVersion == modelVersion => - userId - - }.mapValues(toSimClustersEmbedding(_, embeddingType)) - } - - def defaultIIAPESimClustersEmbeddingStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - defaultIIAPEStoreWithMtls(mhMtlsParams, ModelVersions.toKnownForModelVersion(modelVersion)) - .composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) - if theEmbeddingType == embeddingType && theModelVersion == modelVersion => - userId - }.mapValues(toSimClustersEmbedding(_, embeddingType)) - } - - def defaultNextInterestedInStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - if (!modelVersionToNextInterestedInDatasetMap.contains( - ModelVersions.toKnownForModelVersion(modelVersion))) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - val datasetName = modelVersionToNextInterestedInDatasetMap( - ModelVersions.toKnownForModelVersion(modelVersion)) - new SimClustersManhattanReadableStoreForReadWriteDataset( - appId = "kafka_beam_sink_bet_consumer_embedding_prod", - datasetName = datasetName, - label = datasetName, - mtlsParams = mhMtlsParams, - manhattanCluster = Nash - ).mapValues(toSimClustersEmbedding(_, embeddingType)) - } - - def getWithMtls( - appId: String, - mtlsParams: ManhattanKVClientMtlsParams, - modelVersion: String = defaultModelVersion - ): ReadableStore[Long, ClustersUserIsInterestedIn] = { - if (!modelVersionToDatasetMap.contains(modelVersion)) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - this.getStore(appId, mtlsParams, modelVersionToDatasetMap(modelVersion)) - } - - /** - * @param appId Manhattan AppId - * @param mtlsParams MltsParams for s2s Authentication - * - * @return ReadableStore of user to cluster interestedIn data set - */ - def getStore( - appId: String, - mtlsParams: ManhattanKVClientMtlsParams, - datasetName: String, - manhattanCluster: ManhattanCluster = Athena - ): ReadableStore[Long, ClustersUserIsInterestedIn] = { - - implicit val keyInjection: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val userInterestsCodec: Injection[ClustersUserIsInterestedIn, Array[Byte]] = - CompactScalaCodec(ClustersUserIsInterestedIn) - - ManhattanRO.getReadableStoreWithMtls[Long, ClustersUserIsInterestedIn]( - ManhattanROConfig( - HDFSPath(""), // not needed - ApplicationID(appId), - DatasetName(datasetName), - manhattanCluster - ), - mtlsParams - ) - } - - /** - * - * @param record ClustersUserIsInterestedIn thrift struct from the MH data set - * @param embeddingType Embedding Type as defined in com.twitter.simclusters_v2.thriftscala.EmbeddingType - * @param maxClusterSizeOpt Option param to set max cluster size. - * We will not filter out clusters based on cluster size if it is None - * @return - */ - def toSimClustersEmbedding( - record: ClustersUserIsInterestedIn, - embeddingType: EmbeddingType, - maxClusterSizeOpt: Option[Int] = None - ): SimClustersEmbedding = { - val embedding = record.clusterIdToScores - .collect { - case (clusterId, clusterScores) if maxClusterSizeOpt.forall { maxClusterSize => - clusterScores.numUsersInterestedInThisClusterUpperBound.exists(_ < maxClusterSize) - } => - val score = embeddingType match { - case EmbeddingType.FavBasedUserInterestedIn => - clusterScores.favScore - case EmbeddingType.FollowBasedUserInterestedIn => - clusterScores.followScore - case EmbeddingType.LogFavBasedUserInterestedIn => - clusterScores.logFavScore - case EmbeddingType.FavBasedUserInterestedInFromPE => - clusterScores.favScore - case EmbeddingType.FollowBasedUserInterestedInFromPE => - clusterScores.followScore - case EmbeddingType.LogFavBasedUserInterestedInFromPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedInFromAPE => - clusterScores.logFavScore - case EmbeddingType.FollowBasedUserInterestedInFromAPE => - clusterScores.followScore - case EmbeddingType.UserNextInterestedIn => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedAverageAddressBookFromIIAPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE => - clusterScores.logFavScore - case EmbeddingType.LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE => - clusterScores.logFavScore - - case _ => - throw new IllegalArgumentException(s"unknown EmbeddingType: $embeddingType") - } - score.map(clusterId -> _) - }.flatten.toMap - - SimClustersEmbedding(embedding) - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.docx b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.docx new file mode 100644 index 000000000..832fa48d2 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.scala b/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.scala deleted file mode 100644 index 8655e605a..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/stores/UserKnownForReadableStore.scala +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.stores - -import com.twitter.bijection.Injection -import com.twitter.bijection.scrooge.CompactScalaCodec -import com.twitter.simclusters_v2.thriftscala.{ClustersUserIsKnownFor, ModelVersion} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.{Athena, ManhattanRO, ManhattanROConfig} -import com.twitter.storehaus_internal.util.{ApplicationID, DatasetName, HDFSPath} -import com.twitter.util.Future - -object UserKnownForReadableStore { - - private val dataSetNameDec11 = "simclusters_v2_known_for_20m_145k_dec11" - private val dataSetNameUpdated = "simclusters_v2_known_for_20m_145k_updated" - private val dataSetName2020 = "simclusters_v2_known_for_20m_145k_2020" - - private def buildForModelVersion( - appId: String, - storeName: String, - mhMtlsParams: ManhattanKVClientMtlsParams - ): ReadableStore[Long, ClustersUserIsKnownFor] = { - implicit val keyInjection: Injection[Long, Array[Byte]] = Injection.long2BigEndian - implicit val knownForCodec: Injection[ClustersUserIsKnownFor, Array[Byte]] = - CompactScalaCodec(ClustersUserIsKnownFor) - - ManhattanRO.getReadableStoreWithMtls[Long, ClustersUserIsKnownFor]( - ManhattanROConfig( - HDFSPath(""), // not needed - ApplicationID(appId), - DatasetName(storeName), - Athena - ), - mhMtlsParams - ) - } - - def get(appId: String, mhMtlsParams: ManhattanKVClientMtlsParams): UserKnownForReadableStore = { - val dec11Store = buildForModelVersion(appId, dataSetNameDec11, mhMtlsParams) - val updatedStore = buildForModelVersion(appId, dataSetNameUpdated, mhMtlsParams) - val version2020Store = buildForModelVersion(appId, dataSetName2020, mhMtlsParams) - - UserKnownForReadableStore(dec11Store, updatedStore, version2020Store) - } - - def getDefaultStore(mhMtlsParams: ManhattanKVClientMtlsParams): UserKnownForReadableStore = - get("simclusters_v2", mhMtlsParams) - -} - -case class Query(userId: Long, modelVersion: ModelVersion = ModelVersion.Model20m145kUpdated) - -/** - * Mainly used in debuggers to fetch the top knownFor clusters across different model versions - */ -case class UserKnownForReadableStore( - knownForStoreDec11: ReadableStore[Long, ClustersUserIsKnownFor], - knownForStoreUpdated: ReadableStore[Long, ClustersUserIsKnownFor], - knownForStore2020: ReadableStore[Long, ClustersUserIsKnownFor]) - extends ReadableStore[Query, ClustersUserIsKnownFor] { - - override def get(query: Query): Future[Option[ClustersUserIsKnownFor]] = { - query.modelVersion match { - case ModelVersion.Model20m145kDec11 => - knownForStoreDec11.get(query.userId) - case ModelVersion.Model20m145kUpdated => - knownForStoreUpdated.get(query.userId) - case ModelVersion.Model20m145k2020 => - knownForStore2020.get(query.userId) - case c => - throw new IllegalArgumentException( - s"Never heard of $c before! Is this a new model version?") - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD b/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD deleted file mode 100644 index 62f92f3e7..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD +++ /dev/null @@ -1,27 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/twitter/algebird:core", - "3rdparty/jvm/com/twitter/algebird:util", - "3rdparty/jvm/com/twitter/bijection:core", - "3rdparty/jvm/com/twitter/bijection:util", - "3rdparty/jvm/com/twitter/storehaus:core", - "3rdparty/jvm/com/twitter/storehaus:memcache", - "3rdparty/src/jvm/com/twitter/storehaus:memcache", - "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", - "src/scala/com/twitter/simclusters_v2/summingbird:common", - "src/scala/com/twitter/simclusters_v2/summingbird:stores", - "src/scala/com/twitter/storehaus_internal/memcache/config", - "src/scala/com/twitter/storehaus_internal/online", - "src/scala/com/twitter/summingbird_internal/runner/common", - "src/scala/com/twitter/summingbird_internal/runner/store_config", - "src/scala/com/twitter/summingbird_internal/runner/storm", - "src/scala/com/twitter/summingbird_internal/sources/common", - "src/scala/com/twitter/summingbird_internal/sources/common/remote:TweetEventSource", - "src/scala/com/twitter/summingbird_internal/sources/storm/remote:TweetEventSource", - "src/scala/com/twitter/tormenta_internal/spout/eventbus", - "src/scala/com/twitter/wtf/summingbird/sources/common", - "src/scala/com/twitter/wtf/summingbird/sources/storm", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD.docx new file mode 100644 index 000000000..bd1bcc99d Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.docx new file mode 100644 index 000000000..d14e1fcdf Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.scala b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.scala deleted file mode 100644 index 1e0703647..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.scala +++ /dev/null @@ -1,151 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.storm - -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingLongestL2NormMonoid -import com.twitter.simclusters_v2.summingbird.common.StatsUtil -import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore.{ - LatestEmbeddingVersion, - LongestL2EmbeddingVersion, - PersistentTweetEmbeddingId -} -import com.twitter.simclusters_v2.thriftscala.{ - PersistentSimClustersEmbedding, - SimClustersEmbedding, - SimClustersEmbeddingMetadata -} -import com.twitter.summingbird.option.JobId -import com.twitter.summingbird.{Platform, Producer, TailProducer} -import com.twitter.timelineservice.thriftscala.Event -import com.twitter.tweetypie.thriftscala.StatusCounts - -/** - * The job to save the qualified tweet SimClustersEmbedding into Strato Store(Back by Manhattan). - * - * The steps - * 1. Read from Favorite Stream. - * 2. Join with Tweet Status Count Service. - * 3. Filter out the tweets whose favorite count < 8. - * We consider these tweets' SimClusters embedding is too noisy and untrustable. - * 4. Update the SimClusters Tweet embedding with timestamp 0L. - * 0L is reserved for the latest tweet embedding. It's also used to maintain the tweet count. - * 5. If the SimClusters Tweet embedding's update count is 2 power N & N >= 3. - * Persistent the embeddings with the timestamp as part of the LK. - **/ -private[storm] object PersistentTweetJob { - import StatsUtil._ - - private val MinFavoriteCount = 8 - type Timestamp = Long - - val longestL2NormMonoid = new PersistentSimClustersEmbeddingLongestL2NormMonoid() - - def generate[P <: Platform[P]]( - timelineEventSource: Producer[P, Event], - tweetStatusCountService: P#Service[TweetId, StatusCounts], - tweetEmbeddingService: P#Service[TweetId, SimClustersEmbedding], - persistentTweetEmbeddingStoreWithLatestAggregation: P#Store[ - PersistentTweetEmbeddingId, - PersistentSimClustersEmbedding - ], - persistentTweetEmbeddingStoreWithLongestL2NormAggregation: P#Store[ - PersistentTweetEmbeddingId, - PersistentSimClustersEmbedding - ] - )( - implicit jobId: JobId - ): TailProducer[P, Any] = { - - val timelineEvents: Producer[P, (TweetId, Timestamp)] = timelineEventSource - .collect { - case Event.Favorite(favoriteEvent) => - (favoriteEvent.tweetId, favoriteEvent.eventTimeMs) - } - - val filteredEvents = timelineEvents - .leftJoin[StatusCounts](tweetStatusCountService) - .filter { - case (_, (_, Some(statusCounts))) => - // Only consider tweets which has more than 8 favorite - statusCounts.favoriteCount.exists(_ >= MinFavoriteCount) - case _ => - false - } - .leftJoin[SimClustersEmbedding](tweetEmbeddingService) - - val latestAndPersistentEmbeddingProducer = filteredEvents - .collect { - case (tweetId, ((eventTimeMs, _), Some(tweetEmbedding))) => - ( - // This special timestamp is a reserved space for the latest tweet embedding. - PersistentTweetEmbeddingId(tweetId, timestampInMs = LatestEmbeddingVersion), - PersistentSimClustersEmbedding( - tweetEmbedding, - SimClustersEmbeddingMetadata(updatedAtMs = Some(eventTimeMs), updatedCount = Some(1)) - )) - } - .observe("num_of_embedding_updates") - .sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)( - Implicits.persistentSimClustersEmbeddingMonoid) - .name("latest_embedding_producer") - .flatMap { - case (persistentTweetEmbeddingId, (maybeEmbedding, deltaEmbedding)) => - lastQualifiedUpdatedCount( - maybeEmbedding.flatMap(_.metadata.updatedCount), - deltaEmbedding.metadata.updatedCount - ).map { newUpdateCount => - ( - persistentTweetEmbeddingId.copy(timestampInMs = - deltaEmbedding.metadata.updatedAtMs.getOrElse(0L)), - deltaEmbedding.copy(metadata = - deltaEmbedding.metadata.copy(updatedCount = Some(newUpdateCount))) - ) - } - } - .observe("num_of_extra_embedding") - .sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)( - Implicits.persistentSimClustersEmbeddingMonoid) - .name("persistent_embeddings_producer") - - val longestL2NormEmbeddingProducer = filteredEvents - .collect { - case (tweetId, ((eventTimeMs, Some(statusCounts)), Some(tweetEmbedding))) => - ( - // This special timestamp is a reserved space for the latest tweet embedding. - PersistentTweetEmbeddingId(tweetId, timestampInMs = LongestL2EmbeddingVersion), - PersistentSimClustersEmbedding( - tweetEmbedding, - SimClustersEmbeddingMetadata( - updatedAtMs = Some(eventTimeMs), - // We're not aggregating the existing embedding, we're replacing it. The count - // therefore needs to be the absolute fav count for this tweet, not the delta. - updatedCount = statusCounts.favoriteCount.map(_ + 1) - ) - )) - } - .observe("num_longest_l2_norm_updates") - .sumByKey(persistentTweetEmbeddingStoreWithLongestL2NormAggregation)(longestL2NormMonoid) - .name("longest_l2_norm_embedding_producer") - - latestAndPersistentEmbeddingProducer.also(longestL2NormEmbeddingProducer) - } - - /* - If this change in counts crosses one or more powers of 2 (8,16,32...), return the last boundary - that was crossed. In the case where a count delta is large, it may skip a power of 2, and - thus we may not store embeddings for all 2^(i+3) where 0 <= i <= tweetFavCount. - */ - private def lastQualifiedUpdatedCount( - existingUpdateCount: Option[Long], - deltaUpdateCount: Option[Long] - ): Option[Int] = { - val existing = existingUpdateCount.getOrElse(0L) - val sum = existing + deltaUpdateCount.getOrElse(0L) - qualifiedSet.filter { i => (existing < i) && (i <= sum) }.lastOption - } - - // Only 2 Power n while n >= 3 is qualified for Persistent. The max = 16,777,216 - private lazy val qualifiedSet = 3 - .until(25).map { i => Math.pow(2, i).toInt }.toSet - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.docx new file mode 100644 index 000000000..71eadd616 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.scala b/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.scala deleted file mode 100644 index b7960d846..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJobRunner.scala +++ /dev/null @@ -1,227 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.storm - -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.NullStatsReceiver -import com.twitter.hermit.store.common.ObservedCachedReadableStore -import com.twitter.scalding.Args -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingLongestL2NormMonoid -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.AltSetting -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.Environment -import com.twitter.simclusters_v2.summingbird.common.ClientConfigs -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile -import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore.PersistentTweetEmbeddingId -import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore -import com.twitter.simclusters_v2.summingbird.stores.TopKClustersForTweetKeyReadableStore -import com.twitter.simclusters_v2.summingbird.stores.TweetKey -import com.twitter.simclusters_v2.summingbird.stores.TweetStatusCountsStore -import com.twitter.simclusters_v2.thriftscala.PersistentSimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storehaus.FutureCollector -import com.twitter.summingbird.online.option._ -import com.twitter.summingbird.option._ -import com.twitter.summingbird.storm.Storm -import com.twitter.summingbird.Options -import com.twitter.summingbird.TailProducer -import com.twitter.summingbird_internal.runner.common.JobName -import com.twitter.summingbird_internal.runner.common.SBRunConfig -import com.twitter.summingbird_internal.runner.storm.GenericRunner -import com.twitter.summingbird_internal.runner.storm.StormConfig -import com.twitter.tormenta_internal.spout.eventbus.SubscriberId -import com.twitter.tweetypie.thriftscala.StatusCounts -import com.twitter.wtf.summingbird.sources.storm.TimelineEventSource -import java.lang -import java.util.{HashMap => JMap} -import org.apache.heron.api.{Config => HeronConfig} -import org.apache.storm.{Config => BTConfig} - -object PersistentTweetJobRunner { - def main(args: Array[String]): Unit = { - GenericRunner(args, PersistentTweetStormJob(_)) - } -} - -object PersistentTweetStormJob { - - import com.twitter.simclusters_v2.summingbird.common.Implicits._ - - def jLong(num: Long): lang.Long = java.lang.Long.valueOf(num) - def jInt(num: Int): Integer = java.lang.Integer.valueOf(num) - def jFloat(num: Float): lang.Float = java.lang.Float.valueOf(num) - - def apply(args: Args): StormConfig = { - - lazy val env: String = args.getOrElse("env", "prod") - lazy val zone: String = args.getOrElse("dc", "atla") - lazy val alt: String = args.getOrElse("alt", default = "normal") - - lazy val profile = - SimClustersProfile.fetchPersistentJobProfile(Environment(env), AltSetting(alt)) - - lazy val stratoClient = ClientConfigs.stratoClient(profile.serviceIdentifier(zone)) - - lazy val favoriteEventSource = TimelineEventSource( - // Note: do not share the same subsriberId with other jobs. Apply a new one if needed - SubscriberId(profile.timelineEventSourceSubscriberId) - ).kafkaSource - - lazy val persistentTweetEmbeddingStore = - PersistentTweetEmbeddingStore - .persistentTweetEmbeddingStore(stratoClient, profile.persistentTweetStratoPath) - - lazy val persistentTweetEmbeddingStoreWithLatestAggregation: Storm#Store[ - PersistentTweetEmbeddingId, - PersistentSimClustersEmbedding - ] = { - import com.twitter.storehaus.algebra.StoreAlgebra._ - - lazy val mergeableStore = - persistentTweetEmbeddingStore.toMergeable( - mon = Implicits.persistentSimClustersEmbeddingMonoid, - fc = implicitly[FutureCollector]) - - Storm.onlineOnlyStore(mergeableStore) - } - - lazy val persistentTweetEmbeddingStoreWithLongestL2NormAggregation: Storm#Store[ - PersistentTweetEmbeddingId, - PersistentSimClustersEmbedding - ] = { - import com.twitter.storehaus.algebra.StoreAlgebra._ - - val longestL2NormMonoid = new PersistentSimClustersEmbeddingLongestL2NormMonoid() - lazy val mergeableStore = - persistentTweetEmbeddingStore.toMergeable( - mon = longestL2NormMonoid, - fc = implicitly[FutureCollector]) - - Storm.onlineOnlyStore(mergeableStore) - } - - lazy val tweetStatusCountsService: Storm#Service[TweetId, StatusCounts] = - Storm.service( - ObservedCachedReadableStore.from[TweetId, StatusCounts]( - TweetStatusCountsStore.tweetStatusCountsStore(stratoClient, "tweetypie/core.Tweet"), - ttl = 1.minute, - maxKeys = 10000, // 10K is enough for Heron Job. - cacheName = "tweet_status_count", - windowSize = 10000L - )(NullStatsReceiver) - ) - - lazy val tweetEmbeddingService: Storm#Service[TweetId, ThriftSimClustersEmbedding] = - Storm.service( - TopKClustersForTweetKeyReadableStore - .overrideLimitDefaultStore(50, profile.serviceIdentifier(zone)) - .composeKeyMapping { tweetId: TweetId => - TweetKey(tweetId, profile.modelVersionStr, profile.coreEmbeddingType) - }.mapValues { value => SimClustersEmbedding(value).toThrift }) - - new StormConfig { - - val jobName: JobName = JobName(profile.jobName) - - implicit val jobID: JobId = JobId(jobName.toString) - - /** - * Add registrars for chill serialization for user-defined types. - */ - override def registrars = - List( - SBRunConfig.register[StatusCounts], - SBRunConfig.register[ThriftSimClustersEmbedding], - SBRunConfig.register[PersistentSimClustersEmbedding] - ) - - /***** Job configuration settings *****/ - /** - * Use vmSettings to configure the VM - */ - override def vmSettings: Seq[String] = Seq() - - private val SourcePerWorker = 1 - private val FlatMapPerWorker = 1 - private val SummerPerWorker = 1 - - private val TotalWorker = 60 - - /** - * Use transformConfig to set Heron options. - */ - override def transformConfig(config: Map[String, AnyRef]): Map[String, AnyRef] = { - - val heronJvmOptions = new JMap[String, AnyRef]() - - val MetaspaceSize = jLong(256L * 1024 * 1024) - val DefaultHeapSize = jLong(2L * 1024 * 1024 * 1024) - val HighHeapSize = jLong(4L * 1024 * 1024 * 1024) - - val TotalCPU = jLong( - SourcePerWorker * 1 + FlatMapPerWorker * 4 + SummerPerWorker * 3 + 1 - ) - - // reserve 4GB for the StreamMgr - val TotalRam = jLong( - DefaultHeapSize * (SourcePerWorker * 1 + FlatMapPerWorker * 4) - + HighHeapSize * SummerPerWorker * 3 - + MetaspaceSize * 8 // Applies to all workers - + 4L * 1024 * 1024 * 1024) - - // These settings help prevent GC issues in the most memory intensive steps of the job by - // dedicating more memory to the new gen heap designated by the -Xmn flag. - Map( - "Tail" -> HighHeapSize - ).foreach { - case (stage, heap) => - HeronConfig.setComponentJvmOptions( - heronJvmOptions, - stage, - s"-Xmx$heap -Xms$heap -Xmn${heap / 2}" - ) - } - - super.transformConfig(config) ++ List( - BTConfig.TOPOLOGY_TEAM_NAME -> "cassowary", - BTConfig.TOPOLOGY_TEAM_EMAIL -> "no-reply@twitter.com", - BTConfig.TOPOLOGY_WORKERS -> jInt(TotalWorker), - BTConfig.TOPOLOGY_ACKER_EXECUTORS -> jInt(0), - BTConfig.TOPOLOGY_MESSAGE_TIMEOUT_SECS -> jInt(30), - BTConfig.TOPOLOGY_WORKER_CHILDOPTS -> List( - "-Djava.security.auth.login.config=config/jaas.conf", - "-Dsun.security.krb5.debug=true", - "-Dcom.twitter.eventbus.client.EnableKafkaSaslTls=true", - "-Dcom.twitter.eventbus.client.zoneName=" + zone, - s"-XX:MaxMetaspaceSize=$MetaspaceSize" - ).mkString(" "), - HeronConfig.TOPOLOGY_CONTAINER_CPU_REQUESTED -> TotalCPU, - HeronConfig.TOPOLOGY_CONTAINER_RAM_REQUESTED -> TotalRam, - "storm.job.uniqueId" -> jobID.get - ) - } - - /** - * Use getNamedOptions to set Summingbird runtime options - * The list of available options: com.twitter.summingbird.online.option - */ - override def getNamedOptions: Map[String, Options] = Map( - "DEFAULT" -> Options() - .set(SummerParallelism(TotalWorker * SummerPerWorker)) - .set(FlatMapParallelism(TotalWorker * FlatMapPerWorker)) - .set(SourceParallelism(TotalWorker * SourcePerWorker)) - .set(CacheSize(10000)) - .set(FlushFrequency(30.seconds)) - ) - - /** Required job generation call for your job, defined in Job.scala */ - override def graph: TailProducer[Storm, Any] = PersistentTweetJob.generate[Storm]( - favoriteEventSource, - tweetStatusCountsService, - tweetEmbeddingService, - persistentTweetEmbeddingStoreWithLatestAggregation, - persistentTweetEmbeddingStoreWithLongestL2NormAggregation - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.docx new file mode 100644 index 000000000..5d59d8998 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.scala b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.scala deleted file mode 100644 index 54ac8011a..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJob.scala +++ /dev/null @@ -1,232 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.storm - -import com.twitter.simclusters_v2.common.ModelVersions._ -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.SimClustersTweetProfile -import com.twitter.simclusters_v2.summingbird.common.Configs -import com.twitter.simclusters_v2.summingbird.common.Implicits -import com.twitter.simclusters_v2.summingbird.common.SimClustersHashUtil -import com.twitter.simclusters_v2.summingbird.common.SimClustersInterestedInUtil -import com.twitter.simclusters_v2.summingbird.common.StatsUtil -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.summingbird._ -import com.twitter.summingbird.option.JobId -import com.twitter.timelineservice.thriftscala.Event -import com.twitter.conversions.DurationOps._ -import com.twitter.timelineservice.thriftscala.EventAliases.FavoriteAlias - -object TweetJob { - - import Implicits._ - import StatsUtil._ - - object NodeName { - final val TweetClusterScoreFlatMapNodeName: String = "TweetClusterScoreFlatMap" - final val TweetClusterUpdatedScoresFlatMapNodeName: String = "TweetClusterUpdatedScoreFlatMap" - final val TweetClusterScoreSummerNodeName: String = "TweetClusterScoreSummer" - final val TweetTopKNodeName: String = "TweetTopKSummer" - final val ClusterTopKTweetsNodeName: String = "ClusterTopKTweetsSummer" - final val ClusterTopKTweetsLightNodeName: String = "ClusterTopKTweetsLightSummer" - } - - def generate[P <: Platform[P]]( - profile: SimClustersTweetProfile, - timelineEventSource: Producer[P, Event], - userInterestedInService: P#Service[Long, ClustersUserIsInterestedIn], - tweetClusterScoreStore: P#Store[(SimClusterEntity, FullClusterIdBucket), ClustersWithScores], - tweetTopKClustersStore: P#Store[EntityWithVersion, TopKClustersWithScores], - clusterTopKTweetsStore: P#Store[FullClusterId, TopKTweetsWithScores], - clusterTopKTweetsLightStore: Option[P#Store[FullClusterId, TopKTweetsWithScores]] - )( - implicit jobId: JobId - ): TailProducer[P, Any] = { - - val userInterestNonEmptyCount = Counter(Group(jobId.get), Name("num_user_interests_non_empty")) - val userInterestEmptyCount = Counter(Group(jobId.get), Name("num_user_interests_empty")) - - val numClustersCount = Counter(Group(jobId.get), Name("num_clusters")) - - val entityClusterPairCount = Counter(Group(jobId.get), Name("num_entity_cluster_pairs_emitted")) - - // Fav QPS is around 6K - val qualifiedFavEvents = timelineEventSource - .collect { - case Event.Favorite(favEvent) - if favEvent.userId != favEvent.tweetUserId && !isTweetTooOld(favEvent) => - (favEvent.userId, favEvent) - } - .observe("num_qualified_favorite_events") - - val entityWithSimClustersProducer = qualifiedFavEvents - .leftJoin(userInterestedInService) - .map { - case (_, (favEvent, userInterestOpt)) => - (favEvent.tweetId, (favEvent, userInterestOpt)) - } - .flatMap { - case (_, (favEvent, Some(userInterests))) => - userInterestNonEmptyCount.incr() - - val timestamp = favEvent.eventTimeMs - - val clustersWithScores = SimClustersInterestedInUtil.topClustersWithScores(userInterests) - - // clusters.size is around 25 in average - numClustersCount.incrBy(clustersWithScores.size) - - val simClusterScoresByHashBucket = clustersWithScores.groupBy { - case (clusterId, _) => SimClustersHashUtil.clusterIdToBucket(clusterId) - } - - for { - (hashBucket, scores) <- simClusterScoresByHashBucket - } yield { - entityClusterPairCount.incr() - - val clusterBucket = FullClusterIdBucket(userInterests.knownForModelVersion, hashBucket) - - val tweetId: SimClusterEntity = SimClusterEntity.TweetId(favEvent.tweetId) - - (tweetId, clusterBucket) -> SimClustersInterestedInUtil - .buildClusterWithScores( - scores, - timestamp, - profile.favScoreThresholdForUserInterest - ) - } - case _ => - userInterestEmptyCount.incr() - None - } - .observe("entity_cluster_delta_scores") - .name(NodeName.TweetClusterScoreFlatMapNodeName) - .sumByKey(tweetClusterScoreStore)(clustersWithScoreMonoid) - .name(NodeName.TweetClusterScoreSummerNodeName) - .map { - case ((simClusterEntity, clusterBucket), (oldValueOpt, deltaValue)) => - val updatedClusterIds = deltaValue.clustersToScore.map(_.keySet).getOrElse(Set.empty[Int]) - - (simClusterEntity, clusterBucket) -> clustersWithScoreMonoid.plus( - oldValueOpt - .map { oldValue => - oldValue.copy( - clustersToScore = - oldValue.clustersToScore.map(_.filterKeys(updatedClusterIds.contains)) - ) - }.getOrElse(clustersWithScoreMonoid.zero), - deltaValue - ) - } - .observe("entity_cluster_updated_scores") - .name(NodeName.TweetClusterUpdatedScoresFlatMapNodeName) - - val tweetTopK = entityWithSimClustersProducer - .flatMap { - case ((simClusterEntity, FullClusterIdBucket(modelVersion, _)), clusterWithScores) - if simClusterEntity.isInstanceOf[SimClusterEntity.TweetId] => - clusterWithScores.clustersToScore - .map { clustersToScores => - val topClustersWithFavScores = clustersToScores.mapValues { scores: Scores => - Scores( - favClusterNormalized8HrHalfLifeScore = - scores.favClusterNormalized8HrHalfLifeScore.filter( - _.value >= Configs.scoreThresholdForTweetTopKClustersCache - ) - ) - } - - ( - EntityWithVersion(simClusterEntity, modelVersion), - TopKClustersWithScores(Some(topClustersWithFavScores), None) - ) - } - case _ => - None - - } - .observe("tweet_topk_updates") - .sumByKey(tweetTopKClustersStore)(topKClustersWithScoresMonoid) - .name(NodeName.TweetTopKNodeName) - - val clusterTopKTweets = entityWithSimClustersProducer - .flatMap { - case ((simClusterEntity, FullClusterIdBucket(modelVersion, _)), clusterWithScores) => - simClusterEntity match { - case SimClusterEntity.TweetId(tweetId) => - clusterWithScores.clustersToScore - .map { clustersToScores => - clustersToScores.toSeq.map { - case (clusterId, scores) => - val topTweetsByFavScore = Map( - tweetId -> Scores(favClusterNormalized8HrHalfLifeScore = - scores.favClusterNormalized8HrHalfLifeScore.filter(_.value >= - Configs.scoreThresholdForClusterTopKTweetsCache))) - - ( - FullClusterId(modelVersion, clusterId), - TopKTweetsWithScores(Some(topTweetsByFavScore), None) - ) - } - }.getOrElse(Nil) - case _ => - Nil - } - } - .observe("cluster_topk_tweets_updates") - .sumByKey(clusterTopKTweetsStore)(topKTweetsWithScoresMonoid) - .name(NodeName.ClusterTopKTweetsNodeName) - - val clusterTopKTweetsLight = clusterTopKTweetsLightStore.map { lightStore => - entityWithSimClustersProducer - .flatMap { - case ((simClusterEntity, FullClusterIdBucket(modelVersion, _)), clusterWithScores) => - simClusterEntity match { - case SimClusterEntity.TweetId(tweetId) if isTweetTooOldForLight(tweetId) => - clusterWithScores.clustersToScore - .map { clustersToScores => - clustersToScores.toSeq.map { - case (clusterId, scores) => - val topTweetsByFavScore = Map( - tweetId -> Scores(favClusterNormalized8HrHalfLifeScore = - scores.favClusterNormalized8HrHalfLifeScore.filter(_.value >= - Configs.scoreThresholdForClusterTopKTweetsCache))) - - ( - FullClusterId(modelVersion, clusterId), - TopKTweetsWithScores(Some(topTweetsByFavScore), None) - ) - } - }.getOrElse(Nil) - case _ => - Nil - } - } - .observe("cluster_topk_tweets_updates") - .sumByKey(lightStore)(topKTweetsWithScoresLightMonoid) - .name(NodeName.ClusterTopKTweetsLightNodeName) - } - - clusterTopKTweetsLight match { - case Some(lightNode) => - tweetTopK.also(clusterTopKTweets).also(lightNode) - case None => - tweetTopK.also(clusterTopKTweets) - } - } - - // Boolean check to see if the tweet is too old - private def isTweetTooOld(favEvent: FavoriteAlias): Boolean = { - favEvent.tweet.forall { tweet => - SnowflakeId.unixTimeMillisOptFromId(tweet.id).exists { millis => - System.currentTimeMillis() - millis >= Configs.OldestTweetFavEventTimeInMillis - } - } - } - - private def isTweetTooOldForLight(tweetId: Long): Boolean = { - SnowflakeId.unixTimeMillisOptFromId(tweetId).exists { millis => - System.currentTimeMillis() - millis >= Configs.OldestTweetInLightIndexInMillis - } - } - -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.docx new file mode 100644 index 000000000..a38303ca9 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.scala b/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.scala deleted file mode 100644 index 11a94a47b..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/TweetJobRunner.scala +++ /dev/null @@ -1,242 +0,0 @@ -package com.twitter.simclusters_v2.summingbird.storm - -import com.twitter.conversions.DurationOps._ -import com.twitter.heron.util.CommonMetric -import com.twitter.scalding.Args -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.AltSetting -import com.twitter.simclusters_v2.summingbird.common.SimClustersProfile.Environment -import com.twitter.simclusters_v2.summingbird.stores.EntityClusterScoreReadableStore -import com.twitter.simclusters_v2.summingbird.stores.TopKClustersForTweetReadableStore -import com.twitter.simclusters_v2.summingbird.stores.TopKTweetsForClusterReadableStore -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore -import com.twitter.simclusters_v2.thriftscala._ -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.summingbird.online.option._ -import com.twitter.summingbird.option._ -import com.twitter.summingbird.storm.option.FlatMapStormMetrics -import com.twitter.summingbird.storm.option.SummerStormMetrics -import com.twitter.summingbird.storm.Storm -import com.twitter.summingbird.storm.StormMetric -import com.twitter.summingbird.Options -import com.twitter.summingbird.TailProducer -import com.twitter.summingbird_internal.runner.common.JobName -import com.twitter.summingbird_internal.runner.common.SBRunConfig -import com.twitter.summingbird_internal.runner.storm.GenericRunner -import com.twitter.summingbird_internal.runner.storm.StormConfig -import com.twitter.tormenta_internal.spout.eventbus.SubscriberId -import com.twitter.wtf.summingbird.sources.storm.TimelineEventSource -import java.lang -import org.apache.heron.api.{Config => HeronConfig} -import org.apache.heron.common.basics.ByteAmount -import org.apache.storm.{Config => BTConfig} -import scala.collection.JavaConverters._ - -object TweetJobRunner { - def main(args: Array[String]): Unit = { - GenericRunner(args, TweetStormJob(_)) - } -} - -object TweetStormJob { - - import com.twitter.simclusters_v2.summingbird.common.Implicits._ - - def jLong(num: Long): lang.Long = java.lang.Long.valueOf(num) - def jInt(num: Int): Integer = java.lang.Integer.valueOf(num) - def apply(args: Args): StormConfig = { - - lazy val env: String = args.getOrElse("env", "prod") - lazy val zone: String = args.getOrElse("dc", "atla") - - // The only SimClusters ENV is Alt. Will clean up soon. - lazy val profile = SimClustersProfile.fetchTweetJobProfile(Environment(env), AltSetting.Alt) - - lazy val favoriteEventSource = TimelineEventSource( - // Note: do not share the same subsriberId with other jobs. Apply a new one if needed - SubscriberId(profile.timelineEventSourceSubscriberId) - ).source - - lazy val commonMetric = - StormMetric(new CommonMetric(), CommonMetric.NAME, CommonMetric.POLL_INTERVAL) - lazy val flatMapMetrics = FlatMapStormMetrics(Iterable(commonMetric)) - lazy val summerMetrics = SummerStormMetrics(Iterable(commonMetric)) - - lazy val entityClusterScoreStore: Storm#Store[ - (SimClusterEntity, FullClusterIdBucket), - ClustersWithScores - ] = { - Storm.store( - EntityClusterScoreReadableStore - .onlineMergeableStore(profile.entityClusterScorePath, profile.serviceIdentifier(zone))) - } - - lazy val tweetTopKStore: Storm#Store[EntityWithVersion, TopKClustersWithScores] = { - Storm.store( - TopKClustersForTweetReadableStore - .onlineMergeableStore(profile.tweetTopKClustersPath, profile.serviceIdentifier(zone))) - } - - lazy val clusterTopKTweetsStore: Storm#Store[FullClusterId, TopKTweetsWithScores] = { - Storm.store( - TopKTweetsForClusterReadableStore - .onlineMergeableStore(profile.clusterTopKTweetsPath, profile.serviceIdentifier(zone))) - } - - lazy val clusterTopKTweetsLightStore: Option[ - Storm#Store[FullClusterId, TopKTweetsWithScores] - ] = { - profile.clusterTopKTweetsLightPath.map { lightPath => - Storm.store( - TopKTweetsForClusterReadableStore - .onlineMergeableStore(lightPath, profile.serviceIdentifier(zone))) - } - } - - lazy val userInterestedInService: Storm#Service[Long, ClustersUserIsInterestedIn] = { - Storm.service( - UserInterestedInReadableStore.defaultStoreWithMtls( - ManhattanKVClientMtlsParams(profile.serviceIdentifier(zone)), - modelVersion = profile.modelVersionStr - )) - } - - new StormConfig { - - val jobName: JobName = JobName(profile.jobName) - - implicit val jobID: JobId = JobId(jobName.toString) - - /** - * Add registrars for chill serialization for user-defined types. - */ - override def registrars = - List( - SBRunConfig.register[SimClusterEntity], - SBRunConfig.register[FullClusterIdBucket], - SBRunConfig.register[ClustersWithScores], - SBRunConfig.register[EntityWithVersion], - SBRunConfig.register[FullClusterId], - SBRunConfig.register[EntityWithVersion], - SBRunConfig.register[TopKEntitiesWithScores], - SBRunConfig.register[TopKClustersWithScores], - SBRunConfig.register[TopKTweetsWithScores] - ) - - /***** Job configuration settings *****/ - /** - * Use vmSettings to configure the VM - */ - override def vmSettings: Seq[String] = Seq() - - private val SourcePerWorker = 1 - private val FlatMapPerWorker = 3 - private val SummerPerWorker = 3 - - private val TotalWorker = 150 - - /** - * Use transformConfig to set Heron options. - */ - override def transformConfig(config: Map[String, AnyRef]): Map[String, AnyRef] = { - val heronConfig = new HeronConfig() - - /** - Component names (subject to change if you add more components, make sure to update this) - Source: Tail-FlatMap-FlatMap-Summer-FlatMap-Source - FlatMap: Tail-FlatMap-FlatMap-Summer-FlatMap, Tail-FlatMap-FlatMap, Tail-FlatMap-FlatMap, - Tail-FlatMap - Summer: Tail-FlatMap-FlatMap-Summer * 2, Tail, Tail.2 - */ - val sourceName = "Tail-FlatMap-FlatMap-Summer-FlatMap-Source" - val flatMapFlatMapSummerFlatMapName = "Tail-FlatMap-FlatMap-Summer-FlatMap" - - // 1 CPU per node, 1 for StreamMgr - // By default, numCpus per component = totalCPUs / total number of components. - // To add more CPUs for a specific component, use heronConfig.setComponentCpu(name, numCPUs) - // add 20% more CPUs to address back pressure issue - val TotalCPU = jLong( - (1.2 * (SourcePerWorker * 1 + FlatMapPerWorker * 4 + SummerPerWorker * 6 + 1)).ceil.toInt) - heronConfig.setContainerCpuRequested(TotalCPU.toDouble) - - // RAM settings - val RamPerSourceGB = 8 - val RamPerSummerFlatMap = 8 - val RamDefaultPerComponent = 4 - - // The extra 4GB is not explicitly assigned to the StreamMgr, so it gets 2GB by default, and - // the remaining 2GB is shared among components. Keeping this configuration for now, since - // it seems stable - val TotalRamRB = - RamPerSourceGB * SourcePerWorker * 1 + - RamDefaultPerComponent * FlatMapPerWorker * 4 + - RamDefaultPerComponent * SummerPerWorker * 6 + - 4 // reserve 4GB for the StreamMgr - - // By default, ramGB per component = totalRAM / total number of components. - // To adjust RAMs for a specific component, use heronConfig.setComponentRam(name, ramGB) - heronConfig.setComponentRam(sourceName, ByteAmount.fromGigabytes(RamPerSourceGB)) - heronConfig.setComponentRam( - flatMapFlatMapSummerFlatMapName, - ByteAmount.fromGigabytes(RamPerSummerFlatMap)) - heronConfig.setContainerRamRequested(ByteAmount.fromGigabytes(TotalRamRB)) - - super.transformConfig(config) ++ List( - BTConfig.TOPOLOGY_TEAM_NAME -> "cassowary", - BTConfig.TOPOLOGY_TEAM_EMAIL -> "no-reply@twitter.com", - BTConfig.TOPOLOGY_WORKERS -> jInt(TotalWorker), - BTConfig.TOPOLOGY_ACKER_EXECUTORS -> jInt(0), - BTConfig.TOPOLOGY_MESSAGE_TIMEOUT_SECS -> jInt(30), - BTConfig.TOPOLOGY_WORKER_CHILDOPTS -> List( - "-XX:MaxMetaspaceSize=256M", - "-Djava.security.auth.login.config=config/jaas.conf", - "-Dsun.security.krb5.debug=true", - "-Dcom.twitter.eventbus.client.EnableKafkaSaslTls=true", - "-Dcom.twitter.eventbus.client.zoneName=" + zone - ).mkString(" "), - "storm.job.uniqueId" -> jobID.get - ) ++ heronConfig.asScala.toMap - } - - /** - * Use getNamedOptions to set Summingbird runtime options - * The list of available options: com.twitter.summingbird.online.option - */ - override def getNamedOptions: Map[String, Options] = Map( - "DEFAULT" -> Options() - .set(FlatMapParallelism(TotalWorker * FlatMapPerWorker)) - .set(SourceParallelism(TotalWorker)) - .set(SummerBatchMultiplier(1000)) - .set(CacheSize(10000)) - .set(flatMapMetrics) - .set(summerMetrics), - TweetJob.NodeName.TweetClusterUpdatedScoresFlatMapNodeName -> Options() - .set(FlatMapParallelism(TotalWorker * FlatMapPerWorker)), - TweetJob.NodeName.TweetClusterScoreSummerNodeName -> Options() - // Most expensive step. Double the capacity. - .set(SummerParallelism(TotalWorker * SummerPerWorker * 4)) - .set(FlushFrequency(30.seconds)), - TweetJob.NodeName.ClusterTopKTweetsNodeName -> Options() - .set(SummerParallelism(TotalWorker * SummerPerWorker)) - .set(FlushFrequency(30.seconds)), - TweetJob.NodeName.ClusterTopKTweetsLightNodeName -> Options() - .set(SummerParallelism(TotalWorker * SummerPerWorker)) - .set(FlushFrequency(30.seconds)), - TweetJob.NodeName.TweetTopKNodeName -> Options() - .set(SummerParallelism(TotalWorker * SummerPerWorker)) - .set(FlushFrequency(30.seconds)) - ) - - /** Required job generation call for your job, defined in Job.scala */ - override def graph: TailProducer[Storm, Any] = TweetJob.generate[Storm]( - profile, - favoriteEventSource, - userInterestedInService, - entityClusterScoreStore, - tweetTopKStore, - clusterTopKTweetsStore, - clusterTopKTweetsLightStore - ) - } - } -} diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.docx new file mode 100644 index 000000000..bbeacd45b Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.sh b/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.sh deleted file mode 100755 index 9340c72bb..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/persistent_tweet_job_deploy.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# script to deploy simclusters persistent storm job to CI - -set -u -e - -cd "$(git rev-parse --show-toplevel)" - -# shellcheck source=/dev/null -. "$(git rev-parse --show-toplevel)/devprod/source-sh-setup" - -function usage { - cat <// where can only be devel or prod -AURORA_PATH=${AURORA_PATH:="$CLUSTER/$USER/$ENV"} -AURORA_JOB_KEY="${AURORA_PATH}/${JOB_NAME}" - -heron kill "$AURORA_PATH" "$JOB_NAME" || true - -echo "Waiting 5 seconds so heron is sure its dead" -sleep 5 - -echo "AURORA_JOB_KEY: $AURORA_JOB_KEY" - -echo "Starting your topology... for ${ENV} ${JOB_NAME}" -#set -v - -heron submit "${AURORA_PATH}" "dist/${JAR_NAME}" com.twitter.simclusters_v2.summingbird.storm.PersistentTweetJobRunner --env "$ENV" --dc "$CLUSTER" diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.docx new file mode 100644 index 000000000..ada984233 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.sh b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.sh deleted file mode 100755 index 67b14d126..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_alt_job_deploy.sh +++ /dev/null @@ -1,78 +0,0 @@ -#!/bin/bash -# script to deploy simcluster storm job to CI - -set -u -e - -cd "$(git rev-parse --show-toplevel)" - -# shellcheck source=/dev/null -. "$(git rev-parse --show-toplevel)/devprod/source-sh-setup" - -function usage { - cat <// where can only be devel or prod -AURORA_PATH=${AURORA_PATH:="$CLUSTER/$USER/$ENV"} -AURORA_JOB_KEY="${AURORA_PATH}/${JOB_NAME}" - -heron kill "$AURORA_PATH" "$JOB_NAME" || true - -echo "Waiting 5 seconds so heron is sure its dead" -sleep 5 - -echo "AURORA_JOB_KEY: $AURORA_JOB_KEY" - -echo "Starting your topology... for ${ENV} ${JOB_NAME}" -#set -v - -heron submit "${AURORA_PATH}" "dist/${JAR_NAME}" com.twitter.simclusters_v2.summingbird.storm.TweetJobRunner --env "$ENV" --dc "$CLUSTER" --alt "alt" --usingLogFavScore - diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.docx b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.docx new file mode 100644 index 000000000..3278a1cb3 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.sh b/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.sh deleted file mode 100755 index b3e4f22d4..000000000 --- a/src/scala/com/twitter/simclusters_v2/summingbird/storm/tweet_job_deploy.sh +++ /dev/null @@ -1,77 +0,0 @@ -#!/bin/bash -# script to deploy simcluster storm job to CI - -set -u -e - -cd "$(git rev-parse --show-toplevel)" - -# shellcheck source=/dev/null -. "$(git rev-parse --show-toplevel)/devprod/source-sh-setup" - -function usage { - cat <// where can only be devel or prod -AURORA_PATH=${AURORA_PATH:="$CLUSTER/$USER/$ENV"} -AURORA_JOB_KEY="${AURORA_PATH}/${JOB_NAME}" - -heron kill "$AURORA_PATH" "$JOB_NAME" || true - -echo "Waiting 5 seconds so heron is sure its dead" -sleep 5 - -echo "AURORA_JOB_KEY: $AURORA_JOB_KEY" - -echo "Starting your topology... for ${ENV} ${JOB_NAME}" -#set -v - -heron submit "${AURORA_PATH}" "dist/${JAR_NAME}" com.twitter.simclusters_v2.summingbird.storm.TweetJobRunner --env "$ENV" --dc "$CLUSTER" diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD b/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD deleted file mode 100644 index 526ee6d23..000000000 --- a/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -scala_library( - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/scala/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/featurestore/catalog/features/recommendations:aggregate", - "src/scala/com/twitter/ml/featurestore/lib/embedding", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/simclusters_v2/common/ml", - ], -) diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD.docx b/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD.docx new file mode 100644 index 000000000..7e187a560 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/tweet_similarity/BUILD.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.docx b/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.docx new file mode 100644 index 000000000..e9cb0d509 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.scala b/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.scala deleted file mode 100644 index f1c3f8cc2..000000000 --- a/src/scala/com/twitter/simclusters_v2/tweet_similarity/ModelBasedTweetSimilaritySimClustersEmbeddingAdapter.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.simclusters_v2.tweet_similarity - -import com.twitter.ml.api.{DataRecord, DataRecordMerger} -import com.twitter.simclusters_v2.common.ml.{ - SimClustersEmbeddingAdapter, - NormalizedSimClustersEmbeddingAdapter -} -import com.twitter.simclusters_v2.common.SimClustersEmbedding - -object ModelBasedTweetSimilaritySimClustersEmbeddingAdapter { - val QueryEmbAdapter = new SimClustersEmbeddingAdapter(TweetSimilarityFeatures.QueryTweetEmbedding) - val CandidateEmbAdapter = new SimClustersEmbeddingAdapter( - TweetSimilarityFeatures.CandidateTweetEmbedding) - - val NormalizedQueryEmbAdapter = new NormalizedSimClustersEmbeddingAdapter( - TweetSimilarityFeatures.QueryTweetEmbedding, - TweetSimilarityFeatures.QueryTweetEmbeddingNorm) - val NormalizedCandidateEmbAdapter = new NormalizedSimClustersEmbeddingAdapter( - TweetSimilarityFeatures.CandidateTweetEmbedding, - TweetSimilarityFeatures.CandidateTweetEmbeddingNorm) - - def adaptEmbeddingPairToDataRecord( - queryEmbedding: SimClustersEmbedding, - candidateEmbedding: SimClustersEmbedding, - normalized: Boolean - ): DataRecord = { - val DataRecordMerger = new DataRecordMerger() - val queryAdapter = if (normalized) NormalizedQueryEmbAdapter else QueryEmbAdapter - val candidateAdapter = if (normalized) NormalizedCandidateEmbAdapter else CandidateEmbAdapter - - val featureDataRecord = queryAdapter.adaptToDataRecord(queryEmbedding) - DataRecordMerger.merge( - featureDataRecord, - candidateAdapter.adaptToDataRecord(candidateEmbedding)) - featureDataRecord - } -} diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.docx b/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.docx new file mode 100644 index 000000000..100c9a688 Binary files /dev/null and b/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.docx differ diff --git a/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.scala b/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.scala deleted file mode 100644 index 0d6b90c95..000000000 --- a/src/scala/com/twitter/simclusters_v2/tweet_similarity/TweetSimilarityFeatures.scala +++ /dev/null @@ -1,54 +0,0 @@ -package com.twitter.simclusters_v2.tweet_similarity - -import com.twitter.ml.api.Feature.{Binary, Continuous, Discrete, SparseContinuous} -import com.twitter.ml.api.util.FDsl._ -import com.twitter.ml.api.{DataRecord, FeatureContext, IRecordOneToOneAdapter} -import com.twitter.ml.featurestore.catalog.features.recommendations.ProducerSimClustersEmbedding -import com.twitter.ml.featurestore.lib.UserId -import com.twitter.ml.featurestore.lib.data.{PredictionRecord, PredictionRecordAdapter} -import com.twitter.ml.featurestore.lib.entity.Entity -import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet - -object TweetSimilarityFeatures { - val QueryTweetId = new Discrete("query_tweet.id") - val CandidateTweetId = new Discrete("candidate_tweet.id") - val QueryTweetEmbedding = new SparseContinuous("query_tweet.simclusters_embedding") - val CandidateTweetEmbedding = new SparseContinuous("candidate_tweet.simclusters_embedding") - val QueryTweetEmbeddingNorm = new Continuous("query_tweet.embedding_norm") - val CandidateTweetEmbeddingNorm = new Continuous("candidate_tweet.embedding_norm") - val QueryTweetTimestamp = new Discrete("query_tweet.timestamp") - val CandidateTweetTimestamp = new Discrete("candidate_tweet.timestamp") - val TweetPairCount = new Discrete("popularity_count.tweet_pair") - val QueryTweetCount = new Discrete("popularity_count.query_tweet") - val CosineSimilarity = new Continuous("meta.cosine_similarity") - val Label = new Binary("co-engagement.label") - - val FeatureContext: FeatureContext = new FeatureContext( - QueryTweetId, - CandidateTweetId, - QueryTweetEmbedding, - CandidateTweetEmbedding, - QueryTweetEmbeddingNorm, - CandidateTweetEmbeddingNorm, - QueryTweetTimestamp, - CandidateTweetTimestamp, - TweetPairCount, - QueryTweetCount, - CosineSimilarity, - Label - ) - - def isCoengaged(dataRecord: DataRecord): Boolean = { - dataRecord.getFeatureValue(Label) - } -} - -class TweetSimilarityFeaturesStoreConfig(identifier: String) { - val bindingIdentifier: Entity[UserId] = Entity[UserId](identifier) - - val featureStoreBoundFeatureSet: BoundFeatureSet = BoundFeatureSet( - ProducerSimClustersEmbedding.FavBasedEmbedding20m145kUpdated.bind(bindingIdentifier)) - - val predictionRecordAdapter: IRecordOneToOneAdapter[PredictionRecord] = - PredictionRecordAdapter.oneToOne(featureStoreBoundFeatureSet) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.docx new file mode 100644 index 000000000..5ecf5ac1e Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.scala deleted file mode 100644 index 6adf6eaf8..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/BCELabelTransformFromUUADataRecord.scala +++ /dev/null @@ -1,68 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.Feature -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.api.ITransform -import com.twitter.ml.api.constant.SharedFeatures -import java.lang.{Double => JDouble} - -import com.twitter.timelines.prediction.common.adapters.AdapterConsumer -import com.twitter.timelines.prediction.common.adapters.EngagementLabelFeaturesDataRecordUtils -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.RichDataRecord -import com.twitter.timelines.suggests.common.engagement.thriftscala.EngagementType -import com.twitter.timelines.suggests.common.engagement.thriftscala.Engagement -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import com.twitter.timelines.prediction.features.common.CombinedFeatures - -/** - * To transfrom BCE events UUA data records that contain only continuous dwell time to datarecords that contain corresponding binary label features - * The UUA datarecords inputted would have USER_ID, SOURCE_TWEET_ID,TIMESTAMP and - * 0 or one of (TWEET_DETAIL_DWELL_TIME_MS, PROFILE_DWELL_TIME_MS, FULLSCREEN_VIDEO_DWELL_TIME_MS) features. - * We will use the different engagement TIME_MS to differentiate different engagements, - * and then re-use the function in EngagementTypeConverte to add the binary label to the datarecord. - **/ - -object BCELabelTransformFromUUADataRecord extends ITransform { - - val dwellTimeFeatureToEngagementMap = Map( - TimelinesSharedFeatures.TWEET_DETAIL_DWELL_TIME_MS -> EngagementType.TweetDetailDwell, - TimelinesSharedFeatures.PROFILE_DWELL_TIME_MS -> EngagementType.ProfileDwell, - TimelinesSharedFeatures.FULLSCREEN_VIDEO_DWELL_TIME_MS -> EngagementType.FullscreenVideoDwell - ) - - def dwellFeatureToEngagement( - rdr: RichDataRecord, - dwellTimeFeature: Feature[JDouble], - engagementType: EngagementType - ): Option[Engagement] = { - if (rdr.hasFeature(dwellTimeFeature)) { - Some( - Engagement( - engagementType = engagementType, - timestampMs = rdr.getFeatureValue(SharedFeatures.TIMESTAMP), - weight = Some(rdr.getFeatureValue(dwellTimeFeature)) - )) - } else { - None - } - } - override def transformContext(featureContext: FeatureContext): FeatureContext = { - featureContext.addFeatures( - (CombinedFeatures.TweetDetailDwellEngagements ++ CombinedFeatures.ProfileDwellEngagements ++ CombinedFeatures.FullscreenVideoDwellEngagements).toSeq: _*) - } - override def transform(record: DataRecord): Unit = { - val rdr = new RichDataRecord(record) - val engagements = dwellTimeFeatureToEngagementMap - .map { - case (dwellTimeFeature, engagementType) => - dwellFeatureToEngagement(rdr, dwellTimeFeature, engagementType) - }.flatten.toSeq - - // Re-use BCE( behavior client events) label conversion in EngagementTypeConverter to align with BCE labels generation for offline training data - EngagementLabelFeaturesDataRecordUtils.setDwellTimeFeatures( - rdr, - Some(engagements), - AdapterConsumer.Combined) - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD b/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD deleted file mode 100644 index 01c930e8e..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD +++ /dev/null @@ -1,353 +0,0 @@ -create_datasets( - base_name = "original_author_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/original_author_aggregates/1556496000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.OriginalAuthor", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "twitter_wide_user_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/twitter_wide_user_aggregates/1556496000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.TwitterWideUser", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "twitter_wide_user_author_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/twitter_wide_user_author_aggregates/1556323200000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.TwitterWideUserAuthor", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_aggregates/1556150400000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_author_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_author_aggregates/1556064000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserAuthor", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "aggregates_canary", - fallback_path = "gs://user.timelines.dp.gcp.twttr.net//canaries/processed/aggregates_v2/user_aggregates/1622851200000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.User", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_engager_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_engager_aggregates/1556496000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserEngager", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_original_author_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1556496000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserOriginalAuthor", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "author_topic_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/author_topic_aggregates/1589932800000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.AuthorTopic", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_topic_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_topic_aggregates/1590278400000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserTopic", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_inferred_topic_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_inferred_topic_aggregates/1599696000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserInferredTopic", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_mention_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_mention_aggregates/1556582400000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMention", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_request_dow_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_dow_aggregates/1556236800000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestDow", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -create_datasets( - base_name = "user_request_hour_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_request_hour_aggregates/1556150400000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserRequestHour", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - - -create_datasets( - base_name = "user_list_aggregates", - fallback_path = "viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_list_aggregates/1590624000000", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserList", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - - -create_datasets( - base_name = "user_media_understanding_annotation_aggregates", - key_type = "com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey", - platform = "java8", - role = "timelines", - scala_schema = "com.twitter.timelines.prediction.common.aggregates.TimelinesAggregationKeyValInjections.UserMediaUnderstandingAnnotation", - segment_type = "snapshot", - tags = ["bazel-compatible"], - val_type = "(com.twitter.summingbird.batch.BatchID, com.twitter.ml.api.DataRecord)", - scala_dependencies = [ - ":injections", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) - -scala_library( - sources = [ - "BCELabelTransformFromUUADataRecord.scala", - "FeatureSelectorConfig.scala", - "RecapUserFeatureAggregation.scala", - "RectweetUserFeatureAggregation.scala", - "TimelinesAggregationConfig.scala", - "TimelinesAggregationConfigDetails.scala", - "TimelinesAggregationConfigTrait.scala", - "TimelinesAggregationSources.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":aggregates_canary-scala", - ":author_topic_aggregates-scala", - ":original_author_aggregates-scala", - ":twitter_wide_user_aggregates-scala", - ":twitter_wide_user_author_aggregates-scala", - ":user_aggregates-scala", - ":user_author_aggregates-scala", - ":user_engager_aggregates-scala", - ":user_inferred_topic_aggregates-scala", - ":user_list_aggregates-scala", - ":user_media_understanding_annotation_aggregates-scala", - ":user_mention_aggregates-scala", - ":user_original_author_aggregates-scala", - ":user_request_dow_aggregates-scala", - ":user_request_hour_aggregates-scala", - ":user_topic_aggregates-scala", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/constant", - "src/java/com/twitter/ml/api/matcher", - "src/scala/com/twitter/common/text/util", - "src/scala/com/twitter/dal/client/dataset", - "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", - "src/scala/com/twitter/timelines/prediction/features/client_log_event", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/engagement_features", - "src/scala/com/twitter/timelines/prediction/features/escherbird", - "src/scala/com/twitter/timelines/prediction/features/itl", - "src/scala/com/twitter/timelines/prediction/features/list_features", - "src/scala/com/twitter/timelines/prediction/features/p_home_latest", - "src/scala/com/twitter/timelines/prediction/features/real_graph", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/timelines/prediction/features/request_context", - "src/scala/com/twitter/timelines/prediction/features/simcluster", - "src/scala/com/twitter/timelines/prediction/features/time_features", - "src/scala/com/twitter/timelines/prediction/transform/filter", - "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", - "timelines/data_processing/ad_hoc/recap/data_record_preparation:recap_data_records_agg_minimal-java", - "util/util-core:scala", - ], -) - -scala_library( - name = "injections", - sources = [ - "FeatureSelectorConfig.scala", - "RecapUserFeatureAggregation.scala", - "RectweetUserFeatureAggregation.scala", - "TimelinesAggregationConfigDetails.scala", - "TimelinesAggregationConfigTrait.scala", - "TimelinesAggregationKeyValInjections.scala", - "TimelinesAggregationSources.scala", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/constant", - "src/java/com/twitter/ml/api/matcher", - "src/scala/com/twitter/common/text/util", - "src/scala/com/twitter/dal/client/dataset", - "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core", - "src/scala/com/twitter/scalding_internal/multiformat/format", - "src/scala/com/twitter/timelines/prediction/features/client_log_event", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/engagement_features", - "src/scala/com/twitter/timelines/prediction/features/escherbird", - "src/scala/com/twitter/timelines/prediction/features/itl", - "src/scala/com/twitter/timelines/prediction/features/list_features", - "src/scala/com/twitter/timelines/prediction/features/p_home_latest", - "src/scala/com/twitter/timelines/prediction/features/real_graph", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/timelines/prediction/features/request_context", - "src/scala/com/twitter/timelines/prediction/features/semantic_core_features", - "src/scala/com/twitter/timelines/prediction/features/simcluster", - "src/scala/com/twitter/timelines/prediction/features/time_features", - "src/scala/com/twitter/timelines/prediction/transform/filter", - "timelines/data_processing/ad_hoc/recap/data_record_preparation:recap_data_records_agg_minimal-java", - "util/util-core:scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD.docx new file mode 100644 index 000000000..2c821dfc5 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.docx new file mode 100644 index 000000000..ab274a3fd Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.scala deleted file mode 100644 index 1c91ef16c..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/FeatureSelectorConfig.scala +++ /dev/null @@ -1,121 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.matcher.FeatureMatcher -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup -import scala.collection.JavaConverters._ - -object FeatureSelectorConfig { - val BasePairsToStore = Seq( - ("twitter_wide_user_aggregate.pair", "*"), - ("twitter_wide_user_author_aggregate.pair", "*"), - ("user_aggregate_v5.continuous.pair", "*"), - ("user_aggregate_v7.pair", "*"), - ("user_author_aggregate_v2.pair", "recap.earlybird.*"), - ("user_author_aggregate_v2.pair", "recap.searchfeature.*"), - ("user_author_aggregate_v2.pair", "recap.tweetfeature.embeds*"), - ("user_author_aggregate_v2.pair", "recap.tweetfeature.link_count*"), - ("user_author_aggregate_v2.pair", "engagement_features.in_network.*"), - ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_reply.*"), - ("user_author_aggregate_v2.pair", "recap.tweetfeature.is_retweet.*"), - ("user_author_aggregate_v2.pair", "recap.tweetfeature.num_mentions.*"), - ("user_author_aggregate_v5.pair", "*"), - ("user_author_aggregate_tweetsource_v1.pair", "*"), - ("user_engager_aggregate.pair", "*"), - ("user_mention_aggregate.pair", "*"), - ("user_request_context_aggregate.dow.pair", "*"), - ("user_request_context_aggregate.hour.pair", "*"), - ("user_aggregate_v6.pair", "*"), - ("user_original_author_aggregate_v1.pair", "*"), - ("user_original_author_aggregate_v2.pair", "*"), - ("original_author_aggregate_v1.pair", "*"), - ("original_author_aggregate_v2.pair", "*"), - ("author_topic_aggregate.pair", "*"), - ("user_list_aggregate.pair", "*"), - ("user_topic_aggregate.pair", "*"), - ("user_topic_aggregate_v2.pair", "*"), - ("user_inferred_topic_aggregate.pair", "*"), - ("user_inferred_topic_aggregate_v2.pair", "*"), - ("user_media_annotation_aggregate.pair", "*"), - ("user_media_annotation_aggregate.pair", "*"), - ("user_author_good_click_aggregate.pair", "*"), - ("user_engager_good_click_aggregate.pair", "*") - ) - val PairsToStore = BasePairsToStore ++ Seq( - ("user_aggregate_v2.pair", "*"), - ("user_aggregate_v5.boolean.pair", "*"), - ("user_aggregate_tweetsource_v1.pair", "*"), - ) - - - val LabelsToStore = Seq( - "any_label", - "recap.engagement.is_favorited", - "recap.engagement.is_retweeted", - "recap.engagement.is_replied", - "recap.engagement.is_open_linked", - "recap.engagement.is_profile_clicked", - "recap.engagement.is_clicked", - "recap.engagement.is_photo_expanded", - "recap.engagement.is_video_playback_50", - "recap.engagement.is_video_quality_viewed", - "recap.engagement.is_replied_reply_impressed_by_author", - "recap.engagement.is_replied_reply_favorited_by_author", - "recap.engagement.is_replied_reply_replied_by_author", - "recap.engagement.is_report_tweet_clicked", - "recap.engagement.is_block_clicked", - "recap.engagement.is_mute_clicked", - "recap.engagement.is_dont_like", - "recap.engagement.is_good_clicked_convo_desc_favorited_or_replied", - "recap.engagement.is_good_clicked_convo_desc_v2", - "itl.engagement.is_favorited", - "itl.engagement.is_retweeted", - "itl.engagement.is_replied", - "itl.engagement.is_open_linked", - "itl.engagement.is_profile_clicked", - "itl.engagement.is_clicked", - "itl.engagement.is_photo_expanded", - "itl.engagement.is_video_playback_50" - ) - - val PairGlobsToStore = for { - (prefix, suffix) <- PairsToStore - label <- LabelsToStore - } yield FeatureMatcher.glob(prefix + "." + label + "." + suffix) - - val BaseAggregateV2FeatureSelector = FeatureMatcher - .none() - .or( - FeatureMatcher.glob("meta.user_id"), - FeatureMatcher.glob("meta.author_id"), - FeatureMatcher.glob("entities.original_author_id"), - FeatureMatcher.glob("entities.topic_id"), - FeatureMatcher - .glob("entities.inferred_topic_ids" + TypedAggregateGroup.SparseFeatureSuffix), - FeatureMatcher.glob("timelines.meta.list_id"), - FeatureMatcher.glob("list.id"), - FeatureMatcher - .glob("engagement_features.user_ids.public" + TypedAggregateGroup.SparseFeatureSuffix), - FeatureMatcher - .glob("entities.users.mentioned_screen_names" + TypedAggregateGroup.SparseFeatureSuffix), - FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_dont_like.*"), - FeatureMatcher.glob("user_author_aggregate_v2.pair.any_label.recap.tweetfeature.has_*"), - FeatureMatcher.glob("request_context.country_code"), - FeatureMatcher.glob("request_context.timestamp_gmt_dow"), - FeatureMatcher.glob("request_context.timestamp_gmt_hour"), - FeatureMatcher.glob( - "semantic_core.media_understanding.high_recall.non_sensitive.entity_ids" + TypedAggregateGroup.SparseFeatureSuffix) - ) - - val AggregatesV2ProdFeatureSelector = BaseAggregateV2FeatureSelector - .orList(PairGlobsToStore.asJava) - - val ReducedPairGlobsToStore = (for { - (prefix, suffix) <- BasePairsToStore - label <- LabelsToStore - } yield FeatureMatcher.glob(prefix + "." + label + "." + suffix)) ++ Seq( - FeatureMatcher.glob("user_aggregate_v2.pair.any_label.*"), - FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_favorited.*"), - FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_photo_expanded.*"), - FeatureMatcher.glob("user_aggregate_v2.pair.recap.engagement.is_profile_clicked.*") - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/README.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/README.docx new file mode 100644 index 000000000..a9b5bf9fd Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/README.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/README.md b/src/scala/com/twitter/timelines/prediction/common/aggregates/README.md deleted file mode 100644 index 0bae21a14..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Timelines Aggregation Jobs - -This directory contains the specific definition of aggregate jobs that generate features used by the Heavy Ranker. -The primary files of interest are [`TimelinesAggregationConfigDetails.scala`](TimelinesAggregationConfigDetails.scala), which contains the defintion for the batch aggregate jobs and [`real_time/TimelinesOnlineAggregationConfigBase.scala`](real_time/TimelinesOnlineAggregationConfigBase.scala) which contains the definitions for the real time aggregate jobs. - -The aggregation framework that these jobs are based on is [here](../../../../../../../../timelines/data_processing/ml_util/aggregation_framework). \ No newline at end of file diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.docx new file mode 100644 index 000000000..c0fd7392d Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala deleted file mode 100644 index 657d5a713..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/RecapUserFeatureAggregation.scala +++ /dev/null @@ -1,415 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.Feature -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures -import com.twitter.timelines.prediction.features.real_graph.RealGraphDataRecordFeatures -import com.twitter.timelines.prediction.features.recap.RecapFeatures -import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures - -object RecapUserFeatureAggregation { - val RecapFeaturesForAggregation: Set[Feature[_]] = - Set( - RecapFeatures.HAS_IMAGE, - RecapFeatures.HAS_VIDEO, - RecapFeatures.FROM_MUTUAL_FOLLOW, - RecapFeatures.HAS_CARD, - RecapFeatures.HAS_NEWS, - RecapFeatures.REPLY_COUNT, - RecapFeatures.FAV_COUNT, - RecapFeatures.RETWEET_COUNT, - RecapFeatures.BLENDER_SCORE, - RecapFeatures.CONVERSATIONAL_COUNT, - RecapFeatures.IS_BUSINESS_SCORE, - RecapFeatures.CONTAINS_MEDIA, - RecapFeatures.RETWEET_SEARCHER, - RecapFeatures.REPLY_SEARCHER, - RecapFeatures.MENTION_SEARCHER, - RecapFeatures.REPLY_OTHER, - RecapFeatures.RETWEET_OTHER, - RecapFeatures.MATCH_UI_LANG, - RecapFeatures.MATCH_SEARCHER_MAIN_LANG, - RecapFeatures.MATCH_SEARCHER_LANGS, - RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT, - RecapFeatures.TEXT_SCORE, - RealGraphDataRecordFeatures.NUM_RETWEETS_EWMA, - RealGraphDataRecordFeatures.NUM_RETWEETS_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_RETWEETS_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_RETWEETS_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.NUM_FAVORITES_EWMA, - RealGraphDataRecordFeatures.NUM_FAVORITES_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_FAVORITES_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_FAVORITES_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.NUM_MENTIONS_EWMA, - RealGraphDataRecordFeatures.NUM_MENTIONS_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_MENTIONS_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_MENTIONS_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_EWMA, - RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_TWEET_CLICKS_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_EWMA, - RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_PROFILE_VIEWS_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_EWMA, - RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_ELAPSED_DAYS, - RealGraphDataRecordFeatures.TOTAL_DWELL_TIME_DAYS_SINCE_LAST, - RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_EWMA, - RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_NON_ZERO_DAYS, - RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_ELAPSED_DAYS, - RealGraphDataRecordFeatures.NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST - ) - - val RecapLabelsForAggregation: Set[Feature.Binary] = - Set( - RecapFeatures.IS_FAVORITED, - RecapFeatures.IS_RETWEETED, - RecapFeatures.IS_CLICKED, - RecapFeatures.IS_PROFILE_CLICKED, - RecapFeatures.IS_OPEN_LINKED - ) - - val DwellDuration: Set[Feature[_]] = - Set( - TimelinesSharedFeatures.DWELL_TIME_MS, - ) - - val UserFeaturesV2: Set[Feature[_]] = RecapFeaturesForAggregation ++ Set( - RecapFeatures.HAS_VINE, - RecapFeatures.HAS_PERISCOPE, - RecapFeatures.HAS_PRO_VIDEO, - RecapFeatures.HAS_VISIBLE_LINK, - RecapFeatures.BIDIRECTIONAL_FAV_COUNT, - RecapFeatures.UNIDIRECTIONAL_FAV_COUNT, - RecapFeatures.BIDIRECTIONAL_REPLY_COUNT, - RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT, - RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT, - RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT, - RecapFeatures.EMBEDS_URL_COUNT, - RecapFeatures.EMBEDS_IMPRESSION_COUNT, - RecapFeatures.VIDEO_VIEW_COUNT, - RecapFeatures.IS_RETWEET, - RecapFeatures.IS_REPLY, - RecapFeatures.IS_EXTENDED_REPLY, - RecapFeatures.HAS_LINK, - RecapFeatures.HAS_TREND, - RecapFeatures.LINK_LANGUAGE, - RecapFeatures.NUM_HASHTAGS, - RecapFeatures.NUM_MENTIONS, - RecapFeatures.IS_SENSITIVE, - RecapFeatures.HAS_MULTIPLE_MEDIA, - RecapFeatures.USER_REP, - RecapFeatures.FAV_COUNT_V2, - RecapFeatures.RETWEET_COUNT_V2, - RecapFeatures.REPLY_COUNT_V2, - RecapFeatures.LINK_COUNT, - EngagementDataRecordFeatures.InNetworkFavoritesCount, - EngagementDataRecordFeatures.InNetworkRetweetsCount, - EngagementDataRecordFeatures.InNetworkRepliesCount - ) - - val UserAuthorFeaturesV2: Set[Feature[_]] = Set( - RecapFeatures.HAS_IMAGE, - RecapFeatures.HAS_VINE, - RecapFeatures.HAS_PERISCOPE, - RecapFeatures.HAS_PRO_VIDEO, - RecapFeatures.HAS_VIDEO, - RecapFeatures.HAS_CARD, - RecapFeatures.HAS_NEWS, - RecapFeatures.HAS_VISIBLE_LINK, - RecapFeatures.REPLY_COUNT, - RecapFeatures.FAV_COUNT, - RecapFeatures.RETWEET_COUNT, - RecapFeatures.BLENDER_SCORE, - RecapFeatures.CONVERSATIONAL_COUNT, - RecapFeatures.IS_BUSINESS_SCORE, - RecapFeatures.CONTAINS_MEDIA, - RecapFeatures.RETWEET_SEARCHER, - RecapFeatures.REPLY_SEARCHER, - RecapFeatures.MENTION_SEARCHER, - RecapFeatures.REPLY_OTHER, - RecapFeatures.RETWEET_OTHER, - RecapFeatures.MATCH_UI_LANG, - RecapFeatures.MATCH_SEARCHER_MAIN_LANG, - RecapFeatures.MATCH_SEARCHER_LANGS, - RecapFeatures.TWEET_COUNT_FROM_USER_IN_SNAPSHOT, - RecapFeatures.TEXT_SCORE, - RecapFeatures.BIDIRECTIONAL_FAV_COUNT, - RecapFeatures.UNIDIRECTIONAL_FAV_COUNT, - RecapFeatures.BIDIRECTIONAL_REPLY_COUNT, - RecapFeatures.UNIDIRECTIONAL_REPLY_COUNT, - RecapFeatures.BIDIRECTIONAL_RETWEET_COUNT, - RecapFeatures.UNIDIRECTIONAL_RETWEET_COUNT, - RecapFeatures.EMBEDS_URL_COUNT, - RecapFeatures.EMBEDS_IMPRESSION_COUNT, - RecapFeatures.VIDEO_VIEW_COUNT, - RecapFeatures.IS_RETWEET, - RecapFeatures.IS_REPLY, - RecapFeatures.HAS_LINK, - RecapFeatures.HAS_TREND, - RecapFeatures.LINK_LANGUAGE, - RecapFeatures.NUM_HASHTAGS, - RecapFeatures.NUM_MENTIONS, - RecapFeatures.IS_SENSITIVE, - RecapFeatures.HAS_MULTIPLE_MEDIA, - RecapFeatures.FAV_COUNT_V2, - RecapFeatures.RETWEET_COUNT_V2, - RecapFeatures.REPLY_COUNT_V2, - RecapFeatures.LINK_COUNT, - EngagementDataRecordFeatures.InNetworkFavoritesCount, - EngagementDataRecordFeatures.InNetworkRetweetsCount, - EngagementDataRecordFeatures.InNetworkRepliesCount - ) - - val UserAuthorFeaturesV2Count: Set[Feature[_]] = Set( - RecapFeatures.HAS_IMAGE, - RecapFeatures.HAS_VINE, - RecapFeatures.HAS_PERISCOPE, - RecapFeatures.HAS_PRO_VIDEO, - RecapFeatures.HAS_VIDEO, - RecapFeatures.HAS_CARD, - RecapFeatures.HAS_NEWS, - RecapFeatures.HAS_VISIBLE_LINK, - RecapFeatures.FAV_COUNT, - RecapFeatures.CONTAINS_MEDIA, - RecapFeatures.RETWEET_SEARCHER, - RecapFeatures.REPLY_SEARCHER, - RecapFeatures.MENTION_SEARCHER, - RecapFeatures.REPLY_OTHER, - RecapFeatures.RETWEET_OTHER, - RecapFeatures.MATCH_UI_LANG, - RecapFeatures.MATCH_SEARCHER_MAIN_LANG, - RecapFeatures.MATCH_SEARCHER_LANGS, - RecapFeatures.IS_RETWEET, - RecapFeatures.IS_REPLY, - RecapFeatures.HAS_LINK, - RecapFeatures.HAS_TREND, - RecapFeatures.IS_SENSITIVE, - RecapFeatures.HAS_MULTIPLE_MEDIA, - EngagementDataRecordFeatures.InNetworkFavoritesCount - ) - - val UserTopicFeaturesV2Count: Set[Feature[_]] = Set( - RecapFeatures.HAS_IMAGE, - RecapFeatures.HAS_VIDEO, - RecapFeatures.HAS_CARD, - RecapFeatures.HAS_NEWS, - RecapFeatures.FAV_COUNT, - RecapFeatures.CONTAINS_MEDIA, - RecapFeatures.RETWEET_SEARCHER, - RecapFeatures.REPLY_SEARCHER, - RecapFeatures.MENTION_SEARCHER, - RecapFeatures.REPLY_OTHER, - RecapFeatures.RETWEET_OTHER, - RecapFeatures.MATCH_UI_LANG, - RecapFeatures.MATCH_SEARCHER_MAIN_LANG, - RecapFeatures.MATCH_SEARCHER_LANGS, - RecapFeatures.IS_RETWEET, - RecapFeatures.IS_REPLY, - RecapFeatures.HAS_LINK, - RecapFeatures.HAS_TREND, - RecapFeatures.IS_SENSITIVE, - EngagementDataRecordFeatures.InNetworkFavoritesCount, - EngagementDataRecordFeatures.InNetworkRetweetsCount, - TimelinesSharedFeatures.NUM_CAPS, - TimelinesSharedFeatures.ASPECT_RATIO_DEN, - TimelinesSharedFeatures.NUM_NEWLINES, - TimelinesSharedFeatures.IS_360, - TimelinesSharedFeatures.IS_MANAGED, - TimelinesSharedFeatures.IS_MONETIZABLE, - TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, - TimelinesSharedFeatures.HAS_TITLE, - TimelinesSharedFeatures.HAS_DESCRIPTION, - TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, - TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION - ) - - val UserFeaturesV5Continuous: Set[Feature[_]] = Set( - TimelinesSharedFeatures.QUOTE_COUNT, - TimelinesSharedFeatures.VISIBLE_TOKEN_RATIO, - TimelinesSharedFeatures.WEIGHTED_FAV_COUNT, - TimelinesSharedFeatures.WEIGHTED_RETWEET_COUNT, - TimelinesSharedFeatures.WEIGHTED_REPLY_COUNT, - TimelinesSharedFeatures.WEIGHTED_QUOTE_COUNT, - TimelinesSharedFeatures.EMBEDS_IMPRESSION_COUNT_V2, - TimelinesSharedFeatures.EMBEDS_URL_COUNT_V2, - TimelinesSharedFeatures.DECAYED_FAVORITE_COUNT, - TimelinesSharedFeatures.DECAYED_RETWEET_COUNT, - TimelinesSharedFeatures.DECAYED_REPLY_COUNT, - TimelinesSharedFeatures.DECAYED_QUOTE_COUNT, - TimelinesSharedFeatures.FAKE_FAVORITE_COUNT, - TimelinesSharedFeatures.FAKE_RETWEET_COUNT, - TimelinesSharedFeatures.FAKE_REPLY_COUNT, - TimelinesSharedFeatures.FAKE_QUOTE_COUNT, - TimeDataRecordFeatures.LAST_FAVORITE_SINCE_CREATION_HRS, - TimeDataRecordFeatures.LAST_RETWEET_SINCE_CREATION_HRS, - TimeDataRecordFeatures.LAST_REPLY_SINCE_CREATION_HRS, - TimeDataRecordFeatures.LAST_QUOTE_SINCE_CREATION_HRS, - TimeDataRecordFeatures.TIME_SINCE_LAST_FAVORITE_HRS, - TimeDataRecordFeatures.TIME_SINCE_LAST_RETWEET_HRS, - TimeDataRecordFeatures.TIME_SINCE_LAST_REPLY_HRS, - TimeDataRecordFeatures.TIME_SINCE_LAST_QUOTE_HRS - ) - - val UserFeaturesV5Boolean: Set[Feature[_]] = Set( - TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG, - TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG, - TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG, - TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG, - TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG, - TimelinesSharedFeatures.LABEL_SPAM_FLAG, - TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG, - TimelinesSharedFeatures.PERISCOPE_EXISTS, - TimelinesSharedFeatures.PERISCOPE_IS_LIVE, - TimelinesSharedFeatures.PERISCOPE_HAS_BEEN_FEATURED, - TimelinesSharedFeatures.PERISCOPE_IS_CURRENTLY_FEATURED, - TimelinesSharedFeatures.PERISCOPE_IS_FROM_QUALITY_SOURCE, - TimelinesSharedFeatures.HAS_QUOTE - ) - - val UserAuthorFeaturesV5: Set[Feature[_]] = Set( - TimelinesSharedFeatures.HAS_QUOTE, - TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG, - TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG, - TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG, - TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG, - TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG, - TimelinesSharedFeatures.LABEL_SPAM_FLAG, - TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG - ) - - val UserTweetSourceFeaturesV1Continuous: Set[Feature[_]] = Set( - TimelinesSharedFeatures.NUM_CAPS, - TimelinesSharedFeatures.NUM_WHITESPACES, - TimelinesSharedFeatures.TWEET_LENGTH, - TimelinesSharedFeatures.ASPECT_RATIO_DEN, - TimelinesSharedFeatures.ASPECT_RATIO_NUM, - TimelinesSharedFeatures.BIT_RATE, - TimelinesSharedFeatures.HEIGHT_1, - TimelinesSharedFeatures.HEIGHT_2, - TimelinesSharedFeatures.HEIGHT_3, - TimelinesSharedFeatures.HEIGHT_4, - TimelinesSharedFeatures.VIDEO_DURATION, - TimelinesSharedFeatures.WIDTH_1, - TimelinesSharedFeatures.WIDTH_2, - TimelinesSharedFeatures.WIDTH_3, - TimelinesSharedFeatures.WIDTH_4, - TimelinesSharedFeatures.NUM_MEDIA_TAGS - ) - - val UserTweetSourceFeaturesV1Boolean: Set[Feature[_]] = Set( - TimelinesSharedFeatures.HAS_QUESTION, - TimelinesSharedFeatures.RESIZE_METHOD_1, - TimelinesSharedFeatures.RESIZE_METHOD_2, - TimelinesSharedFeatures.RESIZE_METHOD_3, - TimelinesSharedFeatures.RESIZE_METHOD_4 - ) - - val UserTweetSourceFeaturesV2Continuous: Set[Feature[_]] = Set( - TimelinesSharedFeatures.NUM_EMOJIS, - TimelinesSharedFeatures.NUM_EMOTICONS, - TimelinesSharedFeatures.NUM_NEWLINES, - TimelinesSharedFeatures.NUM_STICKERS, - TimelinesSharedFeatures.NUM_FACES, - TimelinesSharedFeatures.NUM_COLOR_PALLETTE_ITEMS, - TimelinesSharedFeatures.VIEW_COUNT, - TimelinesSharedFeatures.TWEET_LENGTH_TYPE - ) - - val UserTweetSourceFeaturesV2Boolean: Set[Feature[_]] = Set( - TimelinesSharedFeatures.IS_360, - TimelinesSharedFeatures.IS_MANAGED, - TimelinesSharedFeatures.IS_MONETIZABLE, - TimelinesSharedFeatures.IS_EMBEDDABLE, - TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, - TimelinesSharedFeatures.HAS_TITLE, - TimelinesSharedFeatures.HAS_DESCRIPTION, - TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, - TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION - ) - - val UserAuthorTweetSourceFeaturesV1: Set[Feature[_]] = Set( - TimelinesSharedFeatures.HAS_QUESTION, - TimelinesSharedFeatures.TWEET_LENGTH, - TimelinesSharedFeatures.VIDEO_DURATION, - TimelinesSharedFeatures.NUM_MEDIA_TAGS - ) - - val UserAuthorTweetSourceFeaturesV2: Set[Feature[_]] = Set( - TimelinesSharedFeatures.NUM_CAPS, - TimelinesSharedFeatures.NUM_WHITESPACES, - TimelinesSharedFeatures.ASPECT_RATIO_DEN, - TimelinesSharedFeatures.ASPECT_RATIO_NUM, - TimelinesSharedFeatures.BIT_RATE, - TimelinesSharedFeatures.TWEET_LENGTH_TYPE, - TimelinesSharedFeatures.NUM_EMOJIS, - TimelinesSharedFeatures.NUM_EMOTICONS, - TimelinesSharedFeatures.NUM_NEWLINES, - TimelinesSharedFeatures.NUM_STICKERS, - TimelinesSharedFeatures.NUM_FACES, - TimelinesSharedFeatures.IS_360, - TimelinesSharedFeatures.IS_MANAGED, - TimelinesSharedFeatures.IS_MONETIZABLE, - TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, - TimelinesSharedFeatures.HAS_TITLE, - TimelinesSharedFeatures.HAS_DESCRIPTION, - TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, - TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION - ) - - val UserAuthorTweetSourceFeaturesV2Count: Set[Feature[_]] = Set( - TimelinesSharedFeatures.NUM_CAPS, - TimelinesSharedFeatures.ASPECT_RATIO_DEN, - TimelinesSharedFeatures.NUM_NEWLINES, - TimelinesSharedFeatures.IS_360, - TimelinesSharedFeatures.IS_MANAGED, - TimelinesSharedFeatures.IS_MONETIZABLE, - TimelinesSharedFeatures.HAS_SELECTED_PREVIEW_IMAGE, - TimelinesSharedFeatures.HAS_TITLE, - TimelinesSharedFeatures.HAS_DESCRIPTION, - TimelinesSharedFeatures.HAS_VISIT_SITE_CALL_TO_ACTION, - TimelinesSharedFeatures.HAS_WATCH_NOW_CALL_TO_ACTION - ) - - val LabelsV2: Set[Feature.Binary] = RecapLabelsForAggregation ++ Set( - RecapFeatures.IS_REPLIED, - RecapFeatures.IS_PHOTO_EXPANDED, - RecapFeatures.IS_VIDEO_PLAYBACK_50 - ) - - val TwitterWideFeatures: Set[Feature[_]] = Set( - RecapFeatures.IS_REPLY, - TimelinesSharedFeatures.HAS_QUOTE, - RecapFeatures.HAS_MENTION, - RecapFeatures.HAS_HASHTAG, - RecapFeatures.HAS_LINK, - RecapFeatures.HAS_CARD, - RecapFeatures.CONTAINS_MEDIA - ) - - val TwitterWideLabels: Set[Feature.Binary] = Set( - RecapFeatures.IS_FAVORITED, - RecapFeatures.IS_RETWEETED, - RecapFeatures.IS_REPLIED - ) - - val ReciprocalLabels: Set[Feature.Binary] = Set( - RecapFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR, - RecapFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR, - RecapFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR - ) - - val NegativeEngagementLabels: Set[Feature.Binary] = Set( - RecapFeatures.IS_REPORT_TWEET_CLICKED, - RecapFeatures.IS_BLOCK_CLICKED, - RecapFeatures.IS_MUTE_CLICKED, - RecapFeatures.IS_DONT_LIKE - ) - - val GoodClickLabels: Set[Feature.Binary] = Set( - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1, - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2, - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.docx new file mode 100644 index 000000000..9e9299e6a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala deleted file mode 100644 index 12835ef1f..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/RectweetUserFeatureAggregation.scala +++ /dev/null @@ -1,52 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.Feature -import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures -import com.twitter.timelines.prediction.features.itl.ITLFeatures - -object RectweetUserFeatureAggregation { - val RectweetLabelsForAggregation: Set[Feature.Binary] = - Set( - ITLFeatures.IS_FAVORITED, - ITLFeatures.IS_RETWEETED, - ITLFeatures.IS_REPLIED, - ITLFeatures.IS_CLICKED, - ITLFeatures.IS_PROFILE_CLICKED, - ITLFeatures.IS_OPEN_LINKED, - ITLFeatures.IS_PHOTO_EXPANDED, - ITLFeatures.IS_VIDEO_PLAYBACK_50 - ) - - val TweetFeatures: Set[Feature[_]] = Set( - ITLFeatures.HAS_IMAGE, - ITLFeatures.HAS_CARD, - ITLFeatures.HAS_NEWS, - ITLFeatures.REPLY_COUNT, - ITLFeatures.FAV_COUNT, - ITLFeatures.REPLY_COUNT, - ITLFeatures.RETWEET_COUNT, - ITLFeatures.MATCHES_UI_LANG, - ITLFeatures.MATCHES_SEARCHER_MAIN_LANG, - ITLFeatures.MATCHES_SEARCHER_LANGS, - ITLFeatures.TEXT_SCORE, - ITLFeatures.LINK_LANGUAGE, - ITLFeatures.NUM_HASHTAGS, - ITLFeatures.NUM_MENTIONS, - ITLFeatures.IS_SENSITIVE, - ITLFeatures.HAS_VIDEO, - ITLFeatures.HAS_LINK, - ITLFeatures.HAS_VISIBLE_LINK, - EngagementDataRecordFeatures.InNetworkFavoritesCount - // nice to have, but currently not hydrated in the RecommendedTweet payload - //EngagementDataRecordFeatures.InNetworkRetweetsCount, - //EngagementDataRecordFeatures.InNetworkRepliesCount - ) - - val ReciprocalLabels: Set[Feature.Binary] = Set( - ITLFeatures.IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR, - ITLFeatures.IS_REPLIED_REPLY_REPLIED_BY_AUTHOR, - ITLFeatures.IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR, - ITLFeatures.IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR, - ITLFeatures.IS_REPLIED_REPLY_QUOTED_BY_AUTHOR - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.docx new file mode 100644 index 000000000..a8e2690cc Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala deleted file mode 100644 index e6581e32e..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala +++ /dev/null @@ -1,80 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.dal.client.dataset.KeyValDALDataset -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.FeatureContext -import com.twitter.scalding_internal.multiformat.format.keyval -import com.twitter.summingbird.batch.BatchID -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsPolicy -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateDataRecordStore -import scala.collection.JavaConverters._ - -object TimelinesAggregationConfig extends TimelinesAggregationConfigTrait { - override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2" - - def storeToDatasetMap: Map[String, KeyValDALDataset[ - keyval.KeyVal[AggregationKey, (BatchID, DataRecord)] - ]] = Map( - AuthorTopicAggregateStore -> AuthorTopicAggregatesScalaDataset, - UserTopicAggregateStore -> UserTopicAggregatesScalaDataset, - UserInferredTopicAggregateStore -> UserInferredTopicAggregatesScalaDataset, - UserAggregateStore -> UserAggregatesScalaDataset, - UserAuthorAggregateStore -> UserAuthorAggregatesScalaDataset, - UserOriginalAuthorAggregateStore -> UserOriginalAuthorAggregatesScalaDataset, - OriginalAuthorAggregateStore -> OriginalAuthorAggregatesScalaDataset, - UserEngagerAggregateStore -> UserEngagerAggregatesScalaDataset, - UserMentionAggregateStore -> UserMentionAggregatesScalaDataset, - TwitterWideUserAggregateStore -> TwitterWideUserAggregatesScalaDataset, - TwitterWideUserAuthorAggregateStore -> TwitterWideUserAuthorAggregatesScalaDataset, - UserRequestHourAggregateStore -> UserRequestHourAggregatesScalaDataset, - UserRequestDowAggregateStore -> UserRequestDowAggregatesScalaDataset, - UserListAggregateStore -> UserListAggregatesScalaDataset, - UserMediaUnderstandingAnnotationAggregateStore -> UserMediaUnderstandingAnnotationAggregatesScalaDataset, - ) - - override def mkPhysicalStore(store: AggregateStore): AggregateStore = store match { - case s: OfflineAggregateDataRecordStore => - s.toOfflineAggregateDataRecordStoreWithDAL(storeToDatasetMap(s.name)) - case _ => throw new IllegalArgumentException("Unsupported logical dataset type.") - } - - object CombineCountPolicies { - val EngagerCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_engager_aggregate") - val EngagerGoodClickCountsPolicy: CombineCountsPolicy = mkCountsPolicy( - "user_engager_good_click_aggregate") - val RectweetEngagerCountsPolicy: CombineCountsPolicy = - mkCountsPolicy("rectweet_user_engager_aggregate") - val MentionCountsPolicy: CombineCountsPolicy = mkCountsPolicy("user_mention_aggregate") - val RectweetSimclustersTweetCountsPolicy: CombineCountsPolicy = - mkCountsPolicy("rectweet_user_simcluster_tweet_aggregate") - val UserInferredTopicCountsPolicy: CombineCountsPolicy = - mkCountsPolicy("user_inferred_topic_aggregate") - val UserInferredTopicV2CountsPolicy: CombineCountsPolicy = - mkCountsPolicy("user_inferred_topic_aggregate_v2") - val UserMediaUnderstandingAnnotationCountsPolicy: CombineCountsPolicy = - mkCountsPolicy("user_media_annotation_aggregate") - - private[this] def mkCountsPolicy(prefix: String): CombineCountsPolicy = { - val features = TimelinesAggregationConfig.aggregatesToCompute - .filter(_.aggregatePrefix == prefix) - .flatMap(_.allOutputFeatures) - CombineCountsPolicy( - topK = 2, - aggregateContextToPrecompute = new FeatureContext(features.asJava), - hardLimit = Some(20) - ) - } - } -} - -object TimelinesAggregationCanaryConfig extends TimelinesAggregationConfigTrait { - override def outputHdfsPath: String = "/user/timelines/canaries/processed/aggregates_v2" - - override def mkPhysicalStore(store: AggregateStore): AggregateStore = store match { - case s: OfflineAggregateDataRecordStore => - s.toOfflineAggregateDataRecordStoreWithDAL(dalDataset = AggregatesCanaryScalaDataset) - case _ => throw new IllegalArgumentException("Unsupported logical dataset type.") - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.docx new file mode 100644 index 000000000..1e37e61cf Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala deleted file mode 100644 index aa439deda..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala +++ /dev/null @@ -1,579 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.conversions.DurationOps._ -import com.twitter.ml.api.constant.SharedFeatures.AUTHOR_ID -import com.twitter.ml.api.constant.SharedFeatures.USER_ID -import com.twitter.timelines.data_processing.ml_util.aggregation_framework._ -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics._ -import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform -import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveAuthorIdZero -import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveUserIdZero -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures -import com.twitter.timelines.prediction.features.engagement_features.EngagementDataRecordFeatures.RichUnifyPublicEngagersTransform -import com.twitter.timelines.prediction.features.list_features.ListFeatures -import com.twitter.timelines.prediction.features.recap.RecapFeatures -import com.twitter.timelines.prediction.features.request_context.RequestContextFeatures -import com.twitter.timelines.prediction.features.semantic_core_features.SemanticCoreFeatures -import com.twitter.timelines.prediction.transform.filter.FilterInNetworkTransform -import com.twitter.timelines.prediction.transform.filter.FilterImageTweetTransform -import com.twitter.timelines.prediction.transform.filter.FilterVideoTweetTransform -import com.twitter.timelines.prediction.transform.filter.FilterOutImageVideoTweetTransform -import com.twitter.util.Duration - -trait TimelinesAggregationConfigDetails extends Serializable { - - import TimelinesAggregationSources._ - - def outputHdfsPath: String - - /** - * Converts the given logical store to a physical store. The reason we do not specify the - * physical store directly with the [[AggregateGroup]] is because of a cyclic dependency when - * create physical stores that are DalDataset with PersonalDataType annotations derived from - * the [[AggregateGroup]]. - * - */ - def mkPhysicalStore(store: AggregateStore): AggregateStore - - def defaultMaxKvSourceFailures: Int = 100 - - val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig { - override def apply(startDate: String) = OfflineAggregateStoreCommonConfig( - outputHdfsPathPrefix = outputHdfsPath, - dummyAppId = "timelines_aggregates_v2_ro", - dummyDatasetPrefix = "timelines_aggregates_v2_ro", - startDate = startDate - ) - } - - val UserAggregateStore = "user_aggregates" - val UserAuthorAggregateStore = "user_author_aggregates" - val UserOriginalAuthorAggregateStore = "user_original_author_aggregates" - val OriginalAuthorAggregateStore = "original_author_aggregates" - val UserEngagerAggregateStore = "user_engager_aggregates" - val UserMentionAggregateStore = "user_mention_aggregates" - val TwitterWideUserAggregateStore = "twitter_wide_user_aggregates" - val TwitterWideUserAuthorAggregateStore = "twitter_wide_user_author_aggregates" - val UserRequestHourAggregateStore = "user_request_hour_aggregates" - val UserRequestDowAggregateStore = "user_request_dow_aggregates" - val UserListAggregateStore = "user_list_aggregates" - val AuthorTopicAggregateStore = "author_topic_aggregates" - val UserTopicAggregateStore = "user_topic_aggregates" - val UserInferredTopicAggregateStore = "user_inferred_topic_aggregates" - val UserMediaUnderstandingAnnotationAggregateStore = - "user_media_understanding_annotation_aggregates" - val AuthorCountryCodeAggregateStore = "author_country_code_aggregates" - val OriginalAuthorCountryCodeAggregateStore = "original_author_country_code_aggregates" - - /** - * Step 3: Configure all aggregates to compute. - * Note that different subsets of aggregates in this list - * can be launched by different summingbird job instances. - * Any given job can be responsible for a set of AggregateGroup - * configs whose outputStores share the same exact startDate. - * AggregateGroups that do not share the same inputSource, - * outputStore or startDate MUST be launched using different - * summingbird jobs and passed in a different --start-time argument - * See science/scalding/mesos/timelines/prod.yaml for an example - * of how to configure your own job. - */ - val negativeDownsampleTransform = - DownsampleTransform( - negativeSamplingRate = 0.03, - keepLabels = RecapUserFeatureAggregation.LabelsV2) - val negativeRecTweetDownsampleTransform = DownsampleTransform( - negativeSamplingRate = 0.03, - keepLabels = RectweetUserFeatureAggregation.RectweetLabelsForAggregation - ) - - val userAggregatesV2: AggregateGroup = - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_aggregate_v2", - preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ - keys = Set(USER_ID), - features = RecapUserFeatureAggregation.UserFeaturesV2, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric, SumMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userAuthorAggregatesV2: Set[AggregateGroup] = { - - /** - * NOTE: We need to remove records from out-of-network authors from the recap input - * records (which now include out-of-network records as well after merging recap and - * rectweet models) that are used to compute user-author aggregates. This is necessary - * to limit the growth rate of user-author aggregates. - */ - val allFeatureAggregates = Set( - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_author_aggregate_v2", - preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.UserAuthorFeaturesV2, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(SumMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAuthorAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - ) - - val countAggregates: Set[AggregateGroup] = Set( - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_author_aggregate_v2", - preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.UserAuthorFeaturesV2Count, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAuthorAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - ) - - allFeatureAggregates ++ countAggregates - } - - val userAggregatesV5Continuous: AggregateGroup = - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_aggregate_v5.continuous", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID), - features = RecapUserFeatureAggregation.UserFeaturesV5Continuous, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric, SumMetric, SumSqMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userAuthorAggregatesV5: AggregateGroup = - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_author_aggregate_v5", - preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.UserAuthorFeaturesV5, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAuthorAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val tweetSourceUserAuthorAggregatesV1: AggregateGroup = - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_author_aggregate_tweetsource_v1", - preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.UserAuthorTweetSourceFeaturesV1, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric, SumMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAuthorAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userEngagerAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_engager_aggregate", - keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserEngagerAggregateStore, - startDate = "2016-09-02 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - preTransforms = Seq( - RichRemoveUserIdZero, - RichUnifyPublicEngagersTransform - ) - ) - - val userMentionAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ - aggregatePrefix = "user_mention_aggregate", - keys = Set(USER_ID, RecapFeatures.MENTIONED_SCREEN_NAMES), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserMentionAggregateStore, - startDate = "2017-03-01 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - val twitterWideUserAggregates = AggregateGroup( - inputSource = timelinesDailyTwitterWideSource, - preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ - aggregatePrefix = "twitter_wide_user_aggregate", - keys = Set(USER_ID), - features = RecapUserFeatureAggregation.TwitterWideFeatures, - labels = RecapUserFeatureAggregation.TwitterWideLabels, - metrics = Set(CountMetric, SumMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = TwitterWideUserAggregateStore, - startDate = "2016-12-28 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val twitterWideUserAuthorAggregates = AggregateGroup( - inputSource = timelinesDailyTwitterWideSource, - preTransforms = Seq(RichRemoveUserIdZero), /* Eliminates reducer skew */ - aggregatePrefix = "twitter_wide_user_author_aggregate", - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.TwitterWideFeatures, - labels = RecapUserFeatureAggregation.TwitterWideLabels, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = TwitterWideUserAuthorAggregateStore, - startDate = "2016-12-28 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - /** - * User-HourOfDay and User-DayOfWeek aggregations, both for recap and rectweet - */ - val userRequestHourAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_request_context_aggregate.hour", - preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform), - keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_HOUR), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserRequestHourAggregateStore, - startDate = "2017-08-01 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userRequestDowAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_request_context_aggregate.dow", - preTransforms = Seq(RichRemoveUserIdZero, negativeDownsampleTransform), - keys = Set(USER_ID, RequestContextFeatures.TIMESTAMP_GMT_DOW), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserRequestDowAggregateStore, - startDate = "2017-08-01 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val authorTopicAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "author_topic_aggregate", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(AUTHOR_ID, TimelinesSharedFeatures.TOPIC_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = AuthorTopicAggregateStore, - startDate = "2020-05-19 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userTopicAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_topic_aggregate", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserTopicAggregateStore, - startDate = "2020-05-23 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userTopicAggregatesV2 = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_topic_aggregate_v2", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID, TimelinesSharedFeatures.TOPIC_ID), - features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count, - labels = RecapUserFeatureAggregation.LabelsV2, - includeAnyFeature = false, - includeAnyLabel = false, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserTopicAggregateStore, - startDate = "2020-05-23 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userInferredTopicAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_inferred_topic_aggregate", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID, TimelinesSharedFeatures.INFERRED_TOPIC_IDS), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserInferredTopicAggregateStore, - startDate = "2020-09-09 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userInferredTopicAggregatesV2 = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_inferred_topic_aggregate_v2", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID, TimelinesSharedFeatures.INFERRED_TOPIC_IDS), - features = RecapUserFeatureAggregation.UserTopicFeaturesV2Count, - labels = RecapUserFeatureAggregation.LabelsV2, - includeAnyFeature = false, - includeAnyLabel = false, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserInferredTopicAggregateStore, - startDate = "2020-09-09 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userReciprocalEngagementAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_aggregate_v6", - preTransforms = Seq(RichRemoveUserIdZero), - keys = Set(USER_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.ReciprocalLabels, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - val userOriginalAuthorReciprocalEngagementAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_original_author_aggregate_v1", - preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), - keys = Set(USER_ID, TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.ReciprocalLabels, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserOriginalAuthorAggregateStore, - startDate = "2018-12-26 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - val originalAuthorReciprocalEngagementAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "original_author_aggregate_v1", - preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), - keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.ReciprocalLabels, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = OriginalAuthorAggregateStore, - startDate = "2023-02-25 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - val originalAuthorNegativeEngagementAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "original_author_aggregate_v2", - preTransforms = Seq(RichRemoveUserIdZero, RichRemoveAuthorIdZero), - keys = Set(TimelinesSharedFeatures.ORIGINAL_AUTHOR_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.NegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = OriginalAuthorAggregateStore, - startDate = "2023-02-25 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - includeAnyLabel = false - ) - - val userListAggregates: AggregateGroup = - AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_list_aggregate", - keys = Set(USER_ID, ListFeatures.LIST_ID), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserListAggregateStore, - startDate = "2020-05-28 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - preTransforms = Seq(RichRemoveUserIdZero) - ) - - val userMediaUnderstandingAnnotationAggregates: AggregateGroup = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_media_annotation_aggregate", - preTransforms = Seq(RichRemoveUserIdZero), - keys = - Set(USER_ID, SemanticCoreFeatures.mediaUnderstandingHighRecallNonSensitiveEntityIdsFeature), - features = Set.empty, - labels = RecapUserFeatureAggregation.LabelsV2, - metrics = Set(CountMetric), - halfLives = Set(50.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserMediaUnderstandingAnnotationAggregateStore, - startDate = "2021-03-20 00:00", - commonConfig = timelinesOfflineAggregateSink - )) - ) - - val userAuthorGoodClickAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_author_good_click_aggregate", - preTransforms = Seq(FilterInNetworkTransform, RichRemoveUserIdZero), - keys = Set(USER_ID, AUTHOR_ID), - features = RecapUserFeatureAggregation.UserAuthorFeaturesV2, - labels = RecapUserFeatureAggregation.GoodClickLabels, - metrics = Set(SumMetric), - halfLives = Set(14.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserAuthorAggregateStore, - startDate = "2016-07-15 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )) - ) - - val userEngagerGoodClickAggregates = AggregateGroup( - inputSource = timelinesDailyRecapMinimalSource, - aggregatePrefix = "user_engager_good_click_aggregate", - keys = Set(USER_ID, EngagementDataRecordFeatures.PublicEngagementUserIds), - features = Set.empty, - labels = RecapUserFeatureAggregation.GoodClickLabels, - metrics = Set(CountMetric), - halfLives = Set(14.days), - outputStore = mkPhysicalStore( - OfflineAggregateDataRecordStore( - name = UserEngagerAggregateStore, - startDate = "2016-09-02 00:00", - commonConfig = timelinesOfflineAggregateSink, - maxKvSourceFailures = defaultMaxKvSourceFailures - )), - preTransforms = Seq( - RichRemoveUserIdZero, - RichUnifyPublicEngagersTransform - ) - ) - -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.docx new file mode 100644 index 000000000..1389d8c20 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala deleted file mode 100644 index 6fb2e07b7..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigTrait.scala +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationConfig -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateGroup -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup - -trait TimelinesAggregationConfigTrait - extends TimelinesAggregationConfigDetails - with AggregationConfig { - private val aggregateGroups = Set( - authorTopicAggregates, - userTopicAggregates, - userTopicAggregatesV2, - userInferredTopicAggregates, - userInferredTopicAggregatesV2, - userAggregatesV2, - userAggregatesV5Continuous, - userReciprocalEngagementAggregates, - userAuthorAggregatesV5, - userOriginalAuthorReciprocalEngagementAggregates, - originalAuthorReciprocalEngagementAggregates, - tweetSourceUserAuthorAggregatesV1, - userEngagerAggregates, - userMentionAggregates, - twitterWideUserAggregates, - twitterWideUserAuthorAggregates, - userRequestHourAggregates, - userRequestDowAggregates, - userListAggregates, - userMediaUnderstandingAnnotationAggregates, - ) ++ userAuthorAggregatesV2 - - val aggregatesToComputeList: Set[List[TypedAggregateGroup[_]]] = - aggregateGroups.map(_.buildTypedAggregateGroups()) - - override val aggregatesToCompute: Set[TypedAggregateGroup[_]] = aggregatesToComputeList.flatten - - /* - * Feature selection config to save storage space and manhattan query bandwidth. - * Only the most important features found using offline RCE simulations are used - * when actually training and serving. This selector is used by - * [[com.twitter.timelines.data_processing.jobs.timeline_ranking_user_features.TimelineRankingAggregatesV2FeaturesProdJob]] - * but defined here to keep it in sync with the config that computes the aggregates. - */ - val AggregatesV2FeatureSelector = FeatureSelectorConfig.AggregatesV2ProdFeatureSelector - - def filterAggregatesGroups(storeNames: Set[String]): Set[AggregateGroup] = { - aggregateGroups.filter(aggregateGroup => storeNames.contains(aggregateGroup.outputStore.name)) - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.docx new file mode 100644 index 000000000..5960168d4 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala deleted file mode 100644 index 1f2433b53..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationKeyValInjections.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.DataRecord -import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection -import com.twitter.summingbird.batch.BatchID -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.{ - AggregateStore, - AggregationKey, - OfflineAggregateInjections, - TypedAggregateGroup -} - -object TimelinesAggregationKeyValInjections extends TimelinesAggregationConfigTrait { - - import OfflineAggregateInjections.getInjection - - type KVInjection = KeyValInjection[AggregationKey, (BatchID, DataRecord)] - - val AuthorTopic: KVInjection = getInjection(filter(AuthorTopicAggregateStore)) - val UserTopic: KVInjection = getInjection(filter(UserTopicAggregateStore)) - val UserInferredTopic: KVInjection = getInjection(filter(UserInferredTopicAggregateStore)) - val User: KVInjection = getInjection(filter(UserAggregateStore)) - val UserAuthor: KVInjection = getInjection(filter(UserAuthorAggregateStore)) - val UserOriginalAuthor: KVInjection = getInjection(filter(UserOriginalAuthorAggregateStore)) - val OriginalAuthor: KVInjection = getInjection(filter(OriginalAuthorAggregateStore)) - val UserEngager: KVInjection = getInjection(filter(UserEngagerAggregateStore)) - val UserMention: KVInjection = getInjection(filter(UserMentionAggregateStore)) - val TwitterWideUser: KVInjection = getInjection(filter(TwitterWideUserAggregateStore)) - val TwitterWideUserAuthor: KVInjection = getInjection(filter(TwitterWideUserAuthorAggregateStore)) - val UserRequestHour: KVInjection = getInjection(filter(UserRequestHourAggregateStore)) - val UserRequestDow: KVInjection = getInjection(filter(UserRequestDowAggregateStore)) - val UserList: KVInjection = getInjection(filter(UserListAggregateStore)) - val UserMediaUnderstandingAnnotation: KVInjection = getInjection( - filter(UserMediaUnderstandingAnnotationAggregateStore)) - - private def filter(storeName: String): Set[TypedAggregateGroup[_]] = { - val groups = aggregatesToCompute.filter(_.outputStore.name == storeName) - require(groups.nonEmpty) - groups - } - - override def outputHdfsPath: String = "/user/timelines/processed/aggregates_v2" - - // Since this object is not used to execute any online or offline aggregates job, but is meant - // to store all PDT enabled KeyValInjections, we do not need to construct a physical store. - // We use the identity operation as a default. - override def mkPhysicalStore(store: AggregateStore): AggregateStore = store -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.docx new file mode 100644 index 000000000..71fd8ba05 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala deleted file mode 100644 index c799f22fa..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationSources.scala +++ /dev/null @@ -1,45 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates - -import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateSource -import com.twitter.timelines.prediction.features.p_home_latest.HomeLatestUserAggregatesFeatures -import timelines.data_processing.ad_hoc.recap.data_record_preparation.RecapDataRecordsAggMinimalJavaDataset - -/** - * Any update here should be in sync with [[TimelinesFeatureGroups]] and [[AggMinimalDataRecordGeneratorJob]]. - */ -object TimelinesAggregationSources { - - /** - * This is the recap data records after post-processing in [[GenerateRecapAggMinimalDataRecordsJob]] - */ - val timelinesDailyRecapMinimalSource = OfflineAggregateSource( - name = "timelines_daily_recap", - timestampFeature = TIMESTAMP, - dalDataSet = Some(RecapDataRecordsAggMinimalJavaDataset), - scaldingSuffixType = Some("dal"), - withValidation = true - ) - val timelinesDailyTwitterWideSource = OfflineAggregateSource( - name = "timelines_daily_twitter_wide", - timestampFeature = TIMESTAMP, - scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/twitter_wide_data_records"), - scaldingSuffixType = Some("daily"), - withValidation = true - ) - - val timelinesDailyListTimelineSource = OfflineAggregateSource( - name = "timelines_daily_list_timeline", - timestampFeature = TIMESTAMP, - scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/all_features/list"), - scaldingSuffixType = Some("hourly"), - withValidation = true - ) - - val timelinesDailyHomeLatestSource = OfflineAggregateSource( - name = "timelines_daily_home_latest", - timestampFeature = HomeLatestUserAggregatesFeatures.AGGREGATE_TIMESTAMP_MS, - scaldingHdfsPath = Some("/user/timelines/processed/p_home_latest/user_aggregates"), - scaldingSuffixType = Some("daily") - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.docx new file mode 100644 index 000000000..f50b427be Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala deleted file mode 100644 index 7cefc67b9..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/AuthorFeaturesAdapter.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.{DataRecord, Feature, FeatureContext, RichDataRecord} -import com.twitter.ml.featurestore.catalog.entities.core.Author -import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity -import com.twitter.ml.featurestore.lib.data.PredictionRecord -import com.twitter.ml.featurestore.lib.feature.{BoundFeature, BoundFeatureSet} -import com.twitter.ml.featurestore.lib.{UserId, Discrete => FSDiscrete} -import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase -import java.lang.{Boolean => JBoolean} -import java.util -import scala.collection.JavaConverters._ - -object AuthorFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { - val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(Author) - val UserFeaturesSet: BoundFeatureSet = BoundFeatureSet(UserStateBoundFeature) - - /** - * Boolean features about viewer's user state. - * enum UserState { - * NEW = 0, - * NEAR_ZERO = 1, - * VERY_LIGHT = 2, - * LIGHT = 3, - * MEDIUM_TWEETER = 4, - * MEDIUM_NON_TWEETER = 5, - * HEAVY_NON_TWEETER = 6, - * HEAVY_TWEETER = 7 - * }(persisted='true') - */ - val IS_USER_NEW = new Binary("timelines.author.user_state.is_user_new", Set(UserState).asJava) - val IS_USER_LIGHT = new Binary("timelines.author.user_state.is_user_light", Set(UserState).asJava) - val IS_USER_MEDIUM_TWEETER = - new Binary("timelines.author.user_state.is_user_medium_tweeter", Set(UserState).asJava) - val IS_USER_MEDIUM_NON_TWEETER = - new Binary("timelines.author.user_state.is_user_medium_non_tweeter", Set(UserState).asJava) - val IS_USER_HEAVY_NON_TWEETER = - new Binary("timelines.author.user_state.is_user_heavy_non_tweeter", Set(UserState).asJava) - val IS_USER_HEAVY_TWEETER = - new Binary("timelines.author.user_state.is_user_heavy_tweeter", Set(UserState).asJava) - val userStateToFeatureMap: Map[Long, Binary] = Map( - 0L -> IS_USER_NEW, - 1L -> IS_USER_LIGHT, - 2L -> IS_USER_LIGHT, - 3L -> IS_USER_LIGHT, - 4L -> IS_USER_MEDIUM_TWEETER, - 5L -> IS_USER_MEDIUM_NON_TWEETER, - 6L -> IS_USER_HEAVY_NON_TWEETER, - 7L -> IS_USER_HEAVY_TWEETER - ) - - val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet - - private val allFeatures: Seq[Feature[_]] = UserStateBooleanFeatures.toSeq - override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*) - override def commonFeatures: Set[Feature[_]] = Set.empty - - override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { - val newRecord = new RichDataRecord(new DataRecord) - record - .getFeatureValue(UserStateBoundFeature) - .flatMap { userState => userStateToFeatureMap.get(userState.value) }.foreach { - booleanFeature => newRecord.setFeatureValue[JBoolean](booleanFeature, true) - } - - List(newRecord.getRecord).asJava - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD deleted file mode 100644 index 93f39405d..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD +++ /dev/null @@ -1,199 +0,0 @@ -heron_binary( - name = "heron-without-jass", - main = "com.twitter.timelines.prediction.common.aggregates.real_time.TypeSafeRunner", - oss = True, - platform = "java8", - runtime_platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":real_time", - "3rdparty/jvm/org/slf4j:slf4j-jdk14", - ], -) - -jvm_app( - name = "rta_heron", - binary = ":heron-without-jass", - bundles = [ - bundle( - fileset = ["resources/jaas.conf"], - ), - ], - tags = [ - "bazel-compatible", - "bazel-only", - ], -) - -scala_library( - sources = ["*.scala"], - platform = "java8", - strict_deps = False, - tags = ["bazel-compatible"], - dependencies = [ - ":online-configs", - "3rdparty/src/jvm/com/twitter/summingbird:storm", - "src/java/com/twitter/heron/util", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features", - "src/scala/com/twitter/ml/api/util", - "src/scala/com/twitter/storehaus_internal/memcache", - "src/scala/com/twitter/storehaus_internal/util", - "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", - "src/scala/com/twitter/summingbird_internal/runner/store_config", - "src/scala/com/twitter/summingbird_internal/runner/storm", - "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2", - "src/scala/com/twitter/timelines/prediction/adapters/client_log_event", - "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr", - "src/scala/com/twitter/timelines/prediction/features/client_log_event", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/list_features", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/timelines/prediction/features/user_health", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/timelines/suggests/common:record-scala", - "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/aggregation_framework/heron", - "timelines/data_processing/ml_util/aggregation_framework/job", - "timelines/data_processing/ml_util/aggregation_framework/metrics", - "timelines/data_processing/ml_util/transforms", - "timelines/src/main/scala/com/twitter/timelines/clients/memcache_common", - "util/util-core:scala", - ], -) - -scala_library( - name = "online-configs", - sources = [ - "AuthorFeaturesAdapter.scala", - "Event.scala", - "FeatureStoreUtils.scala", - "StormAggregateSourceUtils.scala", - "TimelinesOnlineAggregationConfig.scala", - "TimelinesOnlineAggregationConfigBase.scala", - "TimelinesOnlineAggregationSources.scala", - "TimelinesStormAggregateSource.scala", - "TweetFeaturesReadableStore.scala", - "UserFeaturesAdapter.scala", - "UserFeaturesReadableStore.scala", - ], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - ":base-config", - "3rdparty/src/jvm/com/twitter/scalding:db", - "3rdparty/src/jvm/com/twitter/storehaus:core", - "3rdparty/src/jvm/com/twitter/summingbird:core", - "3rdparty/src/jvm/com/twitter/summingbird:online", - "3rdparty/src/jvm/com/twitter/summingbird:storm", - "abuse/detection/src/main/thrift/com/twitter/abuse/detection/mention_interactions:thrift-scala", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "snowflake/src/main/thrift:thrift-scala", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/constant", - "src/scala/com/twitter/frigate/data_pipeline/features_aggregated/core:core-features", - "src/scala/com/twitter/ml/api/util:datarecord", - "src/scala/com/twitter/ml/featurestore/catalog/datasets/geo:geo-user-location", - "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features", - "src/scala/com/twitter/ml/featurestore/catalog/entities/core", - "src/scala/com/twitter/ml/featurestore/catalog/features/core:user", - "src/scala/com/twitter/ml/featurestore/catalog/features/geo", - "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity", - "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info", - "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores", - "src/scala/com/twitter/ml/featurestore/lib/data", - "src/scala/com/twitter/ml/featurestore/lib/dataset/offline", - "src/scala/com/twitter/ml/featurestore/lib/export/strato:app-names", - "src/scala/com/twitter/ml/featurestore/lib/feature", - "src/scala/com/twitter/ml/featurestore/lib/online", - "src/scala/com/twitter/ml/featurestore/lib/params", - "src/scala/com/twitter/storehaus_internal/util", - "src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits", - "src/scala/com/twitter/summingbird_internal/runner/store_config", - "src/scala/com/twitter/summingbird_internal/runner/storm", - "src/scala/com/twitter/summingbird_internal/sources/common", - "src/scala/com/twitter/summingbird_internal/sources/common/remote:ClientEventSourceScrooge", - "src/scala/com/twitter/summingbird_internal/sources/storm/remote:ClientEventSourceScrooge2", - "src/scala/com/twitter/timelines/prediction/adapters/client_log_event", - "src/scala/com/twitter/timelines/prediction/adapters/client_log_event_mr", - "src/scala/com/twitter/timelines/prediction/common/adapters:base", - "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", - "src/scala/com/twitter/timelines/prediction/common/aggregates", - "src/scala/com/twitter/timelines/prediction/features/client_log_event", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/list_features", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/timelines/prediction/features/user_health", - "src/thrift/com/twitter/clientapp/gen:clientapp-scala", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/timelines/suggests/common:engagement-java", - "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", - "src/thrift/com/twitter/timelines/suggests/common:record-scala", - "src/thrift/com/twitter/timelineservice/injection:thrift-scala", - "src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala", - "strato/src/main/scala/com/twitter/strato/client", - "timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/served_features_cache", - "timelines/data_processing/ad_hoc/suggests/common:raw_training_data_creator", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/aggregation_framework/heron:configs", - "timelines/data_processing/ml_util/aggregation_framework/metrics", - "timelines/data_processing/ml_util/transforms", - "timelines/data_processing/util:rich-request", - "tweetsource/common/src/main/thrift:thrift-scala", - "twitter-server-internal/src/main/scala", - "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/config", - "unified_user_actions/client/src/main/scala/com/twitter/unified_user_actions/client/summingbird", - "unified_user_actions/thrift/src/main/thrift/com/twitter/unified_user_actions:unified_user_actions-scala", - "util/util-core:scala", - "util/util-stats/src/main/scala/com/twitter/finagle/stats", - ], -) - -scala_library( - name = "base-config", - sources = [ - "AuthorFeaturesAdapter.scala", - "TimelinesOnlineAggregationConfigBase.scala", - "TweetFeaturesAdapter.scala", - "UserFeaturesAdapter.scala", - ], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/constant", - "src/resources/com/twitter/timelines/prediction/common/aggregates/real_time", - "src/scala/com/twitter/ml/api/util:datarecord", - "src/scala/com/twitter/ml/featurestore/catalog/datasets/magicrecs:user-features", - "src/scala/com/twitter/ml/featurestore/catalog/entities/core", - "src/scala/com/twitter/ml/featurestore/catalog/features/core:user", - "src/scala/com/twitter/ml/featurestore/catalog/features/geo", - "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-activity", - "src/scala/com/twitter/ml/featurestore/catalog/features/magicrecs:user-info", - "src/scala/com/twitter/ml/featurestore/catalog/features/trends:tweet_trends_scores", - "src/scala/com/twitter/ml/featurestore/lib/data", - "src/scala/com/twitter/ml/featurestore/lib/feature", - "src/scala/com/twitter/timelines/prediction/common/adapters:base", - "src/scala/com/twitter/timelines/prediction/common/adapters:engagement-converter", - "src/scala/com/twitter/timelines/prediction/common/aggregates", - "src/scala/com/twitter/timelines/prediction/features/client_log_event", - "src/scala/com/twitter/timelines/prediction/features/common", - "src/scala/com/twitter/timelines/prediction/features/list_features", - "src/scala/com/twitter/timelines/prediction/features/recap", - "src/scala/com/twitter/timelines/prediction/features/user_health", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/ml/api:feature_context-java", - "src/thrift/com/twitter/timelines/suggests/common:engagement-scala", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/aggregation_framework/heron:base-config", - "timelines/data_processing/ml_util/aggregation_framework/metrics", - "timelines/data_processing/ml_util/transforms", - "util/util-core:scala", - "util/util-core:util-core-util", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD.docx new file mode 100644 index 000000000..3ebae7619 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.docx new file mode 100644 index 000000000..ed42b61d0 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala deleted file mode 100644 index 1bd697d0d..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/Event.scala +++ /dev/null @@ -1,11 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -private[real_time] sealed trait Event[T] { def event: T } - -private[real_time] case class HomeEvent[T](override val event: T) extends Event[T] - -private[real_time] case class ProfileEvent[T](override val event: T) extends Event[T] - -private[real_time] case class SearchEvent[T](override val event: T) extends Event[T] - -private[real_time] case class UuaEvent[T](override val event: T) extends Event[T] diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.docx new file mode 100644 index 000000000..13250c1c9 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala deleted file mode 100644 index 156d9d35f..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/FeatureStoreUtils.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.ml.featurestore.catalog.datasets.magicrecs.UserFeaturesDataset -import com.twitter.ml.featurestore.catalog.datasets.geo.GeoUserLocationDataset -import com.twitter.ml.featurestore.lib.dataset.DatasetParams -import com.twitter.ml.featurestore.lib.export.strato.FeatureStoreAppNames -import com.twitter.ml.featurestore.lib.online.FeatureStoreClient -import com.twitter.ml.featurestore.lib.params.FeatureStoreParams -import com.twitter.strato.client.{Client, Strato} -import com.twitter.strato.opcontext.Attribution.ManhattanAppId -import com.twitter.util.Duration - -private[real_time] object FeatureStoreUtils { - private def mkStratoClient(serviceIdentifier: ServiceIdentifier): Client = - Strato.client - .withMutualTls(serviceIdentifier) - .withRequestTimeout(Duration.fromMilliseconds(50)) - .build() - - private val featureStoreParams: FeatureStoreParams = - FeatureStoreParams( - perDataset = Map( - UserFeaturesDataset.id -> - DatasetParams( - stratoSuffix = Some(FeatureStoreAppNames.Timelines), - attributions = Seq(ManhattanAppId("athena", "timelines_aggregates_v2_features_by_user")) - ), - GeoUserLocationDataset.id -> - DatasetParams( - attributions = Seq(ManhattanAppId("starbuck", "timelines_geo_features_by_user")) - ) - ) - ) - - def mkFeatureStoreClient( - serviceIdentifier: ServiceIdentifier, - statsReceiver: StatsReceiver - ): FeatureStoreClient = { - com.twitter.server.Init() // necessary in order to use WilyNS path - - val stratoClient: Client = mkStratoClient(serviceIdentifier) - val featureStoreClient: FeatureStoreClient = FeatureStoreClient( - featureSet = - UserFeaturesAdapter.UserFeaturesSet ++ AuthorFeaturesAdapter.UserFeaturesSet ++ TweetFeaturesAdapter.TweetFeaturesSet, - client = stratoClient, - statsReceiver = statsReceiver, - featureStoreParams = featureStoreParams - ) - featureStoreClient - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.docx new file mode 100644 index 000000000..09768903c Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala deleted file mode 100644 index 42f86fa4f..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/LocallyReplicatedStore.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.storehaus.ReplicatedReadableStore -import com.twitter.storehaus.Store -import com.twitter.timelines.clients.memcache_common._ -import com.twitter.timelines.util.FailOpenHandler -import com.twitter.util.Future - -object ServedFeaturesMemcacheConfigBuilder { - def getTwCacheDestination(cluster: String, isProd: Boolean = false): String = - if (!isProd) { - s"/srv#/test/$cluster/cache//twemcache_timelines_served_features_cache" - } else { - s"/srv#/prod/$cluster/cache/timelines_served_features" - } - - /** - * @cluster The DC of the cache that this client will send requests to. This - * can be different to the DC where the summingbird job is running in. - * @isProd Define if this client is part of a production summingbird job as - * different accesspoints will need to be chosen. - */ - def build(cluster: String, isProd: Boolean = false): StorehausMemcacheConfig = - StorehausMemcacheConfig( - destName = getTwCacheDestination(cluster, isProd), - keyPrefix = "", - requestTimeout = 200.milliseconds, - numTries = 2, - globalTimeout = 400.milliseconds, - tcpConnectTimeout = 200.milliseconds, - connectionAcquisitionTimeout = 200.milliseconds, - numPendingRequests = 1000, - isReadOnly = false - ) -} - -/** - * If lookup key does not exist locally, make a call to the replicated store(s). - * If value exists remotely, write the first returned value to the local store - * and return it. Map any exceptions to None so that the subsequent operations - * may proceed. - */ -class LocallyReplicatedStore[-K, V]( - localStore: Store[K, V], - remoteStore: ReplicatedReadableStore[K, V], - scopedStatsReceiver: StatsReceiver) - extends Store[K, V] { - private[this] val failOpenHandler = new FailOpenHandler(scopedStatsReceiver.scope("failOpen")) - private[this] val localFailsCounter = scopedStatsReceiver.counter("localFails") - private[this] val localWritesCounter = scopedStatsReceiver.counter("localWrites") - private[this] val remoteFailsCounter = scopedStatsReceiver.counter("remoteFails") - - override def get(k: K): Future[Option[V]] = - failOpenHandler { - localStore - .get(k) - .flatMap { - case Some(v) => Future.value(Some(v)) - case _ => { - localFailsCounter.incr() - val replicatedOptFu = remoteStore.get(k) - // async write if result is not empty - replicatedOptFu.onSuccess { - case Some(v) => { - localWritesCounter.incr() - localStore.put((k, Some(v))) - } - case _ => { - remoteFailsCounter.incr() - Unit - } - } - replicatedOptFu - } - } - } { _: Throwable => Future.None } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.docx new file mode 100644 index 000000000..cf9cd7b7f Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala deleted file mode 100644 index e72d3392b..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.scala +++ /dev/null @@ -1,254 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.finagle.stats.Counter -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.DataRecordMerger -import com.twitter.ml.api.Feature -import com.twitter.ml.api.RichDataRecord -import com.twitter.ml.featurestore.catalog.entities.core.Author -import com.twitter.ml.featurestore.catalog.entities.core.Tweet -import com.twitter.ml.featurestore.catalog.entities.core.User -import com.twitter.ml.featurestore.lib.online.FeatureStoreClient -import com.twitter.summingbird.Producer -import com.twitter.summingbird.storm.Storm -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import java.lang.{Long => JLong} - -import com.twitter.unified_user_actions.thriftscala.ActionType -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction - -private[real_time] object StormAggregateSourceUtils { - type UserId = Long - type AuthorId = Long - type TweetId = Long - - /** - * Attaches a [[FeatureStoreClient]] to the underyling [[Producer]]. The FeatureStoreClient - * hydrates additional user features. - * - * @param underlyingProducer converts a stream of [[com.twitter.clientapp.thriftscala.LogEvent]] - * to a stream of [[DataRecord]]. - */ - def wrapByFeatureStoreClient( - underlyingProducer: Producer[Storm, Event[DataRecord]], - jobConfig: RealTimeAggregatesJobConfig, - scopedStatsReceiver: StatsReceiver - ): Producer[Storm, Event[DataRecord]] = { - lazy val keyDataRecordCounter = scopedStatsReceiver.counter("keyDataRecord") - lazy val keyFeatureCounter = scopedStatsReceiver.counter("keyFeature") - lazy val leftDataRecordCounter = scopedStatsReceiver.counter("leftDataRecord") - lazy val rightDataRecordCounter = scopedStatsReceiver.counter("rightDataRecord") - lazy val mergeNumFeaturesCounter = scopedStatsReceiver.counter("mergeNumFeatures") - lazy val authorKeyDataRecordCounter = scopedStatsReceiver.counter("authorKeyDataRecord") - lazy val authorKeyFeatureCounter = scopedStatsReceiver.counter("authorKeyFeature") - lazy val authorLeftDataRecordCounter = scopedStatsReceiver.counter("authorLeftDataRecord") - lazy val authorRightDataRecordCounter = scopedStatsReceiver.counter("authorRightDataRecord") - lazy val authorMergeNumFeaturesCounter = scopedStatsReceiver.counter("authorMergeNumFeatures") - lazy val tweetKeyDataRecordCounter = - scopedStatsReceiver.counter("tweetKeyDataRecord") - lazy val tweetKeyFeatureCounter = scopedStatsReceiver.counter("tweetKeyFeature") - lazy val tweetLeftDataRecordCounter = - scopedStatsReceiver.counter("tweetLeftDataRecord") - lazy val tweetRightDataRecordCounter = - scopedStatsReceiver.counter("tweetRightDataRecord") - lazy val tweetMergeNumFeaturesCounter = - scopedStatsReceiver.counter("tweetMergeNumFeatures") - - @transient lazy val featureStoreClient: FeatureStoreClient = - FeatureStoreUtils.mkFeatureStoreClient( - serviceIdentifier = jobConfig.serviceIdentifier, - statsReceiver = scopedStatsReceiver - ) - - lazy val joinUserFeaturesDataRecordProducer = - if (jobConfig.keyedByUserEnabled) { - lazy val keyedByUserFeaturesStormService: Storm#Service[Set[UserId], DataRecord] = - Storm.service( - new UserFeaturesReadableStore( - featureStoreClient = featureStoreClient, - userEntity = User, - userFeaturesAdapter = UserFeaturesAdapter - ) - ) - - leftJoinDataRecordProducer( - keyFeature = SharedFeatures.USER_ID, - leftDataRecordProducer = underlyingProducer, - rightStormService = keyedByUserFeaturesStormService, - keyDataRecordCounter = keyDataRecordCounter, - keyFeatureCounter = keyFeatureCounter, - leftDataRecordCounter = leftDataRecordCounter, - rightDataRecordCounter = rightDataRecordCounter, - mergeNumFeaturesCounter = mergeNumFeaturesCounter - ) - } else { - underlyingProducer - } - - lazy val joinAuthorFeaturesDataRecordProducer = - if (jobConfig.keyedByAuthorEnabled) { - lazy val keyedByAuthorFeaturesStormService: Storm#Service[Set[AuthorId], DataRecord] = - Storm.service( - new UserFeaturesReadableStore( - featureStoreClient = featureStoreClient, - userEntity = Author, - userFeaturesAdapter = AuthorFeaturesAdapter - ) - ) - - leftJoinDataRecordProducer( - keyFeature = TimelinesSharedFeatures.SOURCE_AUTHOR_ID, - leftDataRecordProducer = joinUserFeaturesDataRecordProducer, - rightStormService = keyedByAuthorFeaturesStormService, - keyDataRecordCounter = authorKeyDataRecordCounter, - keyFeatureCounter = authorKeyFeatureCounter, - leftDataRecordCounter = authorLeftDataRecordCounter, - rightDataRecordCounter = authorRightDataRecordCounter, - mergeNumFeaturesCounter = authorMergeNumFeaturesCounter - ) - } else { - joinUserFeaturesDataRecordProducer - } - - lazy val joinTweetFeaturesDataRecordProducer = { - if (jobConfig.keyedByTweetEnabled) { - lazy val keyedByTweetFeaturesStormService: Storm#Service[Set[TweetId], DataRecord] = - Storm.service( - new TweetFeaturesReadableStore( - featureStoreClient = featureStoreClient, - tweetEntity = Tweet, - tweetFeaturesAdapter = TweetFeaturesAdapter - ) - ) - - leftJoinDataRecordProducer( - keyFeature = TimelinesSharedFeatures.SOURCE_TWEET_ID, - leftDataRecordProducer = joinAuthorFeaturesDataRecordProducer, - rightStormService = keyedByTweetFeaturesStormService, - keyDataRecordCounter = tweetKeyDataRecordCounter, - keyFeatureCounter = tweetKeyFeatureCounter, - leftDataRecordCounter = tweetLeftDataRecordCounter, - rightDataRecordCounter = tweetRightDataRecordCounter, - mergeNumFeaturesCounter = tweetMergeNumFeaturesCounter - ) - } else { - joinAuthorFeaturesDataRecordProducer - } - } - - joinTweetFeaturesDataRecordProducer - } - - private[this] lazy val DataRecordMerger = new DataRecordMerger - - /** - * Make join key from the client event data record and return both. - * @param keyFeature Feature to extract join key value: USER_ID, SOURCE_TWEET_ID, etc. - * @param record DataRecord containing client engagement and basic tweet-side features - * @return The return type is a tuple of this key and original data record which will be used - * in the subsequent leftJoin operation. - */ - private[this] def mkKey( - keyFeature: Feature[JLong], - record: DataRecord, - keyDataRecordCounter: Counter, - keyFeatureCounter: Counter - ): Set[Long] = { - keyDataRecordCounter.incr() - val richRecord = new RichDataRecord(record) - if (richRecord.hasFeature(keyFeature)) { - keyFeatureCounter.incr() - val key: Long = richRecord.getFeatureValue(keyFeature).toLong - Set(key) - } else { - Set.empty[Long] - } - } - - /** - * After the leftJoin, merge the client event data record and the joined data record - * into a single data record used for further aggregation. - */ - private[this] def mergeDataRecord( - leftRecord: Event[DataRecord], - rightRecordOpt: Option[DataRecord], - leftDataRecordCounter: Counter, - rightDataRecordCounter: Counter, - mergeNumFeaturesCounter: Counter - ): Event[DataRecord] = { - leftDataRecordCounter.incr() - rightRecordOpt.foreach { rightRecord => - rightDataRecordCounter.incr() - DataRecordMerger.merge(leftRecord.event, rightRecord) - mergeNumFeaturesCounter.incr(new RichDataRecord(leftRecord.event).numFeatures()) - } - leftRecord - } - - private[this] def leftJoinDataRecordProducer( - keyFeature: Feature[JLong], - leftDataRecordProducer: Producer[Storm, Event[DataRecord]], - rightStormService: Storm#Service[Set[Long], DataRecord], - keyDataRecordCounter: => Counter, - keyFeatureCounter: => Counter, - leftDataRecordCounter: => Counter, - rightDataRecordCounter: => Counter, - mergeNumFeaturesCounter: => Counter - ): Producer[Storm, Event[DataRecord]] = { - val keyedLeftDataRecordProducer: Producer[Storm, (Set[Long], Event[DataRecord])] = - leftDataRecordProducer.map { - case dataRecord: HomeEvent[DataRecord] => - val key = mkKey( - keyFeature = keyFeature, - record = dataRecord.event, - keyDataRecordCounter = keyDataRecordCounter, - keyFeatureCounter = keyFeatureCounter - ) - (key, dataRecord) - case dataRecord: ProfileEvent[DataRecord] => - val key = Set.empty[Long] - (key, dataRecord) - case dataRecord: SearchEvent[DataRecord] => - val key = Set.empty[Long] - (key, dataRecord) - case dataRecord: UuaEvent[DataRecord] => - val key = Set.empty[Long] - (key, dataRecord) - } - - keyedLeftDataRecordProducer - .leftJoin(rightStormService) - .map { - case (_, (leftRecord, rightRecordOpt)) => - mergeDataRecord( - leftRecord = leftRecord, - rightRecordOpt = rightRecordOpt, - leftDataRecordCounter = leftDataRecordCounter, - rightDataRecordCounter = rightDataRecordCounter, - mergeNumFeaturesCounter = mergeNumFeaturesCounter - ) - } - } - - /** - * Filter Unified User Actions events to include only actions that has home timeline visit prior to landing on the page - */ - def isUuaBCEEventsFromHome(event: UnifiedUserAction): Boolean = { - def breadcrumbViewsContain(view: String): Boolean = - event.eventMetadata.breadcrumbViews.map(_.contains(view)).getOrElse(false) - - (event.actionType) match { - case ActionType.ClientTweetV2Impression if breadcrumbViewsContain("home") => - true - case ActionType.ClientTweetVideoFullscreenV2Impression - if (breadcrumbViewsContain("home") & breadcrumbViewsContain("video")) => - true - case ActionType.ClientProfileV2Impression if breadcrumbViewsContain("home") => - true - case _ => false - } - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.docx new file mode 100644 index 000000000..6cde9194d Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala deleted file mode 100644 index 8d7a41d21..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfig.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.conversions.DurationOps._ -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.{ - OnlineAggregationStoresTrait, - RealTimeAggregateStore -} - -object TimelinesOnlineAggregationConfig - extends TimelinesOnlineAggregationDefinitionsTrait - with OnlineAggregationStoresTrait { - - import TimelinesOnlineAggregationSources._ - - override lazy val ProductionStore = RealTimeAggregateStore( - memcacheDataSet = "timelines_real_time_aggregates", - isProd = true, - cacheTTL = 5.days - ) - - override lazy val StagingStore = RealTimeAggregateStore( - memcacheDataSet = "twemcache_timelines_real_time_aggregates", - isProd = false, - cacheTTL = 5.days - ) - - override lazy val inputSource = timelinesOnlineAggregateSource - - /** - * AggregateToCompute: This defines the complete set of aggregates to be - * computed by the aggregation job and to be stored in memcache. - */ - override lazy val AggregatesToCompute = ProdAggregates ++ StagingAggregates -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.docx new file mode 100644 index 000000000..31d6abb91 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala deleted file mode 100644 index 0d7c072e2..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala +++ /dev/null @@ -1,1112 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.conversions.DurationOps._ -import com.twitter.ml.api.Feature -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateGroup -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.OnlineAggregationConfigTrait -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.CountMetric -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.SumMetric -import com.twitter.timelines.data_processing.ml_util.transforms.BinaryUnion -import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform -import com.twitter.timelines.data_processing.ml_util.transforms.IsNewUserTransform -import com.twitter.timelines.data_processing.ml_util.transforms.IsPositionTransform -import com.twitter.timelines.data_processing.ml_util.transforms.LogTransform -import com.twitter.timelines.data_processing.ml_util.transforms.PositionCase -import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform -import com.twitter.timelines.data_processing.ml_util.transforms.RichRemoveUnverifiedUserTransform -import com.twitter.timelines.prediction.features.client_log_event.ClientLogEventDataRecordFeatures -import com.twitter.timelines.prediction.features.common.CombinedFeatures -import com.twitter.timelines.prediction.features.common.CombinedFeatures._ -import com.twitter.timelines.prediction.features.common.ProfileLabelFeatures -import com.twitter.timelines.prediction.features.common.SearchLabelFeatures -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_FIVE -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_ONE -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.IS_TOP_TEN -import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures.LOG_POSITION -import com.twitter.timelines.prediction.features.list_features.ListFeatures -import com.twitter.timelines.prediction.features.recap.RecapFeatures -import com.twitter.util.Duration -import java.lang.{Boolean => JBoolean} -import java.lang.{Long => JLong} -import scala.io.Source - -object TimelinesOnlineAggregationUtils { - val TweetLabels: Set[Feature[JBoolean]] = CombinedFeatures.EngagementsRealTime - val TweetCoreLabels: Set[Feature[JBoolean]] = CombinedFeatures.CoreEngagements - val TweetDwellLabels: Set[Feature[JBoolean]] = CombinedFeatures.DwellEngagements - val TweetCoreAndDwellLabels: Set[Feature[JBoolean]] = TweetCoreLabels ++ TweetDwellLabels - val PrivateEngagementLabelsV2: Set[Feature[JBoolean]] = CombinedFeatures.PrivateEngagementsV2 - val ProfileCoreLabels: Set[Feature[JBoolean]] = ProfileLabelFeatures.CoreEngagements - val ProfileNegativeEngagementLabels: Set[Feature[JBoolean]] = - ProfileLabelFeatures.NegativeEngagements - val ProfileNegativeEngagementUnionLabels: Set[Feature[JBoolean]] = Set( - ProfileLabelFeatures.IS_NEGATIVE_FEEDBACK_UNION) - val SearchCoreLabels: Set[Feature[JBoolean]] = SearchLabelFeatures.CoreEngagements - val TweetNegativeEngagementLabels: Set[Feature[JBoolean]] = - CombinedFeatures.NegativeEngagementsRealTime - val TweetNegativeEngagementDontLikeLabels: Set[Feature[JBoolean]] = - CombinedFeatures.NegativeEngagementsRealTimeDontLike - val TweetNegativeEngagementSecondaryLabels: Set[Feature[JBoolean]] = - CombinedFeatures.NegativeEngagementsSecondary - val AllTweetNegativeEngagementLabels: Set[Feature[JBoolean]] = - TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels ++ TweetNegativeEngagementSecondaryLabels - val UserAuthorEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.UserAuthorEngagements - val ShareEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.ShareEngagements - val BookmarkEngagementLabels: Set[Feature[JBoolean]] = CombinedFeatures.BookmarkEngagements - val AllBCEDwellLabels: Set[Feature[JBoolean]] = - CombinedFeatures.TweetDetailDwellEngagements ++ CombinedFeatures.ProfileDwellEngagements ++ CombinedFeatures.FullscreenVideoDwellEngagements - val AllTweetUnionLabels: Set[Feature[JBoolean]] = Set( - CombinedFeatures.IS_IMPLICIT_POSITIVE_FEEDBACK_UNION, - CombinedFeatures.IS_EXPLICIT_POSITIVE_FEEDBACK_UNION, - CombinedFeatures.IS_ALL_NEGATIVE_FEEDBACK_UNION - ) - val AllTweetLabels: Set[Feature[JBoolean]] = - TweetLabels ++ TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels ++ ProfileCoreLabels ++ ProfileNegativeEngagementLabels ++ ProfileNegativeEngagementUnionLabels ++ UserAuthorEngagementLabels ++ SearchCoreLabels ++ ShareEngagementLabels ++ BookmarkEngagementLabels ++ PrivateEngagementLabelsV2 ++ AllBCEDwellLabels ++ AllTweetUnionLabels - - def addFeatureFilterFromResource( - prodGroup: AggregateGroup, - aggRemovalPath: String - ): AggregateGroup = { - val resource = Some(Source.fromResource(aggRemovalPath)) - val lines = resource.map(_.getLines.toSeq) - lines match { - case Some(value) => prodGroup.copy(aggExclusionRegex = value) - case _ => prodGroup - } - } -} - -trait TimelinesOnlineAggregationDefinitionsTrait extends OnlineAggregationConfigTrait { - import TimelinesOnlineAggregationUtils._ - - def inputSource: AggregateSource - def ProductionStore: AggregateStore - def StagingStore: AggregateStore - - val TweetFeatures: Set[Feature[_]] = Set( - ClientLogEventDataRecordFeatures.HasConsumerVideo, - ClientLogEventDataRecordFeatures.PhotoCount - ) - val CandidateTweetSourceFeatures: Set[Feature[_]] = Set( - ClientLogEventDataRecordFeatures.FromRecap, - ClientLogEventDataRecordFeatures.FromRecycled, - ClientLogEventDataRecordFeatures.FromActivity, - ClientLogEventDataRecordFeatures.FromSimcluster, - ClientLogEventDataRecordFeatures.FromErg, - ClientLogEventDataRecordFeatures.FromCroon, - ClientLogEventDataRecordFeatures.FromList, - ClientLogEventDataRecordFeatures.FromRecTopic - ) - - def createStagingGroup(prodGroup: AggregateGroup): AggregateGroup = - prodGroup.copy( - outputStore = StagingStore - ) - - // Aggregate user engagements/features by tweet Id. - val tweetEngagement30MinuteCountsProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate user engagements/features by tweet Id. - val tweetVerifiedDontLikeEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v6", - preTransforms = Seq(RichRemoveUnverifiedUserTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val tweetNegativeEngagement6HourCounts = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v2", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val tweetVerifiedNegativeEngagementCounts = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v7", - preTransforms = Seq(RichRemoveUnverifiedUserTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val promotedTweetEngagementRealTimeCounts = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v3.is_promoted", - preTransforms = Seq( - DownsampleTransform( - negativeSamplingRate = 0.0, - keepLabels = Set(ClientLogEventDataRecordFeatures.IsPromoted))), - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(2.hours, 24.hours), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate total engagement counts by tweet Id for non-public - * engagements. Similar to EB's public engagement counts. - */ - val tweetEngagementTotalCountsProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val tweetNegativeEngagementTotalCounts = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v2", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = TweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's user id. - */ - val userEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v1", - keys = Set(SharedFeatures.USER_ID), - features = TweetFeatures, - labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's user id. - */ - val userEngagementRealTimeAggregatesV2 = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v2", - keys = Set(SharedFeatures.USER_ID), - features = ClientLogEventDataRecordFeatures.TweetFeaturesV2, - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate author's user state features grouped by viewer's user id. - */ - val userEngagementAuthorUserStateRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v3", - preTransforms = Seq.empty, - keys = Set(SharedFeatures.USER_ID), - features = AuthorFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate author's user state features grouped by viewer's user id. - */ - val userNegativeEngagementAuthorUserStateRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v4", - preTransforms = Seq.empty, - keys = Set(SharedFeatures.USER_ID), - features = AuthorFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's user id, with 48 hour halfLife. - */ - val userEngagement48HourRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v5", - keys = Set(SharedFeatures.USER_ID), - features = TweetFeatures, - labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(48.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate author's user state features grouped by viewer's user id. - */ - val userNegativeEngagementAuthorUserState72HourRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_aggregates_v6", - preTransforms = Seq.empty, - keys = Set(SharedFeatures.USER_ID), - features = AuthorFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(72.hours), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate features grouped by source author id: for each author, aggregate features are created - * to quantify engagements (fav, reply, etc.) which tweets of the author has received. - */ - val authorEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = TweetLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate features grouped by source author id: for each author, aggregate features are created - * to quantify negative engagements (mute, block, etc.) which tweets of the author has received. - * - * This aggregate group is not used in Home, but it is used in Follow Recommendation Service so need to keep it for now. - * - */ - val authorNegativeEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_aggregates_v2", - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = TweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate features grouped by source author id: for each author, aggregate features are created - * to quantify negative engagements (don't like) which tweets of the author has received from - * verified users. - */ - val authorVerifiedNegativeEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_aggregates_v3", - preTransforms = Seq(RichRemoveUnverifiedUserTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by topic id. - */ - val topicEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_aggregates_v1", - keys = Set(TimelinesSharedFeatures.TOPIC_ID), - features = Set.empty, - labels = TweetLabels ++ AllTweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate user engagements / user state by topic id. - */ - val topicEngagementUserStateRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_aggregates_v2", - keys = Set(TimelinesSharedFeatures.TOPIC_ID), - features = UserFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate user negative engagements / user state by topic id. - */ - val topicNegativeEngagementUserStateRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_aggregates_v3", - keys = Set(TimelinesSharedFeatures.TOPIC_ID), - features = UserFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by topic id like real_time_topic_aggregates_v1 but 24hour halfLife - */ - val topicEngagement24HourRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_aggregates_v4", - keys = Set(TimelinesSharedFeatures.TOPIC_ID), - features = Set.empty, - labels = TweetLabels ++ AllTweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate user engagements / user state by tweet Id. - val tweetEngagementUserStateRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v3", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = UserFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate user engagements / user gender by tweet Id. - val tweetEngagementGenderRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v4", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = UserFeaturesAdapter.GenderBooleanFeatures, - labels = - TweetCoreAndDwellLabels ++ TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate user negative engagements / user state by tweet Id. - val tweetNegativeEngagementUserStateRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v5", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = UserFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate user negative engagements / user state by tweet Id. - val tweetVerifiedNegativeEngagementUserStateRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_aggregates_v8", - preTransforms = Seq(RichRemoveUnverifiedUserTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = UserFeaturesAdapter.UserStateBooleanFeatures, - labels = TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet engagement labels and candidate tweet source features grouped by user id. - */ - val userCandidateTweetSourceEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_candidate_tweet_source_aggregates_v1", - keys = Set(SharedFeatures.USER_ID), - features = CandidateTweetSourceFeatures, - labels = TweetCoreAndDwellLabels ++ NegativeEngagementsRealTimeDontLike, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet engagement labels and candidate tweet source features grouped by user id. - */ - val userCandidateTweetSourceEngagement48HourRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_candidate_tweet_source_aggregates_v2", - keys = Set(SharedFeatures.USER_ID), - features = CandidateTweetSourceFeatures, - labels = TweetCoreAndDwellLabels ++ NegativeEngagementsRealTimeDontLike, - metrics = Set(CountMetric), - halfLives = Set(48.hours), - outputStore = ProductionStore, - includeAnyFeature = false, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's user id on Profile engagements - */ - val userProfileEngagementRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "profile_real_time_user_aggregates_v1", - preTransforms = Seq(IsNewUserTransform), - keys = Set(SharedFeatures.USER_ID), - features = TweetFeatures, - labels = ProfileCoreLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val NegativeEngagementsUnionTransform = RichITransform( - BinaryUnion( - featuresToUnify = ProfileNegativeEngagementLabels, - outputFeature = ProfileLabelFeatures.IS_NEGATIVE_FEEDBACK_UNION - )) - - /** - * Aggregate tweet features grouped by viewer's user id on Profile negative engagements. - */ - val userProfileNegativeEngagementRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "profile_negative_engagement_real_time_user_aggregates_v1", - preTransforms = Seq(NegativeEngagementsUnionTransform), - keys = Set(SharedFeatures.USER_ID), - features = Set.empty, - labels = ProfileNegativeEngagementLabels ++ ProfileNegativeEngagementUnionLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 72.hours, 14.day), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's and author's user ids and on Profile engagements - */ - val userAuthorProfileEngagementRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "user_author_profile_real_time_aggregates_v1", - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = ProfileCoreLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours, 72.hours), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate tweet features grouped by viewer's and author's user ids and on negative Profile engagements - */ - val userAuthorProfileNegativeEngagementRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "user_author_profile_negative_engagement_real_time_aggregates_v1", - preTransforms = Seq(NegativeEngagementsUnionTransform), - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = ProfileNegativeEngagementUnionLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 72.hours, 14.day), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val newUserAuthorEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_new_user_author_aggregates_v1", - preTransforms = Seq(IsNewUserTransform), - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = TweetCoreAndDwellLabels ++ Set( - IS_CLICKED, - IS_PROFILE_CLICKED, - IS_PHOTO_EXPANDED - ), - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val userAuthorEngagementRealTimeAggregatesProd = { - // Computing user-author real-time aggregates is very expensive so we - // take the union of all major negative feedback engagements to create - // a single negtive label for aggregation. We also include a number of - // core positive engagements. - val BinaryUnionNegativeEngagements = - BinaryUnion( - featuresToUnify = AllTweetNegativeEngagementLabels, - outputFeature = IS_NEGATIVE_FEEDBACK_UNION - ) - val BinaryUnionNegativeEngagementsTransform = RichITransform(BinaryUnionNegativeEngagements) - - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_author_aggregates_v1", - preTransforms = Seq(BinaryUnionNegativeEngagementsTransform), - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = UserAuthorEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 1.day), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - } - - /** - * Aggregate tweet features grouped by list id. - */ - val listEngagementRealTimeAggregatesProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_list_aggregates_v1", - keys = Set(ListFeatures.LIST_ID), - features = Set.empty, - labels = - TweetCoreAndDwellLabels ++ TweetNegativeEngagementLabels ++ TweetNegativeEngagementDontLikeLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate features grouped by topic of tweet and country from user's location - val topicCountryRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_country_aggregates_v1", - keys = Set(TimelinesSharedFeatures.TOPIC_ID, UserFeaturesAdapter.USER_COUNTRY_ID), - features = Set.empty, - labels = - TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels ++ PrivateEngagementLabelsV2 ++ ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 72.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate features grouped by TweetId_Country from user's location - val tweetCountryRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_country_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), - features = Set.empty, - labels = TweetCoreAndDwellLabels ++ AllTweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = true, - includeTimestampFeature = false, - ) - - // Additional aggregate features grouped by TweetId_Country from user's location - val tweetCountryPrivateEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_country_aggregates_v2", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), - features = Set.empty, - labels = PrivateEngagementLabelsV2 ++ ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 72.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Aggregate features grouped by TweetId_Country from user's location - val tweetCountryVerifiedNegativeEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_country_aggregates_v3", - preTransforms = Seq(RichRemoveUnverifiedUserTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID, UserFeaturesAdapter.USER_COUNTRY_ID), - features = Set.empty, - labels = AllTweetNegativeEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, Duration.Top), - outputStore = ProductionStore, - includeAnyLabel = true, - includeTimestampFeature = false, - ) - - object positionTranforms extends IsPositionTransform { - override val isInPositionRangeFeature: Seq[PositionCase] = - Seq(PositionCase(1, IS_TOP_ONE), PositionCase(5, IS_TOP_FIVE), PositionCase(10, IS_TOP_TEN)) - override val decodedPositionFeature: Feature.Discrete = - ClientLogEventDataRecordFeatures.InjectedPosition - } - - val userPositionEngagementsCountsProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_position_based_user_aggregates_v1", - keys = Set(SharedFeatures.USER_ID), - features = Set(IS_TOP_ONE, IS_TOP_FIVE, IS_TOP_TEN), - labels = TweetCoreAndDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - preTransforms = Seq(positionTranforms), - includeAnyLabel = false, - includeAnyFeature = false, - includeTimestampFeature = false, - ) - - val userPositionEngagementsSumProd = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_position_based_user_sum_aggregates_v2", - keys = Set(SharedFeatures.USER_ID), - features = Set(LOG_POSITION), - labels = TweetCoreAndDwellLabels, - metrics = Set(SumMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - preTransforms = - Seq(new LogTransform(ClientLogEventDataRecordFeatures.InjectedPosition, LOG_POSITION)), - includeAnyLabel = false, - includeAnyFeature = false, - includeTimestampFeature = false, - ) - - // Aggregates for share engagements - val tweetShareEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_share_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val userShareEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_share_aggregates_v1", - keys = Set(SharedFeatures.USER_ID), - features = Set.empty, - labels = ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val userAuthorShareEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_author_share_aggregates_v1", - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val topicShareEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_topic_share_aggregates_v1", - keys = Set(TimelinesSharedFeatures.TOPIC_ID), - features = Set.empty, - labels = ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val authorShareEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_share_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = ShareEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - // Bookmark RTAs - val tweetBookmarkEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_bookmark_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = BookmarkEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val userBookmarkEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_bookmark_aggregates_v1", - keys = Set(SharedFeatures.USER_ID), - features = Set.empty, - labels = BookmarkEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val userAuthorBookmarkEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_author_bookmark_aggregates_v1", - keys = Set(SharedFeatures.USER_ID, TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = BookmarkEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyFeature = true, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val authorBookmarkEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_bookmark_aggregates_v1", - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = Set.empty, - labels = BookmarkEngagementLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate on user level dwell labels from BCE - */ - val userBCEDwellEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_user_bce_dwell_aggregates", - keys = Set(SharedFeatures.USER_ID), - features = Set.empty, - labels = AllBCEDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - /** - * Aggregate on tweet level dwell labels from BCE - */ - val tweetBCEDwellEngagementsRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_tweet_bce_dwell_aggregates", - keys = Set(TimelinesSharedFeatures.SOURCE_TWEET_ID), - features = Set.empty, - labels = AllBCEDwellLabels, - metrics = Set(CountMetric), - halfLives = Set(30.minutes, 24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeTimestampFeature = false, - ) - - val ImplicitPositiveEngagementsUnionTransform = RichITransform( - BinaryUnion( - featuresToUnify = CombinedFeatures.ImplicitPositiveEngagements, - outputFeature = CombinedFeatures.IS_IMPLICIT_POSITIVE_FEEDBACK_UNION - ) - ) - - val ExplicitPositiveEngagementsUnionTransform = RichITransform( - BinaryUnion( - featuresToUnify = CombinedFeatures.ExplicitPositiveEngagements, - outputFeature = CombinedFeatures.IS_EXPLICIT_POSITIVE_FEEDBACK_UNION - ) - ) - - val AllNegativeEngagementsUnionTransform = RichITransform( - BinaryUnion( - featuresToUnify = CombinedFeatures.AllNegativeEngagements, - outputFeature = CombinedFeatures.IS_ALL_NEGATIVE_FEEDBACK_UNION - ) - ) - - /** - * Aggregate features for author content preference - */ - val authorContentPreferenceRealTimeAggregates = - AggregateGroup( - inputSource = inputSource, - aggregatePrefix = "real_time_author_content_preference_aggregates", - preTransforms = Seq( - ImplicitPositiveEngagementsUnionTransform, - ExplicitPositiveEngagementsUnionTransform, - AllNegativeEngagementsUnionTransform), - keys = Set(TimelinesSharedFeatures.SOURCE_AUTHOR_ID), - features = - ClientLogEventDataRecordFeatures.AuthorContentPreferenceTweetTypeFeatures ++ AuthorFeaturesAdapter.UserStateBooleanFeatures, - labels = AllTweetUnionLabels, - metrics = Set(CountMetric), - halfLives = Set(24.hours), - outputStore = ProductionStore, - includeAnyLabel = false, - includeAnyFeature = false, - ) - - val FeaturesGeneratedByPreTransforms = Set(LOG_POSITION, IS_TOP_TEN, IS_TOP_FIVE, IS_TOP_ONE) - - val ProdAggregateGroups = Set( - tweetEngagement30MinuteCountsProd, - tweetEngagementTotalCountsProd, - tweetNegativeEngagement6HourCounts, - tweetNegativeEngagementTotalCounts, - userEngagementRealTimeAggregatesProd, - userEngagement48HourRealTimeAggregatesProd, - userNegativeEngagementAuthorUserStateRealTimeAggregates, - userNegativeEngagementAuthorUserState72HourRealTimeAggregates, - authorEngagementRealTimeAggregatesProd, - topicEngagementRealTimeAggregatesProd, - topicEngagement24HourRealTimeAggregatesProd, - tweetEngagementUserStateRealTimeAggregatesProd, - tweetNegativeEngagementUserStateRealTimeAggregates, - userProfileEngagementRealTimeAggregates, - newUserAuthorEngagementRealTimeAggregatesProd, - userAuthorEngagementRealTimeAggregatesProd, - listEngagementRealTimeAggregatesProd, - tweetCountryRealTimeAggregates, - tweetShareEngagementsRealTimeAggregates, - userShareEngagementsRealTimeAggregates, - userAuthorShareEngagementsRealTimeAggregates, - topicShareEngagementsRealTimeAggregates, - authorShareEngagementsRealTimeAggregates, - tweetBookmarkEngagementsRealTimeAggregates, - userBookmarkEngagementsRealTimeAggregates, - userAuthorBookmarkEngagementsRealTimeAggregates, - authorBookmarkEngagementsRealTimeAggregates, - topicCountryRealTimeAggregates, - tweetCountryPrivateEngagementsRealTimeAggregates, - userBCEDwellEngagementsRealTimeAggregates, - tweetBCEDwellEngagementsRealTimeAggregates, - authorContentPreferenceRealTimeAggregates, - authorVerifiedNegativeEngagementRealTimeAggregatesProd, - tweetVerifiedDontLikeEngagementRealTimeAggregatesProd, - tweetVerifiedNegativeEngagementCounts, - tweetVerifiedNegativeEngagementUserStateRealTimeAggregates, - tweetCountryVerifiedNegativeEngagementsRealTimeAggregates - ).map( - addFeatureFilterFromResource( - _, - "com/twitter/timelines/prediction/common/aggregates/real_time/aggregates_to_drop.txt")) - - val StagingAggregateGroups = ProdAggregateGroups.map(createStagingGroup) - - /** - * Contains the fully typed aggregate groups from which important - * values can be derived e.g. the features to be computed, halflives etc. - */ - override val ProdAggregates = ProdAggregateGroups.flatMap(_.buildTypedAggregateGroups()) - - override val StagingAggregates = StagingAggregateGroups.flatMap(_.buildTypedAggregateGroups()) - - - override val ProdCommonAggregates = ProdAggregates - .filter(_.keysToAggregate == Set(SharedFeatures.USER_ID)) - - /** - * This defines the set of selected features from a candidate - * that we'd like to send to the served features cache by TLM. - * These should include interesting and necessary features that - * cannot be extracted from LogEvents only by the real-time aggregates - * job. If you are adding new AggregateGroups requiring TLM-side - * candidate features, make sure to add them here. - */ - val candidateFeaturesToCache: Set[Feature[_]] = Set( - TimelinesSharedFeatures.SOURCE_AUTHOR_ID, - RecapFeatures.HASHTAGS, - RecapFeatures.MENTIONED_SCREEN_NAMES, - RecapFeatures.URL_DOMAINS - ) -} - -/** - * This config should only be used to access the aggregate features constructed by the - * aggregation config, and not for implementing an online real-time aggregates job. - */ -object TimelinesOnlineAggregationFeaturesOnlyConfig - extends TimelinesOnlineAggregationDefinitionsTrait { - - private[real_time] case class DummyAggregateSource(name: String, timestampFeature: Feature[JLong]) - extends AggregateSource - - private[real_time] case class DummyAggregateStore(name: String) extends AggregateStore - - override lazy val inputSource = DummyAggregateSource( - name = "timelines_rta", - timestampFeature = SharedFeatures.TIMESTAMP - ) - override lazy val ProductionStore = DummyAggregateStore("timelines_rta") - override lazy val StagingStore = DummyAggregateStore("timelines_rta") - - override lazy val AggregatesToCompute = ProdAggregates ++ StagingAggregates -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.docx new file mode 100644 index 000000000..1bad79931 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala deleted file mode 100644 index 71e97a1b1..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationSources.scala +++ /dev/null @@ -1,5 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -object TimelinesOnlineAggregationSources { - val timelinesOnlineAggregateSource = new TimelinesStormAggregateSource -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.docx new file mode 100644 index 000000000..edca2ce66 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala deleted file mode 100644 index e386d4da1..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala +++ /dev/null @@ -1,182 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.DefaultStatsReceiver -import com.twitter.summingbird.Options -import com.twitter.summingbird.online.option.FlatMapParallelism -import com.twitter.summingbird.online.option.SourceParallelism -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron._ -import com.twitter.timelines.data_processing.ml_util.transforms.DownsampleTransform -import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform -import com.twitter.timelines.data_processing.ml_util.transforms.UserDownsampleTransform - -import com.twitter.timelines.prediction.common.aggregates.BCELabelTransformFromUUADataRecord - -/** - * Sets up relevant topology parameters. Our primary goal is to handle the - * LogEvent stream and aggregate (sum) on the parsed DataRecords without falling - * behind. Our constraint is the resulting write (and read) QPS to the backing - * memcache store. - * - * If the job is falling behind, add more flatMappers and/or Summers after - * inspecting the viz panels for the respective job (go/heron-ui). An increase in - * Summers (and/or aggregation keys and features in the config) results in an - * increase in memcache QPS (go/cb and search for our cache). Adjust with CacheSize - * settings until QPS is well-controlled. - * - */ -object TimelinesRealTimeAggregatesJobConfigs extends RealTimeAggregatesJobConfigs { - import TimelinesOnlineAggregationUtils._ - - /** - * We remove input records that do not contain a label/engagement as defined in AllTweetLabels, which includes - * explicit user engagements including public, private and impression events. By avoiding ingesting records without - * engagemnts, we guarantee that no distribution shifts occur in computed aggregate features when we add a new spout - * to input aggregate sources. Counterfactual signal is still available since we aggregate on explicit dwell - * engagements. - */ - val NegativeDownsampleTransform = - DownsampleTransform( - negativeSamplingRate = 0.0, - keepLabels = AllTweetLabels, - positiveSamplingRate = 1.0) - - /** - * We downsample positive engagements for devel topology to reduce traffic, aiming for equivalent of 10% of prod traffic. - * First apply consistent downsampling to 10% of users, and then apply downsampling to remove records without - * explicit labels. We apply user-consistent sampling to more closely approximate prod query patterns. - */ - val StagingUserBasedDownsampleTransform = - UserDownsampleTransform( - availability = 1000, - featureName = "rta_devel" - ) - - override val Prod = RealTimeAggregatesJobConfig( - appId = "summingbird_timelines_rta", - topologyWorkers = 1450, - sourceCount = 120, - flatMapCount = 1800, - summerCount = 3850, - cacheSize = 200, - containerRamGigaBytes = 54, - name = "timelines_real_time_aggregates", - teamName = "timelines", - teamEmail = "", - // If one component is hitting GC limit at prod, tune componentToMetaSpaceSizeMap. - // Except for Source bolts. Tune componentToRamGigaBytesMap for Source bolts instead. - componentToMetaSpaceSizeMap = Map( - "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M", - "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M" - ), - // If either component is hitting memory limit at prod - // its memory need to increase: either increase total memory of container (containerRamGigaBytes), - // or allocate more memory for one component while keeping total memory unchanged. - componentToRamGigaBytesMap = Map( - "Tail-FlatMap-Source" -> 3, // Home source - "Tail-FlatMap-Source.2" -> 3, // Profile source - "Tail-FlatMap-Source.3" -> 3, // Search source - "Tail-FlatMap-Source.4" -> 3, // UUA source - "Tail-FlatMap" -> 8 - // Tail will use the leftover memory in the container. - // Make sure to tune topologyWorkers and containerRamGigaBytes such that this is greater than 10 GB. - ), - topologyNamedOptions = Map( - "TL_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(120)), - "PROFILE_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(30)), - "SEARCH_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(10)), - "UUA_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(10)), - "COMBINED_PRODUCER" -> Options() - .set(FlatMapParallelism(1800)) - ), - // The UUA datarecord for BCE events inputted will not have binary labels populated. - // BCELabelTransform will set the datarecord with binary BCE dwell labels features based on the corresponding dwell_time_ms. - // It's important to have the BCELabelTransformFromUUADataRecord before ProdNegativeDownsampleTransform - // because ProdNegativeDownsampleTransform will remove datarecord that contains no features from AllTweetLabels. - onlinePreTransforms = - Seq(RichITransform(BCELabelTransformFromUUADataRecord), NegativeDownsampleTransform) - ) - - /** - * we downsample 10% computation of devel RTA based on [[StagingNegativeDownsampleTransform]]. - * To better test scalability of topology, we reduce computing resource of components "Tail-FlatMap" - * and "Tail" to be 10% of prod but keep computing resource of component "Tail-FlatMap-Source" unchanged. - * hence flatMapCount=110, summerCount=105 and sourceCount=100. Hence topologyWorkers =(110+105+100)/5 = 63. - */ - override val Devel = RealTimeAggregatesJobConfig( - appId = "summingbird_timelines_rta_devel", - topologyWorkers = 120, - sourceCount = 120, - flatMapCount = 150, - summerCount = 300, - cacheSize = 200, - containerRamGigaBytes = 54, - name = "timelines_real_time_aggregates_devel", - teamName = "timelines", - teamEmail = "", - // If one component is hitting GC limit at prod, tune componentToMetaSpaceSizeMap - // Except for Source bolts. Tune componentToRamGigaBytesMap for Source bolts instead. - componentToMetaSpaceSizeMap = Map( - "Tail-FlatMap" -> "-XX:MaxMetaspaceSize=1024M -XX:MetaspaceSize=1024M", - "Tail" -> "-XX:MaxMetaspaceSize=2560M -XX:MetaspaceSize=2560M" - ), - // If either component is hitting memory limit at prod - // its memory need to increase: either increase total memory of container (containerRamGigaBytes), - // or allocate more memory for one component while keeping total memory unchanged. - componentToRamGigaBytesMap = Map( - "Tail-FlatMap-Source" -> 3, // Home source - "Tail-FlatMap-Source.2" -> 3, // Profile source - "Tail-FlatMap-Source.3" -> 3, // Search source - "Tail-FlatMap-Source.4" -> 3, // UUA source - "Tail-FlatMap" -> 8 - // Tail will use the leftover memory in the container. - // Make sure to tune topologyWorkers and containerRamGigaBytes such that this is greater than 10 GB. - ), - topologyNamedOptions = Map( - "TL_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(120)), - "PROFILE_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(30)), - "SEARCH_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(10)), - "UUA_EVENTS_SOURCE" -> Options() - .set(SourceParallelism(10)), - "COMBINED_PRODUCER" -> Options() - .set(FlatMapParallelism(150)) - ), - // It's important to have the BCELabelTransformFromUUADataRecord before ProdNegativeDownsampleTransform - onlinePreTransforms = Seq( - StagingUserBasedDownsampleTransform, - RichITransform(BCELabelTransformFromUUADataRecord), - NegativeDownsampleTransform), - enableUserReindexingNighthawkBtreeStore = true, - enableUserReindexingNighthawkHashStore = true, - userReindexingNighthawkBtreeStoreConfig = NighthawkUnderlyingStoreConfig( - serversetPath = - "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_btree_test_api", - // NOTE: table names are prefixed to every pkey so keep it short - tableName = "u_r_v1", // (u)ser_(r)eindexing_v1 - // keep ttl <= 1 day because it's keyed on user, and we will have limited hit rates beyond 1 day - cacheTTL = 1.day - ), - userReindexingNighthawkHashStoreConfig = NighthawkUnderlyingStoreConfig( - // For prod: "/s/cache-user/nighthawk_timelines_real_time_aggregates_hash_api", - serversetPath = - "/twitter/service/cache-user/test/nighthawk_timelines_real_time_aggregates_hash_test_api", - // NOTE: table names are prefixed to every pkey so keep it short - tableName = "u_r_v1", // (u)ser_(r)eindexing_v1 - // keep ttl <= 1 day because it's keyed on user, and we will have limited hit rates beyond 1 day - cacheTTL = 1.day - ) - ) -} - -object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase { - override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates") - override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs - override lazy val aggregatesToCompute = TimelinesOnlineAggregationConfig.AggregatesToCompute -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.docx new file mode 100644 index 000000000..ec9eaa180 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala deleted file mode 100644 index 2e096dc07..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala +++ /dev/null @@ -1,185 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.clientapp.thriftscala.LogEvent -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.stats.Counter -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.constant.SharedFeatures -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.summingbird._ -import com.twitter.summingbird.storm.Storm -import com.twitter.summingbird_internal.sources.AppId -import com.twitter.summingbird_internal.sources.storm.remote.ClientEventSourceScrooge2 -import com.twitter.timelines.data_processing.ad_hoc.suggests.common.AllScribeProcessor -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.StormAggregateSource -import com.twitter.timelines.prediction.adapters.client_log_event.ClientLogEventAdapter -import com.twitter.timelines.prediction.adapters.client_log_event.ProfileClientLogEventAdapter -import com.twitter.timelines.prediction.adapters.client_log_event.SearchClientLogEventAdapter -import com.twitter.timelines.prediction.adapters.client_log_event.UuaEventAdapter -import com.twitter.unified_user_actions.client.config.KafkaConfigs -import com.twitter.unified_user_actions.client.summingbird.UnifiedUserActionsSourceScrooge -import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction -import scala.collection.JavaConverters._ - -/** - * Storm Producer for client events generated on Home, Profile, and Search - */ -class TimelinesStormAggregateSource extends StormAggregateSource { - - override val name = "timelines_rta" - override val timestampFeature = SharedFeatures.TIMESTAMP - - private lazy val TimelinesClientEventSourceName = "TL_EVENTS_SOURCE" - private lazy val ProfileClientEventSourceName = "PROFILE_EVENTS_SOURCE" - private lazy val SearchClientEventSourceName = "SEARCH_EVENTS_SOURCE" - private lazy val UuaEventSourceName = "UUA_EVENTS_SOURCE" - private lazy val CombinedProducerName = "COMBINED_PRODUCER" - private lazy val FeatureStoreProducerName = "FEATURE_STORE_PRODUCER" - - private def isNewUserEvent(event: LogEvent): Boolean = { - event.logBase.flatMap(_.userId).flatMap(SnowflakeId.timeFromIdOpt).exists(_.untilNow < 30.days) - } - - private def mkDataRecords(event: LogEvent, dataRecordCounter: Counter): Seq[DataRecord] = { - val dataRecords: Seq[DataRecord] = - if (AllScribeProcessor.isValidSuggestTweetEvent(event)) { - ClientLogEventAdapter.adaptToDataRecords(event).asScala - } else { - Seq.empty[DataRecord] - } - dataRecordCounter.incr(dataRecords.size) - dataRecords - } - - private def mkProfileDataRecords( - event: LogEvent, - dataRecordCounter: Counter - ): Seq[DataRecord] = { - val dataRecords: Seq[DataRecord] = - ProfileClientLogEventAdapter.adaptToDataRecords(event).asScala - dataRecordCounter.incr(dataRecords.size) - dataRecords - } - - private def mkSearchDataRecords( - event: LogEvent, - dataRecordCounter: Counter - ): Seq[DataRecord] = { - val dataRecords: Seq[DataRecord] = - SearchClientLogEventAdapter.adaptToDataRecords(event).asScala - dataRecordCounter.incr(dataRecords.size) - dataRecords - } - - private def mkUuaDataRecords( - event: UnifiedUserAction, - dataRecordCounter: Counter - ): Seq[DataRecord] = { - val dataRecords: Seq[DataRecord] = - UuaEventAdapter.adaptToDataRecords(event).asScala - dataRecordCounter.incr(dataRecords.size) - dataRecords - } - - override def build( - statsReceiver: StatsReceiver, - jobConfig: RealTimeAggregatesJobConfig - ): Producer[Storm, DataRecord] = { - lazy val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName) - lazy val dataRecordCounter = scopedStatsReceiver.counter("dataRecord") - - // Home Timeline Engagements - // Step 1: => LogEvent - lazy val clientEventProducer: Producer[Storm, HomeEvent[LogEvent]] = - ClientEventSourceScrooge2( - appId = AppId(jobConfig.appId), - topic = "julep_client_event_suggests", - resumeAtLastReadOffset = false, - enableTls = true - ).source.map(HomeEvent[LogEvent]).name(TimelinesClientEventSourceName) - - // Profile Engagements - // Step 1: => LogEvent - lazy val profileClientEventProducer: Producer[Storm, ProfileEvent[LogEvent]] = - ClientEventSourceScrooge2( - appId = AppId(jobConfig.appId), - topic = "julep_client_event_profile_real_time_engagement_metrics", - resumeAtLastReadOffset = false, - enableTls = true - ).source - .map(ProfileEvent[LogEvent]) - .name(ProfileClientEventSourceName) - - // Search Engagements - // Step 1: => LogEvent - // Only process events for all users to save resource - lazy val searchClientEventProducer: Producer[Storm, SearchEvent[LogEvent]] = - ClientEventSourceScrooge2( - appId = AppId(jobConfig.appId), - topic = "julep_client_event_search_real_time_engagement_metrics", - resumeAtLastReadOffset = false, - enableTls = true - ).source - .map(SearchEvent[LogEvent]) - .name(SearchClientEventSourceName) - - // Unified User Actions (includes Home and other product surfaces) - lazy val uuaEventProducer: Producer[Storm, UuaEvent[UnifiedUserAction]] = - UnifiedUserActionsSourceScrooge( - appId = AppId(jobConfig.appId), - parallelism = 10, - kafkaConfig = KafkaConfigs.ProdUnifiedUserActionsEngagementOnly - ).source - .filter(StormAggregateSourceUtils.isUuaBCEEventsFromHome(_)) - .map(UuaEvent[UnifiedUserAction]) - .name(UuaEventSourceName) - - // Combined - // Step 2: - // (a) Combine - // (b) Transform LogEvent => Seq[DataRecord] - // (c) Apply sampler - lazy val combinedClientEventDataRecordProducer: Producer[Storm, Event[DataRecord]] = - profileClientEventProducer // This becomes the bottom branch - .merge(clientEventProducer) // This becomes the middle branch - .merge(searchClientEventProducer) - .merge(uuaEventProducer) // This becomes the top - .flatMap { // LogEvent => Seq[DataRecord] - case e: HomeEvent[LogEvent] => - mkDataRecords(e.event, dataRecordCounter).map(HomeEvent[DataRecord]) - case e: ProfileEvent[LogEvent] => - mkProfileDataRecords(e.event, dataRecordCounter).map(ProfileEvent[DataRecord]) - case e: SearchEvent[LogEvent] => - mkSearchDataRecords(e.event, dataRecordCounter).map(SearchEvent[DataRecord]) - case e: UuaEvent[UnifiedUserAction] => - mkUuaDataRecords( - e.event, - dataRecordCounter - ).map(UuaEvent[DataRecord]) - } - .flatMap { // Apply sampler - case e: HomeEvent[DataRecord] => - jobConfig.sequentiallyTransform(e.event).map(HomeEvent[DataRecord]) - case e: ProfileEvent[DataRecord] => - jobConfig.sequentiallyTransform(e.event).map(ProfileEvent[DataRecord]) - case e: SearchEvent[DataRecord] => - jobConfig.sequentiallyTransform(e.event).map(SearchEvent[DataRecord]) - case e: UuaEvent[DataRecord] => - jobConfig.sequentiallyTransform(e.event).map(UuaEvent[DataRecord]) - } - .name(CombinedProducerName) - - // Step 3: Join with Feature Store features - lazy val featureStoreDataRecordProducer: Producer[Storm, DataRecord] = - StormAggregateSourceUtils - .wrapByFeatureStoreClient( - underlyingProducer = combinedClientEventDataRecordProducer, - jobConfig = jobConfig, - scopedStatsReceiver = scopedStatsReceiver - ).map(_.event).name(FeatureStoreProducerName) - - featureStoreDataRecordProducer - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.docx new file mode 100644 index 000000000..576bcd985 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala deleted file mode 100644 index 0d5c06d7c..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesAdapter.scala +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.Feature -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.featurestore.catalog.entities.core.Tweet -import com.twitter.ml.featurestore.catalog.features.trends.TweetTrendsScores -import com.twitter.ml.featurestore.lib.TweetId -import com.twitter.ml.featurestore.lib.data.PredictionRecord -import com.twitter.ml.featurestore.lib.data.PredictionRecordAdapter -import com.twitter.ml.featurestore.lib.feature.BoundFeature -import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet -import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase -import java.util -import scala.collection.JavaConverters._ - -object TweetFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { - - private val ContinuousFeatureMap: Map[BoundFeature[TweetId, Double], Feature.Continuous] = Map() - - val TweetFeaturesSet: BoundFeatureSet = new BoundFeatureSet(ContinuousFeatureMap.keys.toSet) - - val AllFeatures: Seq[Feature[_]] = - ContinuousFeatureMap.values.toSeq - - private val adapter = PredictionRecordAdapter.oneToOne(TweetFeaturesSet) - - override def getFeatureContext: FeatureContext = new FeatureContext(AllFeatures: _*) - - override def commonFeatures: Set[Feature[_]] = Set.empty - - override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { - List(adapter.adaptToDataRecord(record)).asJava - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.docx new file mode 100644 index 000000000..065940779 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala deleted file mode 100644 index b461e179a..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TweetFeaturesReadableStore.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.ml.api.DataRecord -import com.twitter.ml.featurestore.lib.TweetId -import com.twitter.ml.featurestore.lib.data.PredictionRecord -import com.twitter.ml.featurestore.lib.entity.Entity -import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest} -import com.twitter.storehaus.ReadableStore -import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase -import com.twitter.util.Future -import scala.collection.JavaConverters._ - -class TweetFeaturesReadableStore( - featureStoreClient: FeatureStoreClient, - tweetEntity: Entity[TweetId], - tweetFeaturesAdapter: TimelinesAdapterBase[PredictionRecord]) - extends ReadableStore[Set[Long], DataRecord] { - - override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = { - val orderedKeys: Seq[K] = keys.toSeq - val featureStoreRequests: Seq[FeatureStoreRequest] = getFeatureStoreRequests(orderedKeys) - val predictionRecordsFut: Future[Seq[PredictionRecord]] = featureStoreClient( - featureStoreRequests) - - getDataRecordMap(orderedKeys, predictionRecordsFut) - } - - private def getFeatureStoreRequests[K <: Set[Long]]( - orderedKeys: Seq[K] - ): Seq[FeatureStoreRequest] = { - orderedKeys.map { key: Set[Long] => - FeatureStoreRequest( - entityIds = key.map { tweetId => tweetEntity.withId(TweetId(tweetId)) }.toSeq - ) - } - } - - private def getDataRecordMap[K <: Set[Long]]( - orderedKeys: Seq[K], - predictionRecordsFut: Future[Seq[PredictionRecord]] - ): Map[K, Future[Option[DataRecord]]] = { - orderedKeys.zipWithIndex.map { - case (tweetIdSet, index) => - val dataRecordFutOpt: Future[Option[DataRecord]] = predictionRecordsFut.map { - predictionRecords => - predictionRecords.lift(index).flatMap { predictionRecordAtIndex: PredictionRecord => - tweetFeaturesAdapter.adaptToDataRecords(predictionRecordAtIndex).asScala.headOption - } - } - (tweetIdSet, dataRecordFutOpt) - }.toMap - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.docx new file mode 100644 index 000000000..8357fa66a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala deleted file mode 100644 index 92b6618e4..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TypeSafeRunner.scala +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.summingbird_internal.runner.storm.GenericRunner - -object TypeSafeRunner { - def main(args: Array[String]): Unit = GenericRunner(args, TimelinesRealTimeAggregatesJob(_)) -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.docx new file mode 100644 index 000000000..29ec43b5a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala deleted file mode 100644 index 8ff39938c..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesAdapter.scala +++ /dev/null @@ -1,108 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType.InferredGender -import com.twitter.dal.personal_data.thriftjava.PersonalDataType.UserState -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Text -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.Feature -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.api.RichDataRecord -import com.twitter.ml.featurestore.catalog.entities.core.User -import com.twitter.ml.featurestore.catalog.features.core.UserAccount -import com.twitter.ml.featurestore.catalog.features.geo.UserLocation -import com.twitter.ml.featurestore.catalog.features.magicrecs.UserActivity -import com.twitter.ml.featurestore.lib.EntityId -import com.twitter.ml.featurestore.lib.data.PredictionRecord -import com.twitter.ml.featurestore.lib.feature.BoundFeature -import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet -import com.twitter.ml.featurestore.lib.UserId -import com.twitter.ml.featurestore.lib.{Discrete => FSDiscrete} -import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase -import com.twitter.timelines.prediction.features.user_health.UserHealthFeatures -import java.lang.{Boolean => JBoolean} -import java.lang.{String => JString} -import java.util -import scala.collection.JavaConverters._ - -object UserFeaturesAdapter extends TimelinesAdapterBase[PredictionRecord] { - val UserStateBoundFeature: BoundFeature[UserId, FSDiscrete] = UserActivity.UserState.bind(User) - - /** - * Boolean features about viewer's user state. - * enum UserState { - * NEW = 0, - * NEAR_ZERO = 1, - * VERY_LIGHT = 2, - * LIGHT = 3, - * MEDIUM_TWEETER = 4, - * MEDIUM_NON_TWEETER = 5, - * HEAVY_NON_TWEETER = 6, - * HEAVY_TWEETER = 7 - * }(persisted='true') - */ - val IS_USER_NEW = new Binary("timelines.user_state.is_user_new", Set(UserState).asJava) - val IS_USER_LIGHT = new Binary("timelines.user_state.is_user_light", Set(UserState).asJava) - val IS_USER_MEDIUM_TWEETER = - new Binary("timelines.user_state.is_user_medium_tweeter", Set(UserState).asJava) - val IS_USER_MEDIUM_NON_TWEETER = - new Binary("timelines.user_state.is_user_medium_non_tweeter", Set(UserState).asJava) - val IS_USER_HEAVY_NON_TWEETER = - new Binary("timelines.user_state.is_user_heavy_non_tweeter", Set(UserState).asJava) - val IS_USER_HEAVY_TWEETER = - new Binary("timelines.user_state.is_user_heavy_tweeter", Set(UserState).asJava) - val userStateToFeatureMap: Map[Long, Binary] = Map( - 0L -> IS_USER_NEW, - 1L -> IS_USER_LIGHT, - 2L -> IS_USER_LIGHT, - 3L -> IS_USER_LIGHT, - 4L -> IS_USER_MEDIUM_TWEETER, - 5L -> IS_USER_MEDIUM_NON_TWEETER, - 6L -> IS_USER_HEAVY_NON_TWEETER, - 7L -> IS_USER_HEAVY_TWEETER - ) - - val UserStateBooleanFeatures: Set[Feature[_]] = userStateToFeatureMap.values.toSet - - - val USER_COUNTRY_ID = new Text("geo.user_location.country_code") - val UserCountryCodeFeature: BoundFeature[UserId, String] = - UserLocation.CountryCodeAlpha2.bind(User) - val UserLocationFeatures: Set[Feature[_]] = Set(USER_COUNTRY_ID) - - private val UserVerifiedFeaturesSet = Set( - UserAccount.IsUserVerified.bind(User), - UserAccount.IsUserBlueVerified.bind(User), - UserAccount.IsUserGoldVerified.bind(User), - UserAccount.IsUserGrayVerified.bind(User) - ) - - val UserFeaturesSet: BoundFeatureSet = - BoundFeatureSet(UserStateBoundFeature, UserCountryCodeFeature) ++ - BoundFeatureSet(UserVerifiedFeaturesSet.asInstanceOf[Set[BoundFeature[_ <: EntityId, _]]]) - - private val allFeatures: Seq[Feature[_]] = - UserStateBooleanFeatures.toSeq ++ GenderBooleanFeatures.toSeq ++ - UserLocationFeatures.toSeq ++ Seq(UserHealthFeatures.IsUserVerifiedUnion) - - override def getFeatureContext: FeatureContext = new FeatureContext(allFeatures: _*) - override def commonFeatures: Set[Feature[_]] = Set.empty - - override def adaptToDataRecords(record: PredictionRecord): util.List[DataRecord] = { - val newRecord = new RichDataRecord(new DataRecord) - record - .getFeatureValue(UserStateBoundFeature) - .flatMap { userState => userStateToFeatureMap.get(userState.value) }.foreach { - booleanFeature => newRecord.setFeatureValue[JBoolean](booleanFeature, true) - } - record.getFeatureValue(UserCountryCodeFeature).foreach { countryCodeFeatureValue => - newRecord.setFeatureValue[JString](USER_COUNTRY_ID, countryCodeFeatureValue) - } - - val isUserVerifiedUnion = - UserVerifiedFeaturesSet.exists(feature => record.getFeatureValue(feature).getOrElse(false)) - newRecord.setFeatureValue[JBoolean](UserHealthFeatures.IsUserVerifiedUnion, isUserVerifiedUnion) - - List(newRecord.getRecord).asJava - } -} diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.docx b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.docx new file mode 100644 index 000000000..3769b54e5 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala b/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala deleted file mode 100644 index c1931c32b..000000000 --- a/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/UserFeaturesReadableStore.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.timelines.prediction.common.aggregates.real_time - -import com.twitter.ml.api.DataRecord -import com.twitter.ml.featurestore.lib.UserId -import com.twitter.ml.featurestore.lib.data.PredictionRecord -import com.twitter.ml.featurestore.lib.entity.Entity -import com.twitter.ml.featurestore.lib.online.{FeatureStoreClient, FeatureStoreRequest} -import com.twitter.storehaus.ReadableStore -import com.twitter.timelines.prediction.common.adapters.TimelinesAdapterBase -import com.twitter.util.Future -import scala.collection.JavaConverters._ - -class UserFeaturesReadableStore( - featureStoreClient: FeatureStoreClient, - userEntity: Entity[UserId], - userFeaturesAdapter: TimelinesAdapterBase[PredictionRecord]) - extends ReadableStore[Set[Long], DataRecord] { - - override def multiGet[K <: Set[Long]](keys: Set[K]): Map[K, Future[Option[DataRecord]]] = { - val orderedKeys = keys.toSeq - val featureStoreRequests: Seq[FeatureStoreRequest] = orderedKeys.map { key: Set[Long] => - FeatureStoreRequest( - entityIds = key.map(userId => userEntity.withId(UserId(userId))).toSeq - ) - } - val predictionRecordsFut: Future[Seq[PredictionRecord]] = featureStoreClient( - featureStoreRequests) - - orderedKeys.zipWithIndex.map { - case (userId, index) => - val dataRecordFutOpt = predictionRecordsFut.map { predictionRecords => - userFeaturesAdapter.adaptToDataRecords(predictionRecords(index)).asScala.headOption - } - (userId, dataRecordFutOpt) - }.toMap - } -} diff --git a/src/scala/com/twitter/timelines/prediction/features/README.docx b/src/scala/com/twitter/timelines/prediction/features/README.docx new file mode 100644 index 000000000..7db5ccc23 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/README.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/README.md b/src/scala/com/twitter/timelines/prediction/features/README.md deleted file mode 100644 index d42639a77..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/README.md +++ /dev/null @@ -1,6 +0,0 @@ -## Prediction Features - -This directory contains a collection of `Features` (`com.twitter.ml.api.Feature`) which are definitions of feature names and datatypes which allow the features to be efficiently processed and passed to the different ranking models. -By predefining the features with their names and datatypes, when features are being generated, scribed or used to score they can be identified with only a hash of their name. - -Not all of these features are used in the model, many are experimental or deprecated. \ No newline at end of file diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD b/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD deleted file mode 100644 index 3d3c34092..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/scala/com/twitter/suggests/controller_data", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/timelineservice/server/suggests/logging:thrift-scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD.docx new file mode 100644 index 000000000..b0654e23a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/client_log_event/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.docx new file mode 100644 index 000000000..8202740c3 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala deleted file mode 100644 index cccb99998..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/client_log_event/ClientLogEventDataRecordFeatures.scala +++ /dev/null @@ -1,169 +0,0 @@ -package com.twitter.timelines.prediction.features.client_log_event - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.Discrete -import scala.collection.JavaConverters._ -import com.twitter.timelineservice.suggests.logging.candidate_tweet_source_id.thriftscala.CandidateTweetSourceId - -object ClientLogEventDataRecordFeatures { - val HasConsumerVideo = new Binary( - "client_log_event.tweet.has_consumer_video", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val PhotoCount = new Continuous( - "client_log_event.tweet.photo_count", - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val HasImage = new Binary( - "client_log_event.tweet.has_image", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val IsReply = - new Binary("client_log_event.tweet.is_reply", Set(PublicReplies, PrivateReplies).asJava) - val IsRetweet = - new Binary("client_log_event.tweet.is_retweet", Set(PublicRetweets, PrivateRetweets).asJava) - val IsPromoted = - new Binary( - "client_log_event.tweet.is_promoted", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HasVisibleLink = new Binary( - "client_log_event.tweet.has_visible_link", - Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HasHashtag = new Binary( - "client_log_event.tweet.has_hashtag", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val FromMutualFollow = new Binary("client_log_event.tweet.from_mutual_follow") - val IsInNetwork = new Binary("client_log_event.tweet.is_in_network") - val IsNotInNetwork = new Binary("client_log_event.tweet.is_not_in_network") - val FromRecap = new Binary("client_log_event.tweet.from_recap") - val FromRecycled = new Binary("client_log_event.tweet.from_recycled") - val FromActivity = new Binary("client_log_event.tweet.from_activity") - val FromSimcluster = new Binary("client_log_event.tweet.from_simcluster") - val FromErg = new Binary("client_log_event.tweet.from_erg") - val FromCroon = new Binary("client_log_event.tweet.from_croon") - val FromList = new Binary("client_log_event.tweet.from_list") - val FromRecTopic = new Binary("client_log_event.tweet.from_rec_topic") - val InjectedPosition = new Discrete("client_log_event.tweet.injectedPosition") - val TextOnly = new Binary("client_log_event.tweet.text_only") - val HasLikedBySocialContext = new Binary("client_log_event.tweet.has_liked_by_social_context") - val HasFollowedBySocialContext = new Binary( - "client_log_event.tweet.has_followed_by_social_context") - val HasTopicSocialContext = new Binary("client_log_event.tweet.has_topic_social_context") - val IsFollowedTopicTweet = new Binary("client_log_event.tweet.is_followed_topic_tweet") - val IsRecommendedTopicTweet = new Binary("client_log_event.tweet.is_recommended_topic_tweet") - val IsTweetAgeLessThan15Seconds = new Binary( - "client_log_event.tweet.tweet_age_less_than_15_seconds") - val IsTweetAgeLessThanOrEqualTo30Minutes = new Binary( - "client_log_event.tweet.tweet_age_lte_30_minutes") - val IsTweetAgeLessThanOrEqualTo1Hour = new Binary("client_log_event.tweet.tweet_age_lte_1_hour") - val IsTweetAgeLessThanOrEqualTo6Hours = new Binary("client_log_event.tweet.tweet_age_lte_6_hours") - val IsTweetAgeLessThanOrEqualTo12Hours = new Binary( - "client_log_event.tweet.tweet_age_lte_12_hours") - val IsTweetAgeGreaterThanOrEqualTo24Hours = new Binary( - "client_log_event.tweet.tweet_age_gte_24_hours") - val HasGreaterThanOrEqualTo100Favs = new Binary("client_log_event.tweet.has_gte_100_favs") - val HasGreaterThanOrEqualTo1KFavs = new Binary("client_log_event.tweet.has_gte_1k_favs") - val HasGreaterThanOrEqualTo10KFavs = new Binary("client_log_event.tweet.has_gte_10k_favs") - val HasGreaterThanOrEqualTo100KFavs = new Binary("client_log_event.tweet.has_gte_100k_favs") - val HasGreaterThanOrEqualTo10Retweets = new Binary("client_log_event.tweet.has_gte_10_retweets") - val HasGreaterThanOrEqualTo100Retweets = new Binary("client_log_event.tweet.has_gte_100_retweets") - val HasGreaterThanOrEqualTo1KRetweets = new Binary("client_log_event.tweet.has_gte_1k_retweets") - - val TweetTypeToFeatureMap: Map[String, Binary] = Map( - "link" -> HasVisibleLink, - "hashtag" -> HasHashtag, - "mutual_follow" -> FromMutualFollow, - "in_network" -> IsInNetwork, - "text_only" -> TextOnly, - "has_liked_by_social_context" -> HasLikedBySocialContext, - "has_followed_by_social_context" -> HasFollowedBySocialContext, - "has_topic_social_context" -> HasTopicSocialContext, - "is_followed_topic_tweet" -> IsFollowedTopicTweet, - "is_recommended_topic_tweet" -> IsRecommendedTopicTweet, - "tweet_age_less_than_15_seconds" -> IsTweetAgeLessThan15Seconds, - "tweet_age_lte_30_minutes" -> IsTweetAgeLessThanOrEqualTo30Minutes, - "tweet_age_lte_1_hour" -> IsTweetAgeLessThanOrEqualTo1Hour, - "tweet_age_lte_6_hours" -> IsTweetAgeLessThanOrEqualTo6Hours, - "tweet_age_lte_12_hours" -> IsTweetAgeLessThanOrEqualTo12Hours, - "tweet_age_gte_24_hours" -> IsTweetAgeGreaterThanOrEqualTo24Hours, - "has_gte_100_favs" -> HasGreaterThanOrEqualTo100Favs, - "has_gte_1k_favs" -> HasGreaterThanOrEqualTo1KFavs, - "has_gte_10k_favs" -> HasGreaterThanOrEqualTo10KFavs, - "has_gte_100k_favs" -> HasGreaterThanOrEqualTo100KFavs, - "has_gte_10_retweets" -> HasGreaterThanOrEqualTo10Retweets, - "has_gte_100_retweets" -> HasGreaterThanOrEqualTo100Retweets, - "has_gte_1k_retweets" -> HasGreaterThanOrEqualTo1KRetweets - ) - - val CandidateTweetSourceIdFeatureMap: Map[Int, Binary] = Map( - CandidateTweetSourceId.RecapTweet.value -> FromRecap, - CandidateTweetSourceId.RecycledTweet.value -> FromRecycled, - CandidateTweetSourceId.RecommendedTweet.value -> FromActivity, - CandidateTweetSourceId.Simcluster.value -> FromSimcluster, - CandidateTweetSourceId.ErgTweet.value -> FromErg, - CandidateTweetSourceId.CroonTopicTweet.value -> FromCroon, - CandidateTweetSourceId.CroonTweet.value -> FromCroon, - CandidateTweetSourceId.ListTweet.value -> FromList, - CandidateTweetSourceId.RecommendedTopicTweet.value -> FromRecTopic - ) - - val TweetFeaturesV2: Set[Feature[_]] = Set( - HasImage, - IsReply, - IsRetweet, - HasVisibleLink, - HasHashtag, - FromMutualFollow, - IsInNetwork - ) - - val ContentTweetTypeFeatures: Set[Feature[_]] = Set( - HasImage, - HasVisibleLink, - HasHashtag, - TextOnly, - HasVisibleLink - ) - - val FreshnessTweetTypeFeatures: Set[Feature[_]] = Set( - IsTweetAgeLessThan15Seconds, - IsTweetAgeLessThanOrEqualTo30Minutes, - IsTweetAgeLessThanOrEqualTo1Hour, - IsTweetAgeLessThanOrEqualTo6Hours, - IsTweetAgeLessThanOrEqualTo12Hours, - IsTweetAgeGreaterThanOrEqualTo24Hours - ) - - val SocialProofTweetTypeFeatures: Set[Feature[_]] = Set( - HasLikedBySocialContext, - HasFollowedBySocialContext, - HasTopicSocialContext - ) - - val TopicTweetPreferenceTweetTypeFeatures: Set[Feature[_]] = Set( - IsFollowedTopicTweet, - IsRecommendedTopicTweet - ) - - val TweetPopularityTweetTypeFeatures: Set[Feature[_]] = Set( - HasGreaterThanOrEqualTo100Favs, - HasGreaterThanOrEqualTo1KFavs, - HasGreaterThanOrEqualTo10KFavs, - HasGreaterThanOrEqualTo100KFavs, - HasGreaterThanOrEqualTo10Retweets, - HasGreaterThanOrEqualTo100Retweets, - HasGreaterThanOrEqualTo1KRetweets - ) - - val UserGraphInteractionTweetTypeFeatures: Set[Feature[_]] = Set( - IsInNetwork, - FromMutualFollow, - IsNotInNetwork, - IsPromoted - ) - - val UserContentPreferenceTweetTypeFeatures: Set[Feature[_]] = - ContentTweetTypeFeatures ++ FreshnessTweetTypeFeatures ++ SocialProofTweetTypeFeatures ++ TopicTweetPreferenceTweetTypeFeatures ++ TweetPopularityTweetTypeFeatures ++ UserGraphInteractionTweetTypeFeatures - val AuthorContentPreferenceTweetTypeFeatures: Set[Feature[_]] = - Set(IsInNetwork, FromMutualFollow, IsNotInNetwork) ++ ContentTweetTypeFeatures -} diff --git a/src/scala/com/twitter/timelines/prediction/features/common/BUILD b/src/scala/com/twitter/timelines/prediction/features/common/BUILD deleted file mode 100644 index bfbe764c7..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/common/BUILD +++ /dev/null @@ -1,11 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/ml/api:data-java", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/common/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/common/BUILD.docx new file mode 100644 index 000000000..833ed8363 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/common/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.docx new file mode 100644 index 000000000..284804225 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala deleted file mode 100644 index d995fe2b0..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/common/CombinedFeatures.scala +++ /dev/null @@ -1,536 +0,0 @@ -package com.twitter.timelines.prediction.features.common - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature -import com.twitter.ml.api.FeatureType -import com.twitter.ml.api.Feature.Binary -import java.lang.{Boolean => JBoolean} -import scala.collection.JavaConverters._ - -object CombinedFeatures { - val IS_CLICKED = - new Binary("timelines.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_DWELLED = - new Binary("timelines.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) - val IS_DWELLED_IN_BOUNDS_V1 = new Binary( - "timelines.engagement.is_dwelled_in_bounds_v1", - Set(TweetsViewed, EngagementsPrivate).asJava) - val IS_FAVORITED = new Binary( - "timelines.engagement.is_favorited", - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_FOLLOWED = new Binary( - "timelines.engagement.is_followed", - Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) - val IS_IMPRESSED = - new Binary("timelines.engagement.is_impressed", Set(TweetsViewed, EngagementsPrivate).asJava) - val IS_OPEN_LINKED = new Binary( - "timelines.engagement.is_open_linked", - Set(EngagementsPrivate, LinksClickedOn).asJava) - val IS_PHOTO_EXPANDED = new Binary( - "timelines.engagement.is_photo_expanded", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED = new Binary( - "timelines.engagement.is_profile_clicked", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_QUOTED = new Binary( - "timelines.engagement.is_quoted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED = new Binary( - "timelines.engagement.is_replied", - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_RETWEETED = new Binary( - "timelines.engagement.is_retweeted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_RETWEETED_WITHOUT_QUOTE = new Binary( - "timelines.enagagement.is_retweeted_without_quote", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_SHARE_DM_CLICKED = - new Binary("timelines.engagement.is_tweet_share_dm_clicked", Set(EngagementsPrivate).asJava) - val IS_SHARE_DM_SENT = - new Binary("timelines.engagement.is_tweet_share_dm_sent", Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_25 = new Binary( - "timelines.engagement.is_video_playback_25", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_50 = new Binary( - "timelines.engagement.is_video_playback_50", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_75 = new Binary( - "timelines.engagement.is_video_playback_75", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_95 = new Binary( - "timelines.engagement.is_video_playback_95", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_COMPLETE = new Binary( - "timelines.engagement.is_video_playback_complete", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_START = new Binary( - "timelines.engagement.is_video_playback_start", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_VIEWED = new Binary( - "timelines.engagement.is_video_viewed", - Set(MediaEngagementActivities, EngagementsPrivate).asJava) - val IS_VIDEO_QUALITY_VIEWED = new Binary( - "timelines.engagement.is_video_quality_viewed", - Set(MediaEngagementActivities, EngagementsPrivate).asJava - ) - // v1: post click engagements: fav, reply - val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied", - Set( - TweetsClicked, - PublicLikes, - PrivateLikes, - PublicReplies, - PrivateReplies, - EngagementsPrivate, - EngagementsPublic).asJava) - // v2: post click engagements: click - val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_v2", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs", - Set( - TweetsClicked, - PublicLikes, - PrivateLikes, - PublicReplies, - PrivateReplies, - EngagementsPrivate, - EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_favorited", - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_replied", - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_retweeted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_clicked", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_followed", - Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_share_dm_clicked", - Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_profile_clicked", - Set(EngagementsPrivate).asJava) - - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_uam_gt_0", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_uam_gt_1", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_uam_gt_2", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( - "timelines.engagement.is_good_clicked_convo_desc_uam_gt_3", - Set(EngagementsPrivate, EngagementsPublic).asJava) - - val IS_TWEET_DETAIL_DWELLED = new Binary( - "timelines.engagement.is_tweet_detail_dwelled", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( - "timelines.engagement.is_tweet_detail_dwelled_8_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( - "timelines.engagement.is_tweet_detail_dwelled_15_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( - "timelines.engagement.is_tweet_detail_dwelled_25_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( - "timelines.engagement.is_tweet_detail_dwelled_30_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_PROFILE_DWELLED = new Binary( - "timelines.engagement.is_profile_dwelled", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_10_SEC = new Binary( - "timelines.engagement.is_profile_dwelled_10_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_20_SEC = new Binary( - "timelines.engagement.is_profile_dwelled_20_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_30_SEC = new Binary( - "timelines.engagement.is_profile_dwelled_30_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( - "timelines.engagement.is_fullscreen_video_dwelled", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( - "timelines.engagement.is_fullscreen_video_dwelled_5_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( - "timelines.engagement.is_fullscreen_video_dwelled_10_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( - "timelines.engagement.is_fullscreen_video_dwelled_20_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( - "timelines.engagement.is_fullscreen_video_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_15_SEC = new Binary( - "timelines.engagement.is_link_dwelled_15_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_30_SEC = new Binary( - "timelines.engagement.is_link_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_60_SEC = new Binary( - "timelines.engagement.is_link_dwelled_60_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_HOME_LATEST_VISITED = - new Binary("timelines.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava) - - val IS_BOOKMARKED = - new Binary("timelines.engagement.is_bookmarked", Set(EngagementsPrivate).asJava) - val IS_SHARED = - new Binary("timelines.engagement.is_shared", Set(EngagementsPrivate).asJava) - val IS_SHARE_MENU_CLICKED = - new Binary("timelines.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava) - - // Negative engagements - val IS_DONT_LIKE = new Binary("timelines.engagement.is_dont_like", Set(EngagementsPrivate).asJava) - val IS_BLOCK_CLICKED = new Binary( - "timelines.engagement.is_block_clicked", - Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) - val IS_BLOCK_DIALOG_BLOCKED = new Binary( - "timelines.engagement.is_block_dialog_blocked", - Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava) - val IS_MUTE_CLICKED = new Binary( - "timelines.engagement.is_mute_clicked", - Set(Mutes, TweetsClicked, EngagementsPrivate).asJava) - val IS_MUTE_DIALOG_MUTED = - new Binary("timelines.engagement.is_mute_dialog_muted", Set(Mutes, EngagementsPrivate).asJava) - val IS_REPORT_TWEET_CLICKED = new Binary( - "timelines.engagement.is_report_tweet_clicked", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_CARET_CLICKED = - new Binary("timelines.engagement.is_caret_clicked", Set(EngagementsPrivate).asJava) - val IS_NOT_ABOUT_TOPIC = - new Binary("timelines.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava) - val IS_NOT_RECENT = - new Binary("timelines.engagement.is_not_recent", Set(EngagementsPrivate).asJava) - val IS_NOT_RELEVANT = - new Binary("timelines.engagement.is_not_relevant", Set(EngagementsPrivate).asJava) - val IS_SEE_FEWER = - new Binary("timelines.engagement.is_see_fewer", Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC = - new Binary("timelines.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) - val IS_FOLLOW_TOPIC = - new Binary("timelines.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN_TOPIC = - new Binary("timelines.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) - val IS_NEGATIVE_FEEDBACK = - new Binary("timelines.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) - val IS_IMPLICIT_POSITIVE_FEEDBACK_UNION = - new Binary( - "timelines.engagement.is_implicit_positive_feedback_union", - Set(EngagementsPrivate).asJava) - val IS_EXPLICIT_POSITIVE_FEEDBACK_UNION = - new Binary( - "timelines.engagement.is_explicit_positive_feedback_union", - Set(EngagementsPrivate).asJava) - val IS_ALL_NEGATIVE_FEEDBACK_UNION = - new Binary( - "timelines.engagement.is_all_negative_feedback_union", - Set(EngagementsPrivate).asJava) - // Reciprocal engagements for reply forward engagement - val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_impressed_by_author", - Set(EngagementsPrivate).asJava) - val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_favorited_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava) - val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_quoted_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) - val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_replied_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava) - val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_retweeted_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) - val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_blocked_by_author", - Set(Blocks, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_followed_by_author", - Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) - val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_unfollowed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_muted_by_author", - Set(Mutes, EngagementsPrivate).asJava) - val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( - "timelines.engagement.is_replied_reply_reported_by_author", - Set(EngagementsPrivate).asJava) - - // Reciprocal engagements for fav forward engagement - val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( - "timelines.engagement.is_favorited_fav_favorited_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava - ) - val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( - "timelines.engagement.is_favorited_fav_replied_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava - ) - val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( - "timelines.engagement.is_favorited_fav_retweeted_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava - ) - val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( - "timelines.engagement.is_favorited_fav_followed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava - ) - - // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_follow", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_fav", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_reply", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_retweet", - Set( - ProfilesViewed, - ProfilesClicked, - EngagementsPrivate, - PrivateRetweets, - PublicRetweets).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_tweet_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_share_dm_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of all binary features above - val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_engaged", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) - - // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_user_report_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_tweet_report_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_mute", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( - "timelines.engagement.is_profile_clicked_and_profile_block", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of bad profile click engagements and existing negative feedback - val IS_NEGATIVE_FEEDBACK_V2 = new Binary( - "timelines.engagement.is_negative_feedback_v2", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_NEGATIVE_FEEDBACK_UNION = new Binary( - "timelines.engagement.is_negative_feedback_union", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // don't like, mute or profile page -> mute - val IS_WEAK_NEGATIVE_FEEDBACK = new Binary( - "timelines.engagement.is_weak_negative_feedback", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // report, block or profile page -> report, block - val IS_STRONG_NEGATIVE_FEEDBACK = new Binary( - "timelines.engagement.is_strong_negative_feedback", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // engagement for following user from any surface area - val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( - "timelines.engagement.is_followed_from_any_surface_area", - Set(EngagementsPublic, EngagementsPrivate).asJava) - val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary( - "timelines.engagement.is_relevance_prompt_yes_clicked", - Set(EngagementsPublic, EngagementsPrivate).asJava) - - // Reply downvote engagements - val IS_REPLY_DOWNVOTED = - new Binary("timelines.engagement.is_reply_downvoted", Set(EngagementsPrivate).asJava) - val IS_REPLY_DOWNVOTE_REMOVED = - new Binary("timelines.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava) - - /** - * Contains all engagements that are used/consumed by real-time - * aggregates summingbird jobs. These engagements need to be - * extractable from [[ClientEvent]]. - */ - val EngagementsRealTime: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_DWELLED, - IS_FAVORITED, - IS_FOLLOWED, - IS_OPEN_LINKED, - IS_PHOTO_EXPANDED, - IS_PROFILE_CLICKED, - IS_QUOTED, - IS_REPLIED, - IS_RETWEETED, - IS_RETWEETED_WITHOUT_QUOTE, - IS_SHARE_DM_CLICKED, - IS_SHARE_DM_SENT, - IS_VIDEO_PLAYBACK_50, - IS_VIDEO_VIEWED, - IS_VIDEO_QUALITY_VIEWED - ) - - val NegativeEngagementsRealTime: Set[Feature[JBoolean]] = Set( - IS_REPORT_TWEET_CLICKED, - IS_BLOCK_CLICKED, - IS_MUTE_CLICKED - ) - - val NegativeEngagementsRealTimeDontLike: Set[Feature[JBoolean]] = Set( - IS_DONT_LIKE - ) - - val NegativeEngagementsSecondary: Set[Feature[JBoolean]] = Set( - IS_NOT_INTERESTED_IN_TOPIC, - IS_NOT_ABOUT_TOPIC, - IS_NOT_RECENT, - IS_NOT_RELEVANT, - IS_SEE_FEWER, - IS_UNFOLLOW_TOPIC - ) - - val PrivateEngagements: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_DWELLED, - IS_OPEN_LINKED, - IS_PHOTO_EXPANDED, - IS_PROFILE_CLICKED, - IS_QUOTED, - IS_VIDEO_PLAYBACK_50, - IS_VIDEO_QUALITY_VIEWED - ) - - val ImpressedEngagements: Set[Feature[JBoolean]] = Set( - IS_IMPRESSED - ) - - val PrivateEngagementsV2: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_OPEN_LINKED, - IS_PHOTO_EXPANDED, - IS_PROFILE_CLICKED, - IS_VIDEO_PLAYBACK_50, - IS_VIDEO_QUALITY_VIEWED - ) ++ ImpressedEngagements - - val CoreEngagements: Set[Feature[JBoolean]] = Set( - IS_FAVORITED, - IS_REPLIED, - IS_RETWEETED - ) - - val DwellEngagements: Set[Feature[JBoolean]] = Set( - IS_DWELLED - ) - - val PrivateCoreEngagements: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_OPEN_LINKED, - IS_PHOTO_EXPANDED, - IS_VIDEO_PLAYBACK_50, - IS_VIDEO_QUALITY_VIEWED - ) - - val ConditionalEngagements: Set[Feature[JBoolean]] = Set( - IS_GOOD_CLICKED_CONVO_DESC_V1, - IS_GOOD_CLICKED_CONVO_DESC_V2, - IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S - ) - - val ShareEngagements: Set[Feature[JBoolean]] = Set( - IS_SHARED, - IS_SHARE_MENU_CLICKED - ) - - val BookmarkEngagements: Set[Feature[JBoolean]] = Set( - IS_BOOKMARKED - ) - - val TweetDetailDwellEngagements: Set[Feature[JBoolean]] = Set( - IS_TWEET_DETAIL_DWELLED, - IS_TWEET_DETAIL_DWELLED_8_SEC, - IS_TWEET_DETAIL_DWELLED_15_SEC, - IS_TWEET_DETAIL_DWELLED_25_SEC, - IS_TWEET_DETAIL_DWELLED_30_SEC - ) - - val ProfileDwellEngagements: Set[Feature[JBoolean]] = Set( - IS_PROFILE_DWELLED, - IS_PROFILE_DWELLED_10_SEC, - IS_PROFILE_DWELLED_20_SEC, - IS_PROFILE_DWELLED_30_SEC - ) - - val FullscreenVideoDwellEngagements: Set[Feature[JBoolean]] = Set( - IS_FULLSCREEN_VIDEO_DWELLED, - IS_FULLSCREEN_VIDEO_DWELLED_5_SEC, - IS_FULLSCREEN_VIDEO_DWELLED_10_SEC, - IS_FULLSCREEN_VIDEO_DWELLED_20_SEC, - IS_FULLSCREEN_VIDEO_DWELLED_30_SEC - ) - - // Please do not add new engagements here until having estimated the impact - // to capacity requirements. User-author real-time aggregates have a very - // large key space. - val UserAuthorEngagements: Set[Feature[JBoolean]] = CoreEngagements ++ DwellEngagements ++ Set( - IS_CLICKED, - IS_PROFILE_CLICKED, - IS_PHOTO_EXPANDED, - IS_VIDEO_PLAYBACK_50, - IS_NEGATIVE_FEEDBACK_UNION - ) - - val ImplicitPositiveEngagements: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_DWELLED, - IS_OPEN_LINKED, - IS_PROFILE_CLICKED, - IS_QUOTED, - IS_VIDEO_PLAYBACK_50, - IS_VIDEO_QUALITY_VIEWED, - IS_TWEET_DETAIL_DWELLED, - IS_GOOD_CLICKED_CONVO_DESC_V1, - IS_GOOD_CLICKED_CONVO_DESC_V2, - IS_SHARED, - IS_SHARE_MENU_CLICKED, - IS_SHARE_DM_SENT, - IS_SHARE_DM_CLICKED - ) - - val ExplicitPositiveEngagements: Set[Feature[JBoolean]] = CoreEngagements ++ Set( - IS_FOLLOWED, - IS_QUOTED - ) - - val AllNegativeEngagements: Set[Feature[JBoolean]] = - NegativeEngagementsRealTime ++ NegativeEngagementsRealTimeDontLike ++ Set( - IS_NOT_RECENT, - IS_NOT_RELEVANT, - IS_SEE_FEWER - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.docx new file mode 100644 index 000000000..ac439d03c Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala deleted file mode 100644 index 369b48b39..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/common/NonHomeLabelFeatures.scala +++ /dev/null @@ -1,97 +0,0 @@ -package com.twitter.timelines.prediction.features.common - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature -import com.twitter.ml.api.Feature.Binary -import java.lang.{Boolean => JBoolean} -import scala.collection.JavaConverters._ - -object ProfileLabelFeatures { - private val prefix = "profile" - - val IS_CLICKED = - new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_DWELLED = - new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) - val IS_FAVORITED = new Binary( - s"${prefix}.engagement.is_favorited", - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED = new Binary( - s"${prefix}.engagement.is_replied", - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_RETWEETED = new Binary( - s"${prefix}.engagement.is_retweeted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - - // Negative engagements - val IS_DONT_LIKE = - new Binary(s"${prefix}.engagement.is_dont_like", Set(EngagementsPrivate).asJava) - val IS_BLOCK_CLICKED = new Binary( - s"${prefix}.engagement.is_block_clicked", - Set(Blocks, TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) - val IS_MUTE_CLICKED = new Binary( - s"${prefix}.engagement.is_mute_clicked", - Set(Mutes, TweetsClicked, EngagementsPrivate).asJava) - val IS_REPORT_TWEET_CLICKED = new Binary( - s"${prefix}.engagement.is_report_tweet_clicked", - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_NEGATIVE_FEEDBACK_UNION = new Binary( - s"${prefix}.engagement.is_negative_feedback_union", - Set(EngagementsPrivate, Blocks, Mutes, TweetsClicked, EngagementsPublic).asJava) - - val CoreEngagements: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_DWELLED, - IS_FAVORITED, - IS_REPLIED, - IS_RETWEETED - ) - - val NegativeEngagements: Set[Feature[JBoolean]] = Set( - IS_DONT_LIKE, - IS_BLOCK_CLICKED, - IS_MUTE_CLICKED, - IS_REPORT_TWEET_CLICKED - ) - -} - -object SearchLabelFeatures { - private val prefix = "search" - - val IS_CLICKED = - new Binary(s"${prefix}.engagement.is_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_DWELLED = - new Binary(s"${prefix}.engagement.is_dwelled", Set(TweetsViewed, EngagementsPrivate).asJava) - val IS_FAVORITED = new Binary( - s"${prefix}.engagement.is_favorited", - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED = new Binary( - s"${prefix}.engagement.is_replied", - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_RETWEETED = new Binary( - s"${prefix}.engagement.is_retweeted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_PROFILE_CLICKED_SEARCH_RESULT_USER = new Binary( - s"${prefix}.engagement.is_profile_clicked_search_result_user", - Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET = new Binary( - s"${prefix}.engagement.is_profile_clicked_search_result_tweet", - Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_TYPEAHEAD_USER = new Binary( - s"${prefix}.engagement.is_profile_clicked_typeahead_user", - Set(ProfilesClicked, ProfilesViewed, EngagementsPrivate).asJava) - - val CoreEngagements: Set[Feature[JBoolean]] = Set( - IS_CLICKED, - IS_DWELLED, - IS_FAVORITED, - IS_REPLIED, - IS_RETWEETED, - IS_PROFILE_CLICKED_SEARCH_RESULT_USER, - IS_PROFILE_CLICKED_SEARCH_RESULT_TWEET, - IS_PROFILE_CLICKED_TYPEAHEAD_USER - ) -} -// Add Tweet Detail labels later diff --git a/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.docx new file mode 100644 index 000000000..f8b64c569 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala deleted file mode 100644 index 99698530f..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/common/TimelinesSharedFeatures.scala +++ /dev/null @@ -1,759 +0,0 @@ -package com.twitter.timelines.prediction.features.common - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.Discrete -import com.twitter.ml.api.Feature.SparseBinary -import com.twitter.ml.api.Feature.SparseContinuous -import com.twitter.ml.api.Feature.Text -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup -import scala.collection.JavaConverters._ - -object TimelinesSharedFeatures extends TimelinesSharedFeatures("") -object InReplyToTweetTimelinesSharedFeatures extends TimelinesSharedFeatures("in_reply_to_tweet") - -/** - * Defines shared features - */ -class TimelinesSharedFeatures(prefix: String) { - private def name(featureName: String): String = { - if (prefix.nonEmpty) { - s"$prefix.$featureName" - } else { - featureName - } - } - - // meta - val EXPERIMENT_META = new SparseBinary( - name("timelines.meta.experiment_meta"), - Set(ExperimentId, ExperimentName).asJava) - - // historically used in the "combined models" to distinguish in-network and out of network tweets. - // now the feature denotes which adapter (recap or rectweet) was used to generate the datarecords. - // and is used by the data collection pipeline to split the training data. - val INJECTION_TYPE = new Discrete(name("timelines.meta.injection_type")) - - // Used to indicate which injection module is this - val INJECTION_MODULE_NAME = new Text(name("timelines.meta.injection_module_name")) - - val LIST_ID = new Discrete(name("timelines.meta.list_id")) - val LIST_IS_PINNED = new Binary(name("timelines.meta.list_is_pinned")) - - // internal id per each PS request. mainly to join back commomn features and candidate features later - val PREDICTION_REQUEST_ID = new Discrete(name("timelines.meta.prediction_request_id")) - // internal id per each TLM request. mainly to deduplicate re-served cached tweets in logging - val SERVED_REQUEST_ID = new Discrete(name("timelines.meta.served_request_id")) - // internal id used for join key in kafka logging, equal to servedRequestId if tweet is cached, - // else equal to predictionRequestId - val SERVED_ID = new Discrete(name("timelines.meta.served_id")) - val REQUEST_JOIN_ID = new Discrete(name("timelines.meta.request_join_id")) - - // Internal boolean flag per tweet, whether the tweet is served from RankedTweetsCache: TQ-14050 - // this feature should not be trained on, blacklisted in feature_config: D838346 - val IS_READ_FROM_CACHE = new Binary(name("timelines.meta.is_read_from_cache")) - - // model score discounts - val PHOTO_DISCOUNT = new Continuous(name("timelines.score_discounts.photo")) - val VIDEO_DISCOUNT = new Continuous(name("timelines.score_discounts.video")) - val TWEET_HEIGHT_DISCOUNT = new Continuous(name("timelines.score_discounts.tweet_height")) - val TOXICITY_DISCOUNT = new Continuous(name("timelines.score_discounts.toxicity")) - - // engagements - val ENGAGEMENT_TYPE = new Discrete(name("timelines.engagement.type")) - val PREDICTED_IS_FAVORITED = - new Continuous(name("timelines.engagement_predicted.is_favorited"), Set(EngagementScore).asJava) - val PREDICTED_IS_RETWEETED = - new Continuous(name("timelines.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava) - val PREDICTED_IS_QUOTED = - new Continuous(name("timelines.engagement_predicted.is_quoted"), Set(EngagementScore).asJava) - val PREDICTED_IS_REPLIED = - new Continuous(name("timelines.engagement_predicted.is_replied"), Set(EngagementScore).asJava) - val PREDICTED_IS_OPEN_LINKED = new Continuous( - name("timelines.engagement_predicted.is_open_linked"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_OPEN_LINK = new Continuous( - name("timelines.engagement_predicted.is_good_open_link"), - Set(EngagementScore).asJava) - val PREDICTED_IS_PROFILE_CLICKED = new Continuous( - name("timelines.engagement_predicted.is_profile_clicked"), - Set(EngagementScore).asJava - ) - val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Continuous( - name("timelines.engagement_predicted.is_profile_clicked_and_profile_engaged"), - Set(EngagementScore).asJava - ) - val PREDICTED_IS_CLICKED = - new Continuous(name("timelines.engagement_predicted.is_clicked"), Set(EngagementScore).asJava) - val PREDICTED_IS_PHOTO_EXPANDED = new Continuous( - name("timelines.engagement_predicted.is_photo_expanded"), - Set(EngagementScore).asJava - ) - val PREDICTED_IS_FOLLOWED = - new Continuous(name("timelines.engagement_predicted.is_followed"), Set(EngagementScore).asJava) - val PREDICTED_IS_DONT_LIKE = - new Continuous(name("timelines.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava) - val PREDICTED_IS_VIDEO_PLAYBACK_50 = new Continuous( - name("timelines.engagement_predicted.is_video_playback_50"), - Set(EngagementScore).asJava - ) - val PREDICTED_IS_VIDEO_QUALITY_VIEWED = new Continuous( - name("timelines.engagement_predicted.is_video_quality_viewed"), - Set(EngagementScore).asJava - ) - val PREDICTED_IS_GOOD_CLICKED_V1 = new Continuous( - name("timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_CLICKED_V2 = new Continuous( - name("timelines.engagement_predicted.is_good_clicked_convo_desc_v2"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC = new Continuous( - name("timelines.engagement_predicted.is_tweet_detail_dwelled_8_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC = new Continuous( - name("timelines.engagement_predicted.is_tweet_detail_dwelled_15_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC = new Continuous( - name("timelines.engagement_predicted.is_tweet_detail_dwelled_25_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC = new Continuous( - name("timelines.engagement_predicted.is_tweet_detail_dwelled_30_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Continuous( - name( - "timelines.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Continuous( - name("timelines.engagement_predicted.is_favorited_fav_engaged_by_author"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_REPORT_TWEET_CLICKED = - new Continuous( - name("timelines.engagement_predicted.is_report_tweet_clicked"), - Set(EngagementScore).asJava) - val PREDICTED_IS_NEGATIVE_FEEDBACK = new Continuous( - name("timelines.engagement_predicted.is_negative_feedback"), - Set(EngagementScore).asJava) - val PREDICTED_IS_NEGATIVE_FEEDBACK_V2 = new Continuous( - name("timelines.engagement_predicted.is_negative_feedback_v2"), - Set(EngagementScore).asJava) - val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK = new Continuous( - name("timelines.engagement_predicted.is_weak_negative_feedback"), - Set(EngagementScore).asJava) - val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK = new Continuous( - name("timelines.engagement_predicted.is_strong_negative_feedback"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_DWELLED_IN_BOUNDS_V1 = new Continuous( - name("timelines.engagement_predicted.is_dwelled_in_bounds_v1"), - Set(EngagementScore).asJava) - val PREDICTED_DWELL_NORMALIZED_OVERALL = new Continuous( - name("timelines.engagement_predicted.dwell_normalized_overall"), - Set(EngagementScore).asJava) - val PREDICTED_DWELL_CDF = - new Continuous(name("timelines.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava) - val PREDICTED_DWELL_CDF_OVERALL = new Continuous( - name("timelines.engagement_predicted.dwell_cdf_overall"), - Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED = - new Continuous(name("timelines.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava) - - val PREDICTED_IS_HOME_LATEST_VISITED = new Continuous( - name("timelines.engagement_predicted.is_home_latest_visited"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_BOOKMARKED = new Continuous( - name("timelines.engagement_predicted.is_bookmarked"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_SHARED = - new Continuous(name("timelines.engagement_predicted.is_shared"), Set(EngagementScore).asJava) - val PREDICTED_IS_SHARE_MENU_CLICKED = new Continuous( - name("timelines.engagement_predicted.is_share_menu_clicked"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_PROFILE_DWELLED_20_SEC = new Continuous( - name("timelines.engagement_predicted.is_profile_dwelled_20_sec"), - Set(EngagementScore).asJava) - - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Continuous( - name("timelines.engagement_predicted.is_fullscreen_video_dwelled_5_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Continuous( - name("timelines.engagement_predicted.is_fullscreen_video_dwelled_10_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Continuous( - name("timelines.engagement_predicted.is_fullscreen_video_dwelled_20_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Continuous( - name("timelines.engagement_predicted.is_fullscreen_video_dwelled_30_sec"), - Set(EngagementScore).asJava) - - // Please use this timestamp, not the `meta.timestamp`, for the actual served timestamp. - val SERVED_TIMESTAMP = - new Discrete("timelines.meta.timestamp.served", Set(PrivateTimestamp).asJava) - - // timestamp when the engagement has occurred. do not train on these features - val TIMESTAMP_FAVORITED = - new Discrete("timelines.meta.timestamp.engagement.favorited", Set(PublicTimestamp).asJava) - val TIMESTAMP_RETWEETED = - new Discrete("timelines.meta.timestamp.engagement.retweeted", Set(PublicTimestamp).asJava) - val TIMESTAMP_REPLIED = - new Discrete("timelines.meta.timestamp.engagement.replied", Set(PublicTimestamp).asJava) - val TIMESTAMP_PROFILE_CLICKED = new Discrete( - "timelines.meta.timestamp.engagement.profile_clicked", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_CLICKED = - new Discrete("timelines.meta.timestamp.engagement.clicked", Set(PrivateTimestamp).asJava) - val TIMESTAMP_PHOTO_EXPANDED = - new Discrete("timelines.meta.timestamp.engagement.photo_expanded", Set(PrivateTimestamp).asJava) - val TIMESTAMP_DWELLED = - new Discrete("timelines.meta.timestamp.engagement.dwelled", Set(PrivateTimestamp).asJava) - val TIMESTAMP_VIDEO_PLAYBACK_50 = new Discrete( - "timelines.meta.timestamp.engagement.video_playback_50", - Set(PrivateTimestamp).asJava) - // reply engaged by author - val TIMESTAMP_REPLY_FAVORITED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.reply_favorited_by_author", - Set(PublicTimestamp).asJava) - val TIMESTAMP_REPLY_REPLIED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.reply_replied_by_author", - Set(PublicTimestamp).asJava) - val TIMESTAMP_REPLY_RETWEETED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.reply_retweeted_by_author", - Set(PublicTimestamp).asJava) - // fav engaged by author - val TIMESTAMP_FAV_FAVORITED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.fav_favorited_by_author", - Set(PublicTimestamp).asJava) - val TIMESTAMP_FAV_REPLIED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.fav_replied_by_author", - Set(PublicTimestamp).asJava) - val TIMESTAMP_FAV_RETWEETED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.fav_retweeted_by_author", - Set(PublicTimestamp).asJava) - val TIMESTAMP_FAV_FOLLOWED_BY_AUTHOR = new Discrete( - "timelines.meta.timestamp.engagement.fav_followed_by_author", - Set(PublicTimestamp).asJava) - // good click - val TIMESTAMP_GOOD_CLICK_CONVO_DESC_FAVORITED = new Discrete( - "timelines.meta.timestamp.engagement.good_click_convo_desc_favorited", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_GOOD_CLICK_CONVO_DESC_REPLIIED = new Discrete( - "timelines.meta.timestamp.engagement.good_click_convo_desc_replied", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_GOOD_CLICK_CONVO_DESC_PROFILE_CLICKED = new Discrete( - "timelines.meta.timestamp.engagement.good_click_convo_desc_profiile_clicked", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_NEGATIVE_FEEDBACK = new Discrete( - "timelines.meta.timestamp.engagement.negative_feedback", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_REPORT_TWEET_CLICK = - new Discrete( - "timelines.meta.timestamp.engagement.report_tweet_click", - Set(PrivateTimestamp).asJava) - val TIMESTAMP_IMPRESSED = - new Discrete("timelines.meta.timestamp.engagement.impressed", Set(PublicTimestamp).asJava) - val TIMESTAMP_TWEET_DETAIL_DWELLED = - new Discrete( - "timelines.meta.timestamp.engagement.tweet_detail_dwelled", - Set(PublicTimestamp).asJava) - val TIMESTAMP_PROFILE_DWELLED = - new Discrete("timelines.meta.timestamp.engagement.profile_dwelled", Set(PublicTimestamp).asJava) - val TIMESTAMP_FULLSCREEN_VIDEO_DWELLED = - new Discrete( - "timelines.meta.timestamp.engagement.fullscreen_video_dwelled", - Set(PublicTimestamp).asJava) - val TIMESTAMP_LINK_DWELLED = - new Discrete("timelines.meta.timestamp.engagement.link_dwelled", Set(PublicTimestamp).asJava) - - // these are used to dup and split the negative instances during streaming processing (kafka) - val TRAINING_FOR_FAVORITED = - new Binary("timelines.meta.training_data.for_favorited", Set(EngagementId).asJava) - val TRAINING_FOR_RETWEETED = - new Binary("timelines.meta.training_data.for_retweeted", Set(EngagementId).asJava) - val TRAINING_FOR_REPLIED = - new Binary("timelines.meta.training_data.for_replied", Set(EngagementId).asJava) - val TRAINING_FOR_PROFILE_CLICKED = - new Binary("timelines.meta.training_data.for_profile_clicked", Set(EngagementId).asJava) - val TRAINING_FOR_CLICKED = - new Binary("timelines.meta.training_data.for_clicked", Set(EngagementId).asJava) - val TRAINING_FOR_PHOTO_EXPANDED = - new Binary("timelines.meta.training_data.for_photo_expanded", Set(EngagementId).asJava) - val TRAINING_FOR_VIDEO_PLAYBACK_50 = - new Binary("timelines.meta.training_data.for_video_playback_50", Set(EngagementId).asJava) - val TRAINING_FOR_NEGATIVE_FEEDBACK = - new Binary("timelines.meta.training_data.for_negative_feedback", Set(EngagementId).asJava) - val TRAINING_FOR_REPORTED = - new Binary("timelines.meta.training_data.for_reported", Set(EngagementId).asJava) - val TRAINING_FOR_DWELLED = - new Binary("timelines.meta.training_data.for_dwelled", Set(EngagementId).asJava) - val TRAINING_FOR_SHARED = - new Binary("timelines.meta.training_data.for_shared", Set(EngagementId).asJava) - val TRAINING_FOR_SHARE_MENU_CLICKED = - new Binary("timelines.meta.training_data.for_share_menu_clicked", Set(EngagementId).asJava) - - // Warning: do not train on these features - val PREDICTED_SCORE = new Continuous(name("timelines.score"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_FAV = new Continuous(name("timelines.score.fav"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_RETWEET = - new Continuous(name("timelines.score.retweet"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_REPLY = - new Continuous(name("timelines.score.reply"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_OPEN_LINK = - new Continuous(name("timelines.score.open_link"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_GOOD_OPEN_LINK = - new Continuous(name("timelines.score.good_open_link"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_PROFILE_CLICK = - new Continuous(name("timelines.score.profile_click"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DETAIL_EXPAND = - new Continuous(name("timelines.score.detail_expand"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_PHOTO_EXPAND = - new Continuous(name("timelines.score.photo_expand"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_PLAYBACK_50 = - new Continuous(name("timelines.score.playback_50"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_VIDEO_QUALITY_VIEW = - new Continuous(name("timelines.score.video_quality_view"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DONT_LIKE = - new Continuous(name("timelines.score.dont_like"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_PROFILE_CLICKED_AND_PROFILE_ENGAGED = - new Continuous( - name("timelines.score.profile_clicked_and_profile_engaged"), - Set(EngagementScore).asJava) - val PREDICTED_SCORE_GOOD_CLICKED_V1 = - new Continuous(name("timelines.score.good_clicked_v1"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_GOOD_CLICKED_V2 = - new Continuous(name("timelines.score.good_clicked_v2"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DWELL = - new Continuous(name("timelines.score.dwell"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DWELL_CDF = - new Continuous(name("timelines.score.dwell_cfd"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DWELL_CDF_OVERALL = - new Continuous(name("timelines.score.dwell_cfd_overall"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_DWELL_NORMALIZED_OVERALL = - new Continuous(name("timelines.score.dwell_normalized_overall"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_NEGATIVE_FEEDBACK = - new Continuous(name("timelines.score.negative_feedback"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_NEGATIVE_FEEDBACK_V2 = - new Continuous(name("timelines.score.negative_feedback_v2"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_WEAK_NEGATIVE_FEEDBACK = - new Continuous(name("timelines.score.weak_negative_feedback"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_STRONG_NEGATIVE_FEEDBACK = - new Continuous(name("timelines.score.strong_negative_feedback"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_REPORT_TWEET_CLICKED = - new Continuous(name("timelines.score.report_tweet_clicked"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_UNFOLLOW_TOPIC = - new Continuous(name("timelines.score.unfollow_topic"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_FOLLOW = - new Continuous(name("timelines.score.follow"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_RELEVANCE_PROMPT_YES_CLICKED = - new Continuous( - name("timelines.score.relevance_prompt_yes_clicked"), - Set(EngagementScore).asJava) - val PREDICTED_SCORE_BOOKMARK = - new Continuous(name("timelines.score.bookmark"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_SHARE = - new Continuous(name("timelines.score.share"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_SHARE_MENU_CLICK = - new Continuous(name("timelines.score.share_menu_click"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_PROFILE_DWELLED = - new Continuous(name("timelines.score.good_profile_dwelled"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_TWEET_DETAIL_DWELLED = - new Continuous(name("timelines.score.tweet_detail_dwelled"), Set(EngagementScore).asJava) - val PREDICTED_SCORE_FULLSCREEN_VIDEO_DWELL = - new Continuous(name("timelines.score.fullscreen_video_dwell"), Set(EngagementScore).asJava) - - // hydrated in TimelinesSharedFeaturesAdapter that recap adapter calls - val ORIGINAL_AUTHOR_ID = new Discrete(name("entities.original_author_id"), Set(UserId).asJava) - val SOURCE_AUTHOR_ID = new Discrete(name("entities.source_author_id"), Set(UserId).asJava) - val SOURCE_TWEET_ID = new Discrete(name("entities.source_tweet_id"), Set(TweetId).asJava) - val TOPIC_ID = new Discrete(name("entities.topic_id"), Set(SemanticcoreClassification).asJava) - val INFERRED_TOPIC_IDS = - new SparseBinary(name("entities.inferred_topic_ids"), Set(SemanticcoreClassification).asJava) - val INFERRED_TOPIC_ID = TypedAggregateGroup.sparseFeature(INFERRED_TOPIC_IDS) - - val WEIGHTED_FAV_COUNT = new Continuous( - name("timelines.earlybird.weighted_fav_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val WEIGHTED_RETWEET_COUNT = new Continuous( - name("timelines.earlybird.weighted_retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val WEIGHTED_REPLY_COUNT = new Continuous( - name("timelines.earlybird.weighted_reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val WEIGHTED_QUOTE_COUNT = new Continuous( - name("timelines.earlybird.weighted_quote_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val EMBEDS_IMPRESSION_COUNT_V2 = new Continuous( - name("timelines.earlybird.embeds_impression_count_v2"), - Set(CountOfImpression).asJava) - val EMBEDS_URL_COUNT_V2 = new Continuous( - name("timelines.earlybird.embeds_url_count_v2"), - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val DECAYED_FAVORITE_COUNT = new Continuous( - name("timelines.earlybird.decayed_favorite_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val DECAYED_RETWEET_COUNT = new Continuous( - name("timelines.earlybird.decayed_retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val DECAYED_REPLY_COUNT = new Continuous( - name("timelines.earlybird.decayed_reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val DECAYED_QUOTE_COUNT = new Continuous( - name("timelines.earlybird.decayed_quote_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val FAKE_FAVORITE_COUNT = new Continuous( - name("timelines.earlybird.fake_favorite_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val FAKE_RETWEET_COUNT = new Continuous( - name("timelines.earlybird.fake_retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val FAKE_REPLY_COUNT = new Continuous( - name("timelines.earlybird.fake_reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val FAKE_QUOTE_COUNT = new Continuous( - name("timelines.earlybird.fake_quote_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val QUOTE_COUNT = new Continuous( - name("timelines.earlybird.quote_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - - // Safety features - val LABEL_ABUSIVE_FLAG = - new Binary(name("timelines.earlybird.label_abusive_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_ABUSIVE_HI_RCL_FLAG = - new Binary(name("timelines.earlybird.label_abusive_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_DUP_CONTENT_FLAG = - new Binary(name("timelines.earlybird.label_dup_content_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_NSFW_HI_PRC_FLAG = - new Binary(name("timelines.earlybird.label_nsfw_hi_prc_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_NSFW_HI_RCL_FLAG = - new Binary(name("timelines.earlybird.label_nsfw_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_SPAM_FLAG = - new Binary(name("timelines.earlybird.label_spam_flag"), Set(TweetSafetyLabels).asJava) - val LABEL_SPAM_HI_RCL_FLAG = - new Binary(name("timelines.earlybird.label_spam_hi_rcl_flag"), Set(TweetSafetyLabels).asJava) - - // Periscope features - val PERISCOPE_EXISTS = new Binary( - name("timelines.earlybird.periscope_exists"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val PERISCOPE_IS_LIVE = new Binary( - name("timelines.earlybird.periscope_is_live"), - Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava) - val PERISCOPE_HAS_BEEN_FEATURED = new Binary( - name("timelines.earlybird.periscope_has_been_featured"), - Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava) - val PERISCOPE_IS_CURRENTLY_FEATURED = new Binary( - name("timelines.earlybird.periscope_is_currently_featured"), - Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava - ) - val PERISCOPE_IS_FROM_QUALITY_SOURCE = new Binary( - name("timelines.earlybird.periscope_is_from_quality_source"), - Set(PrivateBroadcastMetrics, PublicBroadcastMetrics).asJava - ) - - val VISIBLE_TOKEN_RATIO = new Continuous(name("timelines.earlybird.visible_token_ratio")) - val HAS_QUOTE = new Binary( - name("timelines.earlybird.has_quote"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val IS_COMPOSER_SOURCE_CAMERA = new Binary( - name("timelines.earlybird.is_composer_source_camera"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - - val EARLYBIRD_SCORE = new Continuous( - name("timelines.earlybird_score"), - Set(EngagementScore).asJava - ) // separating from the rest of "timelines.earlybird." namespace - - val DWELL_TIME_MS = new Continuous( - name("timelines.engagement.dwell_time_ms"), - Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) - - val TWEET_DETAIL_DWELL_TIME_MS = new Continuous( - name("timelines.engagement.tweet_detail_dwell_time_ms"), - Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) - - val PROFILE_DWELL_TIME_MS = new Continuous( - name("timelines.engagement.profile_dwell_time_ms"), - Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) - - val FULLSCREEN_VIDEO_DWELL_TIME_MS = new Continuous( - name("timelines.engagement.fullscreen_video_dwell_time_ms"), - Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) - - val LINK_DWELL_TIME_MS = new Continuous( - name("timelines.engagement.link_dwell_time_ms"), - Set(EngagementDurationAndTimestamp, ImpressionMetadata, PrivateTimestamp).asJava) - - val ASPECT_RATIO_DEN = new Continuous( - name("tweetsource.tweet.media.aspect_ratio_den"), - Set(MediaFile, MediaProcessingInformation).asJava) - val ASPECT_RATIO_NUM = new Continuous( - name("tweetsource.tweet.media.aspect_ratio_num"), - Set(MediaFile, MediaProcessingInformation).asJava) - val BIT_RATE = new Continuous( - name("tweetsource.tweet.media.bit_rate"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HEIGHT_2 = new Continuous( - name("tweetsource.tweet.media.height_2"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HEIGHT_1 = new Continuous( - name("tweetsource.tweet.media.height_1"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HEIGHT_3 = new Continuous( - name("tweetsource.tweet.media.height_3"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HEIGHT_4 = new Continuous( - name("tweetsource.tweet.media.height_4"), - Set(MediaFile, MediaProcessingInformation).asJava) - val RESIZE_METHOD_1 = new Discrete( - name("tweetsource.tweet.media.resize_method_1"), - Set(MediaFile, MediaProcessingInformation).asJava) - val RESIZE_METHOD_2 = new Discrete( - name("tweetsource.tweet.media.resize_method_2"), - Set(MediaFile, MediaProcessingInformation).asJava) - val RESIZE_METHOD_3 = new Discrete( - name("tweetsource.tweet.media.resize_method_3"), - Set(MediaFile, MediaProcessingInformation).asJava) - val RESIZE_METHOD_4 = new Discrete( - name("tweetsource.tweet.media.resize_method_4"), - Set(MediaFile, MediaProcessingInformation).asJava) - val VIDEO_DURATION = new Continuous( - name("tweetsource.tweet.media.video_duration"), - Set(MediaFile, MediaProcessingInformation).asJava) - val WIDTH_1 = new Continuous( - name("tweetsource.tweet.media.width_1"), - Set(MediaFile, MediaProcessingInformation).asJava) - val WIDTH_2 = new Continuous( - name("tweetsource.tweet.media.width_2"), - Set(MediaFile, MediaProcessingInformation).asJava) - val WIDTH_3 = new Continuous( - name("tweetsource.tweet.media.width_3"), - Set(MediaFile, MediaProcessingInformation).asJava) - val WIDTH_4 = new Continuous( - name("tweetsource.tweet.media.width_4"), - Set(MediaFile, MediaProcessingInformation).asJava) - val NUM_MEDIA_TAGS = new Continuous( - name("tweetsource.tweet.media.num_tags"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val MEDIA_TAG_SCREEN_NAMES = new SparseBinary( - name("tweetsource.tweet.media.tag_screen_names"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val STICKER_IDS = new SparseBinary( - name("tweetsource.tweet.media.sticker_ids"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - - val NUM_COLOR_PALLETTE_ITEMS = new Continuous( - name("tweetsource.v2.tweet.media.num_color_pallette_items"), - Set(MediaFile, MediaProcessingInformation).asJava) - val COLOR_1_RED = new Continuous( - name("tweetsource.v2.tweet.media.color_1_red"), - Set(MediaFile, MediaProcessingInformation).asJava) - val COLOR_1_BLUE = new Continuous( - name("tweetsource.v2.tweet.media.color_1_blue"), - Set(MediaFile, MediaProcessingInformation).asJava) - val COLOR_1_GREEN = new Continuous( - name("tweetsource.v2.tweet.media.color_1_green"), - Set(MediaFile, MediaProcessingInformation).asJava) - val COLOR_1_PERCENTAGE = new Continuous( - name("tweetsource.v2.tweet.media.color_1_percentage"), - Set(MediaFile, MediaProcessingInformation).asJava) - val MEDIA_PROVIDERS = new SparseBinary( - name("tweetsource.v2.tweet.media.providers"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val IS_360 = new Binary( - name("tweetsource.v2.tweet.media.is_360"), - Set(MediaFile, MediaProcessingInformation).asJava) - val VIEW_COUNT = - new Continuous(name("tweetsource.v2.tweet.media.view_count"), Set(MediaContentMetrics).asJava) - val IS_MANAGED = new Binary( - name("tweetsource.v2.tweet.media.is_managed"), - Set(MediaFile, MediaProcessingInformation).asJava) - val IS_MONETIZABLE = new Binary( - name("tweetsource.v2.tweet.media.is_monetizable"), - Set(MediaFile, MediaProcessingInformation).asJava) - val IS_EMBEDDABLE = new Binary( - name("tweetsource.v2.tweet.media.is_embeddable"), - Set(MediaFile, MediaProcessingInformation).asJava) - val CLASSIFICATION_LABELS = new SparseContinuous( - name("tweetsource.v2.tweet.media.classification_labels"), - Set(MediaFile, MediaProcessingInformation).asJava) - - val NUM_STICKERS = new Continuous( - name("tweetsource.v2.tweet.media.num_stickers"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val NUM_FACES = new Continuous( - name("tweetsource.v2.tweet.media.num_faces"), - Set(MediaFile, MediaProcessingInformation).asJava) - val FACE_AREAS = new Continuous( - name("tweetsource.v2.tweet.media.face_areas"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_SELECTED_PREVIEW_IMAGE = new Binary( - name("tweetsource.v2.tweet.media.has_selected_preview_image"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_TITLE = new Binary( - name("tweetsource.v2.tweet.media.has_title"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_DESCRIPTION = new Binary( - name("tweetsource.v2.tweet.media.has_description"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_VISIT_SITE_CALL_TO_ACTION = new Binary( - name("tweetsource.v2.tweet.media.has_visit_site_call_to_action"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_APP_INSTALL_CALL_TO_ACTION = new Binary( - name("tweetsource.v2.tweet.media.has_app_install_call_to_action"), - Set(MediaFile, MediaProcessingInformation).asJava) - val HAS_WATCH_NOW_CALL_TO_ACTION = new Binary( - name("tweetsource.v2.tweet.media.has_watch_now_call_to_action"), - Set(MediaFile, MediaProcessingInformation).asJava) - - val NUM_CAPS = - new Continuous(name("tweetsource.tweet.text.num_caps"), Set(PublicTweets, PrivateTweets).asJava) - val TWEET_LENGTH = - new Continuous(name("tweetsource.tweet.text.length"), Set(PublicTweets, PrivateTweets).asJava) - val TWEET_LENGTH_TYPE = new Discrete( - name("tweetsource.tweet.text.length_type"), - Set(PublicTweets, PrivateTweets).asJava) - val NUM_WHITESPACES = new Continuous( - name("tweetsource.tweet.text.num_whitespaces"), - Set(PublicTweets, PrivateTweets).asJava) - val HAS_QUESTION = - new Binary(name("tweetsource.tweet.text.has_question"), Set(PublicTweets, PrivateTweets).asJava) - val NUM_NEWLINES = new Continuous( - name("tweetsource.tweet.text.num_newlines"), - Set(PublicTweets, PrivateTweets).asJava) - val EMOJI_TOKENS = new SparseBinary( - name("tweetsource.v3.tweet.text.emoji_tokens"), - Set(PublicTweets, PrivateTweets).asJava) - val EMOTICON_TOKENS = new SparseBinary( - name("tweetsource.v3.tweet.text.emoticon_tokens"), - Set(PublicTweets, PrivateTweets).asJava) - val NUM_EMOJIS = new Continuous( - name("tweetsource.v3.tweet.text.num_emojis"), - Set(PublicTweets, PrivateTweets).asJava) - val NUM_EMOTICONS = new Continuous( - name("tweetsource.v3.tweet.text.num_emoticons"), - Set(PublicTweets, PrivateTweets).asJava) - val POS_UNIGRAMS = new SparseBinary( - name("tweetsource.v3.tweet.text.pos_unigrams"), - Set(PublicTweets, PrivateTweets).asJava) - val POS_BIGRAMS = new SparseBinary( - name("tweetsource.v3.tweet.text.pos_bigrams"), - Set(PublicTweets, PrivateTweets).asJava) - val TEXT_TOKENS = new SparseBinary( - name("tweetsource.v4.tweet.text.tokens"), - Set(PublicTweets, PrivateTweets).asJava) - - // Health features model scores (see go/toxicity, go/pblock, go/pspammytweet) - val PBLOCK_SCORE = - new Continuous(name("timelines.earlybird.pblock_score"), Set(TweetSafetyScores).asJava) - val TOXICITY_SCORE = - new Continuous(name("timelines.earlybird.toxicity_score"), Set(TweetSafetyScores).asJava) - val EXPERIMENTAL_HEALTH_MODEL_SCORE_1 = - new Continuous( - name("timelines.earlybird.experimental_health_model_score_1"), - Set(TweetSafetyScores).asJava) - val EXPERIMENTAL_HEALTH_MODEL_SCORE_2 = - new Continuous( - name("timelines.earlybird.experimental_health_model_score_2"), - Set(TweetSafetyScores).asJava) - val EXPERIMENTAL_HEALTH_MODEL_SCORE_3 = - new Continuous( - name("timelines.earlybird.experimental_health_model_score_3"), - Set(TweetSafetyScores).asJava) - val EXPERIMENTAL_HEALTH_MODEL_SCORE_4 = - new Continuous( - name("timelines.earlybird.experimental_health_model_score_4"), - Set(TweetSafetyScores).asJava) - val PSPAMMY_TWEET_SCORE = - new Continuous(name("timelines.earlybird.pspammy_tweet_score"), Set(TweetSafetyScores).asJava) - val PREPORTED_TWEET_SCORE = - new Continuous(name("timelines.earlybird.preported_tweet_score"), Set(TweetSafetyScores).asJava) - - // where record was displayed e.g. recap vs ranked timeline vs recycled - // (do NOT use for training in prediction, since this is set post-scoring) - // This differs from TimelinesSharedFeatures.INJECTION_TYPE, which is only - // set to Recap or Rectweet, and is available pre-scoring. - // This also differs from TimeFeatures.IS_TWEET_RECYCLED, which is set - // pre-scoring and indicates if a tweet is being considered for recycling. - // In contrast, DISPLAY_SUGGEST_TYPE == RecycledTweet means the tweet - // was actually served in a recycled tweet module. The two should currently - // have the same value, but need not in future, so please only use - // IS_TWEET_RECYCLED/CANDIDATE_TWEET_SOURCE_ID for training models and - // only use DISPLAY_SUGGEST_TYPE for offline analysis of tweets actually - // served in recycled modules. - val DISPLAY_SUGGEST_TYPE = new Discrete(name("recap.display.suggest_type")) - - // Candidate tweet source id - related to DISPLAY_SUGGEST_TYPE above, but this is a - // property of the candidate rather than display location so is safe to use - // in model training, unlike DISPLAY_SUGGEST_TYPE. - val CANDIDATE_TWEET_SOURCE_ID = - new Discrete(name("timelines.meta.candidate_tweet_source_id"), Set(TweetId).asJava) - - // Was at least 50% of this tweet in the user's viewport for at least 500 ms, - // OR did the user engage with the tweet publicly or privately - val IS_LINGER_IMPRESSION = - new Binary(name("timelines.engagement.is_linger_impression"), Set(EngagementsPrivate).asJava) - - // Features to create rollups - val LANGUAGE_GROUP = new Discrete(name("timelines.tweet.text.language_group")) - - // The final position index of the tweet being trained on in the timeline - // served from TLM (could still change later in TLS-API), as recorded by - // PositionIndexLoggingEnvelopeTransform. - val FINAL_POSITION_INDEX = new Discrete(name("timelines.display.final_position_index")) - - // The traceId of the timeline request, can be used to group tweets in the same response. - val TRACE_ID = new Discrete(name("timelines.display.trace_id"), Set(TfeTransactionId).asJava) - - // Whether this tweet was randomly injected into the timeline or not, for exploration purposes - val IS_RANDOM_TWEET = new Binary(name("timelines.display.is_random_tweet")) - - // Whether this tweet was reordered with softmax ranking for explore/exploit, and needs to - // be excluded from exploit only holdback - val IS_SOFTMAX_RANKING_TWEET = new Binary(name("timelines.display.is_softmax_ranking_tweet")) - - // Whether the user viewing the tweet has disabled ranked timeline. - val IS_RANKED_TIMELINE_DISABLER = new Binary( - name("timelines.user_features.is_ranked_timeline_disabler"), - Set(AnnotationValue, GeneralSettings).asJava) - - // Whether the user viewing the tweet was one of those released from DDG 4205 control - // as part of http://go/shrink-4205 process to shrink the quality features holdback. - val IS_USER_RELEASED_FROM_QUALITY_HOLDBACK = new Binary( - name("timelines.user_features.is_released_from_quality_holdback"), - Set(ExperimentId, ExperimentName).asJava) - - val INITIAL_PREDICTION_FAV = - new Continuous(name("timelines.initial_prediction.fav"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_RETWEET = - new Continuous(name("timelines.initial_prediction.retweet"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_REPLY = - new Continuous(name("timelines.initial_prediction.reply"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_OPEN_LINK = - new Continuous(name("timelines.initial_prediction.open_link"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_PROFILE_CLICK = - new Continuous(name("timelines.initial_prediction.profile_click"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_VIDEO_PLAYBACK_50 = new Continuous( - name("timelines.initial_prediction.video_playback_50"), - Set(EngagementScore).asJava) - val INITIAL_PREDICTION_DETAIL_EXPAND = - new Continuous(name("timelines.initial_prediction.detail_expand"), Set(EngagementScore).asJava) - val INITIAL_PREDICTION_PHOTO_EXPAND = - new Continuous(name("timelines.initial_prediction.photo_expand"), Set(EngagementScore).asJava) - - val VIEWER_FOLLOWS_ORIGINAL_AUTHOR = - new Binary(name("timelines.viewer_follows_original_author"), Set(Follow).asJava) - - val IS_TOP_ONE = new Binary(name("timelines.position.is_top_one")) - val IS_TOP_FIVE = - new Binary(name(featureName = "timelines.position.is_top_five")) - val IS_TOP_TEN = - new Binary(name(featureName = "timelines.position.is_top_ten")) - - val LOG_POSITION = - new Continuous(name(featureName = "timelines.position.log_10")) - -} diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD deleted file mode 100644 index f6caadea0..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/timelineservice/server/suggests/features/engagement_features:thrift-scala", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/transforms", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD.docx new file mode 100644 index 000000000..b1514452c Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/engagement_features/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.docx new file mode 100644 index 000000000..d53ff9087 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala deleted file mode 100644 index e65c9db20..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/engagement_features/EngagementFeatures.scala +++ /dev/null @@ -1,246 +0,0 @@ -package com.twitter.timelines.prediction.features.engagement_features - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.logging.Logger -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.Feature -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.SparseBinary -import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform -import com.twitter.timelines.data_processing.ml_util.transforms.RichITransform -import com.twitter.timelines.data_processing.ml_util.transforms.SparseBinaryUnion -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup -import com.twitter.timelineservice.suggests.features.engagement_features.thriftscala.{ - EngagementFeatures => ThriftEngagementFeatures -} -import com.twitter.timelineservice.suggests.features.engagement_features.v1.thriftscala.{ - EngagementFeatures => ThriftEngagementFeaturesV1 -} -import scala.collection.JavaConverters._ - -object EngagementFeatures { - private[this] val logger = Logger.get(getClass.getSimpleName) - - sealed trait EngagementFeature - case object Count extends EngagementFeature - case object RealGraphWeightAverage extends EngagementFeature - case object RealGraphWeightMax extends EngagementFeature - case object RealGraphWeightMin extends EngagementFeature - case object RealGraphWeightMissing extends EngagementFeature - case object RealGraphWeightVariance extends EngagementFeature - case object UserIds extends EngagementFeature - - def fromThrift(thriftEngagementFeatures: ThriftEngagementFeatures): Option[EngagementFeatures] = { - thriftEngagementFeatures match { - case thriftEngagementFeaturesV1: ThriftEngagementFeatures.V1 => - Some( - EngagementFeatures( - favoritedBy = thriftEngagementFeaturesV1.v1.favoritedBy, - retweetedBy = thriftEngagementFeaturesV1.v1.retweetedBy, - repliedBy = thriftEngagementFeaturesV1.v1.repliedBy, - ) - ) - case _ => { - logger.error("Unexpected EngagementFeatures version found.") - None - } - } - } - - val empty: EngagementFeatures = EngagementFeatures() -} - -/** - * Contains user IDs who have engaged with a target entity, such as a Tweet, - * and any additional data needed for derived features. - */ -case class EngagementFeatures( - favoritedBy: Seq[Long] = Nil, - retweetedBy: Seq[Long] = Nil, - repliedBy: Seq[Long] = Nil, - realGraphWeightByUser: Map[Long, Double] = Map.empty) { - def isEmpty: Boolean = favoritedBy.isEmpty && retweetedBy.isEmpty && repliedBy.isEmpty - def nonEmpty: Boolean = !isEmpty - def toLogThrift: ThriftEngagementFeatures.V1 = - ThriftEngagementFeatures.V1( - ThriftEngagementFeaturesV1( - favoritedBy = favoritedBy, - retweetedBy = retweetedBy, - repliedBy = repliedBy - ) - ) -} - -/** - * Represents engagement features derived from the Real Graph weight. - * - * These features are from the perspective of the source user, who is viewing their - * timeline, to the destination users (or user), who created engagements. - * - * @param count number of engagements present - * @param max max score of the engaging users - * @param mean average score of the engaging users - * @param min minimum score of the engaging users - * @param missing for engagements present, how many Real Graph scores were missing - * @param variance variance of scores of the engaging users - */ -case class RealGraphDerivedEngagementFeatures( - count: Int, - max: Double, - mean: Double, - min: Double, - missing: Int, - variance: Double) - -object EngagementDataRecordFeatures { - import EngagementFeatures._ - - val FavoritedByUserIds = new SparseBinary( - "engagement_features.user_ids.favorited_by", - Set(UserId, PrivateLikes, PublicLikes).asJava) - val RetweetedByUserIds = new SparseBinary( - "engagement_features.user_ids.retweeted_by", - Set(UserId, PrivateRetweets, PublicRetweets).asJava) - val RepliedByUserIds = new SparseBinary( - "engagement_features.user_ids.replied_by", - Set(UserId, PrivateReplies, PublicReplies).asJava) - - val InNetworkFavoritesCount = new Continuous( - "engagement_features.in_network.favorites.count", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val InNetworkRetweetsCount = new Continuous( - "engagement_features.in_network.retweets.count", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val InNetworkRepliesCount = new Continuous( - "engagement_features.in_network.replies.count", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - - // real graph derived features - val InNetworkFavoritesAvgRealGraphWeight = new Continuous( - "engagement_features.real_graph.favorites.avg_weight", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val InNetworkFavoritesMaxRealGraphWeight = new Continuous( - "engagement_features.real_graph.favorites.max_weight", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val InNetworkFavoritesMinRealGraphWeight = new Continuous( - "engagement_features.real_graph.favorites.min_weight", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val InNetworkFavoritesRealGraphWeightMissing = new Continuous( - "engagement_features.real_graph.favorites.missing" - ) - val InNetworkFavoritesRealGraphWeightVariance = new Continuous( - "engagement_features.real_graph.favorites.weight_variance" - ) - - val InNetworkRetweetsMaxRealGraphWeight = new Continuous( - "engagement_features.real_graph.retweets.max_weight", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val InNetworkRetweetsMinRealGraphWeight = new Continuous( - "engagement_features.real_graph.retweets.min_weight", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val InNetworkRetweetsAvgRealGraphWeight = new Continuous( - "engagement_features.real_graph.retweets.avg_weight", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val InNetworkRetweetsRealGraphWeightMissing = new Continuous( - "engagement_features.real_graph.retweets.missing" - ) - val InNetworkRetweetsRealGraphWeightVariance = new Continuous( - "engagement_features.real_graph.retweets.weight_variance" - ) - - val InNetworkRepliesMaxRealGraphWeight = new Continuous( - "engagement_features.real_graph.replies.max_weight", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val InNetworkRepliesMinRealGraphWeight = new Continuous( - "engagement_features.real_graph.replies.min_weight", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val InNetworkRepliesAvgRealGraphWeight = new Continuous( - "engagement_features.real_graph.replies.avg_weight", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val InNetworkRepliesRealGraphWeightMissing = new Continuous( - "engagement_features.real_graph.replies.missing" - ) - val InNetworkRepliesRealGraphWeightVariance = new Continuous( - "engagement_features.real_graph.replies.weight_variance" - ) - - sealed trait FeatureGroup { - def continuousFeatures: Map[EngagementFeature, Continuous] - def sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] - def allFeatures: Seq[Feature[_]] = - (continuousFeatures.values ++ sparseBinaryFeatures.values).toSeq - } - - case object Favorites extends FeatureGroup { - override val continuousFeatures: Map[EngagementFeature, Continuous] = - Map( - Count -> InNetworkFavoritesCount, - RealGraphWeightAverage -> InNetworkFavoritesAvgRealGraphWeight, - RealGraphWeightMax -> InNetworkFavoritesMaxRealGraphWeight, - RealGraphWeightMin -> InNetworkFavoritesMinRealGraphWeight, - RealGraphWeightMissing -> InNetworkFavoritesRealGraphWeightMissing, - RealGraphWeightVariance -> InNetworkFavoritesRealGraphWeightVariance - ) - - override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = - Map(UserIds -> FavoritedByUserIds) - } - - case object Retweets extends FeatureGroup { - override val continuousFeatures: Map[EngagementFeature, Continuous] = - Map( - Count -> InNetworkRetweetsCount, - RealGraphWeightAverage -> InNetworkRetweetsAvgRealGraphWeight, - RealGraphWeightMax -> InNetworkRetweetsMaxRealGraphWeight, - RealGraphWeightMin -> InNetworkRetweetsMinRealGraphWeight, - RealGraphWeightMissing -> InNetworkRetweetsRealGraphWeightMissing, - RealGraphWeightVariance -> InNetworkRetweetsRealGraphWeightVariance - ) - - override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = - Map(UserIds -> RetweetedByUserIds) - } - - case object Replies extends FeatureGroup { - override val continuousFeatures: Map[EngagementFeature, Continuous] = - Map( - Count -> InNetworkRepliesCount, - RealGraphWeightAverage -> InNetworkRepliesAvgRealGraphWeight, - RealGraphWeightMax -> InNetworkRepliesMaxRealGraphWeight, - RealGraphWeightMin -> InNetworkRepliesMinRealGraphWeight, - RealGraphWeightMissing -> InNetworkRepliesRealGraphWeightMissing, - RealGraphWeightVariance -> InNetworkRepliesRealGraphWeightVariance - ) - - override val sparseBinaryFeatures: Map[EngagementFeature, SparseBinary] = - Map(UserIds -> RepliedByUserIds) - } - - val PublicEngagerSets = Set(FavoritedByUserIds, RetweetedByUserIds, RepliedByUserIds) - val PublicEngagementUserIds = new SparseBinary( - "engagement_features.user_ids.public", - Set(UserId, EngagementsPublic).asJava - ) - val ENGAGER_ID = TypedAggregateGroup.sparseFeature(PublicEngagementUserIds) - - val UnifyPublicEngagersTransform = SparseBinaryUnion( - featuresToUnify = PublicEngagerSets, - outputFeature = PublicEngagementUserIds - ) - - object RichUnifyPublicEngagersTransform extends OneToSomeTransform { - override def apply(dataRecord: DataRecord): Option[DataRecord] = - RichITransform(EngagementDataRecordFeatures.UnifyPublicEngagersTransform)(dataRecord) - override def featuresToTransform: Set[Feature[_]] = - EngagementDataRecordFeatures.UnifyPublicEngagersTransform.featuresToUnify.toSet - } -} diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD b/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD deleted file mode 100644 index c28786b77..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/tweetypie:tweet-scala", - ], -) - -scala_library( - name = "escherbird-features", - sources = ["EscherbirdFeatures.scala"], - tags = ["bazel-only"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD.docx new file mode 100644 index 000000000..1e0d74f5f Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/escherbird/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.docx new file mode 100644 index 000000000..4c5e192bc Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.scala deleted file mode 100644 index 3aaf9b856..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeatures.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.timelines.prediction.features.escherbird - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature -import java.util.{Set => JSet} -import scala.collection.JavaConverters._ - -object EscherbirdFeatures { - val TweetGroupIds = new Feature.SparseBinary("escherbird.tweet_group_ids") - val TweetDomainIds = new Feature.SparseBinary("escherbird.tweet_domain_ids", Set(DomainId).asJava) - val TweetEntityIds = - new Feature.SparseBinary("escherbird.tweet_entity_ids", Set(SemanticcoreClassification).asJava) -} - -case class EscherbirdFeatures( - tweetId: Long, - tweetGroupIds: JSet[String], - tweetDomainIds: JSet[String], - tweetEntityIds: JSet[String]) diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.docx b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.docx new file mode 100644 index 000000000..ed3ea3ceb Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.scala b/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.scala deleted file mode 100644 index bd3333a03..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/escherbird/EscherbirdFeaturesConverter.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.timelines.prediction.features.escherbird - -import com.twitter.tweetypie.thriftscala.Tweet -import scala.collection.JavaConverters._ - -object EscherbirdFeaturesConverter { - val DeprecatedOrTestDomains = Set(1L, 5L, 7L, 9L, 14L, 19L, 20L, 31L) - - def fromTweet(tweet: Tweet): Option[EscherbirdFeatures] = tweet.escherbirdEntityAnnotations.map { - escherbirdEntityAnnotations => - val annotations = escherbirdEntityAnnotations.entityAnnotations - .filterNot(annotation => DeprecatedOrTestDomains.contains(annotation.domainId)) - val tweetGroupIds = annotations.map(_.groupId.toString).toSet.asJava - val tweetDomainIds = annotations.map(_.domainId.toString).toSet.asJava - // An entity is only unique within a given domain - val tweetEntityIds = annotations.map(a => s"${a.domainId}.${a.entityId}").toSet.asJava - EscherbirdFeatures(tweet.id, tweetGroupIds, tweetDomainIds, tweetEntityIds) - } -} diff --git a/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.bazel b/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.bazel deleted file mode 100644 index 0ee33acdb..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.bazel +++ /dev/null @@ -1,7 +0,0 @@ -scala_library( - sources = ["*.scala"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.docx new file mode 100644 index 000000000..4cf9ff82a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/followsource/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.docx new file mode 100644 index 000000000..288af8c6a Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.scala deleted file mode 100644 index 012103b14..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/followsource/FollowSourceFeatures.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.timelines.prediction.features.followsource - -import com.twitter.ml.api.Feature -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import scala.collection.JavaConverters._ - -object FollowSourceFeatures { - - // Corresponds to an algorithm constant from com.twitter.hermit.profile.HermitProfileConstants - val FollowSourceAlgorithm = new Feature.Text("follow_source.algorithm") - - // Type of follow action: one of "unfollow", "follow", "follow_back", "follow_many", "follow_all" - val FollowAction = new Feature.Text( - "follow_source.action", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - - // Millisecond timestamp when follow occurred - val FollowTimestamp = - new Feature.Discrete("follow_source.follow_timestamp", Set(Follow, PrivateTimestamp).asJava) - - // Age of follow (in minutes) - val FollowAgeMinutes = - new Feature.Continuous("follow_source.follow_age_minutes", Set(Follow).asJava) - - // Tweet ID of tweet details page from where follow happened (if applicable) - val FollowCauseTweetId = new Feature.Discrete("follow_source.cause_tweet_id", Set(TweetId).asJava) - - // String representation of follow client (android, web, iphone, etc). Derived from "client" - // portion of client event namespace. - val FollowClientId = new Feature.Text("follow_source.client_id", Set(ClientType).asJava) - - // If the follow happens via a profile's Following or Followers, - // the id of the profile owner is recorded here. - val FollowAssociationId = - new Feature.Discrete("follow_source.association_id", Set(Follow, UserId).asJava) - - // The "friendly name" here is computed using FollowSourceUtil.getSource. It represents - // a grouping on a few client events that reflect where the event occurred. For example, - // events on the tweet details page are grouped using "tweetDetails": - // case (Some("web"), Some("permalink"), _, _, _) => "tweetDetails" - // case (Some("iphone"), Some("tweet"), _, _, _) => "tweetDetails" - // case (Some("android"), Some("tweet"), _, _, _) => "tweetDetails" - val FollowSourceFriendlyName = new Feature.Text("follow_source.friendly_name", Set(Follow).asJava) - - // Up to two sources and actions that preceded the follow (for example, a profile visit - // through a mention click, which itself was on a tweet detail page reached through a tweet - // click in the Home tab). See go/followsource for more details and examples. - // The "source" here is computed using FollowSourceUtil.getSource - val PreFollowAction1 = new Feature.Text("follow_source.pre_follow_action_1", Set(Follow).asJava) - val PreFollowAction2 = new Feature.Text("follow_source.pre_follow_action_2", Set(Follow).asJava) - val PreFollowSource1 = new Feature.Text("follow_source.pre_follow_source_1", Set(Follow).asJava) - val PreFollowSource2 = new Feature.Text("follow_source.pre_follow_source_2", Set(Follow).asJava) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/itl/BUILD b/src/scala/com/twitter/timelines/prediction/features/itl/BUILD deleted file mode 100644 index 6fc497bf3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/itl/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/itl/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/itl/BUILD.docx new file mode 100644 index 000000000..7e4b99410 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/itl/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.docx new file mode 100644 index 000000000..693437c80 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.scala deleted file mode 100644 index 3351e5c11..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/itl/ITLFeatures.scala +++ /dev/null @@ -1,575 +0,0 @@ -package com.twitter.timelines.prediction.features.itl - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.Discrete -import com.twitter.ml.api.Feature.SparseBinary -import scala.collection.JavaConverters._ - -object ITLFeatures { - // engagement - val IS_RETWEETED = - new Binary("itl.engagement.is_retweeted", Set(PublicRetweets, PrivateRetweets).asJava) - val IS_FAVORITED = - new Binary("itl.engagement.is_favorited", Set(PublicLikes, PrivateLikes).asJava) - val IS_REPLIED = - new Binary("itl.engagement.is_replied", Set(PublicReplies, PrivateReplies).asJava) - // v1: post click engagements: fav, reply - val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_favorited_or_replied", - Set( - PublicLikes, - PrivateLikes, - PublicReplies, - PrivateReplies, - EngagementsPrivate, - EngagementsPublic).asJava) - // v2: post click engagements: click - val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_v2", - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_favorited", - Set(PublicLikes, PrivateLikes).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_replied", - Set(PublicReplies, PrivateReplies).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_retweeted", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_clicked", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = - new Binary("itl.engagement.is_good_clicked_convo_desc_followed", Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_share_dm_clicked", - Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( - "itl.engagement.is_good_clicked_convo_desc_profile_clicked", - Set(EngagementsPrivate).asJava) - - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_uam_gt_0", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_uam_gt_1", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_uam_gt_2", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( - "itl.engagement.is_good_clicked_convo_desc_uam_gt_3", - Set(EngagementsPrivate, EngagementsPublic).asJava) - - val IS_TWEET_DETAIL_DWELLED = new Binary( - "itl.engagement.is_tweet_detail_dwelled", - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( - "itl.engagement.is_tweet_detail_dwelled_8_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( - "itl.engagement.is_tweet_detail_dwelled_15_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( - "itl.engagement.is_tweet_detail_dwelled_25_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( - "itl.engagement.is_tweet_detail_dwelled_30_sec", - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_PROFILE_DWELLED = new Binary( - "itl.engagement.is_profile_dwelled", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_10_SEC = new Binary( - "itl.engagement.is_profile_dwelled_10_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_20_SEC = new Binary( - "itl.engagement.is_profile_dwelled_20_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_30_SEC = new Binary( - "itl.engagement.is_profile_dwelled_30_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( - "itl.engagement.is_fullscreen_video_dwelled", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( - "itl.engagement.is_fullscreen_video_dwelled_5_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( - "itl.engagement.is_fullscreen_video_dwelled_10_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( - "itl.engagement.is_fullscreen_video_dwelled_20_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( - "itl.engagement.is_fullscreen_video_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_15_SEC = new Binary( - "itl.engagement.is_link_dwelled_15_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_30_SEC = new Binary( - "itl.engagement.is_link_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_60_SEC = new Binary( - "itl.engagement.is_link_dwelled_60_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_QUOTED = - new Binary("itl.engagement.is_quoted", Set(PublicRetweets, PrivateRetweets).asJava) - val IS_RETWEETED_WITHOUT_QUOTE = new Binary( - "itl.engagement.is_retweeted_without_quote", - Set(PublicRetweets, PrivateRetweets).asJava) - val IS_CLICKED = new Binary( - "itl.engagement.is_clicked", - Set(EngagementsPrivate, TweetsClicked, LinksClickedOn).asJava) - val IS_PROFILE_CLICKED = new Binary( - "itl.engagement.is_profile_clicked", - Set(EngagementsPrivate, TweetsClicked, ProfilesViewed, ProfilesClicked).asJava) - val IS_DWELLED = new Binary("itl.engagement.is_dwelled", Set(EngagementsPrivate).asJava) - val IS_DWELLED_IN_BOUNDS_V1 = - new Binary("itl.engagement.is_dwelled_in_bounds_v1", Set(EngagementsPrivate).asJava) - val DWELL_NORMALIZED_OVERALL = - new Continuous("itl.engagement.dwell_normalized_overall", Set(EngagementsPrivate).asJava) - val DWELL_CDF_OVERALL = - new Continuous("itl.engagement.dwell_cdf_overall", Set(EngagementsPrivate).asJava) - val DWELL_CDF = new Continuous("itl.engagement.dwell_cdf", Set(EngagementsPrivate).asJava) - - val IS_DWELLED_1S = new Binary("itl.engagement.is_dwelled_1s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_2S = new Binary("itl.engagement.is_dwelled_2s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_3S = new Binary("itl.engagement.is_dwelled_3s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_4S = new Binary("itl.engagement.is_dwelled_4s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_5S = new Binary("itl.engagement.is_dwelled_5s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_6S = new Binary("itl.engagement.is_dwelled_6s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_7S = new Binary("itl.engagement.is_dwelled_7s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_8S = new Binary("itl.engagement.is_dwelled_8s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_9S = new Binary("itl.engagement.is_dwelled_9s", Set(EngagementsPrivate).asJava) - val IS_DWELLED_10S = new Binary("itl.engagement.is_dwelled_10s", Set(EngagementsPrivate).asJava) - - val IS_SKIPPED_1S = new Binary("itl.engagement.is_skipped_1s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_2S = new Binary("itl.engagement.is_skipped_2s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_3S = new Binary("itl.engagement.is_skipped_3s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_4S = new Binary("itl.engagement.is_skipped_4s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_5S = new Binary("itl.engagement.is_skipped_5s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_6S = new Binary("itl.engagement.is_skipped_6s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_7S = new Binary("itl.engagement.is_skipped_7s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_8S = new Binary("itl.engagement.is_skipped_8s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_9S = new Binary("itl.engagement.is_skipped_9s", Set(EngagementsPrivate).asJava) - val IS_SKIPPED_10S = new Binary("itl.engagement.is_skipped_10s", Set(EngagementsPrivate).asJava) - - val IS_FOLLOWED = - new Binary("itl.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_IMPRESSED = new Binary("itl.engagement.is_impressed", Set(EngagementsPrivate).asJava) - val IS_OPEN_LINKED = - new Binary("itl.engagement.is_open_linked", Set(EngagementsPrivate, LinksClickedOn).asJava) - val IS_PHOTO_EXPANDED = new Binary( - "itl.engagement.is_photo_expanded", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_VIDEO_VIEWED = - new Binary("itl.engagement.is_video_viewed", Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_VIDEO_PLAYBACK_50 = new Binary( - "itl.engagement.is_video_playback_50", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_VIDEO_QUALITY_VIEWED = new Binary( - "itl.engagement.is_video_quality_viewed", - Set(EngagementsPrivate, EngagementsPublic).asJava - ) - val IS_BOOKMARKED = - new Binary("itl.engagement.is_bookmarked", Set(EngagementsPrivate).asJava) - val IS_SHARED = - new Binary("itl.engagement.is_shared", Set(EngagementsPrivate).asJava) - val IS_SHARE_MENU_CLICKED = - new Binary("itl.engagement.is_share_menu_clicked", Set(EngagementsPrivate).asJava) - - // Negative engagements - val IS_DONT_LIKE = - new Binary("itl.engagement.is_dont_like", Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_BLOCK_CLICKED = new Binary( - "itl.engagement.is_block_clicked", - Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) - val IS_BLOCK_DIALOG_BLOCKED = new Binary( - "itl.engagement.is_block_dialog_blocked", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_MUTE_CLICKED = - new Binary("itl.engagement.is_mute_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_MUTE_DIALOG_MUTED = - new Binary("itl.engagement.is_mute_dialog_muted", Set(EngagementsPrivate).asJava) - val IS_REPORT_TWEET_CLICKED = new Binary( - "itl.engagement.is_report_tweet_clicked", - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_CARET_CLICKED = - new Binary("itl.engagement.is_caret_clicked", Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_NOT_ABOUT_TOPIC = - new Binary("itl.engagement.is_not_about_topic", Set(EngagementsPrivate).asJava) - val IS_NOT_RECENT = - new Binary("itl.engagement.is_not_recent", Set(EngagementsPrivate).asJava) - val IS_NOT_RELEVANT = - new Binary("itl.engagement.is_not_relevant", Set(EngagementsPrivate).asJava) - val IS_SEE_FEWER = - new Binary("itl.engagement.is_see_fewer", Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC = - new Binary("itl.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) - val IS_FOLLOW_TOPIC = - new Binary("itl.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN_TOPIC = - new Binary("itl.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) - val IS_HOME_LATEST_VISITED = - new Binary("itl.engagement.is_home_latest_visited", Set(EngagementsPrivate).asJava) - - // This derived label is the logical OR of IS_DONT_LIKE, IS_BLOCK_CLICKED, IS_MUTE_CLICKED and IS_REPORT_TWEET_CLICKED - val IS_NEGATIVE_FEEDBACK = - new Binary("itl.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) - - // Reciprocal engagements for reply forward engagement - val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_impressed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_favorited_by_author", - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_quoted_by_author", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_replied_by_author", - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_retweeted_by_author", - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_blocked_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_followed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_unfollowed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_muted_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_reported_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - - // This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED - val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary( - "itl.engagement.is_replied_reply_engaged_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava) - - // Reciprocal engagements for fav forward engagement - val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( - "itl.engagement.is_favorited_fav_favorited_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava - ) - val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( - "itl.engagement.is_favorited_fav_replied_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava - ) - val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( - "itl.engagement.is_favorited_fav_retweeted_by_author", - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava - ) - val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( - "itl.engagement.is_favorited_fav_followed_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava - ) - // This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED - val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary( - "itl.engagement.is_favorited_fav_engaged_by_author", - Set(EngagementsPrivate, EngagementsPublic).asJava - ) - - // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( - "itl.engagement.is_profile_clicked_and_profile_follow", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( - "itl.engagement.is_profile_clicked_and_profile_fav", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( - "itl.engagement.is_profile_clicked_and_profile_reply", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( - "itl.engagement.is_profile_clicked_and_profile_retweet", - Set( - ProfilesViewed, - ProfilesClicked, - EngagementsPrivate, - PrivateRetweets, - PublicRetweets).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( - "itl.engagement.is_profile_clicked_and_profile_tweet_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( - "itl.engagement.is_profile_clicked_and_profile_share_dm_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of all binary features above - val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( - "itl.engagement.is_profile_clicked_and_profile_engaged", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) - - // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( - "itl.engagement.is_profile_clicked_and_profile_user_report_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( - "itl.engagement.is_profile_clicked_and_profile_tweet_report_click", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( - "itl.engagement.is_profile_clicked_and_profile_mute", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( - "itl.engagement.is_profile_clicked_and_profile_block", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of bad profile click engagements and existing negative feedback - val IS_NEGATIVE_FEEDBACK_V2 = new Binary( - "itl.engagement.is_negative_feedback_v2", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // engagement for following user from any surface area - val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( - "itl.engagement.is_followed_from_any_surface_area", - Set(EngagementsPublic, EngagementsPrivate).asJava) - - // Relevance prompt tweet engagements - val IS_RELEVANCE_PROMPT_YES_CLICKED = - new Binary("itl.engagement.is_relevance_prompt_yes_clicked", Set(EngagementsPrivate).asJava) - - // Reply downvote engagements - val IS_REPLY_DOWNVOTED = - new Binary("itl.engagement.is_reply_downvoted", Set(EngagementsPrivate).asJava) - val IS_REPLY_DOWNVOTE_REMOVED = - new Binary("itl.engagement.is_reply_downvote_removed", Set(EngagementsPrivate).asJava) - - // features from RecommendedTweet - val RECTWEET_SCORE = new Continuous("itl.recommended_tweet_features.rectweet_score") - val NUM_FAVORITING_USERS = new Continuous("itl.recommended_tweet_features.num_favoriting_users") - val NUM_FOLLOWING_USERS = new Continuous("itl.recommended_tweet_features.num_following_users") - val CONTENT_SOURCE_TYPE = new Discrete("itl.recommended_tweet_features.content_source_type") - - val RECOS_SCORE = new Continuous( - "itl.recommended_tweet_features.recos_score", - Set(EngagementScore, UsersRealGraphScore, UsersSalsaScore).asJava) - val AUTHOR_REALGRAPH_SCORE = new Continuous( - "itl.recommended_tweet_features.realgraph_score", - Set(UsersRealGraphScore).asJava) - val AUTHOR_SARUS_SCORE = new Continuous( - "itl.recommended_tweet_features.sarus_score", - Set(EngagementScore, UsersSalsaScore).asJava) - - val NUM_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.num_interacting_users", - Set(EngagementScore).asJava - ) - val MAX_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.max_realgraph_score_of_interacting_users", - Set(UsersRealGraphScore, EngagementScore).asJava - ) - val SUM_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.sum_realgraph_score_of_interacting_users", - Set(UsersRealGraphScore, EngagementScore).asJava - ) - val AVG_REALGRAPH_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.avg_realgraph_score_of_interacting_users", - Set(UsersRealGraphScore, EngagementScore).asJava - ) - val MAX_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.max_sarus_score_of_interacting_users", - Set(EngagementScore, UsersSalsaScore).asJava - ) - val SUM_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.sum_sarus_score_of_interacting_users", - Set(EngagementScore, UsersSalsaScore).asJava - ) - val AVG_SARUS_SCORE_OF_INTERACTING_USERS = new Continuous( - "itl.recommended_tweet_features.avg_sarus_score_of_interacting_users", - Set(EngagementScore, UsersSalsaScore).asJava - ) - - val NUM_INTERACTING_FOLLOWINGS = new Continuous( - "itl.recommended_tweet_features.num_interacting_followings", - Set(EngagementScore).asJava - ) - - // features from HydratedTweetFeatures - val REAL_GRAPH_WEIGHT = - new Continuous("itl.hydrated_tweet_features.real_graph_weight", Set(UsersRealGraphScore).asJava) - val SARUS_GRAPH_WEIGHT = new Continuous("itl.hydrated_tweet_features.sarus_graph_weight") - val FROM_TOP_ENGAGED_USER = new Binary("itl.hydrated_tweet_features.from_top_engaged_user") - val FROM_TOP_INFLUENCER = new Binary("itl.hydrated_tweet_features.from_top_influencer") - val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR = new Continuous( - "itl.hydrated_tweet_features.topic_sim_searcher_interested_in_author_known_for" - ) - val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous( - "itl.hydrated_tweet_features.topic_sim_searcher_author_both_interested_in" - ) - val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous( - "itl.hydrated_tweet_features.topic_sim_searcher_author_both_known_for" - ) - val USER_REP = new Continuous("itl.hydrated_tweet_features.user_rep") - val NORMALIZED_PARUS_SCORE = new Continuous("itl.hydrated_tweet_features.normalized_parus_score") - val CONTAINS_MEDIA = new Binary("itl.hydrated_tweet_features.contains_media") - val FROM_NEARBY = new Binary("itl.hydrated_tweet_features.from_nearby") - val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous( - "itl.hydrated_tweet_features.topic_sim_searcher_interested_in_tweet" - ) - val MATCHES_UI_LANG = new Binary( - "itl.hydrated_tweet_features.matches_ui_lang", - Set(ProvidedLanguage, InferredLanguage).asJava) - val MATCHES_SEARCHER_MAIN_LANG = new Binary( - "itl.hydrated_tweet_features.matches_searcher_main_lang", - Set(ProvidedLanguage, InferredLanguage).asJava - ) - val MATCHES_SEARCHER_LANGS = new Binary( - "itl.hydrated_tweet_features.matches_searcher_langs", - Set(ProvidedLanguage, InferredLanguage).asJava) - val HAS_CARD = new Binary( - "itl.hydrated_tweet_features.has_card", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_IMAGE = new Binary( - "itl.hydrated_tweet_features.has_image", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NATIVE_IMAGE = new Binary( - "itl.hydrated_tweet_features.has_native_image", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VIDEO = new Binary("itl.hydrated_tweet_features.has_video") - val HAS_CONSUMER_VIDEO = new Binary( - "itl.hydrated_tweet_features.has_consumer_video", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_PRO_VIDEO = new Binary( - "itl.hydrated_tweet_features.has_pro_video", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_PERISCOPE = new Binary( - "itl.hydrated_tweet_features.has_periscope", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VINE = new Binary( - "itl.hydrated_tweet_features.has_vine", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NATIVE_VIDEO = new Binary( - "itl.hydrated_tweet_features.has_native_video", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_LINK = new Binary( - "itl.hydrated_tweet_features.has_link", - Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val LINK_COUNT = new Continuous( - "itl.hydrated_tweet_features.link_count", - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val URL_DOMAINS = new SparseBinary( - "itl.hydrated_tweet_features.url_domains", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VISIBLE_LINK = new Binary( - "itl.hydrated_tweet_features.has_visible_link", - Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NEWS = new Binary( - "itl.hydrated_tweet_features.has_news", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_TREND = new Binary( - "itl.hydrated_tweet_features.has_trend", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val BLENDER_SCORE = - new Continuous("itl.hydrated_tweet_features.blender_score", Set(EngagementScore).asJava) - val PARUS_SCORE = - new Continuous("itl.hydrated_tweet_features.parus_score", Set(EngagementScore).asJava) - val TEXT_SCORE = - new Continuous("itl.hydrated_tweet_features.text_score", Set(EngagementScore).asJava) - val BIDIRECTIONAL_REPLY_COUNT = new Continuous( - "itl.hydrated_tweet_features.bidirectional_reply_count", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val UNIDIRECTIONAL_REPLY_COUNT = new Continuous( - "itl.hydrated_tweet_features.unidirectional_reply_count", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val BIDIRECTIONAL_RETWEET_COUNT = new Continuous( - "itl.hydrated_tweet_features.bidirectional_retweet_count", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous( - "itl.hydrated_tweet_features.unidirectional_retweet_count", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val BIDIRECTIONAL_FAV_COUNT = new Continuous( - "itl.hydrated_tweet_features.bidirectional_fav_count", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val UNIDIRECTIONAL_FAV_COUNT = new Continuous( - "itl.hydrated_tweet_features.unidirectional_fav_count", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val CONVERSATION_COUNT = new Continuous("itl.hydrated_tweet_features.conversation_count") - val FAV_COUNT = new Continuous( - "itl.hydrated_tweet_features.fav_count", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val REPLY_COUNT = new Continuous( - "itl.hydrated_tweet_features.reply_count", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val RETWEET_COUNT = new Continuous( - "itl.hydrated_tweet_features.retweet_count", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val PREV_USER_TWEET_ENGAGEMENT = new Continuous( - "itl.hydrated_tweet_features.prev_user_tweet_enagagement", - Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava - ) - val IS_SENSITIVE = new Binary("itl.hydrated_tweet_features.is_sensitive") - val HAS_MULTIPLE_MEDIA = new Binary( - "itl.hydrated_tweet_features.has_multiple_media", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary( - "itl.hydrated_tweet_features.has_multiple_hashtag_or_trend", - Set( - UserVisibleFlag, - CountOfPrivateTweetEntitiesAndMetadata, - CountOfPublicTweetEntitiesAndMetadata).asJava) - val IS_AUTHOR_PROFILE_EGG = - new Binary("itl.hydrated_tweet_features.is_author_profile_egg", Set(ProfileImage).asJava) - val IS_AUTHOR_NEW = - new Binary("itl.hydrated_tweet_features.is_author_new", Set(UserType, UserState).asJava) - val NUM_MENTIONS = new Continuous( - "itl.hydrated_tweet_features.num_mentions", - Set( - UserVisibleFlag, - CountOfPrivateTweetEntitiesAndMetadata, - CountOfPublicTweetEntitiesAndMetadata).asJava) - val NUM_HASHTAGS = new Continuous( - "itl.hydrated_tweet_features.num_hashtags", - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val LANGUAGE = new Discrete( - "itl.hydrated_tweet_features.language", - Set(ProvidedLanguage, InferredLanguage).asJava) - val LINK_LANGUAGE = new Continuous( - "itl.hydrated_tweet_features.link_language", - Set(ProvidedLanguage, InferredLanguage).asJava) - val IS_AUTHOR_NSFW = - new Binary("itl.hydrated_tweet_features.is_author_nsfw", Set(UserType).asJava) - val IS_AUTHOR_SPAM = - new Binary("itl.hydrated_tweet_features.is_author_spam", Set(UserType).asJava) - val IS_AUTHOR_BOT = new Binary("itl.hydrated_tweet_features.is_author_bot", Set(UserType).asJava) - val IS_OFFENSIVE = new Binary("itl.hydrated_tweet_features.is_offensive") - val FROM_VERIFIED_ACCOUNT = - new Binary("itl.hydrated_tweet_features.from_verified_account", Set(UserVerifiedFlag).asJava) - val EMBEDS_IMPRESSION_COUNT = new Continuous( - "itl.hydrated_tweet_features.embeds_impression_count", - Set(CountOfImpression).asJava) - val EMBEDS_URL_COUNT = - new Continuous("itl.hydrated_tweet_features.embeds_url_count", Set(UrlFoundFlag).asJava) - val FAV_COUNT_V2 = new Continuous( - "recap.earlybird.fav_count_v2", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val RETWEET_COUNT_V2 = new Continuous( - "recap.earlybird.retweet_count_v2", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val REPLY_COUNT_V2 = new Continuous( - "recap.earlybird.reply_count_v2", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD deleted file mode 100644 index 6fc497bf3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD.docx new file mode 100644 index 000000000..7e4b99410 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/list_features/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.docx new file mode 100644 index 000000000..20b5c2485 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala deleted file mode 100644 index ffb00d1f6..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/list_features/ListFeatures.scala +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.timelines.prediction.features.list_features - -import com.twitter.ml.api.Feature.{Binary, Discrete} -import com.twitter.ml.api.FeatureContext -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import scala.collection.JavaConverters._ - -object ListFeatures { - - // list.id is used for list tweet injections in home. timelines.meta.list_id is used for list tweets in list timeline. - val LIST_ID = new Discrete("list.id") - - val VIEWER_IS_OWNER = - new Binary("list.viewer.is_owner", Set(ListsNonpublicList, ListsPublicList).asJava) - val VIEWER_IS_SUBSCRIBER = new Binary("list.viewer.is_subscriber") - val IS_PINNED_LIST = new Binary("list.is_pinned") - - val featureContext = new FeatureContext( - LIST_ID, - VIEWER_IS_OWNER, - VIEWER_IS_SUBSCRIBER, - IS_PINNED_LIST - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD deleted file mode 100644 index 6fc497bf3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD.docx new file mode 100644 index 000000000..7e4b99410 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.docx new file mode 100644 index 000000000..40c9badda Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala deleted file mode 100644 index 65d721a05..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/p_home_latest/HomeLatestUserFeatures.scala +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.timelines.prediction.features.p_home_latest - -import com.twitter.ml.api.Feature.{Continuous, Discrete} -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import scala.collection.JavaConverters._ - -object HomeLatestUserFeatures { - val LAST_LOGIN_TIMESTAMP_MS = - new Discrete("home_latest.user_feature.last_login_timestamp_ms", Set(PrivateTimestamp).asJava) -} - -object HomeLatestUserAggregatesFeatures { - - /** - * Used as `timestampFeature` in `OfflineAggregateSource` required by feature aggregations, set to - * the `dateRange` end timestamp by default - */ - val AGGREGATE_TIMESTAMP_MS = - new Discrete("home_latest.user_feature.aggregate_timestamp_ms", Set(PrivateTimestamp).asJava) - val HOME_TOP_IMPRESSIONS = - new Continuous("home_latest.user_feature.home_top_impressions", Set(CountOfImpression).asJava) - val HOME_LATEST_IMPRESSIONS = - new Continuous( - "home_latest.user_feature.home_latest_impressions", - Set(CountOfImpression).asJava) - val HOME_TOP_LAST_LOGIN_TIMESTAMP_MS = - new Discrete( - "home_latest.user_feature.home_top_last_login_timestamp_ms", - Set(PrivateTimestamp).asJava) - val HOME_LATEST_LAST_LOGIN_TIMESTAMP_MS = - new Discrete( - "home_latest.user_feature.home_latest_last_login_timestamp_ms", - Set(PrivateTimestamp).asJava) - val HOME_LATEST_MOST_RECENT_CLICK_TIMESTAMP_MS = - new Discrete( - "home_latest.user_feature.home_latest_most_recent_click_timestamp_ms", - Set(PrivateTimestamp).asJava) -} - -case class HomeLatestUserFeatures(userId: Long, lastLoginTimestampMs: Long) - -case class HomeLatestUserAggregatesFeatures( - userId: Long, - aggregateTimestampMs: Long, - homeTopImpressions: Option[Double], - homeLatestImpressions: Option[Double], - homeTopLastLoginTimestampMs: Option[Long], - homeLatestLastLoginTimestampMs: Option[Long], - homeLatestMostRecentClickTimestampMs: Option[Long]) diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD b/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD deleted file mode 100644 index babba31bb..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD +++ /dev/null @@ -1,8 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD.docx new file mode 100644 index 000000000..50c6d7101 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/ppmi/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.docx new file mode 100644 index 000000000..4750e81f2 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala deleted file mode 100644 index 7e6d1dea8..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/ppmi/PpmiFeatures.scala +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.timelines.prediction.features.ppmi - -import com.twitter.ml.api.Feature.Continuous - -object PpmiDataRecordFeatures { - val PPMI_SCORE = new Continuous("ppmi.source_author.score") -} diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD b/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD deleted file mode 100644 index 868acec21..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/featurestore/catalog/entities/core", - "src/scala/com/twitter/ml/featurestore/catalog/entities/timelines", - "src/scala/com/twitter/ml/featurestore/catalog/features/timelines:realgraph", - "src/scala/com/twitter/ml/featurestore/lib/entity", - "src/scala/com/twitter/ml/featurestore/lib/feature", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/timelines/real_graph:real_graph-scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD.docx new file mode 100644 index 000000000..202cce6a7 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/real_graph/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.docx new file mode 100644 index 000000000..683df0113 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala deleted file mode 100644 index 7c52349aa..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatureStoreFeatures.scala +++ /dev/null @@ -1,232 +0,0 @@ -package com.twitter.timelines.prediction.features.real_graph - -import com.twitter.ml.featurestore.catalog.entities.core.UserAuthor -import com.twitter.ml.featurestore.catalog.features.timelines.RealGraph -import com.twitter.ml.featurestore.lib.EdgeEntityId -import com.twitter.ml.featurestore.lib.UserId -import com.twitter.ml.featurestore.lib.feature.BoundFeatureSet -import com.twitter.ml.featurestore.lib.feature.Feature -import com.twitter.ml.featurestore.lib.feature.FeatureSet - -object RealGraphDataRecordFeatureStoreFeatures { - val boundUserAuthorfeatureSet: BoundFeatureSet = FeatureSet( - RealGraph.DestId, - RealGraph.AddressBookEmail.DaysSinceLast, - RealGraph.AddressBookEmail.ElapsedDays, - RealGraph.AddressBookEmail.Ewma, - RealGraph.AddressBookEmail.IsMissing, - RealGraph.AddressBookEmail.Mean, - RealGraph.AddressBookEmail.NonZeroDays, - RealGraph.AddressBookEmail.Variance, - RealGraph.AddressBookInBoth.DaysSinceLast, - RealGraph.AddressBookInBoth.ElapsedDays, - RealGraph.AddressBookInBoth.Ewma, - RealGraph.AddressBookInBoth.IsMissing, - RealGraph.AddressBookInBoth.Mean, - RealGraph.AddressBookInBoth.NonZeroDays, - RealGraph.AddressBookInBoth.Variance, - RealGraph.AddressBookMutualEdgeEmail.DaysSinceLast, - RealGraph.AddressBookMutualEdgeEmail.ElapsedDays, - RealGraph.AddressBookMutualEdgeEmail.Ewma, - RealGraph.AddressBookMutualEdgeEmail.IsMissing, - RealGraph.AddressBookMutualEdgeEmail.Mean, - RealGraph.AddressBookMutualEdgeEmail.NonZeroDays, - RealGraph.AddressBookMutualEdgeEmail.Variance, - RealGraph.AddressBookMutualEdgeInBoth.DaysSinceLast, - RealGraph.AddressBookMutualEdgeInBoth.ElapsedDays, - RealGraph.AddressBookMutualEdgeInBoth.Ewma, - RealGraph.AddressBookMutualEdgeInBoth.IsMissing, - RealGraph.AddressBookMutualEdgeInBoth.Mean, - RealGraph.AddressBookMutualEdgeInBoth.NonZeroDays, - RealGraph.AddressBookMutualEdgeInBoth.Variance, - RealGraph.AddressBookMutualEdgePhone.DaysSinceLast, - RealGraph.AddressBookMutualEdgePhone.ElapsedDays, - RealGraph.AddressBookMutualEdgePhone.Ewma, - RealGraph.AddressBookMutualEdgePhone.IsMissing, - RealGraph.AddressBookMutualEdgePhone.Mean, - RealGraph.AddressBookMutualEdgePhone.NonZeroDays, - RealGraph.AddressBookMutualEdgePhone.Variance, - RealGraph.AddressBookPhone.DaysSinceLast, - RealGraph.AddressBookPhone.ElapsedDays, - RealGraph.AddressBookPhone.Ewma, - RealGraph.AddressBookPhone.IsMissing, - RealGraph.AddressBookPhone.Mean, - RealGraph.AddressBookPhone.NonZeroDays, - RealGraph.AddressBookPhone.Variance, - RealGraph.DirectMessages.DaysSinceLast, - RealGraph.DirectMessages.ElapsedDays, - RealGraph.DirectMessages.Ewma, - RealGraph.DirectMessages.IsMissing, - RealGraph.DirectMessages.Mean, - RealGraph.DirectMessages.NonZeroDays, - RealGraph.DirectMessages.Variance, - RealGraph.DwellTime.DaysSinceLast, - RealGraph.DwellTime.ElapsedDays, - RealGraph.DwellTime.Ewma, - RealGraph.DwellTime.IsMissing, - RealGraph.DwellTime.Mean, - RealGraph.DwellTime.NonZeroDays, - RealGraph.DwellTime.Variance, - RealGraph.Follow.DaysSinceLast, - RealGraph.Follow.ElapsedDays, - RealGraph.Follow.Ewma, - RealGraph.Follow.IsMissing, - RealGraph.Follow.Mean, - RealGraph.Follow.NonZeroDays, - RealGraph.Follow.Variance, - RealGraph.InspectedStatuses.DaysSinceLast, - RealGraph.InspectedStatuses.ElapsedDays, - RealGraph.InspectedStatuses.Ewma, - RealGraph.InspectedStatuses.IsMissing, - RealGraph.InspectedStatuses.Mean, - RealGraph.InspectedStatuses.NonZeroDays, - RealGraph.InspectedStatuses.Variance, - RealGraph.Likes.DaysSinceLast, - RealGraph.Likes.ElapsedDays, - RealGraph.Likes.Ewma, - RealGraph.Likes.IsMissing, - RealGraph.Likes.Mean, - RealGraph.Likes.NonZeroDays, - RealGraph.Likes.Variance, - RealGraph.LinkClicks.DaysSinceLast, - RealGraph.LinkClicks.ElapsedDays, - RealGraph.LinkClicks.Ewma, - RealGraph.LinkClicks.IsMissing, - RealGraph.LinkClicks.Mean, - RealGraph.LinkClicks.NonZeroDays, - RealGraph.LinkClicks.Variance, - RealGraph.Mentions.DaysSinceLast, - RealGraph.Mentions.ElapsedDays, - RealGraph.Mentions.Ewma, - RealGraph.Mentions.IsMissing, - RealGraph.Mentions.Mean, - RealGraph.Mentions.NonZeroDays, - RealGraph.Mentions.Variance, - RealGraph.MutualFollow.DaysSinceLast, - RealGraph.MutualFollow.ElapsedDays, - RealGraph.MutualFollow.Ewma, - RealGraph.MutualFollow.IsMissing, - RealGraph.MutualFollow.Mean, - RealGraph.MutualFollow.NonZeroDays, - RealGraph.MutualFollow.Variance, - RealGraph.NumTweetQuotes.DaysSinceLast, - RealGraph.NumTweetQuotes.ElapsedDays, - RealGraph.NumTweetQuotes.Ewma, - RealGraph.NumTweetQuotes.IsMissing, - RealGraph.NumTweetQuotes.Mean, - RealGraph.NumTweetQuotes.NonZeroDays, - RealGraph.NumTweetQuotes.Variance, - RealGraph.PhotoTags.DaysSinceLast, - RealGraph.PhotoTags.ElapsedDays, - RealGraph.PhotoTags.Ewma, - RealGraph.PhotoTags.IsMissing, - RealGraph.PhotoTags.Mean, - RealGraph.PhotoTags.NonZeroDays, - RealGraph.PhotoTags.Variance, - RealGraph.ProfileViews.DaysSinceLast, - RealGraph.ProfileViews.ElapsedDays, - RealGraph.ProfileViews.Ewma, - RealGraph.ProfileViews.IsMissing, - RealGraph.ProfileViews.Mean, - RealGraph.ProfileViews.NonZeroDays, - RealGraph.ProfileViews.Variance, - RealGraph.Retweets.DaysSinceLast, - RealGraph.Retweets.ElapsedDays, - RealGraph.Retweets.Ewma, - RealGraph.Retweets.IsMissing, - RealGraph.Retweets.Mean, - RealGraph.Retweets.NonZeroDays, - RealGraph.Retweets.Variance, - RealGraph.SmsFollow.DaysSinceLast, - RealGraph.SmsFollow.ElapsedDays, - RealGraph.SmsFollow.Ewma, - RealGraph.SmsFollow.IsMissing, - RealGraph.SmsFollow.Mean, - RealGraph.SmsFollow.NonZeroDays, - RealGraph.SmsFollow.Variance, - RealGraph.TweetClicks.DaysSinceLast, - RealGraph.TweetClicks.ElapsedDays, - RealGraph.TweetClicks.Ewma, - RealGraph.TweetClicks.IsMissing, - RealGraph.TweetClicks.Mean, - RealGraph.TweetClicks.NonZeroDays, - RealGraph.TweetClicks.Variance, - RealGraph.Weight - ).bind(UserAuthor) - - private[this] val edgeFeatures: Seq[RealGraph.EdgeFeature] = Seq( - RealGraph.AddressBookEmail, - RealGraph.AddressBookInBoth, - RealGraph.AddressBookMutualEdgeEmail, - RealGraph.AddressBookMutualEdgeInBoth, - RealGraph.AddressBookMutualEdgePhone, - RealGraph.AddressBookPhone, - RealGraph.DirectMessages, - RealGraph.DwellTime, - RealGraph.Follow, - RealGraph.InspectedStatuses, - RealGraph.Likes, - RealGraph.LinkClicks, - RealGraph.Mentions, - RealGraph.MutualFollow, - RealGraph.PhotoTags, - RealGraph.ProfileViews, - RealGraph.Retweets, - RealGraph.SmsFollow, - RealGraph.TweetClicks - ) - - val htlDoubleFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Double]] = { - val features = edgeFeatures.flatMap { ef => - Seq(ef.Ewma, ef.Mean, ef.Variance) - } ++ Seq(RealGraph.Weight) - features.toSet - } - - val htlLongFeatures: Set[Feature[EdgeEntityId[UserId, UserId], Long]] = { - val features = edgeFeatures.flatMap { ef => - Seq(ef.DaysSinceLast, ef.ElapsedDays, ef.NonZeroDays) - } - features.toSet - } - - private val edgeFeatureToLegacyName = Map( - RealGraph.AddressBookEmail -> "num_address_book_email", - RealGraph.AddressBookInBoth -> "num_address_book_in_both", - RealGraph.AddressBookMutualEdgeEmail -> "num_address_book_mutual_edge_email", - RealGraph.AddressBookMutualEdgeInBoth -> "num_address_book_mutual_edge_in_both", - RealGraph.AddressBookMutualEdgePhone -> "num_address_book_mutual_edge_phone", - RealGraph.AddressBookPhone -> "num_address_book_phone", - RealGraph.DirectMessages -> "direct_messages", - RealGraph.DwellTime -> "total_dwell_time", - RealGraph.Follow -> "num_follow", - RealGraph.InspectedStatuses -> "num_inspected_tweets", - RealGraph.Likes -> "num_favorites", - RealGraph.LinkClicks -> "num_link_clicks", - RealGraph.Mentions -> "num_mentions", - RealGraph.MutualFollow -> "num_mutual_follow", - RealGraph.PhotoTags -> "num_photo_tags", - RealGraph.ProfileViews -> "num_profile_views", - RealGraph.Retweets -> "num_retweets", - RealGraph.SmsFollow -> "num_sms_follow", - RealGraph.TweetClicks -> "num_tweet_clicks", - ) - - def convertFeatureToLegacyName( - prefix: String, - variance: String = "variance" - ): Map[Feature[EdgeEntityId[UserId, UserId], _ >: Long with Double <: AnyVal], String] = - edgeFeatureToLegacyName.flatMap { - case (k, v) => - Seq( - k.NonZeroDays -> s"${prefix}.${v}.non_zero_days", - k.DaysSinceLast -> s"${prefix}.${v}.days_since_last", - k.ElapsedDays -> s"${prefix}.${v}.elapsed_days", - k.Ewma -> s"${prefix}.${v}.ewma", - k.Mean -> s"${prefix}.${v}.mean", - k.Variance -> s"${prefix}.${v}.${variance}", - ) - } ++ Map( - RealGraph.Weight -> (prefix + ".weight") - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.docx new file mode 100644 index 000000000..bf20c172c Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala deleted file mode 100644 index 4c1915944..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/real_graph/RealGraphDataRecordFeatures.scala +++ /dev/null @@ -1,534 +0,0 @@ -package com.twitter.timelines.prediction.features.real_graph - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature._ -import com.twitter.timelines.real_graph.v1.thriftscala.RealGraphEdgeFeature -import scala.collection.JavaConverters._ - - -object RealGraphDataRecordFeatures { - // the source user id - val SRC_ID = new Discrete("realgraph.src_id", Set(UserId).asJava) - // the destination user id - val DST_ID = new Discrete("realgraph.dst_id", Set(UserId).asJava) - // real graph weight - val WEIGHT = new Continuous("realgraph.weight", Set(UsersRealGraphScore).asJava) - // the number of retweets that the source user sent to the destination user - val NUM_RETWEETS_MEAN = - new Continuous("realgraph.num_retweets.mean", Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_EWMA = - new Continuous("realgraph.num_retweets.ewma", Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_VARIANCE = - new Continuous("realgraph.num_retweets.variance", Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_NON_ZERO_DAYS = new Continuous( - "realgraph.num_retweets.non_zero_days", - Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_ELAPSED_DAYS = new Continuous( - "realgraph.num_retweets.elapsed_days", - Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_retweets.days_since_last", - Set(PrivateRetweets, PublicRetweets).asJava) - val NUM_RETWEETS_IS_MISSING = - new Binary("realgraph.num_retweets.is_missing", Set(PrivateRetweets, PublicRetweets).asJava) - // the number of favories that the source user sent to the destination user - val NUM_FAVORITES_MEAN = - new Continuous("realgraph.num_favorites.mean", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_EWMA = - new Continuous("realgraph.num_favorites.ewma", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_VARIANCE = - new Continuous("realgraph.num_favorites.variance", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_NON_ZERO_DAYS = - new Continuous("realgraph.num_favorites.non_zero_days", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_ELAPSED_DAYS = - new Continuous("realgraph.num_favorites.elapsed_days", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_DAYS_SINCE_LAST = - new Continuous("realgraph.num_favorites.days_since_last", Set(PublicLikes, PrivateLikes).asJava) - val NUM_FAVORITES_IS_MISSING = - new Binary("realgraph.num_favorites.is_missing", Set(PublicLikes, PrivateLikes).asJava) - // the number of mentions that the source user sent to the destination user - val NUM_MENTIONS_MEAN = - new Continuous("realgraph.num_mentions.mean", Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_EWMA = - new Continuous("realgraph.num_mentions.ewma", Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_VARIANCE = new Continuous( - "realgraph.num_mentions.variance", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_NON_ZERO_DAYS = new Continuous( - "realgraph.num_mentions.non_zero_days", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_ELAPSED_DAYS = new Continuous( - "realgraph.num_mentions.elapsed_days", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_mentions.days_since_last", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_MENTIONS_IS_MISSING = new Binary( - "realgraph.num_mentions.is_missing", - Set(EngagementsPrivate, EngagementsPublic).asJava) - // the number of direct messages that the source user sent to the destination user - val NUM_DIRECT_MESSAGES_MEAN = new Continuous( - "realgraph.num_direct_messages.mean", - Set(DmEntitiesAndMetadata, CountOfDms).asJava) - val NUM_DIRECT_MESSAGES_EWMA = new Continuous( - "realgraph.num_direct_messages.ewma", - Set(DmEntitiesAndMetadata, CountOfDms).asJava) - val NUM_DIRECT_MESSAGES_VARIANCE = new Continuous( - "realgraph.num_direct_messages.variance", - Set(DmEntitiesAndMetadata, CountOfDms).asJava) - val NUM_DIRECT_MESSAGES_NON_ZERO_DAYS = new Continuous( - "realgraph.num_direct_messages.non_zero_days", - Set(DmEntitiesAndMetadata, CountOfDms).asJava - ) - val NUM_DIRECT_MESSAGES_ELAPSED_DAYS = new Continuous( - "realgraph.num_direct_messages.elapsed_days", - Set(DmEntitiesAndMetadata, CountOfDms).asJava - ) - val NUM_DIRECT_MESSAGES_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_direct_messages.days_since_last", - Set(DmEntitiesAndMetadata, CountOfDms).asJava - ) - val NUM_DIRECT_MESSAGES_IS_MISSING = new Binary( - "realgraph.num_direct_messages.is_missing", - Set(DmEntitiesAndMetadata, CountOfDms).asJava) - // the number of tweet clicks that the source user sent to the destination user - val NUM_TWEET_CLICKS_MEAN = - new Continuous("realgraph.num_tweet_clicks.mean", Set(TweetsClicked).asJava) - val NUM_TWEET_CLICKS_EWMA = - new Continuous("realgraph.num_tweet_clicks.ewma", Set(TweetsClicked).asJava) - val NUM_TWEET_CLICKS_VARIANCE = - new Continuous("realgraph.num_tweet_clicks.variance", Set(TweetsClicked).asJava) - val NUM_TWEET_CLICKS_NON_ZERO_DAYS = - new Continuous("realgraph.num_tweet_clicks.non_zero_days", Set(TweetsClicked).asJava) - val NUM_TWEET_CLICKS_ELAPSED_DAYS = - new Continuous("realgraph.num_tweet_clicks.elapsed_days", Set(TweetsClicked).asJava) - val NUM_TWEET_CLICKS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_tweet_clicks.days_since_last", - Set(TweetsClicked).asJava - ) - val NUM_TWEET_CLICKS_IS_MISSING = - new Binary("realgraph.num_tweet_clicks.is_missing", Set(TweetsClicked).asJava) - // the number of link clicks that the source user sent to the destination user - val NUM_LINK_CLICKS_MEAN = - new Continuous("realgraph.num_link_clicks.mean", Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_EWMA = - new Continuous("realgraph.num_link_clicks.ewma", Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_VARIANCE = - new Continuous("realgraph.num_link_clicks.variance", Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_NON_ZERO_DAYS = new Continuous( - "realgraph.num_link_clicks.non_zero_days", - Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_ELAPSED_DAYS = new Continuous( - "realgraph.num_link_clicks.elapsed_days", - Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_link_clicks.days_since_last", - Set(CountOfTweetEntitiesClicked).asJava) - val NUM_LINK_CLICKS_IS_MISSING = - new Binary("realgraph.num_link_clicks.is_missing", Set(CountOfTweetEntitiesClicked).asJava) - // the number of profile views that the source user sent to the destination user - val NUM_PROFILE_VIEWS_MEAN = - new Continuous("realgraph.num_profile_views.mean", Set(ProfilesViewed).asJava) - val NUM_PROFILE_VIEWS_EWMA = - new Continuous("realgraph.num_profile_views.ewma", Set(ProfilesViewed).asJava) - val NUM_PROFILE_VIEWS_VARIANCE = - new Continuous("realgraph.num_profile_views.variance", Set(ProfilesViewed).asJava) - val NUM_PROFILE_VIEWS_NON_ZERO_DAYS = - new Continuous("realgraph.num_profile_views.non_zero_days", Set(ProfilesViewed).asJava) - val NUM_PROFILE_VIEWS_ELAPSED_DAYS = - new Continuous("realgraph.num_profile_views.elapsed_days", Set(ProfilesViewed).asJava) - val NUM_PROFILE_VIEWS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_profile_views.days_since_last", - Set(ProfilesViewed).asJava - ) - val NUM_PROFILE_VIEWS_IS_MISSING = - new Binary("realgraph.num_profile_views.is_missing", Set(ProfilesViewed).asJava) - // the total dwell time the source user spends on the target user's tweets - val TOTAL_DWELL_TIME_MEAN = - new Continuous("realgraph.total_dwell_time.mean", Set(CountOfImpression).asJava) - val TOTAL_DWELL_TIME_EWMA = - new Continuous("realgraph.total_dwell_time.ewma", Set(CountOfImpression).asJava) - val TOTAL_DWELL_TIME_VARIANCE = - new Continuous("realgraph.total_dwell_time.variance", Set(CountOfImpression).asJava) - val TOTAL_DWELL_TIME_NON_ZERO_DAYS = - new Continuous("realgraph.total_dwell_time.non_zero_days", Set(CountOfImpression).asJava) - val TOTAL_DWELL_TIME_ELAPSED_DAYS = - new Continuous("realgraph.total_dwell_time.elapsed_days", Set(CountOfImpression).asJava) - val TOTAL_DWELL_TIME_DAYS_SINCE_LAST = new Continuous( - "realgraph.total_dwell_time.days_since_last", - Set(CountOfImpression).asJava - ) - val TOTAL_DWELL_TIME_IS_MISSING = - new Binary("realgraph.total_dwell_time.is_missing", Set(CountOfImpression).asJava) - // the number of the target user's tweets that the source user has inspected - val NUM_INSPECTED_TWEETS_MEAN = - new Continuous("realgraph.num_inspected_tweets.mean", Set(CountOfImpression).asJava) - val NUM_INSPECTED_TWEETS_EWMA = - new Continuous("realgraph.num_inspected_tweets.ewma", Set(CountOfImpression).asJava) - val NUM_INSPECTED_TWEETS_VARIANCE = - new Continuous("realgraph.num_inspected_tweets.variance", Set(CountOfImpression).asJava) - val NUM_INSPECTED_TWEETS_NON_ZERO_DAYS = new Continuous( - "realgraph.num_inspected_tweets.non_zero_days", - Set(CountOfImpression).asJava - ) - val NUM_INSPECTED_TWEETS_ELAPSED_DAYS = new Continuous( - "realgraph.num_inspected_tweets.elapsed_days", - Set(CountOfImpression).asJava - ) - val NUM_INSPECTED_TWEETS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_inspected_tweets.days_since_last", - Set(CountOfImpression).asJava - ) - val NUM_INSPECTED_TWEETS_IS_MISSING = - new Binary("realgraph.num_inspected_tweets.is_missing", Set(CountOfImpression).asJava) - // the number of photos in which the source user has tagged the target user - val NUM_PHOTO_TAGS_MEAN = new Continuous( - "realgraph.num_photo_tags.mean", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_EWMA = new Continuous( - "realgraph.num_photo_tags.ewma", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_VARIANCE = new Continuous( - "realgraph.num_photo_tags.variance", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_NON_ZERO_DAYS = new Continuous( - "realgraph.num_photo_tags.non_zero_days", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_ELAPSED_DAYS = new Continuous( - "realgraph.num_photo_tags.elapsed_days", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_photo_tags.days_since_last", - Set(EngagementsPrivate, EngagementsPublic).asJava) - val NUM_PHOTO_TAGS_IS_MISSING = new Binary( - "realgraph.num_photo_tags.is_missing", - Set(EngagementsPrivate, EngagementsPublic).asJava) - - val NUM_FOLLOW_MEAN = new Continuous( - "realgraph.num_follow.mean", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_EWMA = new Continuous( - "realgraph.num_follow.ewma", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_VARIANCE = new Continuous( - "realgraph.num_follow.variance", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_NON_ZERO_DAYS = new Continuous( - "realgraph.num_follow.non_zero_days", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_ELAPSED_DAYS = new Continuous( - "realgraph.num_follow.elapsed_days", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_follow.days_since_last", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_FOLLOW_IS_MISSING = new Binary( - "realgraph.num_follow.is_missing", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - // the number of blocks that the source user sent to the destination user - val NUM_BLOCKS_MEAN = - new Continuous("realgraph.num_blocks.mean", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_EWMA = - new Continuous("realgraph.num_blocks.ewma", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_VARIANCE = - new Continuous("realgraph.num_blocks.variance", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_NON_ZERO_DAYS = - new Continuous("realgraph.num_blocks.non_zero_days", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_ELAPSED_DAYS = - new Continuous("realgraph.num_blocks.elapsed_days", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_DAYS_SINCE_LAST = - new Continuous("realgraph.num_blocks.days_since_last", Set(CountOfBlocks).asJava) - val NUM_BLOCKS_IS_MISSING = - new Binary("realgraph.num_blocks.is_missing", Set(CountOfBlocks).asJava) - // the number of mutes that the source user sent to the destination user - val NUM_MUTES_MEAN = - new Continuous("realgraph.num_mutes.mean", Set(CountOfMutes).asJava) - val NUM_MUTES_EWMA = - new Continuous("realgraph.num_mutes.ewma", Set(CountOfMutes).asJava) - val NUM_MUTES_VARIANCE = - new Continuous("realgraph.num_mutes.variance", Set(CountOfMutes).asJava) - val NUM_MUTES_NON_ZERO_DAYS = - new Continuous("realgraph.num_mutes.non_zero_days", Set(CountOfMutes).asJava) - val NUM_MUTES_ELAPSED_DAYS = - new Continuous("realgraph.num_mutes.elapsed_days", Set(CountOfMutes).asJava) - val NUM_MUTES_DAYS_SINCE_LAST = - new Continuous("realgraph.num_mutes.days_since_last", Set(CountOfMutes).asJava) - val NUM_MUTES_IS_MISSING = - new Binary("realgraph.num_mutes.is_missing", Set(CountOfMutes).asJava) - // the number of report as abuses that the source user sent to the destination user - val NUM_REPORTS_AS_ABUSES_MEAN = - new Continuous("realgraph.num_report_as_abuses.mean", Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_EWMA = - new Continuous("realgraph.num_report_as_abuses.ewma", Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_VARIANCE = - new Continuous("realgraph.num_report_as_abuses.variance", Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_NON_ZERO_DAYS = - new Continuous("realgraph.num_report_as_abuses.non_zero_days", Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_ELAPSED_DAYS = - new Continuous("realgraph.num_report_as_abuses.elapsed_days", Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_DAYS_SINCE_LAST = - new Continuous( - "realgraph.num_report_as_abuses.days_since_last", - Set(CountOfAbuseReports).asJava) - val NUM_REPORTS_AS_ABUSES_IS_MISSING = - new Binary("realgraph.num_report_as_abuses.is_missing", Set(CountOfAbuseReports).asJava) - // the number of report as spams that the source user sent to the destination user - val NUM_REPORTS_AS_SPAMS_MEAN = - new Continuous( - "realgraph.num_report_as_spams.mean", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_EWMA = - new Continuous( - "realgraph.num_report_as_spams.ewma", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_VARIANCE = - new Continuous( - "realgraph.num_report_as_spams.variance", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_NON_ZERO_DAYS = - new Continuous( - "realgraph.num_report_as_spams.non_zero_days", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_ELAPSED_DAYS = - new Continuous( - "realgraph.num_report_as_spams.elapsed_days", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_DAYS_SINCE_LAST = - new Continuous( - "realgraph.num_report_as_spams.days_since_last", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - val NUM_REPORTS_AS_SPAMS_IS_MISSING = - new Binary( - "realgraph.num_report_as_spams.is_missing", - Set(CountOfAbuseReports, SafetyRelationships).asJava) - - val NUM_MUTUAL_FOLLOW_MEAN = new Continuous( - "realgraph.num_mutual_follow.mean", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_EWMA = new Continuous( - "realgraph.num_mutual_follow.ewma", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_VARIANCE = new Continuous( - "realgraph.num_mutual_follow.variance", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_NON_ZERO_DAYS = new Continuous( - "realgraph.num_mutual_follow.non_zero_days", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_ELAPSED_DAYS = new Continuous( - "realgraph.num_mutual_follow.elapsed_days", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_mutual_follow.days_since_last", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - val NUM_MUTUAL_FOLLOW_IS_MISSING = new Binary( - "realgraph.num_mutual_follow.is_missing", - Set( - Follow, - PrivateAccountsFollowedBy, - PublicAccountsFollowedBy, - PrivateAccountsFollowing, - PublicAccountsFollowing).asJava - ) - - val NUM_SMS_FOLLOW_MEAN = new Continuous( - "realgraph.num_sms_follow.mean", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_EWMA = new Continuous( - "realgraph.num_sms_follow.ewma", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_VARIANCE = new Continuous( - "realgraph.num_sms_follow.variance", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_NON_ZERO_DAYS = new Continuous( - "realgraph.num_sms_follow.non_zero_days", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_ELAPSED_DAYS = new Continuous( - "realgraph.num_sms_follow.elapsed_days", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_sms_follow.days_since_last", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - val NUM_SMS_FOLLOW_IS_MISSING = new Binary( - "realgraph.num_sms_follow.is_missing", - Set(Follow, PrivateAccountsFollowedBy, PublicAccountsFollowedBy).asJava) - - val NUM_ADDRESS_BOOK_EMAIL_MEAN = - new Continuous("realgraph.num_address_book_email.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_EMAIL_EWMA = - new Continuous("realgraph.num_address_book_email.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_EMAIL_VARIANCE = - new Continuous("realgraph.num_address_book_email.variance", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_EMAIL_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_email.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_EMAIL_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_email.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_EMAIL_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_email.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_EMAIL_IS_MISSING = - new Binary("realgraph.num_address_book_email.is_missing", Set(AddressBook).asJava) - - val NUM_ADDRESS_BOOK_IN_BOTH_MEAN = - new Continuous("realgraph.num_address_book_in_both.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_IN_BOTH_EWMA = - new Continuous("realgraph.num_address_book_in_both.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_IN_BOTH_VARIANCE = new Continuous( - "realgraph.num_address_book_in_both.variance", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_IN_BOTH_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_in_both.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_IN_BOTH_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_in_both.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_IN_BOTH_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_in_both.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_IN_BOTH_IS_MISSING = new Binary( - "realgraph.num_address_book_in_both.is_missing", - Set(AddressBook).asJava - ) - - val NUM_ADDRESS_BOOK_PHONE_MEAN = - new Continuous("realgraph.num_address_book_phone.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_PHONE_EWMA = - new Continuous("realgraph.num_address_book_phone.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_PHONE_VARIANCE = - new Continuous("realgraph.num_address_book_phone.variance", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_PHONE_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_phone.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_PHONE_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_phone.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_PHONE_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_phone.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_PHONE_IS_MISSING = - new Binary("realgraph.num_address_book_phone.is_missing", Set(AddressBook).asJava) - - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_MEAN = - new Continuous("realgraph.num_address_book_mutual_edge_email.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_EWMA = - new Continuous("realgraph.num_address_book_mutual_edge_email.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_VARIANCE = - new Continuous("realgraph.num_address_book_mutual_edge_email.variance", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_email.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_email.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_mutual_edge_email.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_EMAIL_IS_MISSING = - new Binary("realgraph.num_address_book_mutual_edge_email.is_missing", Set(AddressBook).asJava) - - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_MEAN = - new Continuous("realgraph.num_address_book_mutual_edge_in_both.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_EWMA = - new Continuous("realgraph.num_address_book_mutual_edge_in_both.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_VARIANCE = new Continuous( - "realgraph.num_address_book_mutual_edge_in_both.variance", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_in_both.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_in_both.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_mutual_edge_in_both.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_IN_BOTH_IS_MISSING = new Binary( - "realgraph.num_address_book_mutual_edge_in_both.is_missing", - Set(AddressBook).asJava - ) - - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_MEAN = - new Continuous("realgraph.num_address_book_mutual_edge_phone.mean", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_EWMA = - new Continuous("realgraph.num_address_book_mutual_edge_phone.ewma", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_VARIANCE = - new Continuous("realgraph.num_address_book_mutual_edge_phone.variance", Set(AddressBook).asJava) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_NON_ZERO_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_phone.non_zero_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_ELAPSED_DAYS = new Continuous( - "realgraph.num_address_book_mutual_edge_phone.elapsed_days", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_DAYS_SINCE_LAST = new Continuous( - "realgraph.num_address_book_mutual_edge_phone.days_since_last", - Set(AddressBook).asJava - ) - val NUM_ADDRESS_BOOK_MUTUAL_EDGE_PHONE_IS_MISSING = - new Binary("realgraph.num_address_book_mutual_edge_phone.is_missing", Set(AddressBook).asJava) -} - -case class RealGraphEdgeDataRecordFeatures( - edgeFeatureOpt: Option[RealGraphEdgeFeature], - meanFeature: Continuous, - ewmaFeature: Continuous, - varianceFeature: Continuous, - nonZeroDaysFeature: Continuous, - elapsedDaysFeature: Continuous, - daysSinceLastFeature: Continuous, - isMissingFeature: Binary) diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/BUILD b/src/scala/com/twitter/timelines/prediction/features/recap/BUILD deleted file mode 100644 index 6fc497bf3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/recap/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/recap/BUILD.docx new file mode 100644 index 000000000..7e4b99410 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/recap/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.docx new file mode 100644 index 000000000..48a7767d6 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala deleted file mode 100644 index c8ee6da7d..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeatures.scala +++ /dev/null @@ -1,967 +0,0 @@ -package com.twitter.timelines.prediction.features.recap - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.Discrete -import com.twitter.ml.api.Feature.SparseBinary -import com.twitter.ml.api.Feature.Text -import scala.collection.JavaConverters._ - -object RecapFeatures extends RecapFeatures("") -object InReplyToRecapFeatures extends RecapFeatures("in_reply_to_tweet") - -class RecapFeatures(prefix: String) { - private def name(featureName: String): String = { - if (prefix.nonEmpty) { - s"$prefix.$featureName" - } else { - featureName - } - } - - val IS_IPAD_CLIENT = new Binary(name("recap.client.is_ipad"), Set(ClientType).asJava) - val IS_WEB_CLIENT = new Binary(name("recap.client.is_web"), Set(ClientType).asJava) - val IS_IPHONE_CLIENT = new Binary(name("recap.client.is_phone"), Set(ClientType).asJava) - val IS_ANDROID_CLIENT = new Binary(name("recap.client.is_android"), Set(ClientType).asJava) - val IS_ANDROID_TABLET_CLIENT = - new Binary(name("recap.client.is_android_tablet"), Set(ClientType).asJava) - - // features from userAgent - val CLIENT_NAME = new Text(name("recap.user_agent.client_name"), Set(ClientType).asJava) - val CLIENT_SOURCE = new Discrete(name("recap.user_agent.client_source"), Set(ClientType).asJava) - val CLIENT_VERSION = new Text(name("recap.user_agent.client_version"), Set(ClientVersion).asJava) - val CLIENT_VERSION_CODE = - new Text(name("recap.user_agent.client_version_code"), Set(ClientVersion).asJava) - val DEVICE = new Text(name("recap.user_agent.device"), Set(DeviceType).asJava) - val FROM_DOG_FOOD = new Binary(name("recap.meta.from_dog_food"), Set(UserAgent).asJava) - val FROM_TWITTER_CLIENT = - new Binary(name("recap.user_agent.from_twitter_client"), Set(UserAgent).asJava) - val MANUFACTURER = new Text(name("recap.user_agent.manufacturer"), Set(UserAgent).asJava) - val MODEL = new Text(name("recap.user_agent.model"), Set(UserAgent).asJava) - val NETWORK_CONNECTION = - new Discrete(name("recap.user_agent.network_connection"), Set(UserAgent).asJava) - val SDK_VERSION = new Text(name("recap.user_agent.sdk_version"), Set(AppId, UserAgent).asJava) - - // engagement - val IS_RETWEETED = new Binary( - name("recap.engagement.is_retweeted"), - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_FAVORITED = new Binary( - name("recap.engagement.is_favorited"), - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED = new Binary( - name("recap.engagement.is_replied"), - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - // v1: post click engagements: fav, reply - val IS_GOOD_CLICKED_CONVO_DESC_V1 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_favorited_or_replied"), - Set( - PublicLikes, - PrivateLikes, - PublicReplies, - PrivateReplies, - EngagementsPrivate, - EngagementsPublic).asJava) - // v2: post click engagements: click - val IS_GOOD_CLICKED_CONVO_DESC_V2 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_v2"), - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_GOOD_CLICKED_CONVO_DESC_FAVORITED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_favorited"), - Set(PublicLikes, PrivateLikes, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_REPLIED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_replied"), - Set(PublicReplies, PrivateReplies, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_RETWEETED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_retweeted"), - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_CLICKED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_clicked"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_FOLLOWED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_followed"), - Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_SHARE_DM_CLICKED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_share_dm_clicked"), - Set(EngagementsPrivate).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_PROFILE_CLICKED = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_profile_clicked"), - Set(EngagementsPrivate).asJava) - - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_0 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_uam_gt_0"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_1 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_uam_gt_1"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_2 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_uam_gt_2"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_GOOD_CLICKED_CONVO_DESC_UAM_GT_3 = new Binary( - name("recap.engagement.is_good_clicked_convo_desc_uam_gt_3"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - - val IS_TWEET_DETAIL_DWELLED = new Binary( - name("recap.engagement.is_tweet_detail_dwelled"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_8_SEC = new Binary( - name("recap.engagement.is_tweet_detail_dwelled_8_sec"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_15_SEC = new Binary( - name("recap.engagement.is_tweet_detail_dwelled_15_sec"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_25_SEC = new Binary( - name("recap.engagement.is_tweet_detail_dwelled_25_sec"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_TWEET_DETAIL_DWELLED_30_SEC = new Binary( - name("recap.engagement.is_tweet_detail_dwelled_30_sec"), - Set(TweetsClicked, EngagementsPrivate).asJava) - - val IS_PROFILE_DWELLED = new Binary( - "recap.engagement.is_profile_dwelled", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_10_SEC = new Binary( - "recap.engagement.is_profile_dwelled_10_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_20_SEC = new Binary( - "recap.engagement.is_profile_dwelled_20_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_DWELLED_30_SEC = new Binary( - "recap.engagement.is_profile_dwelled_30_sec", - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED = new Binary( - "recap.engagement.is_fullscreen_video_dwelled", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Binary( - "recap.engagement.is_fullscreen_video_dwelled_5_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Binary( - "recap.engagement.is_fullscreen_video_dwelled_10_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Binary( - "recap.engagement.is_fullscreen_video_dwelled_20_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Binary( - "recap.engagement.is_fullscreen_video_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_15_SEC = new Binary( - "recap.engagement.is_link_dwelled_15_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_30_SEC = new Binary( - "recap.engagement.is_link_dwelled_30_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_LINK_DWELLED_60_SEC = new Binary( - "recap.engagement.is_link_dwelled_60_sec", - Set(MediaEngagementActivities, EngagementTypePrivate, EngagementsPrivate).asJava) - - val IS_QUOTED = new Binary( - name("recap.engagement.is_quoted"), - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_RETWEETED_WITHOUT_QUOTE = new Binary( - name("recap.engagement.is_retweeted_without_quote"), - Set(PublicRetweets, PrivateRetweets, EngagementsPrivate, EngagementsPublic).asJava) - val IS_CLICKED = - new Binary(name("recap.engagement.is_clicked"), Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_DWELLED = new Binary(name("recap.engagement.is_dwelled"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_IN_BOUNDS_V1 = - new Binary(name("recap.engagement.is_dwelled_in_bounds_v1"), Set(EngagementsPrivate).asJava) - val DWELL_NORMALIZED_OVERALL = new Continuous( - name("recap.engagement.dwell_normalized_overall"), - Set(EngagementsPrivate).asJava) - val DWELL_CDF_OVERALL = - new Continuous(name("recap.engagement.dwell_cdf_overall"), Set(EngagementsPrivate).asJava) - val DWELL_CDF = new Continuous(name("recap.engagement.dwell_cdf"), Set(EngagementsPrivate).asJava) - - val IS_DWELLED_1S = - new Binary(name("recap.engagement.is_dwelled_1s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_2S = - new Binary(name("recap.engagement.is_dwelled_2s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_3S = - new Binary(name("recap.engagement.is_dwelled_3s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_4S = - new Binary(name("recap.engagement.is_dwelled_4s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_5S = - new Binary(name("recap.engagement.is_dwelled_5s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_6S = - new Binary(name("recap.engagement.is_dwelled_6s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_7S = - new Binary(name("recap.engagement.is_dwelled_7s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_8S = - new Binary(name("recap.engagement.is_dwelled_8s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_9S = - new Binary(name("recap.engagement.is_dwelled_9s"), Set(EngagementsPrivate).asJava) - val IS_DWELLED_10S = - new Binary(name("recap.engagement.is_dwelled_10s"), Set(EngagementsPrivate).asJava) - - val IS_SKIPPED_1S = - new Binary(name("recap.engagement.is_skipped_1s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_2S = - new Binary(name("recap.engagement.is_skipped_2s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_3S = - new Binary(name("recap.engagement.is_skipped_3s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_4S = - new Binary(name("recap.engagement.is_skipped_4s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_5S = - new Binary(name("recap.engagement.is_skipped_5s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_6S = - new Binary(name("recap.engagement.is_skipped_6s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_7S = - new Binary(name("recap.engagement.is_skipped_7s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_8S = - new Binary(name("recap.engagement.is_skipped_8s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_9S = - new Binary(name("recap.engagement.is_skipped_9s"), Set(EngagementsPrivate).asJava) - val IS_SKIPPED_10S = - new Binary(name("recap.engagement.is_skipped_10s"), Set(EngagementsPrivate).asJava) - - val IS_IMPRESSED = - new Binary(name("recap.engagement.is_impressed"), Set(EngagementsPrivate).asJava) - val IS_FOLLOWED = - new Binary("recap.engagement.is_followed", Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_PROFILE_CLICKED = new Binary( - name("recap.engagement.is_profile_clicked"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_OPEN_LINKED = new Binary( - name("recap.engagement.is_open_linked"), - Set(EngagementsPrivate, LinksClickedOn).asJava) - val IS_PHOTO_EXPANDED = - new Binary(name("recap.engagement.is_photo_expanded"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_VIEWED = - new Binary(name("recap.engagement.is_video_viewed"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_START = - new Binary(name("recap.engagement.is_video_playback_start"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_25 = - new Binary(name("recap.engagement.is_video_playback_25"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_50 = - new Binary(name("recap.engagement.is_video_playback_50"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_75 = - new Binary(name("recap.engagement.is_video_playback_75"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_95 = - new Binary(name("recap.engagement.is_video_playback_95"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_PLAYBACK_COMPLETE = - new Binary(name("recap.engagement.is_video_playback_complete"), Set(EngagementsPrivate).asJava) - val IS_VIDEO_VIEWED_AND_PLAYBACK_50 = new Binary( - name("recap.engagement.is_video_viewed_and_playback_50"), - Set(EngagementsPrivate).asJava) - val IS_VIDEO_QUALITY_VIEWED = new Binary( - name("recap.engagement.is_video_quality_viewed"), - Set(EngagementsPrivate).asJava - ) - val IS_TWEET_SHARE_DM_CLICKED = - new Binary(name("recap.engagement.is_tweet_share_dm_clicked"), Set(EngagementsPrivate).asJava) - val IS_TWEET_SHARE_DM_SENT = - new Binary(name("recap.engagement.is_tweet_share_dm_sent"), Set(EngagementsPrivate).asJava) - val IS_BOOKMARKED = - new Binary(name("recap.engagement.is_bookmarked"), Set(EngagementsPrivate).asJava) - val IS_SHARED = - new Binary(name("recap.engagement.is_shared"), Set(EngagementsPrivate).asJava) - val IS_SHARE_MENU_CLICKED = - new Binary(name("recap.engagement.is_share_menu_clicked"), Set(EngagementsPrivate).asJava) - - // Negative engagements - val IS_DONT_LIKE = - new Binary(name("recap.engagement.is_dont_like"), Set(EngagementsPrivate).asJava) - val IS_BLOCK_CLICKED = new Binary( - name("recap.engagement.is_block_clicked"), - Set(TweetsClicked, EngagementsPrivate, EngagementsPublic).asJava) - val IS_BLOCK_DIALOG_BLOCKED = new Binary( - name("recap.engagement.is_block_dialog_blocked"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_MUTE_CLICKED = new Binary( - name("recap.engagement.is_mute_clicked"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_MUTE_DIALOG_MUTED = - new Binary(name("recap.engagement.is_mute_dialog_muted"), Set(EngagementsPrivate).asJava) - val IS_REPORT_TWEET_CLICKED = new Binary( - name("recap.engagement.is_report_tweet_clicked"), - Set(TweetsClicked, EngagementsPrivate).asJava) - val IS_NEGATIVE_FEEDBACK = - new Binary("recap.engagement.is_negative_feedback", Set(EngagementsPrivate).asJava) - val IS_NOT_ABOUT_TOPIC = - new Binary(name("recap.engagement.is_not_about_topic"), Set(EngagementsPrivate).asJava) - val IS_NOT_RECENT = - new Binary(name("recap.engagement.is_not_recent"), Set(EngagementsPrivate).asJava) - val IS_NOT_RELEVANT = - new Binary(name("recap.engagement.is_not_relevant"), Set(EngagementsPrivate).asJava) - val IS_SEE_FEWER = - new Binary(name("recap.engagement.is_see_fewer"), Set(EngagementsPrivate).asJava) - val IS_TOPIC_SPEC_NEG_ENGAGEMENT = - new Binary("recap.engagement.is_topic_spec_neg_engagement", Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC = - new Binary("recap.engagement.is_unfollow_topic", Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC_EXPLICIT_POSITIVE_LABEL = - new Binary( - "recap.engagement.is_unfollow_topic_explicit_positive_label", - Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC_IMPLICIT_POSITIVE_LABEL = - new Binary( - "recap.engagement.is_unfollow_topic_implicit_positive_label", - Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC_STRONG_EXPLICIT_NEGATIVE_LABEL = - new Binary( - "recap.engagement.is_unfollow_topic_strong_explicit_negative_label", - Set(EngagementsPrivate).asJava) - val IS_UNFOLLOW_TOPIC_EXPLICIT_NEGATIVE_LABEL = - new Binary( - "recap.engagement.is_unfollow_topic_explicit_negative_label", - Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN = - new Binary("recap.engagement.is_not_interested_in", Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN_EXPLICIT_POSITIVE_LABEL = - new Binary( - "recap.engagement.is_not_interested_in_explicit_positive_label", - Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN_EXPLICIT_NEGATIVE_LABEL = - new Binary( - "recap.engagement.is_not_interested_in_explicit_negative_label", - Set(EngagementsPrivate).asJava) - val IS_CARET_CLICKED = - new Binary(name("recap.engagement.is_caret_clicked"), Set(EngagementsPrivate).asJava) - val IS_FOLLOW_TOPIC = - new Binary("recap.engagement.is_follow_topic", Set(EngagementsPrivate).asJava) - val IS_NOT_INTERESTED_IN_TOPIC = - new Binary("recap.engagement.is_not_interested_in_topic", Set(EngagementsPrivate).asJava) - val IS_HOME_LATEST_VISITED = - new Binary(name("recap.engagement.is_home_latest_visited"), Set(EngagementsPrivate).asJava) - - // Relevance prompt tweet engagements - val IS_RELEVANCE_PROMPT_YES_CLICKED = new Binary( - name("recap.engagement.is_relevance_prompt_yes_clicked"), - Set(EngagementsPrivate).asJava) - val IS_RELEVANCE_PROMPT_NO_CLICKED = new Binary( - name("recap.engagement.is_relevance_prompt_no_clicked"), - Set(EngagementsPrivate).asJava) - val IS_RELEVANCE_PROMPT_IMPRESSED = new Binary( - name("recap.engagement.is_relevance_prompt_impressed"), - Set(EngagementsPrivate).asJava) - - // Reciprocal engagements for reply forward engagement - val IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_impressed_by_author"), - Set(EngagementsPrivate).asJava) - val IS_REPLIED_REPLY_FAVORITED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_favorited_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava) - val IS_REPLIED_REPLY_QUOTED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_quoted_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) - val IS_REPLIED_REPLY_REPLIED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_replied_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava) - val IS_REPLIED_REPLY_RETWEETED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_retweeted_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava) - val IS_REPLIED_REPLY_BLOCKED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_blocked_by_author"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_FOLLOWED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_followed_by_author"), - Set(EngagementsPrivate, EngagementsPublic, Follow).asJava) - val IS_REPLIED_REPLY_UNFOLLOWED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_unfollowed_by_author"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - val IS_REPLIED_REPLY_MUTED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_muted_by_author"), - Set(EngagementsPrivate).asJava) - val IS_REPLIED_REPLY_REPORTED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_reported_by_author"), - Set(EngagementsPrivate).asJava) - - // This derived label is the logical OR of REPLY_REPLIED, REPLY_FAVORITED, REPLY_RETWEETED - val IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Binary( - name("recap.engagement.is_replied_reply_engaged_by_author"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - - // Reciprocal engagements for fav forward engagement - val IS_FAVORITED_FAV_FAVORITED_BY_AUTHOR = new Binary( - name("recap.engagement.is_favorited_fav_favorited_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateLikes, PublicLikes).asJava - ) - val IS_FAVORITED_FAV_REPLIED_BY_AUTHOR = new Binary( - name("recap.engagement.is_favorited_fav_replied_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateReplies, PublicReplies).asJava - ) - val IS_FAVORITED_FAV_RETWEETED_BY_AUTHOR = new Binary( - name("recap.engagement.is_favorited_fav_retweeted_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava - ) - val IS_FAVORITED_FAV_FOLLOWED_BY_AUTHOR = new Binary( - name("recap.engagement.is_favorited_fav_followed_by_author"), - Set(EngagementsPrivate, EngagementsPublic, PrivateRetweets, PublicRetweets).asJava - ) - // This derived label is the logical OR of FAV_REPLIED, FAV_FAVORITED, FAV_RETWEETED, FAV_FOLLOWED - val IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Binary( - name("recap.engagement.is_favorited_fav_engaged_by_author"), - Set(EngagementsPrivate, EngagementsPublic).asJava) - - // define good profile click by considering following engagements (follow, fav, reply, retweet, etc.) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_FOLLOW = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_follow"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, Follow).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_FAV = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_fav"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateLikes, PublicLikes).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_REPLY = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_reply"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, PrivateReplies, PublicReplies).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_RETWEET = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_retweet"), - Set( - ProfilesViewed, - ProfilesClicked, - EngagementsPrivate, - PrivateRetweets, - PublicRetweets).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_CLICK = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_tweet_click"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, TweetsClicked).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_SHARE_DM_CLICK = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_share_dm_click"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of all binary features above - val IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_engaged"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate, EngagementsPublic).asJava) - - // define bad profile click by considering following engagements (user report, tweet report, mute, block, etc) at profile page - val IS_PROFILE_CLICKED_AND_PROFILE_USER_REPORT_CLICK = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_user_report_click"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_TWEET_REPORT_CLICK = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_tweet_report_click"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_MUTE = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_mute"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_PROFILE_CLICKED_AND_PROFILE_BLOCK = new Binary( - name("recap.engagement.is_profile_clicked_and_profile_block"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // This derived label is the union of bad profile click engagements and existing negative feedback - val IS_NEGATIVE_FEEDBACK_V2 = new Binary( - name("recap.engagement.is_negative_feedback_v2"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_STRONG_NEGATIVE_FEEDBACK = new Binary( - name("recap.engagement.is_strong_negative_feedback"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - val IS_WEAK_NEGATIVE_FEEDBACK = new Binary( - name("recap.engagement.is_weak_negative_feedback"), - Set(ProfilesViewed, ProfilesClicked, EngagementsPrivate).asJava) - // engagement for following user from any surface area - val IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Binary( - "recap.engagement.is_followed_from_any_surface_area", - Set(EngagementsPublic, EngagementsPrivate).asJava) - - // Reply downvote engagements - val IS_REPLY_DOWNVOTED = - new Binary(name("recap.engagement.is_reply_downvoted"), Set(EngagementsPrivate).asJava) - val IS_REPLY_DOWNVOTE_REMOVED = - new Binary(name("recap.engagement.is_reply_downvote_removed"), Set(EngagementsPrivate).asJava) - - // Other engagements - val IS_GOOD_OPEN_LINK = new Binary( - name("recap.engagement.is_good_open_link"), - Set(EngagementsPrivate, LinksClickedOn).asJava) - val IS_ENGAGED = new Binary( - name("recap.engagement.any"), - Set(EngagementsPrivate, EngagementsPublic).asJava - ) // Deprecated - to be removed shortly - val IS_EARLYBIRD_UNIFIED_ENGAGEMENT = new Binary( - name("recap.engagement.is_unified_engagement"), - Set(EngagementsPrivate, EngagementsPublic).asJava - ) // A subset of IS_ENGAGED specifically intended for use in earlybird models - - // features from ThriftTweetFeatures - val PREV_USER_TWEET_ENGAGEMENT = new Continuous( - name("recap.tweetfeature.prev_user_tweet_enagagement"), - Set(EngagementScore, EngagementsPrivate, EngagementsPublic).asJava) - val IS_SENSITIVE = new Binary(name("recap.tweetfeature.is_sensitive")) - val HAS_MULTIPLE_MEDIA = new Binary( - name("recap.tweetfeature.has_multiple_media"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val IS_AUTHOR_PROFILE_EGG = new Binary(name("recap.tweetfeature.is_author_profile_egg")) - val IS_AUTHOR_NEW = - new Binary(name("recap.tweetfeature.is_author_new"), Set(UserState, UserType).asJava) - val NUM_MENTIONS = new Continuous( - name("recap.tweetfeature.num_mentions"), - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val HAS_MENTION = new Binary(name("recap.tweetfeature.has_mention"), Set(UserVisibleFlag).asJava) - val NUM_HASHTAGS = new Continuous( - name("recap.tweetfeature.num_hashtags"), - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val HAS_HASHTAG = new Binary( - name("recap.tweetfeature.has_hashtag"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val LINK_LANGUAGE = new Continuous( - name("recap.tweetfeature.link_language"), - Set(ProvidedLanguage, InferredLanguage).asJava) - val IS_AUTHOR_NSFW = - new Binary(name("recap.tweetfeature.is_author_nsfw"), Set(UserSafetyLabels, UserType).asJava) - val IS_AUTHOR_SPAM = - new Binary(name("recap.tweetfeature.is_author_spam"), Set(UserSafetyLabels, UserType).asJava) - val IS_AUTHOR_BOT = - new Binary(name("recap.tweetfeature.is_author_bot"), Set(UserSafetyLabels, UserType).asJava) - val SIGNATURE = - new Discrete(name("recap.tweetfeature.signature"), Set(DigitalSignatureNonrepudiation).asJava) - val LANGUAGE = new Discrete( - name("recap.tweetfeature.language"), - Set(ProvidedLanguage, InferredLanguage).asJava) - val FROM_INACTIVE_USER = - new Binary(name("recap.tweetfeature.from_inactive_user"), Set(UserActiveFlag).asJava) - val PROBABLY_FROM_FOLLOWED_AUTHOR = new Binary(name("recap.v3.tweetfeature.probably_from_follow")) - val FROM_MUTUAL_FOLLOW = new Binary(name("recap.tweetfeature.from_mutual_follow")) - val USER_REP = new Continuous(name("recap.tweetfeature.user_rep")) - val FROM_VERIFIED_ACCOUNT = - new Binary(name("recap.tweetfeature.from_verified_account"), Set(UserVerifiedFlag).asJava) - val IS_BUSINESS_SCORE = new Continuous(name("recap.tweetfeature.is_business_score")) - val HAS_CONSUMER_VIDEO = new Binary( - name("recap.tweetfeature.has_consumer_video"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_PRO_VIDEO = new Binary( - name("recap.tweetfeature.has_pro_video"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VINE = new Binary( - name("recap.tweetfeature.has_vine"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_PERISCOPE = new Binary( - name("recap.tweetfeature.has_periscope"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NATIVE_VIDEO = new Binary( - name("recap.tweetfeature.has_native_video"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NATIVE_IMAGE = new Binary( - name("recap.tweetfeature.has_native_image"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_CARD = new Binary( - name("recap.tweetfeature.has_card"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_IMAGE = new Binary( - name("recap.tweetfeature.has_image"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_NEWS = new Binary( - name("recap.tweetfeature.has_news"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VIDEO = new Binary( - name("recap.tweetfeature.has_video"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_VISIBLE_LINK = new Binary( - name("recap.tweetfeature.has_visible_link"), - Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val LINK_COUNT = new Continuous( - name("recap.tweetfeature.link_count"), - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - val HAS_LINK = new Binary( - name("recap.tweetfeature.has_link"), - Set(UrlFoundFlag, PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val IS_OFFENSIVE = new Binary(name("recap.tweetfeature.is_offensive")) - val HAS_TREND = new Binary( - name("recap.tweetfeature.has_trend"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val HAS_MULTIPLE_HASHTAGS_OR_TRENDS = new Binary( - name("recap.tweetfeature.has_multiple_hashtag_or_trend"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val URL_DOMAINS = new SparseBinary( - name("recap.tweetfeature.url_domains"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val CONTAINS_MEDIA = new Binary( - name("recap.tweetfeature.contains_media"), - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val RETWEET_SEARCHER = new Binary(name("recap.tweetfeature.retweet_searcher")) - val REPLY_SEARCHER = new Binary(name("recap.tweetfeature.reply_searcher")) - val MENTION_SEARCHER = - new Binary(name("recap.tweetfeature.mention_searcher"), Set(UserVisibleFlag).asJava) - val REPLY_OTHER = - new Binary(name("recap.tweetfeature.reply_other"), Set(PublicReplies, PrivateReplies).asJava) - val RETWEET_OTHER = new Binary( - name("recap.tweetfeature.retweet_other"), - Set(PublicRetweets, PrivateRetweets).asJava) - val IS_REPLY = - new Binary(name("recap.tweetfeature.is_reply"), Set(PublicReplies, PrivateReplies).asJava) - val IS_RETWEET = - new Binary(name("recap.tweetfeature.is_retweet"), Set(PublicRetweets, PrivateRetweets).asJava) - val IS_EXTENDED_REPLY = new Binary( - name("recap.tweetfeature.is_extended_reply"), - Set(PublicReplies, PrivateReplies).asJava) - val MATCH_UI_LANG = new Binary( - name("recap.tweetfeature.match_ui_lang"), - Set(ProvidedLanguage, InferredLanguage).asJava) - val MATCH_SEARCHER_MAIN_LANG = new Binary( - name("recap.tweetfeature.match_searcher_main_lang"), - Set(ProvidedLanguage, InferredLanguage).asJava) - val MATCH_SEARCHER_LANGS = new Binary( - name("recap.tweetfeature.match_searcher_langs"), - Set(ProvidedLanguage, InferredLanguage).asJava) - val BIDIRECTIONAL_REPLY_COUNT = new Continuous( - name("recap.tweetfeature.bidirectional_reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val UNIDIRECTIONAL_REPLY_COUNT = new Continuous( - name("recap.tweetfeature.unidirectional_reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val BIDIRECTIONAL_RETWEET_COUNT = new Continuous( - name("recap.tweetfeature.bidirectional_retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val UNIDIRECTIONAL_RETWEET_COUNT = new Continuous( - name("recap.tweetfeature.unidirectional_retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val BIDIRECTIONAL_FAV_COUNT = new Continuous( - name("recap.tweetfeature.bidirectional_fav_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val UNIDIRECTIONAL_FAV_COUNT = new Continuous( - name("recap.tweetfeature.unidirectiona_fav_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val CONVERSATIONAL_COUNT = new Continuous( - name("recap.tweetfeature.conversational_count"), - Set(CountOfPrivateTweets, CountOfPublicTweets).asJava) - // tweet impressions on an embedded tweet - val EMBEDS_IMPRESSION_COUNT = new Continuous( - name("recap.tweetfeature.embeds_impression_count"), - Set(CountOfImpression).asJava) - // number of URLs that embed the tweet - val EMBEDS_URL_COUNT = new Continuous( - name("recap.tweetfeature.embeds_url_count"), - Set(CountOfPrivateTweetEntitiesAndMetadata, CountOfPublicTweetEntitiesAndMetadata).asJava) - // currently only counts views on Snappy and Amplify pro videos. Counts for other videos forthcoming - val VIDEO_VIEW_COUNT = new Continuous( - name("recap.tweetfeature.video_view_count"), - Set( - CountOfTweetEntitiesClicked, - CountOfPrivateTweetEntitiesAndMetadata, - CountOfPublicTweetEntitiesAndMetadata, - EngagementsPrivate, - EngagementsPublic).asJava - ) - val TWEET_COUNT_FROM_USER_IN_SNAPSHOT = new Continuous( - name("recap.tweetfeature.tweet_count_from_user_in_snapshot"), - Set(CountOfPrivateTweets, CountOfPublicTweets).asJava) - val NORMALIZED_PARUS_SCORE = - new Continuous("recap.tweetfeature.normalized_parus_score", Set(EngagementScore).asJava) - val PARUS_SCORE = new Continuous("recap.tweetfeature.parus_score", Set(EngagementScore).asJava) - val REAL_GRAPH_WEIGHT = - new Continuous("recap.tweetfeature.real_graph_weight", Set(UsersRealGraphScore).asJava) - val SARUS_GRAPH_WEIGHT = new Continuous("recap.tweetfeature.sarus_graph_weight") - val TOPIC_SIM_SEARCHER_INTERSTED_IN_AUTHOR_KNOWN_FOR = new Continuous( - "recap.tweetfeature.topic_sim_searcher_interested_in_author_known_for") - val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_INTERESTED_IN = new Continuous( - "recap.tweetfeature.topic_sim_searcher_author_both_interested_in") - val TOPIC_SIM_SEARCHER_AUTHOR_BOTH_KNOWN_FOR = new Continuous( - "recap.tweetfeature.topic_sim_searcher_author_both_known_for") - val TOPIC_SIM_SEARCHER_INTERESTED_IN_TWEET = new Continuous( - "recap.tweetfeature.topic_sim_searcher_interested_in_tweet") - val IS_RETWEETER_PROFILE_EGG = - new Binary(name("recap.v2.tweetfeature.is_retweeter_profile_egg"), Set(UserType).asJava) - val IS_RETWEETER_NEW = - new Binary(name("recap.v2.tweetfeature.is_retweeter_new"), Set(UserType, UserState).asJava) - val IS_RETWEETER_BOT = - new Binary( - name("recap.v2.tweetfeature.is_retweeter_bot"), - Set(UserType, UserSafetyLabels).asJava) - val IS_RETWEETER_NSFW = - new Binary( - name("recap.v2.tweetfeature.is_retweeter_nsfw"), - Set(UserType, UserSafetyLabels).asJava) - val IS_RETWEETER_SPAM = - new Binary( - name("recap.v2.tweetfeature.is_retweeter_spam"), - Set(UserType, UserSafetyLabels).asJava) - val RETWEET_OF_MUTUAL_FOLLOW = new Binary( - name("recap.v2.tweetfeature.retweet_of_mutual_follow"), - Set(PublicRetweets, PrivateRetweets).asJava) - val SOURCE_AUTHOR_REP = new Continuous(name("recap.v2.tweetfeature.source_author_rep")) - val IS_RETWEET_OF_REPLY = new Binary( - name("recap.v2.tweetfeature.is_retweet_of_reply"), - Set(PublicRetweets, PrivateRetweets).asJava) - val RETWEET_DIRECTED_AT_USER_IN_FIRST_DEGREE = new Binary( - name("recap.v2.tweetfeature.is_retweet_directed_at_user_in_first_degree"), - Set(PublicRetweets, PrivateRetweets, Follow).asJava) - val MENTIONED_SCREEN_NAMES = new SparseBinary( - "entities.users.mentioned_screen_names", - Set(DisplayName, UserVisibleFlag).asJava) - val MENTIONED_SCREEN_NAME = new Text( - "entities.users.mentioned_screen_names.member", - Set(DisplayName, UserVisibleFlag).asJava) - val HASHTAGS = new SparseBinary( - "entities.hashtags", - Set(PublicTweetEntitiesAndMetadata, PrivateTweetEntitiesAndMetadata).asJava) - val URL_SLUGS = new SparseBinary(name("recap.linkfeature.url_slugs"), Set(UrlFoundFlag).asJava) - - // features from ThriftSearchResultMetadata - val REPLY_COUNT = new Continuous( - name("recap.searchfeature.reply_count"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - val RETWEET_COUNT = new Continuous( - name("recap.searchfeature.retweet_count"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val FAV_COUNT = new Continuous( - name("recap.searchfeature.fav_count"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val BLENDER_SCORE = new Continuous(name("recap.searchfeature.blender_score")) - val TEXT_SCORE = new Continuous(name("recap.searchfeature.text_score")) - - // features related to content source - val SOURCE_TYPE = new Discrete(name("recap.source.type")) - - // features from addressbook - // the author is in the user's email addressbook - val USER_TO_AUTHOR_EMAIL_REACHABLE = - new Binary(name("recap.addressbook.user_to_author_email_reachable"), Set(AddressBook).asJava) - // the author is in the user's phone addressbook - val USER_TO_AUTHOR_PHONE_REACHABLE = - new Binary(name("recap.addressbook.user_to_author_phone_reachable"), Set(AddressBook).asJava) - // the user is in the author's email addressbook - val AUTHOR_TO_USER_EMAIL_REACHABLE = - new Binary(name("recap.addressbook.author_to_user_email_reachable"), Set(AddressBook).asJava) - // the user is in the user's phone addressbook - val AUTHOR_TO_USER_PHONE_REACHABLE = - new Binary(name("recap.addressbook.author_to_user_phone_reachable"), Set(AddressBook).asJava) - - // predicted engagement (these features are used by prediction service to return the predicted engagement probability) - // these should match the names in engagement_to_score_feature_mapping - val PREDICTED_IS_FAVORITED = - new Continuous(name("recap.engagement_predicted.is_favorited"), Set(EngagementScore).asJava) - val PREDICTED_IS_RETWEETED = - new Continuous(name("recap.engagement_predicted.is_retweeted"), Set(EngagementScore).asJava) - val PREDICTED_IS_QUOTED = - new Continuous(name("recap.engagement_predicted.is_quoted"), Set(EngagementScore).asJava) - val PREDICTED_IS_REPLIED = - new Continuous(name("recap.engagement_predicted.is_replied"), Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_OPEN_LINK = new Continuous( - name("recap.engagement_predicted.is_good_open_link"), - Set(EngagementScore).asJava) - val PREDICTED_IS_PROFILE_CLICKED = new Continuous( - name("recap.engagement_predicted.is_profile_clicked"), - Set(EngagementScore).asJava) - val PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED = new Continuous( - name("recap.engagement_predicted.is_profile_clicked_and_profile_engaged"), - Set(EngagementScore).asJava) - val PREDICTED_IS_CLICKED = - new Continuous(name("recap.engagement_predicted.is_clicked"), Set(EngagementScore).asJava) - val PREDICTED_IS_PHOTO_EXPANDED = new Continuous( - name("recap.engagement_predicted.is_photo_expanded"), - Set(EngagementScore).asJava) - val PREDICTED_IS_DONT_LIKE = - new Continuous(name("recap.engagement_predicted.is_dont_like"), Set(EngagementScore).asJava) - val PREDICTED_IS_VIDEO_PLAYBACK_50 = new Continuous( - name("recap.engagement_predicted.is_video_playback_50"), - Set(EngagementScore).asJava) - val PREDICTED_IS_VIDEO_QUALITY_VIEWED = new Continuous( - name("recap.engagement_predicted.is_video_quality_viewed"), - Set(EngagementScore).asJava) - val PREDICTED_IS_BOOKMARKED = - new Continuous(name("recap.engagement_predicted.is_bookmarked"), Set(EngagementScore).asJava) - val PREDICTED_IS_SHARED = - new Continuous(name("recap.engagement_predicted.is_shared"), Set(EngagementScore).asJava) - val PREDICTED_IS_SHARE_MENU_CLICKED = - new Continuous( - name("recap.engagement_predicted.is_share_menu_clicked"), - Set(EngagementScore).asJava) - val PREDICTED_IS_PROFILE_DWELLED_20_SEC = new Continuous( - name("recap.engagement_predicted.is_profile_dwelled_20_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_5_SEC = new Continuous( - name("recap.engagement_predicted.is_fullscreen_video_dwelled_5_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_10_SEC = new Continuous( - name("recap.engagement_predicted.is_fullscreen_video_dwelled_10_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_20_SEC = new Continuous( - name("recap.engagement_predicted.is_fullscreen_video_dwelled_20_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FULLSCREEN_VIDEO_DWELLED_30_SEC = new Continuous( - name("recap.engagement_predicted.is_fullscreen_video_dwelled_30_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_UNIFIED_ENGAGEMENT = new Continuous( - name("recap.engagement_predicted.is_unified_engagement"), - Set(EngagementScore).asJava) - val PREDICTED_IS_COMPOSE_TRIGGERED = new Continuous( - name("recap.engagement_predicted.is_compose_triggered"), - Set(EngagementScore).asJava) - val PREDICTED_IS_REPLIED_REPLY_IMPRESSED_BY_AUTHOR = new Continuous( - name("recap.engagement_predicted.is_replied_reply_impressed_by_author"), - Set(EngagementScore).asJava) - val PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR = new Continuous( - name("recap.engagement_predicted.is_replied_reply_engaged_by_author"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_CLICKED_V1 = new Continuous( - name("recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_CLICKED_V2 = new Continuous( - name("recap.engagement_predicted.is_good_clicked_convo_desc_v2"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_8_SEC = new Continuous( - name("recap.engagement_predicted.is_tweet_detail_dwelled_8_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_15_SEC = new Continuous( - name("recap.engagement_predicted.is_tweet_detail_dwelled_15_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_25_SEC = new Continuous( - name("recap.engagement_predicted.is_tweet_detail_dwelled_25_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_TWEET_DETAIL_DWELLED_30_SEC = new Continuous( - name("recap.engagement_predicted.is_tweet_detail_dwelled_30_sec"), - Set(EngagementScore).asJava) - val PREDICTED_IS_FAVORITED_FAV_ENGAGED_BY_AUTHOR = new Continuous( - name("recap.engagement_predicted.is_favorited_fav_engaged_by_author"), - Set(EngagementScore).asJava) - val PREDICTED_IS_GOOD_CLICKED_WITH_DWELL_SUM_GTE_60S = new Continuous( - name( - "recap.engagement_predicted.is_good_clicked_convo_desc_favorited_or_replied_or_dwell_sum_gte_60_secs"), - Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_IN_BOUNDS_V1 = new Continuous( - name("recap.engagement_predicted.is_dwelled_in_bounds_v1"), - Set(EngagementScore).asJava) - val PREDICTED_DWELL_NORMALIZED_OVERALL = new Continuous( - name("recap.engagement_predicted.dwell_normalized_overall"), - Set(EngagementScore).asJava) - val PREDICTED_DWELL_CDF = - new Continuous(name("recap.engagement_predicted.dwell_cdf"), Set(EngagementScore).asJava) - val PREDICTED_DWELL_CDF_OVERALL = new Continuous( - name("recap.engagement_predicted.dwell_cdf_overall"), - Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED = - new Continuous(name("recap.engagement_predicted.is_dwelled"), Set(EngagementScore).asJava) - - val PREDICTED_IS_DWELLED_1S = - new Continuous(name("recap.engagement_predicted.is_dwelled_1s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_2S = - new Continuous(name("recap.engagement_predicted.is_dwelled_2s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_3S = - new Continuous(name("recap.engagement_predicted.is_dwelled_3s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_4S = - new Continuous(name("recap.engagement_predicted.is_dwelled_4s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_5S = - new Continuous(name("recap.engagement_predicted.is_dwelled_5s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_6S = - new Continuous(name("recap.engagement_predicted.is_dwelled_6s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_7S = - new Continuous(name("recap.engagement_predicted.is_dwelled_7s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_8S = - new Continuous(name("recap.engagement_predicted.is_dwelled_8s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_9S = - new Continuous(name("recap.engagement_predicted.is_dwelled_9s"), Set(EngagementScore).asJava) - val PREDICTED_IS_DWELLED_10S = - new Continuous(name("recap.engagement_predicted.is_dwelled_10s"), Set(EngagementScore).asJava) - - val PREDICTED_IS_SKIPPED_1S = - new Continuous(name("recap.engagement_predicted.is_skipped_1s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_2S = - new Continuous(name("recap.engagement_predicted.is_skipped_2s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_3S = - new Continuous(name("recap.engagement_predicted.is_skipped_3s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_4S = - new Continuous(name("recap.engagement_predicted.is_skipped_4s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_5S = - new Continuous(name("recap.engagement_predicted.is_skipped_5s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_6S = - new Continuous(name("recap.engagement_predicted.is_skipped_6s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_7S = - new Continuous(name("recap.engagement_predicted.is_skipped_7s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_8S = - new Continuous(name("recap.engagement_predicted.is_skipped_8s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_9S = - new Continuous(name("recap.engagement_predicted.is_skipped_9s"), Set(EngagementScore).asJava) - val PREDICTED_IS_SKIPPED_10S = - new Continuous(name("recap.engagement_predicted.is_skipped_10s"), Set(EngagementScore).asJava) - - val PREDICTED_IS_HOME_LATEST_VISITED = new Continuous( - name("recap.engagement_predicted.is_home_latest_visited"), - Set(EngagementScore).asJava) - val PREDICTED_IS_NEGATIVE_FEEDBACK = - new Continuous( - name("recap.engagement_predicted.is_negative_feedback"), - Set(EngagementScore).asJava) - val PREDICTED_IS_NEGATIVE_FEEDBACK_V2 = - new Continuous( - name("recap.engagement_predicted.is_negative_feedback_v2"), - Set(EngagementScore).asJava) - val PREDICTED_IS_WEAK_NEGATIVE_FEEDBACK = - new Continuous( - name("recap.engagement_predicted.is_weak_negative_feedback"), - Set(EngagementScore).asJava) - val PREDICTED_IS_STRONG_NEGATIVE_FEEDBACK = - new Continuous( - name("recap.engagement_predicted.is_strong_negative_feedback"), - Set(EngagementScore).asJava) - val PREDICTED_IS_REPORT_TWEET_CLICKED = - new Continuous( - name("recap.engagement_predicted.is_report_tweet_clicked"), - Set(EngagementScore).asJava) - val PREDICTED_IS_UNFOLLOW_TOPIC = - new Continuous( - name("recap.engagement_predicted.is_unfollow_topic"), - Set(EngagementScore).asJava) - val PREDICTED_IS_RELEVANCE_PROMPT_YES_CLICKED = new Continuous( - name("recap.engagement_predicted.is_relevance_prompt_yes_clicked"), - Set(EngagementScore).asJava) - - // engagement for following user from any surface area - val PREDICTED_IS_FOLLOWED_FROM_ANY_SURFACE_AREA = new Continuous( - "recap.engagement_predicted.is_followed_from_any_surface_area", - Set(EngagementScore).asJava) - - - // These are global engagement counts for the Tweets. - val FAV_COUNT_V2 = new Continuous( - name("recap.earlybird.fav_count_v2"), - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava) - val RETWEET_COUNT_V2 = new Continuous( - name("recap.earlybird.retweet_count_v2"), - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava) - val REPLY_COUNT_V2 = new Continuous( - name("recap.earlybird.reply_count_v2"), - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava) - - val HAS_US_POLITICAL_ANNOTATION = new Binary( - name("recap.has_us_political_annotation"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ALL_GROUPS_ANNOTATION = new Binary( - name("recap.has_us_political_all_groups_annotation"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL = new Binary( - name("recap.has_us_political_annotation_high_recall"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V2 = new Binary( - name("recap.has_us_political_annotation_high_recall_v2"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V0 = new Binary( - name("recap.has_us_political_annotation_high_precision_v0"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_BALANCED_PRECISION_RECALL_V0 = new Binary( - name("recap.has_us_political_annotation_balanced_precision_recall_v0"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_HIGH_RECALL_V3 = new Binary( - name("recap.has_us_political_annotation_high_recall_v3"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_HIGH_PRECISION_V3 = new Binary( - name("recap.has_us_political_annotation_high_precision_v3"), - Set(SemanticcoreClassification).asJava - ) - - val HAS_US_POLITICAL_ANNOTATION_BALANCED_V3 = new Binary( - name("recap.has_us_political_annotation_balanced_v3"), - Set(SemanticcoreClassification).asJava - ) - -} diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.docx b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.docx new file mode 100644 index 000000000..993fcf1c0 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala b/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala deleted file mode 100644 index edf152cda..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/recap/RecapFeaturesUtils.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.timelines.prediction.features.recap - -object RecapFeaturesUtils { - // This needs to be updated if an engagement model is added or removed from prediction service. - val scoreFeatureIdsMap: Map[String, Long] = Map( - RecapFeatures.IS_FAVORITED.getFeatureName -> RecapFeatures.PREDICTED_IS_FAVORITED.getFeatureId, - RecapFeatures.IS_REPLIED.getFeatureName -> RecapFeatures.PREDICTED_IS_REPLIED.getFeatureId, - RecapFeatures.IS_RETWEETED.getFeatureName -> RecapFeatures.PREDICTED_IS_RETWEETED.getFeatureId, - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1.getFeatureName -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V1.getFeatureId, - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2.getFeatureName -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V2.getFeatureId, -// RecapFeatures.IS_NEGATIVE_FEEDBACK_V2.getFeatureName -> RecapFeatures.PREDICTED_IS_NEGATIVE_FEEDBACK_V2.getFeatureId, - RecapFeatures.IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureName -> RecapFeatures.PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId, - RecapFeatures.IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureName -> RecapFeatures.PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId - ) - - // This needs to be updated if an engagement model is added or removed from prediction service. - val labelFeatureIdToScoreFeatureIdsMap: Map[Long, Long] = Map( - RecapFeatures.IS_FAVORITED.getFeatureId -> RecapFeatures.PREDICTED_IS_FAVORITED.getFeatureId, - RecapFeatures.IS_REPLIED.getFeatureId -> RecapFeatures.PREDICTED_IS_REPLIED.getFeatureId, - RecapFeatures.IS_RETWEETED.getFeatureId -> RecapFeatures.PREDICTED_IS_RETWEETED.getFeatureId, - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V1.getFeatureId -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V1.getFeatureId, - RecapFeatures.IS_GOOD_CLICKED_CONVO_DESC_V2.getFeatureId -> RecapFeatures.PREDICTED_IS_GOOD_CLICKED_V2.getFeatureId, - // RecapFeatures.IS_NEGATIVE_FEEDBACK_V2.getFeatureName -> RecapFeatures.PREDICTED_IS_NEGATIVE_FEEDBACK_V2.getFeatureId, - RecapFeatures.IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId -> RecapFeatures.PREDICTED_IS_PROFILE_CLICKED_AND_PROFILE_ENGAGED.getFeatureId, - RecapFeatures.IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId -> RecapFeatures.PREDICTED_IS_REPLIED_REPLY_ENGAGED_BY_AUTHOR.getFeatureId - ) - - val labelFeatureNames: Seq[String] = scoreFeatureIdsMap.keys.toSeq -} diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD b/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD deleted file mode 100644 index 6fc497bf3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD.docx new file mode 100644 index 000000000..07d639e5e Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/request_context/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.docx new file mode 100644 index 000000000..a7fad92aa Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala deleted file mode 100644 index a7dd28852..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/request_context/RequestContextFeatures.scala +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.timelines.prediction.features.request_context - -import com.twitter.ml.api.FeatureContext -import com.twitter.ml.api.Feature._ -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import scala.collection.JavaConverters._ - -object RequestContextFeatures { - val COUNTRY_CODE = - new Text("request_context.country_code", Set(PrivateCountryOrRegion, InferredCountry).asJava) - val LANGUAGE_CODE = new Text( - "request_context.language_code", - Set(GeneralSettings, ProvidedLanguage, InferredLanguage).asJava) - val REQUEST_PROVENANCE = new Text("request_context.request_provenance", Set(AppUsage).asJava) - val DISPLAY_WIDTH = new Continuous("request_context.display_width", Set(OtherDeviceInfo).asJava) - val DISPLAY_HEIGHT = new Continuous("request_context.display_height", Set(OtherDeviceInfo).asJava) - val DISPLAY_DPI = new Continuous("request_context.display_dpi", Set(OtherDeviceInfo).asJava) - - // the following features are not Continuous Features because for e.g. continuity between - // 23 and 0 hours cannot be handled that way. instead, we will treat each slice of hours/days - // independently, like a set of sparse binary features. - val TIMESTAMP_GMT_HOUR = - new Discrete("request_context.timestamp_gmt_hour", Set(PrivateTimestamp).asJava) - val TIMESTAMP_GMT_DOW = - new Discrete("request_context.timestamp_gmt_dow", Set(PrivateTimestamp).asJava) - - val IS_GET_INITIAL = new Binary("request_context.is_get_initial") - val IS_GET_MIDDLE = new Binary("request_context.is_get_middle") - val IS_GET_NEWER = new Binary("request_context.is_get_newer") - val IS_GET_OLDER = new Binary("request_context.is_get_older") - - // the following features are not Binary Features because the source field is Option[Boolean], - // and we want to distinguish Some(false) from None. None will be converted to -1. - val IS_POLLING = new Discrete("request_context.is_polling") - val IS_SESSION_START = new Discrete("request_context.is_session_start") - - // Helps distinguish requests from "home" vs "home_latest" (reverse chron home view). - val TIMELINE_KIND = new Text("request_context.timeline_kind") - - val featureContext = new FeatureContext( - COUNTRY_CODE, - LANGUAGE_CODE, - REQUEST_PROVENANCE, - DISPLAY_WIDTH, - DISPLAY_HEIGHT, - DISPLAY_DPI, - TIMESTAMP_GMT_HOUR, - TIMESTAMP_GMT_DOW, - IS_GET_INITIAL, - IS_GET_MIDDLE, - IS_GET_NEWER, - IS_GET_OLDER, - IS_POLLING, - IS_SESSION_START, - TIMELINE_KIND - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD b/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD deleted file mode 100644 index ec194353b..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/timelines/suggests/common:record-scala", - "timelines/data_processing/ml_util/aggregation_framework:common_types", - "timelines/data_processing/ml_util/aggregation_framework/conversion:for-timelines", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD.docx new file mode 100644 index 000000000..3b4f7db56 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/simcluster/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.docx new file mode 100644 index 000000000..6032098fb Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala deleted file mode 100644 index 4d2b4db81..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterFeatures.scala +++ /dev/null @@ -1,61 +0,0 @@ -package com.twitter.timelines.prediction.features.simcluster - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.ml.api.Feature._ -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup -import scala.collection.JavaConverters._ - -class SimclusterFeaturesHelper(statsReceiver: StatsReceiver) { - import SimclusterFeatures._ - - private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName) - private[this] val invalidSimclusterModelVersion = scopedStatsReceiver - .counter("invalidSimclusterModelVersion") - - def fromUserClusterInterestsPair( - userInterestClustersPair: (Long, ClustersUserIsInterestedIn) - ): Option[SimclusterFeatures] = { - val (userId, userInterestClusters) = userInterestClustersPair - if (userInterestClusters.knownForModelVersion == SIMCLUSTER_MODEL_VERSION) { - val userInterestClustersFavScores = for { - (clusterId, scores) <- userInterestClusters.clusterIdToScores - favScore <- scores.favScore - } yield (clusterId.toString, favScore) - Some( - SimclusterFeatures( - userId, - userInterestClusters.knownForModelVersion, - userInterestClustersFavScores.toMap - ) - ) - } else { - // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct. - invalidSimclusterModelVersion.incr - None - } - } -} - -object SimclusterFeatures { - // Check http://go/simclustersv2runbook for production versions - // Our models are trained for this specific model version only. - val SIMCLUSTER_MODEL_VERSION = "20M_145K_dec11" - val prefix = s"simcluster.v2.$SIMCLUSTER_MODEL_VERSION" - - val SIMCLUSTER_USER_INTEREST_CLUSTER_SCORES = new SparseContinuous( - s"$prefix.user_interest_cluster_scores", - Set(EngagementScore, InferredInterests).asJava - ) - val SIMCLUSTER_USER_INTEREST_CLUSTER_IDS = new SparseBinary( - s"$prefix.user_interest_cluster_ids", - Set(InferredInterests).asJava - ) - val SIMCLUSTER_MODEL_VERSION_METADATA = new Text("meta.simcluster_version") -} - -case class SimclusterFeatures( - userId: Long, - modelVersion: String, - interestClusterScoresMap: Map[String, Double]) diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.docx new file mode 100644 index 000000000..2e9c6a434 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala deleted file mode 100644 index 355a89c22..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclusterTweetFeatures.scala +++ /dev/null @@ -1,150 +0,0 @@ -package com.twitter.timelines.prediction.features.simcluster - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.ml.api.{Feature, FeatureContext} -import com.twitter.ml.api.Feature.{Continuous, SparseBinary, SparseContinuous} -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._ -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup -import com.twitter.timelines.suggests.common.record.thriftscala.SuggestionRecord -import scala.collection.JavaConverters._ - -class SimclusterTweetFeatures(statsReceiver: StatsReceiver) extends CombineCountsBase { - import SimclusterTweetFeatures._ - - private[this] val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName) - private[this] val invalidSimclusterModelVersion = scopedStatsReceiver - .counter("invalidSimclusterModelVersion") - private[this] val getFeaturesFromOverlappingSimclusterIdsCount = scopedStatsReceiver - .counter("getFeaturesFromOverlappingSimclusterIdsCount") - private[this] val emptySimclusterMaps = scopedStatsReceiver - .counter("emptySimclusterMaps") - private[this] val nonOverlappingSimclusterMaps = scopedStatsReceiver - .counter("nonOverlappingSimclusterMaps") - - // Parameters required by CombineCountsBase - override val topK: Int = 5 - override val hardLimit: Option[Int] = None - override val precomputedCountFeatures: Seq[Feature[_]] = Seq( - SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE, - SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE - ) - - private def getFeaturesFromOverlappingSimclusterIds( - userSimclustersInterestedInMap: Map[String, Double], - tweetSimclustersTopKMap: Map[String, Double] - ): Map[Feature[_], List[Double]] = { - getFeaturesFromOverlappingSimclusterIdsCount.incr - if (userSimclustersInterestedInMap.isEmpty || tweetSimclustersTopKMap.isEmpty) { - emptySimclusterMaps.incr - Map.empty - } else { - val overlappingSimclusterIds = - userSimclustersInterestedInMap.keySet intersect tweetSimclustersTopKMap.keySet - if (overlappingSimclusterIds.isEmpty) { - nonOverlappingSimclusterMaps.incr - Map.empty - } else { - val (combinedScores, tweetScores) = overlappingSimclusterIds.map { id => - val tweetScore = tweetSimclustersTopKMap.getOrElse(id, 0.0) - val combinedScore = userSimclustersInterestedInMap.getOrElse(id, 0.0) * tweetScore - (combinedScore, tweetScore) - }.unzip - Map( - SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE -> combinedScores.toList, - SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE -> tweetScores.toList - ) - } - } - } - - def getCountFeaturesValuesMap( - suggestionRecord: SuggestionRecord, - simclustersTweetTopKMap: Map[String, Double] - ): Map[Feature[_], List[Double]] = { - val userSimclustersInterestedInMap = formatUserSimclustersInterestedIn(suggestionRecord) - - val tweetSimclustersTopKMap = formatTweetSimclustersTopK(simclustersTweetTopKMap) - - getFeaturesFromOverlappingSimclusterIds(userSimclustersInterestedInMap, tweetSimclustersTopKMap) - } - - def filterByModelVersion( - simclustersMapOpt: Option[Map[String, Double]] - ): Option[Map[String, Double]] = { - simclustersMapOpt.flatMap { simclustersMap => - val filteredSimclustersMap = simclustersMap.filter { - case (clusterId, score) => - // The clusterId format is ModelVersion.IntegerClusterId.ScoreType as specified at - // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters - clusterId.contains(SimclusterFeatures.SIMCLUSTER_MODEL_VERSION) - } - - // The assumption is that the simclustersMap will contain clusterIds with the same modelVersion. - // We maintain this counter to make sure that the hardcoded modelVersion we are using is correct. - if (simclustersMap.size > filteredSimclustersMap.size) { - invalidSimclusterModelVersion.incr - } - - if (filteredSimclustersMap.nonEmpty) Some(filteredSimclustersMap) else None - } - } - - val allFeatures: Seq[Feature[_]] = outputFeaturesPostMerge.toSeq ++ Seq( - SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS, - SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES) - val featureContext = new FeatureContext(allFeatures: _*) -} - -object SimclusterTweetFeatures { - val SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS = new SparseBinary( - s"${SimclusterFeatures.prefix}.tweet_topk_cluster_ids", - Set(InferredInterests).asJava - ) - val SIMCLUSTER_TWEET_TOPK_CLUSTER_SCORES = new SparseContinuous( - s"${SimclusterFeatures.prefix}.tweet_topk_cluster_scores", - Set(EngagementScore, InferredInterests).asJava - ) - - val SIMCLUSTER_TWEET_TOPK_CLUSTER_ID = - TypedAggregateGroup.sparseFeature(SIMCLUSTER_TWEET_TOPK_CLUSTER_IDS) - - val SIMCLUSTER_TWEET_TOPK_SORT_BY_TWEET_SCORE = new Continuous( - s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_tweet_score", - Set(EngagementScore, InferredInterests).asJava - ) - - val SIMCLUSTER_TWEET_TOPK_SORT_BY_COMBINED_SCORE = new Continuous( - s"${SimclusterFeatures.prefix}.tweet_topk_sort_by_combined_score", - Set(EngagementScore, InferredInterests).asJava - ) - - def formatUserSimclustersInterestedIn(suggestionRecord: SuggestionRecord): Map[String, Double] = { - suggestionRecord.userSimclustersInterestedIn - .map { clustersUserIsInterestedIn => - if (clustersUserIsInterestedIn.knownForModelVersion == SimclusterFeatures.SIMCLUSTER_MODEL_VERSION) { - clustersUserIsInterestedIn.clusterIdToScores.collect { - case (clusterId, scores) if scores.favScore.isDefined => - (clusterId.toString, scores.favScore.get) - } - } else Map.empty[String, Double] - }.getOrElse(Map.empty[String, Double]) - .toMap - } - - def formatTweetSimclustersTopK( - simclustersTweetTopKMap: Map[String, Double] - ): Map[String, Double] = { - simclustersTweetTopKMap.collect { - case (clusterId, score) => - // The clusterId format is as specified at - // com.twitter.ml.featurestore.catalog.features.recommendations.SimClustersV2TweetTopClusters - // and we want to extract the IntegerClusterId. - // The split function takes a regex; therefore, we need to escape . and we also need to escape - // \ since they are both special characters. Hence, the double \\. - val clusterIdSplit = clusterId.split("\\.") - val integerClusterId = clusterIdSplit(1) // The IntegerClusterId is at position 1. - (integerClusterId, score) - } - } -} diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.docx new file mode 100644 index 000000000..6ab4aaa45 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala deleted file mode 100644 index 0629636c0..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/simcluster/SimclustersScoresFeatures.scala +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.timelines.prediction.features.simcluster - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType.SemanticcoreClassification -import com.twitter.ml.api.Feature -import com.twitter.ml.api.Feature.Continuous -import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion.CombineCountsBase -import scala.collection.JavaConverters._ - -object SimclustersScoresFeatures extends CombineCountsBase { - override def topK: Int = 2 - - override def hardLimit: Option[Int] = Some(20) - - val prefix = s"recommendations.sim_clusters_scores" - val TOPIC_CONSUMER_TWEET_EMBEDDING_Cs = new Continuous( - s"$prefix.localized_topic_consumer_tweet_embedding_cosine_similarity", - Set(SemanticcoreClassification).asJava) - val TOPIC_PRODUCER_TWEET_EMBEDDING_Cs = new Continuous( - s"$prefix.topic_producer_tweet_embedding_cosine_similarity", - Set(SemanticcoreClassification).asJava) - val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM = new Continuous( - s"$prefix.user_interested_in_localized_topic_consumer_embedding_cosine_similarity", - Set(SemanticcoreClassification).asJava) - val USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT = new Continuous( - s"$prefix.user_interested_in_localized_topic_consumer_embedding_dot_product", - Set(SemanticcoreClassification).asJava) - val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM = new Continuous( - s"$prefix.user_interested_in_localized_topic_producer_embedding_cosine_similarity", - Set(SemanticcoreClassification).asJava) - val USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT = new Continuous( - s"$prefix.user_interested_in_localized_topic_producer_embedding_dot_product", - Set(SemanticcoreClassification).asJava) - - override def precomputedCountFeatures: Seq[Feature[_]] = - Seq( - TOPIC_CONSUMER_TWEET_EMBEDDING_Cs, - TOPIC_PRODUCER_TWEET_EMBEDDING_Cs, - USER_TOPIC_CONSUMER_TWEET_EMBEDDING_COSINE_SIM, - USER_TOPIC_CONSUMER_TWEET_EMBEDDING_DOT_PRODUCT, - USER_TOPIC_PRODUCER_TWEET_EMBEDDING_COSINE_SIM, - USER_TOPIC_PRODUCER_TWEET_EMBEDDING_DOT_PRODUCT - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD b/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD deleted file mode 100644 index 0c00b1e5b..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -scala_library( - name = "socialproof_features", - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/ibm/icu:icu4j", - "src/java/com/twitter/ml/api:api-base", - "src/scala/com/twitter/ml/api/util", - "src/scala/com/twitter/timelines/util", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/timelines/socialproof:socialproof-scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD.docx new file mode 100644 index 000000000..47e5d42e0 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/socialproof/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.docx new file mode 100644 index 000000000..ea3ebf53f Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala deleted file mode 100644 index 163ba7efa..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/socialproof/SocialProofFeatures.scala +++ /dev/null @@ -1,172 +0,0 @@ -package com.twitter.timelines.prediction.features.socialproof - -import com.twitter.ml.api.DataRecord -import com.twitter.ml.api.Feature.Binary -import com.twitter.ml.api.Feature.Continuous -import com.twitter.ml.api.Feature.SparseBinary -import com.twitter.ml.api.util.FDsl._ -import com.twitter.timelines.prediction.features.socialproof.SocialProofDataRecordFeatures._ -import com.twitter.timelines.socialproof.thriftscala.SocialProof -import com.twitter.timelines.socialproof.v1.thriftscala.SocialProofType -import com.twitter.timelines.util.CommonTypes.UserId -import scala.collection.JavaConverters._ -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ - -abstract class SocialProofUserGroundTruth(userIds: Seq[UserId], count: Int) { - require( - count >= userIds.size, - "count must be equal to or greater than the number of entries in userIds" - ) - // Using Double as the return type to make it more convenient for these values to be used as - // ML feature values. - val displayedUserCount: Double = userIds.size.toDouble - val undisplayedUserCount: Double = count - userIds.size.toDouble - val totalCount: Double = count.toDouble - - def featureDisplayedUsers: SparseBinary - def featureDisplayedUserCount: Continuous - def featureUndisplayedUserCount: Continuous - def featureTotalUserCount: Continuous - - def setFeatures(rec: DataRecord): Unit = { - rec.setFeatureValue(featureDisplayedUsers, toStringSet(userIds)) - rec.setFeatureValue(featureDisplayedUserCount, displayedUserCount) - rec.setFeatureValue(featureUndisplayedUserCount, undisplayedUserCount) - rec.setFeatureValue(featureTotalUserCount, totalCount) - } - protected def toStringSet(value: Seq[Long]): Set[String] = { - value.map(_.toString).toSet - } -} - -case class FavoritedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0) - extends SocialProofUserGroundTruth(userIds, count) { - - override val featureDisplayedUsers = SocialProofDisplayedFavoritedByUsers - override val featureDisplayedUserCount = SocialProofDisplayedFavoritedByUserCount - override val featureUndisplayedUserCount = SocialProofUndisplayedFavoritedByUserCount - override val featureTotalUserCount = SocialProofTotalFavoritedByUserCount -} - -case class RetweetedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0) - extends SocialProofUserGroundTruth(userIds, count) { - - override val featureDisplayedUsers = SocialProofDisplayedRetweetedByUsers - override val featureDisplayedUserCount = SocialProofDisplayedRetweetedByUserCount - override val featureUndisplayedUserCount = SocialProofUndisplayedRetweetedByUserCount - override val featureTotalUserCount = SocialProofTotalRetweetedByUserCount -} - -case class RepliedBySocialProofUserGroundTruth(userIds: Seq[UserId] = Seq.empty, count: Int = 0) - extends SocialProofUserGroundTruth(userIds, count) { - - override val featureDisplayedUsers = SocialProofDisplayedRepliedByUsers - override val featureDisplayedUserCount = SocialProofDisplayedRepliedByUserCount - override val featureUndisplayedUserCount = SocialProofUndisplayedRepliedByUserCount - override val featureTotalUserCount = SocialProofTotalRepliedByUserCount -} - -case class SocialProofFeatures( - hasSocialProof: Boolean, - favoritedBy: FavoritedBySocialProofUserGroundTruth = FavoritedBySocialProofUserGroundTruth(), - retweetedBy: RetweetedBySocialProofUserGroundTruth = RetweetedBySocialProofUserGroundTruth(), - repliedBy: RepliedBySocialProofUserGroundTruth = RepliedBySocialProofUserGroundTruth()) { - - def setFeatures(dataRecord: DataRecord): Unit = - if (hasSocialProof) { - dataRecord.setFeatureValue(HasSocialProof, hasSocialProof) - favoritedBy.setFeatures(dataRecord) - retweetedBy.setFeatures(dataRecord) - repliedBy.setFeatures(dataRecord) - } -} - -object SocialProofFeatures { - def apply(socialProofs: Seq[SocialProof]): SocialProofFeatures = - socialProofs.foldLeft(SocialProofFeatures(hasSocialProof = socialProofs.nonEmpty))( - (prevFeatures, socialProof) => { - val userIds = socialProof.v1.userIds - val count = socialProof.v1.count - socialProof.v1.socialProofType match { - case SocialProofType.FavoritedBy => - prevFeatures.copy(favoritedBy = FavoritedBySocialProofUserGroundTruth(userIds, count)) - case SocialProofType.RetweetedBy => - prevFeatures.copy(retweetedBy = RetweetedBySocialProofUserGroundTruth(userIds, count)) - case SocialProofType.RepliedBy => - prevFeatures.copy(repliedBy = RepliedBySocialProofUserGroundTruth(userIds, count)) - case _ => - prevFeatures // skip silently instead of breaking jobs, since this isn't used yet - } - }) -} - -object SocialProofDataRecordFeatures { - val HasSocialProof = new Binary("recap.social_proof.has_social_proof") - - val SocialProofDisplayedFavoritedByUsers = new SparseBinary( - "recap.social_proof.list.displayed.favorited_by", - Set(UserId, PublicLikes, PrivateLikes).asJava - ) - val SocialProofDisplayedFavoritedByUserCount = new Continuous( - "recap.social_proof.count.displayed.favorited_by", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val SocialProofUndisplayedFavoritedByUserCount = new Continuous( - "recap.social_proof.count.undisplayed.favorited_by", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val SocialProofTotalFavoritedByUserCount = new Continuous( - "recap.social_proof.count.total.favorited_by", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - - val SocialProofDisplayedRetweetedByUsers = new SparseBinary( - "recap.social_proof.list.displayed.retweeted_by", - Set(UserId, PublicRetweets, PrivateRetweets).asJava - ) - val SocialProofDisplayedRetweetedByUserCount = new Continuous( - "recap.social_proof.count.displayed.retweeted_by", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val SocialProofUndisplayedRetweetedByUserCount = new Continuous( - "recap.social_proof.count.undisplayed.retweeted_by", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val SocialProofTotalRetweetedByUserCount = new Continuous( - "recap.social_proof.count.total.retweeted_by", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - - val SocialProofDisplayedRepliedByUsers = new SparseBinary( - "recap.social_proof.list.displayed.replied_by", - Set(UserId, PublicReplies, PrivateReplies).asJava - ) - val SocialProofDisplayedRepliedByUserCount = new Continuous( - "recap.social_proof.count.displayed.replied_by", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val SocialProofUndisplayedRepliedByUserCount = new Continuous( - "recap.social_proof.count.undisplayed.replied_by", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val SocialProofTotalRepliedByUserCount = new Continuous( - "recap.social_proof.count.total.replied_by", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - - val AllFeatures = Seq( - HasSocialProof, - SocialProofDisplayedFavoritedByUsers, - SocialProofDisplayedFavoritedByUserCount, - SocialProofUndisplayedFavoritedByUserCount, - SocialProofTotalFavoritedByUserCount, - SocialProofDisplayedRetweetedByUsers, - SocialProofDisplayedRetweetedByUserCount, - SocialProofUndisplayedRetweetedByUserCount, - SocialProofTotalRetweetedByUserCount, - SocialProofDisplayedRepliedByUsers, - SocialProofDisplayedRepliedByUserCount, - SocialProofUndisplayedRepliedByUserCount, - SocialProofTotalRepliedByUserCount - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD deleted file mode 100644 index b5c49af36..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/timelines/time_features:time_features-scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD.docx new file mode 100644 index 000000000..2b2bb7cb2 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/time_features/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.docx new file mode 100644 index 000000000..205e95a36 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala deleted file mode 100644 index b398203c3..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/time_features/TimeDataRecordFeatures.scala +++ /dev/null @@ -1,111 +0,0 @@ -package com.twitter.timelines.prediction.features.time_features - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import com.twitter.ml.api.Feature._ -import scala.collection.JavaConverters._ -import com.twitter.util.Duration -import com.twitter.conversions.DurationOps._ - -object TimeDataRecordFeatures { - val TIME_BETWEEN_NON_POLLING_REQUESTS_AVG = new Continuous( - "time_features.time_between_non_polling_requests_avg", - Set(PrivateTimestamp).asJava - ) - val TIME_SINCE_TWEET_CREATION = new Continuous("time_features.time_since_tweet_creation") - val TIME_SINCE_SOURCE_TWEET_CREATION = new Continuous( - "time_features.time_since_source_tweet_creation" - ) - val TIME_SINCE_LAST_NON_POLLING_REQUEST = new Continuous( - "time_features.time_since_last_non_polling_request", - Set(PrivateTimestamp).asJava - ) - val NON_POLLING_REQUESTS_SINCE_TWEET_CREATION = new Continuous( - "time_features.non_polling_requests_since_tweet_creation", - Set(PrivateTimestamp).asJava - ) - val TWEET_AGE_RATIO = new Continuous("time_features.tweet_age_ratio") - val IS_TWEET_RECYCLED = new Binary("time_features.is_tweet_recycled") - // Last Engagement features - val LAST_FAVORITE_SINCE_CREATION_HRS = new Continuous( - "time_features.earlybird.last_favorite_since_creation_hrs", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val LAST_RETWEET_SINCE_CREATION_HRS = new Continuous( - "time_features.earlybird.last_retweet_since_creation_hrs", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val LAST_REPLY_SINCE_CREATION_HRS = new Continuous( - "time_features.earlybird.last_reply_since_creation_hrs", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val LAST_QUOTE_SINCE_CREATION_HRS = new Continuous( - "time_features.earlybird.last_quote_since_creation_hrs", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val TIME_SINCE_LAST_FAVORITE_HRS = new Continuous( - "time_features.earlybird.time_since_last_favorite", - Set(CountOfPrivateLikes, CountOfPublicLikes).asJava - ) - val TIME_SINCE_LAST_RETWEET_HRS = new Continuous( - "time_features.earlybird.time_since_last_retweet", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - val TIME_SINCE_LAST_REPLY_HRS = new Continuous( - "time_features.earlybird.time_since_last_reply", - Set(CountOfPrivateReplies, CountOfPublicReplies).asJava - ) - val TIME_SINCE_LAST_QUOTE_HRS = new Continuous( - "time_features.earlybird.time_since_last_quote", - Set(CountOfPrivateRetweets, CountOfPublicRetweets).asJava - ) - - val TIME_SINCE_VIEWER_ACCOUNT_CREATION_SECS = - new Continuous( - "time_features.time_since_viewer_account_creation_secs", - Set(AccountCreationTime, AgeOfAccount).asJava) - - val USER_ID_IS_SNOWFLAKE_ID = - new Binary("time_features.time_user_id_is_snowflake_id", Set(UserType).asJava) - - val IS_30_DAY_NEW_USER = - new Binary("time_features.is_day_30_new_user", Set(AccountCreationTime, AgeOfAccount).asJava) - val IS_12_MONTH_NEW_USER = - new Binary("time_features.is_month_12_new_user", Set(AccountCreationTime, AgeOfAccount).asJava) - val ACCOUNT_AGE_INTERVAL = - new Discrete("time_features.account_age_interval", Set(AgeOfAccount).asJava) -} - -object AccountAgeInterval extends Enumeration { - val LTE_1_DAY, GT_1_DAY_LTE_5_DAY, GT_5_DAY_LTE_14_DAY, GT_14_DAY_LTE_30_DAY = Value - - def fromDuration(accountAge: Duration): Option[AccountAgeInterval.Value] = { - accountAge match { - case a if (a <= 1.day) => Some(LTE_1_DAY) - case a if (1.day < a && a <= 5.days) => Some(GT_1_DAY_LTE_5_DAY) - case a if (5.days < a && a <= 14.days) => Some(GT_5_DAY_LTE_14_DAY) - case a if (14.days < a && a <= 30.days) => Some(GT_14_DAY_LTE_30_DAY) - case _ => None - } - } -} - -case class TimeFeatures( - isTweetRecycled: Boolean, - timeSinceTweetCreation: Double, - isDay30NewUser: Boolean, - isMonth12NewUser: Boolean, - timeSinceSourceTweetCreation: Double, // same as timeSinceTweetCreation for non-retweets - timeSinceViewerAccountCreationSecs: Option[Double], - timeBetweenNonPollingRequestsAvg: Option[Double] = None, - timeSinceLastNonPollingRequest: Option[Double] = None, - nonPollingRequestsSinceTweetCreation: Option[Double] = None, - tweetAgeRatio: Option[Double] = None, - lastFavSinceCreationHrs: Option[Double] = None, - lastRetweetSinceCreationHrs: Option[Double] = None, - lastReplySinceCreationHrs: Option[Double] = None, - lastQuoteSinceCreationHrs: Option[Double] = None, - timeSinceLastFavoriteHrs: Option[Double] = None, - timeSinceLastRetweetHrs: Option[Double] = None, - timeSinceLastReplyHrs: Option[Double] = None, - timeSinceLastQuoteHrs: Option[Double] = None, - accountAgeInterval: Option[AccountAgeInterval.Value] = None) diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD deleted file mode 100644 index a4ad0eabf..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "graph-feature-service/src/main/thrift/com/twitter/graph_feature_service:graph_feature_service_thrift-scala", - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD.docx new file mode 100644 index 000000000..ddee60e97 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.docx new file mode 100644 index 000000000..eee752f33 Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala deleted file mode 100644 index 03a112578..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeatures.scala +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.timelines.prediction.features.two_hop_features - -import com.twitter.graph_feature_service.thriftscala.EdgeType -import com.twitter.ml.api.Feature._ -import scala.collection.JavaConverters._ -import TwoHopFeaturesConfig.personalDataTypesMap - -object TwoHopFeaturesDescriptor { - val prefix = "two_hop" - val normalizedPostfix = "normalized" - val leftNodeDegreePostfix = "left_degree" - val rightNodeDegreePostfix = "right_degree" - - type TwoHopFeatureMap = Map[(EdgeType, EdgeType), Continuous] - type TwoHopFeatureNodeDegreeMap = Map[EdgeType, Continuous] - - def apply(edgeTypePairs: Seq[(EdgeType, EdgeType)]): TwoHopFeaturesDescriptor = { - new TwoHopFeaturesDescriptor(edgeTypePairs) - } -} - -class TwoHopFeaturesDescriptor(edgeTypePairs: Seq[(EdgeType, EdgeType)]) { - import TwoHopFeaturesDescriptor._ - - def getLeftEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = { - edgeTypePair._1 - } - - def getLeftEdgeName(edgeTypePair: (EdgeType, EdgeType)): String = { - getLeftEdge(edgeTypePair).originalName.toLowerCase - } - - def getRightEdge(edgeTypePair: (EdgeType, EdgeType)): EdgeType = { - edgeTypePair._2 - } - - def getRightEdgeName(edgeTypePair: (EdgeType, EdgeType)): String = { - getRightEdge(edgeTypePair).originalName.toLowerCase - } - - val rawFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map(edgeTypePair => { - val leftEdgeType = getLeftEdge(edgeTypePair) - val leftEdgeName = getLeftEdgeName(edgeTypePair) - val rightEdgeType = getRightEdge(edgeTypePair) - val rightEdgeName = getRightEdgeName(edgeTypePair) - val personalDataTypes = ( - personalDataTypesMap.getOrElse(leftEdgeType, Set.empty) ++ - personalDataTypesMap.getOrElse(rightEdgeType, Set.empty) - ).asJava - val rawFeature = new Continuous(s"$prefix.$leftEdgeName.$rightEdgeName", personalDataTypes) - edgeTypePair -> rawFeature - })(collection.breakOut) - - val leftNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map(edgeTypePair => { - val leftEdgeType = getLeftEdge(edgeTypePair) - val leftEdgeName = getLeftEdgeName(edgeTypePair) - val personalDataTypes = personalDataTypesMap.getOrElse(leftEdgeType, Set.empty).asJava - val leftNodeDegreeFeature = - new Continuous(s"$prefix.$leftEdgeName.$leftNodeDegreePostfix", personalDataTypes) - leftEdgeType -> leftNodeDegreeFeature - })(collection.breakOut) - - val rightNodeDegreeFeaturesMap: TwoHopFeatureNodeDegreeMap = edgeTypePairs.map(edgeTypePair => { - val rightEdgeType = getRightEdge(edgeTypePair) - val rightEdgeName = getRightEdgeName(edgeTypePair) - val personalDataTypes = personalDataTypesMap.getOrElse(rightEdgeType, Set.empty).asJava - val rightNodeDegreeFeature = - new Continuous(s"$prefix.$rightEdgeName.$rightNodeDegreePostfix", personalDataTypes) - rightEdgeType -> rightNodeDegreeFeature - })(collection.breakOut) - - val normalizedFeaturesMap: TwoHopFeatureMap = edgeTypePairs.map(edgeTypePair => { - val leftEdgeType = getLeftEdge(edgeTypePair) - val leftEdgeName = getLeftEdgeName(edgeTypePair) - val rightEdgeType = getRightEdge(edgeTypePair) - val rightEdgeName = getRightEdgeName(edgeTypePair) - val personalDataTypes = ( - personalDataTypesMap.getOrElse(leftEdgeType, Set.empty) ++ - personalDataTypesMap.getOrElse(rightEdgeType, Set.empty) - ).asJava - val normalizedFeature = - new Continuous(s"$prefix.$leftEdgeName.$rightEdgeName.$normalizedPostfix", personalDataTypes) - edgeTypePair -> normalizedFeature - })(collection.breakOut) - - private val rawFeaturesSeq: Seq[Continuous] = rawFeaturesMap.values.toSeq - private val leftNodeDegreeFeaturesSeq: Seq[Continuous] = leftNodeDegreeFeaturesMap.values.toSeq - private val rightNodeDegreeFeaturesSeq: Seq[Continuous] = rightNodeDegreeFeaturesMap.values.toSeq - private val normalizedFeaturesSeq: Seq[Continuous] = normalizedFeaturesMap.values.toSeq - - val featuresSeq: Seq[Continuous] = - rawFeaturesSeq ++ leftNodeDegreeFeaturesSeq ++ rightNodeDegreeFeaturesSeq ++ normalizedFeaturesSeq -} diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.docx b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.docx new file mode 100644 index 000000000..4168010af Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala b/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala deleted file mode 100644 index ece502e30..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/two_hop_features/TwoHopFeaturesConfig.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.timelines.prediction.features.two_hop_features - -import com.twitter.dal.personal_data.thriftjava.PersonalDataType -import com.twitter.graph_feature_service.thriftscala.{EdgeType, FeatureType} - -object TwoHopFeaturesConfig { - val leftEdgeTypes = Seq(EdgeType.Following, EdgeType.Favorite, EdgeType.MutualFollow) - val rightEdgeTypes = Seq( - EdgeType.FollowedBy, - EdgeType.FavoritedBy, - EdgeType.RetweetedBy, - EdgeType.MentionedBy, - EdgeType.MutualFollow) - - val edgeTypePairs: Seq[(EdgeType, EdgeType)] = { - for (leftEdgeType <- leftEdgeTypes; rightEdgeType <- rightEdgeTypes) - yield (leftEdgeType, rightEdgeType) - } - - val featureTypes: Seq[FeatureType] = edgeTypePairs.map(pair => FeatureType(pair._1, pair._2)) - - val personalDataTypesMap: Map[EdgeType, Set[PersonalDataType]] = Map( - EdgeType.Following -> Set(PersonalDataType.CountOfFollowersAndFollowees), - EdgeType.Favorite -> Set( - PersonalDataType.CountOfPrivateLikes, - PersonalDataType.CountOfPublicLikes), - EdgeType.MutualFollow -> Set(PersonalDataType.CountOfFollowersAndFollowees), - EdgeType.FollowedBy -> Set(PersonalDataType.CountOfFollowersAndFollowees) - ) -} diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD b/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD deleted file mode 100644 index 598e0c066..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD +++ /dev/null @@ -1,10 +0,0 @@ -scala_library( - sources = ["*.scala"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "src/java/com/twitter/ml/api:api-base", - "src/thrift/com/twitter/dal/personal_data:personal_data-java", - "src/thrift/com/twitter/timelines/author_features/user_health:thrift-scala", - ], -) diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD.docx b/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD.docx new file mode 100644 index 000000000..69560da7f Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/user_health/BUILD.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.docx b/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.docx new file mode 100644 index 000000000..3b390d55c Binary files /dev/null and b/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.docx differ diff --git a/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala b/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala deleted file mode 100644 index 7c8c7f8b1..000000000 --- a/src/scala/com/twitter/timelines/prediction/features/user_health/UserHealthFeatures.scala +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.timelines.prediction.features.user_health - -import com.twitter.ml.api.Feature -import com.twitter.timelines.author_features.user_health.thriftscala.UserState -import com.twitter.dal.personal_data.thriftjava.PersonalDataType.{UserState => UserStatePDT} -import com.twitter.dal.personal_data.thriftjava.PersonalDataType._ -import scala.collection.JavaConverters._ - -object UserHealthFeatures { - val UserState = new Feature.Discrete("user_health.user_state", Set(UserStatePDT, UserType).asJava) - val IsLightMinusUser = - new Feature.Binary("user_health.is_light_minus_user", Set(UserStatePDT, UserType).asJava) - val AuthorState = - new Feature.Discrete("user_health.author_state", Set(UserStatePDT, UserType).asJava) - val NumAuthorFollowers = - new Feature.Continuous("author_health.num_followers", Set(CountOfFollowersAndFollowees).asJava) - val NumAuthorConnectDays = new Feature.Continuous("author_health.num_connect_days") - val NumAuthorConnect = new Feature.Continuous("author_health.num_connect") - - val IsUserVerifiedUnion = new Feature.Binary("user_account.is_user_verified_union") -} - -case class UserHealthFeatures(id: Long, userStateOpt: Option[UserState]) diff --git a/src/thrift/com/twitter/interaction_graph/BUILD b/src/thrift/com/twitter/interaction_graph/BUILD deleted file mode 100644 index 500c73d77..000000000 --- a/src/thrift/com/twitter/interaction_graph/BUILD +++ /dev/null @@ -1,15 +0,0 @@ -create_thrift_libraries( - base_name = "interaction_graph", - sources = ["*.thrift"], - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "interaction_graph-thrift-java", - provides_scala_name = "interaction_graph-thrift-scala", -) diff --git a/src/thrift/com/twitter/interaction_graph/BUILD.docx b/src/thrift/com/twitter/interaction_graph/BUILD.docx new file mode 100644 index 000000000..ce119296a Binary files /dev/null and b/src/thrift/com/twitter/interaction_graph/BUILD.docx differ diff --git a/src/thrift/com/twitter/interaction_graph/interaction_graph.docx b/src/thrift/com/twitter/interaction_graph/interaction_graph.docx new file mode 100644 index 000000000..ec4639bf8 Binary files /dev/null and b/src/thrift/com/twitter/interaction_graph/interaction_graph.docx differ diff --git a/src/thrift/com/twitter/interaction_graph/interaction_graph.thrift b/src/thrift/com/twitter/interaction_graph/interaction_graph.thrift deleted file mode 100644 index d90df54cf..000000000 --- a/src/thrift/com/twitter/interaction_graph/interaction_graph.thrift +++ /dev/null @@ -1,98 +0,0 @@ -namespace java com.twitter.interaction_graph.thriftjava -#@namespace scala com.twitter.interaction_graph.thriftscala -#@namespace strato com.twitter.interaction_graph - -// These could be either a Vertex or an edge feature name -// when you add a new feature, update VertexFeatureCombiner.java and EdgeFeatureCombiner.java. -enum FeatureName { - num_retweets = 1 - num_favorites = 2 - num_mentions = 3 - num_direct_messages = 4 - num_tweet_clicks = 5 - num_link_clicks = 6 - num_profile_views = 7 - num_follows = 8 - num_unfollows = 9 - num_mutual_follows = 10 - address_book_email = 11 - address_book_phone = 12 - address_book_in_both = 13 - address_book_mutual_edge_email = 14 - address_book_mutual_edge_phone = 15 - address_book_mutual_edge_in_both = 16 - total_dwell_time = 17 - num_inspected_statuses = 18 - num_photo_tags = 19 - num_blocks = 20 - num_mutes = 21 - num_report_as_abuses = 22 - num_report_as_spams = 23 - num_tweet_quotes = 24 - num_push_opens = 25 - num_ntab_clicks = 26, - num_rt_favories = 27, - num_rt_replies = 28, - num_rt_tweet_quotes = 29, - num_rt_retweets = 30, - num_rt_mentions = 31, - num_rt_tweet_clicks = 32, - num_rt_link_clicks = 33 - num_shares = 34, - num_email_click = 35, - num_email_open = 36, - num_ntab_dislike_7_days = 37, - num_push_dismiss = 38, - num_push_report_tweet_click = 39, - num_push_report_user_click = 40, - num_replies = 41, - // vertex features after 128 - num_create_tweets = 129, -} -// do remember to update the tests in InteractionGraphAggregationJobTest when adding new features but not updating agg_all - -struct TimeSeriesStatistics { - 1: required double mean; - // For computing variance online: http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm - 2: required double m2_for_variance; - 3: required double ewma; // Exponentially weighted moving average: ewma_t = \alpha x_t + (1-\alpha) ewma_{t-1} - 4: required i32 num_elapsed_days; // Total number of days since we started counting this feature - 5: required i32 num_non_zero_days; // Number of days when the interaction was non-zero (used to compute mean/variance) - 6: optional i32 num_days_since_last; // Number of days since the latest interaction happen -}(persisted="true", hasPersonalData = 'false') - -struct VertexFeature { - 1: required FeatureName name; - 2: required bool outgoing; // direction e.g. true is num_retweets_by_user, and false is num_retweets_for_user - 3: required TimeSeriesStatistics tss; -}(persisted="true", hasPersonalData = 'false') - -struct Vertex { - 1: required i64 user_id(personalDataType = 'UserId'); - 2: optional double weight; - 3: list features; -}(persisted="true", hasPersonalData = 'true') - -/* - * These features are for an edge (a->b). Examples: - * (i) follow is whether a follows b - * (ii) num_retweets is number of b's tweets retweet by a - */ -struct EdgeFeature { - 1: required FeatureName name; - 2: required TimeSeriesStatistics tss; -}(persisted="true", hasPersonalData = 'false') - -struct Edge { - 1: required i64 source_id(personalDataType = 'UserId'); - 2: required i64 destination_id(personalDataType = 'UserId'); - 3: optional double weight; - 4: list features; -}(persisted="true", hasPersonalData = 'true') - -// these structs below are used by our ml pipeline -struct EdgeLabel { - 1: required i64 source_id(personalDataType = 'UserId'); - 2: required i64 destination_id(personalDataType = 'UserId'); - 3: required set labels(personalDataType = 'AggregateImpressionEngagementData'); -}(persisted="true", hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/recos/recos.docx b/src/thrift/com/twitter/recos/recos.docx new file mode 100644 index 000000000..ffa8efc41 Binary files /dev/null and b/src/thrift/com/twitter/recos/recos.docx differ diff --git a/src/thrift/com/twitter/recos/recos.thrift b/src/thrift/com/twitter/recos/recos.thrift deleted file mode 100644 index a0c6c8f03..000000000 --- a/src/thrift/com/twitter/recos/recos.thrift +++ /dev/null @@ -1,176 +0,0 @@ -namespace java com.twitter.recos.thriftjava -#@namespace scala com.twitter.recos.thriftscala -namespace rb Recos - -include "com/twitter/recos/features/tweet.thrift" - -enum RecommendTweetDisplayLocation { - HomeTimeline = 0 - Peek = 1 - WelcomeFlow = 2 - NetworkDigest = 3 - BackfillDigest = 4 - NetworkDigestExp1 = 5 - NetworkDigestExp2 = 6 // deprecated - NetworkDigestExp3 = 7 // deprecated - HttpEndpoint = 8 - HomeTimeline1 = 9 - HomeTimeline2 = 10 - HomeTimeline3 = 11 - HomeTimeline4 = 12 - Poptart = 13 - NetworkDigestExp4 = 14 - NetworkDigestExp5 = 15 - NetworkDigestExp6 = 16 - NetworkDigestExp7 = 17 - NetworkDigestExp8 = 18 - NetworkDigestExp9 = 19 - InstantTimeline1 = 20 // AB1 + whitelist - InstantTimeline2 = 21 // AB1 + !whitelist - InstantTimeline3 = 22 // AB2 + whitelist - InstantTimeline4 = 23 // AB2 + !whitelist - BackfillDigestActive = 24 // deprecated - BackfillDigestDormant = 25 // deprecated - ExploreUS = 26 // deprecated - ExploreBR = 27 // deprecated - ExploreIN = 28 // deprecated - ExploreES = 29 // deprecated - ExploreJP = 30 // deprecated - MagicRecs = 31 - MagicRecs1 = 32 - MagicRecs2 = 33 - MagicRecs3 = 34 - SMSDiscover = 35 - FastFollower = 36 - InstantTimeline5 = 37 // for instant timeline experiment - InstantTimeline6 = 38 // for instant timeline experiment - InstantTimeline7 = 39 // for instant timeline experiment - InstantTimeline8 = 40 // for instant timeline experiment - LoggedOutProfile = 41 - LoggedOutPermalink = 42 - Poptart2 = 43 -} - -enum RelatedTweetDisplayLocation { - Permalink = 0 - Permalink1 = 1 - MobilePermalink = 2 - Permalink3 = 3 - Permalink4 = 4 - RelatedTweets = 5 - RelatedTweets1 = 6 - RelatedTweets2 = 7 - RelatedTweets3 = 8 - RelatedTweets4 = 9 - LoggedOutProfile = 10 - LoggedOutPermalink = 11 -} - -enum DDGBucket { - Control = 0 - Treatment = 1 - None = 2 -} - -struct RecommendTweetRequest { - 1: required i64 requesterId // user id of the requesting user - 2: required RecommendTweetDisplayLocation displayLocation // display location from the client - 3: optional i64 clientId // twitter api client id - 4: optional i32 maxResults // number of suggested results to return - 5: optional list excludedTweetIds // list of tweet ids to exclude from response - 6: optional list excludedAuthorIds // list of author ids to exclude from response - 7: optional i64 guestId // guestId - 8: optional string languageCode // Language code - 9: optional string countryCode // Country code - 10: optional string ipAddress // ip address of the user - 11: optional string deviceId // udid/uuid of device - 12: optional bool populateTweetFeatures // whether to populate tweet features. RecommendedTweet.tweetFeatures in the response will only be populated if this is set. -} - -struct Bucket { - 1: required string experimentName // name of experiment (or not). experiment could be production or whatever fits - 2: required string bucket // name of bucket (may or may not be a DDG bucket, e.g., production) -} - -struct RelatedTweetRequest { - 1: required i64 tweetId // original tweet id - 2: required RelatedTweetDisplayLocation displayLocation // display location from the client - 3: optional i64 clientId // twitter api client id - 4: optional i64 requesterId // user id of the requesting user - 5: optional i32 maxResults // number of suggested results to return - 6: optional list excludeTweetIds // list of tweet ids to exclude from response - 7: optional list excludedAuthorIds // list of author ids to exclude from response - 8: optional i64 guestId // guestId - 9: optional string languageCode // Language code - 10: optional string countryCode // Country code - 11: optional string ipAddress // ip address of the user - 12: optional string deviceId // udid/uuid of device - 13: optional string userAgent // userAgent of the requesting user -} - -enum SocialProofType { - FollowedBy = 1, - FavoritedBy = 2, - RetweetedBy = 3, - SimilarTo = 4, - RESERVED_2 = 5, - RESERVED_3 = 6, - RESERVED_4 = 7, - RESERVED_5 = 8, - RESERVED_6 = 9, - RESERVED_7 = 10 -} - -enum Algorithm { - Salsa = 1, - PastEmailClicks = 2, - SimilarToEmailClicks = 3, - PastClientEventClicks = 4, - VitNews = 5, - StrongTieScoring = 6, - PollsFromGraph = 7, - PollsBasedOnGeo = 8, - RESERVED_9 = 9, - RESERVED_10 = 10, - RESERVED_11 = 11, -} - -struct RecommendedTweet { - 1: required i64 tweetId - 2: required i64 authorId - 3: required list socialProof - 4: required string feedbackToken - 5: optional list favBy // optionally provide a list of users who fav'ed the tweet if exist - 6: optional tweet.RecommendedTweetFeatures tweetFeatures // the features of a recommended tweet - 7: optional SocialProofType socialProofType // type of social proof. favBy should be deprecated soon - 8: optional string socialProofOverride // should be set only for DDGs, for en-only experiments. SocialProofType is ignored when this field is set - 9: optional Algorithm algorithm // algorithm used - 10: optional double score // score - 11: optional bool isFollowingAuthor // true if the target user follows the author of the tweet -} - -struct RelatedTweet { - 1: required i64 tweetId - 2: required i64 authorId - 3: required double score - 4: required string feedbackToken -} - -struct RecommendTweetResponse { - 1: required list tweets - 2: optional DDGBucket bucket // deprecated - 3: optional Bucket assignedBucket // for client-side experimentation -} - -struct RelatedTweetResponse { - 1: required list tweets // a list of related tweets - 2: optional Bucket assignedBucket // the bucket used for treatment -} - -/** - * The main interface-definition for Recos. - */ -service Recos { - RecommendTweetResponse recommendTweets (RecommendTweetRequest request) - RelatedTweetResponse relatedTweets (RelatedTweetRequest request) -} diff --git a/src/thrift/com/twitter/recos/recos_common.docx b/src/thrift/com/twitter/recos/recos_common.docx new file mode 100644 index 000000000..739f8a659 Binary files /dev/null and b/src/thrift/com/twitter/recos/recos_common.docx differ diff --git a/src/thrift/com/twitter/recos/recos_common.thrift b/src/thrift/com/twitter/recos/recos_common.thrift deleted file mode 100644 index ece39b8df..000000000 --- a/src/thrift/com/twitter/recos/recos_common.thrift +++ /dev/null @@ -1,54 +0,0 @@ -namespace java com.twitter.recos.recos_common.thriftjava -namespace py gen.twitter.recos.recos_common -#@namespace scala com.twitter.recos.recos_common.thriftscala -#@namespace strato com.twitter.recos.recos_common -namespace rb Recos - -// Social proof types for user moment recommendations -enum MomentSocialProofType { - PUBLISH = 0 - LIKE = 1 - CAPSULE_OPEN = 2 -} - -// Social proof types for tweet/entity recommendations -enum SocialProofType { - CLICK = 0 - FAVORITE = 1 - RETWEET = 2 - REPLY = 3 - TWEET = 4 - IS_MENTIONED = 5 - IS_MEDIATAGGED = 6 - QUOTE = 7 -} - -struct SocialProof { - 1: required i64 userId - 2: optional i64 metadata -} - -// Social proof types for user recommendations -enum UserSocialProofType { - FOLLOW = 0 - MENTION = 1 - MEDIATAG = 2 -} - -struct GetRecentEdgesRequest { - 1: required i64 requestId // the node to query from - 2: optional i32 maxNumEdges // the max number of recent edges -} - -struct RecentEdge { - 1: required i64 nodeId // the connecting node id - 2: required SocialProofType engagementType // the engagement type of the edge -} - -struct GetRecentEdgesResponse { - 1: required list edges // the _ most recent edges from the query node -} - -struct NodeInfo { - 1: required list edges -} diff --git a/src/thrift/com/twitter/recos/recos_injector.docx b/src/thrift/com/twitter/recos/recos_injector.docx new file mode 100644 index 000000000..ed76c0544 Binary files /dev/null and b/src/thrift/com/twitter/recos/recos_injector.docx differ diff --git a/src/thrift/com/twitter/recos/recos_injector.thrift b/src/thrift/com/twitter/recos/recos_injector.thrift deleted file mode 100644 index b11bc5c09..000000000 --- a/src/thrift/com/twitter/recos/recos_injector.thrift +++ /dev/null @@ -1,22 +0,0 @@ -namespace java com.twitter.recos.recos_injector.thriftjava -namespace py gen.twitter.recos.recos_injector -#@namespace scala com.twitter.recos.recos_injector.thriftscala -namespace rb RecosInjector - -####### FOR RECOS INTERNAL USE ONLY -- please do NOT use this in client code ######## - -struct UserTweetAuthorGraphMessage { - 1: required i64 leftId - 2: required i64 rightId - 3: required i8 action - 4: optional i8 card - 5: optional i64 authorId - 6: optional Features features -} - -struct Features { - 1: optional bool hasPhoto - 2: optional bool hasVideo - 3: optional bool hasUrl - 4: optional bool hasHashtag -} diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD b/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD deleted file mode 100644 index ffd17d734..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -RECOSGRAPH_SOURCES = ["user_tweet_entity_graph.thrift"] - -create_thrift_libraries( - base_name = "user_tweet_entity_graph", - sources = RECOSGRAPH_SOURCES, - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "src/thrift/com/twitter/recos:recos-common", - "src/thrift/com/twitter/recos/features:tweet", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "user_tweet_entity_graph-java", - provides_scala_name = "user_tweet_entity_graph-scala", -) diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD.docx b/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD.docx new file mode 100644 index 000000000..9bbc81829 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_entity_graph/BUILD.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.docx b/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.docx new file mode 100644 index 000000000..4c77b68b2 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.ini b/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.ini deleted file mode 100644 index eae222a68..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_entity_graph/CONFIG.ini +++ /dev/null @@ -1,7 +0,0 @@ -; See http://go/CONFIG.ini - -[jira] -project: SD - -[kite] -project: recos diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.docx b/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.docx new file mode 100644 index 000000000..fa337b877 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.thrift b/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.thrift deleted file mode 100644 index 961fd2bc5..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_entity_graph/user_tweet_entity_graph.thrift +++ /dev/null @@ -1,187 +0,0 @@ -namespace java com.twitter.recos.user_tweet_entity_graph.thriftjava -namespace py gen.twitter.recos.user_tweet_entity_graph -#@namespace scala com.twitter.recos.user_tweet_entity_graph.thriftscala -#@namespace strato com.twitter.recos.user_tweet_entity_graph -namespace rb UserTweetEntityGraph - -include "com/twitter/recos/features/tweet.thrift" -include "com/twitter/recos/recos_common.thrift" - -enum TweetType { - Summary = 0 - Photo = 1 - Player = 2 - Promote = 3 - Regular = 4 -} - -enum RecommendationType { - Tweet = 0 - Hashtag = 1 // Entity type - Url = 2 // Entity type -} - -enum TweetEntityDisplayLocation { - MagicRecs = 0 - HomeTimeline = 1 - HighlightsEmailUrlRecs = 2 - Highlights = 3 - Email = 4 - MagicRecsF1 = 5 - GuideVideo = 6 - MagicRecsRareTweet = 7 - TopArticles = 8 // Twitter Blue most shared articles page - ContentRecommender = 9 - FrigateNTab = 10 -} - -struct RecommendTweetEntityRequest { - // user id of the requesting user - 1: required i64 requesterId - - // display location from the client - 2: required TweetEntityDisplayLocation displayLocation - - // the recommendation entity types to return - 3: required list recommendationTypes - - // seed ids and weights used in left hand side - 4: required map seedsWithWeights - - // number of suggested results per recommendation entity type - 5: optional map maxResultsByType - - // the tweet age threshold in milliseconds - 6: optional i64 maxTweetAgeInMillis - - // list of tweet ids to exclude from response - 7: optional list excludedTweetIds - - // max user social proof size per engagement type - 8: optional i32 maxUserSocialProofSize - - // max tweet social proof size per user - 9: optional i32 maxTweetSocialProofSize - - // min user social proof size per each recommendation entity type - 10: optional map minUserSocialProofSizes - - // summary, photo, player, promote, regular - 11: optional list tweetTypes - - // the list of social proof types to return - 12: optional list socialProofTypes - - // set of groups of social proof types allowed to be combined for comparison against minUserSocialProofSizes. - // e.g. if the input is set>, then the union of those two social proofs - // will be compared against the minUserSocialProofSize of Tweet RecommendationType. - 13: optional set> socialProofTypeUnions - - // the recommendations returned in the response are authored by the following users - 14: optional set tweetAuthors - - // the tweet engagement age threshold in milliseconds - 15: optional i64 maxEngagementAgeInMillis - - // the recommendations will not return any tweet authored by the following users - 16: optional set excludedTweetAuthors -} - -struct TweetRecommendation { - // tweet id - 1: required i64 tweetId - // sum of weights of seed users who engaged with the tweet. - // If a user engaged with the same tweet twice, liked it and retweeted it, then his/her weight was counted twice. - 2: required double score - // user social proofs per engagement type - 3: required map> socialProofByType - // user social proofs along with edge metadata per engagement type. The value of the map is a list of SocialProofs. - 4: optional map> socialProofs -} - -struct HashtagRecommendation { - 1: required i32 id // integer hashtag id, which will be converted to hashtag string by client library. - 2: required double score - // sum of weights of seed users who engaged with the hashtag. - // If a user engaged with the same hashtag twice, liked it and retweeted it, then his/her weight was counted twice. - 3: required map>> socialProofByType - // user and tweet social proofs per engagement type. The key of inner map is user id, and the value of inner map is - // a list of tweet ids that the user engaged with. -} - -struct UrlRecommendation { - 1: required i32 id // integer url id, which will be converted to url string by client library. - 2: required double score - // sum of weights of seed users who engaged with the url. - // If a user engaged with the same url twice, liked it and retweeted it, then his/her weight was counted twice. - 3: required map>> socialProofByType - // user and tweet social proofs per engagement type. The key of inner map is user id, and the value of inner map is - // a list of tweet ids that the user engaged with. -} - -union UserTweetEntityRecommendationUnion { - 1: TweetRecommendation tweetRec - 2: HashtagRecommendation hashtagRec - 3: UrlRecommendation urlRec -} - -struct RecommendTweetEntityResponse { - 1: required list recommendations -} - -struct SocialProofRequest { - 1: required list inputTweets // Only for some tweets we need requst its social proofs. - 2: required map seedsWithWeights // a set of seed users with weights - 3: optional i64 requesterId // id of the requesting user - 4: optional list socialProofTypes // the list of social proof types to return -} - -struct SocialProofResponse { - 1: required list socialProofResults -} - -struct RecommendationSocialProofRequest { - /** - * Clients can request social proof from multiple recommendation types in a single request. - * NOTE: Avoid mixing tweet social proof requests with entity social proof requests as the - * underlying library call retrieves these differently. - */ - 1: required map> recommendationIdsForSocialProof - // These will be the only valid LHS nodes used to fetch social proof. - 2: required map seedsWithWeights - 3: optional i64 requesterId - // The list of valid social proof types to return, e.g. we may only want Favorite and Tweet proofs. - 4: optional list socialProofTypes -} - -struct RecommendationSocialProofResponse { - 1: required list socialProofResults -} - -/** - * The main interface-definition for UserTweetEntityGraph. - */ -service UserTweetEntityGraph { - RecommendTweetEntityResponse recommendTweets (RecommendTweetEntityRequest request) - - /** - * Given a query user, its seed users, and a set of input tweets, return the social proofs of - * input tweets if any. - * - * Currently this supports clients such as Email Recommendations, MagicRecs, and HomeTimeline. - * In order to avoid heavy migration work, we are retaining this endpoint. - */ - SocialProofResponse findTweetSocialProofs(SocialProofRequest request) - - /** - * Find social proof for the specified RecommendationType given a set of input ids of that type. - * Only find social proofs from the specified seed users with the specified social proof types. - * - * Currently this supports url social proof generation for Guide. - * - * This endpoint is flexible enough to support social proof generation for all recommendation - * types, and should be used for all future clients of this service. - */ - RecommendationSocialProofResponse findRecommendationSocialProofs(RecommendationSocialProofRequest request) -} - diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/BUILD b/src/thrift/com/twitter/recos/user_tweet_graph/BUILD deleted file mode 100644 index 5f9f68eb3..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_graph/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -RECOSGRAPH_SOURCES = ["user_tweet_graph.thrift"] - -create_thrift_libraries( - base_name = "user_tweet_graph", - sources = RECOSGRAPH_SOURCES, - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "src/thrift/com/twitter/recos:recos-common", - "src/thrift/com/twitter/recos/features:tweet", - ], - export_roots = [ - "src/thrift/com/twitter/recos/features:tweet", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "user_tweet_graph-java", - provides_scala_name = "user_tweet_graph-scala", -) diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/BUILD.docx b/src/thrift/com/twitter/recos/user_tweet_graph/BUILD.docx new file mode 100644 index 000000000..8facd45cb Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_graph/BUILD.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.docx b/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.docx new file mode 100644 index 000000000..1aa36df18 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.ini b/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.ini deleted file mode 100644 index eae222a68..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_graph/CONFIG.ini +++ /dev/null @@ -1,7 +0,0 @@ -; See http://go/CONFIG.ini - -[jira] -project: SD - -[kite] -project: recos diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.docx b/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.docx new file mode 100644 index 000000000..4609f8f17 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.docx differ diff --git a/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.thrift b/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.thrift deleted file mode 100644 index 43f294eb1..000000000 --- a/src/thrift/com/twitter/recos/user_tweet_graph/user_tweet_graph.thrift +++ /dev/null @@ -1,172 +0,0 @@ -namespace java com.twitter.recos.user_tweet_graph.thriftjava -namespace py gen.twitter.recos.user_tweet_graph -#@namespace scala com.twitter.recos.user_tweet_graph.thriftscala -#@namespace strato com.twitter.recos.user_tweet_graph -namespace rb UserTweetGraph - -include "com/twitter/recos/features/tweet.thrift" -include "com/twitter/recos/recos_common.thrift" - -enum TweetType { - Summary = 0 - Photo = 1 - Player = 2 - Promote = 3 - Regular = 4 -} - -enum Algorithm { - Salsa = 0 - SubGraphSalsa = 1 -} - -enum RecommendTweetDisplayLocation { - HomeTimeline = 0 - WelcomeFlow = 1 - NetworkDigest = 2 - BackfillDigest = 3 - HttpEndpoint = 4 - Poptart = 5 - InstantTimeline = 6 - Explore = 7 - MagicRecs = 8 - LoggedOutProfile = 9 - LoggedOutPermalink = 10 - VideoHome = 11 -} - -struct RecommendTweetRequest { - 1: required i64 requesterId // user id of the requesting user - 2: required RecommendTweetDisplayLocation displayLocation // display location from the client - 3: required i32 maxResults // number of suggested results to return - 4: required list excludedTweetIds // list of tweet ids to exclude from response - 5: required map seeds // seeds used in salsa random walk - 6: required i64 tweetRecency // the tweet recency threshold - 7: required i32 minInteraction // minimum interaction threshold - 8: required list includeTweetTypes // summary, photo, player, promote, other - 9: required double resetProbability // reset probability to query node - 10: required double queryNodeWeightFraction // the percentage of weights assigned to query node in seeding - 11: required i32 numRandomWalks // number of random walks - 12: required i32 maxRandomWalkLength // max random walk length - 13: required i32 maxSocialProofSize // max social proof size - 14: required Algorithm algorithm // algorithm type - 15: optional list socialProofTypes // the list of social proof types to return -} - -struct RecommendedTweet { - 1: required i64 tweetId - 2: required double score - 3: optional list socialProof // social proof in aggregate - 4: optional map> socialProofPerType // social proofs per engagement type -} - -struct RecommendTweetResponse { - 1: required list tweets -} - -enum RelatedTweetDisplayLocation { - Permalink = 0 - Permalink1 = 1 - MobilePermalink = 2 - Permalink3 = 3 - Permalink4 = 4 - RelatedTweets = 5 - RelatedTweets1 = 6 - RelatedTweets2 = 7 - RelatedTweets3 = 8 - RelatedTweets4 = 9 - LoggedOutProfile = 10 - LoggedOutPermalink = 11 -} - -struct UserTweetFeatureResponse { - 1: optional double favAdamicAdarAvg - 2: optional double favAdamicAdarMax - 3: optional double favLogCosineAvg - 4: optional double favLogCosineMax - 5: optional double retweetAdamicAdarAvg - 6: optional double retweetAdamicAdarMax - 7: optional double retweetLogCosineAvg - 8: optional double retweetLogCosineMax -} - -struct RelatedTweetRequest { - 1: required i64 tweetId // original tweet id - 2: required RelatedTweetDisplayLocation displayLocation // display location from the client - 3: optional string algorithm // additional parameter that the system can interpret - 4: optional i64 requesterId // user id of the requesting user - 5: optional i32 maxResults // number of suggested results to return - 6: optional list excludeTweetIds // list of tweet ids to exclude from response - 7: optional i32 maxNumNeighbors - 8: optional i32 minNeighborDegree - 9: optional i32 maxNumSamplesPerNeighbor - 10: optional i32 minCooccurrence - 11: optional i32 minQueryDegree - 12: optional double maxLowerMultiplicativeDeviation - 13: optional double maxUpperMultiplicativeDeviation - 14: optional bool populateTweetFeatures // whether to populate graph features - 15: optional i32 minResultDegree - 16: optional list additionalTweetIds - 17: optional double minScore - 18: optional i32 maxTweetAgeInHours -} - -struct TweetBasedRelatedTweetRequest { - 1: required i64 tweetId // query tweet id - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minQueryDegree // min degree of query tweet - 5: optional i32 maxNumSamplesPerNeighbor // max number of sampled users who engaged with the query tweet - 6: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 7: optional i32 minResultDegree // min degree of related tweet candidate - 8: optional double minScore // min score of related tweet candidate - 9: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct ProducerBasedRelatedTweetRequest { - 1: required i64 producerId // query producer id - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minQueryDegree // min degree of query producer, e.g. number of followers - 5: optional i32 maxNumFollowers // max number of sampled users who follow the query producer - 6: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 7: optional i32 minResultDegree // min degree of related tweet candidate - 8: optional double minScore // min score of related tweet candidate - 9: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct ConsumersBasedRelatedTweetRequest { - 1: required list consumerSeedSet // query consumer userId set - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 5: optional i32 minResultDegree // min degree of related tweet candidate - 6: optional double minScore // min score of related tweet candidate - 7: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct RelatedTweet { - 1: required i64 tweetId - 2: required double score - 3: optional tweet.GraphFeaturesForTweet relatedTweetGraphFeatures -} - -struct RelatedTweetResponse { - 1: required list tweets - 2: optional tweet.GraphFeaturesForQuery queryTweetGraphFeatures -} - -/** - * The main interface-definition for UserTweetGraph. - */ -service UserTweetGraph { - RecommendTweetResponse recommendTweets (RecommendTweetRequest request) - recos_common.GetRecentEdgesResponse getLeftNodeEdges (recos_common.GetRecentEdgesRequest request) - recos_common.NodeInfo getRightNode (i64 node) - RelatedTweetResponse relatedTweets (RelatedTweetRequest request) - RelatedTweetResponse tweetBasedRelatedTweets (TweetBasedRelatedTweetRequest request) - RelatedTweetResponse producerBasedRelatedTweets (ProducerBasedRelatedTweetRequest request) - RelatedTweetResponse consumersBasedRelatedTweets (ConsumersBasedRelatedTweetRequest request) - UserTweetFeatureResponse userTweetFeatures (1: required i64 userId, 2: required i64 tweetId) -} - diff --git a/src/thrift/com/twitter/recos/user_user_graph/BUILD b/src/thrift/com/twitter/recos/user_user_graph/BUILD deleted file mode 100644 index ef53f847a..000000000 --- a/src/thrift/com/twitter/recos/user_user_graph/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -RECOSGRAPH_SOURCES = ["user_user_graph.thrift"] - -create_thrift_libraries( - base_name = "user_user_graph", - sources = RECOSGRAPH_SOURCES, - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "src/thrift/com/twitter/recos:recos-common", - "src/thrift/com/twitter/recos/features:tweet", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "user_user_graph-java", - provides_scala_name = "user_user_graph-scala", -) diff --git a/src/thrift/com/twitter/recos/user_user_graph/BUILD.docx b/src/thrift/com/twitter/recos/user_user_graph/BUILD.docx new file mode 100644 index 000000000..79bb31e78 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_user_graph/BUILD.docx differ diff --git a/src/thrift/com/twitter/recos/user_user_graph/CONFIG.docx b/src/thrift/com/twitter/recos/user_user_graph/CONFIG.docx new file mode 100644 index 000000000..1aa36df18 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_user_graph/CONFIG.docx differ diff --git a/src/thrift/com/twitter/recos/user_user_graph/CONFIG.ini b/src/thrift/com/twitter/recos/user_user_graph/CONFIG.ini deleted file mode 100644 index eae222a68..000000000 --- a/src/thrift/com/twitter/recos/user_user_graph/CONFIG.ini +++ /dev/null @@ -1,7 +0,0 @@ -; See http://go/CONFIG.ini - -[jira] -project: SD - -[kite] -project: recos diff --git a/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.docx b/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.docx new file mode 100644 index 000000000..ba8385d35 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.docx differ diff --git a/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.thrift b/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.thrift deleted file mode 100644 index 10115c8d9..000000000 --- a/src/thrift/com/twitter/recos/user_user_graph/user_user_graph.thrift +++ /dev/null @@ -1,45 +0,0 @@ -namespace java com.twitter.recos.user_user_graph.thriftjava -namespace py gen.twitter.recos.user_user_graph -#@namespace scala com.twitter.recos.user_user_graph.thriftscala -#@namespace strato com.twitter.recos.user_user_graph -namespace rb UserUserGraph - -include "com/twitter/recos/recos_common.thrift" - -enum RecommendUserDisplayLocation { - MagicRecs = 0 - HomeTimeLine = 1 - ConnectTab = 2 -} - -struct RecommendUserRequest { - 1: required i64 requesterId // user id of the requesting user - 2: required RecommendUserDisplayLocation displayLocation // display location from the client - 3: required map seedsWithWeights // seed ids and weights used in left hand side - 4: optional list excludedUserIds // list of users to exclude from response - 5: optional i32 maxNumResults // number of results to return - 6: optional i32 maxNumSocialProofs // number of social proofs per recommendation - 7: optional map minUserPerSocialProof // minimum number of users for each social proof type - 8: optional list socialProofTypes // list of required social proof types. Any recommended user - // must at least have all of these social proof types - 9: optional i64 maxEdgeEngagementAgeInMillis // only events created during this period are counted -} - -struct RecommendedUser { - 1: required i64 userId // user id of recommended user - 2: required double score // weight of the recommended user - 3: required map> socialProofs // the social proofs of the recommended user -} - -struct RecommendUserResponse { - 1: required list recommendedUsers // list of recommended users -} - -/** - * The main interface-definition for UserUserGraph. - */ -service UserUserGraph { - // Given a request for recommendations for a specific user, - // return a list of candidate users along with their social proofs - RecommendUserResponse recommendUsers (RecommendUserRequest request) -} diff --git a/src/thrift/com/twitter/recos/user_video_graph/BUILD b/src/thrift/com/twitter/recos/user_video_graph/BUILD deleted file mode 100644 index f9dcbb8b1..000000000 --- a/src/thrift/com/twitter/recos/user_video_graph/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -RECOSGRAPH_SOURCES = ["user_video_graph.thrift"] - -create_thrift_libraries( - base_name = "user_video_graph", - sources = RECOSGRAPH_SOURCES, - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "src/thrift/com/twitter/recos:recos-common", - "src/thrift/com/twitter/recos/features:tweet", - ], - export_roots = [ - "src/thrift/com/twitter/recos/features:tweet", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "user_video_graph-java", - provides_scala_name = "user_video_graph-scala", -) diff --git a/src/thrift/com/twitter/recos/user_video_graph/BUILD.docx b/src/thrift/com/twitter/recos/user_video_graph/BUILD.docx new file mode 100644 index 000000000..f7d426e30 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_video_graph/BUILD.docx differ diff --git a/src/thrift/com/twitter/recos/user_video_graph/CONFIG.docx b/src/thrift/com/twitter/recos/user_video_graph/CONFIG.docx new file mode 100644 index 000000000..1aa36df18 Binary files /dev/null and b/src/thrift/com/twitter/recos/user_video_graph/CONFIG.docx differ diff --git a/src/thrift/com/twitter/recos/user_video_graph/CONFIG.ini b/src/thrift/com/twitter/recos/user_video_graph/CONFIG.ini deleted file mode 100644 index eae222a68..000000000 --- a/src/thrift/com/twitter/recos/user_video_graph/CONFIG.ini +++ /dev/null @@ -1,7 +0,0 @@ -; See http://go/CONFIG.ini - -[jira] -project: SD - -[kite] -project: recos diff --git a/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.docx b/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.docx new file mode 100644 index 000000000..6c159d77d Binary files /dev/null and b/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.docx differ diff --git a/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.thrift b/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.thrift deleted file mode 100644 index a5d83c1d6..000000000 --- a/src/thrift/com/twitter/recos/user_video_graph/user_video_graph.thrift +++ /dev/null @@ -1,64 +0,0 @@ -namespace java com.twitter.recos.user_video_graph.thriftjava -namespace py gen.twitter.recos.user_video_graph -#@namespace scala com.twitter.recos.user_video_graph.thriftscala -#@namespace strato com.twitter.recos.user_video_graph -namespace rb UserVideoGraph - -include "com/twitter/recos/features/tweet.thrift" -include "com/twitter/recos/recos_common.thrift" - - -struct TweetBasedRelatedTweetRequest { - 1: required i64 tweetId // query tweet id - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minQueryDegree // min degree of query tweet - 5: optional i32 maxNumSamplesPerNeighbor // max number of sampled users who engaged with the query tweet - 6: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 7: optional i32 minResultDegree // min degree of related tweet candidate - 8: optional double minScore // min score of related tweet candidate - 9: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct ProducerBasedRelatedTweetRequest { - 1: required i64 producerId // query producer id - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minQueryDegree // min degree of query producer, e.g. number of followers - 5: optional i32 maxNumFollowers // max number of sampled users who follow the query producer - 6: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 7: optional i32 minResultDegree // min degree of related tweet candidate - 8: optional double minScore // min score of related tweet candidate - 9: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct ConsumersBasedRelatedTweetRequest { - 1: required list consumerSeedSet // query consumer userId set - 2: optional i32 maxResults // number of suggested results to return - 3: optional list excludeTweetIds // list of tweet ids to exclude from response - 4: optional i32 minCooccurrence // min co-occurrence of related tweet candidate - 5: optional i32 minResultDegree // min degree of related tweet candidate - 6: optional double minScore // min score of related tweet candidate - 7: optional i32 maxTweetAgeInHours // max tweet age in hours of related tweet candidate -} - -struct RelatedTweet { - 1: required i64 tweetId - 2: required double score - 3: optional tweet.GraphFeaturesForTweet relatedTweetGraphFeatures -} - -struct RelatedTweetResponse { - 1: required list tweets - 2: optional tweet.GraphFeaturesForQuery queryTweetGraphFeatures -} - -/** - * The main interface-definition for UserVideoGraph. - */ -service UserVideoGraph { - RelatedTweetResponse tweetBasedRelatedTweets (TweetBasedRelatedTweetRequest request) - RelatedTweetResponse producerBasedRelatedTweets (ProducerBasedRelatedTweetRequest request) - RelatedTweetResponse consumersBasedRelatedTweets (ConsumersBasedRelatedTweetRequest request) -} - diff --git a/src/thrift/com/twitter/search/common/ranking/ranking.docx b/src/thrift/com/twitter/search/common/ranking/ranking.docx new file mode 100644 index 000000000..29aff4223 Binary files /dev/null and b/src/thrift/com/twitter/search/common/ranking/ranking.docx differ diff --git a/src/thrift/com/twitter/search/common/ranking/ranking.thrift b/src/thrift/com/twitter/search/common/ranking/ranking.thrift deleted file mode 100644 index bd1cff929..000000000 --- a/src/thrift/com/twitter/search/common/ranking/ranking.thrift +++ /dev/null @@ -1,366 +0,0 @@ -namespace java com.twitter.search.common.ranking.thriftjava -#@namespace scala com.twitter.search.common.ranking.thriftscala -#@namespace strato com.twitter.search.common.ranking -namespace py gen.twitter.search.common.ranking.ranking - -struct ThriftLinearFeatureRankingParams { - // values below this will set the score to the minimal one - 1: optional double min = -1e+100 - // values above this will set the score to the minimal one - 2: optional double max = 1e+100 - 3: optional double weight = 0 -}(persisted='true') - -struct ThriftAgeDecayRankingParams { - // the rate in which the score of older tweets decreases - 1: optional double slope = 0.003 - // the age, in minutes, where the age score of a tweet is half of the latest tweet - 2: optional double halflife = 360.0 - // the minimal age decay score a tweet will have - 3: optional double base = 0.6 -}(persisted='true') - -enum ThriftScoringFunctionType { - LINEAR = 1, - MODEL_BASED = 4, - TENSORFLOW_BASED = 5, - - // deprecated - TOPTWEETS = 2, - EXPERIMENTAL = 3, -} - -// The struct to define a class that is to be dynamically loaded in earlybird for -// experimentation. -struct ThriftExperimentClass { - // the fully qualified class name. - 1: required string name - // data source location (class/jar file) for this dynamic class on HDFS - 2: optional string location - // parameters in key-value pairs for this experimental class - 3: optional map params -}(persisted='true') - -// Deprecated!! -struct ThriftQueryEngagementParams { - // Rate Boosts: given a rate (usually a small fraction), the score will be multiplied by - // (1 + rate) ^ boost - // 0 mean no boost, negative numbers are dampens - 1: optional double retweetRateBoost = 0 - 2: optional double replyRateBoost = 0 - 3: optional double faveRateBoost = 0 -}(persisted='true') - -struct ThriftHostQualityParams { - // Multiplier applied to host score, for tweets that have links. - // A multiplier of 0 means that this boost is not applied - 1: optional double multiplier = 0.0 - - // Do not apply the multiplier to hosts with score above this level. - // If 0, the multiplier will be applied to any host. - 2: optional double maxScoreToModify = 0.0 - - // Do not apply the multiplier to hosts with score below this level. - // If 0, the multiplier will be applied to any host. - 3: optional double minScoreToModify = 0.0 - - // If true, score modification will be applied to hosts that have unknown scores. - // The host-score used will be lower than the score of any known host. - 4: optional bool applyToUnknownHosts = 0 -}(persisted='true') - -struct ThriftCardRankingParams { - 1: optional double hasCardBoost = 1.0 - 2: optional double domainMatchBoost = 1.0 - 3: optional double authorMatchBoost = 1.0 - 4: optional double titleMatchBoost = 1.0 - 5: optional double descriptionMatchBoost = 1.0 -}(persisted='true') - -# The ids are assigned in 'blocks'. For adding a new field, find an unused id in the appropriate -# block. Be sure to mention explicitly which ids have been removed so that they are not used again. -struct ThriftRankingParams { - 1: optional ThriftScoringFunctionType type - - // Dynamically loaded scorer and collector for quick experimentation. - 40: optional ThriftExperimentClass expScorer - 41: optional ThriftExperimentClass expCollector - - // we must set it to a value that fits into a float: otherwise - // some earlybird classes that convert it to float will interpret - // it as Float.NEGATIVE_INFINITY, and some comparisons will fail - 2: optional double minScore = -1e+30 - - 10: optional ThriftLinearFeatureRankingParams parusScoreParams - 11: optional ThriftLinearFeatureRankingParams retweetCountParams - 12: optional ThriftLinearFeatureRankingParams replyCountParams - 15: optional ThriftLinearFeatureRankingParams reputationParams - 16: optional ThriftLinearFeatureRankingParams luceneScoreParams - 18: optional ThriftLinearFeatureRankingParams textScoreParams - 19: optional ThriftLinearFeatureRankingParams urlParams - 20: optional ThriftLinearFeatureRankingParams isReplyParams - 21: optional ThriftLinearFeatureRankingParams directFollowRetweetCountParams - 22: optional ThriftLinearFeatureRankingParams trustedCircleRetweetCountParams - 23: optional ThriftLinearFeatureRankingParams favCountParams - 24: optional ThriftLinearFeatureRankingParams multipleReplyCountParams - 27: optional ThriftLinearFeatureRankingParams embedsImpressionCountParams - 28: optional ThriftLinearFeatureRankingParams embedsUrlCountParams - 29: optional ThriftLinearFeatureRankingParams videoViewCountParams - 66: optional ThriftLinearFeatureRankingParams quotedCountParams - - // A map from MutableFeatureType to linear ranking params - 25: optional map offlineExperimentalFeatureRankingParams - - // if min/max for score or ThriftLinearFeatureRankingParams should always be - // applied or only to non-follows, non-self, non-verified - 26: optional bool applyFiltersAlways = 0 - - // Whether to apply promotion/demotion at all for FeatureBasedScoringFunction - 70: optional bool applyBoosts = 1 - - // UI language is english, tweet language is not - 30: optional double langEnglishUIBoost = 0.3 - // tweet language is english, UI language is not - 31: optional double langEnglishTweetBoost = 0.7 - // user language differs from tweet language, and neither is english - 32: optional double langDefaultBoost = 0.1 - // user that produced tweet is marked as spammer by metastore - 33: optional double spamUserBoost = 1.0 - // user that produced tweet is marked as nsfw by metastore - 34: optional double nsfwUserBoost = 1.0 - // user that produced tweet is marked as bot (self similarity) by metastore - 35: optional double botUserBoost = 1.0 - - // An alternative way of using lucene score in the ranking function. - 38: optional bool useLuceneScoreAsBoost = 0 - 39: optional double maxLuceneScoreBoost = 1.2 - - // Use user's consumed and produced languages for scoring - 42: optional bool useUserLanguageInfo = 0 - - // Boost (demotion) if the tweet language is not one of user's understandable languages, - // nor interface language. - 43: optional double unknownLanguageBoost = 0.01 - - // Use topic ids for scoring. - // Deprecated in SEARCH-8616. - 44: optional bool deprecated_useTopicIDsBoost = 0 - // Parameters for topic id scoring. See TopicIDsBoostScorer (and its test) for details. - 46: optional double deprecated_maxTopicIDsBoost = 3.0 - 47: optional double deprecated_topicIDsBoostExponent = 2.0; - 48: optional double deprecated_topicIDsBoostSlope = 2.0; - - // Hit Attribute Demotion - 60: optional bool enableHitDemotion = 0 - 61: optional double noTextHitDemotion = 1.0 - 62: optional double urlOnlyHitDemotion = 1.0 - 63: optional double nameOnlyHitDemotion = 1.0 - 64: optional double separateTextAndNameHitDemotion = 1.0 - 65: optional double separateTextAndUrlHitDemotion = 1.0 - - // multiplicative score boost for results deemed offensive - 100: optional double offensiveBoost = 1 - // multiplicative score boost for results in the searcher's social circle - 101: optional double inTrustedCircleBoost = 1 - // multiplicative score dampen for results with more than one hash tag - 102: optional double multipleHashtagsOrTrendsBoost = 1 - // multiplicative score boost for results in the searcher's direct follows - 103: optional double inDirectFollowBoost = 1 - // multiplicative score boost for results that has trends - 104: optional double tweetHasTrendBoost = 1 - // is tweet from verified account? - 106: optional double tweetFromVerifiedAccountBoost = 1 - // is tweet authored by the searcher? (boost is in addition to social boost) - 107: optional double selfTweetBoost = 1 - // multiplicative score boost for a tweet that has image url. - 108: optional double tweetHasImageUrlBoost = 1 - // multiplicative score boost for a tweet that has video url. - 109: optional double tweetHasVideoUrlBoost = 1 - // multiplicative score boost for a tweet that has news url. - 110: optional double tweetHasNewsUrlBoost = 1 - // is tweet from a blue-verified account? - 111: optional double tweetFromBlueVerifiedAccountBoost = 1 (personalDataType = 'UserVerifiedFlag') - - // subtractive penalty applied after boosts for out-of-network replies. - 120: optional double outOfNetworkReplyPenalty = 10.0 - - 150: optional ThriftQueryEngagementParams deprecatedQueryEngagementParams - - 160: optional ThriftHostQualityParams deprecatedHostQualityParams - - // age decay params for regular tweets - 203: optional ThriftAgeDecayRankingParams ageDecayParams - - // for card ranking: map between card name ordinal (defined in com.twitter.search.common.constants.CardConstants) - // to ranking params - 400: optional map cardRankingParams - - // A map from tweet IDs to the score adjustment for that tweet. These are score - // adjustments that include one or more features that can depend on the query - // string. These features aren't indexed by Earlybird, and so their total contribution - // to the scoring function is passed in directly as part of the request. If present, - // the score adjustment for a tweet is directly added to the linear component of the - // scoring function. Since this signal can be made up of multiple features, any - // reweighting or combination of these features is assumed to be done by the caller - // (hence there is no need for a weight parameter -- the weights of the features - // included in this signal have already been incorporated by the caller). - 151: optional map querySpecificScoreAdjustments - - // A map from user ID to the score adjustment for tweets from that author. - // This field provides a way for adjusting the tweets of a specific set of users with a score - // that is not present in the Earlybird features but has to be passed from the clients, such as - // real graph weights or a combination of multiple features. - // This field should be used mainly for experimentation since it increases the size of the thrift - // requests. - 154: optional map authorSpecificScoreAdjustments - - // -------- Parameters for ThriftScoringFunctionType.MODEL_BASED -------- - // Selected models along with their weights for the linear combination - 152: optional map selectedModels - 153: optional bool useLogitScore = false - - // -------- Parameters for ThriftScoringFunctionType.TENSORFLOW_BASED -------- - // Selected tensorflow model - 303: optional string selectedTensorflowModel - - // -------- Deprecated Fields -------- - // ID 303 has been used in the past. Resume additional deprecated fields from 304 - 105: optional double deprecatedTweetHasTrendInTrendingQueryBoost = 1 - 200: optional double deprecatedAgeDecaySlope = 0.003 - 201: optional double deprecatedAgeDecayHalflife = 360.0 - 202: optional double deprecatedAgeDecayBase = 0.6 - 204: optional ThriftAgeDecayRankingParams deprecatedAgeDecayForTrendsParams - 301: optional double deprecatedNameQueryConfidence = 0.0 - 302: optional double deprecatedHashtagQueryConfidence = 0.0 - // Whether to use old-style engagement features (normalized by LogNormalizer) - // or new ones (normalized by SingleBytePositiveFloatNormalizer) - 50: optional bool useGranularEngagementFeatures = 0 // DEPRECATED! -}(persisted='true') - -// This sorting mode is used by earlybird to retrieve the top-n facets that -// are returned to blender -enum ThriftFacetEarlybirdSortingMode { - SORT_BY_SIMPLE_COUNT = 0, - SORT_BY_WEIGHTED_COUNT = 1, -} - -// This is the final sort order used by blender after all results from -// the earlybirds are merged -enum ThriftFacetFinalSortOrder { - // using the created_at date of the first tweet that contained the facet - SCORE = 0, - SIMPLE_COUNT = 1, - WEIGHTED_COUNT = 2, - CREATED_AT = 3 -} - -struct ThriftFacetRankingOptions { - // next available field ID = 38 - - // ====================================================================== - // EARLYBIRD SETTINGS - // - // These parameters primarily affect how earlybird creates the top-k - // candidate list to be re-ranked by blender - // ====================================================================== - // Dynamically loaded scorer and collector for quick experimentation. - 26: optional ThriftExperimentClass expScorer - 27: optional ThriftExperimentClass expCollector - - // It should be less than or equal to reputationParams.min, and all - // tweepcreds between the two get a score of 1.0. - 21: optional i32 minTweepcredFilterThreshold - - // the maximum score a single tweet can contribute to the weightedCount - 22: optional i32 maxScorePerTweet - - 15: optional ThriftFacetEarlybirdSortingMode sortingMode - // The number of top candidates earlybird returns to blender - 16: optional i32 numCandidatesFromEarlybird = 100 - - // when to early terminate for facet search, overrides the setting in ThriftSearchQuery - 34: optional i32 maxHitsToProcess = 1000 - - // for anti-gaming we want to limit the maximum amount of hits the same user can - // contribute. Set to -1 to disable the anti-gaming filter. Overrides the setting in - // ThriftSearchQuery - 35: optional i32 maxHitsPerUser = 3 - - // if the tweepcred of the user is bigger than this value it will not be excluded - // by the anti-gaming filter. Overrides the setting in ThriftSearchQuery - 36: optional i32 maxTweepcredForAntiGaming = 65 - - // these settings affect how earlybird computes the weightedCount - 2: optional ThriftLinearFeatureRankingParams parusScoreParams - 3: optional ThriftLinearFeatureRankingParams reputationParams - 17: optional ThriftLinearFeatureRankingParams favoritesParams - 33: optional ThriftLinearFeatureRankingParams repliesParams - 37: optional map rankingExpScoreParams - - // penalty counter settings - 6: optional i32 offensiveTweetPenalty // set to -1 to disable the offensive filter - 7: optional i32 antigamingPenalty // set to -1 to disable antigaming filtering - // weight of penalty counts from all tweets containing a facet, not just the tweets - // matching the query - 9: optional double queryIndependentPenaltyWeight // set to 0 to not use query independent penalty weights - // penalty for keyword stuffing - 60: optional i32 multipleHashtagsOrTrendsPenalty - - // Language related boosts, similar to those in relevance ranking options. By default they are - // all 1.0 (no-boost). - // When the user language is english, facet language is not - 11: optional double langEnglishUIBoost = 1.0 - // When the facet language is english, user language is not - 12: optional double langEnglishFacetBoost = 1.0 - // When the user language differs from facet/tweet language, and neither is english - 13: optional double langDefaultBoost = 1.0 - - // ====================================================================== - // BLENDER SETTINGS - // - // Settings for the facet relevance scoring happening in blender - // ====================================================================== - - // This block of parameters are only used in the FacetsFutureManager. - // limits to discard facets - // if a facet has a higher penalty count, it will not be returned - 5: optional i32 maxPenaltyCount - // if a facet has a lower simple count, it will not be returned - 28: optional i32 minSimpleCount - // if a facet has a lower weighted count, it will not be returned - 8: optional i32 minCount - // the maximum allowed value for offensiveCount/facetCount a facet can have in order to be returned - 10: optional double maxPenaltyCountRatio - // if set to true, then facets with offensive display tweets are excluded from the resultset - 29: optional bool excludePossiblySensitiveFacets - // if set to true, then only facets that have a display tweet in their ThriftFacetCountMetadata object - // will be returned to the caller - 30: optional bool onlyReturnFacetsWithDisplayTweet - - // parameters for scoring force-inserted media items - // Please check FacetReRanker.java computeScoreForInserted() for their usage. - 38: optional double forceInsertedBackgroundExp = 0.3 - 39: optional double forceInsertedMinBackgroundCount = 2 - 40: optional double forceInsertedMultiplier = 0.01 - - // ----------------------------------------------------- - // weights for the facet ranking formula - 18: optional double simpleCountWeight_DEPRECATED - 19: optional double weightedCountWeight_DEPRECATED - 20: optional double backgroundModelBoost_DEPRECATED - - // ----------------------------------------------------- - // Following parameters are used in the FacetsReRanker - // age decay params - 14: optional ThriftAgeDecayRankingParams ageDecayParams - - // used in the facets reranker - 23: optional double maxNormBoost = 5.0 - 24: optional double globalCountExponent = 3.0 - 25: optional double simpleCountExponent = 3.0 - - 31: optional ThriftFacetFinalSortOrder finalSortOrder - - // Run facets search as if they happen at this specific time (ms since epoch). - 32: optional i64 fakeCurrentTimeMs // not really used anywhere, remove? -}(persisted='true') diff --git a/src/thrift/com/twitter/search/earlybird/thrift/earlybird.docx b/src/thrift/com/twitter/search/earlybird/thrift/earlybird.docx new file mode 100644 index 000000000..56c191ae1 Binary files /dev/null and b/src/thrift/com/twitter/search/earlybird/thrift/earlybird.docx differ diff --git a/src/thrift/com/twitter/search/earlybird/thrift/earlybird.thrift b/src/thrift/com/twitter/search/earlybird/thrift/earlybird.thrift deleted file mode 100644 index 0d4547264..000000000 --- a/src/thrift/com/twitter/search/earlybird/thrift/earlybird.thrift +++ /dev/null @@ -1,1416 +0,0 @@ -namespace java com.twitter.search.earlybird.thrift -#@namespace scala com.twitter.search.earlybird.thriftscala -#@namespace strato com.twitter.search.earlybird -namespace py gen.twitter.search.earlybird - -include "com/twitter/ads/adserver/adserver_common.thrift" -include "com/twitter/search/common/caching/caching.thrift" -include "com/twitter/search/common/constants/query.thrift" -include "com/twitter/search/common/constants/search_language.thrift" -include "com/twitter/search/common/conversation/conversation.thrift" -include "com/twitter/search/common/features/features.thrift" -include "com/twitter/search/common/indexing/status.thrift" -include "com/twitter/search/common/query/search.thrift" -include "com/twitter/search/common/ranking/ranking.thrift" -include "com/twitter/search/common/results/expansions.thrift" -include "com/twitter/search/common/results/highlight.thrift" -include "com/twitter/search/common/results/hit_attribution.thrift" -include "com/twitter/search/common/results/hits.thrift" -include "com/twitter/search/common/results/social.thrift" -include "com/twitter/service/spiderduck/gen/metadata_store.thrift" -include "com/twitter/tweetypie/deprecated.thrift" -include "com/twitter/tweetypie/tweet.thrift" -include "com/twitter/escherbird/tweet_annotation.thrift" - -enum ThriftSearchRankingMode { - // good old realtime search mode - RECENCY = 0, - // new super fancy relevance ranking - RELEVANCE = 1, - DEPRECATED_DISCOVERY = 2, - // top tweets ranking mode - TOPTWEETS = 3, - // results from accounts followed by the searcher - FOLLOWS = 4, - - PLACE_HOLDER5 = 5, - PLACE_HOLDER6 = 6, -} - -enum ThriftSearchResultType { - // it's a time-ordered result. - RECENCY = 0, - // it's a highly relevant tweet (aka top tweet). - RELEVANCE = 1, - // top tweet result type - POPULAR = 2, - // promoted tweets (ads) - PROMOTED = 3, - // relevance-ordered (as opposed to time-ordered) tweets generated from a variety of candidates - RELEVANCE_ORDERED = 4, - - PLACE_HOLDER5 = 5, - PLACE_HOLDER6 = 6, -} - -enum ThriftSocialFilterType { - // filter only users that the searcher is directly following. - FOLLOWS = 0, - // filter only users that are in searcher's social circle of trust. - TRUSTED = 1, - // filter both follows and trusted. - ALL = 2, - - PLACE_HOLDER3 = 3, - PLACE_HOLDER4 = 4, - -} - -enum ThriftTweetSource { - ///// enums set by Earlybird - REALTIME_CLUSTER = 1, - FULL_ARCHIVE_CLUSTER = 2, - REALTIME_PROTECTED_CLUSTER = 4, - - ///// enums set inside Blender - ADSERVER = 0, - // from top news search, only used in universal search - TOP_NEWS = 3, - // special tweets included just for EventParrot. - FORCE_INCLUDED = 5, - // from Content Recommender - // from topic to Tweet path - CONTENT_RECS_TOPIC_TO_TWEET = 6, - // used for hydrating QIG Tweets (go/qig) - QIG = 8, - // used for TOPTWEETS ranking mode - TOP_TWEET = 9, - // used for experimental candidate sources - EXPERIMENTAL = 7, - // from Scanr service - SCANR = 10, - - PLACE_HOLDER11 = 11, - PLACE_HOLDER12 = 12 -} - -enum NamedEntitySource { - TEXT = 0, - URL = 1, - - PLACE_HOLDER2 = 2, - PLACE_HOLDER3 = 3, - PLACE_HOLDER4 = 4, -} - -enum ExperimentCluster { - EXP0 = 0, // Send requests to the earlybird-realtime-exp0 cluster - PLACE_HOLDER1 = 1, - PLACE_HOLDER2 = 2, -} - -enum AudioSpaceState { - RUNNING = 0, - ENDED = 1, - - PLACE_HOLDER2 = 2, - PLACE_HOLDER3 = 3, - PLACE_HOLDER4 = 4, - PLACE_HOLDER5 = 5, -} - -// Contains all scoring and relevance-filtering related controls and options for Earlybird. -struct ThriftSearchRelevanceOptions { - // Next available field ID: 31 and note that 45 and 50 have been used already - - 2: optional bool filterDups = 0 // filter out duplicate search results - 26: optional bool keepDupWithHigherScore = 1 // keep the duplicate tweet with the higher score - - 3: optional bool proximityScoring = 0 // whether to do proximity scoring or not - 4: optional i32 maxConsecutiveSameUser // filter consecutive results from the same user - 5: optional ranking.ThriftRankingParams rankingParams // composed by blender - // deprecated in favor of the maxHitsToProcess in CollectorParams - 6: optional i32 maxHitsToProcess // when to early-terminate for relevance - 7: optional string experimentName // what relevance experiment is running - 8: optional string experimentBucket // what bucket the user is in; DDG defaults to hard-coded 'control' - 9: optional bool interpretSinceId = 1 // whether to interpret since_id operator - - 24: optional i32 maxHitsPerUser // Overrides ThriftSearchQuery.maxHitsPerUser - - // only used by discovery for capping direct follow tweets - 10: optional i32 maxConsecutiveDirectFollows - - // Note - the orderByRelevance flag is critical to understanding how merging - // and trimming works in relevance mode in the search root. - // - // When orderByRelevance is true, results are trimmed in score-order. This means the - // client will get the top results from (maxHitsToProcess * numHashPartitions) hits, - // ordered by score. - // - // When orderByRelevance is false, results are trimmed in id-order. This means the - // client will get the top results from an approximation of maxHitsToProcess hits - // (across the entire corpus). These results ordered by ID. - 14: optional bool orderByRelevance = 0 - - // Max blending count for results returned due to from:user rewrites - 16: optional i32 maxUserBlendCount - - // The weight for proximity phrases generated while translating the serialized query to the - // lucene query. - 19: optional double proximityPhraseWeight = 1.0 - 20: optional i32 proximityPhraseSlop = 255 - - // Override the weights of searchable fields. - // Negative weight means the the field is not enabled for search by default, - // but if it is (e.g., by annotation), the absolute value of the weight shall be - // used (if the annotation does not specify a weight). - 21: optional map fieldWeightMapOverride - - // whether disable the coordination in the rewritten disjunction query, term query and phrase query - // the details can be found in LuceneVisitor - 22: optional bool deprecated_disableCoord = 0 - - // Root only. Returns all results seen by root to the client without trimming - // if set to true. - 23: optional bool returnAllResults - - // DEPRECATED: All v2 counters will be used explicitly in the scoring function and - // returned in their own field (in either metadata or feature map in response). - 25: optional bool useEngagementCountersV2 = 0 - - // -------- PERSONALIZATION-RELATED RELEVANCE OPTIONS -------- - // Take special care with these options when reasoning about caching. - - // Deprecated in SEARCH-8616. - 45: optional map deprecated_topicIDWeights - - // Collect hit attribution on queries and likedByUserIDFilter64-enhanced queries to - // get likedByUserIds list in metadata field. - // NOTE: this flag has no affect on fromUserIDFilter64. - 50: optional bool collectFieldHitAttributions = 0 - - // Whether to collect all hits regardless of their score with RelevanceAllCollector. - 27: optional bool useRelevanceAllCollector = 0 - - // Override features of specific tweets before the tweets are scored. - 28: optional map perTweetFeaturesOverride - - // Override features of all tweets from specific users before the tweets are scored. - 29: optional map perUserFeaturesOverride - - // Override features of all tweets before the tweets are scored. - 30: optional features.ThriftSearchResultFeatures globalFeaturesOverride -}(persisted='true') - -// Facets types that may have different ranking parameters. -enum ThriftFacetType { - DEFAULT = 0, - MENTIONS_FACET = 1, - HASHTAGS_FACET = 2, - // Deprecated in SEARCH-13708 - DEPRECATED_NAMED_ENTITIES_FACET = 3, - STOCKS_FACET = 4, - VIDEOS_FACET = 5, - IMAGES_FACET = 6, - NEWS_FACET = 7, - LANGUAGES_FACET = 8, - SOURCES_FACET = 9, - TWIMG_FACET = 10, - FROM_USER_ID_FACET = 11, - DEPRECATED_TOPIC_IDS_FACET = 12, - RETWEETS_FACET = 13, - LINKS_FACET = 14, - - PLACE_HOLDER15 = 15, - PLACE_HOLDER16 = 16, -} - -struct ThriftSearchDebugOptions { - // Make earlybird only score and return tweets (specified by tweet id) here, regardless - // if they have a hit for the current query or not. - 1: optional set statusIds; - - // Assorted structures to pass in debug options. - 2: optional map stringMap; - 3: optional map valueMap; - 4: optional list valueList; -}(persisted='true') - -// These options control what metadata will be returned by earlybird for each search result -// in the ThriftSearchResultMetadata struct. These options are currently mostly supported by -// AbstractRelevanceCollector and partially in SearchResultsCollector. Most are true by default to -// preserve backwards compatibility, but can be disabled as necessary to optimize searches returning -// many results (such as discover). -struct ThriftSearchResultMetadataOptions { - // If true, fills in the tweetUrls field in ThriftSearchResultMetadata. - // Populated by AbstractRelevanceCollector. - 1: optional bool getTweetUrls = 1 - - // If true, fills in the resultLocation field in ThriftSearchResultMetadata. - // Populated by AbstractRelevanceCollector. - 2: optional bool getResultLocation = 1 - - // Deprecated in SEARCH-8616. - 3: optional bool deprecated_getTopicIDs = 1 - - // If true, fills in the luceneScore field in ThriftSearchResultMetadata. - // Populated by LinearScoringFunction. - 4: optional bool getLuceneScore = 0 - - // Deprecated but used to be for Offline feature values for static index - 5: optional bool deprecated_getExpFeatureValues = 0 - - // If true, will omit all features derivable from packedFeatures, and set packedFeatures - // instead. - 6: optional bool deprecated_usePackedFeatures = 0 - - // If true, fills sharedStatusId. For replies this is the in-reply-to status id and for - // retweets this is the retweet source status id. - // Also fills in the the isRetweet and isReply flags. - 7: optional bool getInReplyToStatusId = 0 - - // If true, fills referencedTweetAuthorId. Also fills in the the isRetweet and isReply flags. - 8: optional bool getReferencedTweetAuthorId = 0 - - // If true, fills media bits (video/vine/periscope/etc.) - 9: optional bool getMediaBits = 0 - - // If true, will return all defined features in the packed features. This flag does not cover - // the above defined features. - 10: optional bool getAllFeatures = 0 - - // If true, will return all features as ThriftSearchResultFeatures format. - 11: optional bool returnSearchResultFeatures = 0 - - // If the client caches some features schemas, client can indicate its cache schemas through - // this field based on (version, checksum). - 12: optional list featureSchemasAvailableInClient - - // Specific feature IDs to return for recency requests. Populated in SearchResultFeatures. - // Values must be IDs of CSF fields from EarlybirdFieldConstants. - 13: optional list requestedFeatureIDs - - // If true, fills in the namedEntities field in ThriftSearchResultExtraMetadata - 14: optional bool getNamedEntities = 0 - - // If true, fills in the entityAnnotations field in ThriftSearchResultExtraMetadata - 15: optional bool getEntityAnnotations = 0 - - // If true, fills in the fromUserId field in the ThriftSearchResultExtraMetadata - 16: optional bool getFromUserId = 0 - - // If true, fills in the spaces field in the ThriftSearchResultExtraMetadata - 17: optional bool getSpaces = 0 - - 18: optional bool getExclusiveConversationAuthorId = 0 -}(persisted='true') - - -// ThriftSearchQuery describes an earlybird search request, which typically consists -// of these parts: -// - a query to retrieve hits -// - relevance options to score hits -// - a collector to collect hits and process into search results -// Note that this struct is used in both ThriftBlenderRequest and EarlybirdRequest. -// Most fields are not set when this struct is embedded in ThriftBlenderRequest, and -// are filled in by the blender before sending to earlybird. -struct ThriftSearchQuery { - // Next available field ID: 42 - - // -------- SECTION ZERO: THINGS USED ONLY BY THE BLENDER -------- - // See SEARCHQUAL-2398 - // These fields are used by the blender and clients of the blender, but not by earlybird. - - // blender use only - // The raw un-parsed user search query. - 6: optional string rawQuery(personalDataType = 'SearchQuery') - - // blender use only - // Language of the rawQuery. - 18: optional string queryLang(personalDataType = 'InferredLanguage') - - // blender use only - // What page of results to return, indexed from 1. - 7: optional i32 page = 1 - - // blender use only - // Number of results to skip (for pagination). Indexed from 0. - 2: optional i32 deprecated_resultOffset = 0 - - - // -------- SECTION ONE: RETRIEVAL OPTIONS -------- - // These options control the query that will be used to retrieve documents / hits. - - // The parsed query tree, serialized to a string. Restricts the search results to - // tweets matching this query. - 1: optional string serializedQuery(personalDataType = 'SearchQuery') - - // Restricts the search results to tweets having this minimum tweep cred, out of 100. - 5: optional i32 minTweepCredFilter = -1 - - // Restricts the search results to tweets from these users. - 34: optional list fromUserIDFilter64(personalDataType = 'PrivateAccountsFollowing, PublicAccountsFollowing') - // Restricts the search results to tweets liked by these users. - 40: optional list likedByUserIDFilter64(personalDataType = 'PrivateAccountsFollowing, PublicAccountsFollowing') - - // If searchStatusIds are present, earlybird will ignore the serializedQuery completely - // and simply score each of searchStatusIds, also bypassing features like duplicate - // filtering and early termination. - // IMPORTANT: this means that it is possible to get scores equal to ScoringFunction.SKIP_HIT, - // for results skipped by the scoring function. - 31: optional set searchStatusIds - - 35: optional set deprecated_eventClusterIdsFilter - - 41: optional map> namedDisjunctionMap - - // -------- SECTION TWO: HIT COLLECTOR OPTIONS -------- - // These options control what hits will be collected by the hit collector. - // Whether we want to collect and return per-field hit attributions is set in RelevanceOptions. - // See SEARCH-2784 - // Number of results to return (after offset/page correction). - // This is ignored when searchStatusIds is set. - 3: required i32 numResults - - // Maximum number of hits to process by the collector. - // deprecated in favor of the maxHitsToProcess in CollectorParams - 4: optional i32 maxHitsToProcess = 1000 - - // Collect hit counts for these time periods (in milliseconds). - 30: optional list hitCountBuckets - - // If set, earlybird will also return the facet labels of the specified facet fields - // in result tweets. - 33: optional list facetFieldNames - - // Options controlling which search result metadata is returned. - 36: optional ThriftSearchResultMetadataOptions resultMetadataOptions - - // Collection related Params - 38: optional search.CollectorParams collectorParams - - // Whether to collect conversation IDs - 39: optional bool collectConversationId = 0 - - // -------- SECTION THREE: RELEVANCE OPTIONS -------- - // These options control relevance scoring and anti-gaming. - - // Ranking mode (RECENCY means time-ordered ranking with no relevance). - 8: optional ThriftSearchRankingMode rankingMode = ThriftSearchRankingMode.RECENCY - - // Relevance scoring options. - 9: optional ThriftSearchRelevanceOptions relevanceOptions - - // Limits the number of hits that can be contributed by the same user, for anti-gaming. - // Set to -1 to disable the anti-gaming filter. This is ignored when searchStatusIds - // is set. - 11: optional i32 maxHitsPerUser = 3 - - // Disables anti-gaming filter checks for any tweets that exceed this tweepcred. - 12: optional i32 maxTweepcredForAntiGaming = 65 - - // -------- PERSONALIZATION-RELATED RELEVANCE OPTIONS -------- - // Take special care with these options when reasoning about caching. All of these - // options, if set, will bypass the cache with the exception of uiLang which is the - // only form of personalization allowed for caching. - - // User ID of searcher. This is used for relevance, and will be used for retrieval - // by the protected tweets index. If set, query will not be cached. - 20: optional i64 searcherId(personalDataType = 'UserId') - - // Bloom filter containing trusted user IDs. If set, query will not be cached. - 10: optional binary trustedFilter(personalDataType = 'UserId') - - // Bloom filter containing direct follow user IDs. If set, query will not be cached. - 16: optional binary directFollowFilter(personalDataType = 'UserId, PrivateAccountsFollowing, PublicAccountsFollowing') - - // UI language from the searcher's profile settings. - 14: optional string uiLang(personalDataType = 'GeneralSettings') - - // Confidence of the understandability of different languages for this user. - // uiLang field above is treated as a userlang with a confidence of 1.0. - 28: optional map userLangs(personalDataTypeKey = 'InferredLanguage') - - // An alternative to fromUserIDFilter64 that relies on the relevance bloom filters - // for user filtering. Not currently used in production. Only supported for realtime - // searches. - // If set, earlybird expects both trustedFilter and directFollowFilter to also be set. - 17: optional ThriftSocialFilterType socialFilterType - - // -------- SECTION FOUR: DEBUG OPTIONS, FORGOTTEN FEATURES -------- - - // Earlybird search debug options. - 19: optional ThriftSearchDebugOptions debugOptions - - // Overrides the query time for debugging. - 29: optional i64 timestampMsecs = 0 - - // Support for this feature has been removed and this field is left for backwards compatibility - // (and to detect improper usage by clients when it is set). - 25: optional list deprecated_iterativeQueries - - // Specifies a lucene query that will only be used if serializedQuery is not set, - // for debugging. Not currently used in production. - 27: optional string luceneQuery(personalDataType = 'SearchQuery') - - // This field is deprecated and is not used by earlybirds when processing the query. - 21: optional i32 deprecated_minDocsToProcess = 0 -}(persisted='true', hasPersonalData = 'true') - - -struct ThriftFacetLabel { - 1: required string fieldName - 2: required string label - // the number of times this facet has shown up in tweets with offensive words. - 3: optional i32 offensiveCount = 0 - - // only filled for TWIMG facets - 4: optional string nativePhotoUrl -}(persisted='true') - -struct ThriftSearchResultGeoLocation { - 1: optional double latitude(personalDataType = 'GpsCoordinates') - 2: optional double longitude(personalDataType = 'GpsCoordinates') - 3: optional double distanceKm -}(persisted='true', hasPersonalData = 'true') - -// Contains an expanded url and media type from the URL facet fields in earlybird. -// Note: thrift copied from status.thrift with unused fields renamed. -struct ThriftSearchResultUrl { - // Next available field ID: 6. Fields 2-4 removed. - - // Note: this is actually the expanded url. Rename after deprecated fields are removed. - 1: required string originalUrl - - // Media type of the url. - 5: optional metadata_store.MediaTypes mediaType -}(persisted='true') - -struct ThriftSearchResultNamedEntity { - 1: required string canonicalName - 2: required string entityType - 3: required NamedEntitySource source -}(persisted='true') - -struct ThriftSearchResultAudioSpace { - 1: required string id - 2: required AudioSpaceState state -}(persisted='true') - -// Even more metadata -struct ThriftSearchResultExtraMetadata { - // Next available field ID: 49 - - 1: optional double userLangScore - 2: optional bool hasDifferentLang - 3: optional bool hasEnglishTweetAndDifferentUILang - 4: optional bool hasEnglishUIAndDifferentTweetLang - 5: optional i32 quotedCount - 6: optional double querySpecificScore - 7: optional bool hasQuote - 29: optional i64 quotedTweetId - 30: optional i64 quotedUserId - 31: optional search_language.ThriftLanguage cardLang - 8: optional i64 conversationId - 9: optional bool isSensitiveContent - 10: optional bool hasMultipleMediaFlag - 11: optional bool profileIsEggFlag - 12: optional bool isUserNewFlag - 26: optional double authorSpecificScore - 28: optional bool isComposerSourceCamera - - // temporary V2 engagement counters, original ones in ThriftSearchResultMetadata has log() - // applied on them and then converted to int in Thrift, which is effectively a premature - // discretization. It doesn't affect the scoring inside Earlybird but for scoring and ML training - // outside earlybird, they were bad. These newly added ones stores a proper value of these - // counts. This also provides an easier transition to v2 counter when Earlybird is eventually - // ready to consume them from DL - // See SEARCHQUAL-9536, SEARCH-11181 - 18: optional i32 retweetCountV2 - 19: optional i32 favCountV2 - 20: optional i32 replyCountV2 - // Tweepcred weighted version of various engagement counts - 22: optional i32 weightedRetweetCount - 23: optional i32 weightedReplyCount - 24: optional i32 weightedFavCount - 25: optional i32 weightedQuoteCount - - // 2 bits - 0, 1, 2, 3+ - 13: optional i32 numMentions - 14: optional i32 numHashtags - - // 1 byte - 256 possible languages - 15: optional i32 linkLanguage - // 6 bits - 64 possible values - 16: optional i32 prevUserTweetEngagement - - 17: optional features.ThriftSearchResultFeatures features - - // If the ThriftSearchQuery.likedByUserIdFilter64 and ThriftSearchRelevanceOptions.collectFieldHitAttributions - // fields are set, then this field will contain the list of all users in the query that liked this tweet. - // Otherwise, this field is not set. - 27: optional list likedByUserIds - - - // Deprecated. See SEARCHQUAL-10321 - 21: optional double dopamineNonPersonalizedScore - - 32: optional list namedEntities - 33: optional list entityAnnotations - - // Health model scores from HML - 34: optional double toxicityScore // (go/toxicity) - 35: optional double pBlockScore // (go/pblock) - 36: optional double experimentalHealthModelScore1 - 37: optional double experimentalHealthModelScore2 - 38: optional double experimentalHealthModelScore3 - 39: optional double experimentalHealthModelScore4 - - 40: optional i64 directedAtUserId - - // Health model scores from HML (cont.) - 41: optional double pSpammyTweetScore // (go/pspammytweet) - 42: optional double pReportedTweetScore // (go/preportedtweet) - 43: optional double spammyTweetContentScore // (go/spammy-tweet-content) - // it is populated by looking up user table and it is only available in archive earlybirds response - 44: optional bool isUserProtected - 45: optional list spaces - - 46: optional i64 exclusiveConversationAuthorId - 47: optional string cardUri - 48: optional bool fromBlueVerifiedAccount(personalDataType = 'UserVerifiedFlag') -}(persisted='true') - -// Some basic metadata about a search result. Useful for re-sorting, filtering, etc. -// -// NOTE: DO NOT ADD NEW FIELD!! -// Stop adding new fields to this struct, all new fields should go to -// ThriftSearchResultExtraMetadata (VM-1897), or there will be performance issues in production. -struct ThriftSearchResultMetadata { - // Next available field ID: 86 - - // -------- BASIC SCORING METADATA -------- - - // When resultType is RECENCY most scoring metadata will not be available. - 1: required ThriftSearchResultType resultType - - // Relevance score computed for this result. - 3: optional double score - - // True if the result was skipped by the scoring function. Only set when the collect-all - // results collector was used - in other cases skipped results are not returned. - // The score will be ScoringFunction.SKIP_HIT when skipped is true. - 43: optional bool skipped - - // optionally a Lucene-style explanation for this result - 5: optional string explanation - - - // -------- NETWORK-BASED SCORING METADATA -------- - - // Found the tweet in the trusted circle. - 6: optional bool isTrusted - - // Found the tweet in the direct follows. - 8: optional bool isFollow - - // True if the fromUserId of this tweet was whitelisted by the dup / antigaming filter. - // This typically indicates the result was from a tweet that matched a fromUserId query. - 9: optional bool dontFilterUser - - - // -------- COMMON DOCUMENT METADATA -------- - - // User ID of the author. When isRetweet is true, this is the user ID of the retweeter - // and NOT that of the original tweet. - 7: optional i64 fromUserId = 0 - - // When isRetweet (or packed features equivalent) is true, this is the status id of the - // original tweet. When isReply and getReplySource are true, this is the status id of the - // original tweet. In all other circumstances this is 0. - 40: optional i64 sharedStatusId = 0 - - // When hasCard (or packed features equivalent) is true, this is one of SearchCardType. - 49: optional i8 cardType = 0 - - // -------- EXTENDED DOCUMENT METADATA -------- - // This is additional metadata from facet fields and column stride fields. - // Return of these fields is controlled by ThriftSearchResultMetadataOptions to - // allow for fine-grained control over when these fields are returned, as an - // optimization for searches returning a large quantity of results. - - // Lucene component of the relevance score. Only returned when - // ThriftSearchResultMetadataOptions.getLuceneScore is true. - 31: optional double luceneScore = 0.0 - - // Urls found in the tweet. Only returned when - // ThriftSearchResultMetadataOptions.getTweetUrls is true. - 18: optional list tweetUrls - - // Deprecated in SEARCH-8616. - 36: optional list deprecated_topicIDs - - // Facets available in this tweet, this will only be filled if - // ThriftSearchQuery.facetFieldNames is set in the request. - 22: optional list facetLabels - - // The location of the result, and the distance to it from the center of the query - // location. Only returned when ThriftSearchResultMetadataOptions.getResultLocation is true. - 35: optional ThriftSearchResultGeoLocation resultLocation - - // Per field hit attribution. - 55: optional hit_attribution.FieldHitAttribution fieldHitAttribution - - // whether this has geolocation_type:geotag hit - 57: optional bool geotagHit = 0 - - // the user id of the author of the source/referenced tweet (the tweet one replied - // to, retweeted and possibly quoted, etc.) (SEARCH-8561) - // Only returned when ThriftSearchResultMetadataOptions.getReferencedTweetAuthorId is true. - 60: optional i64 referencedTweetAuthorId = 0 - - // Whether this tweet has certain types of media. - // Only returned when ThriftSearchResultMetadataOptions.getMediaBits is true. - // "Native video" is either consumer, pro, vine, or periscope. - // "Native image" is an image hosted on pic.twitter.com. - 62: optional bool hasConsumerVideo - 63: optional bool hasProVideo - 64: optional bool hasVine - 65: optional bool hasPeriscope - 66: optional bool hasNativeVideo - 67: optional bool hasNativeImage - - // Packed features for this result. This field is never populated. - 50: optional status.PackedFeatures deprecated_packedFeatures - - // The features stored in earlybird - - // From integer 0 from EarlybirdFeatureConfiguration: - 16: optional bool isRetweet - 71: optional bool isSelfTweet - 10: optional bool isOffensive - 11: optional bool hasLink - 12: optional bool hasTrend - 13: optional bool isReply - 14: optional bool hasMultipleHashtagsOrTrends - 23: optional bool fromVerifiedAccount - // Static text quality score. This is actually an int between 0 and 100. - 30: optional double textScore - 51: optional search_language.ThriftLanguage language - - // From integer 1 from EarlybirdFeatureConfiguration: - 52: optional bool hasImage - 53: optional bool hasVideo - 28: optional bool hasNews - 48: optional bool hasCard - 61: optional bool hasVisibleLink - // Tweep cred aka user rep. This is actually an int between 0 and 100. - 32: optional double userRep - 24: optional bool isUserSpam - 25: optional bool isUserNSFW - 26: optional bool isUserBot - 54: optional bool isUserAntiSocial - - // From integer 2 from EarlybirdFeatureConfiguration: - - // Retweet, fav, reply, embeds counts, and video view counts are APPROXIMATE ONLY. - // Note that retweetCount, favCount and replyCount are not original unnormalized values, - // but after a log2() function for historical reason, this loses us some granularity. - // For more accurate counts, use {retweet, fav, reply}CountV2 in extraMetadata. - 2: optional i32 retweetCount - 33: optional i32 favCount - 34: optional i32 replyCount - 58: optional i32 embedsImpressionCount - 59: optional i32 embedsUrlCount - 68: optional i32 videoViewCount - - // Parus score. This is actually an int between 0 and 100. - 29: optional double parusScore - - // Extra feature data, all new feature fields you want to return from Earlybird should go into - // this one, the outer one is always reaching its limit of the number of fields JVM can - // comfortably support!! - 86: optional ThriftSearchResultExtraMetadata extraMetadata - - // Integer 3 is omitted, see expFeatureValues above for more details. - - // From integer 4 from EarlybirdFeatureConfiguration: - // Signature, for duplicate detection and removal. - 4: optional i32 signature - - // -------- THINGS USED ONLY BY THE BLENDER -------- - - // Social proof of the tweet, for network discovery. - // Do not use these fields outside of network discovery. - 41: optional list retweetedUserIDs64 - 42: optional list replyUserIDs64 - - // Social connection between the search user and this result. - 19: optional social.ThriftSocialContext socialContext - - // used by RelevanceTimelineSearchWorkflow, whether a tweet should be highlighted or not - 46: optional bool highlightResult - - // used by RelevanceTimelineSearchWorkflow, the highlight context of the highlighted tweet - 47: optional highlight.ThriftHighlightContext highlightContext - - // the penguin version used to tokenize the tweets by the serving earlybird index as defined - // in com.twitter.common.text.version.PenguinVersion - 56: optional i8 penguinVersion - - 69: optional bool isNullcast - - // This is the normalized ratio(0.00 to 1.00) of nth token(starting before 140) divided by - // numTokens and then normalized into 16 positions(4 bits) but on a scale of 0 to 100% as - // we unnormalize it for you - 70: optional double tokenAt140DividedByNumTokensBucket - -}(persisted='true') - -// Query level result stats. -// Next id: 20 -struct ThriftSearchResultsRelevanceStats { - 1: optional i32 numScored = 0 - // Skipped documents count, they were also scored but their scores got ignored (skipped), note that this is different - // from numResultsSkipped in the ThriftSearchResults. - 2: optional i32 numSkipped = 0 - 3: optional i32 numSkippedForAntiGaming = 0 - 4: optional i32 numSkippedForLowReputation = 0 - 5: optional i32 numSkippedForLowTextScore = 0 - 6: optional i32 numSkippedForSocialFilter = 0 - 7: optional i32 numSkippedForLowFinalScore = 0 - 8: optional i32 oldestScoredTweetAgeInSeconds = 0 - - // More counters for various features. - 9: optional i32 numFromDirectFollows = 0 - 10: optional i32 numFromTrustedCircle = 0 - 11: optional i32 numReplies = 0 - 12: optional i32 numRepliesTrusted = 0 - 13: optional i32 numRepliesOutOfNetwork = 0 - 14: optional i32 numSelfTweets = 0 - 15: optional i32 numWithMedia = 0 - 16: optional i32 numWithNews = 0 - 17: optional i32 numSpamUser = 0 - 18: optional i32 numOffensive = 0 - 19: optional i32 numBot = 0 -}(persisted='true') - -// Per result debug info. -struct ThriftSearchResultDebugInfo { - 1: optional string hostname - 2: optional string clusterName - 3: optional i32 partitionId - 4: optional string tiername -}(persisted='true') - -struct ThriftSearchResult { - // Next available field ID: 22 - - // Result status id. - 1: required i64 id - - // TweetyPie status of the search result - 7: optional deprecated.Status tweetypieStatus - 19: optional tweet.Tweet tweetypieTweet // v2 struct - - // If the search result is a retweet, this field contains the source TweetyPie status. - 10: optional deprecated.Status sourceTweetypieStatus - 20: optional tweet.Tweet sourceTweetypieTweet // v2 struct - - // If the search result is a quote tweet, this field contains the quoted TweetyPie status. - 17: optional deprecated.Status quotedTweetypieStatus - 21: optional tweet.Tweet quotedTweetypieTweet // v2 struct - - // Additional metadata about a search result. - 5: optional ThriftSearchResultMetadata metadata - - // Hit highlights for various parts of this tweet - // for tweet text - 6: optional list hitHighlights - // for the title and description in the card expando. - 12: optional list cardTitleHitHighlights - 13: optional list cardDescriptionHitHighlights - - // Expansion types, if expandResult == False, the expansions set should be ignored. - 8: optional bool expandResult = 0 - 9: optional set expansions - - // Only set if this is a promoted tweet - 11: optional adserver_common.AdImpression adImpression - - // where this tweet is from - // Since ThriftSearchResult used not only as an Earlybird response, but also an internal - // data transfer object of Blender, the value of this field is mutable in Blender, not - // necessarily reflecting Earlybird response. - 14: optional ThriftTweetSource tweetSource - - // the features of a tweet used for relevance timeline - // this field is populated by blender in RelevanceTimelineSearchWorkflow - 15: optional features.ThriftTweetFeatures tweetFeatures - - // the conversation context of a tweet - 16: optional conversation.ThriftConversationContext conversationContext - - // per-result debugging info that's persisted across merges. - 18: optional ThriftSearchResultDebugInfo debugInfo -}(persisted='true') - -enum ThriftFacetRankingMode { - COUNT = 0, - FILTER_WITH_TERM_STATISTICS = 1, -} - -struct ThriftFacetFieldRequest { - // next available field ID: 4 - 1: required string fieldName - 2: optional i32 numResults = 5 - - // use facetRankingOptions in ThriftFacetRequest instead - 3: optional ThriftFacetRankingMode rankingMode = ThriftFacetRankingMode.COUNT -}(persisted='true') - -struct ThriftFacetRequest { - // Next available field ID: 7 - 1: optional list facetFields - 5: optional ranking.ThriftFacetRankingOptions facetRankingOptions - 6: optional bool usingQueryCache = 0 -}(persisted='true') - -struct ThriftTermRequest { - 1: optional string fieldName = "text" - 2: required string term -}(persisted='true') - -enum ThriftHistogramGranularityType { - MINUTES = 0 - HOURS = 1, - DAYS = 2, - CUSTOM = 3, - - PLACE_HOLDER4 = 4, - PLACE_HOLDER5 = 5, -} - -struct ThriftHistogramSettings { - 1: required ThriftHistogramGranularityType granularity - 2: optional i32 numBins = 60 - 3: optional i32 samplingRate = 1 - 4: optional i32 binSizeInSeconds // the bin size, only used if granularity is set to CUSTOM. -}(persisted='true') - -// next id is 4 -struct ThriftTermStatisticsRequest { - 1: optional list termRequests - 2: optional ThriftHistogramSettings histogramSettings - // If this is set to true, even if there is no termRequests above, so long as the histogramSettings - // is set, Earlybird will return a null->ThriftTermResults entry in the termResults map, containing - // the global tweet count histogram for current query, which is the number of tweets matching this - // query in different minutes/hours/days. - 3: optional bool includeGlobalCounts = 0 - // When this is set, the background facets call does another search in order to find the best - // representative tweet for a given term request, the representative tweet is stored in the - // metadata of the termstats result - 4: optional bool scoreTweetsForRepresentatives = 0 -}(persisted='true') - -// Next id is 12 -struct ThriftFacetCountMetadata { - // this is the id of the first tweet in the index that contained this facet - 1: optional i64 statusId = -1 - - // whether the tweet with the above statusId is NSFW, from an antisocial user, - // marked as sensitive content, etc. - 10: optional bool statusPossiblySensitive - - // the id of the user who sent the tweet above - only returned if - // statusId is returned too - // NOTE: for native photos we may not be able to determine the user, - // even though the statusId can be returned. This is because the statusId - // can be determined from the url, but the user can't and the tweet may - // not be in the index anymore. In this case statusId would be set but - // twitterUserId would not. - 2: optional i64 twitterUserId = -1 - - // the language of the tweet above. - 8: optional search_language.ThriftLanguage statusLanguage - - // optionally whitelist the fromUserId from dup/twitterUserId filtering - 3: optional bool dontFilterUser = 0; - - // if this facet is a native photo we return for convenience the - // twimg url - 4: optional string nativePhotoUrl - - // optionally returns some debug information about this facet - 5: optional string explanation - - // the created_at value for the tweet from statusId - only returned - // if statusId is returned too - 6: optional i64 created_at - - // the maximum tweepcred of the hits that contained this facet - 7: optional i32 maxTweepCred - - // Whether this facet result is force inserted, instead of organically returned from search. - // This field is only used in Blender to mark the force-inserted facet results - // (from recent tweets, etc). - 11: optional bool forceInserted = 0 -}(persisted='true') - -struct ThriftTermResults { - 1: required i32 totalCount - 2: optional list histogramBins - 3: optional ThriftFacetCountMetadata metadata -}(persisted='true') - -struct ThriftTermStatisticsResults { - 1: required map termResults - 2: optional ThriftHistogramSettings histogramSettings - // If histogramSettings are set, this will have a list of ThriftHistogramSettings.numBins binIds, - // that the corresponding histogramBins in ThriftTermResults will have counts for. - // The binIds will correspond to the times of the hits matching the driving search query for this - // term statistics request. - // If there were no hits matching the search query, numBins binIds will be returned, but the - // values of the binIds will not meaningfully correspond to anything related to the query, and - // should not be used. Such cases can be identified by ThriftSearchResults.numHitsProcessed being - // set to 0 in the response, and the response not being early terminated. - 3: optional list binIds - // If set, this id indicates the id of the minimum (oldest) bin that has been completely searched, - // even if the query was early terminated. If not set no bin was searched fully, or no histogram - // was requested. - // Note that if e.g. a query only matches a bin partially (due to e.g. a since operator) the bin - // is still considered fully searched if the query did not early terminate. - 4: optional i32 minCompleteBinId -}(persisted='true') - -struct ThriftFacetCount { - // the text of the facet - 1: required string facetLabel - - // deprecated; currently matches weightedCount for backwards-compatibility reasons - 2: optional i32 facetCount - - // the simple count of tweets that contained this facet, without any - // weighting applied - 7: optional i32 simpleCount - - // a weighted version of the count, using signals like tweepcred, parus, etc. - 8: optional i32 weightedCount - - // the number of times this facet occurred in tweets matching the background query - // using the term statistics API - only set if FILTER_WITH_TERM_STATISTICS was used - 3: optional i32 backgroundCount - - // the relevance score that was computed for this facet if FILTER_WITH_TERM_STATISTICS - // was used - 4: optional double score - - // a counter for how often this facet was penalized - 5: optional i32 penaltyCount - - 6: optional ThriftFacetCountMetadata metadata -}(persisted='true') - -// List of facet labels and counts for a given facet field, the -// total count for this field, and a quality score for this field -struct ThriftFacetFieldResults { - 1: required list topFacets - 2: required i32 totalCount - 3: optional double scoreQuality - 4: optional i32 totalScore - 5: optional i32 totalPenalty - - // The ratio of the tweet language in the tweets with this facet field, a map from the language - // name to a number between (0.0, 1.0]. Only languages with ratio higher than 0.1 will be included. - 6: optional map languageHistogram -} - -struct ThriftFacetResults { - 1: required map facetFields - 2: optional i32 backgroundNumHits - // returns optionally a list of user ids that should not get filtered - // out by things like antigaming filters, because these users were explicitly - // queried for - // Note that ThriftFacetCountMetadata returns already dontFilterUser - // for facet requests in which case this list is not needed. However, it - // is needed for subsequent term statistics queries, were user id lookups - // are performed, but a different background query is used. - 3: optional set userIDWhitelist -} - -struct ThriftSearchResults { - // Next available field ID: 23 - 1: required list results = [] - - // (SEARCH-11950): Now resultOffset is deprecated, so there is no use in numResultsSkipped too. - 9: optional i32 deprecated_numResultsSkipped - - // Number of docs that matched the query and were processed. - 7: optional i32 numHitsProcessed - - // Range of status IDs searched, from max ID to min ID (both inclusive). - // These may be unset in case that the search query contained ID or time - // operators that were completely out of range for the given index. - 10: optional i64 maxSearchedStatusID - 11: optional i64 minSearchedStatusID - - // Time range that was searched (both inclusive). - 19: optional i32 maxSearchedTimeSinceEpoch - 20: optional i32 minSearchedTimeSinceEpoch - - 12: optional ThriftSearchResultsRelevanceStats relevanceStats - - // Overall quality of this search result set - 13: optional double score = -1.0 - 18: optional double nsfwRatio = 0.0 - - // The count of hit documents in each language. - 14: optional map languageHistogram - - // Hit counts per time period: - // The key is a time cutoff in milliseconds (e.g. 60000 msecs ago). - // The value is the number of hits that are more recent than the cutoff. - 15: optional map hitCounts - - // the total cost for this query - 16: optional double queryCost - - // Set to non-0 if this query was terminated early (either due to a timeout, or exceeded query cost) - // When getting this response from a single earlybird, this will be set to 1, if the query - // terminated early. - // When getting this response from a search root, this should be set to the number of individual - // earlybird requests that were terminated early. - 17: optional i32 numPartitionsEarlyTerminated - - // If ThriftSearchResults returns features in features.ThriftSearchResultFeature format, this - // field would define the schema of the features. - // If the earlybird schema is already in the client cached schemas indicated in the request, then - // searchFeatureSchema would only have (version, checksum) information. - // - // Notice that earlybird root only sends one schema back to the superroot even though earlybird - // root might receive multiple version of schemas. - // - // Earlybird roots' schema merge/choose logic when returning results to superroot: - // . pick the most occurred versioned schema and return the schema to the superroot - // . if the superroot already caches the schema, only send the version information back - // - // Superroots' schema merge/choose logic when returning results to clients: - // . pick the schema based on the order of: realtime > protected > archive - // . because of the above ordering, it is possible that archive earlybird schema with a new flush - // version (with new bit features) might be lost to older realtime earlybird schema; this is - // considered to to be rare and acceptable because one realtime earlybird deploy would fix it - 21: optional features.ThriftSearchFeatureSchema featureSchema - - // How long it took to score the results in earlybird (in nanoseconds). The number of results - // that were scored should be set in numHitsProcessed. - // Expected to only be set for requests that actually do scoring (i.e. Relevance and TopTweets). - 22: optional i64 scoringTimeNanos - - 8: optional i32 deprecated_numDocsProcessed -} - -// Note: Earlybird no longer respects this field, as it does not contain statuses. -// Blender should respect it. -enum EarlybirdReturnStatusType { - NO_STATUS = 0 - // deprecated - DEPRECATED_BASIC_STATUS = 1, - // deprecated - DEPRECATED_SEARCH_STATUS = 2, - TWEETYPIE_STATUS = 3, - - PLACE_HOLDER4 = 4, - PLACE_HOLDER5 = 5, -} - -struct AdjustedRequestParams { - // Next available field ID: 4 - - // Adjusted value for EarlybirdRequest.searchQuery.numResults. - 1: optional i32 numResults - - // Adjusted value for EarlybirdRequest.searchQuery.maxHitsToProcess and - // EarlybirdRequest.searchQuery.relevanceOptions.maxHitsToProcess. - 2: optional i32 maxHitsToProcess - - // Adjusted value for EarlybirdRequest.searchQuery.relevanceOptions.returnAllResults - 3: optional bool returnAllResults -} - -struct EarlybirdRequest { - // Next available field ID: 36 - - // -------- COMMON REQUEST OPTIONS -------- - // These fields contain options respected by all kinds of earlybird requests. - - // Search query containing general earlybird retrieval and hit collection options. - // Also contains the options specific to search requests. - 1: required ThriftSearchQuery searchQuery - - // Common RPC information - client hostname and request ID. - 12: optional string clientHost - 13: optional string clientRequestID - - // A string identifying the client that initiated the request. - // Ex: macaw-search.prod, webforall.prod, webforall.staging. - // The intention is to track the load we get from each client, and eventually enforce - // per-client QPS quotas, but this field could also be used to allow access to certain features - // only to certain clients, etc. - 21: optional string clientId - - // The time (in millis since epoch) when the earlybird client issued this request. - // Can be used to estimate request timeout time, capturing in-transit time for the request. - 23: optional i64 clientRequestTimeMs - - // Caching parameters used by earlybird roots. - 24: optional caching.CachingParams cachingParams - - // Deprecated. See SEARCH-2784 - // Earlybird requests will be early terminated in a best-effort way to prevent them from - // exceeding the given timeout. If timeout is <= 0 this early termination criteria is - // disabled. - 17: optional i32 timeoutMs = -1 - - // Deprecated. See SEARCH-2784 - // Earlybird requests will be early terminated in a best-effort way to prevent them from - // exceeding the given query cost. If maxQueryCost <= 0 this early termination criteria - // is disabled. - 20: optional double maxQueryCost = -1 - - - // -------- REQUEST-TYPE SPECIFIC OPTIONS -------- - // These fields contain options for one specific kind of request. If one of these options - // is set the request will be considered to be the appropriate type of request. - - // Options for facet counting requests. - 11: optional ThriftFacetRequest facetRequest - - // Options for term statistics requests. - 14: optional ThriftTermStatisticsRequest termStatisticsRequest - - - // -------- DEBUG OPTIONS -------- - // Used for debugging only. - - // Debug mode, 0 for no debug information. - 15: optional i8 debugMode = 0 - - // Can be used to pass extra debug arguments to earlybird. - 34: optional EarlybirdDebugOptions debugOptions - - // Searches a specific segment by time slice id if set and segment id is > 0. - 22: optional i64 searchSegmentId - - // -------- THINGS USED ONLY BY THE BLENDER -------- - // These fields are used by the blender and clients of the blender, but not by earlybird. - - // Specifies what kind of status object to return, if any. - 7: optional EarlybirdReturnStatusType returnStatusType - - - // -------- THINGS USED BY THE ROOTS -------- - // These fields are not in use by earlybirds themselves, but are in use by earlybird roots - // (and their clients). - // These fields live here since we currently reuse the same thrift request and response structs - // for both earlybirds and earlybird roots, and could potentially be moved out if we were to - // introduce separate request / response structs specifically for the roots. - - // We have a threshold for how many hash partition requests need to succeed at the root level - // in order for the earlybird root request to be considered successful. - // Each type or earlybird queries (e.g. relevance, or term statistics) has a predefined default - // threshold value (e.g. 90% or hash partitions need to succeed for a recency query). - // The client can optionally set the threshold value to be something other than the default, - // by setting this field to a value in the range of 0 (exclusive) to 1 (inclusive). - // If this value is set outside of the (0, 1] range, a CLIENT_ERROR EarlybirdResponseCode will - // be returned. - 25: optional double successfulResponseThreshold - - // Where does the query come from? - 26: optional query.ThriftQuerySource querySource - - // Whether to get archive results This flag is advisory. A request may still be restricted from - // getting reqults from the archive based on the requesting client, query source, requested - // time/id range, etc. - 27: optional bool getOlderResults - - // The list of users followed by the current user. - // Used to restrict the values in the fromUserIDFilter64 field when sending a request - // to the protectected cluster. - 28: optional list followedUserIds - - // The adjusted parameters for the protected request. - 29: optional AdjustedRequestParams adjustedProtectedRequestParams - - // The adjusted parameters for the full archive request. - 30: optional AdjustedRequestParams adjustedFullArchiveRequestParams - - // Return only the protected tweets. This flag is used by the SuperRoot to return relevance - // results that contain only protected tweets. - 31: optional bool getProtectedTweetsOnly - - // Tokenize serialized queries with the appropriate Pengin version(s). - // Only has an effect on superroot. - 32: optional bool retokenizeSerializedQuery - - // Flag to ignore tweets that are very recent and could be incompletely indexed. - // If false, will allow queries to see results that may violate implicit streaming - // guarantees and will search Tweets that have been partially indexed. - // See go/indexing-latency for more details. When enabled, prevents seeing tweets - // that are less than 15 seconds old (or a similarly configured threshold). - // May be set to false unless explicitly set to true. - 33: optional bool skipVeryRecentTweets = 1 - - // Setting an experimental cluster will reroute traffic at the realtime root layer to an experimental - // Earlybird cluster. This will have no impact if set on requests to anywhere other than realtime root. - 35: optional ExperimentCluster experimentClusterToUse - - // Caps number of results returned by roots after merging results from different earlybird partitions/clusters. - // If not set, ThriftSearchQuery.numResults or CollectorParams.numResultsToReturn will be used to cap results. - // This parameter will be ignored if ThriftRelevanceOptions.returnAllResults is set to true. - 36: optional i32 numResultsToReturnAtRoot -} - -enum EarlybirdResponseCode { - SUCCESS = 0, - PARTITION_NOT_FOUND = 1, - PARTITION_DISABLED = 2, - TRANSIENT_ERROR = 3, - PERSISTENT_ERROR = 4, - CLIENT_ERROR = 5, - PARTITION_SKIPPED = 6, - // Request was queued up on the server for so long that it timed out, and was not - // executed at all. - SERVER_TIMEOUT_ERROR = 7, - TIER_SKIPPED = 8, - // Not enough partitions returned a successful response. The merged response will have partition - // counts and early termination info set, but will not have search results. - TOO_MANY_PARTITIONS_FAILED_ERROR = 9, - // Client went over its quota, and the request was throttled. - QUOTA_EXCEEDED_ERROR = 10, - // Client's request is blocked based on Search Infra's policy. Search Infra can can block client's - // requests based on the query source of the request. - REQUEST_BLOCKED_ERROR = 11, - - CLIENT_CANCEL_ERROR = 12, - - CLIENT_BLOCKED_BY_TIER_ERROR = 13, - - PLACE_HOLDER_2015_09_21 = 14, -} - -// A recorded request and response. -struct EarlybirdRequestResponse { - // Where did we send this request to. - 1: optional string sentTo; - 2: optional EarlybirdRequest request; - // This can't be an EarlybirdResponse, because the thrift compiler for Python - // doesn't allow cyclic references and we have some Python utilities that will fail. - 3: optional string response; -} - -struct EarlybirdDebugInfo { - 1: optional string host - 2: optional string parsedQuery - 3: optional string luceneQuery - // Requests sent to dependent services. For example, superroot sends to realtime root, - // archive root, etc. - 4: optional list sentRequests; - // segment level debug info (eg. hitsPerSegment, max/minSearchedTime etc.) - 5: optional list collectorDebugInfo - 6: optional list termStatisticsDebugInfo -} - -struct EarlybirdDebugOptions { - 1: optional bool includeCollectorDebugInfo -} - -struct TierResponse { - 1: optional EarlybirdResponseCode tierResponseCode - 2: optional i32 numPartitions - 3: optional i32 numSuccessfulPartitions -} - -struct EarlybirdServerStats { - // The hostname of the Earlybird that processed this request. - 1: optional string hostname - - // The partition to which this earlybird belongs. - 2: optional i32 partition - - // Current Earlybird QPS. - // Earlybirds should set this field at the end of a request (not at the start). This would give - // roots a more up-to-date view of the load on the earlybirds. - 3: optional i64 currentQps - - // The time the request waited in the queue before Earlybird started processing it. - // This does not include the time spent in the finagle queue: it's the time between the moment - // earlybird received the request, and the moment it started processing the request. - 4: optional i64 queueTimeMillis - - // The average request time in the queue before Earlybird started processing it. - // This does not include the time that requests spent in the finagle queue: it's the average time - // between the moment earlybird received its requests, and the moment it started processing them. - 5: optional i64 averageQueueTimeMillis - - // Current average per-request latency as perceived by Earlybird. - 6: optional i64 averageLatencyMicros - - // The tier to which this earlybird belongs. - 7: optional string tierName -} - -struct EarlybirdResponse { - // Next available field ID: 17 - 1: optional ThriftSearchResults searchResults - 5: optional ThriftFacetResults facetResults - 6: optional ThriftTermStatisticsResults termStatisticsResults - 2: required EarlybirdResponseCode responseCode - 3: required i64 responseTime - 7: optional i64 responseTimeMicros - // fields below will only be returned if debug > 1 in the request. - 4: optional string debugString - 8: optional EarlybirdDebugInfo debugInfo - - // Only exists for merged earlybird response. - 10: optional i32 numPartitions - 11: optional i32 numSuccessfulPartitions - // Only exists for merged earlybird response from multiple tiers. - 13: optional list perTierResponse - - // Total number of segments that were searched. Partially searched segments are fully counted. - // e.g. if we searched 1 segment fully, and early terminated half way through the second - // segment, this field should be set to 2. - 15: optional i32 numSearchedSegments - - // Whether the request early terminated, if so, the termination reason. - 12: optional search.EarlyTerminationInfo earlyTerminationInfo - - // Whether this response is from cache. - 14: optional bool cacheHit - - // Stats used by roots to determine if we should go into degraded mode. - 16: optional EarlybirdServerStats earlybirdServerStats -} - -enum EarlybirdStatusCode { - STARTING = 0, - CURRENT = 1, - STOPPING = 2, - UNHEALTHY = 3, - BLACKLISTED = 4, - - PLACE_HOLDER5 = 5, - PLACE_HOLDER6 = 6, -} - -struct EarlybirdStatusResponse { - 1: required EarlybirdStatusCode code - 2: required i64 aliveSince - 3: optional string message -} - -service EarlybirdService { - string getName(), - EarlybirdStatusResponse getStatus(), - EarlybirdResponse search( 1: EarlybirdRequest request ) -} diff --git a/src/thrift/com/twitter/simclusters_v2/BUILD b/src/thrift/com/twitter/simclusters_v2/BUILD deleted file mode 100644 index 221cc9184..000000000 --- a/src/thrift/com/twitter/simclusters_v2/BUILD +++ /dev/null @@ -1,23 +0,0 @@ -create_thrift_libraries( - base_name = "simclusters_v2-thrift", - sources = ["*.thrift"], - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "src/thrift/com/twitter/algebird_internal", - ], - export_roots = [ - "src/thrift/com/twitter/algebird_internal:algebird_internal", - ], - generate_languages = [ - "go", - "java", - "lua", - "python", - "ruby", - "scala", - "strato", - ], - provides_java_name = "simclusters_v2-thrift-java", - provides_scala_name = "simclusters_v2-thrift-scala", -) diff --git a/src/thrift/com/twitter/simclusters_v2/BUILD.docx b/src/thrift/com/twitter/simclusters_v2/BUILD.docx new file mode 100644 index 000000000..3bdfdf2cb Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/BUILD.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/abuse.docx b/src/thrift/com/twitter/simclusters_v2/abuse.docx new file mode 100644 index 000000000..29efae70a Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/abuse.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/abuse.thrift b/src/thrift/com/twitter/simclusters_v2/abuse.thrift deleted file mode 100644 index 60043244b..000000000 --- a/src/thrift/com/twitter/simclusters_v2/abuse.thrift +++ /dev/null @@ -1,53 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2 -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "embedding.thrift" -include "simclusters_presto.thrift" - -/** - * Struct that associates a user with simcluster scores for different - * interaction types. This is meant to be used as a feature to predict abuse. - * - * This thrift struct is meant for exploration purposes. It does not have any - * assumptions about what type of interactions we use or what types of scores - * we are keeping track of. - **/ -struct AdhocSingleSideClusterScores { - 1: required i64 userId(personalDataType = 'UserId') - // We can make the interaction types have arbitrary names. In the production - // version of this dataset. We should have a different field per interaction - // type so that API of what is included is more clear. - 2: required map interactionScores -}(persisted="true", hasPersonalData = 'true') - -/** -* This is a prod version of the single side features. It is meant to be used as a value in a key -* value store. The pair of healthy and unhealthy scores will be different depending on the use case. -* We will use different stores for different user cases. For instance, the first instance that -* we implement will use search abuse reports and impressions. We can build stores for new values -* in the future. -* -* The consumer creates the interactions which the author receives. For instance, the consumer -* creates an abuse report for an author. The consumer scores are related to the interaction creation -* behavior of the consumer. The author scores are related to the whether the author receives these -* interactions. -* -**/ -struct SingleSideUserScores { - 1: required i64 userId(personalDataType = 'UserId') - 2: required double consumerUnhealthyScore(personalDataType = 'EngagementScore') - 3: required double consumerHealthyScore(personalDataType = 'EngagementScore') - 4: required double authorUnhealthyScore(personalDataType = 'EngagementScore') - 5: required double authorHealthyScore(personalDataType = 'EngagementScore') -}(persisted="true", hasPersonalData = 'true') - -/** -* Struct that associates a cluster-cluster interaction scores for different -* interaction types. -**/ -struct AdhocCrossSimClusterInteractionScores { - 1: required i64 clusterId - 2: required list clusterScores -}(persisted="true") diff --git a/src/thrift/com/twitter/simclusters_v2/clustering.docx b/src/thrift/com/twitter/simclusters_v2/clustering.docx new file mode 100644 index 000000000..96164124c Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/clustering.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/clustering.thrift b/src/thrift/com/twitter/simclusters_v2/clustering.thrift deleted file mode 100644 index 81b8567cb..000000000 --- a/src/thrift/com/twitter/simclusters_v2/clustering.thrift +++ /dev/null @@ -1,18 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.clustering -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -/** - * Struct that represents an ordered list of producer clusters. - * The list is meant to be ordered by decreasing cluster size. - **/ -struct OrderedClustersAndMembers { - 1: required list> orderedClustersAndMembers (personalDataType = 'UserId') - // work around BQ not supporting nested struct such as list - 2: optional list orderedClustersAndMembersStruct (personalDataType = 'UserId') -}(persisted = 'true', hasPersonalData = 'true') - -struct ClusterMembers { - 1: required set clusterMembers (personalDataType = 'UserId') -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/embedding.docx b/src/thrift/com/twitter/simclusters_v2/embedding.docx new file mode 100644 index 000000000..2835d50ca Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/embedding.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/embedding.thrift b/src/thrift/com/twitter/simclusters_v2/embedding.thrift deleted file mode 100644 index 110da0c65..000000000 --- a/src/thrift/com/twitter/simclusters_v2/embedding.thrift +++ /dev/null @@ -1,137 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.embedding -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "com/twitter/simclusters_v2/identifier.thrift" -include "com/twitter/simclusters_v2/online_store.thrift" - -struct SimClusterWithScore { - 1: required i32 clusterId(personalDataType = 'InferredInterests') - 2: required double score(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct TopSimClustersWithScore { - 1: required list topClusters - 2: required online_store.ModelVersion modelVersion -}(persisted = 'true', hasPersonalData = 'true') - -struct InternalIdWithScore { - 1: required identifier.InternalId internalId - 2: required double score(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct InternalIdEmbedding { - 1: required list embedding -}(persisted = 'true', hasPersonalData = 'true') - -struct SemanticCoreEntityWithScore { - 1: required i64 entityId(personalDataType = 'SemanticcoreClassification') - 2: required double score(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct TopSemanticCoreEntitiesWithScore { - 1: required list topEntities -}(persisted = 'true', hasPersonalData = 'true') - -struct PersistedFullClusterId { - 1: required online_store.ModelVersion modelVersion - 2: required i32 clusterId(personalDataType = 'InferredInterests') -}(persisted = 'true', hasPersonalData = 'true') - -struct DayPartitionedClusterId { - 1: required i32 clusterId(personalDataType = 'InferredInterests') - 2: required string dayPartition // format: yyyy-MM-dd -} - -struct TopProducerWithScore { - 1: required i64 userId(personalDataType = 'UserId') - 2: required double score(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct TopProducersWithScore { - 1: required list topProducers -}(persisted = 'true', hasPersonalData = 'true') - -struct TweetWithScore { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: required double score(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct TweetsWithScore { - 1: required list tweets -}(persisted = 'true', hasPersonalData = 'true') - -struct TweetTopKTweetsWithScore { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: required TweetsWithScore topkTweetsWithScore -}(persisted = 'true', hasPersonalData = 'true') - -/** - * The generic SimClustersEmbedding for online long-term storage and real-time calculation. - * Use SimClustersEmbeddingId as the only identifier. - * Warning: Doesn't include model version and embedding type in the value struct. - **/ -struct SimClustersEmbedding { - 1: required list embedding -}(persisted = 'true', hasPersonalData = 'true') - -struct SimClustersEmbeddingWithScore { - 1: required SimClustersEmbedding embedding - 2: required double score -}(persisted = 'true', hasPersonalData = 'false') - -/** - * This is the recommended structure for aggregating embeddings with time decay - the metadata - * stores the information needed for decayed aggregation. - **/ -struct SimClustersEmbeddingWithMetadata { - 1: required SimClustersEmbedding embedding - 2: required SimClustersEmbeddingMetadata metadata -}(hasPersonalData = 'true') - -struct SimClustersEmbeddingIdWithScore { - 1: required identifier.SimClustersEmbeddingId id - 2: required double score -}(persisted = 'true', hasPersonalData = 'false') - -struct SimClustersMultiEmbeddingByValues { - 1: required list embeddings -}(persisted = 'true', hasPersonalData = 'false') - -struct SimClustersMultiEmbeddingByIds { - 1: required list ids -}(persisted = 'true', hasPersonalData = 'false') - -/** - * Generic SimClusters Multiple Embeddings. The identifier.SimClustersMultiEmbeddingId is the key of - * the multiple embedding. - **/ -union SimClustersMultiEmbedding { - 1: SimClustersMultiEmbeddingByValues values - 2: SimClustersMultiEmbeddingByIds ids -}(persisted = 'true', hasPersonalData = 'false') - -/** - * The metadata of a SimClustersEmbedding. The updatedCount represent the version of the Embedding. - * For tweet embedding, the updatedCount is same/close to the favorite count. - **/ -struct SimClustersEmbeddingMetadata { - 1: optional i64 updatedAtMs - 2: optional i64 updatedCount -}(persisted = 'true', hasPersonalData = 'true') - -/** - * The data structure for PersistentSimClustersEmbedding Store - **/ -struct PersistentSimClustersEmbedding { - 1: required SimClustersEmbedding embedding - 2: required SimClustersEmbeddingMetadata metadata -}(persisted = 'true', hasPersonalData = 'true') - -/** - * The data structure for the Multi Model PersistentSimClustersEmbedding Store - **/ -struct MultiModelPersistentSimClustersEmbedding { - 1: required map multiModelPersistentSimClustersEmbedding -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/entity.docx b/src/thrift/com/twitter/simclusters_v2/entity.docx new file mode 100644 index 000000000..a45ee558c Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/entity.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/entity.thrift b/src/thrift/com/twitter/simclusters_v2/entity.thrift deleted file mode 100644 index 1d0ee6946..000000000 --- a/src/thrift/com/twitter/simclusters_v2/entity.thrift +++ /dev/null @@ -1,51 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.entity -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "com/twitter/algebird_internal/algebird.thrift" - -/** - * Penguin text entity. All fields are required as this is used as a part of a memcache key. - **/ -struct PenguinKey { - 1: required string textEntity -}(hasPersonalData = 'false') - -/** - * NER text entity. All fields are required as this is used as a part of a memcache key. - **/ -struct NerKey { - 1: required string textEntity - 2: required i32 wholeEntityType -}(hasPersonalData = 'false') - -/** - * Semantic Core text entity. All fields are required as this is used as a part of a memcache key. - **/ -struct SemanticCoreKey { - 1: required i64 entityId(personalDataType = 'SemanticcoreClassification') -}(hasPersonalData = 'true') - -/** - * Represents an entity extracted from a tweet. - **/ -union TweetTextEntity { - 1: string hashtag - 2: PenguinKey penguin - 3: NerKey ner - 4: SemanticCoreKey semanticCore -}(hasPersonalData = 'true') - -struct SpaceId { - 1: string id -}(hasPersonalData = 'true') - -/** - * All possible entities that simclusters are associated with. - **/ -union SimClusterEntity { - 1: i64 tweetId(personalDataType = 'TweetId') - 2: TweetTextEntity tweetEntity - 3: SpaceId spaceId -}(hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/evaluation.docx b/src/thrift/com/twitter/simclusters_v2/evaluation.docx new file mode 100644 index 000000000..b6f0bb33d Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/evaluation.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/evaluation.thrift b/src/thrift/com/twitter/simclusters_v2/evaluation.thrift deleted file mode 100644 index 85414baf9..000000000 --- a/src/thrift/com/twitter/simclusters_v2/evaluation.thrift +++ /dev/null @@ -1,65 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.evaluation -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -/** - * Surface area at which the reference tweet was displayed to the user - **/ -enum DisplayLocation { - TimelinesRecap = 1, - TimelinesRectweet = 2 -}(hasPersonalData = 'false') - -struct TweetLabels { - 1: required bool isClicked = false(personalDataType = 'EngagementsPrivate') - 2: required bool isLiked = false(personalDataType = 'EngagementsPublic') - 3: required bool isRetweeted = false(personalDataType = 'EngagementsPublic') - 4: required bool isQuoted = false(personalDataType = 'EngagementsPublic') - 5: required bool isReplied = false(personalDataType = 'EngagementsPublic') -}(persisted = 'true', hasPersonalData = 'true') - -/** - * Data container of a reference tweet with scribed user engagement labels - */ -struct ReferenceTweet { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: required i64 authorId(personalDataType = 'UserId') - 3: required i64 timestamp(personalDataType = 'PublicTimestamp') - 4: required DisplayLocation displayLocation - 5: required TweetLabels labels -}(persisted="true", hasPersonalData = 'true') - -/** - * Data container of a candidate tweet generated by the candidate algorithm - */ -struct CandidateTweet { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: optional double score(personalDataType = 'EngagementScore') - // The timestamp here is a synthetically generated timestamp. - // for evaluation purpose. Hence left unannotated - 3: optional i64 timestamp -}(hasPersonalData = 'true') - -/** - * An encapsulated collection of candidate tweets - **/ -struct CandidateTweets { - 1: required i64 targetUserId(personalDataType = 'UserId') - 2: required list recommendedTweets -}(hasPersonalData = 'true') - -/** - * An encapsulated collection of reference tweets - **/ -struct ReferenceTweets { - 1: required i64 targetUserId(personalDataType = 'UserId') - 2: required list impressedTweets -}(persisted="true", hasPersonalData = 'true') - -/** - * A list of candidate tweets - **/ -struct CandidateTweetsList { - 1: required list recommendedTweets -}(hasPersonalData = 'true') \ No newline at end of file diff --git a/src/thrift/com/twitter/simclusters_v2/graph.docx b/src/thrift/com/twitter/simclusters_v2/graph.docx new file mode 100644 index 000000000..abacbe504 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/graph.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/graph.thrift b/src/thrift/com/twitter/simclusters_v2/graph.thrift deleted file mode 100644 index e67c860d2..000000000 --- a/src/thrift/com/twitter/simclusters_v2/graph.thrift +++ /dev/null @@ -1,61 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.graph -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -struct DecayedSums { - // last time the decayed sum was updated, in millis. - 1: required i64 lastUpdatedTimestamp - - // a map from half life (specified in days) to the decayed sum - 2: required map halfLifeInDaysToDecayedSums -}(persisted = 'true', hasPersonalData = 'false') - -struct EdgeWithDecayedWeights { - 1: required i64 sourceId(personalDataType = 'UserId') - 2: required i64 destinationId(personalDataType = 'UserId') - 3: required DecayedSums weights -}(persisted="true", hasPersonalData = "true") - -struct NeighborWithWeights { - 1: required i64 neighborId(personalDataType = 'UserId') - 2: optional bool isFollowed(personalDataType = 'Follow') - 3: optional double followScoreNormalizedByNeighborFollowersL2(personalDataType = 'EngagementsPublic') - 4: optional double favScoreHalfLife100Days(personalDataType = 'EngagementsPublic') - 5: optional double favScoreHalfLife100DaysNormalizedByNeighborFaversL2(personalDataType = 'EngagementsPublic') - - // log(favScoreHalfLife100Days + 1) - 6: optional double logFavScore(personalDataType = 'EngagementsPublic') - - // log(favScoreHalfLife100Days + 1) normalized so that a user's incoming weights have unit l2 norm - 7: optional double logFavScoreL2Normalized(personalDataType = 'EngagementsPublic') - -}(persisted = 'true', hasPersonalData = 'true') - -struct UserAndNeighbors { - 1: required i64 userId(personalDataType = 'UserId') - 2: required list neighbors -}(persisted="true", hasPersonalData = 'true') - -struct NormsAndCounts { - 1: required i64 userId(personalDataType = 'UserId') - 2: optional double followerL2Norm(personalDataType = 'CountOfFollowersAndFollowees') - 3: optional double faverL2Norm(personalDataType = 'EngagementsPublic') - 4: optional i64 followerCount(personalDataType = 'CountOfFollowersAndFollowees') - 5: optional i64 faverCount(personalDataType = 'EngagementsPublic') - - // sum of the weights on the incoming edges where someone fav'ed this producer - 6: optional double favWeightsOnFavEdgesSum(personalDataType = 'EngagementsPublic') - - // sum of the fav weights on all the followers of this producer - 7: optional double favWeightsOnFollowEdgesSum(personalDataType = 'EngagementsPublic') - // log(favScore + 1) - 8: optional double logFavL2Norm(personalDataType = 'EngagementsPublic') - - // sum of log(favScore + 1) on the incoming edges where someone fav'ed this producer - 9: optional double logFavWeightsOnFavEdgesSum(personalDataType = 'EngagementsPublic') - - // sum of log(favScore + 1) on all the followers of this producer - 10: optional double logFavWeightsOnFollowEdgesSum(personalDataType = 'EngagementsPublic') - -}(persisted="true", hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/identifier.docx b/src/thrift/com/twitter/simclusters_v2/identifier.docx new file mode 100644 index 000000000..e5720db3f Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/identifier.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/identifier.thrift b/src/thrift/com/twitter/simclusters_v2/identifier.thrift deleted file mode 100644 index b4285e699..000000000 --- a/src/thrift/com/twitter/simclusters_v2/identifier.thrift +++ /dev/null @@ -1,205 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.identifier -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "com/twitter/simclusters_v2/online_store.thrift" - -/** - * The uniform type for a SimClusters Embeddings. - * Each embeddings have the uniform underlying storage. - * Warning: Every EmbeddingType should map to one and only one InternalId. - **/ -enum EmbeddingType { - // Reserve 001 - 99 for Tweet embeddings - FavBasedTweet = 1, // Deprecated - FollowBasedTweet = 2, // Deprecated - LogFavBasedTweet = 3, // Production Version - FavBasedTwistlyTweet = 10, // Deprecated - LogFavBasedTwistlyTweet = 11, // Deprecated - LogFavLongestL2EmbeddingTweet = 12, // Production Version - - // Tweet embeddings generated from non-fav events - // Naming convention: {Event}{Score}BasedTweet - // {Event}: The interaction event we use to build the tweet embeddings - // {Score}: The score from user InterestedIn embeddings - VideoPlayBack50LogFavBasedTweet = 21, - RetweetLogFavBasedTweet = 22, - ReplyLogFavBasedTweet = 23, - PushOpenLogFavBasedTweet = 24, - - // [Experimental] Offline generated FavThroughRate-based Tweet Embedding - Pop1000RankDecay11Tweet = 30, - Pop10000RankDecay11Tweet = 31, - OonPop1000RankDecayTweet = 32, - - // [Experimental] Offline generated production-like LogFavScore-based Tweet Embedding - OfflineGeneratedLogFavBasedTweet = 40, - - // Reserve 51-59 for Ads Embedding - LogFavBasedAdsTweet = 51, // Experimental embedding for ads tweet candidate - LogFavClickBasedAdsTweet = 52, // Experimental embedding for ads tweet candidate - - // Reserve 60-69 for Evergreen content - LogFavBasedEvergreenTweet = 60, - LogFavBasedRealTimeTweet = 65, - - // Reserve 101 to 149 for Semantic Core Entity embeddings - FavBasedSematicCoreEntity = 101, // Deprecated - FollowBasedSematicCoreEntity = 102, // Deprecated - FavBasedHashtagEntity = 103, // Deprecated - FollowBasedHashtagEntity = 104, // Deprecated - ProducerFavBasedSemanticCoreEntity = 105, // Deprecated - ProducerFollowBasedSemanticCoreEntity = 106,// Deprecated - FavBasedLocaleSemanticCoreEntity = 107, // Deprecated - FollowBasedLocaleSemanticCoreEntity = 108, // Deprecated - LogFavBasedLocaleSemanticCoreEntity = 109, // Deprecated - LanguageFilteredProducerFavBasedSemanticCoreEntity = 110, // Deprecated - LanguageFilteredFavBasedLocaleSemanticCoreEntity = 111, // Deprecated - FavTfgTopic = 112, // TFG topic embedding built from fav-based user interestedIn - LogFavTfgTopic = 113, // TFG topic embedding built from logfav-based user interestedIn - FavInferredLanguageTfgTopic = 114, // TFG topic embedding built using inferred consumed languages - FavBasedKgoApeTopic = 115, // topic embedding using fav-based aggregatable producer embedding of KGO seed accounts. - LogFavBasedKgoApeTopic = 116, // topic embedding using log fav-based aggregatable producer embedding of KGO seed accounts. - FavBasedOnboardingApeTopic = 117, // topic embedding using fav-based aggregatable producer embedding of onboarding seed accounts. - LogFavBasedOnboardingApeTopic = 118, // topic embedding using log fav-based aggregatable producer embedding of onboarding seed accounts. - LogFavApeBasedMuseTopic = 119, // Deprecated - LogFavApeBasedMuseTopicExperiment = 120 // Deprecated - - // Reserved 201 - 299 for Producer embeddings (KnownFor) - FavBasedProducer = 201 - FollowBasedProducer = 202 - AggregatableFavBasedProducer = 203 // fav-based aggregatable producer embedding. - AggregatableLogFavBasedProducer = 204 // logfav-based aggregatable producer embedding. - RelaxedAggregatableLogFavBasedProducer = 205 // logfav-based aggregatable producer embedding. - AggregatableFollowBasedProducer = 206 // follow-based aggregatable producer embedding. - KnownFor = 300 - - // Reserved 301 - 399 for User InterestedIn embeddings - FavBasedUserInterestedIn = 301 - FollowBasedUserInterestedIn = 302 - LogFavBasedUserInterestedIn = 303 - RecentFollowBasedUserInterestedIn = 304 // interested-in embedding based on aggregating producer embeddings of recent follows - FilteredUserInterestedIn = 305 // interested-in embedding used by twistly read path - LogFavBasedUserInterestedInFromAPE = 306 - FollowBasedUserInterestedInFromAPE = 307 - TwiceUserInterestedIn = 308 // interested-in multi-embedding based on clustering producer embeddings of neighbors - UnfilteredUserInterestedIn = 309 - UserNextInterestedIn = 310 // next interested-in embedding generated from BeT - - // Denser User InterestedIn, generated by Producer embeddings. - FavBasedUserInterestedInFromPE = 311 - FollowBasedUserInterestedInFromPE = 312 - LogFavBasedUserInterestedInFromPE = 313 - FilteredUserInterestedInFromPE = 314 // interested-in embedding used by twistly read path - - // [Experimental] Denser User InterestedIn, generated by aggregating IIAPE embedding from AddressBook - LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE = 320 - LogFavBasedUserInterestedAverageAddressBookFromIIAPE = 321 - LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE = 322 - LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE = 323 - LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE = 324 - LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE = 325 - - //Reserved 401 - 500 for Space embedding - FavBasedApeSpace = 401 // DEPRECATED - LogFavBasedListenerSpace = 402 // DEPRECATED - LogFavBasedAPESpeakerSpace = 403 // DEPRECATED - LogFavBasedUserInterestedInListenerSpace = 404 // DEPRECATED - - // Experimental, internal-only IDs - ExperimentalThirtyDayRecentFollowBasedUserInterestedIn = 10000 // Like RecentFollowBasedUserInterestedIn, except limited to last 30 days - ExperimentalLogFavLongestL2EmbeddingTweet = 10001 // DEPRECATED -}(persisted = 'true', hasPersonalData = 'false') - -/** - * The uniform type for a SimClusters MultiEmbeddings. - * Warning: Every MultiEmbeddingType should map to one and only one InternalId. - **/ -enum MultiEmbeddingType { - // Reserved 0-99 for Tweet based MultiEmbedding - - // Reserved 100 - 199 for Topic based MultiEmbedding - LogFavApeBasedMuseTopic = 100 // Deprecated - LogFavApeBasedMuseTopicExperiment = 101 // Deprecated - - // Reserved 301 - 399 for User InterestedIn embeddings - TwiceUserInterestedIn = 301 // interested-in multi-embedding based on clustering producer embeddings of neighbors -}(persisted = 'true', hasPersonalData = 'true') - -// Deprecated. Please use TopicId for future cases. -struct LocaleEntityId { - 1: i64 entityId - 2: string language -}(persisted = 'true', hasPersonalData = 'false') - -enum EngagementType { - Favorite = 1, - Retweet = 2, -} - -struct UserEngagedTweetId { - 1: i64 tweetId(personalDataType = 'TweetId') - 2: i64 userId(personalDataType = 'UserId') - 3: EngagementType engagementType(personalDataType = 'EventType') -}(persisted = 'true', hasPersonalData = 'true') - -struct TopicId { - 1: i64 entityId (personalDataType = 'SemanticcoreClassification') - // 2-letter ISO 639-1 language code - 2: optional string language - // 2-letter ISO 3166-1 alpha-2 country code - 3: optional string country -}(persisted = 'true', hasPersonalData = 'false') - -struct TopicSubId { - 1: i64 entityId (personalDataType = 'SemanticcoreClassification') - // 2-letter ISO 639-1 language code - 2: optional string language - // 2-letter ISO 3166-1 alpha-2 country code - 3: optional string country - 4: i32 subId -}(persisted = 'true', hasPersonalData = 'true') - -// Will be used for testing purposes in DDG 15536, 15534 -struct UserWithLanguageId { - 1: required i64 userId(personalDataType = 'UserId') - 2: optional string langCode(personalDataType = 'InferredLanguage') -}(persisted = 'true', hasPersonalData = 'true') - -/** - * The internal identifier type. - * Need to add ordering in [[com.twitter.simclusters_v2.common.SimClustersEmbeddingId]] - * when adding a new type. - **/ -union InternalId { - 1: i64 tweetId(personalDataType = 'TweetId') - 2: i64 userId(personalDataType = 'UserId') - 3: i64 entityId(personalDataType = 'SemanticcoreClassification') - 4: string hashtag(personalDataType = 'PublicTweetEntitiesAndMetadata') - 5: i32 clusterId - 6: LocaleEntityId localeEntityId(personalDataType = 'SemanticcoreClassification') - 7: UserEngagedTweetId userEngagedTweetId - 8: TopicId topicId - 9: TopicSubId topicSubId - 10: string spaceId - 11: UserWithLanguageId userWithLanguageId -}(persisted = 'true', hasPersonalData = 'true') - -/** - * A uniform identifier type for all kinds of SimClusters based embeddings. - **/ -struct SimClustersEmbeddingId { - 1: required EmbeddingType embeddingType - 2: required online_store.ModelVersion modelVersion - 3: required InternalId internalId -}(persisted = 'true', hasPersonalData = 'true') - -/** - * A uniform identifier type for multiple SimClusters embeddings - **/ -struct SimClustersMultiEmbeddingId { - 1: required MultiEmbeddingType embeddingType - 2: required online_store.ModelVersion modelVersion - 3: required InternalId internalId -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/inferred_entities.docx b/src/thrift/com/twitter/simclusters_v2/inferred_entities.docx new file mode 100644 index 000000000..ddb07ff81 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/inferred_entities.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/inferred_entities.thrift b/src/thrift/com/twitter/simclusters_v2/inferred_entities.thrift deleted file mode 100644 index db667fb68..000000000 --- a/src/thrift/com/twitter/simclusters_v2/inferred_entities.thrift +++ /dev/null @@ -1,38 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.inferred_entities -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -// The SimClusters type we use to infer entity interests about a user -// Currently used for SimClusters Compliance to store a user's inferred interests - -include "online_store.thrift" - -enum ClusterType { - KnownFor = 1, - InterestedIn = 2 -}(persisted = 'true', hasPersonalData = 'false') - -struct SimClustersSource { - 1: required ClusterType clusterType - 2: required online_store.ModelVersion modelVersion -}(persisted = 'true', hasPersonalData = 'false') - -// The source of entities we use to infer entity interests about a user -enum EntitySource { - SimClusters20M145KDec11EntityEmbeddingsByFavScore = 1, // deprecated - SimClusters20M145KUpdatedEntityEmbeddingsByFavScore = 2, // deprecated - UTTAccountRecommendations = 3 # dataset built by Onboarding team - SimClusters20M145K2020EntityEmbeddingsByFavScore = 4 -}(persisted = 'true', hasPersonalData = 'false') - -struct InferredEntity { - 1: required i64 entityId(personalDataType = 'SemanticcoreClassification') - 2: required double score(personalDataType = 'EngagementScore') - 3: optional SimClustersSource simclusterSource - 4: optional EntitySource entitySource -}(persisted = 'true', hasPersonalData = 'true') - -struct SimClustersInferredEntities { - 1: required list entities -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/interests.docx b/src/thrift/com/twitter/simclusters_v2/interests.docx new file mode 100644 index 000000000..2674747f1 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/interests.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/interests.thrift b/src/thrift/com/twitter/simclusters_v2/interests.thrift deleted file mode 100644 index 5c1a04970..000000000 --- a/src/thrift/com/twitter/simclusters_v2/interests.thrift +++ /dev/null @@ -1,259 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.interests -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -/** - * All of the scores below assume that the knownFor vector for each cluster is already - * of unit L2 norm i.e. sum of squares is 1. - **/ -struct UserToInterestedInClusterScores { - // dot product of user's binary follow vector with knownFor vector for this cluster - // TIP: By default, use this score or favScore. - 1: optional double followScore(personalDataType = 'CountOfFollowersAndFollowees') - - // first compute followScore as defined above - // then compute L2 norm of the vector of these scores for this cluster - // divide by that. - // essentially the more people are interested in this cluster, the lower this score gets - // TIP: Use this score if your use case needs to penalize clusters that a lot of other - // users are also interested in - 2: optional double followScoreClusterNormalizedOnly(personalDataType = 'CountOfFollowersAndFollowees') - - // dot product of user's producer normalized follow vector and knownFor vector for this cluster - // i.e. i^th entry in the normalized follow vector = 1.0/sqrt(number of followers of user i) - // TIP: Use this score if your use case needs to penalize clusters where the users known for - // that cluster are popular. - 3: optional double followScoreProducerNormalizedOnly(personalDataType = 'CountOfFollowersAndFollowees') - - // first compute followScoreProducerNormalizedOnly - // then compute L2 norm of the vector of these scores for this cluster - // divide by that. - // essentially the more people are interested in this cluster, the lower this score gets - // TIP: Use this score if your use case needs to penalize both clusters that a lot of other - // users are interested in, as well as clusters where the users known for that cluster are - // popular. - 4: optional double followScoreClusterAndProducerNormalized(personalDataType = 'CountOfFollowersAndFollowees') - - // dot product of user's favScoreHalfLife100Days vector with knownFor vector for this cluster - // TIP: By default, use this score or followScore. - 5: optional double favScore(personalDataType = 'EngagementsPublic') - - // first compute favScore as defined above - // then compute L2 norm of the vector of these scores for this cluster - // divide by that. - // essentially the more people are interested in this cluster, the lower this score gets - // TIP: Use this score if your use case needs to penalize clusters that a lot of other - // users are also interested in - 6: optional double favScoreClusterNormalizedOnly(personalDataType = 'EngagementsPublic') - - // dot product of user's favScoreHalfLife100DaysNormalizedByNeighborFaversL2 vector with - // knownFor vector for this cluster - // TIP: Use this score if your use case needs to penalize clusters where the users known for - // that cluster are popular. - 7: optional double favScoreProducerNormalizedOnly(personalDataType = 'EngagementsPublic') - - // first compute favScoreProducerNormalizedOnly as defined above - // then compute L2 norm of the vector of these scores for this cluster - // divide by that. - // essentially the more people are interested in this cluster, the lower this score gets - // TIP: Use this score if your use case needs to penalize both clusters that a lot of other - // users are interested in, as well as clusters where the users known for that cluster are - // popular. - 8: optional double favScoreClusterAndProducerNormalized(personalDataType = 'EngagementsPublic') - - // list of users who're known for this cluster as well as are being followed by the user. - 9: optional list usersBeingFollowed(personalDataType = 'UserId') - - // list of users who're known for this cluster as well as were faved at some point by the user. - 10: optional list usersThatWereFaved(personalDataType = 'UserId') - - // A pretty close upper bound on the number of users who are interested in this cluster. - // Useful to know if this is a niche community or a popular topic. - 11: optional i32 numUsersInterestedInThisClusterUpperBound - - // dot product of user's logFavScore vector with knownFor vector for this cluster - // TIP: this score is under experimentations - 12: optional double logFavScore(personalDataType = 'EngagementsPublic') - - // first compute logFavScore as defined above - // then compute L2 norm of the vector of these scores for this cluster - // divide by that. - // essentially the more people are interested in this cluster, the lower this score gets - // TIP: this score is under experimentations - 13: optional double logFavScoreClusterNormalizedOnly(personalDataType = 'EngagementsPublic') - - // actual count of number of users who're known for this cluster as well as are being followed by the user. - 14: optional i32 numUsersBeingFollowed - - // actual count of number of users who're known for this cluster as well as were faved at some point by the user. - 15: optional i32 numUsersThatWereFaved -}(persisted = 'true', hasPersonalData = 'true') - -struct UserToInterestedInClusters { - 1: required i64 userId(personalDataType = 'UserId') - 2: required string knownForModelVersion - 3: required map clusterIdToScores(personalDataTypeKey = 'InferredInterests') -}(persisted="true", hasPersonalData = 'true') - -struct LanguageToClusters { - 1: required string language - 2: required string knownForModelVersion - 3: required map clusterIdToScores(personalDataTypeKey = 'InferredInterests') -}(persisted="true", hasPersonalData = 'true') - -struct ClustersUserIsInterestedIn { - 1: required string knownForModelVersion - 2: required map clusterIdToScores(personalDataTypeKey = 'InferredInterests') -}(persisted = 'true', hasPersonalData = 'true') - -struct UserToKnownForClusters { - 1: required i64 userId(personalDataType = 'UserId') - 2: required string knownForModelVersion - 3: required map clusterIdToScores(personalDataTypeKey = 'InferredInterests') -}(persisted="true", hasPersonalData = 'true') - -struct UserToKnownForClusterScores { - 1: optional double knownForScore -}(persisted = 'true', hasPersonalData = 'false') - -struct ClustersUserIsKnownFor { - 1: required string knownForModelVersion - 2: required map clusterIdToScores(personalDataTypeKey = 'InferredInterests') -}(persisted = 'true', hasPersonalData = 'true') - -/** Thrift struct for storing quantile bounds output by QTreeMonoid in Algebird */ -struct QuantileBounds { - 1: required double lowerBound - 2: required double upperBound -}(persisted = 'true', hasPersonalData = 'false') - -/** Thrift struct giving the details of the distribution of a set of doubles */ -struct DistributionDetails { - 1: required double mean - 2: optional double standardDeviation - 3: optional double min - 4: optional QuantileBounds p25 - 5: optional QuantileBounds p50 - 6: optional QuantileBounds p75 - 7: optional QuantileBounds p95 - 8: optional double max -}(persisted = 'true', hasPersonalData = 'false') - -/** Note that the modelVersion here is specified somewhere outside, specifically, as part of the key */ -struct ClusterNeighbor { - 1: required i32 clusterId - /** Note that followCosineSimilarity is same as dot product over followScoreClusterNormalizedOnly - * since those scores form a unit vector **/ - 2: optional double followCosineSimilarity - /** Note that favCosineSimilarity is same as dot product over favScoreClusterNormalizedOnly - * since those scores form a unit vector **/ - 3: optional double favCosineSimilarity - /** Note that logFavCosineSimilarity is same as dot product over logFavScoreClusterNormalizedOnly - * since those scores form a unit vector **/ - 4: optional double logFavCosineSimilarity -}(persisted = 'true', hasPersonalData = 'false') - -/** Useful for storing the list of users known for a cluster */ -struct UserWithScore { - 1: required i64 userId(personalDataType = 'UserId') - 2: required double score -}(persisted="true", hasPersonalData = 'true') - -// deprecated -struct EdgeCut { - 1: required double cutEdges - 2: required double totalVolume -}(persisted = 'true', hasPersonalData = 'false') - -struct ClusterQuality { - // deprecated - 1: optional EdgeCut deprecated_unweightedEdgeCut - // deprecated - 2: optional EdgeCut deprecated_edgeWeightedCut - // deprecated - 3: optional EdgeCut deprecated_nodeAndEdgeWeightedCut - - // correlation of actual weight of (u, v) with I(u & v in same cluster) * score(u) * score(v) - 4: optional double weightAndProductOfNodeScoresCorrelation - - // fraction of edges staying inside cluster divided by total edges from nodes in the cluster - 5: optional double unweightedRecall - - // fraction of edge weights staying inside cluster divided by total edge weights from nodes in the cluster - 6: optional double weightedRecall - - // total edges from nodes in the cluster - 7: optional double unweightedRecallDenominator - - // total edge weights from nodes in the cluster - 8: optional double weightedRecallDenominator - - // sum of edge weights inside cluster / { #nodes * (#nodes - 1) } - 9: optional double relativePrecisionNumerator - - // above divided by the sum of edge weights in the total graph / { n * (n - 1) } - 10: optional double relativePrecision -}(persisted = 'true', hasPersonalData = 'false') - -/** -* This struct is the value of the ClusterDetails key-value dataset. -* The key is (modelVersion, clusterId) -**/ -struct ClusterDetails { - 1: required i32 numUsersWithAnyNonZeroScore - 2: required i32 numUsersWithNonZeroFollowScore - 3: required i32 numUsersWithNonZeroFavScore - 4: optional DistributionDetails followScoreDistributionDetails - 5: optional DistributionDetails favScoreDistributionDetails - 6: optional list knownForUsersAndScores - 7: optional list neighborClusters - // fraction of users who're known for this cluster who're marked NSFW_User in UserSource - 8: optional double fractionKnownForMarkedNSFWUser - // the major languages that this cluster's known_fors have as their "language" field in - // UserSource, and the fractions - 9: optional map languageToFractionDeviceLanguage - // the major country codes that this cluster's known_fors have as their "account_country_code" - // field in UserSource, and the fractions - 10: optional map countryCodeToFractionKnownForWithCountryCode - 11: optional ClusterQuality qualityMeasuredOnSimsGraph - 12: optional DistributionDetails logFavScoreDistributionDetails - // fraction of languages this cluster's known_fors produce based on what penguin_user_languages dataset infers - 13: optional map languageToFractionInferredLanguage -}(persisted="true", hasPersonalData = 'true') - -struct SampledEdge { - 1: required i64 followerId(personalDataType = 'UserId') - 2: required i64 followeeId(personalDataType = 'UserId') - 3: optional double favWtIfFollowEdge - 4: optional double favWtIfFavEdge - 5: optional double followScoreToCluster - 6: optional double favScoreToCluster - 7: optional double predictedFollowScore - 8: optional double predictedFavScore -}(persisted="true", hasPersonalData = 'true') - -/** -* The key here is (modelVersion, clusterId) -**/ -struct BipartiteClusterQuality { - 1: optional double inClusterFollowEdges - 2: optional double inClusterFavEdges - 3: optional double favWtSumOfInClusterFollowEdges - 4: optional double favWtSumOfInClusterFavEdges - 5: optional double outgoingFollowEdges - 6: optional double outgoingFavEdges - 7: optional double favWtSumOfOutgoingFollowEdges - 8: optional double favWtSumOfOutgoingFavEdges - 9: optional double incomingFollowEdges - 10: optional double incomingFavEdges - 11: optional double favWtSumOfIncomingFollowEdges - 12: optional double favWtSumOfIncomingFavEdges - 13: optional i32 interestedInSize - 14: optional list sampledEdges - 15: optional i32 knownForSize - 16: optional double correlationOfFavWtIfFollowWithPredictedFollow - 17: optional double correlationOfFavWtIfFavWithPredictedFav - 18: optional double relativePrecisionUsingFavWtIfFav - 19: optional double averagePrecisionOfWholeGraphUsingFavWtIfFav -}(persisted="true", hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/multi_type_graph.docx b/src/thrift/com/twitter/simclusters_v2/multi_type_graph.docx new file mode 100644 index 000000000..f2a06a2a6 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/multi_type_graph.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/multi_type_graph.thrift b/src/thrift/com/twitter/simclusters_v2/multi_type_graph.thrift deleted file mode 100644 index f7dee7381..000000000 --- a/src/thrift/com/twitter/simclusters_v2/multi_type_graph.thrift +++ /dev/null @@ -1,110 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.multi_type_graph -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "entity.thrift" - -union LeftNode { - 1: i64 userId(personalDataType = 'UserId') -}(persisted = 'true', hasPersonalData = 'true') - -struct RightNode { - 1: required RightNodeType rightNodeType(personalDataType = 'EngagementsPublic') - 2: required Noun noun -}(persisted = 'true', hasPersonalData = 'true') - -struct RightNodeWithEdgeWeight { - 1: required RightNode rightNode - 2: required double weight(personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -enum RightNodeType { - FollowUser = 1, - FavUser = 2, - BlockUser = 3, - AbuseReportUser = 4, - SpamReportUser = 5, - FollowTopic = 6, - SignUpCountry = 7, - ConsumedLanguage = 8, - FavTweet = 9, - ReplyTweet = 10, - RetweetTweet = 11, - NotifOpenOrClickTweet = 12, - SearchQuery = 13 -}(persisted = 'true') - -union Noun { -// Note: Each of the following needs to have an ordering defined in Ordering[Noun] -// in file: multi_type_graph/assemble_multi_type_graph/AssembleMultiTypeGraph.scala -// Please take note to make changes to Ordering[Noun] when modifying/adding new noun type here - 1: i64 userId(personalDataType = 'UserId') - 2: string country(personalDataType = 'InferredCountry') - 3: string language(personalDataType = 'InferredLanguage') - 4: i64 topicId(personalDataType = 'TopicFollow') - 5: i64 tweetId(personalDataType = 'TweetId') - 6: string query(personalDataType = 'SearchQuery') -}(persisted = 'true', hasPersonalData = 'true') - -struct RightNodeWithEdgeWeightList { - 1: required list rightNodeWithEdgeWeightList -}(persisted = 'true', hasPersonalData = 'true') - -struct NounWithFrequency { - 1: required Noun noun - 2: required double frequency (personalDataType = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') - -struct NounWithFrequencyList { - 1: required list nounWithFrequencyList -}(persisted = 'true', hasPersonalData = 'true') - -struct RightNodeTypeStruct { - 1: required RightNodeType rightNodeType -}(persisted = 'true', hasPersonalData = 'false') - -struct MultiTypeGraphEdge{ - 1: required LeftNode leftNode - 2: required RightNodeWithEdgeWeight rightNodeWithEdgeWeight -}(persisted = 'true', hasPersonalData = 'true') - -struct LeftNodeToRightNodeWithEdgeWeightList{ - 1: required LeftNode leftNode - 2: required RightNodeWithEdgeWeightList rightNodeWithEdgeWeightList -}(persisted = 'true', hasPersonalData = 'true') - -struct RightNodeSimHashSketch { - 1: required RightNode rightNode - 2: required list simHashOfEngagers - 3: optional double normalizer -}(persisted='true', hasPersonalData = 'false') - -struct SimilarRightNode { - 1: required RightNode rightNode - 2: required double score (personalDataType = 'EngagementScore') -}(persisted='true', hasPersonalData = 'true') - -struct SimilarRightNodes { - 1: required list rightNodesWithScores -}(persisted='true', hasPersonalData = 'true') - -struct RightNodeWithScore { - 1: required RightNode rightNode - 2: required double clusterScore (personalDataType = 'EngagementScore') -}(persisted='true', hasPersonalData = 'true') - -struct RightNodeWithScoreList { - 1: required list rightNodeWithScoreList -}(persisted='true', hasPersonalData = 'true') - -struct RightNodeWithClusters { - 1: required RightNode rightNode - 2: required string modelVersion (personalDataType = 'EngagementId') - 3: required map clusterIdToScores (personalDataTypeKey = 'EngagementId', personalDataTypeValue = 'EngagementScore') -}(persisted="true", hasPersonalData = 'true') - -struct ModelVersionWithClusterScores { - 1: required string modelVersion (personalDataType = 'EngagementId') - 2: required map clusterIdToScores (personalDataTypeKey = 'EngagementId', personalDataTypeValue = 'EngagementScore') -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/offline_job_internal.docx b/src/thrift/com/twitter/simclusters_v2/offline_job_internal.docx new file mode 100644 index 000000000..9ef45e643 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/offline_job_internal.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/offline_job_internal.thrift b/src/thrift/com/twitter/simclusters_v2/offline_job_internal.thrift deleted file mode 100644 index 257ef1f99..000000000 --- a/src/thrift/com/twitter/simclusters_v2/offline_job_internal.thrift +++ /dev/null @@ -1,63 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.offline_job_internal -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "com/twitter/algebird_internal/algebird.thrift" - -// For internal usage only. Mainly for offline_evaluation. -// Deprecated. Please use 'online_store/ModelVersion' -enum PersistedModelVersion { - MODEL_20M_145K_dec11 = 1, - MODEL_20M_145K_updated = 2, - MODEL_20M_145K_2020 = 3, - RESERVED_4 = 4, - RESERVED_5 = 5 -}(persisted = 'true', hasPersonalData = 'false') - -enum PersistedScoreType { - NORMALIZED_FAV_8_HR_HALF_LIFE = 1, - NORMALIZED_FOLLOW_8_HR_HALF_LIFE = 2, - NORMALIZED_LOG_FAV_8_HR_HALF_LIFE = 3, - RESERVED_4 = 4, - RESERVED_5 = 5 -}(persisted = 'true', hasPersonalData = 'false') - -struct PersistedScores { - 1: optional algebird.DecayedValue score -}(persisted = 'true', hasPersonalData = 'false') - -struct TweetAndClusterScores { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: required i32 clusterId(personalDataType = 'InferredInterests') - 3: required PersistedModelVersion modelVersion - 4: required PersistedScores scores(personalDataType = 'EngagementScore') - 5: optional PersistedScoreType scoreType -}(persisted="true", hasPersonalData = 'true') - -struct TweetTopKClustersWithScores { - 1: required i64 tweetId(personalDataType = 'TweetId') - 2: required PersistedModelVersion modelVersion - 3: required map topKClusters(personalDataTypeKey = 'InferredInterests') - 4: optional PersistedScoreType scoreType -}(persisted="true", hasPersonalData = 'true') - -struct ClusterTopKTweetsWithScores { - 1: required i32 clusterId(personalDataType = 'InferredInterests') - 2: required PersistedModelVersion modelVersion - 3: required map topKTweets(personalDataTypeKey = 'TweetId') - 4: optional PersistedScoreType scoreType -}(persisted = 'true', hasPersonalData = 'true') - -struct QueryAndClusterScores { - 1: required string query(personalDataType = 'SearchQuery') - 2: required i32 clusterId - 3: required PersistedModelVersion modelVersion - 4: required PersistedScores scores -}(persisted = 'true', hasPersonalData = 'true') - -struct QueryTopKClustersWithScores { - 1: required string query(personalDataType = 'SearchQuery') - 2: required PersistedModelVersion modelVersion - 3: required map topKClusters -}(persisted = 'true', hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/online_store.docx b/src/thrift/com/twitter/simclusters_v2/online_store.docx new file mode 100644 index 000000000..342de0c1d Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/online_store.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/online_store.thrift b/src/thrift/com/twitter/simclusters_v2/online_store.thrift deleted file mode 100644 index fb5aff6ad..000000000 --- a/src/thrift/com/twitter/simclusters_v2/online_store.thrift +++ /dev/null @@ -1,92 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.online_store -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "entity.thrift" -include "com/twitter/algebird_internal/algebird.thrift" - -/** - * A SimClusters model version. - **/ -enum ModelVersion { - MODEL_20M_145K_dec11 = 1, // DEPRECATED - MODEL_20M_145K_updated = 2, // DEPRECATED - MODEL_20M_145K_2020 = 3, - RESERVED_4 = 4, - RESERVED_5 = 5, - RESERVED_6 = 6 -}(persisted = 'true', hasPersonalData = 'false') - -/** - * Uniquely identifies a SimCluster. All fields are required as this is used as a memcache key. - **/ -struct FullClusterId { - 1: required ModelVersion modelVersion - 2: required i32 clusterId -}(persisted='true', hasPersonalData = 'false') - -/** - * Contains a set of scores per cluster. - **/ -struct Scores { - 1: optional algebird.DecayedValue favClusterNormalized8HrHalfLifeScore - 2: optional algebird.DecayedValue followClusterNormalized8HrHalfLifeScore -}(hasPersonalData = 'false') - -/** - * A combination of entity and model. All fields are required as this is used as a memcache key. - **/ -struct EntityWithVersion { - 1: required entity.SimClusterEntity entity - 2: required ModelVersion version -}(hasPersonalData = 'true') - -/** - * Contains top K clusters with corresponding scores. We're representing clusters purely using ints, and - * omitting the modelVersion, since that is included in the memcache key. - **/ -struct TopKClustersWithScores { - 1: optional map topClustersByFavClusterNormalizedScore(personalDataTypeKey = 'InferredInterests') - 2: optional map topClustersByFollowClusterNormalizedScore(personalDataTypeKey = 'InferredInterests') -}(hasPersonalData = 'true') - -/** - * Contains top K text entities with corresponding scores. We're omitting the modelVersion, - * since that is included in the memcache key. - **/ -struct TopKEntitiesWithScores { - 1: optional map topEntitiesByFavClusterNormalizedScore - 2: optional map topEntitiesByFollowClusterNormalizedScore -}(hasPersonalData = 'true') - -/** - * Contains top K tweets with corresponding scores. We're omitting the modelVersion, - * since that is included in the memcache key. - **/ -struct TopKTweetsWithScores { - 1: optional map topTweetsByFavClusterNormalizedScore(personalDataTypeKey='TweetId') - 2: optional map topTweetsByFollowClusterNormalizedScore(personalDataTypeKey='TweetId') -}(hasPersonalData = 'true') - -/** - * Contains FullClusterId and the corresponding top K tweets and scores. - **/ -struct ClusterIdToTopKTweetsWithScores { - 1: required FullClusterId clusterId - 2: required TopKTweetsWithScores topKTweetsWithScores -}(hasPersonalData = 'true') - -/** - * Contains a map of Model Version to top K clusters with corresponding scores. - **/ -struct MultiModelTopKClustersWithScores { - 1: optional map multiModelTopKClustersWithScores -}(hasPersonalData = 'true') - -/** - * Contains a map of Model Version top K tweets with corresponding scores. - **/ -struct MultiModelTopKTweetsWithScores { - 1: optional map multiModelTopKTweetsWithScores -}(hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/online_store_internal.docx b/src/thrift/com/twitter/simclusters_v2/online_store_internal.docx new file mode 100644 index 000000000..85aa50dda Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/online_store_internal.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/online_store_internal.thrift b/src/thrift/com/twitter/simclusters_v2/online_store_internal.thrift deleted file mode 100644 index b5fd6afb9..000000000 --- a/src/thrift/com/twitter/simclusters_v2/online_store_internal.thrift +++ /dev/null @@ -1,30 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.online_store_internal -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "online_store.thrift" - -/** - * Contains a hash bucket of the clusterId along with the Model Version. - * All fields are required as this is used as a memcache key. - **/ -struct FullClusterIdBucket { - 1: required online_store.ModelVersion modelVersion - // (hash(clusterId) mod NUM_BUCKETS_XXXXXX) - 2: required i32 bucket -}(hasPersonalData = 'false') - -/** - * Contains scores per clusters. The model is not stored here as it's encoded into the memcache key. - **/ -struct ClustersWithScores { - 1: optional map clustersToScore(personalDataTypeKey = 'InferredInterests') -}(hasPersonalData = 'true') - -/** - * Contains a map of model version to scores per clusters. - **/ -struct MultiModelClustersWithScores { - 1: optional map multiModelClustersWithScores -}(hasPersonalData = 'true') diff --git a/src/thrift/com/twitter/simclusters_v2/score.docx b/src/thrift/com/twitter/simclusters_v2/score.docx new file mode 100644 index 000000000..a9a3b842c Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/score.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/score.thrift b/src/thrift/com/twitter/simclusters_v2/score.thrift deleted file mode 100644 index 8ee20e72c..000000000 --- a/src/thrift/com/twitter/simclusters_v2/score.thrift +++ /dev/null @@ -1,71 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.score -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "com/twitter/simclusters_v2/embedding.thrift" -include "com/twitter/simclusters_v2/identifier.thrift" - -/** - * The algorithm type to identify the score algorithm. - * Assume that a algorithm support and only support one kind - * of [[ScoreInternalId]] - **/ -enum ScoringAlgorithm { - // Reserve 0001 - 999 for Basic Pairwise Scoring Calculation - PairEmbeddingDotProduct = 1, - PairEmbeddingCosineSimilarity = 2, - PairEmbeddingJaccardSimilarity = 3, - PairEmbeddingEuclideanDistance = 4, - PairEmbeddingManhattanDistance = 5, - PairEmbeddingLogCosineSimilarity = 6, - PairEmbeddingExpScaledCosineSimilarity = 7, - - // Reserve 1000 - 1999 for Tweet Similarity Model - TagSpaceCosineSimilarity = 1000, - WeightedSumTagSpaceRankingExperiment1 = 1001, //deprecated - WeightedSumTagSpaceRankingExperiment2 = 1002, //deprecated - WeightedSumTagSpaceANNExperiment = 1003, //deprecated - - // Reserved for 10001 - 20000 for Aggregate scoring - WeightedSumTopicTweetRanking = 10001, - CortexTopicTweetLabel = 10002, - // Reserved 20001 - 30000 for Topic Tweet scores - CertoNormalizedDotProductScore = 20001, - CertoNormalizedCosineScore = 20002 -}(hasPersonalData = 'false') - -/** - * The identifier type for the score between a pair of SimClusters Embedding. - * Used as the persistent key of a SimClustersEmbedding score. - * Support score between different [[EmbeddingType]] / [[ModelVersion]] - **/ -struct SimClustersEmbeddingPairScoreId { - 1: required identifier.SimClustersEmbeddingId id1 - 2: required identifier.SimClustersEmbeddingId id2 -}(hasPersonalData = 'true') - -/** - * The identifier type for the score between a pair of InternalId. - **/ -struct GenericPairScoreId { - 1: required identifier.InternalId id1 - 2: required identifier.InternalId id2 -}(hasPersonalData = 'true') - -union ScoreInternalId { - 1: GenericPairScoreId genericPairScoreId - 2: SimClustersEmbeddingPairScoreId simClustersEmbeddingPairScoreId -} - -/** - * A uniform Identifier type for all kinds of Calculation Score - **/ -struct ScoreId { - 1: required ScoringAlgorithm algorithm - 2: required ScoreInternalId internalId -}(hasPersonalData = 'true') - -struct Score { - 1: required double score -}(hasPersonalData = 'false') diff --git a/src/thrift/com/twitter/simclusters_v2/simclusters_presto.docx b/src/thrift/com/twitter/simclusters_v2/simclusters_presto.docx new file mode 100644 index 000000000..a7d872272 Binary files /dev/null and b/src/thrift/com/twitter/simclusters_v2/simclusters_presto.docx differ diff --git a/src/thrift/com/twitter/simclusters_v2/simclusters_presto.thrift b/src/thrift/com/twitter/simclusters_v2/simclusters_presto.thrift deleted file mode 100644 index 93eae6c62..000000000 --- a/src/thrift/com/twitter/simclusters_v2/simclusters_presto.thrift +++ /dev/null @@ -1,59 +0,0 @@ -namespace java com.twitter.simclusters_v2.thriftjava -namespace py gen.twitter.simclusters_v2.simclusters_presto -#@namespace scala com.twitter.simclusters_v2.thriftscala -#@namespace strato com.twitter.simclusters_v2 - -include "embedding.thrift" -include "identifier.thrift" -include "interests.thrift" -include "online_store.thrift" - -/** - * This struct is the presto-compatible "lite" version of the ClusterDetails thrift - */ -struct ClusterDetailsLite { - 1: required online_store.FullClusterId fullClusterId - 2: required i32 numUsersWithAnyNonZeroScore - 3: required i32 numUsersWithNonZeroFollowScore - 4: required i32 numUsersWithNonZeroFavScore - 5: required list knownForUsersAndScores -}(persisted="true", hasPersonalData = 'true') - -struct EmbeddingsLite { - 1: required i64 entityId - 2: required i32 clusterId - 3: required double score -}(persisted="true", hasPersonalData = 'true') - -struct SimClustersEmbeddingWithId { - 1: required identifier.SimClustersEmbeddingId embeddingId - 2: required embedding.SimClustersEmbedding embedding -}(persisted="true", hasPersonalData = 'true') - -struct InternalIdEmbeddingWithId { - 1: required identifier.SimClustersEmbeddingId embeddingId - 2: required embedding.InternalIdEmbedding embedding -}(persisted="true", hasPersonalData = 'true') - -/** -* This struct is the presto-compatible version of the fav_tfg_topic_embeddings -*/ -struct ClustersScore { - 1: required i64 clusterId(personalDataType = 'SemanticcoreClassification') - 2: required double score(personalDataType = 'EngagementScore') -}(persisted="true", hasPersonalData = 'true') - -struct FavTfgTopicEmbeddings { - 1: required identifier.TopicId topicId - 2: required list clusterScore -}(persisted="true", hasPersonalData = 'true') - -struct TfgTopicEmbeddings { - 1: required identifier.TopicId topicId - 2: required list clusterScore -}(persisted="true", hasPersonalData = 'true') - -struct UserTopicWeightedEmbedding { - 1: required i64 userId(personalDataType = 'UserId') - 2: required list clusterScore -}(persisted="true", hasPersonalData = 'true')