diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx
new file mode 100644
index 000000000..5135f363e
Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx differ
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala
deleted file mode 100644
index dd00ea126..000000000
--- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala
+++ /dev/null
@@ -1,39 +0,0 @@
-package com.twitter.representation_manager.store
-
-import com.twitter.servo.decider.DeciderKeyEnum
-
-object DeciderConstants {
- // Deciders inherited from CR and RSX and only used in LegacyRMS
- // Their value are manipulated by CR and RSX's yml file and their decider dashboard
- // We will remove them after migration completed
- val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore =
- "enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore"
-
- val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore =
- "enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore"
-
- val enablelogFavBased20M145K2020TweetEmbeddingStoreTimeouts =
- "enable_log_fav_based_tweet_embedding_20m145k2020_timeouts"
- val logFavBased20M145K2020TweetEmbeddingStoreTimeoutValueMillis =
- "log_fav_based_tweet_embedding_20m145k2020_timeout_value_millis"
-
- val enablelogFavBased20M145KUpdatedTweetEmbeddingStoreTimeouts =
- "enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts"
- val logFavBased20M145KUpdatedTweetEmbeddingStoreTimeoutValueMillis =
- "log_fav_based_tweet_embedding_20m145kUpdated_timeout_value_millis"
-
- val enableSimClustersEmbeddingStoreTimeouts = "enable_sim_clusters_embedding_store_timeouts"
- val simClustersEmbeddingStoreTimeoutValueMillis =
- "sim_clusters_embedding_store_timeout_value_millis"
-}
-
-// Necessary for using servo Gates
-object DeciderKey extends DeciderKeyEnum {
- val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore: Value = Value(
- DeciderConstants.enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore
- )
-
- val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore: Value = Value(
- DeciderConstants.enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore
- )
-}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx
new file mode 100644
index 000000000..f28e4a3b8
Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx differ
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala
deleted file mode 100644
index cc6485b79..000000000
--- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala
+++ /dev/null
@@ -1,198 +0,0 @@
-package com.twitter.representation_manager.store
-
-import com.twitter.contentrecommender.store.ApeEntityEmbeddingStore
-import com.twitter.contentrecommender.store.InterestsOptOutStore
-import com.twitter.contentrecommender.store.SemanticCoreTopicSeedStore
-import com.twitter.conversions.DurationOps._
-import com.twitter.escherbird.util.uttclient.CachedUttClientV2
-import com.twitter.finagle.memcached.Client
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.store.strato.StratoFetchableStore
-import com.twitter.frigate.common.util.SeqLongInjection
-import com.twitter.hermit.store.common.ObservedCachedReadableStore
-import com.twitter.hermit.store.common.ObservedMemcachedReadableStore
-import com.twitter.hermit.store.common.ObservedReadableStore
-import com.twitter.interests.thriftscala.InterestsThriftService
-import com.twitter.representation_manager.common.MemCacheConfig
-import com.twitter.representation_manager.common.RepresentationManagerDecider
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.TopicId
-import com.twitter.simclusters_v2.thriftscala.LocaleEntityId
-import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
-import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
-import com.twitter.storehaus.ReadableStore
-import com.twitter.strato.client.{Client => StratoClient}
-import com.twitter.tweetypie.util.UserId
-import javax.inject.Inject
-
-class TopicSimClustersEmbeddingStore @Inject() (
- stratoClient: StratoClient,
- cacheClient: Client,
- globalStats: StatsReceiver,
- mhMtlsParams: ManhattanKVClientMtlsParams,
- rmsDecider: RepresentationManagerDecider,
- interestService: InterestsThriftService.MethodPerEndpoint,
- uttClient: CachedUttClientV2) {
-
- private val stats = globalStats.scope(this.getClass.getSimpleName)
- private val interestsOptOutStore = InterestsOptOutStore(interestService)
-
- /**
- * Note this is NOT an embedding store. It is a list of author account ids we use to represent
- * topics
- */
- private val semanticCoreTopicSeedStore: ReadableStore[
- SemanticCoreTopicSeedStore.Key,
- Seq[UserId]
- ] = {
- /*
- Up to 1000 Long seeds per topic/language = 62.5kb per topic/language (worst case)
- Assume ~10k active topic/languages ~= 650MB (worst case)
- */
- val underlying = new SemanticCoreTopicSeedStore(uttClient, interestsOptOutStore)(
- stats.scope("semantic_core_topic_seed_store"))
-
- val memcacheStore = ObservedMemcachedReadableStore.fromCacheClient(
- backingStore = underlying,
- cacheClient = cacheClient,
- ttl = 12.hours)(
- valueInjection = SeqLongInjection,
- statsReceiver = stats.scope("topic_producer_seed_store_mem_cache"),
- keyToString = { k => s"tpss:${k.entityId}_${k.languageCode}" }
- )
-
- ObservedCachedReadableStore.from[SemanticCoreTopicSeedStore.Key, Seq[UserId]](
- store = memcacheStore,
- ttl = 6.hours,
- maxKeys = 20e3.toInt,
- cacheName = "topic_producer_seed_store_cache",
- windowSize = 5000
- )(stats.scope("topic_producer_seed_store_cache"))
- }
-
- private val favBasedTfgTopicEmbedding20m145k2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- StratoFetchableStore
- .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
- stratoClient,
- "recommendations/simclusters_v2/embeddings/favBasedTFGTopic20M145K2020").mapValues(
- embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift)
- .composeKeyMapping[LocaleEntityId] { localeEntityId =>
- SimClustersEmbeddingId(
- FavTfgTopic,
- Model20m145k2020,
- InternalId.LocaleEntityId(localeEntityId))
- }
-
- buildLocaleEntityIdMemCacheStore(rawStore, FavTfgTopic, Model20m145k2020)
- }
-
- private val logFavBasedApeEntity20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val apeStore = StratoFetchableStore
- .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
- stratoClient,
- "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020")
- .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50))
- .composeKeyMapping[UserId]({ id =>
- SimClustersEmbeddingId(
- AggregatableLogFavBasedProducer,
- Model20m145k2020,
- InternalId.UserId(id))
- })
- val rawStore = new ApeEntityEmbeddingStore(
- semanticCoreSeedStore = semanticCoreTopicSeedStore,
- aggregatableProducerEmbeddingStore = apeStore,
- statsReceiver = stats.scope("log_fav_based_ape_entity_2020_embedding_store"))
- .mapValues(embedding => SimClustersEmbedding(embedding.toThrift, truncate = 50).toThrift)
- .composeKeyMapping[TopicId] { topicId =>
- SimClustersEmbeddingId(
- LogFavBasedKgoApeTopic,
- Model20m145k2020,
- InternalId.TopicId(topicId))
- }
-
- buildTopicIdMemCacheStore(rawStore, LogFavBasedKgoApeTopic, Model20m145k2020)
- }
-
- private def buildTopicIdMemCacheStore(
- rawStore: ReadableStore[TopicId, ThriftSimClustersEmbedding],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val observedStore: ObservedReadableStore[TopicId, ThriftSimClustersEmbedding] =
- ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.TopicId(topicId)) =>
- topicId
- }
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- storeWithKeyMapping,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private def buildLocaleEntityIdMemCacheStore(
- rawStore: ReadableStore[LocaleEntityId, ThriftSimClustersEmbedding],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val observedStore: ObservedReadableStore[LocaleEntityId, ThriftSimClustersEmbedding] =
- ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.LocaleEntityId(localeEntityId)) =>
- localeEntityId
- }
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- storeWithKeyMapping,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private val underlyingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = Map(
- // Topic Embeddings
- (FavTfgTopic, Model20m145k2020) -> favBasedTfgTopicEmbedding20m145k2020Store,
- (LogFavBasedKgoApeTopic, Model20m145k2020) -> logFavBasedApeEntity20M145K2020EmbeddingStore,
- )
-
- val topicSimClustersEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- SimClustersEmbeddingStore.buildWithDecider(
- underlyingStores = underlyingStores,
- decider = rmsDecider.decider,
- statsReceiver = stats
- )
- }
-
-}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx
new file mode 100644
index 000000000..ddeb7e935
Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx differ
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala
deleted file mode 100644
index 857e38649..000000000
--- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala
+++ /dev/null
@@ -1,141 +0,0 @@
-package com.twitter.representation_manager.store
-
-import com.twitter.finagle.memcached.Client
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.hermit.store.common.ObservedReadableStore
-import com.twitter.representation_manager.common.MemCacheConfig
-import com.twitter.representation_manager.common.RepresentationManagerDecider
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
-import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
-import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
-import com.twitter.storehaus.ReadableStore
-import javax.inject.Inject
-
-class TweetSimClustersEmbeddingStore @Inject() (
- cacheClient: Client,
- globalStats: StatsReceiver,
- mhMtlsParams: ManhattanKVClientMtlsParams,
- rmsDecider: RepresentationManagerDecider) {
-
- private val stats = globalStats.scope(this.getClass.getSimpleName)
-
- val logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- PersistentTweetEmbeddingStore
- .longestL2NormTweetEmbeddingStoreManhattan(
- mhMtlsParams,
- PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset,
- stats
- ).mapValues(_.toThrift)
-
- buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145kUpdated)
- }
-
- val logFavBasedLongestL2Tweet20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- PersistentTweetEmbeddingStore
- .longestL2NormTweetEmbeddingStoreManhattan(
- mhMtlsParams,
- PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset,
- stats
- ).mapValues(_.toThrift)
-
- buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145k2020)
- }
-
- val logFavBased20M145KUpdatedTweetEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- PersistentTweetEmbeddingStore
- .mostRecentTweetEmbeddingStoreManhattan(
- mhMtlsParams,
- PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset,
- stats
- ).mapValues(_.toThrift)
-
- buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145kUpdated)
- }
-
- val logFavBased20M145K2020TweetEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- PersistentTweetEmbeddingStore
- .mostRecentTweetEmbeddingStoreManhattan(
- mhMtlsParams,
- PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset,
- stats
- ).mapValues(_.toThrift)
-
- buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145k2020)
- }
-
- private def buildMemCacheStore(
- rawStore: ReadableStore[TweetId, ThriftSimClustersEmbedding],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val observedStore: ObservedReadableStore[TweetId, ThriftSimClustersEmbedding] =
- ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.TweetId(tweetId)) =>
- tweetId
- }
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- storeWithKeyMapping,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private val underlyingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = Map(
- // Tweet Embeddings
- (LogFavBasedTweet, Model20m145kUpdated) -> logFavBased20M145KUpdatedTweetEmbeddingStore,
- (LogFavBasedTweet, Model20m145k2020) -> logFavBased20M145K2020TweetEmbeddingStore,
- (
- LogFavLongestL2EmbeddingTweet,
- Model20m145kUpdated) -> logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore,
- (
- LogFavLongestL2EmbeddingTweet,
- Model20m145k2020) -> logFavBasedLongestL2Tweet20M145K2020EmbeddingStore,
- )
-
- val tweetSimClustersEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- SimClustersEmbeddingStore.buildWithDecider(
- underlyingStores = underlyingStores,
- decider = rmsDecider.decider,
- statsReceiver = stats
- )
- }
-
-}
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx
new file mode 100644
index 000000000..ec6c2e279
Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx differ
diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala
deleted file mode 100644
index b416d9b17..000000000
--- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala
+++ /dev/null
@@ -1,602 +0,0 @@
-package com.twitter.representation_manager.store
-
-import com.twitter.contentrecommender.twistly
-import com.twitter.finagle.memcached.Client
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.store.strato.StratoFetchableStore
-import com.twitter.hermit.store.common.ObservedReadableStore
-import com.twitter.representation_manager.common.MemCacheConfig
-import com.twitter.representation_manager.common.RepresentationManagerDecider
-import com.twitter.simclusters_v2.common.ModelVersions
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
-import com.twitter.simclusters_v2.summingbird.stores.ProducerClusterEmbeddingReadableStores
-import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore
-import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.getStore
-import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.modelVersionToDatasetMap
-import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.knownModelVersions
-import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.toSimClustersEmbedding
-import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding}
-import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams
-import com.twitter.storehaus.ReadableStore
-import com.twitter.storehaus_internal.manhattan.Apollo
-import com.twitter.storehaus_internal.manhattan.ManhattanCluster
-import com.twitter.strato.client.{Client => StratoClient}
-import com.twitter.strato.thrift.ScroogeConvImplicits._
-import com.twitter.tweetypie.util.UserId
-import com.twitter.util.Future
-import javax.inject.Inject
-
-class UserSimClustersEmbeddingStore @Inject() (
- stratoClient: StratoClient,
- cacheClient: Client,
- globalStats: StatsReceiver,
- mhMtlsParams: ManhattanKVClientMtlsParams,
- rmsDecider: RepresentationManagerDecider) {
-
- private val stats = globalStats.scope(this.getClass.getSimpleName)
-
- private val favBasedProducer20M145KUpdatedEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = ProducerClusterEmbeddingReadableStores
- .getProducerTopKSimClustersEmbeddingsStore(
- mhMtlsParams
- ).mapValues { topSimClustersWithScore =>
- ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters)
- }.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) =>
- userId
- }
-
- buildMemCacheStore(rawStore, FavBasedProducer, Model20m145kUpdated)
- }
-
- private val favBasedProducer20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = ProducerClusterEmbeddingReadableStores
- .getProducerTopKSimClusters2020EmbeddingsStore(
- mhMtlsParams
- ).mapValues { topSimClustersWithScore =>
- ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters)
- }.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) =>
- userId
- }
-
- buildMemCacheStore(rawStore, FavBasedProducer, Model20m145k2020)
- }
-
- private val followBasedProducer20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = ProducerClusterEmbeddingReadableStores
- .getProducerTopKSimClustersEmbeddingsByFollowStore(
- mhMtlsParams
- ).mapValues { topSimClustersWithScore =>
- ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters)
- }.composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) =>
- userId
- }
-
- buildMemCacheStore(rawStore, FollowBasedProducer, Model20m145k2020)
- }
-
- private val logFavBasedApe20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = StratoFetchableStore
- .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
- stratoClient,
- "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020")
- .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift)
-
- buildMemCacheStore(rawStore, AggregatableLogFavBasedProducer, Model20m145k2020)
- }
-
- private val rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- ThriftSimClustersEmbedding
- ] = {
- StratoFetchableStore
- .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding](
- stratoClient,
- "recommendations/simclusters_v2/embeddings/logFavBasedAPERelaxedFavEngagementThreshold20M145K2020")
- .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift)
- }
-
- private val relaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(
- rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore,
- RelaxedAggregatableLogFavBasedProducer,
- Model20m145k2020)
- }
-
- private val relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore
- .composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(
- RelaxedAggregatableLogFavBasedProducer,
- Model20m145kUpdated,
- internalId) =>
- SimClustersEmbeddingId(
- RelaxedAggregatableLogFavBasedProducer,
- Model20m145k2020,
- internalId)
- }
-
- buildMemCacheStore(rawStore, RelaxedAggregatableLogFavBasedProducer, Model20m145kUpdated)
- }
-
- private val logFavBasedInterestedInFromAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedInFromAPE,
- Model20m145k2020)
- }
-
- private val followBasedInterestedInFromAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls,
- FollowBasedUserInterestedInFromAPE,
- Model20m145k2020)
- }
-
- private val favBasedUserInterestedIn20M145KUpdatedStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls,
- FavBasedUserInterestedIn,
- Model20m145kUpdated)
- }
-
- private val favBasedUserInterestedIn20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls,
- FavBasedUserInterestedIn,
- Model20m145k2020)
- }
-
- private val followBasedUserInterestedIn20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls,
- FollowBasedUserInterestedIn,
- Model20m145k2020)
- }
-
- private val logFavBasedUserInterestedIn20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedIn,
- Model20m145k2020)
- }
-
- private val favBasedUserInterestedInFromPE20M145KUpdatedStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultIIPESimClustersEmbeddingStoreWithMtls,
- FavBasedUserInterestedInFromPE,
- Model20m145kUpdated)
- }
-
- private val twistlyUserInterestedInStore: ReadableStore[
- SimClustersEmbeddingId,
- ThriftSimClustersEmbedding
- ] = {
- val interestedIn20M145KUpdatedStore = {
- UserInterestedInReadableStore.defaultStoreWithMtls(
- mhMtlsParams,
- modelVersion = ModelVersions.Model20M145KUpdated
- )
- }
- val interestedIn20M145K2020Store = {
- UserInterestedInReadableStore.defaultStoreWithMtls(
- mhMtlsParams,
- modelVersion = ModelVersions.Model20M145K2020
- )
- }
- val interestedInFromPE20M145KUpdatedStore = {
- UserInterestedInReadableStore.defaultIIPEStoreWithMtls(
- mhMtlsParams,
- modelVersion = ModelVersions.Model20M145KUpdated)
- }
- val simClustersInterestedInStore: ReadableStore[
- (UserId, ModelVersion),
- ClustersUserIsInterestedIn
- ] = {
- new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] {
- override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = {
- k match {
- case (userId, Model20m145kUpdated) =>
- interestedIn20M145KUpdatedStore.get(userId)
- case (userId, Model20m145k2020) =>
- interestedIn20M145K2020Store.get(userId)
- case _ =>
- Future.None
- }
- }
- }
- }
- val simClustersInterestedInFromProducerEmbeddingsStore: ReadableStore[
- (UserId, ModelVersion),
- ClustersUserIsInterestedIn
- ] = {
- new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] {
- override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = {
- k match {
- case (userId, ModelVersion.Model20m145kUpdated) =>
- interestedInFromPE20M145KUpdatedStore.get(userId)
- case _ =>
- Future.None
- }
- }
- }
- }
- new twistly.interestedin.EmbeddingStore(
- interestedInStore = simClustersInterestedInStore,
- interestedInFromProducerEmbeddingStore = simClustersInterestedInFromProducerEmbeddingsStore,
- statsReceiver = stats
- ).mapValues(_.toThrift)
- }
-
- private val userNextInterestedIn20m145k2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildUserInterestedInStore(
- UserInterestedInReadableStore.defaultNextInterestedInStoreWithMtls,
- UserNextInterestedIn,
- Model20m145k2020)
- }
-
- private val filteredUserInterestedIn20m145kUpdatedStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145kUpdated)
- }
-
- private val filteredUserInterestedIn20m145k2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145k2020)
- }
-
- private val filteredUserInterestedInFromPE20m145kUpdatedStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(
- twistlyUserInterestedInStore,
- FilteredUserInterestedInFromPE,
- Model20m145kUpdated)
- }
-
- private val unfilteredUserInterestedIn20m145kUpdatedStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(
- twistlyUserInterestedInStore,
- UnfilteredUserInterestedIn,
- Model20m145kUpdated)
- }
-
- private val unfilteredUserInterestedIn20m145k2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- buildMemCacheStore(twistlyUserInterestedInStore, UnfilteredUserInterestedIn, Model20m145k2020)
- }
-
- // [Experimental] User InterestedIn, generated by aggregating IIAPE embedding from AddressBook
-
- private val logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_maxpooling"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- private val logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_average"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedAverageAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- private val logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_booktype_maxpooling"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- private val logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_largestdim_maxpooling"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- private val logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_louvain_maxpooling"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- private val logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val datasetName = "addressbook_sims_embedding_iiape_connected_maxpooling"
- val appId = "wtf_embedding_apollo"
- buildUserInterestedInStoreGeneric(
- simClustersEmbeddingStoreWithMtls,
- LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020,
- datasetName = datasetName,
- appId = appId,
- manhattanCluster = Apollo
- )
- }
-
- /**
- * Helper func to build a readable store for some UserInterestedIn embeddings with
- * 1. A storeFunc from UserInterestedInReadableStore
- * 2. EmbeddingType
- * 3. ModelVersion
- * 4. MemCacheConfig
- * */
- private def buildUserInterestedInStore(
- storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion) => ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion
- ): ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore = storeFunc(mhMtlsParams, embeddingType, modelVersion)
- .mapValues(_.toThrift)
- val observedStore = ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- observedStore,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private def buildUserInterestedInStoreGeneric(
- storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion, String, String,
- ManhattanCluster) => ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion,
- datasetName: String,
- appId: String,
- manhattanCluster: ManhattanCluster
- ): ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- val rawStore =
- storeFunc(mhMtlsParams, embeddingType, modelVersion, datasetName, appId, manhattanCluster)
- .mapValues(_.toThrift)
- val observedStore = ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- observedStore,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private def simClustersEmbeddingStoreWithMtls(
- mhMtlsParams: ManhattanKVClientMtlsParams,
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion,
- datasetName: String,
- appId: String,
- manhattanCluster: ManhattanCluster
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
-
- if (!modelVersionToDatasetMap.contains(ModelVersions.toKnownForModelVersion(modelVersion))) {
- throw new IllegalArgumentException(
- "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions)
- }
- getStore(appId, mhMtlsParams, datasetName, manhattanCluster)
- .composeKeyMapping[SimClustersEmbeddingId] {
- case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId))
- if theEmbeddingType == embeddingType && theModelVersion == modelVersion =>
- userId
- }.mapValues(toSimClustersEmbedding(_, embeddingType))
- }
-
- private def buildMemCacheStore(
- rawStore: ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding],
- embeddingType: EmbeddingType,
- modelVersion: ModelVersion
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val observedStore = ObservedReadableStore(
- store = rawStore
- )(stats.scope(embeddingType.name).scope(modelVersion.name))
-
- MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding(
- observedStore,
- cacheClient,
- embeddingType,
- modelVersion,
- stats
- )
- }
-
- private val underlyingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = Map(
- // KnownFor Embeddings
- (FavBasedProducer, Model20m145kUpdated) -> favBasedProducer20M145KUpdatedEmbeddingStore,
- (FavBasedProducer, Model20m145k2020) -> favBasedProducer20M145K2020EmbeddingStore,
- (FollowBasedProducer, Model20m145k2020) -> followBasedProducer20M145K2020EmbeddingStore,
- (AggregatableLogFavBasedProducer, Model20m145k2020) -> logFavBasedApe20M145K2020EmbeddingStore,
- (
- RelaxedAggregatableLogFavBasedProducer,
- Model20m145kUpdated) -> relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore,
- (
- RelaxedAggregatableLogFavBasedProducer,
- Model20m145k2020) -> relaxedLogFavBasedApe20M145K2020EmbeddingStore,
- // InterestedIn Embeddings
- (
- LogFavBasedUserInterestedInFromAPE,
- Model20m145k2020) -> logFavBasedInterestedInFromAPE20M145K2020Store,
- (
- FollowBasedUserInterestedInFromAPE,
- Model20m145k2020) -> followBasedInterestedInFromAPE20M145K2020Store,
- (FavBasedUserInterestedIn, Model20m145kUpdated) -> favBasedUserInterestedIn20M145KUpdatedStore,
- (FavBasedUserInterestedIn, Model20m145k2020) -> favBasedUserInterestedIn20M145K2020Store,
- (FollowBasedUserInterestedIn, Model20m145k2020) -> followBasedUserInterestedIn20M145K2020Store,
- (LogFavBasedUserInterestedIn, Model20m145k2020) -> logFavBasedUserInterestedIn20M145K2020Store,
- (
- FavBasedUserInterestedInFromPE,
- Model20m145kUpdated) -> favBasedUserInterestedInFromPE20M145KUpdatedStore,
- (FilteredUserInterestedIn, Model20m145kUpdated) -> filteredUserInterestedIn20m145kUpdatedStore,
- (FilteredUserInterestedIn, Model20m145k2020) -> filteredUserInterestedIn20m145k2020Store,
- (
- FilteredUserInterestedInFromPE,
- Model20m145kUpdated) -> filteredUserInterestedInFromPE20m145kUpdatedStore,
- (
- UnfilteredUserInterestedIn,
- Model20m145kUpdated) -> unfilteredUserInterestedIn20m145kUpdatedStore,
- (UnfilteredUserInterestedIn, Model20m145k2020) -> unfilteredUserInterestedIn20m145k2020Store,
- (UserNextInterestedIn, Model20m145k2020) -> userNextInterestedIn20m145k2020Store,
- (
- LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store,
- (
- LogFavBasedUserInterestedAverageAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store,
- (
- LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store,
- (
- LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store,
- (
- LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store,
- (
- LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020) -> logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store,
- )
-
- val userSimClustersEmbeddingStore: ReadableStore[
- SimClustersEmbeddingId,
- SimClustersEmbedding
- ] = {
- SimClustersEmbeddingStore.buildWithDecider(
- underlyingStores = underlyingStores,
- decider = rmsDecider.decider,
- statsReceiver = stats
- )
- }
-
-}
diff --git a/representation-manager/server/src/main/thrift/BUILD b/representation-manager/server/src/main/thrift/BUILD
deleted file mode 100644
index f4edb5dcb..000000000
--- a/representation-manager/server/src/main/thrift/BUILD
+++ /dev/null
@@ -1,18 +0,0 @@
-create_thrift_libraries(
- base_name = "thrift",
- sources = [
- "com/twitter/representation_manager/service.thrift",
- ],
- platform = "java8",
- tags = [
- "bazel-compatible",
- ],
- dependency_roots = [
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift",
- ],
- generate_languages = [
- "java",
- "scala",
- "strato",
- ],
-)
diff --git a/representation-manager/server/src/main/thrift/BUILD.docx b/representation-manager/server/src/main/thrift/BUILD.docx
new file mode 100644
index 000000000..4ad5aa910
Binary files /dev/null and b/representation-manager/server/src/main/thrift/BUILD.docx differ
diff --git a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx
new file mode 100644
index 000000000..de1661949
Binary files /dev/null and b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx differ
diff --git a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift
deleted file mode 100644
index 4eb36e999..000000000
--- a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift
+++ /dev/null
@@ -1,14 +0,0 @@
-namespace java com.twitter.representation_manager.thriftjava
-#@namespace scala com.twitter.representation_manager.thriftscala
-#@namespace strato com.twitter.representation_manager
-
-include "com/twitter/simclusters_v2/online_store.thrift"
-include "com/twitter/simclusters_v2/identifier.thrift"
-
-/**
- * A uniform column view for all kinds of SimClusters based embeddings.
- **/
-struct SimClustersEmbeddingView {
- 1: required identifier.EmbeddingType embeddingType
- 2: required online_store.ModelVersion modelVersion
-}(persisted = 'false', hasPersonalData = 'false')
diff --git a/representation-scorer/BUILD.bazel b/representation-scorer/BUILD.bazel
deleted file mode 100644
index 1624a57d4..000000000
--- a/representation-scorer/BUILD.bazel
+++ /dev/null
@@ -1 +0,0 @@
-# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD
diff --git a/representation-scorer/BUILD.docx b/representation-scorer/BUILD.docx
new file mode 100644
index 000000000..b090a5bc7
Binary files /dev/null and b/representation-scorer/BUILD.docx differ
diff --git a/representation-scorer/README.docx b/representation-scorer/README.docx
new file mode 100644
index 000000000..cfd244236
Binary files /dev/null and b/representation-scorer/README.docx differ
diff --git a/representation-scorer/README.md b/representation-scorer/README.md
deleted file mode 100644
index b74e3472f..000000000
--- a/representation-scorer/README.md
+++ /dev/null
@@ -1,5 +0,0 @@
-# Representation Scorer #
-
-**Representation Scorer** (RSX) serves as a centralized scoring system, offering SimClusters or other embedding-based scoring solutions as machine learning features.
-
-The Representation Scorer acquires user behavior data from the User Signal Service (USS) and extracts embeddings from the Representation Manager (RMS). It then calculates both pairwise and listwise features. These features are used at various stages, including candidate retrieval and ranking.
\ No newline at end of file
diff --git a/representation-scorer/bin/canary-check.docx b/representation-scorer/bin/canary-check.docx
new file mode 100644
index 000000000..c56174979
Binary files /dev/null and b/representation-scorer/bin/canary-check.docx differ
diff --git a/representation-scorer/bin/canary-check.sh b/representation-scorer/bin/canary-check.sh
deleted file mode 100755
index cbb31f9ad..000000000
--- a/representation-scorer/bin/canary-check.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-
-export CANARY_CHECK_ROLE="representation-scorer"
-export CANARY_CHECK_NAME="representation-scorer"
-export CANARY_CHECK_INSTANCES="0-19"
-
-python3 relevance-platform/tools/canary_check.py "$@"
-
diff --git a/representation-scorer/bin/deploy.docx b/representation-scorer/bin/deploy.docx
new file mode 100644
index 000000000..c57726f04
Binary files /dev/null and b/representation-scorer/bin/deploy.docx differ
diff --git a/representation-scorer/bin/deploy.sh b/representation-scorer/bin/deploy.sh
deleted file mode 100755
index 2f1ab8a69..000000000
--- a/representation-scorer/bin/deploy.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/usr/bin/env bash
-
-JOB=representation-scorer bazel run --ui_event_filters=-info,-stdout,-stderr --noshow_progress \
- //relevance-platform/src/main/python/deploy -- "$@"
diff --git a/representation-scorer/bin/remote-debug-tunnel.docx b/representation-scorer/bin/remote-debug-tunnel.docx
new file mode 100644
index 000000000..1c0381873
Binary files /dev/null and b/representation-scorer/bin/remote-debug-tunnel.docx differ
diff --git a/representation-scorer/bin/remote-debug-tunnel.sh b/representation-scorer/bin/remote-debug-tunnel.sh
deleted file mode 100755
index 2a6e71511..000000000
--- a/representation-scorer/bin/remote-debug-tunnel.sh
+++ /dev/null
@@ -1,66 +0,0 @@
-#!/bin/bash
-
-set -o nounset
-set -eu
-
-DC="atla"
-ROLE="$USER"
-SERVICE="representation-scorer"
-INSTANCE="0"
-KEY="$DC/$ROLE/devel/$SERVICE/$INSTANCE"
-
-while test $# -gt 0; do
- case "$1" in
- -h|--help)
- echo "$0 Set up an ssh tunnel for $SERVICE remote debugging and disable aurora health checks"
- echo " "
- echo "See representation-scorer/README.md for details of how to use this script, and go/remote-debug for"
- echo "general information about remote debugging in Aurora"
- echo " "
- echo "Default instance if called with no args:"
- echo " $KEY"
- echo " "
- echo "Positional args:"
- echo " $0 [datacentre] [role] [service_name] [instance]"
- echo " "
- echo "Options:"
- echo " -h, --help show brief help"
- exit 0
- ;;
- *)
- break
- ;;
- esac
-done
-
-if [ -n "${1-}" ]; then
- DC="$1"
-fi
-
-if [ -n "${2-}" ]; then
- ROLE="$2"
-fi
-
-if [ -n "${3-}" ]; then
- SERVICE="$3"
-fi
-
-if [ -n "${4-}" ]; then
- INSTANCE="$4"
-fi
-
-KEY="$DC/$ROLE/devel/$SERVICE/$INSTANCE"
-read -p "Set up remote debugger tunnel for $KEY? (y/n) " -r CONFIRM
-if [[ ! $CONFIRM =~ ^[Yy]$ ]]; then
- echo "Exiting, tunnel not created"
- exit 1
-fi
-
-echo "Disabling health check and opening tunnel. Exit with control-c when you're finished"
-CMD="aurora task ssh $KEY -c 'touch .healthchecksnooze' && aurora task ssh $KEY -L '5005:debug' --ssh-options '-N -S none -v '"
-
-echo "Running $CMD"
-eval "$CMD"
-
-
-
diff --git a/representation-scorer/docs/index.docx b/representation-scorer/docs/index.docx
new file mode 100644
index 000000000..1873fb82e
Binary files /dev/null and b/representation-scorer/docs/index.docx differ
diff --git a/representation-scorer/docs/index.rst b/representation-scorer/docs/index.rst
deleted file mode 100644
index c4fd8966d..000000000
--- a/representation-scorer/docs/index.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Representation Scorer (RSX)
-###########################
-
-Overview
-========
-
-Representation Scorer (RSX) is a StratoFed service which serves scores for pairs of entities (User, Tweet, Topic...) based on some representation of those entities. For example, it serves User-Tweet scores based on the cosine similarity of SimClusters embeddings for each of these. It aims to provide these with low latency and at high scale, to support applications such as scoring for ANN candidate generation and feature hydration via feature store.
-
-
-Current use cases
------------------
-
-RSX currently serves traffic for the following use cases:
-
-- User-Tweet similarity scores for Home ranking, using SimClusters embedding dot product
-- Topic-Tweet similarity scores for topical tweet candidate generation and topic social proof, using SimClusters embedding cosine similarity and CERTO scores
-- Tweet-Tweet and User-Tweet similarity scores for ANN candidate generation, using SimClusters embedding cosine similarity
-- (in development) User-Tweet similarity scores for Home ranking, based on various aggregations of similarities with recent faves, retweets and follows performed by the user
-
-Getting Started
-===============
-
-Fetching scores
----------------
-
-Scores are served from the recommendations/representation_scorer/score column.
-
-Using RSX for your application
-------------------------------
-
-RSX may be a good fit for your application if you need scores based on combinations of SimCluster embeddings for core nouns. We also plan to support other embeddings and scoring approaches in the future.
-
-.. toctree::
- :maxdepth: 2
- :hidden:
-
- index
-
-
diff --git a/representation-scorer/server/BUILD b/representation-scorer/server/BUILD
deleted file mode 100644
index cc7325192..000000000
--- a/representation-scorer/server/BUILD
+++ /dev/null
@@ -1,22 +0,0 @@
-jvm_binary(
- name = "bin",
- basename = "representation-scorer",
- main = "com.twitter.representationscorer.RepresentationScorerFedServerMain",
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "finatra/inject/inject-logback/src/main/scala",
- "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback",
- "representation-scorer/server/src/main/resources",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer",
- "twitter-server/logback-classic/src/main/scala",
- ],
-)
-
-# Aurora Workflows build phase convention requires a jvm_app named with ${project-name}-app
-jvm_app(
- name = "representation-scorer-app",
- archive = "zip",
- binary = ":bin",
- tags = ["bazel-compatible"],
-)
diff --git a/representation-scorer/server/BUILD.docx b/representation-scorer/server/BUILD.docx
new file mode 100644
index 000000000..d267cb076
Binary files /dev/null and b/representation-scorer/server/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/resources/BUILD b/representation-scorer/server/src/main/resources/BUILD
deleted file mode 100644
index 150a224ff..000000000
--- a/representation-scorer/server/src/main/resources/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-resources(
- sources = [
- "*.xml",
- "*.yml",
- "com/twitter/slo/slo.json",
- "config/*.yml",
- ],
- tags = ["bazel-compatible"],
-)
diff --git a/representation-scorer/server/src/main/resources/BUILD.docx b/representation-scorer/server/src/main/resources/BUILD.docx
new file mode 100644
index 000000000..5a66b8601
Binary files /dev/null and b/representation-scorer/server/src/main/resources/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx
new file mode 100644
index 000000000..6294669f7
Binary files /dev/null and b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx differ
diff --git a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json
deleted file mode 100644
index 836b44058..000000000
--- a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json
+++ /dev/null
@@ -1,55 +0,0 @@
-{
- "servers": [
- {
- "name": "strato",
- "indicators": [
- {
- "id": "success_rate_3m",
- "indicator_type": "SuccessRateIndicator",
- "duration": 3,
- "duration_unit": "MINUTES"
- }, {
- "id": "latency_3m_p99",
- "indicator_type": "LatencyIndicator",
- "duration": 3,
- "duration_unit": "MINUTES",
- "percentile": 0.99
- }
- ],
- "objectives": [
- {
- "indicator": "success_rate_3m",
- "objective_type": "SuccessRateObjective",
- "operator": ">=",
- "threshold": 0.995
- },
- {
- "indicator": "latency_3m_p99",
- "objective_type": "LatencyObjective",
- "operator": "<=",
- "threshold": 50
- }
- ],
- "long_term_objectives": [
- {
- "id": "success_rate_28_days",
- "objective_type": "SuccessRateObjective",
- "operator": ">=",
- "threshold": 0.993,
- "duration": 28,
- "duration_unit": "DAYS"
- },
- {
- "id": "latency_p99_28_days",
- "objective_type": "LatencyObjective",
- "operator": "<=",
- "threshold": 60,
- "duration": 28,
- "duration_unit": "DAYS",
- "percentile": 0.99
- }
- ]
- }
- ],
- "@version": 1
-}
diff --git a/representation-scorer/server/src/main/resources/config/decider.docx b/representation-scorer/server/src/main/resources/config/decider.docx
new file mode 100644
index 000000000..5b7ee2751
Binary files /dev/null and b/representation-scorer/server/src/main/resources/config/decider.docx differ
diff --git a/representation-scorer/server/src/main/resources/config/decider.yml b/representation-scorer/server/src/main/resources/config/decider.yml
deleted file mode 100644
index 56ae90418..000000000
--- a/representation-scorer/server/src/main/resources/config/decider.yml
+++ /dev/null
@@ -1,155 +0,0 @@
-enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore:
- comment: "Enable to use the non-empty store for logFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore (from 0% to 100%). 0 means use EMPTY readable store for all requests."
- default_availability: 0
-
-enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore:
- comment: "Enable to use the non-empty store for logFavBasedApeEntity20M145K2020EmbeddingCachedStore (from 0% to 100%). 0 means use EMPTY readable store for all requests."
- default_availability: 0
-
-representation-scorer_forward_dark_traffic:
- comment: "Defines the percentage of traffic to forward to diffy-proxy. Set to 0 to disable dark traffic forwarding"
- default_availability: 0
-
-"representation-scorer_load_shed_non_prod_callers":
- comment: "Discard traffic from all non-prod callers"
- default_availability: 0
-
-enable_log_fav_based_tweet_embedding_20m145k2020_timeouts:
- comment: "If enabled, set a timeout on calls to the logFavBased20M145K2020TweetEmbeddingStore"
- default_availability: 0
-
-log_fav_based_tweet_embedding_20m145k2020_timeout_value_millis:
- comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the logFavBased20M145K2020TweetEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_log_fav_based_tweet_embedding_20m145k2020_timeouts is true"
- default_availability: 2000
-
-enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts:
- comment: "If enabled, set a timeout on calls to the logFavBased20M145KUpdatedTweetEmbeddingStore"
- default_availability: 0
-
-log_fav_based_tweet_embedding_20m145kUpdated_timeout_value_millis:
- comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the logFavBased20M145KUpdatedTweetEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts is true"
- default_availability: 2000
-
-enable_cluster_tweet_index_store_timeouts:
- comment: "If enabled, set a timeout on calls to the ClusterTweetIndexStore"
- default_availability: 0
-
-cluster_tweet_index_store_timeout_value_millis:
- comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the ClusterTweetIndexStore, i.e. 1.50% is 150ms. Only applied if enable_cluster_tweet_index_store_timeouts is true"
- default_availability: 2000
-
-representation_scorer_fetch_signal_share:
- comment: "If enabled, fetches share signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_reply:
- comment: "If enabled, fetches reply signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_original_tweet:
- comment: "If enabled, fetches original tweet signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_video_playback:
- comment: "If enabled, fetches video playback signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_block:
- comment: "If enabled, fetches account block signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_mute:
- comment: "If enabled, fetches account mute signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_report:
- comment: "If enabled, fetches tweet report signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_dont_like:
- comment: "If enabled, fetches tweet don't like signals from USS"
- default_availability: 0
-
-representation_scorer_fetch_signal_see_fewer:
- comment: "If enabled, fetches tweet see fewer signals from USS"
- default_availability: 0
-
-# To create a new decider, add here with the same format and caller's details : "representation-scorer_load_shed_by_caller_id_twtr:{{role}}:{{name}}:{{environment}}:{{cluster}}"
-# All the deciders below are generated by this script - ./strato/bin/fed deciders ./ --service-role=representation-scorer --service-name=representation-scorer
-# If you need to run the script and paste the output, add only the prod deciders here. Non-prod ones are being taken care of by representation-scorer_load_shed_non_prod_callers
-
-"representation-scorer_load_shed_by_caller_id_all":
- comment: "Reject all traffic from caller id: all"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-canary:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-canary:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-canary:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-canary:prod:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-send:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-send:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:prod:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:staging:atla":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:staging:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:staging:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:staging:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:home-scorer:home-scorer:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:home-scorer:home-scorer:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:home-scorer:home-scorer:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:home-scorer:home-scorer:prod:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoapi:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoapi:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoserver:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoserver:prod:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:timelinescorer:timelinescorer:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:timelinescorer:timelinescorer:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:timelinescorer:timelinescorer:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:timelinescorer:timelinescorer:prod:pdxa"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:topic-social-proof:topic-social-proof:prod:atla":
- comment: "Reject all traffic from caller id: twtr:svc:topic-social-proof:topic-social-proof:prod:atla"
- default_availability: 0
-
-"representation-scorer_load_shed_by_caller_id_twtr:svc:topic-social-proof:topic-social-proof:prod:pdxa":
- comment: "Reject all traffic from caller id: twtr:svc:topic-social-proof:topic-social-proof:prod:pdxa"
- default_availability: 0
-
-"enable_sim_clusters_embedding_store_timeouts":
- comment: "If enabled, set a timeout on calls to the SimClustersEmbeddingStore"
- default_availability: 10000
-
-sim_clusters_embedding_store_timeout_value_millis:
- comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the SimClustersEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_sim_clusters_embedding_store_timeouts is true"
- default_availability: 2000
diff --git a/representation-scorer/server/src/main/resources/logback.docx b/representation-scorer/server/src/main/resources/logback.docx
new file mode 100644
index 000000000..e1c3b30e4
Binary files /dev/null and b/representation-scorer/server/src/main/resources/logback.docx differ
diff --git a/representation-scorer/server/src/main/resources/logback.xml b/representation-scorer/server/src/main/resources/logback.xml
deleted file mode 100644
index cf7028151..000000000
--- a/representation-scorer/server/src/main/resources/logback.xml
+++ /dev/null
@@ -1,165 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- true
-
-
-
-
-
-
-
-
-
-
- ${log.service.output}
-
-
- ${log.service.output}.%d.gz
-
- 3GB
-
- 21
- true
-
-
- %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n
-
-
-
-
-
- ${log.access.output}
-
-
- ${log.access.output}.%d.gz
-
- 100MB
-
- 7
- true
-
-
- ${DEFAULT_ACCESS_PATTERN}%n
-
-
-
-
-
-
-
-
-
-
-
- allow_listed_pipeline_executions.log
-
-
- allow_listed_pipeline_executions.log.%d.gz
-
- 100MB
-
- 7
- true
-
-
- %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD
deleted file mode 100644
index fdb60da54..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle-internal/slo/src/main/scala/com/twitter/finagle/slo",
- "finatra/inject/inject-thrift-client",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns",
- "strato/src/main/scala/com/twitter/strato/fed",
- "strato/src/main/scala/com/twitter/strato/fed/server",
- "twitter-server-internal/src/main/scala",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx
new file mode 100644
index 000000000..4a3761281
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx
new file mode 100644
index 000000000..8a9c4b7f3
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala
deleted file mode 100644
index a0a203311..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala
+++ /dev/null
@@ -1,38 +0,0 @@
-package com.twitter.representationscorer
-
-import com.google.inject.Module
-import com.twitter.inject.thrift.modules.ThriftClientIdModule
-import com.twitter.representationscorer.columns.ListScoreColumn
-import com.twitter.representationscorer.columns.ScoreColumn
-import com.twitter.representationscorer.columns.SimClustersRecentEngagementSimilarityColumn
-import com.twitter.representationscorer.columns.SimClustersRecentEngagementSimilarityUserTweetEdgeColumn
-import com.twitter.representationscorer.modules.CacheModule
-import com.twitter.representationscorer.modules.EmbeddingStoreModule
-import com.twitter.representationscorer.modules.RMSConfigModule
-import com.twitter.representationscorer.modules.TimerModule
-import com.twitter.representationscorer.twistlyfeatures.UserSignalServiceRecentEngagementsClientModule
-import com.twitter.strato.fed._
-import com.twitter.strato.fed.server._
-
-object RepresentationScorerFedServerMain extends RepresentationScorerFedServer
-
-trait RepresentationScorerFedServer extends StratoFedServer {
- override def dest: String = "/s/representation-scorer/representation-scorer"
- override val modules: Seq[Module] =
- Seq(
- CacheModule,
- ThriftClientIdModule,
- UserSignalServiceRecentEngagementsClientModule,
- TimerModule,
- RMSConfigModule,
- EmbeddingStoreModule
- )
-
- override def columns: Seq[Class[_ <: StratoFed.Column]] =
- Seq(
- classOf[ListScoreColumn],
- classOf[ScoreColumn],
- classOf[SimClustersRecentEngagementSimilarityUserTweetEdgeColumn],
- classOf[SimClustersRecentEngagementSimilarityColumn]
- )
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD
deleted file mode 100644
index 3352a51b9..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD
+++ /dev/null
@@ -1,16 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "content-recommender/thrift/src/main/thrift:thrift-scala",
- "finatra/inject/inject-core/src/main/scala",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures",
- "representation-scorer/server/src/main/thrift:thrift-scala",
- "strato/src/main/scala/com/twitter/strato/fed",
- "strato/src/main/scala/com/twitter/strato/fed/server",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx
new file mode 100644
index 000000000..d108c391f
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx
new file mode 100644
index 000000000..5a3d00bde
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala
deleted file mode 100644
index 3b14a491f..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala
+++ /dev/null
@@ -1,13 +0,0 @@
-package com.twitter.representationscorer.columns
-
-import com.twitter.strato.config.{ContactInfo => StratoContactInfo}
-
-object Info {
- val contactInfo: StratoContactInfo = StratoContactInfo(
- description = "Please contact Relevance Platform team for more details",
- contactEmail = "no-reply@twitter.com",
- ldapGroup = "representation-scorer-admins",
- jiraProject = "JIRA",
- links = Seq("http://go.twitter.biz/rsx-runbook")
- )
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx
new file mode 100644
index 000000000..0f87c9d1a
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala
deleted file mode 100644
index 04d8b8cb1..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala
+++ /dev/null
@@ -1,116 +0,0 @@
-package com.twitter.representationscorer.columns
-
-import com.twitter.representationscorer.thriftscala.ListScoreId
-import com.twitter.representationscorer.thriftscala.ListScoreResponse
-import com.twitter.representationscorer.scorestore.ScoreStore
-import com.twitter.representationscorer.thriftscala.ScoreResult
-import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongInternalId
-import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongSimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.Score
-import com.twitter.simclusters_v2.thriftscala.ScoreId
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId
-import com.twitter.stitch
-import com.twitter.stitch.Stitch
-import com.twitter.strato.catalog.OpMetadata
-import com.twitter.strato.config.ContactInfo
-import com.twitter.strato.config.Policy
-import com.twitter.strato.data.Conv
-import com.twitter.strato.data.Description.PlainText
-import com.twitter.strato.data.Lifecycle
-import com.twitter.strato.fed._
-import com.twitter.strato.thrift.ScroogeConv
-import com.twitter.util.Future
-import com.twitter.util.Return
-import com.twitter.util.Throw
-import javax.inject.Inject
-
-class ListScoreColumn @Inject() (scoreStore: ScoreStore)
- extends StratoFed.Column("recommendations/representation_scorer/listScore")
- with StratoFed.Fetch.Stitch {
-
- override val policy: Policy = Common.rsxReadPolicy
-
- override type Key = ListScoreId
- override type View = Unit
- override type Value = ListScoreResponse
-
- override val keyConv: Conv[Key] = ScroogeConv.fromStruct[ListScoreId]
- override val viewConv: Conv[View] = Conv.ofType
- override val valueConv: Conv[Value] = ScroogeConv.fromStruct[ListScoreResponse]
-
- override val contactInfo: ContactInfo = Info.contactInfo
-
- override val metadata: OpMetadata = OpMetadata(
- lifecycle = Some(Lifecycle.Production),
- description = Some(
- PlainText(
- "Scoring for multiple candidate entities against a single target entity"
- ))
- )
-
- override def fetch(key: Key, view: View): Stitch[Result[Value]] = {
-
- val target = SimClustersEmbeddingId(
- embeddingType = key.targetEmbeddingType,
- modelVersion = key.modelVersion,
- internalId = key.targetId
- )
- val scoreIds = key.candidateIds.map { candidateId =>
- val candidate = SimClustersEmbeddingId(
- embeddingType = key.candidateEmbeddingType,
- modelVersion = key.modelVersion,
- internalId = candidateId
- )
- ScoreId(
- algorithm = key.algorithm,
- internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingPairScoreId(target, candidate)
- )
- )
- }
-
- Stitch
- .callFuture {
- val (keys: Iterable[ScoreId], vals: Iterable[Future[Option[Score]]]) =
- scoreStore.uniformScoringStore.multiGet(scoreIds.toSet).unzip
- val results: Future[Iterable[Option[Score]]] = Future.collectToTry(vals.toSeq) map {
- tryOptVals =>
- tryOptVals map {
- case Return(Some(v)) => Some(v)
- case Return(None) => None
- case Throw(_) => None
- }
- }
- val scoreMap: Future[Map[Long, Double]] = results.map { scores =>
- keys
- .zip(scores).collect {
- case (
- ScoreId(
- _,
- ScoreInternalId.SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingPairScoreId(
- _,
- LongSimClustersEmbeddingId(candidateId)))),
- Some(score)) =>
- (candidateId, score.score)
- }.toMap
- }
- scoreMap
- }
- .map { (scores: Map[Long, Double]) =>
- val orderedScores = key.candidateIds.collect {
- case LongInternalId(id) => ScoreResult(scores.get(id))
- case _ =>
- // This will return None scores for candidates which don't have Long ids, but that's fine:
- // at the moment we're only scoring for Tweets
- ScoreResult(None)
- }
- found(ListScoreResponse(orderedScores))
- }
- .handle {
- case stitch.NotFound => missing
- }
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx
new file mode 100644
index 000000000..bc3d6b2c8
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala
deleted file mode 100644
index 6b565288b..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.twitter.representationscorer.columns
-
-import com.twitter.contentrecommender.thriftscala.ScoringResponse
-import com.twitter.representationscorer.scorestore.ScoreStore
-import com.twitter.simclusters_v2.thriftscala.ScoreId
-import com.twitter.stitch
-import com.twitter.stitch.Stitch
-import com.twitter.strato.config.ContactInfo
-import com.twitter.strato.config.Policy
-import com.twitter.strato.catalog.OpMetadata
-import com.twitter.strato.data.Conv
-import com.twitter.strato.data.Lifecycle
-import com.twitter.strato.data.Description.PlainText
-import com.twitter.strato.fed._
-import com.twitter.strato.thrift.ScroogeConv
-import javax.inject.Inject
-
-class ScoreColumn @Inject() (scoreStore: ScoreStore)
- extends StratoFed.Column("recommendations/representation_scorer/score")
- with StratoFed.Fetch.Stitch {
-
- override val policy: Policy = Common.rsxReadPolicy
-
- override type Key = ScoreId
- override type View = Unit
- override type Value = ScoringResponse
-
- override val keyConv: Conv[Key] = ScroogeConv.fromStruct[ScoreId]
- override val viewConv: Conv[View] = Conv.ofType
- override val valueConv: Conv[Value] = ScroogeConv.fromStruct[ScoringResponse]
-
- override val contactInfo: ContactInfo = Info.contactInfo
-
- override val metadata: OpMetadata = OpMetadata(
- lifecycle = Some(Lifecycle.Production),
- description = Some(PlainText(
- "The Uniform Scoring Endpoint in Representation Scorer for the Content-Recommender." +
- " TDD: http://go/representation-scorer-tdd Guideline: http://go/uniform-scoring-guideline"))
- )
-
- override def fetch(key: Key, view: View): Stitch[Result[Value]] =
- scoreStore
- .uniformScoringStoreStitch(key)
- .map(score => found(ScoringResponse(Some(score))))
- .handle {
- case stitch.NotFound => missing
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx
new file mode 100644
index 000000000..72d4182cf
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala
deleted file mode 100644
index e14a67eae..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-package com.twitter.representationscorer.columns
-
-import com.twitter.representationscorer.common.TweetId
-import com.twitter.representationscorer.common.UserId
-import com.twitter.representationscorer.thriftscala.RecentEngagementSimilaritiesResponse
-import com.twitter.representationscorer.twistlyfeatures.Scorer
-import com.twitter.stitch
-import com.twitter.stitch.Stitch
-import com.twitter.strato.catalog.OpMetadata
-import com.twitter.strato.config.ContactInfo
-import com.twitter.strato.config.Policy
-import com.twitter.strato.data.Conv
-import com.twitter.strato.data.Description.PlainText
-import com.twitter.strato.data.Lifecycle
-import com.twitter.strato.fed._
-import com.twitter.strato.thrift.ScroogeConv
-import javax.inject.Inject
-
-class SimClustersRecentEngagementSimilarityColumn @Inject() (scorer: Scorer)
- extends StratoFed.Column(
- "recommendations/representation_scorer/simClustersRecentEngagementSimilarity")
- with StratoFed.Fetch.Stitch {
-
- override val policy: Policy = Common.rsxReadPolicy
-
- override type Key = (UserId, Seq[TweetId])
- override type View = Unit
- override type Value = RecentEngagementSimilaritiesResponse
-
- override val keyConv: Conv[Key] = Conv.ofType[(Long, Seq[Long])]
- override val viewConv: Conv[View] = Conv.ofType
- override val valueConv: Conv[Value] =
- ScroogeConv.fromStruct[RecentEngagementSimilaritiesResponse]
-
- override val contactInfo: ContactInfo = Info.contactInfo
-
- override val metadata: OpMetadata = OpMetadata(
- lifecycle = Some(Lifecycle.Production),
- description = Some(
- PlainText(
- "User-Tweet scores based on the user's recent engagements for multiple tweets."
- ))
- )
-
- override def fetch(key: Key, view: View): Stitch[Result[Value]] =
- scorer
- .get(key._1, key._2)
- .map(results => found(RecentEngagementSimilaritiesResponse(results)))
- .handle {
- case stitch.NotFound => missing
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx
new file mode 100644
index 000000000..77719e391
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala
deleted file mode 100644
index e54d3a71b..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-package com.twitter.representationscorer.columns
-
-import com.twitter.representationscorer.common.TweetId
-import com.twitter.representationscorer.common.UserId
-import com.twitter.representationscorer.thriftscala.SimClustersRecentEngagementSimilarities
-import com.twitter.representationscorer.twistlyfeatures.Scorer
-import com.twitter.stitch
-import com.twitter.stitch.Stitch
-import com.twitter.strato.catalog.OpMetadata
-import com.twitter.strato.config.ContactInfo
-import com.twitter.strato.config.Policy
-import com.twitter.strato.data.Conv
-import com.twitter.strato.data.Description.PlainText
-import com.twitter.strato.data.Lifecycle
-import com.twitter.strato.fed._
-import com.twitter.strato.thrift.ScroogeConv
-import javax.inject.Inject
-
-class SimClustersRecentEngagementSimilarityUserTweetEdgeColumn @Inject() (scorer: Scorer)
- extends StratoFed.Column(
- "recommendations/representation_scorer/simClustersRecentEngagementSimilarity.UserTweetEdge")
- with StratoFed.Fetch.Stitch {
-
- override val policy: Policy = Common.rsxReadPolicy
-
- override type Key = (UserId, TweetId)
- override type View = Unit
- override type Value = SimClustersRecentEngagementSimilarities
-
- override val keyConv: Conv[Key] = Conv.ofType[(Long, Long)]
- override val viewConv: Conv[View] = Conv.ofType
- override val valueConv: Conv[Value] =
- ScroogeConv.fromStruct[SimClustersRecentEngagementSimilarities]
-
- override val contactInfo: ContactInfo = Info.contactInfo
-
- override val metadata: OpMetadata = OpMetadata(
- lifecycle = Some(Lifecycle.Production),
- description = Some(
- PlainText(
- "User-Tweet scores based on the user's recent engagements"
- ))
- )
-
- override def fetch(key: Key, view: View): Stitch[Result[Value]] =
- scorer
- .get(key._1, key._2)
- .map(found(_))
- .handle {
- case stitch.NotFound => missing
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD
deleted file mode 100644
index 018cef9eb..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD
+++ /dev/null
@@ -1,9 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "decider/src/main/scala",
- "src/scala/com/twitter/simclusters_v2/common",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx
new file mode 100644
index 000000000..e153443a2
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx
new file mode 100644
index 000000000..5ba2973a9
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala
deleted file mode 100644
index 838835616..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala
+++ /dev/null
@@ -1,7 +0,0 @@
-package com.twitter.representationscorer
-
-object DeciderConstants {
- val enableSimClustersEmbeddingStoreTimeouts = "enable_sim_clusters_embedding_store_timeouts"
- val simClustersEmbeddingStoreTimeoutValueMillis =
- "sim_clusters_embedding_store_timeout_value_millis"
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx
new file mode 100644
index 000000000..b1fbcd983
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala
deleted file mode 100644
index 5aa4b4f2c..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.twitter.representationscorer.common
-
-import com.twitter.decider.Decider
-import com.twitter.decider.RandomRecipient
-import com.twitter.decider.Recipient
-import com.twitter.simclusters_v2.common.DeciderGateBuilderWithIdHashing
-import javax.inject.Inject
-import javax.inject.Singleton
-
-@Singleton
-case class RepresentationScorerDecider @Inject() (decider: Decider) {
-
- val deciderGateBuilder = new DeciderGateBuilderWithIdHashing(decider)
-
- def isAvailable(feature: String, recipient: Option[Recipient]): Boolean = {
- decider.isAvailable(feature, recipient)
- }
-
- /**
- * When useRandomRecipient is set to false, the decider is either completely on or off.
- * When useRandomRecipient is set to true, the decider is on for the specified % of traffic.
- */
- def isAvailable(feature: String, useRandomRecipient: Boolean = true): Boolean = {
- if (useRandomRecipient) isAvailable(feature, Some(RandomRecipient))
- else isAvailable(feature, None)
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx
new file mode 100644
index 000000000..317943c84
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala
deleted file mode 100644
index c5bf9c60a..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala
+++ /dev/null
@@ -1,6 +0,0 @@
-package com.twitter.representationscorer
-
-package object common {
- type UserId = Long
- type TweetId = Long
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD
deleted file mode 100644
index c73f2a68e..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD
+++ /dev/null
@@ -1,19 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
- "finagle/finagle-stats",
- "finatra/inject/inject-core/src/main/scala",
- "representation-manager/client/src/main/scala/com/twitter/representation_manager",
- "representation-manager/client/src/main/scala/com/twitter/representation_manager/config",
- "representation-manager/server/src/main/scala/com/twitter/representation_manager/migration",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common",
- "servo/util",
- "src/scala/com/twitter/simclusters_v2/stores",
- "src/scala/com/twitter/storehaus_internal/memcache",
- "src/scala/com/twitter/storehaus_internal/util",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx
new file mode 100644
index 000000000..893395a81
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx
new file mode 100644
index 000000000..a8f23b243
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala
deleted file mode 100644
index b8b815872..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-package com.twitter.representationscorer.modules
-
-import com.google.inject.Provides
-import com.twitter.finagle.memcached.Client
-import javax.inject.Singleton
-import com.twitter.conversions.DurationOps._
-import com.twitter.inject.TwitterModule
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.storehaus_internal.memcache.MemcacheStore
-import com.twitter.storehaus_internal.util.ClientName
-import com.twitter.storehaus_internal.util.ZkEndPoint
-
-object CacheModule extends TwitterModule {
-
- private val cacheDest = flag[String]("cache_module.dest", "Path to memcache service")
- private val timeout = flag[Int]("memcache.timeout", "Memcache client timeout")
- private val retries = flag[Int]("memcache.retries", "Memcache timeout retries")
-
- @Singleton
- @Provides
- def providesCache(
- serviceIdentifier: ServiceIdentifier,
- stats: StatsReceiver
- ): Client =
- MemcacheStore.memcachedClient(
- name = ClientName("memcache_representation_manager"),
- dest = ZkEndPoint(cacheDest()),
- timeout = timeout().milliseconds,
- retries = retries(),
- statsReceiver = stats.scope("cache_client"),
- serviceIdentifier = serviceIdentifier
- )
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx
new file mode 100644
index 000000000..b0b62f21c
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala
deleted file mode 100644
index bff5d491c..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala
+++ /dev/null
@@ -1,100 +0,0 @@
-package com.twitter.representationscorer.modules
-
-import com.google.inject.Provides
-import com.twitter.decider.Decider
-import com.twitter.finagle.memcached.{Client => MemcachedClient}
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.finagle.thrift.ClientId
-import com.twitter.hermit.store.common.ObservedReadableStore
-import com.twitter.inject.TwitterModule
-import com.twitter.relevance_platform.common.readablestore.ReadableStoreWithTimeout
-import com.twitter.representation_manager.migration.LegacyRMS
-import com.twitter.representationscorer.DeciderConstants
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.storehaus.ReadableStore
-import com.twitter.util.Timer
-import javax.inject.Singleton
-
-object EmbeddingStoreModule extends TwitterModule {
- @Singleton
- @Provides
- def providesEmbeddingStore(
- memCachedClient: MemcachedClient,
- serviceIdentifier: ServiceIdentifier,
- clientId: ClientId,
- timer: Timer,
- decider: Decider,
- stats: StatsReceiver
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val cacheHashKeyPrefix: String = "RMS"
- val embeddingStoreClient = new LegacyRMS(
- serviceIdentifier,
- memCachedClient,
- stats,
- decider,
- clientId,
- timer,
- cacheHashKeyPrefix
- )
-
- val underlyingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = Map(
- // Tweet Embeddings
- (
- LogFavBasedTweet,
- Model20m145k2020) -> embeddingStoreClient.logFavBased20M145K2020TweetEmbeddingStore,
- (
- LogFavLongestL2EmbeddingTweet,
- Model20m145k2020) -> embeddingStoreClient.logFavBasedLongestL2Tweet20M145K2020EmbeddingStore,
- // InterestedIn Embeddings
- (
- LogFavBasedUserInterestedInFromAPE,
- Model20m145k2020) -> embeddingStoreClient.LogFavBasedInterestedInFromAPE20M145K2020Store,
- (
- FavBasedUserInterestedIn,
- Model20m145k2020) -> embeddingStoreClient.favBasedUserInterestedIn20M145K2020Store,
- // Author Embeddings
- (
- FavBasedProducer,
- Model20m145k2020) -> embeddingStoreClient.favBasedProducer20M145K2020EmbeddingStore,
- // Entity Embeddings
- (
- LogFavBasedKgoApeTopic,
- Model20m145k2020) -> embeddingStoreClient.logFavBasedApeEntity20M145K2020EmbeddingCachedStore,
- (FavTfgTopic, Model20m145k2020) -> embeddingStoreClient.favBasedTfgTopicEmbedding2020Store,
- )
-
- val simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
- val underlying: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] =
- SimClustersEmbeddingStore.buildWithDecider(
- underlyingStores = underlyingStores,
- decider = decider,
- statsReceiver = stats.scope("simClusters_embeddings_store_deciderable")
- )
-
- val underlyingWithTimeout: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] =
- new ReadableStoreWithTimeout(
- rs = underlying,
- decider = decider,
- enableTimeoutDeciderKey = DeciderConstants.enableSimClustersEmbeddingStoreTimeouts,
- timeoutValueKey = DeciderConstants.simClustersEmbeddingStoreTimeoutValueMillis,
- timer = timer,
- statsReceiver = stats.scope("simClusters_embedding_store_timeouts")
- )
-
- ObservedReadableStore(
- store = underlyingWithTimeout
- )(stats.scope("simClusters_embeddings_store"))
- }
- simClustersEmbeddingStore
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx
new file mode 100644
index 000000000..77ee14dbd
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala
deleted file mode 100644
index 08ac0cb93..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala
+++ /dev/null
@@ -1,63 +0,0 @@
-package com.twitter.representationscorer.modules
-
-import com.google.inject.Provides
-import com.twitter.conversions.DurationOps._
-import com.twitter.inject.TwitterModule
-import com.twitter.representation_manager.config.ClientConfig
-import com.twitter.representation_manager.config.EnabledInMemoryCacheParams
-import com.twitter.representation_manager.config.InMemoryCacheParams
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import javax.inject.Singleton
-
-object RMSConfigModule extends TwitterModule {
- def getCacheName(embedingType: EmbeddingType, modelVersion: ModelVersion): String =
- s"${embedingType.name}_${modelVersion.name}_in_mem_cache"
-
- @Singleton
- @Provides
- def providesRMSClientConfig: ClientConfig = {
- val cacheParamsMap: Map[
- (EmbeddingType, ModelVersion),
- InMemoryCacheParams
- ] = Map(
- // Tweet Embeddings
- (LogFavBasedTweet, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 10.minutes,
- maxKeys = 1048575, // 800MB
- cacheName = getCacheName(LogFavBasedTweet, Model20m145k2020)),
- (LogFavLongestL2EmbeddingTweet, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 5.minute,
- maxKeys = 1048575, // 800MB
- cacheName = getCacheName(LogFavLongestL2EmbeddingTweet, Model20m145k2020)),
- // User - KnownFor Embeddings
- (FavBasedProducer, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 1.day,
- maxKeys = 500000, // 400MB
- cacheName = getCacheName(FavBasedProducer, Model20m145k2020)),
- // User - InterestedIn Embeddings
- (LogFavBasedUserInterestedInFromAPE, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 6.hours,
- maxKeys = 262143,
- cacheName = getCacheName(LogFavBasedUserInterestedInFromAPE, Model20m145k2020)),
- (FavBasedUserInterestedIn, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 6.hours,
- maxKeys = 262143,
- cacheName = getCacheName(FavBasedUserInterestedIn, Model20m145k2020)),
- // Topic Embeddings
- (FavTfgTopic, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 12.hours,
- maxKeys = 262143, // 200MB
- cacheName = getCacheName(FavTfgTopic, Model20m145k2020)),
- (LogFavBasedKgoApeTopic, Model20m145k2020) -> EnabledInMemoryCacheParams(
- ttl = 6.hours,
- maxKeys = 262143,
- cacheName = getCacheName(LogFavBasedKgoApeTopic, Model20m145k2020)),
- )
-
- new ClientConfig(inMemCacheParamsOverrides = cacheParamsMap)
- }
-
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx
new file mode 100644
index 000000000..d5c0e25b2
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala
deleted file mode 100644
index b425d516a..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala
+++ /dev/null
@@ -1,13 +0,0 @@
-package com.twitter.representationscorer.modules
-
-import com.google.inject.Provides
-import com.twitter.finagle.util.DefaultTimer
-import com.twitter.inject.TwitterModule
-import com.twitter.util.Timer
-import javax.inject.Singleton
-
-object TimerModule extends TwitterModule {
- @Singleton
- @Provides
- def providesTimer: Timer = DefaultTimer
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD
deleted file mode 100644
index 3c259cfc4..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD
+++ /dev/null
@@ -1,19 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util",
- "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/injection",
- "representation-manager/client/src/main/scala/com/twitter/representation_manager",
- "representation-manager/client/src/main/scala/com/twitter/representation_manager/config",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common",
- "src/scala/com/twitter/simclusters_v2/score",
- "src/scala/com/twitter/topic_recos/common",
- "src/scala/com/twitter/topic_recos/stores",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- "src/thrift/com/twitter/topic_recos:topic_recos-thrift-scala",
- "stitch/stitch-storehaus",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx
new file mode 100644
index 000000000..4cb511845
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx
new file mode 100644
index 000000000..ad63f4db0
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala
deleted file mode 100644
index db7cbefa9..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala
+++ /dev/null
@@ -1,168 +0,0 @@
-package com.twitter.representationscorer.scorestore
-
-import com.twitter.bijection.scrooge.BinaryScalaCodec
-import com.twitter.conversions.DurationOps._
-import com.twitter.finagle.memcached.Client
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.hashing.KeyHasher
-import com.twitter.hermit.store.common.ObservedCachedReadableStore
-import com.twitter.hermit.store.common.ObservedMemcachedReadableStore
-import com.twitter.hermit.store.common.ObservedReadableStore
-import com.twitter.relevance_platform.common.injection.LZ4Injection
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.score.ScoreFacadeStore
-import com.twitter.simclusters_v2.score.SimClustersEmbeddingPairScoreStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType.FavTfgTopic
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType.LogFavBasedKgoApeTopic
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType.LogFavBasedTweet
-import com.twitter.simclusters_v2.thriftscala.ModelVersion.Model20m145kUpdated
-import com.twitter.simclusters_v2.thriftscala.Score
-import com.twitter.simclusters_v2.thriftscala.ScoreId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.stitch.storehaus.StitchOfReadableStore
-import com.twitter.storehaus.ReadableStore
-import com.twitter.strato.client.{Client => StratoClient}
-import com.twitter.topic_recos.stores.CertoTweetTopicScoresStore
-import javax.inject.Inject
-import javax.inject.Singleton
-
-@Singleton()
-class ScoreStore @Inject() (
- simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding],
- stratoClient: StratoClient,
- representationScorerCacheClient: Client,
- stats: StatsReceiver) {
-
- private val keyHasher = KeyHasher.FNV1A_64
- private val statsReceiver = stats.scope("score_store")
-
- /** ** Score Store *****/
- private val simClustersEmbeddingCosineSimilarityScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildCosineSimilarityStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_cosine_similarity_score_store"))
-
- private val simClustersEmbeddingDotProductScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildDotProductStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_dot_product_score_store"))
-
- private val simClustersEmbeddingJaccardSimilarityScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildJaccardSimilarityStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_jaccard_similarity_score_store"))
-
- private val simClustersEmbeddingEuclideanDistanceScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildEuclideanDistanceStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_euclidean_distance_score_store"))
-
- private val simClustersEmbeddingManhattanDistanceScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildManhattanDistanceStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_manhattan_distance_score_store"))
-
- private val simClustersEmbeddingLogCosineSimilarityScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildLogCosineSimilarityStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_log_cosine_similarity_score_store"))
-
- private val simClustersEmbeddingExpScaledCosineSimilarityScoreStore =
- ObservedReadableStore(
- SimClustersEmbeddingPairScoreStore
- .buildExpScaledCosineSimilarityStore(simClustersEmbeddingStore)
- .toThriftStore
- )(statsReceiver.scope("simClusters_embedding_exp_scaled_cosine_similarity_score_store"))
-
- // Use the default setting
- private val topicTweetRankingScoreStore =
- TopicTweetRankingScoreStore.buildTopicTweetRankingStore(
- FavTfgTopic,
- LogFavBasedKgoApeTopic,
- LogFavBasedTweet,
- Model20m145kUpdated,
- consumerEmbeddingMultiplier = 1.0,
- producerEmbeddingMultiplier = 1.0
- )
-
- private val topicTweetsCortexThresholdStore = TopicTweetsCosineSimilarityAggregateStore(
- TopicTweetsCosineSimilarityAggregateStore.DefaultScoreKeys,
- statsReceiver.scope("topic_tweets_cortex_threshold_store")
- )
-
- val topicTweetCertoScoreStore: ObservedCachedReadableStore[ScoreId, Score] = {
- val underlyingStore = ObservedReadableStore(
- TopicTweetCertoScoreStore(CertoTweetTopicScoresStore.prodStore(stratoClient))
- )(statsReceiver.scope("topic_tweet_certo_score_store"))
-
- val memcachedStore = ObservedMemcachedReadableStore
- .fromCacheClient(
- backingStore = underlyingStore,
- cacheClient = representationScorerCacheClient,
- ttl = 10.minutes
- )(
- valueInjection = LZ4Injection.compose(BinaryScalaCodec(Score)),
- statsReceiver = statsReceiver.scope("topic_tweet_certo_store_memcache"),
- keyToString = { k: ScoreId =>
- s"certocs:${keyHasher.hashKey(k.toString.getBytes)}"
- }
- )
-
- ObservedCachedReadableStore.from[ScoreId, Score](
- memcachedStore,
- ttl = 5.minutes,
- maxKeys = 1000000,
- cacheName = "topic_tweet_certo_store_cache",
- windowSize = 10000L
- )(statsReceiver.scope("topic_tweet_certo_store_cache"))
- }
-
- val uniformScoringStore: ReadableStore[ScoreId, Score] =
- ScoreFacadeStore.buildWithMetrics(
- readableStores = Map(
- ScoringAlgorithm.PairEmbeddingCosineSimilarity ->
- simClustersEmbeddingCosineSimilarityScoreStore,
- ScoringAlgorithm.PairEmbeddingDotProduct ->
- simClustersEmbeddingDotProductScoreStore,
- ScoringAlgorithm.PairEmbeddingJaccardSimilarity ->
- simClustersEmbeddingJaccardSimilarityScoreStore,
- ScoringAlgorithm.PairEmbeddingEuclideanDistance ->
- simClustersEmbeddingEuclideanDistanceScoreStore,
- ScoringAlgorithm.PairEmbeddingManhattanDistance ->
- simClustersEmbeddingManhattanDistanceScoreStore,
- ScoringAlgorithm.PairEmbeddingLogCosineSimilarity ->
- simClustersEmbeddingLogCosineSimilarityScoreStore,
- ScoringAlgorithm.PairEmbeddingExpScaledCosineSimilarity ->
- simClustersEmbeddingExpScaledCosineSimilarityScoreStore,
- // Certo normalized cosine score between topic-tweet pairs
- ScoringAlgorithm.CertoNormalizedCosineScore
- -> topicTweetCertoScoreStore,
- // Certo normalized dot-product score between topic-tweet pairs
- ScoringAlgorithm.CertoNormalizedDotProductScore
- -> topicTweetCertoScoreStore
- ),
- aggregatedStores = Map(
- ScoringAlgorithm.WeightedSumTopicTweetRanking ->
- topicTweetRankingScoreStore,
- ScoringAlgorithm.CortexTopicTweetLabel ->
- topicTweetsCortexThresholdStore,
- ),
- statsReceiver = stats
- )
-
- val uniformScoringStoreStitch: ScoreId => com.twitter.stitch.Stitch[Score] =
- StitchOfReadableStore(uniformScoringStore)
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx
new file mode 100644
index 000000000..3e88f4109
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala
deleted file mode 100644
index b6216985f..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-package com.twitter.representationscorer.scorestore
-
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId.GenericPairScoreId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CertoNormalizedDotProductScore
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CertoNormalizedCosineScore
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.TopicId
-import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore}
-import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId}
-import com.twitter.storehaus.FutureOps
-import com.twitter.storehaus.ReadableStore
-import com.twitter.topic_recos.thriftscala.Scores
-import com.twitter.topic_recos.thriftscala.TopicToScores
-import com.twitter.util.Future
-
-/**
- * Score store to get Certo scores.
- * Currently, the store supports two Scoring Algorithms (i.e., two types of Certo scores):
- * 1. NormalizedDotProduct
- * 2. NormalizedCosine
- * Querying with corresponding scoring algorithms results in different Certo scores.
- */
-case class TopicTweetCertoScoreStore(certoStratoStore: ReadableStore[TweetId, TopicToScores])
- extends ReadableStore[ThriftScoreId, ThriftScore] {
-
- override def multiGet[K1 <: ThriftScoreId](ks: Set[K1]): Map[K1, Future[Option[ThriftScore]]] = {
- val tweetIds =
- ks.map(_.internalId).collect {
- case GenericPairScoreId(scoreId) =>
- ((scoreId.id1, scoreId.id2): @annotation.nowarn(
- "msg=may not be exhaustive|max recursion depth")) match {
- case (InternalId.TweetId(tweetId), _) => tweetId
- case (_, InternalId.TweetId(tweetId)) => tweetId
- }
- }
-
- val result = for {
- certoScores <- Future.collect(certoStratoStore.multiGet(tweetIds))
- } yield {
- ks.map { k =>
- (k.algorithm, k.internalId) match {
- case (CertoNormalizedDotProductScore, GenericPairScoreId(scoreId)) =>
- (scoreId.id1, scoreId.id2) match {
- case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) =>
- (
- k,
- extractScore(
- tweetId,
- topicId,
- certoScores,
- _.followerL2NormalizedDotProduct8HrHalfLife))
- case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) =>
- (
- k,
- extractScore(
- tweetId,
- topicId,
- certoScores,
- _.followerL2NormalizedDotProduct8HrHalfLife))
- case _ => (k, None)
- }
- case (CertoNormalizedCosineScore, GenericPairScoreId(scoreId)) =>
- (scoreId.id1, scoreId.id2) match {
- case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) =>
- (
- k,
- extractScore(
- tweetId,
- topicId,
- certoScores,
- _.followerL2NormalizedCosineSimilarity8HrHalfLife))
- case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) =>
- (
- k,
- extractScore(
- tweetId,
- topicId,
- certoScores,
- _.followerL2NormalizedCosineSimilarity8HrHalfLife))
- case _ => (k, None)
- }
- case _ => (k, None)
- }
- }.toMap
- }
- FutureOps.liftValues(ks, result)
- }
-
- /**
- * Given tweetToCertoScores, extract certain Certo score between the given tweetId and topicId.
- * The Certo score of interest is specified using scoreExtractor.
- */
- def extractScore(
- tweetId: TweetId,
- topicId: TopicId,
- tweetToCertoScores: Map[TweetId, Option[TopicToScores]],
- scoreExtractor: Scores => Double
- ): Option[ThriftScore] = {
- tweetToCertoScores.get(tweetId).flatMap {
- case Some(topicToScores) =>
- topicToScores.topicToScores.flatMap(_.get(topicId).map(scoreExtractor).map(ThriftScore(_)))
- case _ => Some(ThriftScore(0.0))
- }
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx
new file mode 100644
index 000000000..228aa2528
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala
deleted file mode 100644
index 9ff502fd6..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.twitter.representationscorer.scorestore
-
-import com.twitter.simclusters_v2.score.WeightedSumAggregatedScoreStore
-import com.twitter.simclusters_v2.score.WeightedSumAggregatedScoreStore.WeightedSumAggregatedScoreParameter
-import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, ScoringAlgorithm}
-
-object TopicTweetRankingScoreStore {
- val producerEmbeddingScoreMultiplier = 1.0
- val consumerEmbeddingScoreMultiplier = 1.0
-
- /**
- * Build the scoring store for TopicTweet Ranking based on Default Multipliers.
- * If you want to compare the ranking between different multipliers, register a new
- * ScoringAlgorithm and let the upstream uses different scoringAlgorithm by params.
- */
- def buildTopicTweetRankingStore(
- consumerEmbeddingType: EmbeddingType,
- producerEmbeddingType: EmbeddingType,
- tweetEmbeddingType: EmbeddingType,
- modelVersion: ModelVersion,
- consumerEmbeddingMultiplier: Double = consumerEmbeddingScoreMultiplier,
- producerEmbeddingMultiplier: Double = producerEmbeddingScoreMultiplier
- ): WeightedSumAggregatedScoreStore = {
- WeightedSumAggregatedScoreStore(
- List(
- WeightedSumAggregatedScoreParameter(
- ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- consumerEmbeddingMultiplier,
- WeightedSumAggregatedScoreStore.genericPairScoreIdToSimClustersEmbeddingPairScoreId(
- consumerEmbeddingType,
- tweetEmbeddingType,
- modelVersion
- )
- ),
- WeightedSumAggregatedScoreParameter(
- ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- producerEmbeddingMultiplier,
- WeightedSumAggregatedScoreStore.genericPairScoreIdToSimClustersEmbeddingPairScoreId(
- producerEmbeddingType,
- tweetEmbeddingType,
- modelVersion
- )
- )
- )
- )
- }
-
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx
new file mode 100644
index 000000000..d024c0dbe
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala
deleted file mode 100644
index f835158b8..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala
+++ /dev/null
@@ -1,148 +0,0 @@
-package com.twitter.representationscorer.scorestore
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.util.StatsUtil
-import com.twitter.representationscorer.scorestore.TopicTweetsCosineSimilarityAggregateStore.ScoreKey
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.score.AggregatedScoreStore
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId.GenericPairScoreId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CortexTopicTweetLabel
-import com.twitter.simclusters_v2.thriftscala.{
- EmbeddingType,
- InternalId,
- ModelVersion,
- ScoreInternalId,
- ScoringAlgorithm,
- SimClustersEmbeddingId,
- TopicId,
- Score => ThriftScore,
- ScoreId => ThriftScoreId,
- SimClustersEmbeddingPairScoreId => ThriftSimClustersEmbeddingPairScoreId
-}
-import com.twitter.storehaus.ReadableStore
-import com.twitter.topic_recos.common.Configs.{DefaultModelVersion, MinCosineSimilarityScore}
-import com.twitter.topic_recos.common._
-import com.twitter.util.Future
-
-/**
- * Calculates the cosine similarity scores of arbitrary combinations of TopicEmbeddings and
- * TweetEmbeddings.
- * The class has 2 uses:
- * 1. For internal uses. TSP will call this store to fetch the raw scores for (topic, tweet) with
- * all available embedding types. We calculate all the scores here, so the caller can do filtering
- * & score caching on their side. This will make it possible to DDG different embedding scores.
- *
- * 2. For external calls from Cortex. We return true (or 1.0) for any given (topic, tweet) if their
- * cosine similarity passes the threshold for any of the embedding types.
- * The expected input type is
- * ScoreId(
- * PairEmbeddingCosineSimilarity,
- * GenericPairScoreId(TopicId, TweetId)
- * )
- */
-case class TopicTweetsCosineSimilarityAggregateStore(
- scoreKeys: Seq[ScoreKey],
- statsReceiver: StatsReceiver)
- extends AggregatedScoreStore {
-
- def toCortexScore(scoresMap: Map[ScoreKey, Double]): Double = {
- val passThreshold = scoresMap.exists {
- case (_, score) => score >= MinCosineSimilarityScore
- }
- if (passThreshold) 1.0 else 0.0
- }
-
- /**
- * To be called by Cortex through Unified Score API ONLY. Calculates all possible (topic, tweet),
- * return 1.0 if any of the embedding scores passes the minimum threshold.
- *
- * Expect a GenericPairScoreId(PairEmbeddingCosineSimilarity, (TopicId, TweetId)) as input
- */
- override def get(k: ThriftScoreId): Future[Option[ThriftScore]] = {
- StatsUtil.trackOptionStats(statsReceiver) {
- (k.algorithm, k.internalId) match {
- case (CortexTopicTweetLabel, GenericPairScoreId(genericPairScoreId)) =>
- (genericPairScoreId.id1, genericPairScoreId.id2) match {
- case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) =>
- TopicTweetsCosineSimilarityAggregateStore
- .getRawScoresMap(topicId, tweetId, scoreKeys, scoreFacadeStore)
- .map { scoresMap => Some(ThriftScore(toCortexScore(scoresMap))) }
- case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) =>
- TopicTweetsCosineSimilarityAggregateStore
- .getRawScoresMap(topicId, tweetId, scoreKeys, scoreFacadeStore)
- .map { scoresMap => Some(ThriftScore(toCortexScore(scoresMap))) }
- case _ =>
- Future.None
- // Do not accept other InternalId combinations
- }
- case _ =>
- // Do not accept other Id types for now
- Future.None
- }
- }
- }
-}
-
-object TopicTweetsCosineSimilarityAggregateStore {
-
- val TopicEmbeddingTypes: Seq[EmbeddingType] =
- Seq(
- EmbeddingType.FavTfgTopic,
- EmbeddingType.LogFavBasedKgoApeTopic
- )
-
- // Add the new embedding types if want to test the new Tweet embedding performance.
- val TweetEmbeddingTypes: Seq[EmbeddingType] = Seq(EmbeddingType.LogFavBasedTweet)
-
- val ModelVersions: Seq[ModelVersion] =
- Seq(DefaultModelVersion)
-
- val DefaultScoreKeys: Seq[ScoreKey] = {
- for {
- modelVersion <- ModelVersions
- topicEmbeddingType <- TopicEmbeddingTypes
- tweetEmbeddingType <- TweetEmbeddingTypes
- } yield {
- ScoreKey(
- topicEmbeddingType = topicEmbeddingType,
- tweetEmbeddingType = tweetEmbeddingType,
- modelVersion = modelVersion
- )
- }
- }
- case class ScoreKey(
- topicEmbeddingType: EmbeddingType,
- tweetEmbeddingType: EmbeddingType,
- modelVersion: ModelVersion)
-
- def getRawScoresMap(
- topicId: TopicId,
- tweetId: TweetId,
- scoreKeys: Seq[ScoreKey],
- uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore]
- ): Future[Map[ScoreKey, Double]] = {
- val scoresMapFut = scoreKeys.map { key =>
- val scoreInternalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
- ThriftSimClustersEmbeddingPairScoreId(
- buildTopicEmbedding(topicId, key.topicEmbeddingType, key.modelVersion),
- SimClustersEmbeddingId(
- key.tweetEmbeddingType,
- key.modelVersion,
- InternalId.TweetId(tweetId))
- ))
- val scoreFut = uniformScoringStore
- .get(
- ThriftScoreId(
- algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Hard code as cosine sim
- internalId = scoreInternalId
- ))
- key -> scoreFut
- }.toMap
-
- Future
- .collect(scoresMapFut).map(_.collect {
- case (key, Some(ThriftScore(score))) =>
- (key, score)
- })
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD
deleted file mode 100644
index 1c617e9a0..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/github/ben-manes/caffeine",
- "finatra/inject/inject-core/src/main/scala",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common",
- "representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore",
- "representation-scorer/server/src/main/thrift:thrift-scala",
- "src/thrift/com/twitter/twistly:twistly-scala",
- "stitch/stitch-core",
- "stitch/stitch-core:cache",
- "strato/config/columns/recommendations/twistly:twistly-strato-client",
- "strato/config/columns/recommendations/user-signal-service:user-signal-service-strato-client",
- "strato/src/main/scala/com/twitter/strato/client",
- "user-signal-service/thrift/src/main/thrift:thrift-scala",
- "util/util-core",
- ],
-)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx
new file mode 100644
index 000000000..536e20f40
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx
new file mode 100644
index 000000000..c49d547d2
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala
deleted file mode 100644
index 2da828ce6..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala
+++ /dev/null
@@ -1,65 +0,0 @@
-package com.twitter.representationscorer.twistlyfeatures
-
-import com.twitter.conversions.DurationOps._
-import com.twitter.util.Duration
-import com.twitter.util.Time
-
-case class Engagements(
- favs7d: Seq[UserSignal] = Nil,
- retweets7d: Seq[UserSignal] = Nil,
- follows30d: Seq[UserSignal] = Nil,
- shares7d: Seq[UserSignal] = Nil,
- replies7d: Seq[UserSignal] = Nil,
- originalTweets7d: Seq[UserSignal] = Nil,
- videoPlaybacks7d: Seq[UserSignal] = Nil,
- block30d: Seq[UserSignal] = Nil,
- mute30d: Seq[UserSignal] = Nil,
- report30d: Seq[UserSignal] = Nil,
- dontlike30d: Seq[UserSignal] = Nil,
- seeFewer30d: Seq[UserSignal] = Nil) {
-
- import Engagements._
-
- private val now = Time.now
- private val oneDayAgo = (now - OneDaySpan).inMillis
- private val sevenDaysAgo = (now - SevenDaysSpan).inMillis
-
- // All ids from the signals grouped by type (tweetIds, userIds, etc)
- val tweetIds: Seq[Long] =
- (favs7d ++ retweets7d ++ shares7d
- ++ replies7d ++ originalTweets7d ++ videoPlaybacks7d
- ++ report30d ++ dontlike30d ++ seeFewer30d)
- .map(_.targetId)
- val authorIds: Seq[Long] = (follows30d ++ block30d ++ mute30d).map(_.targetId)
-
- // Tweet signals
- val dontlike7d: Seq[UserSignal] = dontlike30d.filter(_.timestamp > sevenDaysAgo)
- val seeFewer7d: Seq[UserSignal] = seeFewer30d.filter(_.timestamp > sevenDaysAgo)
-
- val favs1d: Seq[UserSignal] = favs7d.filter(_.timestamp > oneDayAgo)
- val retweets1d: Seq[UserSignal] = retweets7d.filter(_.timestamp > oneDayAgo)
- val shares1d: Seq[UserSignal] = shares7d.filter(_.timestamp > oneDayAgo)
- val replies1d: Seq[UserSignal] = replies7d.filter(_.timestamp > oneDayAgo)
- val originalTweets1d: Seq[UserSignal] = originalTweets7d.filter(_.timestamp > oneDayAgo)
- val videoPlaybacks1d: Seq[UserSignal] = videoPlaybacks7d.filter(_.timestamp > oneDayAgo)
- val dontlike1d: Seq[UserSignal] = dontlike7d.filter(_.timestamp > oneDayAgo)
- val seeFewer1d: Seq[UserSignal] = seeFewer7d.filter(_.timestamp > oneDayAgo)
-
- // User signals
- val follows7d: Seq[UserSignal] = follows30d.filter(_.timestamp > sevenDaysAgo)
- val block7d: Seq[UserSignal] = block30d.filter(_.timestamp > sevenDaysAgo)
- val mute7d: Seq[UserSignal] = mute30d.filter(_.timestamp > sevenDaysAgo)
- val report7d: Seq[UserSignal] = report30d.filter(_.timestamp > sevenDaysAgo)
-
- val block1d: Seq[UserSignal] = block7d.filter(_.timestamp > oneDayAgo)
- val mute1d: Seq[UserSignal] = mute7d.filter(_.timestamp > oneDayAgo)
- val report1d: Seq[UserSignal] = report7d.filter(_.timestamp > oneDayAgo)
-}
-
-object Engagements {
- val OneDaySpan: Duration = 1.days
- val SevenDaysSpan: Duration = 7.days
- val ThirtyDaysSpan: Duration = 30.days
-}
-
-case class UserSignal(targetId: Long, timestamp: Long)
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx
new file mode 100644
index 000000000..b43f972ee
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala
deleted file mode 100644
index 71df34a19..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala
+++ /dev/null
@@ -1,3 +0,0 @@
-package com.twitter.representationscorer.twistlyfeatures
-
-case class ScoreResult(id: Long, score: Option[Double])
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx
new file mode 100644
index 000000000..b82e4eb84
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala
deleted file mode 100644
index 731412d0a..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala
+++ /dev/null
@@ -1,474 +0,0 @@
-package com.twitter.representationscorer.twistlyfeatures
-
-import com.twitter.finagle.stats.Counter
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.representationscorer.common.TweetId
-import com.twitter.representationscorer.common.UserId
-import com.twitter.representationscorer.scorestore.ScoreStore
-import com.twitter.representationscorer.thriftscala.SimClustersRecentEngagementSimilarities
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ScoreId
-import com.twitter.simclusters_v2.thriftscala.ScoreInternalId
-import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId
-import com.twitter.stitch.Stitch
-import javax.inject.Inject
-
-class Scorer @Inject() (
- fetchEngagementsFromUSS: Long => Stitch[Engagements],
- scoreStore: ScoreStore,
- stats: StatsReceiver) {
-
- import Scorer._
-
- private val scoreStats = stats.scope("score")
- private val scoreCalculationStats = scoreStats.scope("calculation")
- private val scoreResultStats = scoreStats.scope("result")
-
- private val scoresNonEmptyCounter = scoreResultStats.scope("all").counter("nonEmpty")
- private val scoresNonZeroCounter = scoreResultStats.scope("all").counter("nonZero")
-
- private val tweetScoreStats = scoreCalculationStats.scope("tweetScore").stat("latency")
- private val userScoreStats = scoreCalculationStats.scope("userScore").stat("latency")
-
- private val favNonZero = scoreResultStats.scope("favs").counter("nonZero")
- private val favNonEmpty = scoreResultStats.scope("favs").counter("nonEmpty")
-
- private val retweetsNonZero = scoreResultStats.scope("retweets").counter("nonZero")
- private val retweetsNonEmpty = scoreResultStats.scope("retweets").counter("nonEmpty")
-
- private val followsNonZero = scoreResultStats.scope("follows").counter("nonZero")
- private val followsNonEmpty = scoreResultStats.scope("follows").counter("nonEmpty")
-
- private val sharesNonZero = scoreResultStats.scope("shares").counter("nonZero")
- private val sharesNonEmpty = scoreResultStats.scope("shares").counter("nonEmpty")
-
- private val repliesNonZero = scoreResultStats.scope("replies").counter("nonZero")
- private val repliesNonEmpty = scoreResultStats.scope("replies").counter("nonEmpty")
-
- private val originalTweetsNonZero = scoreResultStats.scope("originalTweets").counter("nonZero")
- private val originalTweetsNonEmpty = scoreResultStats.scope("originalTweets").counter("nonEmpty")
-
- private val videoViewsNonZero = scoreResultStats.scope("videoViews").counter("nonZero")
- private val videoViewsNonEmpty = scoreResultStats.scope("videoViews").counter("nonEmpty")
-
- private val blockNonZero = scoreResultStats.scope("block").counter("nonZero")
- private val blockNonEmpty = scoreResultStats.scope("block").counter("nonEmpty")
-
- private val muteNonZero = scoreResultStats.scope("mute").counter("nonZero")
- private val muteNonEmpty = scoreResultStats.scope("mute").counter("nonEmpty")
-
- private val reportNonZero = scoreResultStats.scope("report").counter("nonZero")
- private val reportNonEmpty = scoreResultStats.scope("report").counter("nonEmpty")
-
- private val dontlikeNonZero = scoreResultStats.scope("dontlike").counter("nonZero")
- private val dontlikeNonEmpty = scoreResultStats.scope("dontlike").counter("nonEmpty")
-
- private val seeFewerNonZero = scoreResultStats.scope("seeFewer").counter("nonZero")
- private val seeFewerNonEmpty = scoreResultStats.scope("seeFewer").counter("nonEmpty")
-
- private def getTweetScores(
- candidateTweetId: TweetId,
- sourceTweetIds: Seq[TweetId]
- ): Stitch[Seq[ScoreResult]] = {
- val getScoresStitch = Stitch.traverse(sourceTweetIds) { sourceTweetId =>
- scoreStore
- .uniformScoringStoreStitch(getTweetScoreId(sourceTweetId, candidateTweetId))
- .liftNotFoundToOption
- .map(score => ScoreResult(sourceTweetId, score.map(_.score)))
- }
-
- Stitch.time(getScoresStitch).flatMap {
- case (tryResult, duration) =>
- tweetScoreStats.add(duration.inMillis)
- Stitch.const(tryResult)
- }
- }
-
- private def getUserScores(
- tweetId: TweetId,
- authorIds: Seq[UserId]
- ): Stitch[Seq[ScoreResult]] = {
- val getScoresStitch = Stitch.traverse(authorIds) { authorId =>
- scoreStore
- .uniformScoringStoreStitch(getAuthorScoreId(authorId, tweetId))
- .liftNotFoundToOption
- .map(score => ScoreResult(authorId, score.map(_.score)))
- }
-
- Stitch.time(getScoresStitch).flatMap {
- case (tryResult, duration) =>
- userScoreStats.add(duration.inMillis)
- Stitch.const(tryResult)
- }
- }
-
- /**
- * Get the [[SimClustersRecentEngagementSimilarities]] result containing the similarity
- * features for the given userId-TweetId.
- */
- def get(
- userId: UserId,
- tweetId: TweetId
- ): Stitch[SimClustersRecentEngagementSimilarities] = {
- get(userId, Seq(tweetId)).map(x => x.head)
- }
-
- /**
- * Get a list of [[SimClustersRecentEngagementSimilarities]] results containing the similarity
- * features for the given tweets of the user Id.
- * Guaranteed to be the same number/order as requested.
- */
- def get(
- userId: UserId,
- tweetIds: Seq[TweetId]
- ): Stitch[Seq[SimClustersRecentEngagementSimilarities]] = {
- fetchEngagementsFromUSS(userId)
- .flatMap(engagements => {
- // For each tweet received in the request, compute the similarity scores between them
- // and the user signals fetched from USS.
- Stitch
- .join(
- Stitch.traverse(tweetIds)(id => getTweetScores(id, engagements.tweetIds)),
- Stitch.traverse(tweetIds)(id => getUserScores(id, engagements.authorIds)),
- )
- .map {
- case (tweetScoresSeq, userScoreSeq) =>
- // All seq have = size because when scores don't exist, they are returned as Option
- (tweetScoresSeq, userScoreSeq).zipped.map { (tweetScores, userScores) =>
- computeSimilarityScoresPerTweet(
- engagements,
- tweetScores.groupBy(_.id),
- userScores.groupBy(_.id))
- }
- }
- })
- }
-
- /**
- *
- * Computes the [[SimClustersRecentEngagementSimilarities]]
- * using the given tweet-tweet and user-tweet scores in TweetScoresMap
- * and the user signals in [[Engagements]].
- */
- private def computeSimilarityScoresPerTweet(
- engagements: Engagements,
- tweetScores: Map[TweetId, Seq[ScoreResult]],
- authorScores: Map[UserId, Seq[ScoreResult]]
- ): SimClustersRecentEngagementSimilarities = {
- val favs7d = engagements.favs7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val favs1d = engagements.favs1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val retweets7d = engagements.retweets7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val retweets1d = engagements.retweets1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val follows30d = engagements.follows30d.view
- .flatMap(s => authorScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val follows7d = engagements.follows7d.view
- .flatMap(s => authorScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val shares7d = engagements.shares7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val shares1d = engagements.shares1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val replies7d = engagements.replies7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val replies1d = engagements.replies1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val originalTweets7d = engagements.originalTweets7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val originalTweets1d = engagements.originalTweets1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val videoViews7d = engagements.videoPlaybacks7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val videoViews1d = engagements.videoPlaybacks1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val block30d = engagements.block30d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val block7d = engagements.block7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val block1d = engagements.block1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val mute30d = engagements.mute30d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val mute7d = engagements.mute7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val mute1d = engagements.mute1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val report30d = engagements.report30d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val report7d = engagements.report7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val report1d = engagements.report1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val dontlike30d = engagements.dontlike30d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val dontlike7d = engagements.dontlike7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val dontlike1d = engagements.dontlike1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val seeFewer30d = engagements.seeFewer30d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val seeFewer7d = engagements.seeFewer7d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val seeFewer1d = engagements.seeFewer1d.view
- .flatMap(s => tweetScores.get(s.targetId))
- .flatten.flatMap(_.score)
- .force
-
- val result = SimClustersRecentEngagementSimilarities(
- fav1dLast10Max = max(favs1d),
- fav1dLast10Avg = avg(favs1d),
- fav7dLast10Max = max(favs7d),
- fav7dLast10Avg = avg(favs7d),
- retweet1dLast10Max = max(retweets1d),
- retweet1dLast10Avg = avg(retweets1d),
- retweet7dLast10Max = max(retweets7d),
- retweet7dLast10Avg = avg(retweets7d),
- follow7dLast10Max = max(follows7d),
- follow7dLast10Avg = avg(follows7d),
- follow30dLast10Max = max(follows30d),
- follow30dLast10Avg = avg(follows30d),
- share1dLast10Max = max(shares1d),
- share1dLast10Avg = avg(shares1d),
- share7dLast10Max = max(shares7d),
- share7dLast10Avg = avg(shares7d),
- reply1dLast10Max = max(replies1d),
- reply1dLast10Avg = avg(replies1d),
- reply7dLast10Max = max(replies7d),
- reply7dLast10Avg = avg(replies7d),
- originalTweet1dLast10Max = max(originalTweets1d),
- originalTweet1dLast10Avg = avg(originalTweets1d),
- originalTweet7dLast10Max = max(originalTweets7d),
- originalTweet7dLast10Avg = avg(originalTweets7d),
- videoPlayback1dLast10Max = max(videoViews1d),
- videoPlayback1dLast10Avg = avg(videoViews1d),
- videoPlayback7dLast10Max = max(videoViews7d),
- videoPlayback7dLast10Avg = avg(videoViews7d),
- block1dLast10Max = max(block1d),
- block1dLast10Avg = avg(block1d),
- block7dLast10Max = max(block7d),
- block7dLast10Avg = avg(block7d),
- block30dLast10Max = max(block30d),
- block30dLast10Avg = avg(block30d),
- mute1dLast10Max = max(mute1d),
- mute1dLast10Avg = avg(mute1d),
- mute7dLast10Max = max(mute7d),
- mute7dLast10Avg = avg(mute7d),
- mute30dLast10Max = max(mute30d),
- mute30dLast10Avg = avg(mute30d),
- report1dLast10Max = max(report1d),
- report1dLast10Avg = avg(report1d),
- report7dLast10Max = max(report7d),
- report7dLast10Avg = avg(report7d),
- report30dLast10Max = max(report30d),
- report30dLast10Avg = avg(report30d),
- dontlike1dLast10Max = max(dontlike1d),
- dontlike1dLast10Avg = avg(dontlike1d),
- dontlike7dLast10Max = max(dontlike7d),
- dontlike7dLast10Avg = avg(dontlike7d),
- dontlike30dLast10Max = max(dontlike30d),
- dontlike30dLast10Avg = avg(dontlike30d),
- seeFewer1dLast10Max = max(seeFewer1d),
- seeFewer1dLast10Avg = avg(seeFewer1d),
- seeFewer7dLast10Max = max(seeFewer7d),
- seeFewer7dLast10Avg = avg(seeFewer7d),
- seeFewer30dLast10Max = max(seeFewer30d),
- seeFewer30dLast10Avg = avg(seeFewer30d),
- )
- trackStats(result)
- result
- }
-
- private def trackStats(result: SimClustersRecentEngagementSimilarities): Unit = {
- val scores = Seq(
- result.fav7dLast10Max,
- result.retweet7dLast10Max,
- result.follow30dLast10Max,
- result.share1dLast10Max,
- result.share7dLast10Max,
- result.reply7dLast10Max,
- result.originalTweet7dLast10Max,
- result.videoPlayback7dLast10Max,
- result.block30dLast10Max,
- result.mute30dLast10Max,
- result.report30dLast10Max,
- result.dontlike30dLast10Max,
- result.seeFewer30dLast10Max
- )
-
- val nonEmpty = scores.exists(_.isDefined)
- val nonZero = scores.exists { case Some(score) if score > 0 => true; case _ => false }
-
- if (nonEmpty) {
- scoresNonEmptyCounter.incr()
- }
-
- if (nonZero) {
- scoresNonZeroCounter.incr()
- }
-
- // We use the largest window of a given type of score,
- // because the largest window is inclusive of smaller windows.
- trackSignalStats(favNonEmpty, favNonZero, result.fav7dLast10Avg)
- trackSignalStats(retweetsNonEmpty, retweetsNonZero, result.retweet7dLast10Avg)
- trackSignalStats(followsNonEmpty, followsNonZero, result.follow30dLast10Avg)
- trackSignalStats(sharesNonEmpty, sharesNonZero, result.share7dLast10Avg)
- trackSignalStats(repliesNonEmpty, repliesNonZero, result.reply7dLast10Avg)
- trackSignalStats(originalTweetsNonEmpty, originalTweetsNonZero, result.originalTweet7dLast10Avg)
- trackSignalStats(videoViewsNonEmpty, videoViewsNonZero, result.videoPlayback7dLast10Avg)
- trackSignalStats(blockNonEmpty, blockNonZero, result.block30dLast10Avg)
- trackSignalStats(muteNonEmpty, muteNonZero, result.mute30dLast10Avg)
- trackSignalStats(reportNonEmpty, reportNonZero, result.report30dLast10Avg)
- trackSignalStats(dontlikeNonEmpty, dontlikeNonZero, result.dontlike30dLast10Avg)
- trackSignalStats(seeFewerNonEmpty, seeFewerNonZero, result.seeFewer30dLast10Avg)
- }
-
- private def trackSignalStats(nonEmpty: Counter, nonZero: Counter, score: Option[Double]): Unit = {
- if (score.nonEmpty) {
- nonEmpty.incr()
-
- if (score.get > 0)
- nonZero.incr()
- }
- }
-}
-
-object Scorer {
- def avg(s: Traversable[Double]): Option[Double] =
- if (s.isEmpty) None else Some(s.sum / s.size)
- def max(s: Traversable[Double]): Option[Double] =
- if (s.isEmpty) None else Some(s.foldLeft(0.0D) { (curr, _max) => math.max(curr, _max) })
-
- private def getAuthorScoreId(
- userId: UserId,
- tweetId: TweetId
- ) = {
- ScoreId(
- algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingId(
- internalId = InternalId.UserId(userId),
- modelVersion = ModelVersion.Model20m145k2020,
- embeddingType = EmbeddingType.FavBasedProducer
- ),
- SimClustersEmbeddingId(
- internalId = InternalId.TweetId(tweetId),
- modelVersion = ModelVersion.Model20m145k2020,
- embeddingType = EmbeddingType.LogFavBasedTweet
- )
- ))
- )
- }
-
- private def getTweetScoreId(
- sourceTweetId: TweetId,
- candidateTweetId: TweetId
- ) = {
- ScoreId(
- algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
- internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingPairScoreId(
- SimClustersEmbeddingId(
- internalId = InternalId.TweetId(sourceTweetId),
- modelVersion = ModelVersion.Model20m145k2020,
- embeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet
- ),
- SimClustersEmbeddingId(
- internalId = InternalId.TweetId(candidateTweetId),
- modelVersion = ModelVersion.Model20m145k2020,
- embeddingType = EmbeddingType.LogFavBasedTweet
- )
- ))
- )
- }
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx
new file mode 100644
index 000000000..74f054937
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala
deleted file mode 100644
index fb09c1e57..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala
+++ /dev/null
@@ -1,155 +0,0 @@
-package com.twitter.representationscorer.twistlyfeatures
-
-import com.twitter.decider.SimpleRecipient
-import com.twitter.finagle.stats.Stat
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.representationscorer.common._
-import com.twitter.representationscorer.twistlyfeatures.Engagements._
-import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongInternalId
-import com.twitter.stitch.Stitch
-import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn
-import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn.Value
-import com.twitter.usersignalservice.thriftscala.BatchSignalRequest
-import com.twitter.usersignalservice.thriftscala.SignalRequest
-import com.twitter.usersignalservice.thriftscala.SignalType
-import com.twitter.util.Time
-import scala.collection.mutable.ArrayBuffer
-import com.twitter.usersignalservice.thriftscala.ClientIdentifier
-
-class UserSignalServiceRecentEngagementsClient(
- stratoClient: SignalsClientColumn,
- decider: RepresentationScorerDecider,
- stats: StatsReceiver) {
-
- import UserSignalServiceRecentEngagementsClient._
-
- private val signalStats = stats.scope("user-signal-service", "signal")
- private val signalTypeStats: Map[SignalType, Stat] =
- SignalType.list.map(s => (s, signalStats.scope(s.name).stat("size"))).toMap
-
- def get(userId: UserId): Stitch[Engagements] = {
- val request = buildRequest(userId)
- stratoClient.fetcher.fetch(request).map(_.v).lowerFromOption().map { response =>
- val now = Time.now
- val sevenDaysAgo = now - SevenDaysSpan
- val thirtyDaysAgo = now - ThirtyDaysSpan
-
- Engagements(
- favs7d = getUserSignals(response, SignalType.TweetFavorite, sevenDaysAgo),
- retweets7d = getUserSignals(response, SignalType.Retweet, sevenDaysAgo),
- follows30d = getUserSignals(response, SignalType.AccountFollowWithDelay, thirtyDaysAgo),
- shares7d = getUserSignals(response, SignalType.TweetShareV1, sevenDaysAgo),
- replies7d = getUserSignals(response, SignalType.Reply, sevenDaysAgo),
- originalTweets7d = getUserSignals(response, SignalType.OriginalTweet, sevenDaysAgo),
- videoPlaybacks7d =
- getUserSignals(response, SignalType.VideoView90dPlayback50V1, sevenDaysAgo),
- block30d = getUserSignals(response, SignalType.AccountBlock, thirtyDaysAgo),
- mute30d = getUserSignals(response, SignalType.AccountMute, thirtyDaysAgo),
- report30d = getUserSignals(response, SignalType.TweetReport, thirtyDaysAgo),
- dontlike30d = getUserSignals(response, SignalType.TweetDontLike, thirtyDaysAgo),
- seeFewer30d = getUserSignals(response, SignalType.TweetSeeFewer, thirtyDaysAgo),
- )
- }
- }
-
- private def getUserSignals(
- response: Value,
- signalType: SignalType,
- earliestValidTimestamp: Time
- ): Seq[UserSignal] = {
- val signals = response.signalResponse
- .getOrElse(signalType, Seq.empty)
- .view
- .filter(_.timestamp > earliestValidTimestamp.inMillis)
- .map(s => s.targetInternalId.collect { case LongInternalId(id) => (id, s.timestamp) })
- .collect { case Some((id, engagedAt)) => UserSignal(id, engagedAt) }
- .take(EngagementsToScore)
- .force
-
- signalTypeStats(signalType).add(signals.size)
- signals
- }
-
- private def buildRequest(userId: Long) = {
- val recipient = Some(SimpleRecipient(userId))
-
- // Signals RSX always fetches
- val requestSignals = ArrayBuffer(
- SignalRequestFav,
- SignalRequestRetweet,
- SignalRequestFollow
- )
-
- // Signals under experimentation. We use individual deciders to disable them if necessary.
- // If experiments are successful, they will become permanent.
- if (decider.isAvailable(FetchSignalShareDeciderKey, recipient))
- requestSignals.append(SignalRequestShare)
-
- if (decider.isAvailable(FetchSignalReplyDeciderKey, recipient))
- requestSignals.append(SignalRequestReply)
-
- if (decider.isAvailable(FetchSignalOriginalTweetDeciderKey, recipient))
- requestSignals.append(SignalRequestOriginalTweet)
-
- if (decider.isAvailable(FetchSignalVideoPlaybackDeciderKey, recipient))
- requestSignals.append(SignalRequestVideoPlayback)
-
- if (decider.isAvailable(FetchSignalBlockDeciderKey, recipient))
- requestSignals.append(SignalRequestBlock)
-
- if (decider.isAvailable(FetchSignalMuteDeciderKey, recipient))
- requestSignals.append(SignalRequestMute)
-
- if (decider.isAvailable(FetchSignalReportDeciderKey, recipient))
- requestSignals.append(SignalRequestReport)
-
- if (decider.isAvailable(FetchSignalDontlikeDeciderKey, recipient))
- requestSignals.append(SignalRequestDontlike)
-
- if (decider.isAvailable(FetchSignalSeeFewerDeciderKey, recipient))
- requestSignals.append(SignalRequestSeeFewer)
-
- BatchSignalRequest(userId, requestSignals, Some(ClientIdentifier.RepresentationScorerHome))
- }
-}
-
-object UserSignalServiceRecentEngagementsClient {
- val FetchSignalShareDeciderKey = "representation_scorer_fetch_signal_share"
- val FetchSignalReplyDeciderKey = "representation_scorer_fetch_signal_reply"
- val FetchSignalOriginalTweetDeciderKey = "representation_scorer_fetch_signal_original_tweet"
- val FetchSignalVideoPlaybackDeciderKey = "representation_scorer_fetch_signal_video_playback"
- val FetchSignalBlockDeciderKey = "representation_scorer_fetch_signal_block"
- val FetchSignalMuteDeciderKey = "representation_scorer_fetch_signal_mute"
- val FetchSignalReportDeciderKey = "representation_scorer_fetch_signal_report"
- val FetchSignalDontlikeDeciderKey = "representation_scorer_fetch_signal_dont_like"
- val FetchSignalSeeFewerDeciderKey = "representation_scorer_fetch_signal_see_fewer"
-
- val EngagementsToScore = 10
- private val engagementsToScoreOpt: Option[Long] = Some(EngagementsToScore)
-
- val SignalRequestFav: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.TweetFavorite)
- val SignalRequestRetweet: SignalRequest = SignalRequest(engagementsToScoreOpt, SignalType.Retweet)
- val SignalRequestFollow: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.AccountFollowWithDelay)
- // New experimental signals
- val SignalRequestShare: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.TweetShareV1)
- val SignalRequestReply: SignalRequest = SignalRequest(engagementsToScoreOpt, SignalType.Reply)
- val SignalRequestOriginalTweet: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.OriginalTweet)
- val SignalRequestVideoPlayback: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.VideoView90dPlayback50V1)
-
- // Negative signals
- val SignalRequestBlock: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.AccountBlock)
- val SignalRequestMute: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.AccountMute)
- val SignalRequestReport: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.TweetReport)
- val SignalRequestDontlike: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.TweetDontLike)
- val SignalRequestSeeFewer: SignalRequest =
- SignalRequest(engagementsToScoreOpt, SignalType.TweetSeeFewer)
-}
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx
new file mode 100644
index 000000000..8b8376276
Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx differ
diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala
deleted file mode 100644
index ee9f61df4..000000000
--- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-package com.twitter.representationscorer.twistlyfeatures
-
-import com.github.benmanes.caffeine.cache.Caffeine
-import com.twitter.stitch.cache.EvictingCache
-import com.google.inject.Provides
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.inject.TwitterModule
-import com.twitter.representationscorer.common.RepresentationScorerDecider
-import com.twitter.stitch.Stitch
-import com.twitter.stitch.cache.ConcurrentMapCache
-import com.twitter.stitch.cache.MemoizeQuery
-import com.twitter.strato.client.Client
-import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn
-import java.util.concurrent.ConcurrentMap
-import java.util.concurrent.TimeUnit
-import javax.inject.Singleton
-
-object UserSignalServiceRecentEngagementsClientModule extends TwitterModule {
-
- @Singleton
- @Provides
- def provide(
- client: Client,
- decider: RepresentationScorerDecider,
- statsReceiver: StatsReceiver
- ): Long => Stitch[Engagements] = {
- val stratoClient = new SignalsClientColumn(client)
-
- /*
- This cache holds a users recent engagements for a short period of time, such that batched requests
- for multiple (userid, tweetid) pairs don't all need to fetch them.
-
- [1] Caffeine cache keys/values must be objects, so we cannot use the `Long` primitive directly.
- The boxed java.lang.Long works as a key, since it is an object. In most situations the compiler
- can see where auto(un)boxing can occur. However, here we seem to need some wrapper functions
- with explicit types to allow the boxing to happen.
- */
- val mapCache: ConcurrentMap[java.lang.Long, Stitch[Engagements]] =
- Caffeine
- .newBuilder()
- .expireAfterWrite(5, TimeUnit.SECONDS)
- .maximumSize(
- 1000 // We estimate 5M unique users in a 5m period - with 2k RSX instances, assume that one will see < 1k in a 5s period
- )
- .build[java.lang.Long, Stitch[Engagements]]
- .asMap
-
- statsReceiver.provideGauge("ussRecentEngagementsClient", "cache_size") { mapCache.size.toFloat }
-
- val engagementsClient =
- new UserSignalServiceRecentEngagementsClient(stratoClient, decider, statsReceiver)
-
- val f = (l: java.lang.Long) => engagementsClient.get(l) // See note [1] above
- val cachedCall = MemoizeQuery(f, EvictingCache.lazily(new ConcurrentMapCache(mapCache)))
- (l: Long) => cachedCall(l) // see note [1] above
- }
-}
diff --git a/representation-scorer/server/src/main/thrift/BUILD b/representation-scorer/server/src/main/thrift/BUILD
deleted file mode 100644
index f7ea37675..000000000
--- a/representation-scorer/server/src/main/thrift/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-create_thrift_libraries(
- base_name = "thrift",
- sources = [
- "com/twitter/representationscorer/service.thrift",
- ],
- platform = "java8",
- tags = [
- "bazel-compatible",
- ],
- dependency_roots = [
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift",
- ],
- generate_languages = [
- "java",
- "scala",
- "strato",
- ],
- provides_java_name = "representationscorer-service-thrift-java",
- provides_scala_name = "representationscorer-service-thrift-scala",
-)
diff --git a/representation-scorer/server/src/main/thrift/BUILD.docx b/representation-scorer/server/src/main/thrift/BUILD.docx
new file mode 100644
index 000000000..87fb52cdd
Binary files /dev/null and b/representation-scorer/server/src/main/thrift/BUILD.docx differ
diff --git a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx
new file mode 100644
index 000000000..6b8d943a3
Binary files /dev/null and b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx differ
diff --git a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift
deleted file mode 100644
index 0e2f23a31..000000000
--- a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift
+++ /dev/null
@@ -1,106 +0,0 @@
-namespace java com.twitter.representationscorer.thriftjava
-#@namespace scala com.twitter.representationscorer.thriftscala
-#@namespace strato com.twitter.representationscorer
-
-include "com/twitter/simclusters_v2/identifier.thrift"
-include "com/twitter/simclusters_v2/online_store.thrift"
-include "com/twitter/simclusters_v2/score.thrift"
-
-struct SimClustersRecentEngagementSimilarities {
- // All scores computed using cosine similarity
- // 1 - 1000 Positive Signals
- 1: optional double fav1dLast10Max // max score from last 10 faves in the last 1 day
- 2: optional double fav1dLast10Avg // avg score from last 10 faves in the last 1 day
- 3: optional double fav7dLast10Max // max score from last 10 faves in the last 7 days
- 4: optional double fav7dLast10Avg // avg score from last 10 faves in the last 7 days
- 5: optional double retweet1dLast10Max // max score from last 10 retweets in the last 1 day
- 6: optional double retweet1dLast10Avg // avg score from last 10 retweets in the last 1 day
- 7: optional double retweet7dLast10Max // max score from last 10 retweets in the last 7 days
- 8: optional double retweet7dLast10Avg // avg score from last 10 retweets in the last 7 days
- 9: optional double follow7dLast10Max // max score from the last 10 follows in the last 7 days
- 10: optional double follow7dLast10Avg // avg score from the last 10 follows in the last 7 days
- 11: optional double follow30dLast10Max // max score from the last 10 follows in the last 30 days
- 12: optional double follow30dLast10Avg // avg score from the last 10 follows in the last 30 days
- 13: optional double share1dLast10Max // max score from last 10 shares in the last 1 day
- 14: optional double share1dLast10Avg // avg score from last 10 shares in the last 1 day
- 15: optional double share7dLast10Max // max score from last 10 shares in the last 7 days
- 16: optional double share7dLast10Avg // avg score from last 10 shares in the last 7 days
- 17: optional double reply1dLast10Max // max score from last 10 replies in the last 1 day
- 18: optional double reply1dLast10Avg // avg score from last 10 replies in the last 1 day
- 19: optional double reply7dLast10Max // max score from last 10 replies in the last 7 days
- 20: optional double reply7dLast10Avg // avg score from last 10 replies in the last 7 days
- 21: optional double originalTweet1dLast10Max // max score from last 10 original tweets in the last 1 day
- 22: optional double originalTweet1dLast10Avg // avg score from last 10 original tweets in the last 1 day
- 23: optional double originalTweet7dLast10Max // max score from last 10 original tweets in the last 7 days
- 24: optional double originalTweet7dLast10Avg // avg score from last 10 original tweets in the last 7 days
- 25: optional double videoPlayback1dLast10Max // max score from last 10 video playback50 in the last 1 day
- 26: optional double videoPlayback1dLast10Avg // avg score from last 10 video playback50 in the last 1 day
- 27: optional double videoPlayback7dLast10Max // max score from last 10 video playback50 in the last 7 days
- 28: optional double videoPlayback7dLast10Avg // avg score from last 10 video playback50 in the last 7 days
-
- // 1001 - 2000 Implicit Signals (no fields are defined in this range)
-
- // 2001 - 3000 Negative Signals
- // Block Series (presumably avg/max over the last 10 blocks per window, as for the positive signals - confirm)
- 2001: optional double block1dLast10Avg
- 2002: optional double block1dLast10Max
- 2003: optional double block7dLast10Avg
- 2004: optional double block7dLast10Max
- 2005: optional double block30dLast10Avg
- 2006: optional double block30dLast10Max
- // Mute Series
- 2101: optional double mute1dLast10Avg
- 2102: optional double mute1dLast10Max
- 2103: optional double mute7dLast10Avg
- 2104: optional double mute7dLast10Max
- 2105: optional double mute30dLast10Avg
- 2106: optional double mute30dLast10Max
- // Report Series
- 2201: optional double report1dLast10Avg
- 2202: optional double report1dLast10Max
- 2203: optional double report7dLast10Avg
- 2204: optional double report7dLast10Max
- 2205: optional double report30dLast10Avg
- 2206: optional double report30dLast10Max
- // Dontlike Series
- 2301: optional double dontlike1dLast10Avg
- 2302: optional double dontlike1dLast10Max
- 2303: optional double dontlike7dLast10Avg
- 2304: optional double dontlike7dLast10Max
- 2305: optional double dontlike30dLast10Avg
- 2306: optional double dontlike30dLast10Max
- // SeeFewer Series
- 2401: optional double seeFewer1dLast10Avg
- 2402: optional double seeFewer1dLast10Max
- 2403: optional double seeFewer7dLast10Avg
- 2404: optional double seeFewer7dLast10Max
- 2405: optional double seeFewer30dLast10Avg
- 2406: optional double seeFewer30dLast10Max
-}(persisted='true', hasPersonalData = 'true')
-
-/*
- * List score API
- */
-struct ListScoreId { // request key: one target plus the candidates to score (presumably against that target)
- 1: required score.ScoringAlgorithm algorithm
- 2: required online_store.ModelVersion modelVersion
- 3: required identifier.EmbeddingType targetEmbeddingType
- 4: required identifier.InternalId targetId
- 5: required identifier.EmbeddingType candidateEmbeddingType
- 6: required list<identifier.InternalId> candidateIds // element type assumed to match targetId - confirm
-}(hasPersonalData = 'true')
-
-struct ScoreResult {
- // This API does not communicate why a score is missing. For example, it may be unavailable
- // because the referenced entities do not exist (e.g. the embedding was not found) or because
- // timeouts prevented us from calculating it.
- 1: optional double score // left unset when the score is missing
-}
-
-struct ListScoreResponse { // response carrying one ScoreResult per requested entry
- 1: required list<ScoreResult> scores // Guaranteed to be the same number/order as requested
-}
-
-struct RecentEngagementSimilaritiesResponse { // response carrying one similarities record per requested entry
- 1: required list<SimClustersRecentEngagementSimilarities> results // Guaranteed to be the same number/order as requested
-}
diff --git a/science/search/ingester/config/README.docx b/science/search/ingester/config/README.docx
new file mode 100644
index 000000000..d230edd25
Binary files /dev/null and b/science/search/ingester/config/README.docx differ
diff --git a/science/search/ingester/config/README.md b/science/search/ingester/config/README.md
deleted file mode 100644
index 34f69d6e6..000000000
--- a/science/search/ingester/config/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-## Ingester Configs
-This directory contains pipeline configurations for the tweet ingesters (realtime, protected and realtime_cg) and the user-updates ingester. The pipeline configurations define an ordered sequence of stages that the tweet or user update goes through before reaching Earlybird. Source code for the various stages referenced in the configs can be found at src/java/com/twitter/search/ingester/pipeline/twitter.
\ No newline at end of file
diff --git a/science/search/ingester/config/pipeline-indexer.userupdates.docx b/science/search/ingester/config/pipeline-indexer.userupdates.docx
new file mode 100644
index 000000000..fe11d55a5
Binary files /dev/null and b/science/search/ingester/config/pipeline-indexer.userupdates.docx differ
diff --git a/science/search/ingester/config/pipeline-indexer.userupdates.xml b/science/search/ingester/config/pipeline-indexer.userupdates.xml
deleted file mode 100644
index f422b511d..000000000
--- a/science/search/ingester/config/pipeline-indexer.userupdates.xml
+++ /dev/null
@@ -1,30 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/science/search/ingester/config/pipeline-ingester.protected.docx b/science/search/ingester/config/pipeline-ingester.protected.docx
new file mode 100644
index 000000000..7ff79d6fc
Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.protected.docx differ
diff --git a/science/search/ingester/config/pipeline-ingester.protected.xml b/science/search/ingester/config/pipeline-ingester.protected.xml
deleted file mode 100644
index 434a621c2..000000000
--- a/science/search/ingester/config/pipeline-ingester.protected.xml
+++ /dev/null
@@ -1,202 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/science/search/ingester/config/pipeline-ingester.realtime.docx b/science/search/ingester/config/pipeline-ingester.realtime.docx
new file mode 100644
index 000000000..57381fe03
Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.realtime.docx differ
diff --git a/science/search/ingester/config/pipeline-ingester.realtime.xml b/science/search/ingester/config/pipeline-ingester.realtime.xml
deleted file mode 100644
index 65700bed2..000000000
--- a/science/search/ingester/config/pipeline-ingester.realtime.xml
+++ /dev/null
@@ -1,240 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/science/search/ingester/config/pipeline-ingester.realtime_cg.docx b/science/search/ingester/config/pipeline-ingester.realtime_cg.docx
new file mode 100644
index 000000000..b9dc5feb3
Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.realtime_cg.docx differ
diff --git a/science/search/ingester/config/pipeline-ingester.realtime_cg.xml b/science/search/ingester/config/pipeline-ingester.realtime_cg.xml
deleted file mode 100644
index 617af252e..000000000
--- a/science/search/ingester/config/pipeline-ingester.realtime_cg.xml
+++ /dev/null
@@ -1,199 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/simclusters-ann/BUILD.bazel b/simclusters-ann/BUILD.bazel
deleted file mode 100644
index 1624a57d4..000000000
--- a/simclusters-ann/BUILD.bazel
+++ /dev/null
@@ -1 +0,0 @@
-# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD
diff --git a/simclusters-ann/BUILD.docx b/simclusters-ann/BUILD.docx
new file mode 100644
index 000000000..99307005e
Binary files /dev/null and b/simclusters-ann/BUILD.docx differ
diff --git a/simclusters-ann/README.docx b/simclusters-ann/README.docx
new file mode 100644
index 000000000..407662cc7
Binary files /dev/null and b/simclusters-ann/README.docx differ
diff --git a/simclusters-ann/README.md b/simclusters-ann/README.md
deleted file mode 100644
index 69ff6cffa..000000000
--- a/simclusters-ann/README.md
+++ /dev/null
@@ -1,99 +0,0 @@
-# SimClusters ANN
-
-SimClusters ANN is a service that returns tweet candidate recommendations given a SimClusters embedding. The service implements tweet recommendations based on the Approximate Cosine Similarity algorithm.
-
-The cosine similarity between two Tweet SimClusters Embedding represents the relevance level of two tweets in SimCluster space. The traditional algorithm for calculating cosine similarity is expensive and hard to support by the existing infrastructure. Therefore, the Approximate Cosine Similarity algorithm is introduced to save response time by reducing I/O operations.
-
-## Background
-SimClusters V2 runtime infra introduces the SimClusters and its online and offline approaches. A heron job builds the mapping between SimClusters and Tweets. The job saves top 400 Tweets for a SimClusters and top 100 SimClusters for a Tweet. Favorite score and follow score are two types of tweet score. In the document, the top 100 SimClusters based on the favorite score for a Tweet stands for the Tweet SimClusters Embedding.
-
-The cosine similarity between two Tweet SimClusters Embeddings represents the relevance level of the two tweets in SimCluster space. The score varies from 0 to 1. A high cosine similarity score (>= 0.7 in Prod) means that the users who like the two tweets share the same SimClusters.
-
-
-SimClusters from the Linear Algebra Perspective discussed the difference between the dot-product and cosine similarity in SimCluster space. We believe the cosine similarity approach is better because it avoids the bias of tweet popularity.
-
- However, calculating the cosine similarity between two Tweets is pretty expensive in Tweet candidate generation. In TWISTLY, we scan at most 15,000 (6 source tweets * 25 clusters * 100 tweets per cluster) tweet candidates for every Home Timeline request. The traditional algorithm needs to make API calls to fetch 15,000 tweet SimCluster embeddings. Considering that we need to process over 6,000 RPS, this is hard to support with the existing infrastructure.
-
-
-## SimClusters Approximate Cosine Similarity Core Algorithm
-
-1. Provide a source SimCluster Embedding *SV*, *SV = [(SC1, Score), (SC2, Score), (SC3, Score) …]*
-
-2. Fetch top *M* tweets for each Top *N* SimClusters based on SV. In Prod, *M = 400*, *N = 50*. Tweets may appear in multiple SimClusters.
-
-| | | | |
-|---|---|---|---|
-| SC1 | T1:Score | T2: Score | ... |
-| SC2 | T3: Score | T4: Score | ... |
-
-
-3. Based on the previous table, generate an *(M x N) x N* Matrix *R*. The *R* represents the approximate SimCluster embeddings for *MxN* tweets. The embedding only contains top *N* SimClusters from *SV*. Only top *M* tweets from each SimCluster have the score. Others are 0.
-
-| | SC1 | SC2 | ... |
-|---|---|---|---|
-| T1 | Score | 0 | ... |
-| T2 | Score | 0 | ... |
-| T3 | 0 | Score | ... |
-
-4. Compute the dot product between source vector and the approximate vectors for each tweet. (Calculate *R • SV^T*). Take top *X* tweets. In Prod, *X = 200*
-
-5. Fetch *X* tweet SimClusters Embedding, Calculate Cosine Similarity between *X* tweets and *SV*, Return top *Y* above a certain threshold *Z*.
-
-Approximate Cosine Similarity is an approximate algorithm. Instead of fetching *M * N* tweets embedding, it only fetches *X* tweets embedding. In prod, *X / M * N * 100% = 6%*. Based on the metrics during TWISTLY development, most of the response time is consumed by I/O operation. The Approximate Cosine Similarity is a good approach to save a large amount of response time.
-
-The approximate algorithm is based on the assumption that the higher the dot product between the source tweets’ SimCluster embedding and a candidate tweet’s limited SimCluster embedding, the more likely it is that the two tweets are relevant. The additional Cosine Similarity filter guarantees that the results are not affected by popularity bias.
-
-Adjusting the M, N, X, Y, Z is able to balance the precision and recall for different products. The implementation of approximate cosine similarity is used by TWISTLY, Interest-based tweet recommendation, Similar Tweet in RUX, and Author based recommendation. This algorithm is also suitable for future user or entity recommendation based on SimClusters Embedding.
-
-
-# -------------------------------
-# Build and Test
-# -------------------------------
-Compile the service
-
- $ ./bazel build simclusters-ann/server:bin
-
-Unit tests
-
- $ ./bazel test simclusters-ann/server:bin
-
-# -------------------------------
-# Deploy
-# -------------------------------
-
-## Prerequisite for devel deployments
-First of all, you need to generate Service to Service certificates for use while developing locally. This only needs to be done ONCE:
-
-To add cert files to Aurora (if you want to deploy to DEVEL):
-```
-$ developer-cert-util --env devel --job simclusters-ann
-```
-
-## Deploying to devel/staging from a local build
-Reference -
-
- $ ./simclusters-ann/bin/deploy.sh --help
-
-Use the script to build the service in your local branch, upload it to packer and deploy in devel aurora:
-
- $ ./simclusters-ann/bin/deploy.sh atla $USER devel simclusters-ann
-
-You can also deploy to staging with this script. E.g. to deploy to instance 1:
-
- $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann staging simclusters-ann
-
-## Deploying to production
-
-Production deploys should be managed by Workflows.
-_Do not_ deploy to production unless it is an emergency and you have approval from oncall.
-
-##### It is not recommended to deploy from the command line into production environments, unless 1) you're testing a small change in Canary shard [0,9], or 2) it is an absolute emergency. Be sure to make oncalls aware of the changes you're deploying.
-
- $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann prod simclusters-ann
-In the case of multiple instances,
-
- $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann prod simclusters-ann -
-
-## Checking Deployed Version and Rolling Back
-
-Wherever possible, roll back using Workflows by finding an earlier good version and clicking the "rollback" button in the UI. This is the safest and least error-prone method.
diff --git a/simclusters-ann/server/BUILD b/simclusters-ann/server/BUILD
deleted file mode 100644
index 9a62359c3..000000000
--- a/simclusters-ann/server/BUILD
+++ /dev/null
@@ -1,23 +0,0 @@
-jvm_binary(
- name = "bin",
- basename = "simclusters-ann",
- main = "com.twitter.simclustersann.SimClustersAnnServerMain",
- runtime_platform = "java11",
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle/finagle-zipkin-scribe/src/main/scala",
- "finatra/inject/inject-logback/src/main/scala",
- "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann",
- "twitter-server-internal/src/main/scala",
- "twitter-server/logback-classic/src/main/scala",
- ],
-)
-
-# Aurora Workflows build phase convention requires a jvm_app named with ${project-name}-app
-jvm_app(
- name = "simclusters-ann-app",
- archive = "zip",
- binary = ":bin",
- tags = ["bazel-compatible"],
-)
diff --git a/simclusters-ann/server/BUILD.docx b/simclusters-ann/server/BUILD.docx
new file mode 100644
index 000000000..32d1c77d1
Binary files /dev/null and b/simclusters-ann/server/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/resources/BUILD b/simclusters-ann/server/src/main/resources/BUILD
deleted file mode 100644
index b3a752276..000000000
--- a/simclusters-ann/server/src/main/resources/BUILD
+++ /dev/null
@@ -1,7 +0,0 @@
-resources(
- sources = [
- "*.xml",
- "config/*.yml",
- ],
- tags = ["bazel-compatible"],
-)
diff --git a/simclusters-ann/server/src/main/resources/BUILD.docx b/simclusters-ann/server/src/main/resources/BUILD.docx
new file mode 100644
index 000000000..5bff15e03
Binary files /dev/null and b/simclusters-ann/server/src/main/resources/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/resources/config/decider.docx b/simclusters-ann/server/src/main/resources/config/decider.docx
new file mode 100644
index 000000000..5d9d7f8c6
Binary files /dev/null and b/simclusters-ann/server/src/main/resources/config/decider.docx differ
diff --git a/simclusters-ann/server/src/main/resources/config/decider.yml b/simclusters-ann/server/src/main/resources/config/decider.yml
deleted file mode 100644
index 80469028a..000000000
--- a/simclusters-ann/server/src/main/resources/config/decider.yml
+++ /dev/null
@@ -1,95 +0,0 @@
-# SimClusters embedding store enable / disable decider values
-
-# ---------- Dark Traffic Proxy ----------
-dark_traffic_filter:
- comment: Proportion of the requests that are forwarded as dark traffic to the proxy
- default_availability: 0
-
-# Tweet embeddings
-enable_LogFavBasedTweet_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavLongestL2EmbeddingTweet_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-# Entity embeddings
-enable_FavTfgTopic_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-
-enable_LogFavBasedKgoApeTopic_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-# KnownFor embeddings
-enable_FavBasedProducer_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_FollowBasedProducer_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_RelaxedAggregatableLogFavBasedProducer_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-# InterestedIn embeddings
-enable_LogFavBasedUserInterestedInFromAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_FollowBasedUserInterestedInFromAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_FavBasedUserInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_FollowBasedUserInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_FilteredUserInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_UnfilteredUserInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedAverageAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
-
-enable_UserNextInterestedIn_Model20m145k2020:
- comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests."
- default_availability: 10000
diff --git a/simclusters-ann/server/src/main/resources/logback.docx b/simclusters-ann/server/src/main/resources/logback.docx
new file mode 100644
index 000000000..90e89f629
Binary files /dev/null and b/simclusters-ann/server/src/main/resources/logback.docx differ
diff --git a/simclusters-ann/server/src/main/resources/logback.xml b/simclusters-ann/server/src/main/resources/logback.xml
deleted file mode 100644
index 0bb0d6646..000000000
--- a/simclusters-ann/server/src/main/resources/logback.xml
+++ /dev/null
@@ -1,167 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- true
-
-
-
-
-
-
-
-
-
-
- ${log.service.output}
-
-
- ${log.service.output}.%d.gz
-
- 3GB
-
- 21
- true
-
-
- %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n
-
-
-
-
-
- ${log.access.output}
-
-
- ${log.access.output}.%d.gz
-
- 100MB
-
- 7
- true
-
-
- ${DEFAULT_ACCESS_PATTERN}%n
-
-
-
-
-
-
-
-
-
-
-
- allow_listed_pipeline_executions.log
-
-
- allow_listed_pipeline_executions.log.%d.gz
-
- 100MB
-
- 7
- true
-
-
- %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD
deleted file mode 100644
index 00aefb800..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD
+++ /dev/null
@@ -1,31 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/inject:guice",
- "3rdparty/jvm/javax/inject:javax.inject",
- "3rdparty/jvm/net/codingwell:scala-guice",
- "finagle/finagle-core/src/main",
- "finagle/finagle-http/src/main/scala",
- "finagle/finagle-thriftmux/src/main/scala",
- "finatra-internal/decider/src/main/scala",
- "finatra-internal/mtls-thriftmux/src/main/scala",
- "finatra/inject/inject-app/src/main/scala",
- "finatra/inject/inject-core/src/main/scala",
- "finatra/inject/inject-server/src/main/scala",
- "finatra/inject/inject-thrift-client/src/main/scala",
- "finatra/inject/inject-utils/src/main/scala",
- "finatra/utils/src/main/java/com/twitter/finatra/annotations",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/exceptions",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/filters",
- "simclusters-ann/server/src/main/resources",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules",
- "simclusters-ann/thrift/src/main/thrift:thrift-scala",
- "src/thrift/com/twitter/search:earlybird-scala",
- "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/view",
- "twitter-server/server/src/main/scala",
- "util/util-app/src/main/scala",
- "util/util-core:scala",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx
new file mode 100644
index 000000000..0fa5c117d
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx
new file mode 100644
index 000000000..e3b5d0b93
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala
deleted file mode 100644
index 6168a871c..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-package com.twitter.simclustersann
-
-import com.google.inject.Module
-import com.twitter.finatra.decider.modules.DeciderModule
-import com.twitter.finatra.mtls.thriftmux.Mtls
-import com.twitter.finatra.thrift.ThriftServer
-import com.twitter.finatra.thrift.filters._
-import com.twitter.finatra.thrift.routing.ThriftRouter
-import com.twitter.inject.thrift.modules.ThriftClientIdModule
-import com.twitter.relevance_platform.common.exceptions._
-import com.twitter.simclustersann.controllers.SimClustersANNController
-import com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantExceptionMapper
-import com.twitter.simclustersann.modules._
-import com.twitter.simclustersann.thriftscala.SimClustersANNService
-import com.twitter.finagle.Filter
-import com.twitter.finatra.annotations.DarkTrafficFilterType
-import com.twitter.inject.annotations.Flags
-import com.twitter.relevance_platform.common.filters.DarkTrafficFilterModule
-import com.twitter.relevance_platform.common.filters.ClientStatsFilter
-import com.twitter.simclustersann.common.FlagNames.DisableWarmup
-
-object SimClustersAnnServerMain extends SimClustersAnnServer
-// Thrift server for SimClusters ANN: declares its modules, Thrift filters, exception mappers and controller.
-class SimClustersAnnServer extends ThriftServer with Mtls {
- flag( // registers the DisableWarmup flag that warmup() reads
- name = DisableWarmup,
- default = false,
- help = "If true, no warmup will be run."
- )
-
- override val name = "simclusters-ann-server"
- // Guice modules installed into this server.
- override val modules: Seq[Module] = Seq(
- CacheModule,
- ServiceNameMapperModule,
- ClusterConfigMapperModule,
- ClusterConfigModule,
- ClusterTweetIndexProviderModule,
- DeciderModule,
- EmbeddingStoreModule,
- FlagsModule,
- FuturePoolProvider,
- RateLimiterModule,
- SimClustersANNCandidateSourceModule,
- StratoClientProviderModule,
- ThriftClientIdModule,
- new CustomMtlsThriftWebFormsModule[SimClustersANNService.MethodPerEndpoint](this),
- new DarkTrafficFilterModule[SimClustersANNService.ReqRepServicePerEndpoint]()
- )
- // Registers the filters, the exception mappers and SimClustersANNController on the router, in the order listed.
- def configureThrift(router: ThriftRouter): Unit = {
- router
- .filter[LoggingMDCFilter]
- .filter[TraceIdMDCFilter]
- .filter[ThriftMDCFilter]
- .filter[ClientStatsFilter]
- .filter[ExceptionMappingFilter]
- .filter[Filter.TypeAgnostic, DarkTrafficFilterType]
- .exceptionMapper[InvalidRequestForSimClustersAnnVariantExceptionMapper]
- .exceptionMapper[DeadlineExceededExceptionMapper]
- .exceptionMapper[UnhandledExceptionMapper]
- .add[SimClustersANNController]
- }
- // Runs SimclustersAnnWarmupHandler unless the DisableWarmup flag is true.
- override protected def warmup(): Unit = {
- if (!injector.instance[Boolean](Flags.named(DisableWarmup))) {
- handle[SimclustersAnnWarmupHandler]()
- }
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx
new file mode 100644
index 000000000..c57be5136
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala
deleted file mode 100644
index ca1078b75..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-package com.twitter.simclustersann
-
-import com.twitter.inject.Logging
-import com.twitter.inject.utils.Handler
-import javax.inject.Inject
-import scala.util.control.NonFatal
-import com.google.common.util.concurrent.RateLimiter
-import com.twitter.conversions.DurationOps.richDurationFromInt
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.storehaus.ReadableStore
-import com.twitter.util.Await
-import com.twitter.util.ExecutorServiceFuturePool
-import com.twitter.util.Future
-
-/**
- * Warmup handler that reads every cluster id in `[1, SimclustersNumber)` from
- * `clusterTweetCandidatesStore`, one rate-limited task per id on `futurePool`.
- *
- * `handle()` blocks until all reads finish or the overall 10 minute timeout elapses, and
- * always shuts the pool's executor down afterwards. Failures never propagate to the caller:
- * they are counted and/or logged.
- *
- * Counters (under `<class name>/fetchFromCache`):
- *  - `clusters`: tasks submitted
- *  - `keys`: tasks that ran to completion (successfully or not)
- *  - `success` / `failures`: outcome of the individual store reads
- *
- * @param clusterTweetCandidatesStore store whose entries are read during warmup
- * @param futurePool pool the blocking reads run on; its executor is shut down when warmup ends
- * @param rateLimiter one permit is acquired before every read
- * @param statsReceiver receiver the counters above are registered on
- */
-class SimclustersAnnWarmupHandler @Inject() (
-  clusterTweetCandidatesStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]],
-  futurePool: ExecutorServiceFuturePool,
-  rateLimiter: RateLimiter,
-  statsReceiver: StatsReceiver)
-    extends Handler
-    with Logging {
-
-  private val stats = statsReceiver.scope(this.getClass.getName)
-
-  private val scopedStats = stats.scope("fetchFromCache")
-  private val clusters = scopedStats.counter("clusters")
-  private val fetchedKeys = scopedStats.counter("keys")
-  private val failures = scopedStats.counter("failures")
-  private val success = scopedStats.counter("success")
-
-  // Exclusive upper bound of the cluster ids read during warmup.
-  private val SimclustersNumber = 144428
-
-  /**
-   * Reads one cluster from the store, blocking the calling pool thread for at most 10 seconds.
-   *
-   * Both a failed read and a timed-out read are counted as `failures`. Previously the
-   * timeout thrown by `Await.result` escaped the `.handle` recovery, failed the pool task,
-   * made `Future.collect` fail fast and ended the blocking warmup while keys were still queued.
-   */
-  private def warmCluster(clusterId: ClusterId): Unit = {
-    try {
-      Await.result(
-        clusterTweetCandidatesStore
-          .get(clusterId)
-          .onSuccess { _ =>
-            success.incr()
-          }
-          .handle {
-            case NonFatal(_) =>
-              failures.incr()
-          },
-        timeout = 10.seconds
-      )
-      ()
-    } catch {
-      // Await timeout (or any other non-fatal error raised while waiting): count and move on.
-      // Fatal throwables such as InterruptedException are not matched and still propagate.
-      case NonFatal(_) => failures.incr()
-    }
-  }
-
-  override def handle(): Unit = {
-    try {
-      val clusterIds = List.range(1, SimclustersNumber)
-      val futures: Seq[Future[Unit]] = clusterIds
-        .map { clusterId =>
-          clusters.incr()
-          futurePool {
-            // Blocks this pool thread until the limiter grants a permit.
-            rateLimiter.acquire()
-            warmCluster(clusterId)
-            fetchedKeys.incr()
-          }
-        }
-
-      Await.result(Future.collect(futures), timeout = 10.minutes)
-
-    } catch {
-      case NonFatal(e) => error(e.getMessage, e)
-    } finally {
-      try {
-        futurePool.executor.shutdown()
-      } catch {
-        // Best-effort shutdown: a failure here must not fail warmup.
-        case NonFatal(_) =>
-      }
-      info("Warmup done.")
-    }
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx
new file mode 100644
index 000000000..f29a575d6
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala
deleted file mode 100644
index b5264f0bb..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala
+++ /dev/null
@@ -1,129 +0,0 @@
-package com.twitter.simclustersann.candidate_source
-
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclustersann.thriftscala.ScoringAlgorithm
-import com.twitter.simclustersann.thriftscala.SimClustersANNConfig
-import com.twitter.snowflake.id.SnowflakeId
-import com.twitter.util.Duration
-import com.twitter.util.Time
-import scala.collection.mutable
-
-/**
- * Scoring contract used to find tweets whose SimClusters representation is close to a
- * source embedding. Approximate cosine similarity is the core algorithm.
- *
- * Within the overall candidate fetch (see "fetchCandidates" in the candidate source):
- * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId
- * 2. Fetch top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index).
- * 3. Calculate all the tweet candidates' dot-product or approximate cosine similarity to source tweets.
- * 4. Take top M tweet candidates by the step 3's score
- * Steps 1 and 2 happen in the caller; `apply` covers steps 3 and 4.
- *
- * `apply` parameters:
- *  - sourceEmbedding: embedding the candidates are scored against
- *  - sourceEmbeddingId: id of that embedding
- *  - config: scoring configuration
- *  - candidateScoresStat: callback taking an Int
- *  - clusterTweetsMap: per-cluster (tweet id, score) sequences; None marks a cluster without data
- *  - clusterTweetsMapArray: array-typed variant of clusterTweetsMap, defaults to empty
- *    (NOTE(review): not read by the implementations in this package that were inspected - confirm)
- *
- * Returns (tweet id, score) pairs.
- */
-trait ApproximateCosineSimilarity {
- type ScoredTweet = (Long, Double)
- def apply(
- sourceEmbedding: SimClustersEmbedding,
- sourceEmbeddingId: SimClustersEmbeddingId,
- config: SimClustersANNConfig,
- candidateScoresStat: Int => Unit,
- clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]],
- clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty
- ): Seq[ScoredTweet]
-}
-
-/**
- * Baseline implementation of the [[ApproximateCosineSimilarity]] trait, accumulating scores in
- * two pre-sized Scala mutable hash maps (dot product and squared-score norm per tweet).
- */
-object ApproximateCosineSimilarity extends ApproximateCosineSimilarity {
-
-  // Initial size of the accumulation maps.
-  final val InitialCandidateMapSize = 16384
-  // Hard cap on the number of results, regardless of config.maxNumResults.
-  val MaxNumResultsUpperBound = 1000
-  // A configured max age (hours) at or above this value disables the max-age filter.
-  final val MaxTweetCandidateAgeUpperBound = 175200
-
-  private class HashMap[A, B](initSize: Int) extends mutable.HashMap[A, B] {
-    override def initialSize: Int = initSize // 16 - by default
-  }
-
-  /** Extracts the tweet id when the embedding id refers to a tweet, None otherwise. */
-  private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = {
-    embeddingId.internalId match {
-      case InternalId.TweetId(tweetId) =>
-        Some(tweetId)
-      case _ =>
-        None
-    }
-  }
-
-  /**
-   * Scores candidate tweets against `sourceEmbedding` and returns the best ones.
-   *
-   * For every cluster present in both `clusterTweetsMap` and `sourceEmbedding`, the first
-   * `config.maxTopTweetsPerCluster` tweets are considered. A tweet is kept when its id lies in
-   * the snowflake-id window derived from the configured min/max candidate age and it is not the
-   * source tweet itself. Scores are then normalised per `config.annAlgorithm`, filtered by
-   * `config.minScore`, sorted descending and truncated to
-   * `min(config.maxNumResults, MaxNumResultsUpperBound)`.
-   *
-   * @param candidateScoresStat invoked once with the number of distinct candidates accumulated
-   * @param clusterTweetsMapArray not read by this implementation
-   */
-  override def apply(
-    sourceEmbedding: SimClustersEmbedding,
-    sourceEmbeddingId: SimClustersEmbeddingId,
-    config: SimClustersANNConfig,
-    candidateScoresStat: Int => Unit,
-    clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty,
-    clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty
-  ): Seq[ScoredTweet] = {
-    val now = Time.now
-    val earliestTweetId =
-      if (config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound)
-        0L // Disable max tweet age filter
-      else
-        SnowflakeId.firstIdFor(now - Duration.fromHours(config.maxTweetCandidateAgeHours))
-    val latestTweetId =
-      SnowflakeId.firstIdFor(now - Duration.fromHours(config.minTweetCandidateAgeHours))
-
-    // Loop-invariant: resolve the source tweet id once instead of once per candidate.
-    val sourceTweetId: Option[TweetId] = parseTweetId(sourceEmbeddingId)
-
-    // Use Mutable map to optimize performance. The maps are local to this call.
-
-    // Set initial map size to around p75 of map size distribution to avoid too many copying
-    // from extending the size of the mutable hashmap
-    val candidateScoresMap =
-      new HashMap[TweetId, Double](InitialCandidateMapSize)
-    val candidateNormalizationMap =
-      new HashMap[TweetId, Double](InitialCandidateMapSize)
-
-    clusterTweetsMap.foreach {
-      case (clusterId, Some(tweetScores)) if sourceEmbedding.contains(clusterId) =>
-        val sourceClusterScore = sourceEmbedding.getOrElse(clusterId)
-
-        // Iterate instead of indexing: `tweetScores` is only known to be a Seq, and indexed
-        // access on a linear Seq would make this loop quadratic.
-        tweetScores.iterator.take(config.maxTopTweetsPerCluster).foreach {
-          case (tweetId, score) =>
-            if (!sourceTweetId.contains(tweetId) &&
-              tweetId >= earliestTweetId && tweetId <= latestTweetId) {
-              candidateScoresMap.put(
-                tweetId,
-                candidateScoresMap.getOrElse(tweetId, 0.0) + score * sourceClusterScore)
-              candidateNormalizationMap
-                .put(tweetId, candidateNormalizationMap.getOrElse(tweetId, 0.0) + score * score)
-            }
-        }
-      case _ => ()
-    }
-
-    candidateScoresStat(candidateScoresMap.size)
-
-    // Re-Rank the candidate by configuration
-    val processedCandidateScores: Seq[(TweetId, Double)] = candidateScoresMap.map {
-      case (candidateId, score) =>
-        // Enable Partial Normalization
-        val processedScore = {
-          // We applied the "log" version of partial normalization when we rank candidates
-          // by log cosine similarity
-          config.annAlgorithm match {
-            case ScoringAlgorithm.LogCosineSimilarity =>
-              score / sourceEmbedding.logNorm / math.log(1 + candidateNormalizationMap(candidateId))
-            case ScoringAlgorithm.CosineSimilarity =>
-              score / sourceEmbedding.l2norm / math.sqrt(candidateNormalizationMap(candidateId))
-            case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization =>
-              score / math.sqrt(candidateNormalizationMap(candidateId))
-            case ScoringAlgorithm.DotProduct => score
-          }
-        }
-        candidateId -> processedScore
-    }.toSeq
-
-    processedCandidateScores
-      .filter(_._2 >= config.minScore)
-      .sortBy(-_._2)
-      .take(Math.min(config.maxNumResults, MaxNumResultsUpperBound))
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD
deleted file mode 100644
index 21411b854..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD
+++ /dev/null
@@ -1,14 +0,0 @@
-# Scala library target for the candidate_source directory; no `name` or `sources` is set here.
-scala_library(
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/guava",
- "3rdparty/jvm/com/twitter/storehaus:core",
- "frigate/frigate-common:base",
- "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/base",
- "simclusters-ann/thrift/src/main/thrift:thrift-scala",
- "src/scala/com/twitter/simclusters_v2/common",
- "src/scala/com/twitter/simclusters_v2/summingbird/stores",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- "util/util-stats/src/main/scala/com/twitter/finagle/stats",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx
new file mode 100644
index 000000000..98ca7c8a3
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx
new file mode 100644
index 000000000..e5b329072
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala
deleted file mode 100644
index 7be2728f6..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala
+++ /dev/null
@@ -1,131 +0,0 @@
-package com.twitter.simclustersann.candidate_source
-
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclustersann.thriftscala.ScoringAlgorithm
-import com.twitter.simclustersann.thriftscala.SimClustersANNConfig
-import com.twitter.snowflake.id.SnowflakeId
-import com.twitter.util.Duration
-import com.twitter.util.Time
-import com.google.common.collect.Comparators
-import com.twitter.simclusters_v2.common.ClusterId
-
-/**
- * A modified version of OptimizedApproximateCosineSimilarity which uses more java streams to avoid
- * materializing intermediate collections. Its performance is still under investigation.
- *
- * Scores are accumulated in a single `java.util.HashMap` of mutable [[Scores]] holders and the
- * top results are selected with Guava's `Comparators.greatest` collector.
- */
-object ExperimentalApproximateCosineSimilarity extends ApproximateCosineSimilarity {
-
-  // Initial capacity of the accumulation map.
-  final val InitialCandidateMapSize = 16384
-  // Hard cap on the number of results, regardless of config.maxNumResults.
-  val MaxNumResultsUpperBound = 1000
-  // A configured max age (hours) at or above this value disables the max-age filter.
-  final val MaxTweetCandidateAgeUpperBound = 175200
-
-  /** Extracts the tweet id when the embedding id refers to a tweet, None otherwise. */
-  private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = {
-    embeddingId.internalId match {
-      case InternalId.TweetId(tweetId) =>
-        Some(tweetId)
-      case _ =>
-        None
-    }
-  }
-
-  // Orders (tweet id, score) pairs by ascending score.
-  private val CompareByScore: java.util.Comparator[(Long, Double)] =
-    new java.util.Comparator[(Long, Double)] {
-      override def compare(o1: (Long, Double), o2: (Long, Double)): Int = {
-        java.lang.Double.compare(o1._2, o2._2)
-      }
-    }
-
-  /** Mutable accumulator: running dot product (`score`) and sum of squared scores (`norm`). */
-  class Scores(var score: Double, var norm: Double)
-
-  /**
-   * Scores candidate tweets against `sourceEmbedding` and returns the highest scoring ones.
-   *
-   * Only clusters contained in `sourceEmbedding` contribute, matching the other
-   * implementations of this trait. The number of results is
-   * `min(config.maxNumResults, MaxNumResultsUpperBound)`, clamped to zero so that a negative
-   * configured value yields an empty result instead of an IllegalArgumentException from
-   * `Comparators.greatest`.
-   *
-   * @param candidateScoresStat invoked once with the number of distinct candidates accumulated
-   * @param clusterTweetsMapArray not read by this implementation
-   */
-  override def apply(
-    sourceEmbedding: SimClustersEmbedding,
-    sourceEmbeddingId: SimClustersEmbeddingId,
-    config: SimClustersANNConfig,
-    candidateScoresStat: Int => Unit,
-    clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty,
-    clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty
-  ): Seq[ScoredTweet] = {
-    val now = Time.now
-    val earliestTweetId =
-      if (config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound)
-        0L // Disable max tweet age filter
-      else
-        SnowflakeId.firstIdFor(now - Duration.fromHours(config.maxTweetCandidateAgeHours))
-    val latestTweetId =
-      SnowflakeId.firstIdFor(now - Duration.fromHours(config.minTweetCandidateAgeHours))
-
-    val candidateScoresMap = new java.util.HashMap[Long, Scores](InitialCandidateMapSize)
-    // 0L stands in for "no source tweet" when the embedding id is not a tweet id.
-    val sourceTweetId = parseTweetId(sourceEmbeddingId).getOrElse(0L)
-
-    clusterTweetsMap.foreach {
-      case (clusterId, Some(tweetScores)) if sourceEmbedding.contains(clusterId) =>
-        val sourceClusterScore = sourceEmbedding.getOrElse(clusterId)
-
-        // Iterate instead of indexing: indexed access on a linear Seq would be quadratic.
-        tweetScores.iterator.take(config.maxTopTweetsPerCluster).foreach {
-          case (tweetId, score) =>
-            if (tweetId >= earliestTweetId &&
-              tweetId <= latestTweetId &&
-              tweetId != sourceTweetId) {
-
-              val scores = candidateScoresMap.get(tweetId)
-              if (scores == null) {
-                val scorePair = new Scores(
-                  score = score * sourceClusterScore,
-                  norm = score * score
-                )
-                candidateScoresMap.put(tweetId, scorePair)
-              } else {
-                scores.score = scores.score + (score * sourceClusterScore)
-                scores.norm = scores.norm + (score * score)
-              }
-            }
-        }
-      case _ => ()
-    }
-
-    candidateScoresStat(candidateScoresMap.size)
-
-    // Normalisation is chosen once per call, not once per candidate.
-    val normFn: (Long, Scores) => (Long, Double) = config.annAlgorithm match {
-      case ScoringAlgorithm.LogCosineSimilarity =>
-        (candidateId: Long, score: Scores) =>
-          (
-            candidateId,
-            score.score / sourceEmbedding.logNorm / math.log(1 + score.norm)
-          )
-      case ScoringAlgorithm.CosineSimilarity =>
-        (candidateId: Long, score: Scores) =>
-          (
-            candidateId,
-            score.score / sourceEmbedding.l2norm / math.sqrt(score.norm)
-          )
-      case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization =>
-        (candidateId: Long, score: Scores) =>
-          (
-            candidateId,
-            score.score / math.sqrt(score.norm)
-          )
-      case ScoringAlgorithm.DotProduct =>
-        (candidateId: Long, score: Scores) =>
-          (
-            candidateId,
-            score.score
-          )
-    }
-
-    import scala.collection.JavaConverters._
-
-    // Comparators.greatest rejects a negative k, so clamp at zero.
-    val maxResults = Math.max(0, Math.min(config.maxNumResults, MaxNumResultsUpperBound))
-    val topKCollector = Comparators.greatest(maxResults, CompareByScore)
-
-    candidateScoresMap
-      .entrySet().stream()
-      .map[(Long, Double)]((e: java.util.Map.Entry[Long, Scores]) => normFn(e.getKey, e.getValue))
-      .filter((s: (Long, Double)) => s._2 >= config.minScore)
-      .collect(topKCollector)
-      .asScala
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx
new file mode 100644
index 000000000..41dfc75d1
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala
deleted file mode 100644
index db2e7613e..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala
+++ /dev/null
@@ -1,112 +0,0 @@
-package com.twitter.simclustersann.candidate_source
-
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclustersann.thriftscala.ScoringAlgorithm
-import com.twitter.simclustersann.thriftscala.SimClustersANNConfig
-import com.twitter.snowflake.id.SnowflakeId
-import com.twitter.util.Duration
-import com.twitter.util.Time
-
-/**
- * Compared with ApproximateCosineSimilarity, this implementation:
- * - moves some computation around to reduce allocations
- * - uses a single hashmap to store both scores and normalization coefficients
- * - uses some java collections in place of scala ones
- * Testing is still in progress, but this implementation shows significant (> 2x) improvements in
- * CPU utilization and allocations with 800 tweets per cluster.
- */
-object OptimizedApproximateCosineSimilarity extends ApproximateCosineSimilarity {
-
-  final val InitialCandidateMapSize = 16384
-  val MaxNumResultsUpperBound = 1000
-  final val MaxTweetCandidateAgeUpperBound = 175200
-
-  /** Tweet id carried by the embedding id, if it refers to a tweet. */
-  private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] =
-    embeddingId.internalId match {
-      case InternalId.TweetId(id) => Some(id)
-      case _ => None
-    }
-
-  /**
-   * Accumulates, per candidate tweet, the dot product with the source embedding and the sum of
-   * squared scores, normalises according to `config.annAlgorithm`, drops results below
-   * `config.minScore` and returns the highest scoring
-   * `min(config.maxNumResults, MaxNumResultsUpperBound)` in descending score order.
-   */
-  override def apply(
-    sourceEmbedding: SimClustersEmbedding,
-    sourceEmbeddingId: SimClustersEmbeddingId,
-    config: SimClustersANNConfig,
-    candidateScoresStat: Int => Unit,
-    clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty,
-    clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty
-  ): Seq[ScoredTweet] = {
-    val referenceTime = Time.now
-    val maxAgeFilterDisabled =
-      config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound
-    val earliestTweetId =
-      if (maxAgeFilterDisabled) 0L
-      else
-        SnowflakeId.firstIdFor(
-          referenceTime - Duration.fromHours(config.maxTweetCandidateAgeHours))
-    val latestTweetId =
-      SnowflakeId.firstIdFor(referenceTime - Duration.fromHours(config.minTweetCandidateAgeHours))
-
-    // tweet id -> (running dot product, running sum of squared scores)
-    val candidateScoresMap = new java.util.HashMap[Long, (Double, Double)](InitialCandidateMapSize)
-
-    val sourceTweetId = parseTweetId(sourceEmbeddingId).getOrElse(0L)
-
-    clusterTweetsMap.foreach {
-      case (clusterId, Some(tweetScores)) if sourceEmbedding.contains(clusterId) =>
-        val clusterWeight = sourceEmbedding.getOrElse(clusterId)
-        val limit = Math.min(tweetScores.size, config.maxTopTweetsPerCluster)
-
-        var idx = 0
-        while (idx < limit) {
-          val (tweetId, tweetScore) = tweetScores(idx)
-          val withinAgeWindow = earliestTweetId <= tweetId && tweetId <= latestTweetId
-
-          if (withinAgeWindow && tweetId != sourceTweetId) {
-            val (dotSoFar, normSoFar) = candidateScoresMap.getOrDefault(tweetId, (0.0, 0.0))
-            candidateScoresMap.put(
-              tweetId,
-              (dotSoFar + tweetScore * clusterWeight, normSoFar + tweetScore * tweetScore))
-          }
-          idx += 1
-        }
-      case _ => ()
-    }
-
-    candidateScoresStat(candidateScoresMap.size)
-
-    // Picked once per call: maps an accumulated (dot, norm) pair to the final score.
-    val normalize: ((Double, Double)) => Double = config.annAlgorithm match {
-      case ScoringAlgorithm.LogCosineSimilarity => {
-        case (dot, norm) => dot / sourceEmbedding.logNorm / math.log(1 + norm)
-      }
-      case ScoringAlgorithm.CosineSimilarity => {
-        case (dot, norm) => dot / sourceEmbedding.l2norm / math.sqrt(norm)
-      }
-      case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization => {
-        case (dot, norm) => dot / math.sqrt(norm)
-      }
-      case ScoringAlgorithm.DotProduct => {
-        case (dot, _) => dot
-      }
-    }
-
-    import scala.collection.JavaConverters._
-
-    val scoredTweets: java.util.ArrayList[(Long, Double)] =
-      new java.util.ArrayList(candidateScoresMap.size)
-
-    candidateScoresMap.entrySet().asScala.foreach { entry =>
-      val finalScore = normalize(entry.getValue)
-      if (finalScore >= config.minScore) {
-        val scored: (Long, Double) = (entry.getKey, finalScore)
-        scoredTweets.add(scored)
-      }
-    }
-
-    val resultLimit = Math.min(config.maxNumResults, MaxNumResultsUpperBound)
-    scoredTweets.asScala
-      .sortBy(-_._2)
-      .take(resultLimit)
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx
new file mode 100644
index 000000000..92cfb4438
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala
deleted file mode 100644
index bb12a54f1..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala
+++ /dev/null
@@ -1,102 +0,0 @@
-package com.twitter.simclustersann.candidate_source
-
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.frigate.common.base.Stats
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.simclustersann.thriftscala.SimClustersANNConfig
-import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate
-import com.twitter.storehaus.ReadableStore
-import com.twitter.util.Future
-
-/**
- * Finds tweets whose SimClusters representation is close to a source SimClustersEmbeddingId.
- *
- * Approximate cosine similarity is the core algorithm behind this source.
- *
- * `get` performs step 1 and delegates steps 2 - 4 to "fetchCandidates":
- * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId
- * 2. Fetch top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index).
- * 3. Calculate all the tweet candidates' dot-product or approximate cosine similarity to source tweets.
- * 4. Take top M tweet candidates by the step 3's score
- */
-case class SimClustersANNCandidateSource(
-  approximateCosineSimilarity: ApproximateCosineSimilarity,
-  clusterTweetCandidatesStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]],
-  simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding],
-  statsReceiver: StatsReceiver) {
-  private val stats = statsReceiver.scope(this.getClass.getName)
-  private val fetchSourceEmbeddingStat = stats.scope("fetchSourceEmbedding")
-  private val fetchCandidatesStat = stats.scope("fetchCandidates")
-  private val candidateScoresStat = stats.stat("candidateScoresMap")
-
-  /**
-   * Looks up the source embedding and, when it exists, returns the scored candidates.
-   *
-   * Yields None when the embedding store has no entry for the query's embedding id. In both
-   * cases the number of candidates (0 when there is no embedding) is recorded on a stat named
-   * after the embedding type and model version.
-   */
-  def get(
-    query: SimClustersANNCandidateSource.Query
-  ): Future[Option[Seq[SimClustersANNTweetCandidate]]] = {
-    val embeddingId = query.sourceEmbeddingId
-
-    // Resolved at each use, exactly where the stat is recorded.
-    def candidateCountStat =
-      fetchCandidatesStat.stat(embeddingId.embeddingType.name, embeddingId.modelVersion.name)
-
-    val embeddingLookup = Stats.track(fetchSourceEmbeddingStat) {
-      simClustersEmbeddingStore.get(embeddingId)
-    }
-
-    embeddingLookup.flatMap {
-      case Some(sourceEmbedding) =>
-        Stats
-          .trackSeq(fetchCandidatesStat) {
-            fetchCandidates(embeddingId, sourceEmbedding, query.config)
-          }.map { candidates =>
-            candidateCountStat.add(candidates.size)
-            Some(candidates)
-          }
-      case None =>
-        candidateCountStat.add(0)
-        Future.None
-    }
-  }
-
-  /**
-   * Fetches the top tweets of the source embedding's first `config.maxScanClusters` clusters
-   * and scores them with `approximateCosineSimilarity`.
-   */
-  private def fetchCandidates(
-    sourceEmbeddingId: SimClustersEmbeddingId,
-    sourceEmbedding: SimClustersEmbedding,
-    config: SimClustersANNConfig
-  ): Future[Seq[SimClustersANNTweetCandidate]] = {
-
-    val clusterIds =
-      sourceEmbedding
-        .truncate(config.maxScanClusters).getClusterIds()
-        .toSet
-
-    val clusterTweetsFuture = Future.collect(clusterTweetCandidatesStore.multiGet(clusterIds))
-
-    clusterTweetsFuture.map { clusterTweetsMap =>
-      val scoredTweets = approximateCosineSimilarity(
-        sourceEmbedding = sourceEmbedding,
-        sourceEmbeddingId = sourceEmbeddingId,
-        config = config,
-        candidateScoresStat = (i: Int) => candidateScoresStat.add(i),
-        clusterTweetsMap = clusterTweetsMap
-      )
-      scoredTweets.map {
-        case (tweetId, score) =>
-          SimClustersANNTweetCandidate(tweetId = tweetId, score = score)
-      }
-    }
-  }
-}
-
-object SimClustersANNCandidateSource {
- // Query accepted by SimClustersANNCandidateSource.get: the id of the source embedding to
- // look up and the ANN configuration to score candidates with.
- case class Query(
- sourceEmbeddingId: SimClustersEmbeddingId,
- config: SimClustersANNConfig)
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD
deleted file mode 100644
index 75d63312d..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD
+++ /dev/null
@@ -1,5 +0,0 @@
-# Scala library target for the common directory; it declares no dependencies.
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- tags = ["bazel-compatible"],
- dependencies = [],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx
new file mode 100644
index 000000000..80f91312e
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx
new file mode 100644
index 000000000..7db6c0a70
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala
deleted file mode 100644
index ae2c36177..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala
+++ /dev/null
@@ -1,31 +0,0 @@
-package com.twitter.simclustersann.common
-
-// Names of the command-line flags referenced by the SimClusters ANN server code.
-// Only the flag names are defined here; types and defaults are declared elsewhere.
-object FlagNames {
-
- /**
- * Global Settings
- */
- // Injected into SimClustersANNController as an Int and applied there as milliseconds.
- final val ServiceTimeout = "service.timeout"
- final val DarkTrafficFilterDeciderKey = "thrift.dark.traffic.filter.decider_key"
-
- /**
- * Cache Setting
- */
- final val CacheDest = "cache_module.dest"
- final val CacheTimeout = "cache_module.timeout"
- // Only turn on the async update when the SANN Cluster has the production traffic.
- final val CacheAsyncUpdate = "cache_module.async_update"
-
- /**
- * Warmup Settings
- */
- // Read as a Boolean by the server's warmup(); when true the warmup handler is skipped.
- final val DisableWarmup = "warmup.disable"
- final val NumberOfThreads = "warmup.thread_number"
- final val RateLimiterQPS = "warmup.rate_limiter_qps"
-
- /**
- * Algorithm Parameters
- */
- final val MaxTopTweetPerCluster = "sim_clusters.ann.max_top_tweets_per_cluster"
-
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD
deleted file mode 100644
index 69ccce158..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD
+++ /dev/null
@@ -1,29 +0,0 @@
-# Scala library target for the controllers directory; compiled with the "fatal_warnings"
-# compiler option set.
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/inject:guice",
- "3rdparty/jvm/javax/inject:javax.inject",
- "3rdparty/jvm/net/codingwell:scala-guice",
- "decider/src/main/scala",
- "finagle/finagle-core/src/main",
- "finatra/inject/inject-core/src/main/scala",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift:controller",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/exceptions",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/filters",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/modules",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/response",
- "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/routing",
- "representation-manager/server/src/main/scala/com/twitter/representation_manager/migration",
- "scrooge/scrooge-core/src/main/scala",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters",
- "simclusters-ann/thrift/src/main/thrift:thrift-scala",
- "src/scala/com/twitter/simclusters_v2/candidate_source",
- "twitter-server/server/src/main/scala",
- "util/util-core:scala",
- "util/util-slf4j-api/src/main/scala",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx
new file mode 100644
index 000000000..af1649ad3
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx
new file mode 100644
index 000000000..766c26227
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala
deleted file mode 100644
index 459972b32..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala
+++ /dev/null
@@ -1,80 +0,0 @@
-package com.twitter.simclustersann.controllers
-
-import com.twitter.conversions.DurationOps._
-import com.twitter.finatra.thrift.Controller
-import com.twitter.simclustersann.thriftscala.SimClustersANNService.GetTweetCandidates
-import com.twitter.simclustersann.thriftscala.SimClustersANNService
-import com.twitter.simclustersann.thriftscala.Query
-import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate
-import com.twitter.scrooge.Request
-import com.twitter.scrooge.Response
-import javax.inject.Inject
-import com.twitter.finagle.Service
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.inject.annotations.Flag
-import com.twitter.simclustersann.candidate_source.{
- SimClustersANNCandidateSource => SANNSimClustersANNCandidateSource
-}
-import com.twitter.simclustersann.common.FlagNames
-import com.twitter.simclustersann.filters.GetTweetCandidatesResponseStatsFilter
-import com.twitter.simclustersann.filters.SimClustersAnnVariantFilter
-import com.twitter.util.Future
-import com.twitter.util.JavaTimer
-import com.twitter.util.Timer
-
-/**
- * Thrift controller serving `GetTweetCandidates`.
- *
- * Requests pass through `variantFilter` and `getTweetCandidatesResponseStatsFilter` before
- * reaching `handler`, which queries the candidate source. Any failure of that query, including
- * exceeding `serviceTimeout` milliseconds, is counted under "failures" by exception class name
- * and answered with the empty default response.
- */
-class SimClustersANNController @Inject() (
-  @Flag(FlagNames.ServiceTimeout) serviceTimeout: Int,
-  variantFilter: SimClustersAnnVariantFilter,
-  getTweetCandidatesResponseStatsFilter: GetTweetCandidatesResponseStatsFilter,
-  sannCandidateSource: SANNSimClustersANNCandidateSource,
-  globalStats: StatsReceiver)
-    extends Controller(SimClustersANNService) {
-
-  import SimClustersANNController._
-
-  private val stats: StatsReceiver = globalStats.scope(this.getClass.getCanonicalName)
-  private val timer: Timer = new JavaTimer(true)
-
-  // variant filter -> response stats filter -> handler
-  val filteredService: Service[Request[GetTweetCandidates.Args], Response[
-    Seq[SimClustersANNTweetCandidate]
-  ]] =
-    variantFilter
-      .andThen(getTweetCandidatesResponseStatsFilter)
-      .andThen(Service.mk(handler))
-
-  handle(GetTweetCandidates).withService(filteredService)
-
-  /** Converts the Thrift query, fetches candidates and wraps them in a Response. */
-  private def handler(
-    request: Request[GetTweetCandidates.Args]
-  ): Future[Response[Seq[SimClustersANNTweetCandidate]]] = {
-    val thriftQuery: Query = request.args.query
-    val sourceQuery = SANNSimClustersANNCandidateSource.Query(
-      sourceEmbeddingId = thriftQuery.sourceEmbeddingId,
-      config = thriftQuery.config
-    )
-
-    val response = sannCandidateSource.get(sourceQuery).map { maybeCandidates =>
-      maybeCandidates
-        .map { candidates =>
-          Response(candidates.map { candidate =>
-            SimClustersANNTweetCandidate(
-              tweetId = candidate.tweetId,
-              score = candidate.score
-            )
-          })
-        }
-        .getOrElse(DefaultResponse)
-    }
-
-    val bounded = response.raiseWithin(serviceTimeout.milliseconds)(timer)
-    bounded.rescue {
-      case e: Throwable =>
-        stats.scope("failures").counter(e.getClass.getCanonicalName).incr()
-        Future.value(DefaultResponse)
-    }
-  }
-}
-
-object SimClustersANNController {
- // Empty candidate list, returned when the candidate source yields None or the request fails.
- val DefaultResponse: Response[Seq[SimClustersANNTweetCandidate]] = Response(Seq.empty)
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD
deleted file mode 100644
index c557c50ac..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD
+++ /dev/null
@@ -1,12 +0,0 @@
-scala_library(
- sources = ["*.scala"],
- compiler_option_sets = ["fatal_warnings"],
- strict_deps = True,
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle/finagle-core/src/main",
- "finatra-internal/mtls-thriftmux/src/main/scala",
- "finatra-internal/thrift/src/main/thrift:thrift-scala",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx
new file mode 100644
index 000000000..7cbb7d855
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx
new file mode 100644
index 000000000..2ab5a22a2
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala
deleted file mode 100644
index c9b046253..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala
+++ /dev/null
@@ -1,16 +0,0 @@
-package com.twitter.simclustersann.exceptions
-
-import com.twitter.finagle.RequestException
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-
-case class InvalidRequestForSimClustersAnnVariantException(
- modelVersion: ModelVersion,
- embeddingType: EmbeddingType,
- actualServiceName: String,
- expectedServiceName: Option[String])
- extends RequestException(
- s"Request with model version ($modelVersion) and embedding type ($embeddingType) cannot be " +
- s"processed by service variant ($actualServiceName)." +
- s" Expected service variant: $expectedServiceName.",
- null)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx
new file mode 100644
index 000000000..273f1f983
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala
deleted file mode 100644
index fecca048e..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.twitter.simclustersann.exceptions
-
-import com.twitter.finatra.thrift.exceptions.ExceptionMapper
-import com.twitter.finatra.thrift.thriftscala.ClientError
-import com.twitter.finatra.thrift.thriftscala.ClientErrorCause
-import com.twitter.util.Future
-import com.twitter.util.logging.Logging
-import javax.inject.Singleton
-
-/**
- * An exception mapper designed to handle
- * [[com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantException]]
- * by returning a Thrift IDL defined Client Error.
- */
-@Singleton
-class InvalidRequestForSimClustersAnnVariantExceptionMapper
- extends ExceptionMapper[InvalidRequestForSimClustersAnnVariantException, Nothing]
- with Logging {
-
- override def handleException(
- throwable: InvalidRequestForSimClustersAnnVariantException
- ): Future[Nothing] = {
- error("Invalid Request For SimClusters Ann Variant Exception", throwable)
-
- Future.exception(ClientError(ClientErrorCause.BadRequest, throwable.getMessage()))
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx
new file mode 100644
index 000000000..4ff9bf941
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala
deleted file mode 100644
index c5fd16d8c..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala
+++ /dev/null
@@ -1,6 +0,0 @@
-package com.twitter.simclustersann.exceptions
-
-case class MissingClusterConfigForSimClustersAnnVariantException(sannServiceName: String)
- extends IllegalStateException(
- s"No cluster configuration found for service ($sannServiceName)",
- null)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD
deleted file mode 100644
index cb28d02b4..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD
+++ /dev/null
@@ -1,13 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle/finagle-core/src/main",
- "finatra/inject/inject-app/src/main/java/com/twitter/inject/annotations",
- "finatra/inject/inject-core/src/main/scala",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/simclustersann/multicluster",
- "scrooge/scrooge-core/src/main/scala",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions",
- "simclusters-ann/thrift/src/main/thrift:thrift-scala",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx
new file mode 100644
index 000000000..9fe5641a5
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx
new file mode 100644
index 000000000..222e06539
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala
deleted file mode 100644
index f9c9a354f..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala
+++ /dev/null
@@ -1,43 +0,0 @@
-package com.twitter.simclustersann.filters
-
-import com.twitter.finagle.Service
-import com.twitter.finagle.SimpleFilter
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.scrooge.Request
-import com.twitter.scrooge.Response
-import com.twitter.simclustersann.thriftscala.SimClustersANNService
-import com.twitter.util.Future
-import javax.inject.Inject
-import javax.inject.Singleton
-
-@Singleton
-class GetTweetCandidatesResponseStatsFilter @Inject() (
- statsReceiver: StatsReceiver)
- extends SimpleFilter[Request[SimClustersANNService.GetTweetCandidates.Args], Response[
- SimClustersANNService.GetTweetCandidates.SuccessType
- ]] {
-
- private[this] val stats = statsReceiver.scope("method_response_stats").scope("getTweetCandidates")
- private[this] val candidateScoreStats = stats.stat("candidate_score_x1000")
- private[this] val emptyResponseCounter = stats.counter("empty")
- private[this] val nonEmptyResponseCounter = stats.counter("non_empty")
- override def apply(
- request: Request[SimClustersANNService.GetTweetCandidates.Args],
- service: Service[Request[SimClustersANNService.GetTweetCandidates.Args], Response[
- SimClustersANNService.GetTweetCandidates.SuccessType
- ]]
- ): Future[Response[SimClustersANNService.GetTweetCandidates.SuccessType]] = {
- val response = service(request)
-
- response.onSuccess { successResponse =>
- if (successResponse.value.size == 0)
- emptyResponseCounter.incr()
- else
- nonEmptyResponseCounter.incr()
- successResponse.value.foreach { candidate =>
- candidateScoreStats.add(candidate.score.toFloat * 1000)
- }
- }
- response
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx
new file mode 100644
index 000000000..67a82917a
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala
deleted file mode 100644
index 8cfa088dd..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala
+++ /dev/null
@@ -1,53 +0,0 @@
-package com.twitter.simclustersann.filters
-
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.Service
-import com.twitter.finagle.SimpleFilter
-import com.twitter.relevance_platform.simclustersann.multicluster.ServiceNameMapper
-import com.twitter.scrooge.Request
-import com.twitter.scrooge.Response
-import com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantException
-import com.twitter.simclustersann.thriftscala.SimClustersANNService
-import com.twitter.util.Future
-import javax.inject.Inject
-import javax.inject.Singleton
-
-@Singleton
-class SimClustersAnnVariantFilter @Inject() (
- serviceNameMapper: ServiceNameMapper,
- serviceIdentifier: ServiceIdentifier,
-) extends SimpleFilter[Request[SimClustersANNService.GetTweetCandidates.Args], Response[
- SimClustersANNService.GetTweetCandidates.SuccessType
- ]] {
- override def apply(
- request: Request[SimClustersANNService.GetTweetCandidates.Args],
- service: Service[Request[SimClustersANNService.GetTweetCandidates.Args], Response[
- SimClustersANNService.GetTweetCandidates.SuccessType
- ]]
- ): Future[Response[SimClustersANNService.GetTweetCandidates.SuccessType]] = {
-
- validateRequest(request)
- service(request)
- }
-
- private def validateRequest(
- request: Request[SimClustersANNService.GetTweetCandidates.Args]
- ): Unit = {
- val modelVersion = request.args.query.sourceEmbeddingId.modelVersion
- val embeddingType = request.args.query.config.candidateEmbeddingType
-
- val actualServiceName = serviceIdentifier.service
-
- val expectedServiceName = serviceNameMapper.getServiceName(modelVersion, embeddingType)
-
- expectedServiceName match {
- case Some(name) if name == actualServiceName => ()
- case _ =>
- throw InvalidRequestForSimClustersAnnVariantException(
- modelVersion,
- embeddingType,
- actualServiceName,
- expectedServiceName)
- }
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD
deleted file mode 100644
index dcca09b7f..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD
+++ /dev/null
@@ -1,24 +0,0 @@
-scala_library(
- compiler_option_sets = ["fatal_warnings"],
- tags = ["bazel-compatible"],
- dependencies = [
- "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
- "finagle/finagle-stats",
- "finatra/inject/inject-core/src/main/scala",
- "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/store/strato",
- "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/injection",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/readablestore",
- "relevance-platform/src/main/scala/com/twitter/relevance_platform/simclustersann/multicluster",
- "representation-manager/client/src/main/scala/com/twitter/representation_manager",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common",
- "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions",
- "simclusters-ann/thrift/src/main/thrift:thrift-scala",
- "src/scala/com/twitter/simclusters_v2/common",
- "src/scala/com/twitter/simclusters_v2/summingbird",
- "src/scala/com/twitter/storehaus_internal/memcache",
- "src/scala/com/twitter/storehaus_internal/util",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala",
- ],
-)
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx
new file mode 100644
index 000000000..86f66e894
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx
new file mode 100644
index 000000000..a501617a1
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala
deleted file mode 100644
index 6abc37b8d..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala
+++ /dev/null
@@ -1,34 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.finagle.memcached.Client
-import javax.inject.Singleton
-import com.twitter.conversions.DurationOps._
-import com.twitter.inject.TwitterModule
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.inject.annotations.Flag
-import com.twitter.simclustersann.common.FlagNames
-import com.twitter.storehaus_internal.memcache.MemcacheStore
-import com.twitter.storehaus_internal.util.ClientName
-import com.twitter.storehaus_internal.util.ZkEndPoint
-
-object CacheModule extends TwitterModule {
-
- @Singleton
- @Provides
- def providesCache(
- @Flag(FlagNames.CacheDest) cacheDest: String,
- @Flag(FlagNames.CacheTimeout) cacheTimeout: Int,
- serviceIdentifier: ServiceIdentifier,
- stats: StatsReceiver
- ): Client =
- MemcacheStore.memcachedClient(
- name = ClientName("memcache_simclusters_ann"),
- dest = ZkEndPoint(cacheDest),
- timeout = cacheTimeout.milliseconds,
- retries = 0,
- statsReceiver = stats.scope("cache_client"),
- serviceIdentifier = serviceIdentifier
- )
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx
new file mode 100644
index 000000000..0bfeece9b
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala
deleted file mode 100644
index 84fec3974..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala
+++ /dev/null
@@ -1,15 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.inject.TwitterModule
-import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfigMapper
-import javax.inject.Singleton
-
-object ClusterConfigMapperModule extends TwitterModule {
- @Singleton
- @Provides
- def providesClusterConfigMapper(
- ): ClusterConfigMapper = {
- ClusterConfigMapper
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx
new file mode 100644
index 000000000..de8920c79
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala
deleted file mode 100644
index ae4092760..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala
+++ /dev/null
@@ -1,25 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.inject.TwitterModule
-import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfig
-import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfigMapper
-import com.twitter.simclustersann.exceptions.MissingClusterConfigForSimClustersAnnVariantException
-import javax.inject.Singleton
-
-object ClusterConfigModule extends TwitterModule {
- @Singleton
- @Provides
- def providesClusterConfig(
- serviceIdentifier: ServiceIdentifier,
- clusterConfigMapper: ClusterConfigMapper
- ): ClusterConfig = {
- val serviceName = serviceIdentifier.service
-
- clusterConfigMapper.getClusterConfig(serviceName) match {
- case Some(config) => config
- case None => throw MissingClusterConfigForSimClustersAnnVariantException(serviceName)
- }
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx
new file mode 100644
index 000000000..45b832db1
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala
deleted file mode 100644
index 34281fa22..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala
+++ /dev/null
@@ -1,95 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.conversions.DurationOps._
-import com.twitter.decider.Decider
-import com.twitter.finagle.memcached.Client
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.hermit.store.common.ObservedCachedReadableStore
-import com.twitter.hermit.store.common.ObservedMemcachedReadableStore
-import com.twitter.inject.TwitterModule
-import com.twitter.inject.annotations.Flag
-import com.twitter.relevance_platform.common.injection.LZ4Injection
-import com.twitter.relevance_platform.common.injection.SeqObjectInjection
-import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfig
-import com.twitter.relevance_platform.simclustersann.multicluster.ClusterTweetIndexStoreConfig
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.ModelVersions
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.summingbird.stores.ClusterKey
-import com.twitter.simclusters_v2.summingbird.stores.TopKTweetsForClusterKeyReadableStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclustersann.common.FlagNames
-import com.twitter.storehaus.ReadableStore
-
-import javax.inject.Singleton
-
-object ClusterTweetIndexProviderModule extends TwitterModule {
-
- @Singleton
- @Provides
- // Provides ClusterTweetIndex Store based on different maxResults settings on the same store
- // Create a different provider if index is in a different store
- def providesClusterTweetIndex(
- @Flag(FlagNames.MaxTopTweetPerCluster) maxTopTweetPerCluster: Int,
- @Flag(FlagNames.CacheAsyncUpdate) asyncUpdate: Boolean,
- clusterConfig: ClusterConfig,
- serviceIdentifier: ServiceIdentifier,
- stats: StatsReceiver,
- decider: Decider,
- simClustersANNCacheClient: Client
- ): ReadableStore[ClusterId, Seq[(TweetId, Double)]] = {
- // Build the underling cluster-to-tweet store
- val topTweetsForClusterStore = clusterConfig.clusterTweetIndexStoreConfig match {
- // If the config returns Manhattan tweet index config, we read from a RO MH store
- case manhattanConfig: ClusterTweetIndexStoreConfig.Manhattan =>
- TopKTweetsForClusterKeyReadableStore.getClusterToTopKTweetsStoreFromManhattanRO(
- maxTopTweetPerCluster,
- manhattanConfig,
- serviceIdentifier)
- case memCacheConfig: ClusterTweetIndexStoreConfig.Memcached =>
- TopKTweetsForClusterKeyReadableStore.getClusterToTopKTweetsStoreFromMemCache(
- maxTopTweetPerCluster,
- memCacheConfig,
- serviceIdentifier)
- case _ =>
- // Bad instance
- ReadableStore.empty
- }
-
- val embeddingType: EmbeddingType = clusterConfig.candidateTweetEmbeddingType
- val modelVersion: String = ModelVersions.toKnownForModelVersion(clusterConfig.modelVersion)
-
- val store: ReadableStore[ClusterId, Seq[(TweetId, Double)]] =
- topTweetsForClusterStore.composeKeyMapping { id: ClusterId =>
- ClusterKey(id, modelVersion, embeddingType)
- }
-
- val memcachedTopTweetsForClusterStore =
- ObservedMemcachedReadableStore.fromCacheClient(
- backingStore = store,
- cacheClient = simClustersANNCacheClient,
- ttl = 15.minutes,
- asyncUpdate = asyncUpdate
- )(
- valueInjection = LZ4Injection.compose(SeqObjectInjection[(Long, Double)]()),
- statsReceiver = stats.scope("cluster_tweet_index_mem_cache"),
- keyToString = { k =>
- // prod cache key : SimClusters_LZ4/cluster_to_tweet/clusterId_embeddingType_modelVersion
- s"scz:c2t:${k}_${embeddingType}_${modelVersion}_$maxTopTweetPerCluster"
- }
- )
-
- val cachedStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]] = {
- ObservedCachedReadableStore.from[ClusterId, Seq[(TweetId, Double)]](
- memcachedTopTweetsForClusterStore,
- ttl = 10.minute,
- maxKeys = 150000,
- cacheName = "cluster_tweet_index_cache",
- windowSize = 10000L
- )(stats.scope("cluster_tweet_index_store"))
- }
- cachedStore
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx
new file mode 100644
index 000000000..561cdb1e6
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala
deleted file mode 100644
index 678943d2a..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala
+++ /dev/null
@@ -1,99 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.twitter.finatra.mtls.thriftmux.modules.MtlsThriftWebFormsModule
-import com.twitter.finatra.thrift.ThriftServer
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.InternalId
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.thriftwebforms.MethodOptions
-import com.twitter.thriftwebforms.view.ServiceResponseView
-import com.twitter.util.Future
-import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate
-import com.twitter.simclustersann.thriftscala.Query
-import com.twitter.simclustersann.thriftscala.SimClustersANNConfig
-import com.twitter.simclustersann.thriftscala.ScoringAlgorithm
-import com.twitter.thriftwebforms.MethodOptions.Access
-import scala.reflect.ClassTag
-import com.twitter.simclustersann.thriftscala.SimClustersANNService
-import scala.collection.mutable
-
-class CustomMtlsThriftWebFormsModule[T: ClassTag](server: ThriftServer)
- extends MtlsThriftWebFormsModule[T](server: ThriftServer) {
-
- private val Nbsp = " "
- private val LdapGroups = Seq("recosplat-sensitive-data-medium", "simclusters-ann-admins")
-
- override protected def methodOptions: Map[String, MethodOptions] = {
- val tweetId = 1568796529690902529L
- val sannDefaultQuery = SimClustersANNService.GetTweetCandidates.Args(
- query = Query(
- sourceEmbeddingId = SimClustersEmbeddingId(
- embeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet,
- modelVersion = ModelVersion.Model20m145k2020,
- internalId = InternalId.TweetId(tweetId)
- ),
- config = SimClustersANNConfig(
- maxNumResults = 10,
- minScore = 0.0,
- candidateEmbeddingType = EmbeddingType.LogFavBasedTweet,
- maxTopTweetsPerCluster = 400,
- maxScanClusters = 50,
- maxTweetCandidateAgeHours = 24,
- minTweetCandidateAgeHours = 0,
- annAlgorithm = ScoringAlgorithm.CosineSimilarity
- )
- ))
-
- Seq("getTweetCandidates")
- .map(
- _ -> MethodOptions(
- defaultRequestValue = Some(sannDefaultQuery),
- responseRenderers = Seq(renderTimeline),
- allowedAccessOverride = Some(Access.ByLdapGroup(LdapGroups))
- )).toMap
- }
-
- val FullAccessLdapGroups: Seq[String] =
- Seq(
- "recosplat-sensitive-data-medium",
- "simclusters-ann-admins",
- "recos-platform-admins"
- )
-
- override protected def defaultMethodAccess: MethodOptions.Access = {
- MethodOptions.Access.ByLdapGroup(FullAccessLdapGroups)
- }
-
- def renderTimeline(r: AnyRef): Future[ServiceResponseView] = {
- val simClustersANNTweetCandidates = r match {
- case response: Iterable[_] =>
- response.map(x => x.asInstanceOf[SimClustersANNTweetCandidate]).toSeq
- case _ => Seq()
- }
- renderTweets(simClustersANNTweetCandidates)
- }
-
- private def renderTweets(
- simClustersANNTweetCandidates: Seq[SimClustersANNTweetCandidate]
- ): Future[ServiceResponseView] = {
- val htmlSb = new mutable.StringBuilder()
- val headerHtml = s"""Tweet Candidates
"""
- val tweetsHtml = simClustersANNTweetCandidates.map { simClustersANNTweetCandidate =>
- val tweetId = simClustersANNTweetCandidate.tweetId
- val score = simClustersANNTweetCandidate.score
- s""" score: $score
"""
- }.mkString
-
- htmlSb ++= headerHtml
- htmlSb ++= Nbsp
- htmlSb ++= tweetsHtml
- Future.value(
- ServiceResponseView(
- "SimClusters ANN Tweet Candidates",
- htmlSb.toString(),
- Seq("//platform.twitter.com/widgets.js")
- )
- )
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx
new file mode 100644
index 000000000..a07233084
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala
deleted file mode 100644
index 7111501fe..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.decider.Decider
-import com.twitter.finagle.memcached.{Client => MemcachedClient}
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.inject.TwitterModule
-import com.twitter.representation_manager.StoreBuilder
-import com.twitter.representation_manager.config.{
- DefaultClientConfig => RepresentationManagerDefaultClientConfig
-}
-import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType
-import com.twitter.simclusters_v2.thriftscala.EmbeddingType._
-import com.twitter.simclusters_v2.thriftscala.ModelVersion
-import com.twitter.simclusters_v2.thriftscala.ModelVersion._
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.storehaus.ReadableStore
-import com.twitter.strato.client.{Client => StratoClient}
-import javax.inject.Singleton
-
-object EmbeddingStoreModule extends TwitterModule {
-
- val TweetEmbeddings: Set[SimClustersEmbeddingView] = Set(
- SimClustersEmbeddingView(LogFavLongestL2EmbeddingTweet, Model20m145kUpdated),
- SimClustersEmbeddingView(LogFavLongestL2EmbeddingTweet, Model20m145k2020)
- )
-
- val UserEmbeddings: Set[SimClustersEmbeddingView] = Set(
- // KnownFor
- SimClustersEmbeddingView(FavBasedProducer, Model20m145kUpdated),
- SimClustersEmbeddingView(FavBasedProducer, Model20m145k2020),
- SimClustersEmbeddingView(FollowBasedProducer, Model20m145k2020),
- SimClustersEmbeddingView(AggregatableLogFavBasedProducer, Model20m145k2020),
- // InterestedIn
- SimClustersEmbeddingView(UnfilteredUserInterestedIn, Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedAverageAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(
- LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE,
- Model20m145k2020),
- SimClustersEmbeddingView(UserNextInterestedIn, Model20m145k2020),
- SimClustersEmbeddingView(LogFavBasedUserInterestedInFromAPE, Model20m145k2020)
- )
-
- @Singleton
- @Provides
- def providesEmbeddingStore(
- stratoClient: StratoClient,
- memCachedClient: MemcachedClient,
- decider: Decider,
- stats: StatsReceiver
- ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = {
-
- val rmsStoreBuilder = new StoreBuilder(
- clientConfig = RepresentationManagerDefaultClientConfig,
- stratoClient = stratoClient,
- memCachedClient = memCachedClient,
- globalStats = stats,
- )
-
- val underlyingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = {
- val tweetEmbeddingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = TweetEmbeddings
- .map(embeddingView =>
- (
- (embeddingView.embeddingType, embeddingView.modelVersion),
- rmsStoreBuilder
- .buildSimclustersTweetEmbeddingStoreWithEmbeddingIdAsKey(embeddingView))).toMap
-
- val userEmbeddingStores: Map[
- (EmbeddingType, ModelVersion),
- ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding]
- ] = UserEmbeddings
- .map(embeddingView =>
- (
- (embeddingView.embeddingType, embeddingView.modelVersion),
- rmsStoreBuilder
- .buildSimclustersUserEmbeddingStoreWithEmbeddingIdAsKey(embeddingView))).toMap
-
- tweetEmbeddingStores ++ userEmbeddingStores
- }
-
- SimClustersEmbeddingStore.buildWithDecider(
- underlyingStores = underlyingStores,
- decider = decider,
- statsReceiver = stats
- )
- }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx
new file mode 100644
index 000000000..ea8d535b8
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala
deleted file mode 100644
index ebcaeca27..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala
+++ /dev/null
@@ -1,44 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.twitter.inject.TwitterModule
-import com.twitter.simclustersann.common.FlagNames
-
-/**
- * Declares the command-line flags read by the SimClusters ANN server modules.
- * Each flag is registered under the name held in [[FlagNames]].
- */
-object FlagsModule extends TwitterModule {
-
-  // Request timeout threshold. NOTE(review): the unit is not stated here — confirm at the use site.
-  flag[Int](FlagNames.ServiceTimeout, default = 40, help = "The threshold of Request Timeout")
-
-  // Decider key for the dark traffic filter.
-  flag[String](
-    FlagNames.DarkTrafficFilterDeciderKey,
-    default = "dark_traffic_filter",
-    help = "Dark traffic filter decider key"
-  )
-
-  // Destination path of the memcache service.
-  flag[String](
-    FlagNames.CacheDest,
-    default = "/s/cache/content_recommender_unified_v2",
-    help = "Path to memcache service. Currently using CR uniform scoring cache"
-  )
-
-  // Memcache timeout threshold. NOTE(review): the unit is not stated here — confirm at the use site.
-  flag[Int](FlagNames.CacheTimeout, default = 15, help = "The threshold of MemCache Timeout")
-
-  // Toggles asynchronous memcache updates; disabled by default.
-  flag[Boolean](
-    FlagNames.CacheAsyncUpdate,
-    default = false,
-    help = "Whether to enable the async update for the MemCache"
-  )
-
-  // Upper bound on the tweets taken from each cluster.
-  flag[Int](
-    FlagNames.MaxTopTweetPerCluster,
-    default = 200,
-    help = "Maximum number of tweets to take per each simclusters"
-  )
-
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx
new file mode 100644
index 000000000..eb0df8ab4
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala
deleted file mode 100644
index c66ade392..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.inject.TwitterModule
-import com.twitter.inject.annotations.Flag
-import com.twitter.simclustersann.common.FlagNames.NumberOfThreads
-import com.twitter.util.ExecutorServiceFuturePool
-import java.util.concurrent.Executors
-import javax.inject.Singleton
-/**
- * Guice module that declares the NumberOfThreads flag and provides a singleton future pool
- * backed by a fixed-size thread pool.
- */
-object FuturePoolProvider extends TwitterModule {
-  flag[Int](
-    name = NumberOfThreads,
-    default = 20,
-    help = "The number of threads in the future pool."
-  )
-
-  /**
-   * Builds the future pool on top of a fixed thread pool.
-   *
-   * @param numberOfThreads size of the fixed thread pool, injected from the NumberOfThreads flag
-   * @return a future pool whose `toString` is prefixed with "warmup-future-pool-"
-   */
-  @Singleton
-  @Provides
-  def providesFuturePool(
-    @Flag(NumberOfThreads) numberOfThreads: Int
-  ): ExecutorServiceFuturePool = {
-    val threadPool = Executors.newFixedThreadPool(numberOfThreads)
-    new ExecutorServiceFuturePool(threadPool) {
-      // The label used to end with an unmatched ")"; the stray character is removed.
-      override def toString: String = s"warmup-future-pool-$executor"
-    }
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx
new file mode 100644
index 000000000..eb9494f72
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala
deleted file mode 100644
index 66e26d4f5..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala
+++ /dev/null
@@ -1,23 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.common.util.concurrent.RateLimiter
-import com.google.inject.Provides
-import com.twitter.inject.TwitterModule
-import com.twitter.inject.annotations.Flag
-import com.twitter.simclustersann.common.FlagNames.RateLimiterQPS
-import javax.inject.Singleton
-
-/**
- * Guice module that declares the RateLimiterQPS flag and provides a singleton Guava
- * RateLimiter created from its value.
- */
-object RateLimiterModule extends TwitterModule {
-  flag[Int](RateLimiterQPS, default = 1000, help = "The QPS allowed by the rate limiter.")
-
-  /**
-   * @param rateLimiterQps the allowed QPS, injected from the RateLimiterQPS flag
-   * @return a rate limiter created with that rate
-   */
-  @Singleton
-  @Provides
-  def providesRateLimiter(@Flag(RateLimiterQPS) rateLimiterQps: Int): RateLimiter = {
-    // RateLimiter.create takes a Double; make the widening from Int explicit.
-    val permitsPerSecond: Double = rateLimiterQps.toDouble
-    RateLimiter.create(permitsPerSecond)
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx
new file mode 100644
index 000000000..7f5d9e6c0
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala
deleted file mode 100644
index 91a38f2a1..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala
+++ /dev/null
@@ -1,15 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.inject.TwitterModule
-import com.twitter.relevance_platform.simclustersann.multicluster.ServiceNameMapper
-import javax.inject.Singleton
-
-/** Guice module that exposes the `ServiceNameMapper` object as a singleton binding. */
-object ServiceNameMapperModule extends TwitterModule {
-
-  /** @return the `ServiceNameMapper` object itself; nothing is constructed here */
-  @Singleton
-  @Provides
-  def providesServiceNameMapper(): ServiceNameMapper = ServiceNameMapper
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx
new file mode 100644
index 000000000..aa7172646
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala
deleted file mode 100644
index b5f9ee5da..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala
+++ /dev/null
@@ -1,47 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import com.twitter.finagle.stats.StatsReceiver
-import com.twitter.inject.TwitterModule
-import com.twitter.simclusters_v2.common.ClusterId
-import com.twitter.simclusters_v2.common.SimClustersEmbedding
-import com.twitter.simclusters_v2.common.TweetId
-import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
-import com.twitter.storehaus.ReadableStore
-import javax.inject.Singleton
-import com.twitter.simclustersann.candidate_source.ApproximateCosineSimilarity
-import com.twitter.simclustersann.candidate_source.ExperimentalApproximateCosineSimilarity
-import com.twitter.simclustersann.candidate_source.OptimizedApproximateCosineSimilarity
-import com.twitter.simclustersann.candidate_source.SimClustersANNCandidateSource
-
-/**
- * Guice module that assembles the singleton [[SimClustersANNCandidateSource]], choosing the
- * approximate-cosine-similarity implementation from the `approximate_cosine_similarity` flag.
- */
-object SimClustersANNCandidateSourceModule extends TwitterModule {
-
-  val acsFlag = flag[String](
-    "approximate_cosine_similarity",
-    default = "original",
-    help =
-      "Select different implementations of the approximate cosine similarity algorithm, for testing optimizations"
-  )
-
-  /**
-   * @param embeddingStore store resolving an embedding id to its SimClusters embedding
-   * @param cachedClusterTweetIndexStore store resolving a cluster id to (tweet id, Double) pairs
-   * @param statsReceiver parent stats receiver; scoped to "simClustersANNCandidateSource"
-   * @return the candidate source wired with the selected similarity implementation
-   */
-  @Singleton
-  @Provides
-  def provides(
-    embeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding],
-    cachedClusterTweetIndexStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]],
-    statsReceiver: StatsReceiver
-  ): SimClustersANNCandidateSource = {
-
-    val similarityImpl = acsFlag() match {
-      case "optimized" => OptimizedApproximateCosineSimilarity
-      case "experimental" => ExperimentalApproximateCosineSimilarity
-      // "original" and any unrecognized value both select the default implementation.
-      case _ => ApproximateCosineSimilarity
-    }
-    val scopedStats = statsReceiver.scope("simClustersANNCandidateSource")
-
-    new SimClustersANNCandidateSource(
-      approximateCosineSimilarity = similarityImpl,
-      clusterTweetCandidatesStore = cachedClusterTweetIndexStore,
-      simClustersEmbeddingStore = embeddingStore,
-      statsReceiver = scopedStats
-    )
-  }
-}
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx
new file mode 100644
index 000000000..113da2abc
Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx differ
diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala
deleted file mode 100644
index 0766c70a7..000000000
--- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala
+++ /dev/null
@@ -1,20 +0,0 @@
-package com.twitter.simclustersann.modules
-
-import com.google.inject.Provides
-import javax.inject.Singleton
-import com.twitter.inject.TwitterModule
-import com.twitter.finagle.mtls.authentication.ServiceIdentifier
-import com.twitter.strato.client.Client
-import com.twitter.strato.client.Strato
-
-/** Guice module that provides a singleton Strato client configured with mutual TLS. */
-object StratoClientProviderModule extends TwitterModule {
-
-  /**
-   * Despite the method name, this provider returns a Strato `Client`, not a cache.
-   *
-   * @param serviceIdentifier identity handed to the client's mutual-TLS configuration
-   * @return the built Strato client
-   */
-  @Singleton
-  @Provides
-  def providesCache(serviceIdentifier: ServiceIdentifier): Client = {
-    val mtlsClientBuilder = Strato.client.withMutualTls(serviceIdentifier)
-    mtlsClientBuilder.build()
-  }
-
-}
diff --git a/simclusters-ann/thrift/src/main/thrift/BUILD b/simclusters-ann/thrift/src/main/thrift/BUILD
deleted file mode 100644
index fce3b9c8c..000000000
--- a/simclusters-ann/thrift/src/main/thrift/BUILD
+++ /dev/null
@@ -1,16 +0,0 @@
-# Thrift library target built from every .thrift file under this directory.
-# Bindings are generated for Java and Scala and published under the two provides_* names below.
-create_thrift_libraries(
- base_name = "thrift",
- sources = ["**/*.thrift"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependency_roots = [
- "finatra-internal/thrift/src/main/thrift",
- "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift",
- ],
- generate_languages = [
- "java",
- "scala",
- ],
- provides_java_name = "simclusters-ann-thrift-java",
- provides_scala_name = "simclusters-ann-thrift-scala",
-)
diff --git a/simclusters-ann/thrift/src/main/thrift/BUILD.docx b/simclusters-ann/thrift/src/main/thrift/BUILD.docx
new file mode 100644
index 000000000..267769796
Binary files /dev/null and b/simclusters-ann/thrift/src/main/thrift/BUILD.docx differ
diff --git a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx
new file mode 100644
index 000000000..0311cada9
Binary files /dev/null and b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx differ
diff --git a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift
deleted file mode 100644
index 9c327febe..000000000
--- a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift
+++ /dev/null
@@ -1,59 +0,0 @@
-namespace java com.twitter.simclustersann.thriftjava
-#@namespace scala com.twitter.simclustersann.thriftscala
-
-include "finatra-thrift/finatra_thrift_exceptions.thrift"
-include "com/twitter/simclusters_v2/identifier.thrift"
-include "com/twitter/simclusters_v2/score.thrift"
-
-// Request argument of SimClustersANNService.getTweetCandidates: the id of the source
-// embedding plus the ANN configuration to apply.
-struct Query {
- 1: required identifier.SimClustersEmbeddingId sourceEmbeddingId;
- 2: required SimClustersANNConfig config;
-}
-
-// A candidate tweet: its id (annotated as personal data of type TweetId) and a score.
-struct SimClustersANNTweetCandidate {
- 1: required i64 tweetId (personalDataType = 'TweetId');
- 2: required double score;
-}
-
-// Configuration carried by Query; every field is required.
-// NOTE(review): field semantics are not defined in this file. The names suggest result and
-// scan limits, a score threshold, a candidate age window in hours, and the scoring
-// algorithm — confirm against the server implementation.
-struct SimClustersANNConfig {
- 1: required i32 maxNumResults;
- 2: required double minScore;
- 3: required identifier.EmbeddingType candidateEmbeddingType;
- 4: required i32 maxTopTweetsPerCluster;
- 5: required i32 maxScanClusters;
- 6: required i32 maxTweetCandidateAgeHours;
- 7: required i32 minTweetCandidateAgeHours;
- 8: required ScoringAlgorithm annAlgorithm;
-}
-
-/**
- * The algorithm type to identify the score algorithm.
- * Referenced by SimClustersANNConfig.annAlgorithm; annotated as carrying no personal data.
- **/
-enum ScoringAlgorithm {
- DotProduct = 1,
- CosineSimilarity = 2,
- LogCosineSimilarity = 3,
- CosineSimilarityNoSourceEmbeddingNormalization = 4, // Score = (Source dot Candidate) / candidate_l2_norm
-}(hasPersonalData = 'false')
-
-// Error codes carried by InvalidResponseParameterException.
-enum InvalidResponseParameter {
- INVALID_EMBEDDING_TYPE = 1,
- INVALID_MODEL_VERSION = 2,
-}
-
-// Declared in the throws clause of SimClustersANNService.getTweetCandidates.
-// Carries a required error code and an optional human-readable failure reason.
-exception InvalidResponseParameterException {
- 1: required InvalidResponseParameter errorCode,
- 2: optional string message // failure reason
-}
-
-/**
- * Thrift service that returns tweet candidates for a query.
- **/
-service SimClustersANNService {
-
-  /**
-   * Returns the tweet candidates for the given query.
-   *
-   * NOTE(review): the list element type was missing (a bare `list` is not valid Thrift IDL).
-   * It is restored as SimClustersANNTweetCandidate, the only candidate struct declared in
-   * this file — confirm against the generated bindings.
-   **/
-  list<SimClustersANNTweetCandidate> getTweetCandidates(
-    1: required Query query;
-  ) throws (
-    1: InvalidResponseParameterException e;
-    2: finatra_thrift_exceptions.ServerError serverError;
-    3: finatra_thrift_exceptions.ClientError clientError;
-  );
-
-}
diff --git a/src/java/com/twitter/search/README.docx b/src/java/com/twitter/search/README.docx
new file mode 100644
index 000000000..014caa211
Binary files /dev/null and b/src/java/com/twitter/search/README.docx differ
diff --git a/src/java/com/twitter/search/README.md b/src/java/com/twitter/search/README.md
deleted file mode 100644
index f92a9bdf3..000000000
--- a/src/java/com/twitter/search/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# Tweet Search System (Earlybird)
-> **TL;DR** Tweet Search System (Earlybird) finds tweets from people you follow, ranks them, and serves the tweets to Home.
-
-## What is Tweet Search System (Earlybird)?
-[Earlybird](http://notes.stephenholiday.com/Earlybird.pdf) is a **real-time search system** based on [Apache Lucene](https://lucene.apache.org/) to support the high volume of queries and content updates. The major use cases are Relevance Search (specifically, Text search) and Timeline In-network Tweet retrieval (or UserID based search). It is designed to enable the efficient indexing and querying of billions of tweets, and to provide low-latency search results, even with heavy query loads.
-
-## How it is related to the Home Timeline Recommendation Algorithm
-
-![in-network](img/in-network.png)
-
-At Twitter, we use Tweet Search System (Earlybird) to do Home Timeline In-network Tweet retrieval: given a list of following users, find their recently posted tweets. Earlybird (Search Index) is the major candidate source for in-network tweets across Following tab and For You tab.
-
-
-## High-level architecture
-We split our entire tweet search index into three clusters: a **realtime** cluster indexing all public tweets posted in about the last 7 days, a **protected** cluster indexing all protected tweets for the same timeframe, and an **archive** cluster indexing all tweets ever posted, up to about two days ago.
-
-Earlybird addresses the challenges of scaling real-time search by splitting each cluster across multiple **partitions**, each responsible for a portion of the index. The architecture uses a distributed *inverted index* that is sharded and replicated. This design allows for efficient index updates and query processing.
-
-The system also employs an incremental indexing approach, enabling it to process and index new tweets in real-time as they arrive. With a single-writer, multiple-reader structure, Earlybird can handle a large number of real-time updates and queries concurrently while maintaining low query latency. The system can achieve high query throughput and low query latency while maintaining a high degree of index freshness.
-
-
-### Indexing
-* Ingesters read tweets and user modifications from Kafka topics, extract fields and features from them, and write the extracted data to intermediate Kafka topics for Earlybirds to consume, index and serve.
-* Feature Update Service feeds feature updates such as up-to-date engagement (likes, retweets, replies) counts to Earlybird.
-![indexing](img/indexing.png)
-
-### Serving
-Earlybird roots fan out requests to different Earlybird clusters or partitions. Upon receiving responses from the clusters or partitions, roots merge the responses before finally returning the merged response to the client.
-![serving](img/serving.png)
-
-## Use cases
-
-1. Tweet Search
- * Top search
- * Latest search
-
-![top](img/top-search.png)
-
-2. Candidate generation
- * Timeline (For You Tab, Following Tab)
- * Notifications
-
-![home](img/foryou.png)
-
-## References
-* "Earlybird: Real-Time Search at Twitter" (http://notes.stephenholiday.com/Earlybird.pdf)
-* "Reducing search indexing latency to one second" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2020/reducing-search-indexing-latency-to-one-second)
-* "Omnisearch index formats" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2016/omnisearch-index-formats)
-
-
diff --git a/src/java/com/twitter/search/common/README.docx b/src/java/com/twitter/search/common/README.docx
new file mode 100644
index 000000000..8227df958
Binary files /dev/null and b/src/java/com/twitter/search/common/README.docx differ
diff --git a/src/java/com/twitter/search/common/README.md b/src/java/com/twitter/search/common/README.md
deleted file mode 100644
index c7f2e38bb..000000000
--- a/src/java/com/twitter/search/common/README.md
+++ /dev/null
@@ -1 +0,0 @@
-Contains code that is common to multiple earlybird services (ingesters, roots and earlybird).
\ No newline at end of file
diff --git a/src/java/com/twitter/search/common/converter/earlybird/BUILD b/src/java/com/twitter/search/common/converter/earlybird/BUILD
deleted file mode 100644
index a5d4ea4ae..000000000
--- a/src/java/com/twitter/search/common/converter/earlybird/BUILD
+++ /dev/null
@@ -1,57 +0,0 @@
-# Java library built from every .java file in this directory (the Earlybird converter package),
-# targeting the java8 platform. The dependencies listed are third-party jars, shared
-# search/common source targets, and generated Thrift Java bindings.
-java_library(
- sources = ["*.java"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/guava",
- "3rdparty/jvm/com/google/inject:guice",
- "3rdparty/jvm/com/twitter/elephantbird:core",
- "3rdparty/jvm/geo/google:geoGoogle",
- "3rdparty/jvm/joda-time",
- "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
- "3rdparty/jvm/org/apache/httpcomponents:httpcore",
- "3rdparty/jvm/org/apache/lucene:lucene-core",
- "3rdparty/jvm/org/apache/thrift:libthrift",
- "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
- "3rdparty/jvm/org/slf4j:slf4j-api",
- "cuad/projects/ner/thrift/src/main/thrift:thrift-java",
- "decider/src/main/scala",
- "src/java/com/twitter/common/base",
- "src/java/com/twitter/common/collections",
- "src/java/com/twitter/common/text/language:locale-util",
- "src/java/com/twitter/common/text/token",
- "src/java/com/twitter/common/text/util:token-util",
- "src/java/com/twitter/common_internal/text:text-penguin7",
- "src/java/com/twitter/common_internal/text/version",
- "src/java/com/twitter/search/common/config",
- "src/java/com/twitter/search/common/constants",
- "src/java/com/twitter/search/common/debug",
- "src/java/com/twitter/search/common/decider",
- "src/java/com/twitter/search/common/encoding/docvalues",
- "src/java/com/twitter/search/common/encoding/features",
- "src/java/com/twitter/search/common/metrics",
- "src/java/com/twitter/search/common/partitioning/base",
- "src/java/com/twitter/search/common/partitioning/snowflakeparser",
- "src/java/com/twitter/search/common/relevance:entities_and_filters",
- "src/java/com/twitter/search/common/relevance:text",
- "src/java/com/twitter/search/common/relevance/features",
- "src/java/com/twitter/search/common/schema",
- "src/java/com/twitter/search/common/schema/base",
- "src/java/com/twitter/search/common/schema/earlybird",
- "src/java/com/twitter/search/common/util:longintconverter",
- "src/java/com/twitter/search/common/util/analysis",
- "src/java/com/twitter/search/common/util/lang",
- "src/java/com/twitter/search/common/util/spatial",
- "src/java/com/twitter/search/common/util/text",
- "src/java/com/twitter/search/common/util/text/regex",
- "src/java/com/twitter/search/common/util/thrift:thrift-utils",
- "src/java/com/twitter/search/common/util/url",
- "src/java/com/twitter/search/ingester/model",
- "src/thrift/com/twitter/search/common:constants-java",
- "src/thrift/com/twitter/search/common:indexing-java",
- "src/thrift/com/twitter/search/common:schema-java",
- "src/thrift/com/twitter/search/common/debug:debug-java",
- "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java",
- "src/thrift/com/twitter/tweetypie:tweet-java",
- ],
-)
diff --git a/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx b/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx
new file mode 100644
index 000000000..35120101d
Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx differ
diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx
new file mode 100644
index 000000000..bc5b8c6da
Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx differ
diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java
deleted file mode 100644
index afde8a84e..000000000
--- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java
+++ /dev/null
@@ -1,647 +0,0 @@
-package com.twitter.search.common.converter.earlybird;
-
-import java.io.IOException;
-import java.util.Date;
-import java.util.List;
-import java.util.Optional;
-import javax.annotation.concurrent.NotThreadSafe;
-
-import com.google.common.base.Preconditions;
-
-import org.apache.commons.collections.CollectionUtils;
-import org.joda.time.DateTime;
-import org.joda.time.DateTimeZone;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.twitter.common_internal.text.version.PenguinVersion;
-import com.twitter.search.common.converter.earlybird.EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures;
-import com.twitter.search.common.indexing.thriftjava.Place;
-import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
-import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment;
-import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
-import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
-import com.twitter.search.common.metrics.SearchCounter;
-import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
-import com.twitter.search.common.relevance.entities.GeoObject;
-import com.twitter.search.common.relevance.entities.TwitterMessage;
-import com.twitter.search.common.relevance.entities.TwitterQuotedMessage;
-import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
-import com.twitter.search.common.schema.base.Schema;
-import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
-import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
-import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
-import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
-import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
-import com.twitter.search.common.schema.thriftjava.ThriftDocument;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
-import com.twitter.search.common.util.spatial.GeoUtil;
-import com.twitter.search.common.util.text.NormalizerHelper;
-import com.twitter.tweetypie.thriftjava.ComposerSource;
-
-/**
- * Converts a TwitterMessage into a ThriftVersionedEvents. This is only responsible for data that
- * is available immediately when a Tweet is created. Some data, like URL data, isn't available
- * immediately, and so it is processed later, in the DelayedIndexingConverter and sent as an
- * update. In order to achieve this we create the document in 2 passes:
- *
- * 1. BasicIndexingConverter builds thriftVersionedEvents with the fields that do not require
- * external services.
- *
- * 2. DelayedIndexingConverter builds all the document fields depending on external services, once
- * those services have processed the relevant Tweet and we have retrieved that data.
- */
-@NotThreadSafe
-public class BasicIndexingConverter {
- private static final Logger LOG = LoggerFactory.getLogger(BasicIndexingConverter.class);
-
- private static final SearchCounter NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS =
- SearchCounter.export("num_nullcast_feature_flag_set_tweets");
- private static final SearchCounter NUM_NULLCAST_TWEETS =
- SearchCounter.export("num_nullcast_tweets");
- private static final SearchCounter NUM_NON_NULLCAST_TWEETS =
- SearchCounter.export("num_non_nullcast_tweets");
- private static final SearchCounter ADJUSTED_BAD_CREATED_AT_COUNTER =
- SearchCounter.export("adjusted_incorrect_created_at_timestamp");
- private static final SearchCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS =
- SearchCounter.export("inconsistent_tweet_id_and_created_at_ms");
- private static final SearchCounter NUM_SELF_THREAD_TWEETS =
- SearchCounter.export("num_self_thread_tweets");
- private static final SearchCounter NUM_EXCLUSIVE_TWEETS =
- SearchCounter.export("num_exclusive_tweets");
-
- // If a tweet carries a timestamp smaller than this threshold, we consider the timestamp invalid,
- // because Twitter did not exist before Sun, 01 Jan 2006 00:00:00 GMT.
- private static final long VALID_CREATION_TIME_THRESHOLD_MILLIS =
- new DateTime(2006, 1, 1, 0, 0, 0, DateTimeZone.UTC).getMillis();
-
- private final EncodedFeatureBuilder featureBuilder;
- private final Schema schema;
- private final EarlybirdCluster cluster;
-
- /**
-  * Creates a converter for the given schema and cluster, with its own EncodedFeatureBuilder.
-  *
-  * @param schema the schema whose snapshot is taken on each conversion
-  * @param cluster the Earlybird cluster documents are built for
-  */
- public BasicIndexingConverter(Schema schema, EarlybirdCluster cluster) {
-   this.schema = schema;
-   this.cluster = cluster;
-   this.featureBuilder = new EncodedFeatureBuilder();
- }
-
- /**
-  * Converts a TwitterMessage to ThriftVersionedEvents, a generic data structure that can be
-  * consumed by Earlybird directly. One INSERT indexing event is built per penguin version,
-  * all from the same schema snapshot.
-  *
-  * @param message the message to convert; must not be null
-  * @param strict forwarded unchanged to the per-version document builder
-  * @param penguinVersions the penguin versions to build a document for; must not be null
-  * @return the versioned events, keyed by each penguin version's byte value
-  * @throws IOException propagated from building a document
-  */
- public ThriftVersionedEvents convertMessageToThrift(
-     TwitterMessage message,
-     boolean strict,
-     List<PenguinVersion> penguinVersions) throws IOException {
-   Preconditions.checkNotNull(message);
-   Preconditions.checkNotNull(penguinVersions);
-
-   ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
-       .setId(message.getId());
-
-   ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
-
-   for (PenguinVersion penguinVersion : penguinVersions) {
-     ThriftDocument document =
-         buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);
-
-     ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
-         .setDocument(document)
-         .setEventType(ThriftIndexingEventType.INSERT)
-         .setSortId(message.getId());
-     // The uid is only set when the message carries a from-user id; ifPresent states that
-     // side effect directly instead of discarding the result of map().
-     message.getFromUserTwitterId().ifPresent(thriftIndexingEvent::setUid);
-     versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
-   }
-
-   return versionedEvents;
- }
-
- /**
- * Builds the ThriftDocument for one penguin version: creates the tweet features, builds the
- * basic fields, then lets each field-group builder add to the same document builder.
- *
- * @param schemaSnapshot the schema snapshot shared by all penguin versions of this message
- * @param message the message being converted
- * @param strict forwarded to buildRetweetAndReplyFields
- * @param penguinVersion the penguin version to build the document for
- * @return the built document
- * @throws IOException declared by this method; the visible code does not show which call raises it
- */
- private ThriftDocument buildDocumentForPenguinVersion(
- ImmutableSchemaInterface schemaSnapshot,
- TwitterMessage message,
- boolean strict,
- PenguinVersion penguinVersion) throws IOException {
- TweetFeatureWithEncodeFeatures tweetFeature =
- featureBuilder.createTweetFeaturesFromTwitterMessage(
- message, penguinVersion, schemaSnapshot);
-
- // buildBasicFields creates the builder; every call below mutates that same builder.
- EarlybirdThriftDocumentBuilder builder =
- buildBasicFields(message, schemaSnapshot, cluster, tweetFeature);
-
- buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion);
- buildGeoFields(builder, message, tweetFeature.versionedFeatures);
- buildRetweetAndReplyFields(builder, message, strict);
- buildQuotesFields(builder, message);
- buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures);
- buildAnnotationFields(builder, message);
- buildNormalizedMinEngagementFields(builder, tweetFeature.encodedFeatures, cluster);
- buildDirectedAtFields(builder, message);
-
- builder.withSpaceIdFields(message.getSpaceIds());
-
- return builder.build();
- }
-
- /**
- * Build the basic fields for a tweet: id, created-at, tweet signature, conversation id,
- * composer source, verified/offensive/nullcast/self-thread/exclusive markers and language codes.
- *
- * @param message the message supplying the tweet data
- * @param schemaSnapshot the schema snapshot handed to the document builder
- * @param cluster the cluster; decides whether missing extended features are created
- * @param tweetFeature the encoded, extended-encoded and versioned features of the tweet
- * @return a new document builder populated with the basic fields
- */
- public static EarlybirdThriftDocumentBuilder buildBasicFields(
- TwitterMessage message,
- ImmutableSchemaInterface schemaSnapshot,
- EarlybirdCluster cluster,
- TweetFeatureWithEncodeFeatures tweetFeature) {
- // When no extended features were supplied, create a new set for clusters for which
- // isTwitterMemoryFormatCluster is true.
- EarlybirdEncodedFeatures extendedEncodedFeatures = tweetFeature.extendedEncodedFeatures;
- if (extendedEncodedFeatures == null && EarlybirdCluster.isTwitterMemoryFormatCluster(cluster)) {
- extendedEncodedFeatures = EarlybirdEncodedFeatures.newEncodedTweetFeatures(
- schemaSnapshot, EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD);
- }
- EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
- tweetFeature.encodedFeatures,
- extendedEncodedFeatures,
- new EarlybirdFieldConstants(),
- schemaSnapshot);
-
- builder.withID(message.getId());
-
- // A missing date is treated as 0 before the timestamp is passed through the fix-up helper.
- final Date createdAt = message.getDate();
- long createdAtMs = createdAt == null ? 0L : createdAt.getTime();
-
- createdAtMs = fixCreatedAtTimeStampIfNecessary(message.getId(), createdAtMs);
-
- // Created-at is only set for a positive timestamp, converted from milliseconds to
- // seconds and narrowed to int.
- if (createdAtMs > 0L) {
- builder.withCreatedAt((int) (createdAtMs / 1000));
- }
-
- builder.withTweetSignature(tweetFeature.versionedFeatures.getTweetSignature());
-
- // The CSF is written for any positive conversation id.
- if (message.getConversationId() > 0) {
- long conversationId = message.getConversationId();
- builder.withLongField(
- EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(), conversationId);
- // We only index conversation ID when it is different from the tweet ID.
- if (message.getId() != conversationId) {
- builder.withLongField(
- EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName(), conversationId);
- }
- }
-
- // Record the composer source value, plus a dedicated flag when the source is CAMERA.
- if (message.getComposerSource().isPresent()) {
- ComposerSource composerSource = message.getComposerSource().get();
- builder.withIntField(
- EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName(), composerSource.getValue());
- if (composerSource == ComposerSource.CAMERA) {
- builder.withCameraComposerSourceFlag();
- }
- }
-
- // Flags already set on the encoded features are mirrored as filter terms / builder flags.
- EarlybirdEncodedFeatures encodedFeatures = tweetFeature.encodedFeatures;
- if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) {
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM);
- }
- if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG)) {
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM);
- }
-
- if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) {
- builder.withOffensiveFlag();
- }
-
- // The nullcast filter term follows the message; the counters track both the message
- // value and, separately, the encoded-feature flag.
- if (message.getNullcast()) {
- NUM_NULLCAST_TWEETS.increment();
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.NULLCAST_FILTER_TERM);
- } else {
- NUM_NON_NULLCAST_TWEETS.increment();
- }
- if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG)) {
- NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS.increment();
- }
- if (message.isSelfThread()) {
- builder.addFilterInternalFieldTerm(
- EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM);
- NUM_SELF_THREAD_TWEETS.increment();
- }
-
- // Exclusive tweets get a filter term and the exclusive conversation author id as a CSF.
- if (message.isExclusive()) {
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.EXCLUSIVE_FILTER_TERM);
- builder.withLongField(
- EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
- message.getExclusiveConversationAuthorId());
- NUM_EXCLUSIVE_TWEETS.increment();
- }
-
- builder.withLanguageCodes(message.getLanguage(), message.getBCP47LanguageTag());
-
- return builder;
- }
-
- /**
- * Build the user fields.
- */
- public static void buildUserFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- VersionedTweetFeatures versionedTweetFeatures,
- PenguinVersion penguinVersion) {
- // 1. Set all the from user fields.
- if (message.getFromUserTwitterId().isPresent()) {
- builder.withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(),
- message.getFromUserTwitterId().get())
- // CSF
- .withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
- message.getFromUserTwitterId().get());
- } else {
- LOG.warn("fromUserTwitterId is not set in TwitterMessage! Status id: " + message.getId());
- }
-
- if (message.getFromUserScreenName().isPresent()) {
- String fromUser = message.getFromUserScreenName().get();
- String normalizedFromUser =
- NormalizerHelper.normalizeWithUnknownLocale(fromUser, penguinVersion);
-
- builder
- .withWhiteSpaceTokenizedScreenNameField(
- EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(),
- normalizedFromUser)
- .withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(),
- normalizedFromUser);
-
- if (message.getTokenizedFromUserScreenName().isPresent()) {
- builder.withCamelCaseTokenizedScreenNameField(
- EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(),
- fromUser,
- normalizedFromUser,
- message.getTokenizedFromUserScreenName().get());
- }
- }
-
- Optional toUserScreenName = message.getToUserLowercasedScreenName();
- if (toUserScreenName.isPresent() && !toUserScreenName.get().isEmpty()) {
- builder.withStringField(
- EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(),
- NormalizerHelper.normalizeWithUnknownLocale(toUserScreenName.get(), penguinVersion));
- }
-
- if (versionedTweetFeatures.isSetUserDisplayNameTokenStreamText()) {
- builder.withTokenStreamField(EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(),
- versionedTweetFeatures.getUserDisplayNameTokenStreamText(),
- versionedTweetFeatures.getUserDisplayNameTokenStream());
- }
- }
-
- /**
- * Build the geo fields.
- */
- public static void buildGeoFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- VersionedTweetFeatures versionedTweetFeatures) {
- double lat = GeoUtil.ILLEGAL_LATLON;
- double lon = GeoUtil.ILLEGAL_LATLON;
- if (message.getGeoLocation() != null) {
- GeoObject location = message.getGeoLocation();
- builder.withGeoField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(),
- location.getLatitude(), location.getLongitude(), location.getAccuracy());
-
- if (location.getSource() != null) {
- builder.withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
- EarlybirdFieldConstants.formatGeoType(location.getSource()));
- }
-
- if (GeoUtil.validateGeoCoordinates(location.getLatitude(), location.getLongitude())) {
- lat = location.getLatitude();
- lon = location.getLongitude();
- }
- }
-
- // See SEARCH-14317 for investigation on how much space geo filed is used in archive cluster.
- // In lucene archives, this CSF is needed regardless of whether geoLocation is set.
- builder.withLatLonCSF(lat, lon);
-
- if (versionedTweetFeatures.isSetTokenizedPlace()) {
- Place place = versionedTweetFeatures.getTokenizedPlace();
- Preconditions.checkArgument(place.isSetId(), "Place ID not set for tweet "
- + message.getId());
- Preconditions.checkArgument(place.isSetFullName(),
- "Place full name not set for tweet " + message.getId());
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName());
- builder
- .withStringField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName(), place.getId())
- .withStringField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(),
- place.getFullName());
- if (place.isSetCountryCode()) {
- builder.withStringField(EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName(),
- place.getCountryCode());
- }
- }
-
- if (versionedTweetFeatures.isSetTokenizedProfileGeoEnrichment()) {
- ProfileGeoEnrichment profileGeoEnrichment =
- versionedTweetFeatures.getTokenizedProfileGeoEnrichment();
- Preconditions.checkArgument(
- profileGeoEnrichment.isSetPotentialLocations(),
- "ProfileGeoEnrichment.potentialLocations not set for tweet "
- + message.getId());
- List potentialLocations = profileGeoEnrichment.getPotentialLocations();
- Preconditions.checkArgument(
- !potentialLocations.isEmpty(),
- "Found tweet with an empty ProfileGeoEnrichment.potentialLocations: "
- + message.getId());
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PROFILE_GEO_FILTER_TERM);
- for (PotentialLocation potentialLocation : potentialLocations) {
- if (potentialLocation.isSetCountryCode()) {
- builder.withStringField(
- EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName(),
- potentialLocation.getCountryCode());
- }
- if (potentialLocation.isSetRegion()) {
- builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(),
- potentialLocation.getRegion());
- }
- if (potentialLocation.isSetLocality()) {
- builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(),
- potentialLocation.getLocality());
- }
- }
- }
-
- builder.withPlacesField(message.getPlaces());
- }
-
- /**
- * Build the retweet and reply fields.
- */
- public static void buildRetweetAndReplyFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- boolean strict) {
- long retweetUserIdVal = -1;
- long sharedStatusIdVal = -1;
- if (message.getRetweetMessage() != null) {
- if (message.getRetweetMessage().getSharedId() != null) {
- sharedStatusIdVal = message.getRetweetMessage().getSharedId();
- }
- if (message.getRetweetMessage().hasSharedUserTwitterId()) {
- retweetUserIdVal = message.getRetweetMessage().getSharedUserTwitterId();
- }
- }
-
- long inReplyToStatusIdVal = -1;
- long inReplyToUserIdVal = -1;
- if (message.isReply()) {
- if (message.getInReplyToStatusId().isPresent()) {
- inReplyToStatusIdVal = message.getInReplyToStatusId().get();
- }
- if (message.getToUserTwitterId().isPresent()) {
- inReplyToUserIdVal = message.getToUserTwitterId().get();
- }
- }
-
- buildRetweetAndReplyFields(
- retweetUserIdVal,
- sharedStatusIdVal,
- inReplyToStatusIdVal,
- inReplyToUserIdVal,
- strict,
- builder);
- }
-
- /**
- * Build the quotes fields.
- */
- public static void buildQuotesFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message) {
- if (message.getQuotedMessage() != null) {
- TwitterQuotedMessage quoted = message.getQuotedMessage();
- if (quoted != null && quoted.getQuotedStatusId() > 0 && quoted.getQuotedUserId() > 0) {
- builder.withQuote(quoted.getQuotedStatusId(), quoted.getQuotedUserId());
- }
- }
- }
-
- /**
- * Build directed at field.
- */
- public static void buildDirectedAtFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message) {
- if (message.getDirectedAtUserId().isPresent() && message.getDirectedAtUserId().get() > 0) {
- builder.withDirectedAtUser(message.getDirectedAtUserId().get());
- builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM);
- }
- }
-
- /**
- * Build the versioned features for a tweet.
- */
- public static void buildVersionedFeatureFields(
- EarlybirdThriftDocumentBuilder builder,
- VersionedTweetFeatures versionedTweetFeatures) {
- builder
- .withHashtagsField(versionedTweetFeatures.getHashtags())
- .withMentionsField(versionedTweetFeatures.getMentions())
- .withStocksFields(versionedTweetFeatures.getStocks())
- .withResolvedLinksText(versionedTweetFeatures.getNormalizedResolvedUrlText())
- .withTokenStreamField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(),
- versionedTweetFeatures.getTweetTokenStreamText(),
- versionedTweetFeatures.isSetTweetTokenStream()
- ? versionedTweetFeatures.getTweetTokenStream() : null)
- .withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
- versionedTweetFeatures.getSource())
- .withStringField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(),
- versionedTweetFeatures.getNormalizedSource());
-
- // Internal fields for smileys and question marks
- if (versionedTweetFeatures.hasPositiveSmiley) {
- builder.withStringField(
- EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
- EarlybirdFieldConstant.HAS_POSITIVE_SMILEY);
- }
- if (versionedTweetFeatures.hasNegativeSmiley) {
- builder.withStringField(
- EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
- EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY);
- }
- if (versionedTweetFeatures.hasQuestionMark) {
- builder.withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(),
- EarlybirdThriftDocumentBuilder.QUESTION_MARK);
- }
- }
-
- /**
- * Build the escherbird annotations for a tweet.
- */
- public static void buildAnnotationFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message) {
- List escherbirdAnnotations =
- message.getEscherbirdAnnotations();
- if (CollectionUtils.isEmpty(escherbirdAnnotations)) {
- return;
- }
-
- builder.addFacetSkipList(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName());
-
- for (TwitterMessage.EscherbirdAnnotation annotation : escherbirdAnnotations) {
- String groupDomainEntity = String.format("%d.%d.%d",
- annotation.groupId, annotation.domainId, annotation.entityId);
- String domainEntity = String.format("%d.%d", annotation.domainId, annotation.entityId);
- String entity = String.format("%d", annotation.entityId);
-
- builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
- groupDomainEntity);
- builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
- domainEntity);
- builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
- entity);
- }
- }
-
- /**
- * Build the correct ThriftIndexingEvent's fields based on retweet and reply status.
- */
- public static void buildRetweetAndReplyFields(
- long retweetUserIdVal,
- long sharedStatusIdVal,
- long inReplyToStatusIdVal,
- long inReplyToUserIdVal,
- boolean strict,
- EarlybirdThriftDocumentBuilder builder) {
- Optional retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0);
- Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0);
- Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0);
- Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0);
-
- // We have six combinations here. A Tweet can be
- // 1) a reply to another tweet (then it has both in-reply-to-user-id and
- // in-reply-to-status-id set),
- // 2) directed-at a user (then it only has in-reply-to-user-id set),
- // 3) not a reply at all.
- // Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and
- // retweet-status-id set).
- //
- // We want to set some fields unconditionally, and some fields (reference-author-id and
- // shared-status-id) depending on the reply/retweet combination.
- //
- // 1. Normal tweet (not a reply, not a retweet). None of the fields should be set.
- //
- // 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set).
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id
- // IS_REPLY_FLAG should be set
- //
- // 3. Directed-at a user (only in-reply-to-user-id is set).
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should be set
- //
- // 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id
- // IS_RETWEET_FLAG should be set
- //
- // 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set,
- // retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!)
- // IS_RETWEET_FLAG should be set
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should NOT be set
- //
- // 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set,
- // retweet-user-id and retweet-status-id are set).
- // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id
- // SHARED_STATUS_ID_CSF should be set to retweet-status-id
- // IS_RETWEET_FLAG should be set
- // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id
- // IS_REPLY_FLAG should NOT be set
- //
- // In other words:
- // SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to
- // retweet-status-id, otherwise if it's a reply to a tweet, it should be set to
- // in-reply-to-status-id.
-
- Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent());
-
- if (retweetUserId.isPresent()) {
- builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get());
-
- if (inReplyToUserId.isPresent()) {
- // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply.
- builder.withInReplyToUserID(inReplyToUserId.get());
- }
- } else {
- // If this is a retweet of a reply, we don't want to mark it as a reply, or override fields
- // set by the retweet logic.
- // If we are in this branch, this is not a retweet. Potentially, we set the reply flag,
- // and override shared-status-id and reference-author-id.
-
- if (inReplyToStatusId.isPresent()) {
- if (strict) {
- // Enforcing that if this is a reply to a tweet, then it also has a replied-to user.
- Preconditions.checkState(inReplyToUserId.isPresent());
- }
- builder.withReplyFlag();
- builder.withLongField(
- EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
- inReplyToStatusId.get());
- builder.withLongField(
- EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName(),
- inReplyToStatusId.get());
- }
- if (inReplyToUserId.isPresent()) {
- builder.withReplyFlag();
- builder.withInReplyToUserID(inReplyToUserId.get());
- }
- }
- }
-
- /**
- * Build the engagement fields.
- */
- public static void buildNormalizedMinEngagementFields(
- EarlybirdThriftDocumentBuilder builder,
- EarlybirdEncodedFeatures encodedFeatures,
- EarlybirdCluster cluster) throws IOException {
- if (EarlybirdCluster.isArchive(cluster)) {
- int favoriteCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);
- int retweetCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
- int replyCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
- builder
- .withNormalizedMinEngagementField(
- EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
- .getFieldName(),
- favoriteCount);
- builder
- .withNormalizedMinEngagementField(
- EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
- .getFieldName(),
- retweetCount);
- builder
- .withNormalizedMinEngagementField(
- EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD
- .getFieldName(),
- replyCount);
- }
- }
-
- /**
- * As seen in SEARCH-5617, we sometimes have incorrect createdAt. This method tries to fix them
- * by extracting creation time from snowflake when possible.
- */
- public static long fixCreatedAtTimeStampIfNecessary(long id, long createdAtMs) {
- if (createdAtMs < VALID_CREATION_TIME_THRESHOLD_MILLIS
- && id > SnowflakeIdParser.SNOWFLAKE_ID_LOWER_BOUND) {
- // This tweet has a snowflake ID, and we can extract timestamp from the ID.
- ADJUSTED_BAD_CREATED_AT_COUNTER.increment();
- return SnowflakeIdParser.getTimestampFromTweetId(id);
- } else if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(id, createdAtMs)) {
- LOG.error(
- "Found inconsistent tweet ID and created at timestamp: [statusID={}], [createdAtMs={}]",
- id, createdAtMs);
- INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS.increment();
- }
-
- return createdAtMs;
- }
-}
diff --git a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx
new file mode 100644
index 000000000..fe5e99466
Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx differ
diff --git a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java
deleted file mode 100644
index 1ed40bcd4..000000000
--- a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java
+++ /dev/null
@@ -1,99 +0,0 @@
-package com.twitter.search.common.converter.earlybird;
-
-import java.io.IOException;
-import java.util.List;
-
-import javax.annotation.concurrent.NotThreadSafe;
-
-import com.google.common.base.Preconditions;
-
-import com.twitter.common_internal.text.version.PenguinVersion;
-import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
-import com.twitter.search.common.relevance.entities.TwitterMessage;
-import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
-import com.twitter.search.common.schema.base.Schema;
-import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
-import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
-import com.twitter.search.common.schema.thriftjava.ThriftDocument;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
-
-/**
- * CombinedIndexingConverter builds objects from TwitterMessage to ThriftVersionedEvent.
- *
- * It is used in tests and in offline jobs, so all data is available on the TwitterMessage. This
- * means that we don't need to split up the ThriftVersionedEvents into basic events and update
- * events, like we do in the realtime pipeline using the BasicIndexingConverter and the
- * DelayedIndexingConverter.
- */
-@NotThreadSafe
-public class CombinedIndexingConverter {
- private final EncodedFeatureBuilder featureBuilder;
- private final Schema schema;
- private final EarlybirdCluster cluster;
-
- public CombinedIndexingConverter(Schema schema, EarlybirdCluster cluster) {
- this.featureBuilder = new EncodedFeatureBuilder();
- this.schema = schema;
- this.cluster = cluster;
- }
-
- /**
- * Converts a TwitterMessage to a Thrift representation.
- */
- public ThriftVersionedEvents convertMessageToThrift(
- TwitterMessage message,
- boolean strict,
- List penguinVersions) throws IOException {
- Preconditions.checkNotNull(message);
- Preconditions.checkNotNull(penguinVersions);
-
- ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents()
- .setId(message.getId());
-
- ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
-
- for (PenguinVersion penguinVersion : penguinVersions) {
- ThriftDocument document =
- buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion);
-
- ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent()
- .setDocument(document)
- .setEventType(ThriftIndexingEventType.INSERT)
- .setSortId(message.getId());
- message.getFromUserTwitterId().map(thriftIndexingEvent::setUid);
- versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent);
- }
-
- return versionedEvents;
- }
-
- private ThriftDocument buildDocumentForPenguinVersion(
- ImmutableSchemaInterface schemaSnapshot,
- TwitterMessage message,
- boolean strict,
- PenguinVersion penguinVersion) throws IOException {
- EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures tweetFeature =
- featureBuilder.createTweetFeaturesFromTwitterMessage(
- message, penguinVersion, schemaSnapshot);
-
- EarlybirdThriftDocumentBuilder builder =
- BasicIndexingConverter.buildBasicFields(message, schemaSnapshot, cluster, tweetFeature);
-
- BasicIndexingConverter
- .buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion);
- BasicIndexingConverter.buildGeoFields(builder, message, tweetFeature.versionedFeatures);
- DelayedIndexingConverter.buildURLFields(builder, message, tweetFeature.encodedFeatures);
- BasicIndexingConverter.buildRetweetAndReplyFields(builder, message, strict);
- BasicIndexingConverter.buildQuotesFields(builder, message);
- BasicIndexingConverter.buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures);
- DelayedIndexingConverter.buildCardFields(builder, message, penguinVersion);
- BasicIndexingConverter.buildAnnotationFields(builder, message);
- BasicIndexingConverter.buildNormalizedMinEngagementFields(
- builder, tweetFeature.encodedFeatures, cluster);
- DelayedIndexingConverter.buildNamedEntityFields(builder, message);
- BasicIndexingConverter.buildDirectedAtFields(builder, message);
-
- return builder.build();
- }
-}
diff --git a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx
new file mode 100644
index 000000000..5e7f3dfc1
Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx differ
diff --git a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java
deleted file mode 100644
index 0ed3ac134..000000000
--- a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java
+++ /dev/null
@@ -1,594 +0,0 @@
-package com.twitter.search.common.converter.earlybird;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Set;
-import javax.annotation.Nullable;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
-import org.apache.commons.lang.StringUtils;
-import org.apache.http.annotation.NotThreadSafe;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.twitter.common.text.token.TokenizedCharSequenceStream;
-import com.twitter.common.text.util.TokenStreamSerializer;
-import com.twitter.common_internal.text.version.PenguinVersion;
-import com.twitter.cuad.ner.plain.thriftjava.NamedEntity;
-import com.twitter.decider.Decider;
-import com.twitter.search.common.constants.SearchCardType;
-import com.twitter.search.common.decider.DeciderUtil;
-import com.twitter.search.common.indexing.thriftjava.SearchCard2;
-import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
-import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
-import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl;
-import com.twitter.search.common.relevance.entities.TwitterMessage;
-import com.twitter.search.common.relevance.entities.TwitterMessageUser;
-import com.twitter.search.common.relevance.features.TweetTextFeatures;
-import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
-import com.twitter.search.common.schema.base.Schema;
-import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
-import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
-import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder;
-import com.twitter.search.common.schema.thriftjava.ThriftDocument;
-import com.twitter.search.common.schema.thriftjava.ThriftField;
-import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
-import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType;
-import com.twitter.search.common.util.lang.ThriftLanguageUtil;
-import com.twitter.search.common.util.text.LanguageIdentifierHelper;
-import com.twitter.search.common.util.text.NormalizerHelper;
-import com.twitter.search.common.util.text.TokenizerHelper;
-import com.twitter.search.common.util.text.TokenizerResult;
-import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
-import com.twitter.service.spiderduck.gen.MediaTypes;
-import com.twitter.search.common.metrics.SearchCounter;
-
-/**
- * Create and populate ThriftVersionedEvents from the URL data, card data, and named entities
- * contained in a TwitterMessage. This data is delayed because these services take a few seconds
- * to process tweets, and we want to send the basic data available in the BasicIndexingConverter as
- * soon as possible, so we send the additional data a few seconds later, as an update.
- *
- * Prefer to add data and processing to the BasicIndexingConverter when possible. Only add data here
- * if your data source _requires_ data from an external service AND the external service takes at
- * least a few seconds to process new tweets.
- */
-@NotThreadSafe
-public class DelayedIndexingConverter {
- private static final SearchCounter NUM_TWEETS_WITH_CARD_URL =
- SearchCounter.export("tweets_with_card_url");
- private static final SearchCounter NUM_TWEETS_WITH_NUMERIC_CARD_URI =
- SearchCounter.export("tweets_with_numeric_card_uri");
- private static final SearchCounter NUM_TWEETS_WITH_INVALID_CARD_URI =
- SearchCounter.export("tweets_with_invalid_card_uri");
- private static final SearchCounter TOTAL_URLS =
- SearchCounter.export("total_urls_on_tweets");
- private static final SearchCounter MEDIA_URLS_ON_TWEETS =
- SearchCounter.export("media_urls_on_tweets");
- private static final SearchCounter NON_MEDIA_URLS_ON_TWEETS =
- SearchCounter.export("non_media_urls_on_tweets");
- public static final String INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER =
- "index_url_description_and_title";
-
- private static class ThriftDocumentWithEncodedTweetFeatures {
- private final ThriftDocument document;
- private final EarlybirdEncodedFeatures encodedFeatures;
-
- public ThriftDocumentWithEncodedTweetFeatures(ThriftDocument document,
- EarlybirdEncodedFeatures encodedFeatures) {
- this.document = document;
- this.encodedFeatures = encodedFeatures;
- }
-
- public ThriftDocument getDocument() {
- return document;
- }
-
- public EarlybirdEncodedFeatures getEncodedFeatures() {
- return encodedFeatures;
- }
- }
-
- // The list of all the encoded_tweet_features flags that might be updated by this converter.
- // No extended_encoded_tweet_features are updated (otherwise they should be in this list too).
- private static final List UPDATED_FLAGS =
- Lists.newArrayList(
- EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT,
- EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE,
- EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE,
- EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG,
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG
- );
-
- private static final Logger LOG = LoggerFactory.getLogger(DelayedIndexingConverter.class);
- private static final String AMPLIFY_CARD_NAME = "amplify";
- private static final String PLAYER_CARD_NAME = "player";
-
- private final EncodedFeatureBuilder featureBuilder = new EncodedFeatureBuilder();
-
- private final Schema schema;
- private final Decider decider;
-
- public DelayedIndexingConverter(Schema schema, Decider decider) {
- this.schema = schema;
- this.decider = decider;
- }
-
- /**
- * Converts the given message to two ThriftVersionedEvents instances: the first one is a feature
- * update event for all link and card related flags, and the second one is the append event that
- * might contain updates to all link and card related fields.
- *
- * We need to split the updates to fields and flags into two separate events because:
- * - When a tweet is created, earlybirds get the "main" event, which does not have resolved URLs.
- * - Then the earlybirds might get a feature update from the signal ingesters, marking the tweet
- * as spam.
- * - Then the ingesters resolve the URLs and send an update event. At this point, the ingesters
- * need to send updates for link-related flags too (HAS_LINK_FLAG, etc.). And there are a few
- * ways to do this:
- * 1. Encode these flags into encoded_tweet_features and extended_encoded_tweet_features and
- * add these fields to the update event. The problem is that earlybirds will then override
- * the encoded_tweet_features ane extended_encoded_tweet_features fields in the index for
- * this tweet, which will override the feature update the earlybirds got earlier, which
- * means that a spammy tweet might no longer be marked as spam in the index.
- * 2. Send updates only for the flags that might've been updated by this converter. Since
- * ThriftIndexingEvent already has a map of field -> value, it seems like the natural place
- * to add these updates to. However, earlybirds can correctly process flag updates only if
- * they come in a feature update event (PARTIAL_UPDATE). So we need to send the field
- * updates in an OUT_OF_ORDER_UPDATE event, and the flag updates in a PARTIAL_UPDATE event.
- *
- * We need to send the feature update event before the append event to avoid issues like the one
- * in SEARCH-30919 where tweets were returned from the card name field index before the HAS_CARD
- * feature was updated to true.
- *
- * @param message The TwitterMessage to convert.
- * @param penguinVersions The Penguin versions for which ThriftIndexingEvents should be created.
- * @return An out of order update event for all link- and card-related fields and a feature update
- * event for all link- and card-related flags.
- */
- public List convertMessageToOutOfOrderAppendAndFeatureUpdate(
- TwitterMessage message, List penguinVersions) {
- Preconditions.checkNotNull(message);
- Preconditions.checkNotNull(penguinVersions);
-
- ThriftVersionedEvents featureUpdateVersionedEvents = new ThriftVersionedEvents();
- ThriftVersionedEvents outOfOrderAppendVersionedEvents = new ThriftVersionedEvents();
- ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot();
-
- for (PenguinVersion penguinVersion : penguinVersions) {
- ThriftDocumentWithEncodedTweetFeatures documentWithEncodedFeatures =
- buildDocumentForPenguinVersion(schemaSnapshot, message, penguinVersion);
-
- ThriftIndexingEvent featureUpdateThriftIndexingEvent = new ThriftIndexingEvent();
- featureUpdateThriftIndexingEvent.setEventType(ThriftIndexingEventType.PARTIAL_UPDATE);
- featureUpdateThriftIndexingEvent.setUid(message.getId());
- featureUpdateThriftIndexingEvent.setDocument(
- buildFeatureUpdateDocument(documentWithEncodedFeatures.getEncodedFeatures()));
- featureUpdateVersionedEvents.putToVersionedEvents(
- penguinVersion.getByteValue(), featureUpdateThriftIndexingEvent);
-
- ThriftIndexingEvent outOfOrderAppendThriftIndexingEvent = new ThriftIndexingEvent();
- outOfOrderAppendThriftIndexingEvent.setDocument(documentWithEncodedFeatures.getDocument());
- outOfOrderAppendThriftIndexingEvent.setEventType(ThriftIndexingEventType.OUT_OF_ORDER_APPEND);
- message.getFromUserTwitterId().ifPresent(outOfOrderAppendThriftIndexingEvent::setUid);
- outOfOrderAppendThriftIndexingEvent.setSortId(message.getId());
- outOfOrderAppendVersionedEvents.putToVersionedEvents(
- penguinVersion.getByteValue(), outOfOrderAppendThriftIndexingEvent);
- }
-
- featureUpdateVersionedEvents.setId(message.getId());
- outOfOrderAppendVersionedEvents.setId(message.getId());
-
- return Lists.newArrayList(featureUpdateVersionedEvents, outOfOrderAppendVersionedEvents);
- }
-
- private ThriftDocument buildFeatureUpdateDocument(EarlybirdEncodedFeatures encodedFeatures) {
- ThriftDocument document = new ThriftDocument();
- for (EarlybirdFieldConstants.EarlybirdFieldConstant flag : UPDATED_FLAGS) {
- ThriftField field = new ThriftField();
- field.setFieldConfigId(flag.getFieldId());
- field.setFieldData(new ThriftFieldData().setIntValue(encodedFeatures.getFeatureValue(flag)));
- document.addToFields(field);
- }
- return document;
- }
-
- private ThriftDocumentWithEncodedTweetFeatures buildDocumentForPenguinVersion(
- ImmutableSchemaInterface schemaSnapshot,
- TwitterMessage message,
- PenguinVersion penguinVersion) {
-
- EarlybirdEncodedFeatures encodedFeatures = featureBuilder.createTweetFeaturesFromTwitterMessage(
- message, penguinVersion, schemaSnapshot).encodedFeatures;
-
- EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
- encodedFeatures,
- null,
- new EarlybirdFieldConstants(),
- schemaSnapshot);
-
- builder.setAddLatLonCSF(false);
- builder.withID(message.getId());
- buildFieldsFromUrlInfo(builder, message, penguinVersion, encodedFeatures);
- buildCardFields(builder, message, penguinVersion);
- buildNamedEntityFields(builder, message);
- builder.withTweetSignature(message.getTweetSignature(penguinVersion));
-
- buildSpaceAdminAndTitleFields(builder, message, penguinVersion);
-
- builder.setAddEncodedTweetFeatures(false);
-
- return new ThriftDocumentWithEncodedTweetFeatures(builder.build(), encodedFeatures);
- }
-
- public static void buildNamedEntityFields(
- EarlybirdThriftDocumentBuilder builder, TwitterMessage message) {
- for (NamedEntity namedEntity : message.getNamedEntities()) {
- builder.withNamedEntity(namedEntity);
- }
- }
-
- private void buildFieldsFromUrlInfo(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- PenguinVersion penguinVersion,
- EarlybirdEncodedFeatures encodedFeatures) {
- // We need to update the RESOLVED_LINKS_TEXT_FIELD, since we might have new resolved URLs.
- // Use the same logic as in EncodedFeatureBuilder.java.
- TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
- String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
- builder.withResolvedLinksText(resolvedUrlsText);
-
- buildURLFields(builder, message, encodedFeatures);
- buildAnalyzedURLFields(builder, message, penguinVersion);
- }
-
- private void buildAnalyzedURLFields(
- EarlybirdThriftDocumentBuilder builder, TwitterMessage message, PenguinVersion penguinVersion
- ) {
- TOTAL_URLS.add(message.getExpandedUrls().size());
- if (DeciderUtil.isAvailableForRandomRecipient(
- decider,
- INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER)) {
- for (ThriftExpandedUrl expandedUrl : message.getExpandedUrls()) {
- /*
- Consumer Media URLs are added to the expanded URLs in
- TweetEventParserHelper.addMediaEntitiesToMessage. These Twitter.com media URLs contain
- the tweet text as the description and the title is " on Twitter". This is
- redundant information at best and misleading at worst. We will ignore these URLs to avoid
- polluting the url_description and url_title field as well as saving space.
- */
- if (!expandedUrl.isSetConsumerMedia() || !expandedUrl.isConsumerMedia()) {
- NON_MEDIA_URLS_ON_TWEETS.increment();
- if (expandedUrl.isSetDescription()) {
- buildTweetTokenizerTokenizedField(builder,
- EarlybirdFieldConstants.EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(),
- expandedUrl.getDescription(),
- penguinVersion);
- }
- if (expandedUrl.isSetTitle()) {
- buildTweetTokenizerTokenizedField(builder,
- EarlybirdFieldConstants.EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(),
- expandedUrl.getTitle(),
- penguinVersion);
- }
- } else {
- MEDIA_URLS_ON_TWEETS.increment();
- }
- }
- }
- }
-
- /**
- * Build the URL based fields from a tweet.
- */
- public static void buildURLFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- EarlybirdEncodedFeatures encodedFeatures
- ) {
- Map expandedUrlMap = message.getExpandedUrlMap();
-
- for (ThriftExpandedUrl expandedUrl : expandedUrlMap.values()) {
- if (expandedUrl.getMediaType() == MediaTypes.NATIVE_IMAGE) {
- EncodedFeatureBuilder.addPhotoUrl(message, expandedUrl.getCanonicalLastHopUrl());
- }
- }
-
- // now add all twitter photos links that came with the tweet's payload
- Map photos = message.getPhotoUrls();
- List photoURLs = new ArrayList<>();
- if (photos != null) {
- for (Map.Entry entry : photos.entrySet()) {
- TwitterPhotoUrl photo = new TwitterPhotoUrl(entry.getKey());
- String mediaUrl = entry.getValue();
- if (mediaUrl != null) {
- photo.setMediaUrl(mediaUrl);
- }
- photoURLs.add(photo);
- }
- }
-
- try {
- builder
- .withURLs(Lists.newArrayList(expandedUrlMap.values()))
- .withTwimgURLs(photoURLs);
- } catch (IOException ioe) {
- LOG.error("URL field creation threw an IOException", ioe);
- }
-
-
- if (encodedFeatures.isFlagSet(
- EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) {
- builder.withOffensiveFlag();
- }
- if (encodedFeatures.isFlagSet(
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG)) {
- builder.addFilterInternalFieldTerm(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CONSUMER_VIDEO_FILTER_TERM);
- }
- if (encodedFeatures.isFlagSet(
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG)) {
- builder.addFilterInternalFieldTerm(
- EarlybirdFieldConstants.EarlybirdFieldConstant.PRO_VIDEO_FILTER_TERM);
- }
- if (encodedFeatures.isFlagSet(EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG)) {
- builder.addFilterInternalFieldTerm(
- EarlybirdFieldConstants.EarlybirdFieldConstant.VINE_FILTER_TERM);
- }
- if (encodedFeatures.isFlagSet(
- EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG)) {
- builder.addFilterInternalFieldTerm(
- EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_FILTER_TERM);
- }
- }
-
- /**
- * Build the card information inside ThriftIndexingEvent's fields.
- */
- static void buildCardFields(EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- PenguinVersion penguinVersion) {
- if (message.hasCard()) {
- SearchCard2 card = buildSearchCardFromTwitterMessage(
- message,
- TweetTokenStreamSerializer.getTweetTokenStreamSerializer(),
- penguinVersion);
- buildCardFeatures(message.getId(), builder, card);
- }
- }
-
- private static SearchCard2 buildSearchCardFromTwitterMessage(
- TwitterMessage message,
- TokenStreamSerializer streamSerializer,
- PenguinVersion penguinVersion) {
- SearchCard2 card = new SearchCard2();
- card.setCardName(message.getCardName());
- if (message.getCardDomain() != null) {
- card.setCardDomain(message.getCardDomain());
- }
- if (message.getCardLang() != null) {
- card.setCardLang(message.getCardLang());
- }
- if (message.getCardUrl() != null) {
- card.setCardUrl(message.getCardUrl());
- }
-
- if (message.getCardTitle() != null && !message.getCardTitle().isEmpty()) {
- String normalizedTitle = NormalizerHelper.normalize(
- message.getCardTitle(), message.getLocale(), penguinVersion);
- TokenizerResult result = TokenizerHelper.tokenizeTweet(
- normalizedTitle, message.getLocale(), penguinVersion);
- TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
- tokenSeqStream.reset(result.tokenSequence);
- try {
- card.setCardTitleTokenStream(streamSerializer.serialize(tokenSeqStream));
- card.setCardTitleTokenStreamText(result.tokenSequence.toString());
- } catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize card title: "
- + result.tokenSequence);
- card.unsetCardTitleTokenStream();
- card.unsetCardTitleTokenStreamText();
- }
- }
- if (message.getCardDescription() != null && !message.getCardDescription().isEmpty()) {
- String normalizedDesc = NormalizerHelper.normalize(
- message.getCardDescription(), message.getLocale(), penguinVersion);
- TokenizerResult result = TokenizerHelper.tokenizeTweet(
- normalizedDesc, message.getLocale(), penguinVersion);
- TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
- tokenSeqStream.reset(result.tokenSequence);
- try {
- card.setCardDescriptionTokenStream(streamSerializer.serialize(tokenSeqStream));
- card.setCardDescriptionTokenStreamText(result.tokenSequence.toString());
- } catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize card description: "
- + result.tokenSequence);
- card.unsetCardDescriptionTokenStream();
- card.unsetCardDescriptionTokenStreamText();
- }
- }
-
- return card;
- }
-
- /**
- * Builds card features.
- */
- private static void buildCardFeatures(
- long tweetId, EarlybirdThriftDocumentBuilder builder, SearchCard2 card) {
- if (card == null) {
- return;
- }
- builder
- .withTokenStreamField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(),
- card.getCardTitleTokenStreamText(),
- card.isSetCardTitleTokenStream() ? card.getCardTitleTokenStream() : null)
- .withTokenStreamField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(),
- card.getCardDescriptionTokenStreamText(),
- card.isSetCardDescriptionTokenStream() ? card.getCardDescriptionTokenStream() : null)
- .withStringField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(),
- card.getCardName())
- .withIntField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
- SearchCardType.cardTypeFromStringName(card.getCardName()).getByteValue());
-
- if (card.getCardLang() != null) {
- builder.withStringField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG.getFieldName(),
- card.getCardLang()).withIntField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(),
- ThriftLanguageUtil.getThriftLanguageOf(card.getCardLang()).getValue());
- }
- if (card.getCardDomain() != null) {
- builder.withStringField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(),
- card.getCardDomain());
- }
- if (card.getCardUrl() != null) {
- NUM_TWEETS_WITH_CARD_URL.increment();
- if (card.getCardUrl().startsWith("card://")) {
- String suffix = card.getCardUrl().replace("card://", "");
- if (StringUtils.isNumeric(suffix)) {
- NUM_TWEETS_WITH_NUMERIC_CARD_URI.increment();
- builder.withLongField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(),
- Long.parseLong(suffix));
- LOG.debug(String.format(
- "Good card URL for tweet %s: %s",
- tweetId,
- card.getCardUrl()));
- } else {
- NUM_TWEETS_WITH_INVALID_CARD_URI.increment();
- LOG.debug(String.format(
- "Card URL starts with \"card://\" but followed by non-numeric for tweet %s: %s",
- tweetId,
- card.getCardUrl()));
- }
- }
- }
- if (isCardVideo(card)) {
- // Add into "internal" field so that this tweet is returned by filter:videos.
- builder.addFacetSkipList(
- EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
- }
- }
-
- /**
- * Determines if a card is a video.
- */
- private static boolean isCardVideo(@Nullable SearchCard2 card) {
- if (card == null) {
- return false;
- }
- return AMPLIFY_CARD_NAME.equalsIgnoreCase(card.getCardName())
- || PLAYER_CARD_NAME.equalsIgnoreCase(card.getCardName());
- }
-
- private void buildSpaceAdminAndTitleFields(
- EarlybirdThriftDocumentBuilder builder,
- TwitterMessage message,
- PenguinVersion penguinVersion) {
-
- buildSpaceAdminFields(builder, message.getSpaceAdmins(), penguinVersion);
-
- // build the space title field.
- buildTweetTokenizerTokenizedField(
- builder,
- EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(),
- message.getSpaceTitle(),
- penguinVersion);
- }
-
- private void buildSpaceAdminFields(
- EarlybirdThriftDocumentBuilder builder,
- Set spaceAdmins,
- PenguinVersion penguinVersion) {
-
- for (TwitterMessageUser spaceAdmin : spaceAdmins) {
- if (spaceAdmin.getScreenName().isPresent()) {
- // build screen name (aka handle) fields.
- String screenName = spaceAdmin.getScreenName().get();
- String normalizedScreenName =
- NormalizerHelper.normalizeWithUnknownLocale(screenName, penguinVersion);
-
- builder.withStringField(
- EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(),
- normalizedScreenName);
- builder.withWhiteSpaceTokenizedScreenNameField(
- EarlybirdFieldConstants
- .EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(),
- normalizedScreenName);
-
- if (spaceAdmin.getTokenizedScreenName().isPresent()) {
- builder.withCamelCaseTokenizedScreenNameField(
- EarlybirdFieldConstants
- .EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(),
- screenName,
- normalizedScreenName,
- spaceAdmin.getTokenizedScreenName().get());
- }
- }
-
- if (spaceAdmin.getDisplayName().isPresent()) {
- buildTweetTokenizerTokenizedField(
- builder,
- EarlybirdFieldConstants
- .EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(),
- spaceAdmin.getDisplayName().get(),
- penguinVersion);
- }
- }
- }
-
- private void buildTweetTokenizerTokenizedField(
- EarlybirdThriftDocumentBuilder builder,
- String fieldName,
- String text,
- PenguinVersion penguinVersion) {
-
- if (StringUtils.isNotEmpty(text)) {
- Locale locale = LanguageIdentifierHelper
- .identifyLanguage(text);
- String normalizedText = NormalizerHelper.normalize(
- text, locale, penguinVersion);
- TokenizerResult result = TokenizerHelper
- .tokenizeTweet(normalizedText, locale, penguinVersion);
- TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
- tokenSeqStream.reset(result.tokenSequence);
- TokenStreamSerializer streamSerializer =
- TweetTokenStreamSerializer.getTweetTokenStreamSerializer();
- try {
- builder.withTokenStreamField(
- fieldName,
- result.tokenSequence.toString(),
- streamSerializer.serialize(tokenSeqStream));
- } catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize: " + text);
- }
- }
- }
-}
diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx
new file mode 100644
index 000000000..e5e13f68f
Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx differ
diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java
deleted file mode 100644
index c5d6b1c76..000000000
--- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java
+++ /dev/null
@@ -1,531 +0,0 @@
-package com.twitter.search.common.converter.earlybird;
-
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.Optional;
-import java.util.Set;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-import java.util.stream.Collectors;
-
-import com.google.common.base.Joiner;
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import org.apache.commons.lang.StringUtils;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.twitter.common.text.token.TokenizedCharSequence;
-import com.twitter.common.text.token.TokenizedCharSequenceStream;
-import com.twitter.common.text.util.TokenStreamSerializer;
-import com.twitter.common_internal.text.version.PenguinVersion;
-import com.twitter.search.common.indexing.thriftjava.Place;
-import com.twitter.search.common.indexing.thriftjava.PotentialLocation;
-import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment;
-import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
-import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
-import com.twitter.search.common.metrics.SearchCounter;
-import com.twitter.search.common.relevance.entities.PotentialLocationObject;
-import com.twitter.search.common.relevance.entities.TwitterMessage;
-import com.twitter.search.common.relevance.features.FeatureSink;
-import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
-import com.twitter.search.common.relevance.features.RelevanceSignalConstants;
-import com.twitter.search.common.relevance.features.TweetTextFeatures;
-import com.twitter.search.common.relevance.features.TweetTextQuality;
-import com.twitter.search.common.relevance.features.TweetUserFeatures;
-import com.twitter.search.common.schema.base.FeatureConfiguration;
-import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
-import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures;
-import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
-import com.twitter.search.common.util.lang.ThriftLanguageUtil;
-import com.twitter.search.common.util.text.LanguageIdentifierHelper;
-import com.twitter.search.common.util.text.NormalizerHelper;
-import com.twitter.search.common.util.text.SourceNormalizer;
-import com.twitter.search.common.util.text.TokenizerHelper;
-import com.twitter.search.common.util.text.TokenizerResult;
-import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
-import com.twitter.search.common.util.url.LinkVisibilityUtils;
-import com.twitter.search.common.util.url.NativeVideoClassificationUtils;
-import com.twitter.search.ingester.model.VisibleTokenRatioUtil;
-
-/**
- * EncodedFeatureBuilder helps to build encoded features for TwitterMessage.
- *
- * This is stateful so should only be used one tweet at a time
- */
-public class EncodedFeatureBuilder {
- private static final Logger LOG = LoggerFactory.getLogger(EncodedFeatureBuilder.class);
-
- private static final SearchCounter NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL =
- SearchCounter.export("tweets_with_invalid_tweet_id_in_photo_url");
-
- // TwitterTokenStream for converting TokenizedCharSequence into a stream for serialization
- // This is stateful so should only be used one tweet at a time
- private final TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream();
-
- // SUPPRESS CHECKSTYLE:OFF LineLength
- private static final Pattern TWITTER_PHOTO_PERMA_LINK_PATTERN =
- Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:\\?[^#]+)?(?:#!?\\/?)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)");
-
- private static final Pattern TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN =
- Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:#!?\\/)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)");
- // SUPPRESS CHECKSTYLE:ON LineLength
-
- private static final VisibleTokenRatioUtil VISIBLE_TOKEN_RATIO = new VisibleTokenRatioUtil();
-
- private static final Map SERIALIZE_FAILURE_COUNTERS_MAP =
- Maps.newEnumMap(PenguinVersion.class);
- static {
- for (PenguinVersion penguinVersion : PenguinVersion.values()) {
- SERIALIZE_FAILURE_COUNTERS_MAP.put(
- penguinVersion,
- SearchCounter.export(
- "tokenstream_serialization_failure_" + penguinVersion.name().toLowerCase()));
- }
- }
-
- public static class TweetFeatureWithEncodeFeatures {
- public final VersionedTweetFeatures versionedFeatures;
- public final EarlybirdEncodedFeatures encodedFeatures;
- public final EarlybirdEncodedFeatures extendedEncodedFeatures;
-
- public TweetFeatureWithEncodeFeatures(
- VersionedTweetFeatures versionedFeatures,
- EarlybirdEncodedFeatures encodedFeatures,
- EarlybirdEncodedFeatures extendedEncodedFeatures) {
- this.versionedFeatures = versionedFeatures;
- this.encodedFeatures = encodedFeatures;
- this.extendedEncodedFeatures = extendedEncodedFeatures;
- }
- }
-
- /**
- * Create tweet text features and the encoded features.
- *
- * @param message the tweet message
- * @param penguinVersion the based penguin version to create the features
- * @param schemaSnapshot the schema associated with the features
- * @return the text features and the encoded features
- */
- public TweetFeatureWithEncodeFeatures createTweetFeaturesFromTwitterMessage(
- TwitterMessage message,
- PenguinVersion penguinVersion,
- ImmutableSchemaInterface schemaSnapshot) {
- VersionedTweetFeatures versionedTweetFeatures = new VersionedTweetFeatures();
-
- // Write extendedPackedFeatures.
- EarlybirdEncodedFeatures extendedEncodedFeatures =
- createExtendedEncodedFeaturesFromTwitterMessage(message, penguinVersion, schemaSnapshot);
- if (extendedEncodedFeatures != null) {
- extendedEncodedFeatures
- .writeExtendedFeaturesToVersionedTweetFeatures(versionedTweetFeatures);
- }
-
- setSourceAndNormalizedSource(
- message.getStrippedSource(), versionedTweetFeatures, penguinVersion);
-
- TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
-
- ///////////////////////////////
- // Add hashtags and mentions
- textFeatures.getHashtags().forEach(versionedTweetFeatures::addToHashtags);
- textFeatures.getMentions().forEach(versionedTweetFeatures::addToMentions);
-
- ///////////////////////////////
- // Extract some extra information from the message text.
- // Index stock symbols with $ prepended
- textFeatures.getStocks().stream()
- .filter(stock -> stock != null)
- .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase()));
-
- // Question marks
- versionedTweetFeatures.setHasQuestionMark(textFeatures.hasQuestionMark());
- // Smileys
- versionedTweetFeatures.setHasPositiveSmiley(textFeatures.hasPositiveSmiley());
- versionedTweetFeatures.setHasNegativeSmiley(textFeatures.hasNegativeSmiley());
-
- TokenStreamSerializer streamSerializer =
- TweetTokenStreamSerializer.getTweetTokenStreamSerializer();
- TokenizedCharSequence tokenSeq = textFeatures.getTokenSequence();
- tokenSeqStream.reset(tokenSeq);
- int tokenPercent = VISIBLE_TOKEN_RATIO.extractAndNormalizeTokenPercentage(tokenSeqStream);
- tokenSeqStream.reset(tokenSeq);
-
- // Write packedFeatures.
- EarlybirdEncodedFeatures encodedFeatures = createEncodedFeaturesFromTwitterMessage(
- message, penguinVersion, schemaSnapshot, tokenPercent);
- encodedFeatures.writeFeaturesToVersionedTweetFeatures(versionedTweetFeatures);
-
- try {
- versionedTweetFeatures.setTweetTokenStream(streamSerializer.serialize(tokenSeqStream));
- versionedTweetFeatures.setTweetTokenStreamText(tokenSeq.toString());
- } catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize: "
- + tokenSeq.toString());
- SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
- versionedTweetFeatures.unsetTweetTokenStream();
- versionedTweetFeatures.unsetTweetTokenStreamText();
- }
-
- // User name features
- if (message.getFromUserDisplayName().isPresent()) {
- Locale locale = LanguageIdentifierHelper
- .identifyLanguage(message.getFromUserDisplayName().get());
- String normalizedDisplayName = NormalizerHelper.normalize(
- message.getFromUserDisplayName().get(), locale, penguinVersion);
- TokenizerResult result = TokenizerHelper
- .tokenizeTweet(normalizedDisplayName, locale, penguinVersion);
- tokenSeqStream.reset(result.tokenSequence);
- try {
- versionedTweetFeatures.setUserDisplayNameTokenStream(
- streamSerializer.serialize(tokenSeqStream));
- versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString());
- } catch (IOException e) {
- LOG.error("TwitterTokenStream serialization error! Could not serialize: "
- + message.getFromUserDisplayName().get());
- SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment();
- versionedTweetFeatures.unsetUserDisplayNameTokenStream();
- versionedTweetFeatures.unsetUserDisplayNameTokenStreamText();
- }
- }
-
- String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens());
- versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText);
-
- addPlace(message, versionedTweetFeatures, penguinVersion);
- addProfileGeoEnrichment(message, versionedTweetFeatures, penguinVersion);
-
- versionedTweetFeatures.setTweetSignature(message.getTweetSignature(penguinVersion));
-
- return new TweetFeatureWithEncodeFeatures(
- versionedTweetFeatures, encodedFeatures, extendedEncodedFeatures);
- }
-
-
- protected static void setSourceAndNormalizedSource(
- String strippedSource,
- VersionedTweetFeatures versionedTweetFeatures,
- PenguinVersion penguinVersion) {
-
- if (strippedSource != null && !strippedSource.isEmpty()) {
- // normalize source for searchable field - replaces whitespace with underscores (???).
- versionedTweetFeatures.setNormalizedSource(
- SourceNormalizer.normalize(strippedSource, penguinVersion));
-
- // source facet has simpler normalization.
- Locale locale = LanguageIdentifierHelper.identifyLanguage(strippedSource);
- versionedTweetFeatures.setSource(NormalizerHelper.normalizeKeepCase(
- strippedSource, locale, penguinVersion));
- }
- }
-
- /**
- * Adds the given photo url to the thrift status if it is a twitter photo permalink.
- * Returns true, if this was indeed a twitter photo, false otherwise.
- */
- public static boolean addPhotoUrl(TwitterMessage message, String photoPermalink) {
- Matcher matcher = TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN.matcher(photoPermalink);
- if (!matcher.matches() || matcher.groupCount() < 1) {
- matcher = TWITTER_PHOTO_PERMA_LINK_PATTERN.matcher(photoPermalink);
- }
-
- if (matcher.matches() && matcher.groupCount() == 1) {
- // this is a native photo url which we need to store in a separate field
- String idStr = matcher.group(1);
- if (idStr != null) {
- // idStr should be a valid tweet ID (and therefore, should fit into a Long), but we have
- // tweets for which idStr is a long sequence of digits that does not fit into a Long.
- try {
- long photoStatusId = Long.parseLong(idStr);
- message.addPhotoUrl(photoStatusId, null);
- } catch (NumberFormatException e) {
- LOG.warn("Found a tweet with a photo URL with an invalid tweet ID: " + message);
- NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL.increment();
- }
- }
- return true;
- }
- return false;
- }
-
- private void addPlace(TwitterMessage message,
- VersionedTweetFeatures versionedTweetFeatures,
- PenguinVersion penguinVersion) {
- String placeId = message.getPlaceId();
- if (placeId == null) {
- return;
- }
-
- // Tweet.Place.id and Tweet.Place.full_name are both required fields.
- String placeFullName = message.getPlaceFullName();
- Preconditions.checkNotNull(placeFullName, "Tweet.Place without full_name.");
-
- Locale placeFullNameLocale = LanguageIdentifierHelper.identifyLanguage(placeFullName);
- String normalizedPlaceFullName =
- NormalizerHelper.normalize(placeFullName, placeFullNameLocale, penguinVersion);
- String tokenizedPlaceFullName = StringUtils.join(
- TokenizerHelper.tokenizeQuery(normalizedPlaceFullName, placeFullNameLocale, penguinVersion),
- " ");
-
- Place place = new Place(placeId, tokenizedPlaceFullName);
- String placeCountryCode = message.getPlaceCountryCode();
- if (placeCountryCode != null) {
- Locale placeCountryCodeLocale = LanguageIdentifierHelper.identifyLanguage(placeCountryCode);
- place.setCountryCode(
- NormalizerHelper.normalize(placeCountryCode, placeCountryCodeLocale, penguinVersion));
- }
-
- versionedTweetFeatures.setTokenizedPlace(place);
- }
-
- private void addProfileGeoEnrichment(TwitterMessage message,
- VersionedTweetFeatures versionedTweetFeatures,
- PenguinVersion penguinVersion) {
- List potentialLocations = message.getPotentialLocations();
- if (potentialLocations.isEmpty()) {
- return;
- }
-
- List thriftPotentialLocations = Lists.newArrayList();
- for (PotentialLocationObject potentialLocation : potentialLocations) {
- thriftPotentialLocations.add(potentialLocation.toThriftPotentialLocation(penguinVersion));
- }
- versionedTweetFeatures.setTokenizedProfileGeoEnrichment(
- new ProfileGeoEnrichment(thriftPotentialLocations));
- }
-
- /** Returns the encoded features. */
- public static EarlybirdEncodedFeatures createEncodedFeaturesFromTwitterMessage(
- TwitterMessage message,
- PenguinVersion penguinVersion,
- ImmutableSchemaInterface schema,
- int normalizedTokenPercentBucket) {
- FeatureSink sink = new FeatureSink(schema);
-
- // Static features
- sink.setBooleanValue(EarlybirdFieldConstant.IS_RETWEET_FLAG, message.isRetweet())
- .setBooleanValue(EarlybirdFieldConstant.IS_REPLY_FLAG, message.isReply())
- .setBooleanValue(
- EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG, message.isUserVerified())
- .setBooleanValue(
- EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG, message.isUserBlueVerified())
- .setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, message.isSensitiveContent());
-
- TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
- if (textFeatures != null) {
- final FeatureConfiguration featureConfigNumHashtags = schema.getFeatureConfigurationByName(
- EarlybirdFieldConstant.NUM_HASHTAGS.getFieldName());
- final FeatureConfiguration featureConfigNumMentions = schema.getFeatureConfigurationByName(
- EarlybirdFieldConstant.NUM_MENTIONS.getFieldName());
-
- sink.setNumericValue(
- EarlybirdFieldConstant.NUM_HASHTAGS,
- Math.min(textFeatures.getHashtagsSize(), featureConfigNumHashtags.getMaxValue()))
- .setNumericValue(
- EarlybirdFieldConstant.NUM_MENTIONS,
- Math.min(textFeatures.getMentionsSize(), featureConfigNumMentions.getMaxValue()))
- .setBooleanValue(
- EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
- TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures))
- .setBooleanValue(
- EarlybirdFieldConstant.HAS_TREND_FLAG,
- textFeatures.getTrendingTermsSize() > 0);
- }
-
- TweetTextQuality textQuality = message.getTweetTextQuality(penguinVersion);
- if (textQuality != null) {
- sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE, textQuality.getTextScore());
- sink.setBooleanValue(
- EarlybirdFieldConstant.IS_OFFENSIVE_FLAG,
- textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE)
- || textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE_USER)
- // Note: if json message "possibly_sensitive" flag is set, we consider the tweet
- // sensitive and is currently filtered out in safe search mode via a hacky setup:
- // earlybird does not create _filter_sensitive_content field, only
- // _is_offensive field is created, and used in filter:safe operator
- || textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE));
- if (textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE)) {
- sink.setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, true);
- }
- } else {
- // we don't have text score, for whatever reason, set to sentinel value so we won't be
- // skipped by scoring function
- sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE,
- RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL);
- }
-
- if (message.isSetLocale()) {
- sink.setNumericValue(EarlybirdFieldConstant.LANGUAGE,
- ThriftLanguageUtil.getThriftLanguageOf(message.getLocale()).getValue());
- }
-
- // User features
- TweetUserFeatures userFeatures = message.getTweetUserFeatures(penguinVersion);
- if (userFeatures != null) {
- sink.setBooleanValue(EarlybirdFieldConstant.IS_USER_SPAM_FLAG, userFeatures.isSpam())
- .setBooleanValue(EarlybirdFieldConstant.IS_USER_NSFW_FLAG, userFeatures.isNsfw())
- .setBooleanValue(EarlybirdFieldConstant.IS_USER_BOT_FLAG, userFeatures.isBot());
- }
- if (message.getUserReputation() != TwitterMessage.DOUBLE_FIELD_NOT_PRESENT) {
- sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION,
- (byte) message.getUserReputation());
- } else {
- sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION,
- RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL);
- }
-
- sink.setBooleanValue(EarlybirdFieldConstant.IS_NULLCAST_FLAG, message.getNullcast());
-
- // Realtime Ingestion does not write engagement features. Updater does that.
- if (message.getNumFavorites() > 0) {
- sink.setNumericValue(EarlybirdFieldConstant.FAVORITE_COUNT,
- MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumFavorites()));
- }
- if (message.getNumRetweets() > 0) {
- sink.setNumericValue(EarlybirdFieldConstant.RETWEET_COUNT,
- MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumRetweets()));
- }
- if (message.getNumReplies() > 0) {
- sink.setNumericValue(EarlybirdFieldConstant.REPLY_COUNT,
- MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumReplies()));
- }
-
- sink.setNumericValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO, normalizedTokenPercentBucket);
-
- EarlybirdEncodedFeatures encodedFeatures =
- (EarlybirdEncodedFeatures) sink.getFeaturesForBaseField(
- EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName());
- updateLinkEncodedFeatures(encodedFeatures, message);
- return encodedFeatures;
- }
-
- /**
- * Returns the extended encoded features.
- */
- public static EarlybirdEncodedFeatures createExtendedEncodedFeaturesFromTwitterMessage(
- TwitterMessage message,
- PenguinVersion penguinVersion,
- ImmutableSchemaInterface schema) {
- FeatureSink sink = new FeatureSink(schema);
-
- TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion);
-
- if (textFeatures != null) {
- setExtendedEncodedFeatureIntValue(sink, schema,
- EarlybirdFieldConstant.NUM_HASHTAGS_V2, textFeatures.getHashtagsSize());
- setExtendedEncodedFeatureIntValue(sink, schema,
- EarlybirdFieldConstant.NUM_MENTIONS_V2, textFeatures.getMentionsSize());
- setExtendedEncodedFeatureIntValue(sink, schema,
- EarlybirdFieldConstant.NUM_STOCKS, textFeatures.getStocksSize());
- }
-
- Optional referenceAuthorId = message.getReferenceAuthorId();
- if (referenceAuthorId.isPresent()) {
- setEncodedReferenceAuthorId(sink, referenceAuthorId.get());
- }
-
- return (EarlybirdEncodedFeatures) sink.getFeaturesForBaseField(
- EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName());
- }
-
- /**
- * Updates all URL-related features, based on the values stored in the given message.
- *
- * @param encodedFeatures The features to be updated.
- * @param message The message.
- */
- public static void updateLinkEncodedFeatures(
- EarlybirdEncodedFeatures encodedFeatures, TwitterMessage message) {
- if (message.getLinkLocale() != null) {
- encodedFeatures.setFeatureValue(
- EarlybirdFieldConstant.LINK_LANGUAGE,
- ThriftLanguageUtil.getThriftLanguageOf(message.getLinkLocale()).getValue());
- }
-
- if (message.hasCard()) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CARD_FLAG);
- }
-
- // Set HAS_IMAGE HAS_NEWS HAS_VIDEO etc. flags for expanded urls.
- if (message.getExpandedUrlMapSize() > 0) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG);
-
- for (ThriftExpandedUrl url : message.getExpandedUrlMap().values()) {
- if (url.isSetMediaType()) {
- switch (url.getMediaType()) {
- case NATIVE_IMAGE:
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
- break;
- case IMAGE:
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
- break;
- case VIDEO:
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
- break;
- case NEWS:
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
- break;
- case UNKNOWN:
- break;
- default:
- throw new IllegalStateException("Unexpected enum value: " + url.getMediaType());
- }
- }
- }
- }
-
- Set canonicalLastHopUrlsStrings = message.getCanonicalLastHopUrls();
- Set expandedUrlsStrings = message.getExpandedUrls()
- .stream()
- .map(ThriftExpandedUrl::getExpandedUrl)
- .collect(Collectors.toSet());
- Set expandedAndLastHopUrlsStrings = new HashSet<>();
- expandedAndLastHopUrlsStrings.addAll(expandedUrlsStrings);
- expandedAndLastHopUrlsStrings.addAll(canonicalLastHopUrlsStrings);
- // Check both expanded and last hop url for consumer videos as consumer video urls are
- // sometimes redirected to the url of the tweets containing the videos (SEARCH-42612).
- if (NativeVideoClassificationUtils.hasConsumerVideo(expandedAndLastHopUrlsStrings)) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG);
- }
- if (NativeVideoClassificationUtils.hasProVideo(canonicalLastHopUrlsStrings)) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG);
- }
- if (NativeVideoClassificationUtils.hasVine(canonicalLastHopUrlsStrings)) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VINE_FLAG);
- }
- if (NativeVideoClassificationUtils.hasPeriscope(canonicalLastHopUrlsStrings)) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG);
- }
- if (LinkVisibilityUtils.hasVisibleLink(message.getExpandedUrls())) {
- encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG);
- }
- }
-
- private static void setExtendedEncodedFeatureIntValue(
- FeatureSink sink,
- ImmutableSchemaInterface schema,
- EarlybirdFieldConstant field,
- int value) {
- boolean fieldInSchema = schema.hasField(field.getFieldName());
- if (fieldInSchema) {
- FeatureConfiguration featureConfig =
- schema.getFeatureConfigurationByName(field.getFieldName());
- sink.setNumericValue(field, Math.min(value, featureConfig.getMaxValue()));
- }
- }
-
- private static void setEncodedReferenceAuthorId(FeatureSink sink, long referenceAuthorId) {
- LongIntConverter.IntegerRepresentation ints =
- LongIntConverter.convertOneLongToTwoInt(referenceAuthorId);
- sink.setNumericValue(
- EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT, ints.leastSignificantInt);
- sink.setNumericValue(
- EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT, ints.mostSignificantInt);
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/docvalues/BUILD b/src/java/com/twitter/search/common/encoding/docvalues/BUILD
deleted file mode 100644
index bc4756173..000000000
--- a/src/java/com/twitter/search/common/encoding/docvalues/BUILD
+++ /dev/null
@@ -1,20 +0,0 @@
-# Java library for docvalues and common stride field encoding utilities.
-java_library(
- sources = ["*.java"],
- platform = "java8",
- provides = artifact(
- org = "com.twitter.search.common",
- name = "encoding-docvalues",
- repo = artifactory,
- ),
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/guava",
- "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
- "3rdparty/jvm/org/apache/lucene:lucene-core",
- "3rdparty/jvm/org/apache/lucene:lucene-facet",
- "3rdparty/jvm/org/apache/thrift:libthrift",
- "src/java/com/twitter/search/common/schema/base",
- "src/thrift/com/twitter/search/common:schema-java",
- ],
-)
diff --git a/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx b/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx
new file mode 100644
index 000000000..effd5061e
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx
new file mode 100644
index 000000000..5f1fa4979
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java
deleted file mode 100644
index 1d6d2c0bb..000000000
--- a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java
+++ /dev/null
@@ -1,34 +0,0 @@
-package com.twitter.search.common.encoding.docvalues;
-
-public final class CSFTypeUtil {
- private CSFTypeUtil() {
- }
-
- /**
- * Convert a long into a byte array, stored into dest.
- */
- public static void convertToBytes(byte[] dest, int valueIndex, int value) {
- int offset = valueIndex * Integer.BYTES;
- dest[offset] = (byte) (value >>> 24);
- dest[offset + 1] = (byte) (value >>> 16);
- dest[offset + 2] = (byte) (value >>> 8);
- dest[offset + 3] = (byte) value;
- }
-
- /**
- * Convert bytes into a long value. Inverse function of convertToBytes.
- */
- public static int convertFromBytes(byte[] data, int startOffset, int valueIndex) {
- // This should rarely happen, eg. when we get a corrupt ThriftIndexingEvent, we insert a new
- // Document which is blank. Such a document results in a length 0 BytesRef.
- if (data.length == 0) {
- return 0;
- }
-
- int offset = startOffset + valueIndex * Integer.BYTES;
- return ((data[offset] & 0xFF) << 24)
- | ((data[offset + 1] & 0xFF) << 16)
- | ((data[offset + 2] & 0xFF) << 8)
- | (data[offset + 3] & 0xFF);
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/BUILD b/src/java/com/twitter/search/common/encoding/features/BUILD
deleted file mode 100644
index 93b13c03f..000000000
--- a/src/java/com/twitter/search/common/encoding/features/BUILD
+++ /dev/null
@@ -1,17 +0,0 @@
-# Java library for feature encoding and decoding utilities.
-java_library(
- sources = ["*.java"],
- platform = "java8",
- provides = artifact(
- org = "com.twitter.search.common",
- name = "encoding-features",
- repo = artifactory,
- ),
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/guava",
- "3rdparty/jvm/org/apache/thrift:libthrift",
- "src/java/com/twitter/search/common/schema/base",
- "src/thrift/com/twitter/search/common:indexing-java",
- ],
-)
diff --git a/src/java/com/twitter/search/common/encoding/features/BUILD.docx b/src/java/com/twitter/search/common/encoding/features/BUILD.docx
new file mode 100644
index 000000000..ca8bbf26f
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/BUILD.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx
new file mode 100644
index 000000000..9a27e4894
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java
deleted file mode 100644
index 36abc323e..000000000
--- a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java
+++ /dev/null
@@ -1,73 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import java.util.Map;
-import java.util.SortedSet;
-import java.util.TreeMap;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-
-/**
- * Normalizes values to predefined bins.
- * If the value to normalize is lower than the lowest bin defined, normalizes to Byte.MIN_VALUE.
- */
-public class BinByteNormalizer extends ByteNormalizer {
-
- private final TreeMap bins = Maps.newTreeMap();
- private final TreeMap reverseBins = Maps.newTreeMap();
-
- /**
- * Constructs a normalizer using predefined bins.
- * @param bins A mapping between the upper bound of a value and the bin it should normalize to.
- * For example providing a map with 2 entries, {5=>1, 10=>2} will normalize as follows:
- * values under 5: Byte.MIN_VALUE
- * values between 5 and 10: 1
- * values over 10: 2
- */
- public BinByteNormalizer(final Map bins) {
- Preconditions.checkNotNull(bins);
- Preconditions.checkArgument(!bins.isEmpty(), "No bins provided");
- Preconditions.checkArgument(hasIncreasingValues(bins));
- this.bins.putAll(bins);
- for (Map.Entry entry : bins.entrySet()) {
- reverseBins.put(entry.getValue(), entry.getKey());
- }
- }
-
- /**
- * check that if key1 > key2 then val1 > val2 in the {@code map}.
- */
- private static boolean hasIncreasingValues(final Map map) {
- SortedSet orderedKeys = Sets.newTreeSet(map.keySet());
- byte prev = Byte.MIN_VALUE;
- for (Double key : orderedKeys) { // save the unboxing
- byte cur = map.get(key);
- if (cur <= prev) {
- return false;
- }
- prev = cur;
- }
- return true;
- }
-
- @Override
- public byte normalize(double val) {
- Map.Entry lowerBound = bins.floorEntry(val);
- return lowerBound == null
- ? Byte.MIN_VALUE
- : lowerBound.getValue();
- }
-
- @Override
- public double unnormLowerBound(byte norm) {
- return reverseBins.get(reverseBins.floorKey(norm));
- }
-
- @Override
- public double unnormUpperBound(byte norm) {
- return norm == reverseBins.lastKey()
- ? Double.POSITIVE_INFINITY
- : reverseBins.get(reverseBins.floorKey((byte) (1 + norm)));
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx
new file mode 100644
index 000000000..21b2ef632
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java
deleted file mode 100644
index 6a6845a12..000000000
--- a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java
+++ /dev/null
@@ -1,38 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-/**
- * Interface for compressing unbounded float values to a signed byte. It includes both
- * normalization of values and encoding of values in a byte.
- */
-public abstract class ByteNormalizer {
- public static byte intToUnsignedByte(int i) {
- return (byte) i;
- }
-
- public static int unsignedByteToInt(byte b) {
- return (int) b & 0xFF;
- }
-
- /**
- * Returns the byte-compressed value of {@code val}.
- */
- public abstract byte normalize(double val);
-
- /**
- * Returns a lower bound to the unnormalized range of {@code norm}.
- */
- public abstract double unnormLowerBound(byte norm);
-
- /**
- * Returns an upper bound to the unnormalized range of {@code norm}.
- */
- public abstract double unnormUpperBound(byte norm);
-
- /**
- * Returns true if the normalized value of {@code val} is different than the normalized value of
- * {@code val - 1}
- */
- public boolean changedNorm(double val) {
- return normalize(val) != normalize(val - 1);
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx
new file mode 100644
index 000000000..cd370fcff
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java
deleted file mode 100644
index ec1d3faa9..000000000
--- a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java
+++ /dev/null
@@ -1,47 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import com.google.common.base.Preconditions;
-
-/**
- * A byte normalizer that restricts the values to the given range before normalizing them.
- */
-public class ClampByteNormalizer extends ByteNormalizer {
- private final int minUnnormalizedValue;
- private final int maxUnnormalizedValue;
-
- /**
- * Creates a new ClampByteNormalizer instance.
- *
- * @param minValue The smallest allowed unnormalized value.
- * @param maxValue The largest allowed unnormalized value.
- */
- public ClampByteNormalizer(int minUnnormalizedValue, int maxUnnormalizedValue) {
- Preconditions.checkState(minUnnormalizedValue <= maxUnnormalizedValue);
- Preconditions.checkState(minUnnormalizedValue >= 0);
- Preconditions.checkState(maxUnnormalizedValue <= 255);
- this.minUnnormalizedValue = minUnnormalizedValue;
- this.maxUnnormalizedValue = maxUnnormalizedValue;
- }
-
- @Override
- public byte normalize(double val) {
- int adjustedValue = (int) val;
- if (adjustedValue < minUnnormalizedValue) {
- adjustedValue = minUnnormalizedValue;
- }
- if (adjustedValue > maxUnnormalizedValue) {
- adjustedValue = maxUnnormalizedValue;
- }
- return ByteNormalizer.intToUnsignedByte(adjustedValue);
- }
-
- @Override
- public double unnormLowerBound(byte norm) {
- return ByteNormalizer.unsignedByteToInt(norm);
- }
-
- @Override
- public double unnormUpperBound(byte norm) {
- return ByteNormalizer.unsignedByteToInt(norm) + 1;
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx
new file mode 100644
index 000000000..ff1c6fabf
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java
deleted file mode 100644
index f6d9b16bb..000000000
--- a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java
+++ /dev/null
@@ -1,58 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-/**
- * Encodes multiple values (bytes or bits) into an integer.
- */
-public class EncodedFeatures {
- private int value;
-
- public final void setSerializedValue(int val) {
- this.value = val;
- }
-
- public final int getSerializedValue() {
- return value;
- }
-
- // setByte is agnostic to signed / unsigned bytes.
- protected final EncodedFeatures setByte(byte count, int bitshift, long inverseMask) {
- value = (int) ((value & inverseMask) | ((count & 0xffL) << bitshift));
- return this;
- }
-
- /**
- * Sets the value but only if greater. setByteIfGreater assumes unsigned bytes.
- */
- public final EncodedFeatures setByteIfGreater(byte newCount, int bitshift, long inversemask) {
- if ((getByte(bitshift) & 0xff) < (newCount & 0xff)) {
- setByte(newCount, bitshift, inversemask);
- }
- return this;
- }
-
- protected final int getByte(int bitshift) {
- return (int) (((value & 0xffffffffL) >>> bitshift) & 0xffL);
- }
-
- protected final int getByteMasked(int bitshift, long mask) {
- return (int) (((value & mask) >>> bitshift) & 0xffL);
- }
-
- protected final EncodedFeatures setBit(int bit, boolean flag) {
- if (flag) {
- value |= bit;
- } else {
- value &= ~bit;
- }
- return this;
- }
-
- protected final boolean getBit(int bit) {
- return (value & bit) != 0;
- }
-
- @Override
- public String toString() {
- return String.format("%x", value);
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx
new file mode 100644
index 000000000..da11c3a0f
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java
deleted file mode 100644
index 0a2477e46..000000000
--- a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java
+++ /dev/null
@@ -1,15 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-/**
- * Interface for processing different feature values into an int. It provides a one-way translation
- * of encoding using com.twitter.search.common.encoding.features.ByteNormalizer and supports all the
- * old normalizers. The difference is that we directly return the normalized int value
- * (instead of converting from byte).
- */
-public interface IntNormalizer {
- /**
- * Returns the normalized value of {@code val}.
- * The value may be byte-compressed or as-is depending on the normalizer type
- */
- int normalize(double val);
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx
new file mode 100644
index 000000000..e6a1e3107
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java
deleted file mode 100644
index a86e079c3..000000000
--- a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java
+++ /dev/null
@@ -1,159 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import java.util.List;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Lists;
-
-import com.twitter.search.common.indexing.thriftjava.PackedFeatures;
-import com.twitter.search.common.schema.base.FeatureConfiguration;
-
-/**
- * Class used to read/write integers encoded according to
- * {@link com.twitter.search.common.schema.base.FeatureConfiguration}
- *
- * Implementations must override {@link #getInt(int pos)} and {@link #setInt(int pos, int value)}.
- */
-public abstract class IntegerEncodedFeatures {
- /**
- * Returns the value at the given position.
- */
- public abstract int getInt(int pos);
-
- /**
- * Sets the given value at the given position.
- */
- public abstract void setInt(int pos, int value);
-
- /**
- * Get the maximum number of integers to hold features.
- * @return the number of integers to represent all features.
- */
- public abstract int getNumInts();
-
- /**
- * Test to see if the given feature is true or non-zero. Useful for one bit features.
- * @param feature feature to examine
- * @return true if feature is non-zero
- */
- public boolean isFlagSet(FeatureConfiguration feature) {
- return (getInt(feature.getValueIndex()) & feature.getBitMask()) != 0;
- }
-
- public IntegerEncodedFeatures setFlag(FeatureConfiguration feature) {
- setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) | feature.getBitMask());
- return this;
- }
-
- public IntegerEncodedFeatures clearFlag(FeatureConfiguration feature) {
- setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) & feature.getInverseBitMask());
- return this;
- }
-
- /**
- * Sets a boolean flag.
- */
- public IntegerEncodedFeatures setFlagValue(FeatureConfiguration feature, boolean value) {
- if (value) {
- setFlag(feature);
- } else {
- clearFlag(feature);
- }
- return this;
- }
-
- /**
- * Get feature value
- * @param feature feature to get
- * @return the value of the feature
- */
- public int getFeatureValue(FeatureConfiguration feature) {
- return (getInt(feature.getValueIndex()) & feature.getBitMask())
- >>> feature.getBitStartPosition();
- }
-
- /**
- * Set feature value
- * @param feature feature to modify
- * @param value value to set.
- */
- public IntegerEncodedFeatures setFeatureValue(FeatureConfiguration feature, int value) {
- Preconditions.checkState(
- value <= feature.getMaxValue(),
- "Feature value, %s, is greater than the max value allowed for this feature. "
- + "Feature: %s, Max value: %s",
- value, feature.getName(), feature.getMaxValue());
-
- // Clear the value of the given feature in its int.
- int temp = getInt(feature.getValueIndex()) & feature.getInverseBitMask();
-
- // Set the new feature value. Applying the bit mask here ensures that other features in the
- // same int are not modified by mistake.
- temp |= (value << feature.getBitStartPosition()) & feature.getBitMask();
-
- setInt(feature.getValueIndex(), temp);
- return this;
- }
-
- /**
- * Sets feature value if greater than current value
- * @param feature feature to modify
- * @param value new value
- */
- public IntegerEncodedFeatures setFeatureValueIfGreater(FeatureConfiguration feature, int value) {
- if (value > getFeatureValue(feature)) {
- setFeatureValue(feature, value);
- }
- return this;
- }
-
- /**
- * Increment a feature if its not at its maximum value.
- * @return whether the feature is incremented.
- */
- public boolean incrementIfNotMaximum(FeatureConfiguration feature) {
- int newValue = getFeatureValue(feature) + 1;
- if (newValue <= feature.getMaxValue()) {
- setFeatureValue(feature, newValue);
- return true;
- } else {
- return false;
- }
- }
-
- /**
- * Copy these encoded features to a new PackedFeatures thrift struct.
- */
- public PackedFeatures copyToPackedFeatures() {
- return copyToPackedFeatures(new PackedFeatures());
- }
-
- /**
- * Copy these encoded features to a PackedFeatures thrift struct.
- */
- public PackedFeatures copyToPackedFeatures(PackedFeatures packedFeatures) {
- Preconditions.checkNotNull(packedFeatures);
- final List integers = Lists.newArrayListWithCapacity(getNumInts());
- for (int i = 0; i < getNumInts(); i++) {
- integers.add(getInt(i));
- }
- packedFeatures.setDeprecated_featureConfigurationVersion(0);
- packedFeatures.setFeatures(integers);
- return packedFeatures;
- }
-
- /**
- * Copy features from a packed features struct.
- */
- public void readFromPackedFeatures(PackedFeatures packedFeatures) {
- Preconditions.checkNotNull(packedFeatures);
- List ints = packedFeatures.getFeatures();
- for (int i = 0; i < getNumInts(); i++) {
- if (i < ints.size()) {
- setInt(i, ints.get(i));
- } else {
- setInt(i, 0);
- }
- }
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx
new file mode 100644
index 000000000..58f7c5c54
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java
deleted file mode 100644
index 0124d0be3..000000000
--- a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java
+++ /dev/null
@@ -1,53 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import com.google.common.base.Preconditions;
-
-/**
- * Normalizes values as follows:
- * Positive numbers normalize to (1 + round(log_baseN(value))).
- * Negative numbers throw.
- * 0 will normalize to 0.
- * The log base is 2 by default.
- */
-public class LogByteNormalizer extends ByteNormalizer {
-
- private static final double DEFAULT_BASE = 2;
- private final double base;
- private final double logBase;
-
- public LogByteNormalizer(double base) {
- Preconditions.checkArgument(base > 0);
- this.base = base;
- logBase = Math.log(base);
- }
-
- public LogByteNormalizer() {
- this(DEFAULT_BASE);
- }
-
- @Override
- public byte normalize(double val) {
- if (val < 0) {
- throw new IllegalArgumentException("Can't log-normalize negative value " + val);
- } else if (val == 0) {
- return 0;
- } else {
- long logVal = 1 + (long) Math.floor(Math.log(val) / logBase);
- return logVal > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) logVal;
- }
- }
-
- @Override
- public double unnormLowerBound(byte norm) {
- return norm < 0
- ? Double.NEGATIVE_INFINITY
- : Math.floor(Math.pow(base, norm - 1));
- }
-
- @Override
- public double unnormUpperBound(byte norm) {
- return norm == Byte.MAX_VALUE
- ? Double.POSITIVE_INFINITY
- : Math.floor(Math.pow(base, norm));
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx
new file mode 100644
index 000000000..bdac26d34
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java
deleted file mode 100644
index e02519f08..000000000
--- a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java
+++ /dev/null
@@ -1,51 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import com.google.common.base.Preconditions;
-
-/**
- * A normalizer that normalizes the prediction score from a machine learning classifier, which
- * ranges within [0.0, 1.0], to an integer value by multiplying by (10 ^ precision), and returns
- * the rounded value. The lower the precision, the less amount of bits it takes to encode the score.
- * @see #precision
- *
- * This normalizer also could denormalize the normalized value from integer back to double using the
- * same precision.
- */
-public class PredictionScoreNormalizer {
-
- private final int precision;
- private final double normalizingBase;
-
- public PredictionScoreNormalizer(int precision) {
- this.precision = precision;
- this.normalizingBase = Math.pow(10, this.precision);
- }
-
- /**
- * Returns the normalized int value for prediction score {@code score} by multiplying
- * by {@code normalizingBase}, and round the result.
- * @throws IllegalArgumentException when parameter {@code score} is not within [0.0, 1.0]
- */
- public int normalize(double score) {
- Preconditions.checkArgument(isScoreWithinRange(score));
- return (int) Math.round(score * this.normalizingBase);
- }
-
- /**
- * Converts the normalized int value back to a double score by dividing by {@code normalizingBase}
- * @throws IllegalStateException when the denormalized value is not within [0.0, 1.0]
- */
- public double denormalize(int normalizedScore) {
- double denormalizedValue = normalizedScore / this.normalizingBase;
- if (!isScoreWithinRange(denormalizedValue)) {
- throw new IllegalStateException(
- String.format("The denormalized value %s is not within [0.0, 1.0]", denormalizedValue)
- );
- }
- return denormalizedValue;
- }
-
- private static boolean isScoreWithinRange(double score) {
- return 0.0 <= score && score <= 1.0;
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx
new file mode 100644
index 000000000..2992c9239
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java
deleted file mode 100644
index 32acc5048..000000000
--- a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java
+++ /dev/null
@@ -1,35 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-/**
- * Normalizes using the logic described in {@link SingleBytePositiveFloatUtil}.
- */
-public class SingleBytePositiveFloatNormalizer extends ByteNormalizer {
-
- @Override
- public byte normalize(double val) {
- return SingleBytePositiveFloatUtil.toSingleBytePositiveFloat((float) val);
- }
-
- @Override
- public double unnormLowerBound(byte norm) {
- return SingleBytePositiveFloatUtil.toJavaFloat(norm);
- }
-
- /**
- * Get the upper bound of the raw value for a normalized byte.
- * @deprecated This is wrongly implemented, always use unnormLowerBound(),
- * or use SmartIntegerNormalizer.
- */
- @Override @Deprecated
- public double unnormUpperBound(byte norm) {
- return 1 + SingleBytePositiveFloatUtil.toJavaFloat(norm);
- }
-
- /**
- * Return the the post-log2 unnormalized value. This is only used for some legacy Earlybird
- * features and scoring functions.
- */
- public double unnormAndLog2(byte norm) {
- return SingleBytePositiveFloatUtil.toLog2Double(norm);
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx
new file mode 100644
index 000000000..4cf5adb92
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java
deleted file mode 100644
index 2894241e8..000000000
--- a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java
+++ /dev/null
@@ -1,164 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-/**
- * Util used to:
- * - Encode a positive Java float into a single byte float
- * - Decode a single byte into a positive Java float
- *
- * Configuration:
- * - Exponent: higher 4 bits, base 10.
- * - Mantissa: lower 4 bit, representing 1.0 to 9.0
- * - Exponent bias is 1.
- *
- * Formula:
- * Max(Mantissa, 9) * 10 ^ (Exponent - 1)
- *
- * Smallest float: 0.0 (0000 0000)
- * Smallest positive float: 1.0 * 10^-1 (0000 0001)
- * Largest float: 9.0 * 10^13 (1110 1111)
- * Infinity: (1111 0000)
- * NaN: (1111 1000)
- */
-public final class SingleBytePositiveFloatUtil {
- private SingleBytePositiveFloatUtil() { }
-
- // 4 bits mantissa. Range [1.0, 10.0) is divided into 16 steps
- public static final byte MAX_BYTE_VALUE = (byte) 0xEF;
- public static final byte INFINITY = (byte) 0xF0;
- public static final byte NOT_A_NUMBER = (byte) 0xF8;
- private static final float STEP_SIZE = 1.0f;
- private static final int EXPONENT_BIAS = 1;
- private static final byte MIN_EXPONENT = -EXPONENT_BIAS;
- private static final int MAX_EXPONENT = 14 - EXPONENT_BIAS;
- private static final byte MANTISSA_MASK = 0x0F;
-
- /**
- * Converts the given float into a single byte floating point number.
- * This is used in the updater and OK to be a bit slow.
- */
- public static byte toSingleBytePositiveFloat(float f) {
- if (f < 0) {
- throw new UnsupportedOperationException(
- "Cannot encode negative floats into SingleBytePostiveFloat.");
- }
-
- if (Float.compare(f, Float.POSITIVE_INFINITY) == 0) {
- return INFINITY;
- }
-
- if (Float.compare(f, Float.NaN) == 0) {
- return NOT_A_NUMBER;
- }
-
- int mantissa = 0;
- int exponent = (int) Math.floor(Math.log10(f));
- // Overflow (Number too large), just return the largest possible value
- if (exponent > MAX_EXPONENT) {
- return MAX_BYTE_VALUE;
- }
-
- // Underflow (Number too small), just return 0
- if (exponent < MIN_EXPONENT) {
- return 0;
- }
-
- int frac = Math.round(f / (float) Math.pow(10.0f, exponent) / STEP_SIZE);
- mantissa = fractionToMantissaTable[frac];
-
- return (byte) (((exponent + EXPONENT_BIAS) << 4) | mantissa);
- }
-
- /**
- * Called in Earlybird per hit and needs to be fast.
- */
- public static float toJavaFloat(byte b) {
- return BYTE_TO_FLOAT_CONVERSION_TABLE[b & 0xff];
- }
-
- // Table used for converting mantissa into a significant
- private static float[] mantissaToFractionTable = {
- // Decimal Matisa value
- STEP_SIZE * 0, // 0000
- STEP_SIZE * 1, // 0001
- STEP_SIZE * 1, // 0010
- STEP_SIZE * 2, // 0011
- STEP_SIZE * 2, // 0100
- STEP_SIZE * 3, // 0101
- STEP_SIZE * 3, // 0110
- STEP_SIZE * 4, // 0111
- STEP_SIZE * 4, // 1000
- STEP_SIZE * 5, // 1001
- STEP_SIZE * 5, // 1010
- STEP_SIZE * 6, // 1011
- STEP_SIZE * 6, // 1100
- STEP_SIZE * 7, // 1101
- STEP_SIZE * 8, // 1110
- STEP_SIZE * 9 // 1111
- };
-
- // Table used for converting fraction into mantissa.
- // Reverse operation of the above
- private static int[] fractionToMantissaTable = {
- 0, // 0
- 1, // 1
- 3, // 2
- 5, // 3
- 7, // 4
- 9, // 5
- 11, // 6
- 13, // 7
- 14, // 8
- 15, // 9
- 15, // 10 (Edge case: because we round the fraction, we can get 10 here.)
- };
-
- public static final byte LARGEST_FRACTION_UNDER_ONE = (byte) (toSingleBytePositiveFloat(1f) - 1);
-
- /**
- * Converts the given byte to java float.
- */
- private static float toJavaFloatSlow(byte b) {
- if (b == INFINITY) {
- return Float.POSITIVE_INFINITY;
- }
-
- if ((b & 0xff) > (INFINITY & 0xff)) {
- return Float.NaN;
- }
-
- int exponent = ((b & 0xff) >>> 4) - EXPONENT_BIAS;
- int mantissa = b & MANTISSA_MASK;
- return mantissaToFractionTable[mantissa] * (float) Math.pow(10.0f, exponent);
- }
-
- // Cached results from byte to float conversion
- private static final float[] BYTE_TO_FLOAT_CONVERSION_TABLE = new float[256];
- private static final double[] BYTE_TO_LOG2_CONVERSION_TABLE = new double[256];
- private static final byte[] OLD_TO_NEW_BYTE_CONVERSION_TABLE = new byte[256];
-
- static {
- LogByteNormalizer normalizer = new LogByteNormalizer();
- for (int i = 0; i < 256; i++) {
- byte b = (byte) i;
- BYTE_TO_FLOAT_CONVERSION_TABLE[i] = toJavaFloatSlow(b);
- BYTE_TO_LOG2_CONVERSION_TABLE[i] =
- 0xff & normalizer.normalize(BYTE_TO_FLOAT_CONVERSION_TABLE[i]);
- if (b == 0) {
- OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = 0;
- } else if (b > 0) {
- OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] =
- toSingleBytePositiveFloat((float) normalizer.unnormLowerBound(b));
- } else {
- // should not get here.
- OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = MAX_BYTE_VALUE;
- }
- }
- }
-
- /**
- * Convert a normalized byte to the log2() version of its original value
- */
- static double toLog2Double(byte b) {
- return BYTE_TO_LOG2_CONVERSION_TABLE[b & 0xff];
- }
-}
diff --git a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx
new file mode 100644
index 000000000..23d98fd0d
Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx differ
diff --git a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java
deleted file mode 100644
index f2655e294..000000000
--- a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java
+++ /dev/null
@@ -1,150 +0,0 @@
-package com.twitter.search.common.encoding.features;
-
-import com.google.common.annotations.VisibleForTesting;
-import com.google.common.base.Preconditions;
-
-/**
- * A smart integer normalizer that converts an integer of a known range to a small integer up to
- * 8 bits long. This normalizer generates a boundary value array in the constructor as the buckets
- * for different values.
- *
- * The normalized value has a nice properties:
- * 1) it maintains the order of original value: if a > b, then normalize(a) > normalize(b).
- * 2) the value 0 is always normalized to byte 0.
- * 3) the normalized values are (almost) evenly distributed on the log scale
- * 4) no waste in code space, all possible values representable by normalized bits are used,
- * each corresponding to a different value.
- */
-public class SmartIntegerNormalizer extends ByteNormalizer {
- // The max value we want to support in this normalizer. If the input is larger than this value,
- // it's normalized as if it's the maxValue.
- private final int maxValue;
- // Number of bits used for normalized value, the largest normalized value
- // would be (1 << numBits) - 1.
- private final int numBits;
- // The inclusive lower bounds of all buckets. A normalized value k corresponds to original values
- // in the inclusive-exclusive range
- // [ boundaryValues[k], boundaryValues[k+1] )
- private final int[] boundaryValues;
- // The length of the boundaryValues array, or the number of buckets.
- private final int length;
-
- /**
- * Construct a normalizer.
- *
- * @param maxValue max value it supports, must be larger than minValue. Anything larger than this
- * would be treated as maxValue.
- * @param numBits number of bits you want to use for this normalization, between 1 and 8.
- * higher resolution for the lower numbers.
- */
- public SmartIntegerNormalizer(int maxValue, int numBits) {
- Preconditions.checkArgument(maxValue > 0);
- Preconditions.checkArgument(numBits > 0 && numBits <= 8);
-
- this.maxValue = maxValue;
- this.numBits = numBits;
-
- this.length = 1 << numBits;
- this.boundaryValues = new int[length];
-
-
- int index;
- for (index = length - 1; index >= 0; --index) {
- // values are evenly distributed on the log scale
- int boundary = (int) Math.pow(maxValue, (double) index / length);
- // we have more byte slots left than we have possible boundary values (buckets),
- // just give consecutive boundary values to all remaining slots, starting from 0.
- if (boundary <= index) {
- break;
- }
- boundaryValues[index] = boundary;
- }
- if (index >= 0) {
- for (int i = 1; i <= index; ++i) {
- boundaryValues[i] = i;
- }
- }
- boundaryValues[0] = 0; // the first one is always 0.
- }
-
- @Override
- public byte normalize(double val) {
- int intVal = (int) (val > maxValue ? maxValue : val);
- return intToUnsignedByte(binarySearch(intVal, boundaryValues));
- }
-
- /**
- * Return the lower bound of the bucket represent by norm. This simply returns the boundary
- * value indexed by current norm.
- */
- @Override
- public double unnormLowerBound(byte norm) {
- return boundaryValues[unsignedByteToInt(norm)];
- }
-
- /**
- * Return the upper bound of the bucket represent by norm. This returns the next boundary value
- * minus 1. If norm represents the last bucket, it returns the maxValue.
- */
- @Override
- public double unnormUpperBound(byte norm) {
- // if it's already the last possible normalized value, just return the corresponding last
- // boundary value.
- int intNorm = unsignedByteToInt(norm);
- if (intNorm == length - 1) {
- return maxValue;
- }
- return boundaryValues[intNorm + 1] - 1;
- }
-
- /**
- * Do a binary search on array and find the index of the item that's no bigger than value.
- */
- private static int binarySearch(int value, int[] array) {
- // corner cases
- if (value <= array[0]) {
- return 0;
- } else if (value >= array[array.length - 1]) {
- return array.length - 1;
- }
- int left = 0;
- int right = array.length - 1;
- int pivot = (left + right) >> 1;
- do {
- int midVal = array[pivot];
- if (value == midVal) {
- break;
- } else if (value > midVal) {
- left = pivot;
- } else {
- right = pivot;
- }
- pivot = (left + right) >> 1;
- } while (pivot != left);
- return pivot;
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder(String.format(
- "Smart Integer Normalizer (numBits = %d, max = %d)\n",
- this.numBits, this.maxValue));
- for (int i = 0; i < this.length; i++) {
- sb.append(String.format(
- "[%2d] boundary = %6d, range [ %6d, %6d ), norm: %4d | %4d | %4d %s\n",
- i, boundaryValues[i],
- (int) unnormLowerBound(intToUnsignedByte(i)),
- (int) unnormUpperBound(intToUnsignedByte(i)),
- unsignedByteToInt(normalize(boundaryValues[i] - 1)),
- unsignedByteToInt(normalize(boundaryValues[i])),
- unsignedByteToInt(normalize(boundaryValues[i] + 1)),
- i == boundaryValues[i] ? "*" : ""));
- }
- return sb.toString();
- }
-
- @VisibleForTesting
- int[] getBoundaryValues() {
- return boundaryValues;
- }
-}
diff --git a/src/java/com/twitter/search/common/query/BUILD b/src/java/com/twitter/search/common/query/BUILD
deleted file mode 100644
index 5c4cd6330..000000000
--- a/src/java/com/twitter/search/common/query/BUILD
+++ /dev/null
@@ -1,25 +0,0 @@
-java_library(
- sources = ["*.java"],
- platform = "java8",
- tags = ["bazel-compatible"],
- dependencies = [
- "3rdparty/jvm/com/google/code/findbugs:jsr305",
- "3rdparty/jvm/com/google/guava",
- "3rdparty/jvm/com/google/inject:guice",
- "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
- "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
- "3rdparty/jvm/org/apache/lucene:lucene-core",
- "3rdparty/jvm/org/apache/lucene:lucene-facet",
- "3rdparty/jvm/org/apache/lucene:lucene-queries",
- "3rdparty/jvm/org/apache/thrift:libthrift",
- "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
- "3rdparty/jvm/org/slf4j:slf4j-api",
- "src/java/com/twitter/search/common/features",
- "src/java/com/twitter/search/common/schema/base",
- "src/java/com/twitter/search/common/schema/earlybird",
- "src/java/com/twitter/search/common/util/analysis",
- "src/java/com/twitter/search/queryparser",
- "src/java/com/twitter/search/queryparser/query:core-query-nodes",
- "src/java/com/twitter/search/queryparser/query/search:search-query-nodes",
- ],
-)
diff --git a/src/java/com/twitter/search/common/query/BUILD.docx b/src/java/com/twitter/search/common/query/BUILD.docx
new file mode 100644
index 000000000..5ff203847
Binary files /dev/null and b/src/java/com/twitter/search/common/query/BUILD.docx differ
diff --git a/src/java/com/twitter/search/common/query/BoostUtils.docx b/src/java/com/twitter/search/common/query/BoostUtils.docx
new file mode 100644
index 000000000..570a5b18b
Binary files /dev/null and b/src/java/com/twitter/search/common/query/BoostUtils.docx differ
diff --git a/src/java/com/twitter/search/common/query/BoostUtils.java b/src/java/com/twitter/search/common/query/BoostUtils.java
deleted file mode 100644
index 10ae55942..000000000
--- a/src/java/com/twitter/search/common/query/BoostUtils.java
+++ /dev/null
@@ -1,27 +0,0 @@
-package com.twitter.search.common.query;
-
-import org.apache.lucene.search.BoostQuery;
-import org.apache.lucene.search.Query;
-
-/**
- * A class of utilities related to query boosts.
- */
-public final class BoostUtils {
- private BoostUtils() {
- }
-
- /**
- * Wraps the given query into a BoostQuery, if {@code boost} is not equal to 1.0f.
- *
- * @param query The query.
- * @param boost The boost.
- * @return If {@code boost} is equal to 1.0f, then {@code query} is returned; otherwise,
- * {@code query} is wrapped into a {@code BoostQuery} instance with the given boost.
- */
- public static Query maybeWrapInBoostQuery(Query query, float boost) {
- if (boost == 1.0f) {
- return query;
- }
- return new BoostQuery(query, boost);
- }
-}
diff --git a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx
new file mode 100644
index 000000000..47ad1ea6c
Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx differ
diff --git a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java
deleted file mode 100644
index 457ace646..000000000
--- a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java
+++ /dev/null
@@ -1,92 +0,0 @@
-package com.twitter.search.common.query;
-
-
-import java.util.Map;
-import java.util.Set;
-
-import com.google.common.base.Preconditions;
-import com.google.common.collect.Maps;
-
-import com.twitter.search.queryparser.query.BooleanQuery;
-import com.twitter.search.queryparser.query.Conjunction;
-import com.twitter.search.queryparser.query.Disjunction;
-import com.twitter.search.queryparser.query.Operator;
-import com.twitter.search.queryparser.query.Phrase;
-import com.twitter.search.queryparser.query.Query;
-import com.twitter.search.queryparser.query.QueryParserException;
-import com.twitter.search.queryparser.query.QueryVisitor;
-import com.twitter.search.queryparser.query.SpecialTerm;
-import com.twitter.search.queryparser.query.Term;
-import com.twitter.search.queryparser.query.annotation.Annotation;
-
-/**
- * Collect the nodes with a specified annotation type in the given query.
- */
-public class CollectAnnotationsVisitor extends QueryVisitor {
-
- protected final Annotation.Type type;
-
- protected final Map nodeToTypeMap = Maps.newIdentityHashMap();
-
- public CollectAnnotationsVisitor(Annotation.Type type) {
- this.type = Preconditions.checkNotNull(type);
- }
-
- @Override
- public Boolean visit(Disjunction disjunction) throws QueryParserException {
- return visitBooleanQuery(disjunction);
- }
-
- @Override
- public Boolean visit(Conjunction conjunction) throws QueryParserException {
- return visitBooleanQuery(conjunction);
- }
-
- @Override
- public Boolean visit(Phrase phrase) throws QueryParserException {
- return visitQuery(phrase);
- }
-
- @Override
- public Boolean visit(Term term) throws QueryParserException {
- return visitQuery(term);
- }
-
- @Override
- public Boolean visit(Operator operator) throws QueryParserException {
- return visitQuery(operator);
- }
-
- @Override
- public Boolean visit(SpecialTerm special) throws QueryParserException {
- return visitQuery(special);
- }
-
- protected boolean visitQuery(Query query) throws QueryParserException {
- if (query.hasAnnotationType(type)) {
- collectNode(query);
- return true;
- }
- return false;
- }
-
- protected void collectNode(Query query) {
- nodeToTypeMap.put(query, true);
- }
-
- protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
- boolean found = false;
- if (query.hasAnnotationType(type)) {
- collectNode(query);
- found = true;
- }
- for (Query child : query.getChildren()) {
- found |= child.accept(this);
- }
- return found;
- }
-
- public Set getNodes() {
- return nodeToTypeMap.keySet();
- }
-}
diff --git a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx
new file mode 100644
index 000000000..71c5ef367
Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx differ
diff --git a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java
deleted file mode 100644
index 0e135991e..000000000
--- a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java
+++ /dev/null
@@ -1,89 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.util.Map;
-import java.util.Set;
-
-import com.google.common.collect.Maps;
-
-import com.twitter.search.queryparser.query.BooleanQuery;
-import com.twitter.search.queryparser.query.Conjunction;
-import com.twitter.search.queryparser.query.Disjunction;
-import com.twitter.search.queryparser.query.Operator;
-import com.twitter.search.queryparser.query.Phrase;
-import com.twitter.search.queryparser.query.Query;
-import com.twitter.search.queryparser.query.QueryParserException;
-import com.twitter.search.queryparser.query.QueryVisitor;
-import com.twitter.search.queryparser.query.SpecialTerm;
-import com.twitter.search.queryparser.query.Term;
-
-/**
- * Collects the nodes with a specified query type in the given query.
- */
-public class CollectQueryTypeVisitor extends QueryVisitor {
-
- protected final Query.QueryType queryType;
-
- protected final Map nodeToTypeMap = Maps.newIdentityHashMap();
-
- public CollectQueryTypeVisitor(Query.QueryType queryType) {
- this.queryType = queryType;
- }
-
- @Override
- public Boolean visit(Disjunction disjunction) throws QueryParserException {
- return visitBooleanQuery(disjunction);
- }
-
- @Override
- public Boolean visit(Conjunction conjunction) throws QueryParserException {
- return visitBooleanQuery(conjunction);
- }
-
- @Override
- public Boolean visit(Phrase phrase) throws QueryParserException {
- return visitQuery(phrase);
- }
-
- @Override
- public Boolean visit(Term term) throws QueryParserException {
- return visitQuery(term);
- }
-
- @Override
- public Boolean visit(Operator operator) throws QueryParserException {
- return visitQuery(operator);
- }
-
- @Override
- public Boolean visit(SpecialTerm special) throws QueryParserException {
- return visitQuery(special);
- }
-
- public Set getCollectedNodes() {
- return nodeToTypeMap.keySet();
- }
-
- protected boolean visitQuery(Query query) throws QueryParserException {
- if (query.isTypeOf(queryType)) {
- collectNode(query);
- return true;
- }
- return false;
- }
-
- protected void collectNode(Query query) {
- nodeToTypeMap.put(query, true);
- }
-
- protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException {
- boolean found = false;
- if (query.isTypeOf(queryType)) {
- collectNode(query);
- found = true;
- }
- for (Query child : query.getChildren()) {
- found |= child.accept(this);
- }
- return found;
- }
-}
diff --git a/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx b/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx
new file mode 100644
index 000000000..39b0d59b7
Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx differ
diff --git a/src/java/com/twitter/search/common/query/CollectVariantVisitor.java b/src/java/com/twitter/search/common/query/CollectVariantVisitor.java
deleted file mode 100644
index a66961d7f..000000000
--- a/src/java/com/twitter/search/common/query/CollectVariantVisitor.java
+++ /dev/null
@@ -1,13 +0,0 @@
-package com.twitter.search.common.query;
-
-import com.twitter.search.queryparser.query.annotation.Annotation;
-
-
-/**
- * A visitor that collects the nodes that have :v annotation
- */
-public class CollectVariantVisitor extends CollectAnnotationsVisitor {
- public CollectVariantVisitor() {
- super(Annotation.Type.VARIANT);
- }
-}
diff --git a/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx b/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx
new file mode 100644
index 000000000..334bd8563
Binary files /dev/null and b/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx differ
diff --git a/src/java/com/twitter/search/common/query/DefaultFilterWeight.java b/src/java/com/twitter/search/common/query/DefaultFilterWeight.java
deleted file mode 100644
index 5fcc14433..000000000
--- a/src/java/com/twitter/search/common/query/DefaultFilterWeight.java
+++ /dev/null
@@ -1,60 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.io.IOException;
-import java.util.Set;
-
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.ConstantScoreScorer;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Weight;
-
-/**
- * An abstract Weight implementation that can be used by all "filter" classes (Query instances that
- * should not contribute to the overall query score).
- */
-public abstract class DefaultFilterWeight extends Weight {
- public DefaultFilterWeight(Query query) {
- super(query);
- }
-
- @Override
- public void extractTerms(Set terms) {
- }
-
- @Override
- public Explanation explain(LeafReaderContext context, int doc) throws IOException {
- Scorer scorer = scorer(context);
- if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) {
- return Explanation.match(0f, "Match on id " + doc);
- }
- return Explanation.match(0f, "No match on id " + doc);
- }
-
- @Override
- public Scorer scorer(LeafReaderContext context) throws IOException {
- DocIdSetIterator disi = getDocIdSetIterator(context);
- if (disi == null) {
- return null;
- }
-
- return new ConstantScoreScorer(this, 0.0f, ScoreMode.COMPLETE_NO_SCORES, disi);
- }
-
- @Override
- public boolean isCacheable(LeafReaderContext ctx) {
- return false;
- }
-
- /**
- * Returns the DocIdSetIterator over which the scorers created by this weight need to iterate.
- *
- * @param context The LeafReaderContext instance used to create the scorer.
- */
- protected abstract DocIdSetIterator getDocIdSetIterator(LeafReaderContext context)
- throws IOException;
-}
diff --git a/src/java/com/twitter/search/common/query/DocIdFilter.docx b/src/java/com/twitter/search/common/query/DocIdFilter.docx
new file mode 100644
index 000000000..ee2adcc7d
Binary files /dev/null and b/src/java/com/twitter/search/common/query/DocIdFilter.docx differ
diff --git a/src/java/com/twitter/search/common/query/DocIdFilter.java b/src/java/com/twitter/search/common/query/DocIdFilter.java
deleted file mode 100644
index fed309f86..000000000
--- a/src/java/com/twitter/search/common/query/DocIdFilter.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.io.IOException;
-import java.util.Set;
-
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.ConstantScoreScorer;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Weight;
-
-/**
- * Lucene filter on top of a known docid
- *
- */
-public class DocIdFilter extends Query {
- private final int docid;
-
- public DocIdFilter(int docid) {
- this.docid = docid;
- }
-
- @Override
- public Weight createWeight(
- IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
- return new Weight(this) {
- @Override
- public void extractTerms(Set terms) {
- }
-
- @Override
- public Explanation explain(LeafReaderContext context, int doc) throws IOException {
- Scorer scorer = scorer(context);
- if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) {
- return Explanation.match(0f, "Match on id " + doc);
- }
- return Explanation.match(0f, "No match on id " + doc);
- }
-
- @Override
- public Scorer scorer(LeafReaderContext context) throws IOException {
- return new ConstantScoreScorer(this, 0.0f, scoreMode, new SingleDocDocIdSetIterator(docid));
- }
-
- @Override
- public boolean isCacheable(LeafReaderContext ctx) {
- return true;
- }
- };
- }
-
- @Override
- public int hashCode() {
- return docid;
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof DocIdFilter)) {
- return false;
- }
-
- return docid == DocIdFilter.class.cast(obj).docid;
- }
-
- @Override
- public String toString(String field) {
- return "DOC_ID_FILTER[docId=" + docid + " + ]";
- }
-}
diff --git a/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx b/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx
new file mode 100644
index 000000000..ee86bfbb0
Binary files /dev/null and b/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx differ
diff --git a/src/java/com/twitter/search/common/query/FieldRankHitInfo.java b/src/java/com/twitter/search/common/query/FieldRankHitInfo.java
deleted file mode 100644
index f7d509719..000000000
--- a/src/java/com/twitter/search/common/query/FieldRankHitInfo.java
+++ /dev/null
@@ -1,48 +0,0 @@
-package com.twitter.search.common.query;
-
-/**
- * When a hit (on a part of the query tree) occurs, this class is passed to HitAttributeCollector
- * for collection.
- *
- * This implementation carries the following info:
- *
- * - The field that matched (the field ID is recorded)
- * - The query node that matched (the query node rank is recorded)
- * - The ID of the last doc that matched this query
- *
- *
- * Each IdentifiableQuery should be associated with one FieldRankHitInfo, which is passed to a
- * HitAttributeCollector when a hit occurs.
- */
-public class FieldRankHitInfo {
- protected static final int UNSET_DOC_ID = -1;
-
- private final int fieldId;
- private final int rank;
- private int docId = UNSET_DOC_ID;
-
- public FieldRankHitInfo(int fieldId, int rank) {
- this.fieldId = fieldId;
- this.rank = rank;
- }
-
- public int getFieldId() {
- return fieldId;
- }
-
- public int getRank() {
- return rank;
- }
-
- public int getDocId() {
- return docId;
- }
-
- public void setDocId(int docId) {
- this.docId = docId;
- }
-
- public void resetDocId() {
- this.docId = UNSET_DOC_ID;
- }
-}
diff --git a/src/java/com/twitter/search/common/query/FieldWeightUtil.docx b/src/java/com/twitter/search/common/query/FieldWeightUtil.docx
new file mode 100644
index 000000000..ff2adac3d
Binary files /dev/null and b/src/java/com/twitter/search/common/query/FieldWeightUtil.docx differ
diff --git a/src/java/com/twitter/search/common/query/FieldWeightUtil.java b/src/java/com/twitter/search/common/query/FieldWeightUtil.java
deleted file mode 100644
index dcb7d08a8..000000000
--- a/src/java/com/twitter/search/common/query/FieldWeightUtil.java
+++ /dev/null
@@ -1,205 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.util.Collections;
-import java.util.EnumSet;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import javax.annotation.Nullable;
-
-import com.google.common.base.Enums;
-import com.google.common.base.Function;
-import com.google.common.base.Functions;
-import com.google.common.base.Predicates;
-import com.google.common.collect.FluentIterable;
-import com.google.common.collect.ImmutableMap;
-import com.google.common.collect.Iterables;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.twitter.search.common.schema.base.FieldWeightDefault;
-import com.twitter.search.queryparser.query.Query;
-import com.twitter.search.queryparser.query.QueryParserException;
-import com.twitter.search.queryparser.query.annotation.Annotation;
-import com.twitter.search.queryparser.query.annotation.FieldAnnotationUtils;
-import com.twitter.search.queryparser.query.annotation.FieldNameWithBoost;
-
-public final class FieldWeightUtil {
- private static final Logger LOG = LoggerFactory.getLogger(FieldWeightUtil.class);
- private FieldWeightUtil() {
- }
-
- /**
- * Combines default field weight configuration with field annotations and returns a
- * field-to-weight map.
- *
- * @param query The query whose annotations we will look into
- * @param defaultFieldWeightMap field-to-FieldWeightDefault map
- * @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from
- * the field-to-FieldWeightDefault map
- * @param fieldNameToTyped A function that can turn string field name to typed field
- * @param The typed field
- */
- public static ImmutableMap combineDefaultWithAnnotation(
- Query query,
- Map defaultFieldWeightMap,
- Map enabledFieldWeightMap,
- Function fieldNameToTyped) throws QueryParserException {
- return combineDefaultWithAnnotation(
- query,
- defaultFieldWeightMap,
- enabledFieldWeightMap,
- fieldNameToTyped,
- Collections.emptyMap(),
- Functions.forMap(Collections.emptyMap(), ""));
- }
-
- /**
- * Combines default field weight configuration with field annotations and returns a
- * field-to-weight map. Also maps generic mappable fields to field weight boosts and resolves them
- *
- * @param query The query whose annotations we will look into
- * @param defaultFieldWeightMap field-to-FieldWeightDefault map
- * @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from
- * the field-to-FieldWeightDefault map
- * @param fieldNameToTyped A function that can turn a string field name to typed field
- * @param mappableFieldMap mapping of mappable fields to the corresponding typed fields
- * @param typedToFieldName A function that can turn a typed field into a string field name
- * @param The typed field
- *
- * Note: As a result of discussion on SEARCH-24029, we now allow replace and remove annotations
- * on a single term. See http://go/fieldweight for info on field weight annotations.
- */
- public static ImmutableMap combineDefaultWithAnnotation(
- Query query,
- Map defaultFieldWeightMap,
- Map enabledFieldWeightMap,
- Function fieldNameToTyped,
- Map mappableFieldMap,
- Function typedToFieldName) throws QueryParserException {
- List fieldAnnotations = query.getAllAnnotationsOf(Annotation.Type.FIELD);
- List mappableFieldAnnotations =
- query.getAllAnnotationsOf(Annotation.Type.MAPPABLE_FIELD);
-
- if (fieldAnnotations.isEmpty() && mappableFieldAnnotations.isEmpty()) {
- return ImmutableMap.copyOf(enabledFieldWeightMap);
- }
-
- // Convert mapped fields to field annotations
- Iterable fieldAnnotationsForMappedFields =
- FluentIterable.from(mappableFieldAnnotations)
- .transform(FieldWeightUtil.fieldAnnotationForMappableField(mappableFieldMap,
- typedToFieldName))
- .filter(Predicates.notNull());
-
- Iterable annotations =
- Iterables.concat(fieldAnnotationsForMappedFields, fieldAnnotations);
-
- // Sanitize the field annotations first, remove the ones we don't know
- // for REPLACE and REMOVE.
- List sanitizedFields = Lists.newArrayList();
- Set seenModifierTypes =
- EnumSet.noneOf(FieldNameWithBoost.FieldModifier.class);
-
- for (Annotation annotation : annotations) {
- FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) annotation.getValue();
- T typedField = fieldNameToTyped.apply(fieldNameWithBoost.getFieldName());
- FieldNameWithBoost.FieldModifier modifier = fieldNameWithBoost.getFieldModifier();
- if (defaultFieldWeightMap.containsKey(typedField)) {
- seenModifierTypes.add(modifier);
- sanitizedFields.add(fieldNameWithBoost);
- }
- }
-
- // Even if there is no mapping for a mapped annotation, if a query is replaced by an unknown
- // mapping, it should not map to other fields, so we need to detect a REPLACE annotation
- if (seenModifierTypes.isEmpty()
- && FieldAnnotationUtils.hasReplaceAnnotation(mappableFieldAnnotations)) {
- seenModifierTypes.add(FieldNameWithBoost.FieldModifier.REPLACE);
- }
-
- boolean onlyHasReplace = seenModifierTypes.size() == 1
- && seenModifierTypes.contains(FieldNameWithBoost.FieldModifier.REPLACE);
-
- // If we only have replace, start with an empty map, otherwise, start with all enabled fields.
- Map actualMap = onlyHasReplace
- ? Maps.newLinkedHashMap()
- : Maps.newLinkedHashMap(enabledFieldWeightMap);
-
- // Go over all field annotations and apply them.
- for (FieldNameWithBoost fieldAnnotation : sanitizedFields) {
- T typedField = fieldNameToTyped.apply(fieldAnnotation.getFieldName());
- FieldNameWithBoost.FieldModifier modifier = fieldAnnotation.getFieldModifier();
- switch (modifier) {
- case REMOVE:
- actualMap.remove(typedField);
- break;
-
- case ADD:
- case REPLACE:
- if (fieldAnnotation.getBoost().isPresent()) {
- actualMap.put(typedField, fieldAnnotation.getBoost().get());
- } else {
- // When annotation does not specify weight, use default weight
- actualMap.put(
- typedField,
- defaultFieldWeightMap.get(typedField).getWeight());
- }
- break;
- default:
- throw new QueryParserException("Unknown field annotation type: " + fieldAnnotation);
- }
- }
-
- return ImmutableMap.copyOf(actualMap);
- }
-
- public static ImmutableMap combineDefaultWithAnnotation(
- Query query,
- Map defaultFieldWeightMap,
- Map enabledFieldWeightMap) throws QueryParserException {
-
- return combineDefaultWithAnnotation(
- query, defaultFieldWeightMap, enabledFieldWeightMap, Functions.identity());
- }
-
- /**
- * Create an annotation of the FIELD type from annotations of the MAPPED_FIELD type
- * @param mappableFieldMap mapping of mappable fields to the corresponding typed fields
- * @param typedToFieldName A function that can turn a typed field into a string field name
- * @param The typed field
- * @return an Annotation with the same modifier and boost for a FIELD as the incoming MAPPED_FIELD
- * annotation
- */
- private static Function fieldAnnotationForMappableField(
- final Map mappableFieldMap,
- final Function typedToFieldName) {
- return new Function() {
- @Nullable
- @Override
- public Annotation apply(Annotation mappableAnnotation) {
- FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) mappableAnnotation.getValue();
- MappableField mappedField =
- Enums.getIfPresent(
- MappableField.class,
- fieldNameWithBoost.getFieldName().toUpperCase()).orNull();
- T typedFieldName = mappableFieldMap.get(mappedField);
- Annotation fieldAnnotation = null;
- if (typedFieldName != null) {
- String fieldName = typedToFieldName.apply(typedFieldName);
- FieldNameWithBoost mappedFieldBoost =
- new FieldNameWithBoost(
- fieldName,
- fieldNameWithBoost.getBoost(),
- fieldNameWithBoost.getFieldModifier());
- fieldAnnotation = Annotation.Type.FIELD.newInstance(mappedFieldBoost);
- }
- return fieldAnnotation;
- }
- };
- }
-}
diff --git a/src/java/com/twitter/search/common/query/FilteredQuery.docx b/src/java/com/twitter/search/common/query/FilteredQuery.docx
new file mode 100644
index 000000000..94d0f2c98
Binary files /dev/null and b/src/java/com/twitter/search/common/query/FilteredQuery.docx differ
diff --git a/src/java/com/twitter/search/common/query/FilteredQuery.java b/src/java/com/twitter/search/common/query/FilteredQuery.java
deleted file mode 100644
index a4740970b..000000000
--- a/src/java/com/twitter/search/common/query/FilteredQuery.java
+++ /dev/null
@@ -1,225 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.io.IOException;
-import java.util.Set;
-
-import com.google.common.base.Preconditions;
-
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Explanation;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.ScoreMode;
-import org.apache.lucene.search.Weight;
-
-/**
- * A pairing of a query and a filter. The hits traversal is driven by the query's DocIdSetIterator,
- * and the filter is used only to do post-filtering. In other words, the filter is never used to
- * find the next doc ID: it's only used to filter out the doc IDs returned by the query's
- * DocIdSetIterator. This is useful when we need to have a conjunction between a query that can
- * quickly iterate through doc IDs (eg. a posting list), and an expensive filter (eg. a filter based
- * on the values stored in a CSF).
- *
- * For example, let say we want to build a query that returns all docs that have at least 100 faves.
- * 1. One option is to go with the [min_faves 100] query. This would be very expensive though,
- * because this query would have to walk through every doc in the segment and for each one of
- * them it would have to extract the number of faves from the forward index.
- * 2. Another option is to go with a conjunction between this query and the HAS_ENGAGEMENT filter:
- * (+[min_faves 100] +[cached_filter has_engagements]). The HAS_ENGAGEMENT filter could
- * traverse the doc ID space faster (if it's backed by a posting list). But this approach would
- * still be slow, because as soon as the HAS_ENGAGEMENT filter finds a doc ID, the conjunction
- * scorer would trigger an advance(docID) call on the min_faves part of the query, which has
- * the same problem as the first option.
- * 3. Finally, a better option for this particular case would be to drive by the HAS_ENGAGEMENT
- * filter (because it can quickly jump over all docs that do not have any engagement), and use
- * the min_faves filter as a post-processing step, on a much smaller set of docs.
- */
-public class FilteredQuery extends Query {
- /**
- * A doc ID predicate that determines if the given doc ID should be accepted.
- */
- @FunctionalInterface
- public static interface DocIdFilter {
- /**
- * Determines if the given doc ID should be accepted.
- */
- boolean accept(int docId) throws IOException;
- }
-
- /**
- * A factory for creating DocIdFilter instances based on a given LeafReaderContext instance.
- */
- @FunctionalInterface
- public static interface DocIdFilterFactory {
- /**
- * Returns a DocIdFilter instance for the given LeafReaderContext instance.
- */
- DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException;
- }
-
- private static class FilteredQueryDocIdSetIterator extends DocIdSetIterator {
- private final DocIdSetIterator queryScorerIterator;
- private final DocIdFilter docIdFilter;
-
- public FilteredQueryDocIdSetIterator(
- DocIdSetIterator queryScorerIterator, DocIdFilter docIdFilter) {
- this.queryScorerIterator = Preconditions.checkNotNull(queryScorerIterator);
- this.docIdFilter = Preconditions.checkNotNull(docIdFilter);
- }
-
- @Override
- public int docID() {
- return queryScorerIterator.docID();
- }
-
- @Override
- public int nextDoc() throws IOException {
- int docId;
- do {
- docId = queryScorerIterator.nextDoc();
- } while (docId != NO_MORE_DOCS && !docIdFilter.accept(docId));
- return docId;
- }
-
- @Override
- public int advance(int target) throws IOException {
- int docId = queryScorerIterator.advance(target);
- if (docId == NO_MORE_DOCS || docIdFilter.accept(docId)) {
- return docId;
- }
- return nextDoc();
- }
-
- @Override
- public long cost() {
- return queryScorerIterator.cost();
- }
- }
-
- private static class FilteredQueryScorer extends Scorer {
- private final Scorer queryScorer;
- private final DocIdFilter docIdFilter;
-
- public FilteredQueryScorer(Weight weight, Scorer queryScorer, DocIdFilter docIdFilter) {
- super(weight);
- this.queryScorer = Preconditions.checkNotNull(queryScorer);
- this.docIdFilter = Preconditions.checkNotNull(docIdFilter);
- }
-
- @Override
- public int docID() {
- return queryScorer.docID();
- }
-
- @Override
- public float score() throws IOException {
- return queryScorer.score();
- }
-
- @Override
- public DocIdSetIterator iterator() {
- return new FilteredQueryDocIdSetIterator(queryScorer.iterator(), docIdFilter);
- }
-
- @Override
- public float getMaxScore(int upTo) throws IOException {
- return queryScorer.getMaxScore(upTo);
- }
- }
-
- private static class FilteredQueryWeight extends Weight {
- private final Weight queryWeight;
- private final DocIdFilterFactory docIdFilterFactory;
-
- public FilteredQueryWeight(
- FilteredQuery query, Weight queryWeight, DocIdFilterFactory docIdFilterFactory) {
- super(query);
- this.queryWeight = Preconditions.checkNotNull(queryWeight);
- this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory);
- }
-
- @Override
- public void extractTerms(Set terms) {
- queryWeight.extractTerms(terms);
- }
-
- @Override
- public Explanation explain(LeafReaderContext context, int doc) throws IOException {
- return queryWeight.explain(context, doc);
- }
-
- @Override
- public Scorer scorer(LeafReaderContext context) throws IOException {
- Scorer queryScorer = queryWeight.scorer(context);
- if (queryScorer == null) {
- return null;
- }
-
- return new FilteredQueryScorer(this, queryScorer, docIdFilterFactory.getDocIdFilter(context));
- }
-
- @Override
- public boolean isCacheable(LeafReaderContext ctx) {
- return queryWeight.isCacheable(ctx);
- }
- }
-
- private final Query query;
- private final DocIdFilterFactory docIdFilterFactory;
-
- public FilteredQuery(Query query, DocIdFilterFactory docIdFilterFactory) {
- this.query = Preconditions.checkNotNull(query);
- this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory);
- }
-
- public Query getQuery() {
- return query;
- }
-
- @Override
- public Query rewrite(IndexReader reader) throws IOException {
- Query rewrittenQuery = query.rewrite(reader);
- if (rewrittenQuery != query) {
- return new FilteredQuery(rewrittenQuery, docIdFilterFactory);
- }
- return this;
- }
-
- @Override
- public int hashCode() {
- return query.hashCode() * 13 + docIdFilterFactory.hashCode();
- }
-
- @Override
- public boolean equals(Object obj) {
- if (!(obj instanceof FilteredQuery)) {
- return false;
- }
-
- FilteredQuery filteredQuery = FilteredQuery.class.cast(obj);
- return query.equals(filteredQuery.query)
- && docIdFilterFactory.equals(filteredQuery.docIdFilterFactory);
- }
-
- @Override
- public String toString(String field) {
- StringBuilder sb = new StringBuilder();
- sb.append("FilteredQuery(")
- .append(query)
- .append(" -> ")
- .append(docIdFilterFactory)
- .append(")");
- return sb.toString();
- }
-
- @Override
- public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
- throws IOException {
- Weight queryWeight = Preconditions.checkNotNull(query.createWeight(searcher, scoreMode, boost));
- return new FilteredQueryWeight(this, queryWeight, docIdFilterFactory);
- }
-}
diff --git a/src/java/com/twitter/search/common/query/FilteredScorer.docx b/src/java/com/twitter/search/common/query/FilteredScorer.docx
new file mode 100644
index 000000000..1bce7324a
Binary files /dev/null and b/src/java/com/twitter/search/common/query/FilteredScorer.docx differ
diff --git a/src/java/com/twitter/search/common/query/FilteredScorer.java b/src/java/com/twitter/search/common/query/FilteredScorer.java
deleted file mode 100644
index 41d9032f6..000000000
--- a/src/java/com/twitter/search/common/query/FilteredScorer.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.io.IOException;
-
-import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.Scorer;
-import org.apache.lucene.search.Weight;
-
-public class FilteredScorer extends Scorer {
- protected final Scorer inner;
-
- public FilteredScorer(Weight weight, Scorer inner) {
- super(weight);
- this.inner = inner;
- }
-
- @Override
- public float score() throws IOException {
- return inner.score();
- }
-
- @Override
- public int docID() {
- return inner.docID();
- }
-
- @Override
- public DocIdSetIterator iterator() {
- return inner.iterator();
- }
-
- @Override
- public float getMaxScore(int upTo) throws IOException {
- return inner.getMaxScore(upTo);
- }
-}
diff --git a/src/java/com/twitter/search/common/query/HitAttributeCollector.docx b/src/java/com/twitter/search/common/query/HitAttributeCollector.docx
new file mode 100644
index 000000000..71b286bdd
Binary files /dev/null and b/src/java/com/twitter/search/common/query/HitAttributeCollector.docx differ
diff --git a/src/java/com/twitter/search/common/query/HitAttributeCollector.java b/src/java/com/twitter/search/common/query/HitAttributeCollector.java
deleted file mode 100644
index 21844aa71..000000000
--- a/src/java/com/twitter/search/common/query/HitAttributeCollector.java
+++ /dev/null
@@ -1,101 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.util.List;
-import java.util.Map;
-import java.util.function.BiFunction;
-import java.util.function.Function;
-
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.search.Query;
-
-/**
- * Not threadsafe, but should be reused across different queries unless the size of the existing
- * one is too small for a new huge serialized query.
- */
-public class HitAttributeCollector {
- private final List hitInfos = Lists.newArrayList();
- private final BiFunction hitInfoSupplier;
-
- private int docBase = 0;
-
- public HitAttributeCollector() {
- this.hitInfoSupplier = FieldRankHitInfo::new;
- }
-
- /**
- * Constructs a new {@code HitAttributionCollector} with the specified {@code FieldRankHitInfo}
- * supplier.
- *
- * @param hitInfoSupplier function to supply a {@code FieldRankHitInfo} instance
- */
- public HitAttributeCollector(BiFunction hitInfoSupplier) {
- this.hitInfoSupplier = hitInfoSupplier;
- }
-
- /**
- * Creates a new IdentifiableQuery for the given query, fieldId and rank, and "registers"
- * the fieldId and the rank with this collector.
- *
- * @param query the query to be wrapped.
- * @param fieldId the ID of the field to be searched.
- * @param rank The rank of this query.
- * @return A new IdentifiableQuery instance for the given query, fieldId and rank.
- */
- public IdentifiableQuery newIdentifiableQuery(Query query, int fieldId, int rank) {
- FieldRankHitInfo fieldRankHitInfo = hitInfoSupplier.apply(fieldId, rank);
- hitInfos.add(fieldRankHitInfo);
- return new IdentifiableQuery(query, fieldRankHitInfo, this);
- }
-
- public void clearHitAttributions(LeafReaderContext ctx, FieldRankHitInfo hitInfo) {
- docBase = ctx.docBase;
- hitInfo.resetDocId();
- }
-
- public void collectScorerAttribution(int docId, FieldRankHitInfo hitInfo) {
- hitInfo.setDocId(docId + docBase);
- }
-
- /**
- * This method should be called when a global hit occurs.
- * This method returns hit attribution summary for the whole query tree.
- * This supports getting hit attribution for only the curDoc.
- *
- * @param docId docId passed in for checking against curDoc.
- * @return Returns a map from node rank to a set of matching field IDs. This map does not contain
- * entries for ranks that did not hit at all.
- */
- public Map> getHitAttribution(int docId) {
- return getHitAttribution(docId, (fieldId) -> fieldId);
- }
-
- /**
- * This method should be called when a global hit occurs.
- * This method returns hit attribution summary for the whole query tree.
- * This supports getting hit attribution for only the curDoc.
- *
- * @param docId docId passed in for checking against curDoc.
- * @param fieldIdFunc The mapping of field IDs to objects of type T.
- * @return Returns a map from node rank to a set of matching objects (usually field IDs or names).
- * This map does not contain entries for ranks that did not hit at all.
- */
- public Map> getHitAttribution(int docId, Function fieldIdFunc) {
- int key = docId + docBase;
- Map> hitMap = Maps.newHashMap();
-
- // Manually iterate through all hitInfos elements. It's slightly faster than using an Iterator.
- for (FieldRankHitInfo hitInfo : hitInfos) {
- if (hitInfo.getDocId() == key) {
- int rank = hitInfo.getRank();
- List rankHits = hitMap.computeIfAbsent(rank, k -> Lists.newArrayList());
- T fieldDescription = fieldIdFunc.apply(hitInfo.getFieldId());
- rankHits.add(fieldDescription);
- }
- }
-
- return hitMap;
- }
-}
diff --git a/src/java/com/twitter/search/common/query/HitAttributeHelper.docx b/src/java/com/twitter/search/common/query/HitAttributeHelper.docx
new file mode 100644
index 000000000..40cab5283
Binary files /dev/null and b/src/java/com/twitter/search/common/query/HitAttributeHelper.docx differ
diff --git a/src/java/com/twitter/search/common/query/HitAttributeHelper.java b/src/java/com/twitter/search/common/query/HitAttributeHelper.java
deleted file mode 100644
index 572f7b855..000000000
--- a/src/java/com/twitter/search/common/query/HitAttributeHelper.java
+++ /dev/null
@@ -1,102 +0,0 @@
-package com.twitter.search.common.query;
-
-import java.util.List;
-import java.util.Map;
-import java.util.function.Function;
-
-import com.google.common.collect.Maps;
-
-import com.twitter.search.queryparser.query.Query;
-
-import static com.twitter.search.common.query.FieldRankHitInfo.UNSET_DOC_ID;
-
-/**
- * Generic helper class containing the data needed to set up and collect field hit attributions.
- */
-public class HitAttributeHelper implements HitAttributeProvider {
- private final HitAttributeCollector collector;
- private final Function fieldIdsToFieldNames;
-
- // This is a mapping of type T query nodes to rank id
- private final Map nodeToRankMap;
-
- // This is meant to expand individual Query nodes into multiple ranks,
- // for example, expanding a multi_term_disjunction to include a rank for each disjunction value.
- private final Map> expandedNodeToRankMap;
-
- // A single-entry cache for hit attribution, so we can reuse the immediate result. Will be used
- // only when lastDocId matches
- private ThreadLocal