diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx new file mode 100644 index 000000000..5135f363e Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.docx differ diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala deleted file mode 100644 index dd00ea126..000000000 --- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/DeciderConstants.scala +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.representation_manager.store - -import com.twitter.servo.decider.DeciderKeyEnum - -object DeciderConstants { - // Deciders inherited from CR and RSX and only used in LegacyRMS - // Their value are manipulated by CR and RSX's yml file and their decider dashboard - // We will remove them after migration completed - val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore = - "enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore" - - val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore = - "enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore" - - val enablelogFavBased20M145K2020TweetEmbeddingStoreTimeouts = - "enable_log_fav_based_tweet_embedding_20m145k2020_timeouts" - val logFavBased20M145K2020TweetEmbeddingStoreTimeoutValueMillis = - "log_fav_based_tweet_embedding_20m145k2020_timeout_value_millis" - - val enablelogFavBased20M145KUpdatedTweetEmbeddingStoreTimeouts = - "enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts" - val logFavBased20M145KUpdatedTweetEmbeddingStoreTimeoutValueMillis = - "log_fav_based_tweet_embedding_20m145kUpdated_timeout_value_millis" - - val enableSimClustersEmbeddingStoreTimeouts = "enable_sim_clusters_embedding_store_timeouts" - val simClustersEmbeddingStoreTimeoutValueMillis = - "sim_clusters_embedding_store_timeout_value_millis" -} - -// Necessary for using servo Gates -object DeciderKey extends DeciderKeyEnum { - val enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore: Value = Value( - DeciderConstants.enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore - ) - - val enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore: Value = Value( - DeciderConstants.enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore - ) -} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx new file mode 100644 index 000000000..f28e4a3b8 Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.docx differ diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala deleted file mode 100644 index cc6485b79..000000000 --- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TopicSimClustersEmbeddingStore.scala +++ /dev/null @@ -1,198 +0,0 @@ -package com.twitter.representation_manager.store - -import com.twitter.contentrecommender.store.ApeEntityEmbeddingStore -import com.twitter.contentrecommender.store.InterestsOptOutStore -import com.twitter.contentrecommender.store.SemanticCoreTopicSeedStore -import com.twitter.conversions.DurationOps._ -import com.twitter.escherbird.util.uttclient.CachedUttClientV2 -import com.twitter.finagle.memcached.Client -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.store.strato.StratoFetchableStore -import com.twitter.frigate.common.util.SeqLongInjection -import com.twitter.hermit.store.common.ObservedCachedReadableStore -import com.twitter.hermit.store.common.ObservedMemcachedReadableStore -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.interests.thriftscala.InterestsThriftService -import com.twitter.representation_manager.common.MemCacheConfig -import com.twitter.representation_manager.common.RepresentationManagerDecider -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.TopicId -import com.twitter.simclusters_v2.thriftscala.LocaleEntityId -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.{Client => StratoClient} -import com.twitter.tweetypie.util.UserId -import javax.inject.Inject - -class TopicSimClustersEmbeddingStore @Inject() ( - stratoClient: StratoClient, - cacheClient: Client, - globalStats: StatsReceiver, - mhMtlsParams: ManhattanKVClientMtlsParams, - rmsDecider: RepresentationManagerDecider, - interestService: InterestsThriftService.MethodPerEndpoint, - uttClient: CachedUttClientV2) { - - private val stats = globalStats.scope(this.getClass.getSimpleName) - private val interestsOptOutStore = InterestsOptOutStore(interestService) - - /** - * Note this is NOT an embedding store. It is a list of author account ids we use to represent - * topics - */ - private val semanticCoreTopicSeedStore: ReadableStore[ - SemanticCoreTopicSeedStore.Key, - Seq[UserId] - ] = { - /* - Up to 1000 Long seeds per topic/language = 62.5kb per topic/language (worst case) - Assume ~10k active topic/languages ~= 650MB (worst case) - */ - val underlying = new SemanticCoreTopicSeedStore(uttClient, interestsOptOutStore)( - stats.scope("semantic_core_topic_seed_store")) - - val memcacheStore = ObservedMemcachedReadableStore.fromCacheClient( - backingStore = underlying, - cacheClient = cacheClient, - ttl = 12.hours)( - valueInjection = SeqLongInjection, - statsReceiver = stats.scope("topic_producer_seed_store_mem_cache"), - keyToString = { k => s"tpss:${k.entityId}_${k.languageCode}" } - ) - - ObservedCachedReadableStore.from[SemanticCoreTopicSeedStore.Key, Seq[UserId]]( - store = memcacheStore, - ttl = 6.hours, - maxKeys = 20e3.toInt, - cacheName = "topic_producer_seed_store_cache", - windowSize = 5000 - )(stats.scope("topic_producer_seed_store_cache")) - } - - private val favBasedTfgTopicEmbedding20m145k2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - StratoFetchableStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( - stratoClient, - "recommendations/simclusters_v2/embeddings/favBasedTFGTopic20M145K2020").mapValues( - embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift) - .composeKeyMapping[LocaleEntityId] { localeEntityId => - SimClustersEmbeddingId( - FavTfgTopic, - Model20m145k2020, - InternalId.LocaleEntityId(localeEntityId)) - } - - buildLocaleEntityIdMemCacheStore(rawStore, FavTfgTopic, Model20m145k2020) - } - - private val logFavBasedApeEntity20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val apeStore = StratoFetchableStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( - stratoClient, - "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020") - .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50)) - .composeKeyMapping[UserId]({ id => - SimClustersEmbeddingId( - AggregatableLogFavBasedProducer, - Model20m145k2020, - InternalId.UserId(id)) - }) - val rawStore = new ApeEntityEmbeddingStore( - semanticCoreSeedStore = semanticCoreTopicSeedStore, - aggregatableProducerEmbeddingStore = apeStore, - statsReceiver = stats.scope("log_fav_based_ape_entity_2020_embedding_store")) - .mapValues(embedding => SimClustersEmbedding(embedding.toThrift, truncate = 50).toThrift) - .composeKeyMapping[TopicId] { topicId => - SimClustersEmbeddingId( - LogFavBasedKgoApeTopic, - Model20m145k2020, - InternalId.TopicId(topicId)) - } - - buildTopicIdMemCacheStore(rawStore, LogFavBasedKgoApeTopic, Model20m145k2020) - } - - private def buildTopicIdMemCacheStore( - rawStore: ReadableStore[TopicId, ThriftSimClustersEmbedding], - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val observedStore: ObservedReadableStore[TopicId, ThriftSimClustersEmbedding] = - ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.TopicId(topicId)) => - topicId - } - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - storeWithKeyMapping, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private def buildLocaleEntityIdMemCacheStore( - rawStore: ReadableStore[LocaleEntityId, ThriftSimClustersEmbedding], - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val observedStore: ObservedReadableStore[LocaleEntityId, ThriftSimClustersEmbedding] = - ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.LocaleEntityId(localeEntityId)) => - localeEntityId - } - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - storeWithKeyMapping, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private val underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = Map( - // Topic Embeddings - (FavTfgTopic, Model20m145k2020) -> favBasedTfgTopicEmbedding20m145k2020Store, - (LogFavBasedKgoApeTopic, Model20m145k2020) -> logFavBasedApeEntity20M145K2020EmbeddingStore, - ) - - val topicSimClustersEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - SimClustersEmbeddingStore.buildWithDecider( - underlyingStores = underlyingStores, - decider = rmsDecider.decider, - statsReceiver = stats - ) - } - -} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx new file mode 100644 index 000000000..ddeb7e935 Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.docx differ diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala deleted file mode 100644 index 857e38649..000000000 --- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/TweetSimClustersEmbeddingStore.scala +++ /dev/null @@ -1,141 +0,0 @@ -package com.twitter.representation_manager.store - -import com.twitter.finagle.memcached.Client -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.representation_manager.common.MemCacheConfig -import com.twitter.representation_manager.common.RepresentationManagerDecider -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore -import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import javax.inject.Inject - -class TweetSimClustersEmbeddingStore @Inject() ( - cacheClient: Client, - globalStats: StatsReceiver, - mhMtlsParams: ManhattanKVClientMtlsParams, - rmsDecider: RepresentationManagerDecider) { - - private val stats = globalStats.scope(this.getClass.getSimpleName) - - val logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - PersistentTweetEmbeddingStore - .longestL2NormTweetEmbeddingStoreManhattan( - mhMtlsParams, - PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset, - stats - ).mapValues(_.toThrift) - - buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145kUpdated) - } - - val logFavBasedLongestL2Tweet20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - PersistentTweetEmbeddingStore - .longestL2NormTweetEmbeddingStoreManhattan( - mhMtlsParams, - PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset, - stats - ).mapValues(_.toThrift) - - buildMemCacheStore(rawStore, LogFavLongestL2EmbeddingTweet, Model20m145k2020) - } - - val logFavBased20M145KUpdatedTweetEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - PersistentTweetEmbeddingStore - .mostRecentTweetEmbeddingStoreManhattan( - mhMtlsParams, - PersistentTweetEmbeddingStore.LogFavBased20m145kUpdatedDataset, - stats - ).mapValues(_.toThrift) - - buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145kUpdated) - } - - val logFavBased20M145K2020TweetEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - PersistentTweetEmbeddingStore - .mostRecentTweetEmbeddingStoreManhattan( - mhMtlsParams, - PersistentTweetEmbeddingStore.LogFavBased20m145k2020Dataset, - stats - ).mapValues(_.toThrift) - - buildMemCacheStore(rawStore, LogFavBasedTweet, Model20m145k2020) - } - - private def buildMemCacheStore( - rawStore: ReadableStore[TweetId, ThriftSimClustersEmbedding], - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val observedStore: ObservedReadableStore[TweetId, ThriftSimClustersEmbedding] = - ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - val storeWithKeyMapping = observedStore.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.TweetId(tweetId)) => - tweetId - } - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - storeWithKeyMapping, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private val underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = Map( - // Tweet Embeddings - (LogFavBasedTweet, Model20m145kUpdated) -> logFavBased20M145KUpdatedTweetEmbeddingStore, - (LogFavBasedTweet, Model20m145k2020) -> logFavBased20M145K2020TweetEmbeddingStore, - ( - LogFavLongestL2EmbeddingTweet, - Model20m145kUpdated) -> logFavBasedLongestL2Tweet20M145KUpdatedEmbeddingStore, - ( - LogFavLongestL2EmbeddingTweet, - Model20m145k2020) -> logFavBasedLongestL2Tweet20M145K2020EmbeddingStore, - ) - - val tweetSimClustersEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - SimClustersEmbeddingStore.buildWithDecider( - underlyingStores = underlyingStores, - decider = rmsDecider.decider, - statsReceiver = stats - ) - } - -} diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx new file mode 100644 index 000000000..ec6c2e279 Binary files /dev/null and b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.docx differ diff --git a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala b/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala deleted file mode 100644 index b416d9b17..000000000 --- a/representation-manager/server/src/main/scala/com/twitter/representation_manager/store/UserSimClustersEmbeddingStore.scala +++ /dev/null @@ -1,602 +0,0 @@ -package com.twitter.representation_manager.store - -import com.twitter.contentrecommender.twistly -import com.twitter.finagle.memcached.Client -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.store.strato.StratoFetchableStore -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.representation_manager.common.MemCacheConfig -import com.twitter.representation_manager.common.RepresentationManagerDecider -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore -import com.twitter.simclusters_v2.summingbird.stores.ProducerClusterEmbeddingReadableStores -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.getStore -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.modelVersionToDatasetMap -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.knownModelVersions -import com.twitter.simclusters_v2.summingbird.stores.UserInterestedInReadableStore.toSimClustersEmbedding -import com.twitter.simclusters_v2.thriftscala.ClustersUserIsInterestedIn -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.{SimClustersEmbedding => ThriftSimClustersEmbedding} -import com.twitter.storage.client.manhattan.kv.ManhattanKVClientMtlsParams -import com.twitter.storehaus.ReadableStore -import com.twitter.storehaus_internal.manhattan.Apollo -import com.twitter.storehaus_internal.manhattan.ManhattanCluster -import com.twitter.strato.client.{Client => StratoClient} -import com.twitter.strato.thrift.ScroogeConvImplicits._ -import com.twitter.tweetypie.util.UserId -import com.twitter.util.Future -import javax.inject.Inject - -class UserSimClustersEmbeddingStore @Inject() ( - stratoClient: StratoClient, - cacheClient: Client, - globalStats: StatsReceiver, - mhMtlsParams: ManhattanKVClientMtlsParams, - rmsDecider: RepresentationManagerDecider) { - - private val stats = globalStats.scope(this.getClass.getSimpleName) - - private val favBasedProducer20M145KUpdatedEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = ProducerClusterEmbeddingReadableStores - .getProducerTopKSimClustersEmbeddingsStore( - mhMtlsParams - ).mapValues { topSimClustersWithScore => - ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) - }.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => - userId - } - - buildMemCacheStore(rawStore, FavBasedProducer, Model20m145kUpdated) - } - - private val favBasedProducer20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = ProducerClusterEmbeddingReadableStores - .getProducerTopKSimClusters2020EmbeddingsStore( - mhMtlsParams - ).mapValues { topSimClustersWithScore => - ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) - }.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => - userId - } - - buildMemCacheStore(rawStore, FavBasedProducer, Model20m145k2020) - } - - private val followBasedProducer20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = ProducerClusterEmbeddingReadableStores - .getProducerTopKSimClustersEmbeddingsByFollowStore( - mhMtlsParams - ).mapValues { topSimClustersWithScore => - ThriftSimClustersEmbedding(topSimClustersWithScore.topClusters) - }.composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(_, _, InternalId.UserId(userId)) => - userId - } - - buildMemCacheStore(rawStore, FollowBasedProducer, Model20m145k2020) - } - - private val logFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = StratoFetchableStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( - stratoClient, - "recommendations/simclusters_v2/embeddings/logFavBasedAPE20M145K2020") - .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift) - - buildMemCacheStore(rawStore, AggregatableLogFavBasedProducer, Model20m145k2020) - } - - private val rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - ThriftSimClustersEmbedding - ] = { - StratoFetchableStore - .withUnitView[SimClustersEmbeddingId, ThriftSimClustersEmbedding]( - stratoClient, - "recommendations/simclusters_v2/embeddings/logFavBasedAPERelaxedFavEngagementThreshold20M145K2020") - .mapValues(embedding => SimClustersEmbedding(embedding, truncate = 50).toThrift) - } - - private val relaxedLogFavBasedApe20M145K2020EmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore( - rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore, - RelaxedAggregatableLogFavBasedProducer, - Model20m145k2020) - } - - private val relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = rawRelaxedLogFavBasedApe20M145K2020EmbeddingStore - .composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId( - RelaxedAggregatableLogFavBasedProducer, - Model20m145kUpdated, - internalId) => - SimClustersEmbeddingId( - RelaxedAggregatableLogFavBasedProducer, - Model20m145k2020, - internalId) - } - - buildMemCacheStore(rawStore, RelaxedAggregatableLogFavBasedProducer, Model20m145kUpdated) - } - - private val logFavBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedInFromAPE, - Model20m145k2020) - } - - private val followBasedInterestedInFromAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultIIAPESimClustersEmbeddingStoreWithMtls, - FollowBasedUserInterestedInFromAPE, - Model20m145k2020) - } - - private val favBasedUserInterestedIn20M145KUpdatedStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, - FavBasedUserInterestedIn, - Model20m145kUpdated) - } - - private val favBasedUserInterestedIn20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, - FavBasedUserInterestedIn, - Model20m145k2020) - } - - private val followBasedUserInterestedIn20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, - FollowBasedUserInterestedIn, - Model20m145k2020) - } - - private val logFavBasedUserInterestedIn20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultSimClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedIn, - Model20m145k2020) - } - - private val favBasedUserInterestedInFromPE20M145KUpdatedStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultIIPESimClustersEmbeddingStoreWithMtls, - FavBasedUserInterestedInFromPE, - Model20m145kUpdated) - } - - private val twistlyUserInterestedInStore: ReadableStore[ - SimClustersEmbeddingId, - ThriftSimClustersEmbedding - ] = { - val interestedIn20M145KUpdatedStore = { - UserInterestedInReadableStore.defaultStoreWithMtls( - mhMtlsParams, - modelVersion = ModelVersions.Model20M145KUpdated - ) - } - val interestedIn20M145K2020Store = { - UserInterestedInReadableStore.defaultStoreWithMtls( - mhMtlsParams, - modelVersion = ModelVersions.Model20M145K2020 - ) - } - val interestedInFromPE20M145KUpdatedStore = { - UserInterestedInReadableStore.defaultIIPEStoreWithMtls( - mhMtlsParams, - modelVersion = ModelVersions.Model20M145KUpdated) - } - val simClustersInterestedInStore: ReadableStore[ - (UserId, ModelVersion), - ClustersUserIsInterestedIn - ] = { - new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { - override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { - k match { - case (userId, Model20m145kUpdated) => - interestedIn20M145KUpdatedStore.get(userId) - case (userId, Model20m145k2020) => - interestedIn20M145K2020Store.get(userId) - case _ => - Future.None - } - } - } - } - val simClustersInterestedInFromProducerEmbeddingsStore: ReadableStore[ - (UserId, ModelVersion), - ClustersUserIsInterestedIn - ] = { - new ReadableStore[(UserId, ModelVersion), ClustersUserIsInterestedIn] { - override def get(k: (UserId, ModelVersion)): Future[Option[ClustersUserIsInterestedIn]] = { - k match { - case (userId, ModelVersion.Model20m145kUpdated) => - interestedInFromPE20M145KUpdatedStore.get(userId) - case _ => - Future.None - } - } - } - } - new twistly.interestedin.EmbeddingStore( - interestedInStore = simClustersInterestedInStore, - interestedInFromProducerEmbeddingStore = simClustersInterestedInFromProducerEmbeddingsStore, - statsReceiver = stats - ).mapValues(_.toThrift) - } - - private val userNextInterestedIn20m145k2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildUserInterestedInStore( - UserInterestedInReadableStore.defaultNextInterestedInStoreWithMtls, - UserNextInterestedIn, - Model20m145k2020) - } - - private val filteredUserInterestedIn20m145kUpdatedStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145kUpdated) - } - - private val filteredUserInterestedIn20m145k2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore(twistlyUserInterestedInStore, FilteredUserInterestedIn, Model20m145k2020) - } - - private val filteredUserInterestedInFromPE20m145kUpdatedStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore( - twistlyUserInterestedInStore, - FilteredUserInterestedInFromPE, - Model20m145kUpdated) - } - - private val unfilteredUserInterestedIn20m145kUpdatedStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore( - twistlyUserInterestedInStore, - UnfilteredUserInterestedIn, - Model20m145kUpdated) - } - - private val unfilteredUserInterestedIn20m145k2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - buildMemCacheStore(twistlyUserInterestedInStore, UnfilteredUserInterestedIn, Model20m145k2020) - } - - // [Experimental] User InterestedIn, generated by aggregating IIAPE embedding from AddressBook - - private val logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_maxpooling" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - private val logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_average" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedAverageAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - private val logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_booktype_maxpooling" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - private val logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_largestdim_maxpooling" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - private val logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_louvain_maxpooling" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - private val logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val datasetName = "addressbook_sims_embedding_iiape_connected_maxpooling" - val appId = "wtf_embedding_apollo" - buildUserInterestedInStoreGeneric( - simClustersEmbeddingStoreWithMtls, - LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020, - datasetName = datasetName, - appId = appId, - manhattanCluster = Apollo - ) - } - - /** - * Helper func to build a readable store for some UserInterestedIn embeddings with - * 1. A storeFunc from UserInterestedInReadableStore - * 2. EmbeddingType - * 3. ModelVersion - * 4. MemCacheConfig - * */ - private def buildUserInterestedInStore( - storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion) => ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ], - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = storeFunc(mhMtlsParams, embeddingType, modelVersion) - .mapValues(_.toThrift) - val observedStore = ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - observedStore, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private def buildUserInterestedInStoreGeneric( - storeFunc: (ManhattanKVClientMtlsParams, EmbeddingType, ModelVersion, String, String, - ManhattanCluster) => ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ], - embeddingType: EmbeddingType, - modelVersion: ModelVersion, - datasetName: String, - appId: String, - manhattanCluster: ManhattanCluster - ): ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - val rawStore = - storeFunc(mhMtlsParams, embeddingType, modelVersion, datasetName, appId, manhattanCluster) - .mapValues(_.toThrift) - val observedStore = ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - observedStore, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private def simClustersEmbeddingStoreWithMtls( - mhMtlsParams: ManhattanKVClientMtlsParams, - embeddingType: EmbeddingType, - modelVersion: ModelVersion, - datasetName: String, - appId: String, - manhattanCluster: ManhattanCluster - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - - if (!modelVersionToDatasetMap.contains(ModelVersions.toKnownForModelVersion(modelVersion))) { - throw new IllegalArgumentException( - "Unknown model version: " + modelVersion + ". Known model versions: " + knownModelVersions) - } - getStore(appId, mhMtlsParams, datasetName, manhattanCluster) - .composeKeyMapping[SimClustersEmbeddingId] { - case SimClustersEmbeddingId(theEmbeddingType, theModelVersion, InternalId.UserId(userId)) - if theEmbeddingType == embeddingType && theModelVersion == modelVersion => - userId - }.mapValues(toSimClustersEmbedding(_, embeddingType)) - } - - private def buildMemCacheStore( - rawStore: ReadableStore[SimClustersEmbeddingId, ThriftSimClustersEmbedding], - embeddingType: EmbeddingType, - modelVersion: ModelVersion - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val observedStore = ObservedReadableStore( - store = rawStore - )(stats.scope(embeddingType.name).scope(modelVersion.name)) - - MemCacheConfig.buildMemCacheStoreForSimClustersEmbedding( - observedStore, - cacheClient, - embeddingType, - modelVersion, - stats - ) - } - - private val underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = Map( - // KnownFor Embeddings - (FavBasedProducer, Model20m145kUpdated) -> favBasedProducer20M145KUpdatedEmbeddingStore, - (FavBasedProducer, Model20m145k2020) -> favBasedProducer20M145K2020EmbeddingStore, - (FollowBasedProducer, Model20m145k2020) -> followBasedProducer20M145K2020EmbeddingStore, - (AggregatableLogFavBasedProducer, Model20m145k2020) -> logFavBasedApe20M145K2020EmbeddingStore, - ( - RelaxedAggregatableLogFavBasedProducer, - Model20m145kUpdated) -> relaxedLogFavBasedApe20m145kUpdatedEmbeddingStore, - ( - RelaxedAggregatableLogFavBasedProducer, - Model20m145k2020) -> relaxedLogFavBasedApe20M145K2020EmbeddingStore, - // InterestedIn Embeddings - ( - LogFavBasedUserInterestedInFromAPE, - Model20m145k2020) -> logFavBasedInterestedInFromAPE20M145K2020Store, - ( - FollowBasedUserInterestedInFromAPE, - Model20m145k2020) -> followBasedInterestedInFromAPE20M145K2020Store, - (FavBasedUserInterestedIn, Model20m145kUpdated) -> favBasedUserInterestedIn20M145KUpdatedStore, - (FavBasedUserInterestedIn, Model20m145k2020) -> favBasedUserInterestedIn20M145K2020Store, - (FollowBasedUserInterestedIn, Model20m145k2020) -> followBasedUserInterestedIn20M145K2020Store, - (LogFavBasedUserInterestedIn, Model20m145k2020) -> logFavBasedUserInterestedIn20M145K2020Store, - ( - FavBasedUserInterestedInFromPE, - Model20m145kUpdated) -> favBasedUserInterestedInFromPE20M145KUpdatedStore, - (FilteredUserInterestedIn, Model20m145kUpdated) -> filteredUserInterestedIn20m145kUpdatedStore, - (FilteredUserInterestedIn, Model20m145k2020) -> filteredUserInterestedIn20m145k2020Store, - ( - FilteredUserInterestedInFromPE, - Model20m145kUpdated) -> filteredUserInterestedInFromPE20m145kUpdatedStore, - ( - UnfilteredUserInterestedIn, - Model20m145kUpdated) -> unfilteredUserInterestedIn20m145kUpdatedStore, - (UnfilteredUserInterestedIn, Model20m145k2020) -> unfilteredUserInterestedIn20m145k2020Store, - (UserNextInterestedIn, Model20m145k2020) -> userNextInterestedIn20m145k2020Store, - ( - LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedInterestedMaxpoolingAddressBookFromIIAPE20M145K2020Store, - ( - LogFavBasedUserInterestedAverageAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedInterestedAverageAddressBookFromIIAPE20M145K2020Store, - ( - LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE20M145K2020Store, - ( - LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE20M145K2020Store, - ( - LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE20M145K2020Store, - ( - LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020) -> logFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE20M145K2020Store, - ) - - val userSimClustersEmbeddingStore: ReadableStore[ - SimClustersEmbeddingId, - SimClustersEmbedding - ] = { - SimClustersEmbeddingStore.buildWithDecider( - underlyingStores = underlyingStores, - decider = rmsDecider.decider, - statsReceiver = stats - ) - } - -} diff --git a/representation-manager/server/src/main/thrift/BUILD b/representation-manager/server/src/main/thrift/BUILD deleted file mode 100644 index f4edb5dcb..000000000 --- a/representation-manager/server/src/main/thrift/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -create_thrift_libraries( - base_name = "thrift", - sources = [ - "com/twitter/representation_manager/service.thrift", - ], - platform = "java8", - tags = [ - "bazel-compatible", - ], - dependency_roots = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], -) diff --git a/representation-manager/server/src/main/thrift/BUILD.docx b/representation-manager/server/src/main/thrift/BUILD.docx new file mode 100644 index 000000000..4ad5aa910 Binary files /dev/null and b/representation-manager/server/src/main/thrift/BUILD.docx differ diff --git a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx new file mode 100644 index 000000000..de1661949 Binary files /dev/null and b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.docx differ diff --git a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift b/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift deleted file mode 100644 index 4eb36e999..000000000 --- a/representation-manager/server/src/main/thrift/com/twitter/representation_manager/service.thrift +++ /dev/null @@ -1,14 +0,0 @@ -namespace java com.twitter.representation_manager.thriftjava -#@namespace scala com.twitter.representation_manager.thriftscala -#@namespace strato com.twitter.representation_manager - -include "com/twitter/simclusters_v2/online_store.thrift" -include "com/twitter/simclusters_v2/identifier.thrift" - -/** - * A uniform column view for all kinds of SimClusters based embeddings. - **/ -struct SimClustersEmbeddingView { - 1: required identifier.EmbeddingType embeddingType - 2: required online_store.ModelVersion modelVersion -}(persisted = 'false', hasPersonalData = 'false') diff --git a/representation-scorer/BUILD.bazel b/representation-scorer/BUILD.bazel deleted file mode 100644 index 1624a57d4..000000000 --- a/representation-scorer/BUILD.bazel +++ /dev/null @@ -1 +0,0 @@ -# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD diff --git a/representation-scorer/BUILD.docx b/representation-scorer/BUILD.docx new file mode 100644 index 000000000..b090a5bc7 Binary files /dev/null and b/representation-scorer/BUILD.docx differ diff --git a/representation-scorer/README.docx b/representation-scorer/README.docx new file mode 100644 index 000000000..cfd244236 Binary files /dev/null and b/representation-scorer/README.docx differ diff --git a/representation-scorer/README.md b/representation-scorer/README.md deleted file mode 100644 index b74e3472f..000000000 --- a/representation-scorer/README.md +++ /dev/null @@ -1,5 +0,0 @@ -# Representation Scorer # - -**Representation Scorer** (RSX) serves as a centralized scoring system, offering SimClusters or other embedding-based scoring solutions as machine learning features. - -The Representation Scorer acquires user behavior data from the User Signal Service (USS) and extracts embeddings from the Representation Manager (RMS). It then calculates both pairwise and listwise features. These features are used at various stages, including candidate retrieval and ranking. \ No newline at end of file diff --git a/representation-scorer/bin/canary-check.docx b/representation-scorer/bin/canary-check.docx new file mode 100644 index 000000000..c56174979 Binary files /dev/null and b/representation-scorer/bin/canary-check.docx differ diff --git a/representation-scorer/bin/canary-check.sh b/representation-scorer/bin/canary-check.sh deleted file mode 100755 index cbb31f9ad..000000000 --- a/representation-scorer/bin/canary-check.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash - -export CANARY_CHECK_ROLE="representation-scorer" -export CANARY_CHECK_NAME="representation-scorer" -export CANARY_CHECK_INSTANCES="0-19" - -python3 relevance-platform/tools/canary_check.py "$@" - diff --git a/representation-scorer/bin/deploy.docx b/representation-scorer/bin/deploy.docx new file mode 100644 index 000000000..c57726f04 Binary files /dev/null and b/representation-scorer/bin/deploy.docx differ diff --git a/representation-scorer/bin/deploy.sh b/representation-scorer/bin/deploy.sh deleted file mode 100755 index 2f1ab8a69..000000000 --- a/representation-scorer/bin/deploy.sh +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env bash - -JOB=representation-scorer bazel run --ui_event_filters=-info,-stdout,-stderr --noshow_progress \ - //relevance-platform/src/main/python/deploy -- "$@" diff --git a/representation-scorer/bin/remote-debug-tunnel.docx b/representation-scorer/bin/remote-debug-tunnel.docx new file mode 100644 index 000000000..1c0381873 Binary files /dev/null and b/representation-scorer/bin/remote-debug-tunnel.docx differ diff --git a/representation-scorer/bin/remote-debug-tunnel.sh b/representation-scorer/bin/remote-debug-tunnel.sh deleted file mode 100755 index 2a6e71511..000000000 --- a/representation-scorer/bin/remote-debug-tunnel.sh +++ /dev/null @@ -1,66 +0,0 @@ -#!/bin/bash - -set -o nounset -set -eu - -DC="atla" -ROLE="$USER" -SERVICE="representation-scorer" -INSTANCE="0" -KEY="$DC/$ROLE/devel/$SERVICE/$INSTANCE" - -while test $# -gt 0; do - case "$1" in - -h|--help) - echo "$0 Set up an ssh tunnel for $SERVICE remote debugging and disable aurora health checks" - echo " " - echo "See representation-scorer/README.md for details of how to use this script, and go/remote-debug for" - echo "general information about remote debugging in Aurora" - echo " " - echo "Default instance if called with no args:" - echo " $KEY" - echo " " - echo "Positional args:" - echo " $0 [datacentre] [role] [service_name] [instance]" - echo " " - echo "Options:" - echo " -h, --help show brief help" - exit 0 - ;; - *) - break - ;; - esac -done - -if [ -n "${1-}" ]; then - DC="$1" -fi - -if [ -n "${2-}" ]; then - ROLE="$2" -fi - -if [ -n "${3-}" ]; then - SERVICE="$3" -fi - -if [ -n "${4-}" ]; then - INSTANCE="$4" -fi - -KEY="$DC/$ROLE/devel/$SERVICE/$INSTANCE" -read -p "Set up remote debugger tunnel for $KEY? (y/n) " -r CONFIRM -if [[ ! $CONFIRM =~ ^[Yy]$ ]]; then - echo "Exiting, tunnel not created" - exit 1 -fi - -echo "Disabling health check and opening tunnel. Exit with control-c when you're finished" -CMD="aurora task ssh $KEY -c 'touch .healthchecksnooze' && aurora task ssh $KEY -L '5005:debug' --ssh-options '-N -S none -v '" - -echo "Running $CMD" -eval "$CMD" - - - diff --git a/representation-scorer/docs/index.docx b/representation-scorer/docs/index.docx new file mode 100644 index 000000000..1873fb82e Binary files /dev/null and b/representation-scorer/docs/index.docx differ diff --git a/representation-scorer/docs/index.rst b/representation-scorer/docs/index.rst deleted file mode 100644 index c4fd8966d..000000000 --- a/representation-scorer/docs/index.rst +++ /dev/null @@ -1,39 +0,0 @@ -Representation Scorer (RSX) -########################### - -Overview -======== - -Representation Scorer (RSX) is a StratoFed service which serves scores for pairs of entities (User, Tweet, Topic...) based on some representation of those entities. For example, it serves User-Tweet scores based on the cosine similarity of SimClusters embeddings for each of these. It aims to provide these with low latency and at high scale, to support applications such as scoring for ANN candidate generation and feature hydration via feature store. - - -Current use cases ------------------ - -RSX currently serves traffic for the following use cases: - -- User-Tweet similarity scores for Home ranking, using SimClusters embedding dot product -- Topic-Tweet similarity scores for topical tweet candidate generation and topic social proof, using SimClusters embedding cosine similarity and CERTO scores -- Tweet-Tweet and User-Tweet similarity scores for ANN candidate generation, using SimClusters embedding cosine similarity -- (in development) User-Tweet similarity scores for Home ranking, based on various aggregations of similarities with recent faves, retweets and follows performed by the user - -Getting Started -=============== - -Fetching scores ---------------- - -Scores are served from the recommendations/representation_scorer/score column. - -Using RSX for your application ------------------------------- - -RSX may be a good fit for your application if you need scores based on combinations of SimCluster embeddings for core nouns. We also plan to support other embeddings and scoring approaches in the future. - -.. toctree:: - :maxdepth: 2 - :hidden: - - index - - diff --git a/representation-scorer/server/BUILD b/representation-scorer/server/BUILD deleted file mode 100644 index cc7325192..000000000 --- a/representation-scorer/server/BUILD +++ /dev/null @@ -1,22 +0,0 @@ -jvm_binary( - name = "bin", - basename = "representation-scorer", - main = "com.twitter.representationscorer.RepresentationScorerFedServerMain", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "finatra/inject/inject-logback/src/main/scala", - "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback", - "representation-scorer/server/src/main/resources", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer", - "twitter-server/logback-classic/src/main/scala", - ], -) - -# Aurora Workflows build phase convention requires a jvm_app named with ${project-name}-app -jvm_app( - name = "representation-scorer-app", - archive = "zip", - binary = ":bin", - tags = ["bazel-compatible"], -) diff --git a/representation-scorer/server/BUILD.docx b/representation-scorer/server/BUILD.docx new file mode 100644 index 000000000..d267cb076 Binary files /dev/null and b/representation-scorer/server/BUILD.docx differ diff --git a/representation-scorer/server/src/main/resources/BUILD b/representation-scorer/server/src/main/resources/BUILD deleted file mode 100644 index 150a224ff..000000000 --- a/representation-scorer/server/src/main/resources/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -resources( - sources = [ - "*.xml", - "*.yml", - "com/twitter/slo/slo.json", - "config/*.yml", - ], - tags = ["bazel-compatible"], -) diff --git a/representation-scorer/server/src/main/resources/BUILD.docx b/representation-scorer/server/src/main/resources/BUILD.docx new file mode 100644 index 000000000..5a66b8601 Binary files /dev/null and b/representation-scorer/server/src/main/resources/BUILD.docx differ diff --git a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx new file mode 100644 index 000000000..6294669f7 Binary files /dev/null and b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.docx differ diff --git a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json b/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json deleted file mode 100644 index 836b44058..000000000 --- a/representation-scorer/server/src/main/resources/com/twitter/slo/slo.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "servers": [ - { - "name": "strato", - "indicators": [ - { - "id": "success_rate_3m", - "indicator_type": "SuccessRateIndicator", - "duration": 3, - "duration_unit": "MINUTES" - }, { - "id": "latency_3m_p99", - "indicator_type": "LatencyIndicator", - "duration": 3, - "duration_unit": "MINUTES", - "percentile": 0.99 - } - ], - "objectives": [ - { - "indicator": "success_rate_3m", - "objective_type": "SuccessRateObjective", - "operator": ">=", - "threshold": 0.995 - }, - { - "indicator": "latency_3m_p99", - "objective_type": "LatencyObjective", - "operator": "<=", - "threshold": 50 - } - ], - "long_term_objectives": [ - { - "id": "success_rate_28_days", - "objective_type": "SuccessRateObjective", - "operator": ">=", - "threshold": 0.993, - "duration": 28, - "duration_unit": "DAYS" - }, - { - "id": "latency_p99_28_days", - "objective_type": "LatencyObjective", - "operator": "<=", - "threshold": 60, - "duration": 28, - "duration_unit": "DAYS", - "percentile": 0.99 - } - ] - } - ], - "@version": 1 -} diff --git a/representation-scorer/server/src/main/resources/config/decider.docx b/representation-scorer/server/src/main/resources/config/decider.docx new file mode 100644 index 000000000..5b7ee2751 Binary files /dev/null and b/representation-scorer/server/src/main/resources/config/decider.docx differ diff --git a/representation-scorer/server/src/main/resources/config/decider.yml b/representation-scorer/server/src/main/resources/config/decider.yml deleted file mode 100644 index 56ae90418..000000000 --- a/representation-scorer/server/src/main/resources/config/decider.yml +++ /dev/null @@ -1,155 +0,0 @@ -enableLogFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore: - comment: "Enable to use the non-empty store for logFavBasedApeEntity20M145KUpdatedEmbeddingCachedStore (from 0% to 100%). 0 means use EMPTY readable store for all requests." - default_availability: 0 - -enableLogFavBasedApeEntity20M145K2020EmbeddingCachedStore: - comment: "Enable to use the non-empty store for logFavBasedApeEntity20M145K2020EmbeddingCachedStore (from 0% to 100%). 0 means use EMPTY readable store for all requests." - default_availability: 0 - -representation-scorer_forward_dark_traffic: - comment: "Defines the percentage of traffic to forward to diffy-proxy. Set to 0 to disable dark traffic forwarding" - default_availability: 0 - -"representation-scorer_load_shed_non_prod_callers": - comment: "Discard traffic from all non-prod callers" - default_availability: 0 - -enable_log_fav_based_tweet_embedding_20m145k2020_timeouts: - comment: "If enabled, set a timeout on calls to the logFavBased20M145K2020TweetEmbeddingStore" - default_availability: 0 - -log_fav_based_tweet_embedding_20m145k2020_timeout_value_millis: - comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the logFavBased20M145K2020TweetEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_log_fav_based_tweet_embedding_20m145k2020_timeouts is true" - default_availability: 2000 - -enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts: - comment: "If enabled, set a timeout on calls to the logFavBased20M145KUpdatedTweetEmbeddingStore" - default_availability: 0 - -log_fav_based_tweet_embedding_20m145kUpdated_timeout_value_millis: - comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the logFavBased20M145KUpdatedTweetEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_log_fav_based_tweet_embedding_20m145kUpdated_timeouts is true" - default_availability: 2000 - -enable_cluster_tweet_index_store_timeouts: - comment: "If enabled, set a timeout on calls to the ClusterTweetIndexStore" - default_availability: 0 - -cluster_tweet_index_store_timeout_value_millis: - comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the ClusterTweetIndexStore, i.e. 1.50% is 150ms. Only applied if enable_cluster_tweet_index_store_timeouts is true" - default_availability: 2000 - -representation_scorer_fetch_signal_share: - comment: "If enabled, fetches share signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_reply: - comment: "If enabled, fetches reply signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_original_tweet: - comment: "If enabled, fetches original tweet signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_video_playback: - comment: "If enabled, fetches video playback signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_block: - comment: "If enabled, fetches account block signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_mute: - comment: "If enabled, fetches account mute signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_report: - comment: "If enabled, fetches tweet report signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_dont_like: - comment: "If enabled, fetches tweet don't like signals from USS" - default_availability: 0 - -representation_scorer_fetch_signal_see_fewer: - comment: "If enabled, fetches tweet see fewer signals from USS" - default_availability: 0 - -# To create a new decider, add here with the same format and caller's details : "representation-scorer_load_shed_by_caller_id_twtr:{{role}}:{{name}}:{{environment}}:{{cluster}}" -# All the deciders below are generated by this script - ./strato/bin/fed deciders ./ --service-role=representation-scorer --service-name=representation-scorer -# If you need to run the script and paste the output, add only the prod deciders here. Non-prod ones are being taken care of by representation-scorer_load_shed_non_prod_callers - -"representation-scorer_load_shed_by_caller_id_all": - comment: "Reject all traffic from caller id: all" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-canary:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-canary:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-canary:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-canary:prod:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice-send:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice-send:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:prod:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:staging:atla": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:staging:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:frigate:frigate-pushservice:staging:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:frigate:frigate-pushservice:staging:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:home-scorer:home-scorer:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:home-scorer:home-scorer:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:home-scorer:home-scorer:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:home-scorer:home-scorer:prod:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoapi:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoapi:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoserver:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:stratostore:stratoserver:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:stratostore:stratoserver:prod:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:timelinescorer:timelinescorer:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:timelinescorer:timelinescorer:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:timelinescorer:timelinescorer:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:timelinescorer:timelinescorer:prod:pdxa" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:topic-social-proof:topic-social-proof:prod:atla": - comment: "Reject all traffic from caller id: twtr:svc:topic-social-proof:topic-social-proof:prod:atla" - default_availability: 0 - -"representation-scorer_load_shed_by_caller_id_twtr:svc:topic-social-proof:topic-social-proof:prod:pdxa": - comment: "Reject all traffic from caller id: twtr:svc:topic-social-proof:topic-social-proof:prod:pdxa" - default_availability: 0 - -"enable_sim_clusters_embedding_store_timeouts": - comment: "If enabled, set a timeout on calls to the SimClustersEmbeddingStore" - default_availability: 10000 - -sim_clusters_embedding_store_timeout_value_millis: - comment: "The value of this decider defines the timeout (in milliseconds) to use on calls to the SimClustersEmbeddingStore, i.e. 1.50% is 150ms. Only applied if enable_sim_clusters_embedding_store_timeouts is true" - default_availability: 2000 diff --git a/representation-scorer/server/src/main/resources/logback.docx b/representation-scorer/server/src/main/resources/logback.docx new file mode 100644 index 000000000..e1c3b30e4 Binary files /dev/null and b/representation-scorer/server/src/main/resources/logback.docx differ diff --git a/representation-scorer/server/src/main/resources/logback.xml b/representation-scorer/server/src/main/resources/logback.xml deleted file mode 100644 index cf7028151..000000000 --- a/representation-scorer/server/src/main/resources/logback.xml +++ /dev/null @@ -1,165 +0,0 @@ - - - - - - - - - - - - - - - - - true - - - - - - - - - - - ${log.service.output} - - - ${log.service.output}.%d.gz - - 3GB - - 21 - true - - - %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n - - - - - - ${log.access.output} - - - ${log.access.output}.%d.gz - - 100MB - - 7 - true - - - ${DEFAULT_ACCESS_PATTERN}%n - - - - - - true - ${log.lens.category} - ${log.lens.index} - ${log.lens.tag}/service - - %msg - - - - - - true - ${log.lens.category} - ${log.lens.index} - ${log.lens.tag}/access - - %msg - - - - - - allow_listed_pipeline_executions.log - - - allow_listed_pipeline_executions.log.%d.gz - - 100MB - - 7 - true - - - %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n - - - - - - - - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD deleted file mode 100644 index fdb60da54..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "finagle-internal/slo/src/main/scala/com/twitter/finagle/slo", - "finatra/inject/inject-thrift-client", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns", - "strato/src/main/scala/com/twitter/strato/fed", - "strato/src/main/scala/com/twitter/strato/fed/server", - "twitter-server-internal/src/main/scala", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx new file mode 100644 index 000000000..4a3761281 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx new file mode 100644 index 000000000..8a9c4b7f3 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala deleted file mode 100644 index a0a203311..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/RepresentationScorerFedServer.scala +++ /dev/null @@ -1,38 +0,0 @@ -package com.twitter.representationscorer - -import com.google.inject.Module -import com.twitter.inject.thrift.modules.ThriftClientIdModule -import com.twitter.representationscorer.columns.ListScoreColumn -import com.twitter.representationscorer.columns.ScoreColumn -import com.twitter.representationscorer.columns.SimClustersRecentEngagementSimilarityColumn -import com.twitter.representationscorer.columns.SimClustersRecentEngagementSimilarityUserTweetEdgeColumn -import com.twitter.representationscorer.modules.CacheModule -import com.twitter.representationscorer.modules.EmbeddingStoreModule -import com.twitter.representationscorer.modules.RMSConfigModule -import com.twitter.representationscorer.modules.TimerModule -import com.twitter.representationscorer.twistlyfeatures.UserSignalServiceRecentEngagementsClientModule -import com.twitter.strato.fed._ -import com.twitter.strato.fed.server._ - -object RepresentationScorerFedServerMain extends RepresentationScorerFedServer - -trait RepresentationScorerFedServer extends StratoFedServer { - override def dest: String = "/s/representation-scorer/representation-scorer" - override val modules: Seq[Module] = - Seq( - CacheModule, - ThriftClientIdModule, - UserSignalServiceRecentEngagementsClientModule, - TimerModule, - RMSConfigModule, - EmbeddingStoreModule - ) - - override def columns: Seq[Class[_ <: StratoFed.Column]] = - Seq( - classOf[ListScoreColumn], - classOf[ScoreColumn], - classOf[SimClustersRecentEngagementSimilarityUserTweetEdgeColumn], - classOf[SimClustersRecentEngagementSimilarityColumn] - ) -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD deleted file mode 100644 index 3352a51b9..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "content-recommender/thrift/src/main/thrift:thrift-scala", - "finatra/inject/inject-core/src/main/scala", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures", - "representation-scorer/server/src/main/thrift:thrift-scala", - "strato/src/main/scala/com/twitter/strato/fed", - "strato/src/main/scala/com/twitter/strato/fed/server", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx new file mode 100644 index 000000000..d108c391f Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx new file mode 100644 index 000000000..5a3d00bde Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala deleted file mode 100644 index 3b14a491f..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/Info.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.representationscorer.columns - -import com.twitter.strato.config.{ContactInfo => StratoContactInfo} - -object Info { - val contactInfo: StratoContactInfo = StratoContactInfo( - description = "Please contact Relevance Platform team for more details", - contactEmail = "no-reply@twitter.com", - ldapGroup = "representation-scorer-admins", - jiraProject = "JIRA", - links = Seq("http://go.twitter.biz/rsx-runbook") - ) -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx new file mode 100644 index 000000000..0f87c9d1a Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala deleted file mode 100644 index 04d8b8cb1..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ListScoreColumn.scala +++ /dev/null @@ -1,116 +0,0 @@ -package com.twitter.representationscorer.columns - -import com.twitter.representationscorer.thriftscala.ListScoreId -import com.twitter.representationscorer.thriftscala.ListScoreResponse -import com.twitter.representationscorer.scorestore.ScoreStore -import com.twitter.representationscorer.thriftscala.ScoreResult -import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongInternalId -import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongSimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.Score -import com.twitter.simclusters_v2.thriftscala.ScoreId -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId -import com.twitter.stitch -import com.twitter.stitch.Stitch -import com.twitter.strato.catalog.OpMetadata -import com.twitter.strato.config.ContactInfo -import com.twitter.strato.config.Policy -import com.twitter.strato.data.Conv -import com.twitter.strato.data.Description.PlainText -import com.twitter.strato.data.Lifecycle -import com.twitter.strato.fed._ -import com.twitter.strato.thrift.ScroogeConv -import com.twitter.util.Future -import com.twitter.util.Return -import com.twitter.util.Throw -import javax.inject.Inject - -class ListScoreColumn @Inject() (scoreStore: ScoreStore) - extends StratoFed.Column("recommendations/representation_scorer/listScore") - with StratoFed.Fetch.Stitch { - - override val policy: Policy = Common.rsxReadPolicy - - override type Key = ListScoreId - override type View = Unit - override type Value = ListScoreResponse - - override val keyConv: Conv[Key] = ScroogeConv.fromStruct[ListScoreId] - override val viewConv: Conv[View] = Conv.ofType - override val valueConv: Conv[Value] = ScroogeConv.fromStruct[ListScoreResponse] - - override val contactInfo: ContactInfo = Info.contactInfo - - override val metadata: OpMetadata = OpMetadata( - lifecycle = Some(Lifecycle.Production), - description = Some( - PlainText( - "Scoring for multiple candidate entities against a single target entity" - )) - ) - - override def fetch(key: Key, view: View): Stitch[Result[Value]] = { - - val target = SimClustersEmbeddingId( - embeddingType = key.targetEmbeddingType, - modelVersion = key.modelVersion, - internalId = key.targetId - ) - val scoreIds = key.candidateIds.map { candidateId => - val candidate = SimClustersEmbeddingId( - embeddingType = key.candidateEmbeddingType, - modelVersion = key.modelVersion, - internalId = candidateId - ) - ScoreId( - algorithm = key.algorithm, - internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingPairScoreId(target, candidate) - ) - ) - } - - Stitch - .callFuture { - val (keys: Iterable[ScoreId], vals: Iterable[Future[Option[Score]]]) = - scoreStore.uniformScoringStore.multiGet(scoreIds.toSet).unzip - val results: Future[Iterable[Option[Score]]] = Future.collectToTry(vals.toSeq) map { - tryOptVals => - tryOptVals map { - case Return(Some(v)) => Some(v) - case Return(None) => None - case Throw(_) => None - } - } - val scoreMap: Future[Map[Long, Double]] = results.map { scores => - keys - .zip(scores).collect { - case ( - ScoreId( - _, - ScoreInternalId.SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingPairScoreId( - _, - LongSimClustersEmbeddingId(candidateId)))), - Some(score)) => - (candidateId, score.score) - }.toMap - } - scoreMap - } - .map { (scores: Map[Long, Double]) => - val orderedScores = key.candidateIds.collect { - case LongInternalId(id) => ScoreResult(scores.get(id)) - case _ => - // This will return None scores for candidates which don't have Long ids, but that's fine: - // at the moment we're only scoring for Tweets - ScoreResult(None) - } - found(ListScoreResponse(orderedScores)) - } - .handle { - case stitch.NotFound => missing - } - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx new file mode 100644 index 000000000..bc3d6b2c8 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala deleted file mode 100644 index 6b565288b..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/ScoreColumn.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.representationscorer.columns - -import com.twitter.contentrecommender.thriftscala.ScoringResponse -import com.twitter.representationscorer.scorestore.ScoreStore -import com.twitter.simclusters_v2.thriftscala.ScoreId -import com.twitter.stitch -import com.twitter.stitch.Stitch -import com.twitter.strato.config.ContactInfo -import com.twitter.strato.config.Policy -import com.twitter.strato.catalog.OpMetadata -import com.twitter.strato.data.Conv -import com.twitter.strato.data.Lifecycle -import com.twitter.strato.data.Description.PlainText -import com.twitter.strato.fed._ -import com.twitter.strato.thrift.ScroogeConv -import javax.inject.Inject - -class ScoreColumn @Inject() (scoreStore: ScoreStore) - extends StratoFed.Column("recommendations/representation_scorer/score") - with StratoFed.Fetch.Stitch { - - override val policy: Policy = Common.rsxReadPolicy - - override type Key = ScoreId - override type View = Unit - override type Value = ScoringResponse - - override val keyConv: Conv[Key] = ScroogeConv.fromStruct[ScoreId] - override val viewConv: Conv[View] = Conv.ofType - override val valueConv: Conv[Value] = ScroogeConv.fromStruct[ScoringResponse] - - override val contactInfo: ContactInfo = Info.contactInfo - - override val metadata: OpMetadata = OpMetadata( - lifecycle = Some(Lifecycle.Production), - description = Some(PlainText( - "The Uniform Scoring Endpoint in Representation Scorer for the Content-Recommender." + - " TDD: http://go/representation-scorer-tdd Guideline: http://go/uniform-scoring-guideline")) - ) - - override def fetch(key: Key, view: View): Stitch[Result[Value]] = - scoreStore - .uniformScoringStoreStitch(key) - .map(score => found(ScoringResponse(Some(score)))) - .handle { - case stitch.NotFound => missing - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx new file mode 100644 index 000000000..72d4182cf Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala deleted file mode 100644 index e14a67eae..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityColumn.scala +++ /dev/null @@ -1,52 +0,0 @@ -package com.twitter.representationscorer.columns - -import com.twitter.representationscorer.common.TweetId -import com.twitter.representationscorer.common.UserId -import com.twitter.representationscorer.thriftscala.RecentEngagementSimilaritiesResponse -import com.twitter.representationscorer.twistlyfeatures.Scorer -import com.twitter.stitch -import com.twitter.stitch.Stitch -import com.twitter.strato.catalog.OpMetadata -import com.twitter.strato.config.ContactInfo -import com.twitter.strato.config.Policy -import com.twitter.strato.data.Conv -import com.twitter.strato.data.Description.PlainText -import com.twitter.strato.data.Lifecycle -import com.twitter.strato.fed._ -import com.twitter.strato.thrift.ScroogeConv -import javax.inject.Inject - -class SimClustersRecentEngagementSimilarityColumn @Inject() (scorer: Scorer) - extends StratoFed.Column( - "recommendations/representation_scorer/simClustersRecentEngagementSimilarity") - with StratoFed.Fetch.Stitch { - - override val policy: Policy = Common.rsxReadPolicy - - override type Key = (UserId, Seq[TweetId]) - override type View = Unit - override type Value = RecentEngagementSimilaritiesResponse - - override val keyConv: Conv[Key] = Conv.ofType[(Long, Seq[Long])] - override val viewConv: Conv[View] = Conv.ofType - override val valueConv: Conv[Value] = - ScroogeConv.fromStruct[RecentEngagementSimilaritiesResponse] - - override val contactInfo: ContactInfo = Info.contactInfo - - override val metadata: OpMetadata = OpMetadata( - lifecycle = Some(Lifecycle.Production), - description = Some( - PlainText( - "User-Tweet scores based on the user's recent engagements for multiple tweets." - )) - ) - - override def fetch(key: Key, view: View): Stitch[Result[Value]] = - scorer - .get(key._1, key._2) - .map(results => found(RecentEngagementSimilaritiesResponse(results))) - .handle { - case stitch.NotFound => missing - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx new file mode 100644 index 000000000..77719e391 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala deleted file mode 100644 index e54d3a71b..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/columns/SimClustersRecentEngagementSimilarityUserTweetEdgeColumn.scala +++ /dev/null @@ -1,52 +0,0 @@ -package com.twitter.representationscorer.columns - -import com.twitter.representationscorer.common.TweetId -import com.twitter.representationscorer.common.UserId -import com.twitter.representationscorer.thriftscala.SimClustersRecentEngagementSimilarities -import com.twitter.representationscorer.twistlyfeatures.Scorer -import com.twitter.stitch -import com.twitter.stitch.Stitch -import com.twitter.strato.catalog.OpMetadata -import com.twitter.strato.config.ContactInfo -import com.twitter.strato.config.Policy -import com.twitter.strato.data.Conv -import com.twitter.strato.data.Description.PlainText -import com.twitter.strato.data.Lifecycle -import com.twitter.strato.fed._ -import com.twitter.strato.thrift.ScroogeConv -import javax.inject.Inject - -class SimClustersRecentEngagementSimilarityUserTweetEdgeColumn @Inject() (scorer: Scorer) - extends StratoFed.Column( - "recommendations/representation_scorer/simClustersRecentEngagementSimilarity.UserTweetEdge") - with StratoFed.Fetch.Stitch { - - override val policy: Policy = Common.rsxReadPolicy - - override type Key = (UserId, TweetId) - override type View = Unit - override type Value = SimClustersRecentEngagementSimilarities - - override val keyConv: Conv[Key] = Conv.ofType[(Long, Long)] - override val viewConv: Conv[View] = Conv.ofType - override val valueConv: Conv[Value] = - ScroogeConv.fromStruct[SimClustersRecentEngagementSimilarities] - - override val contactInfo: ContactInfo = Info.contactInfo - - override val metadata: OpMetadata = OpMetadata( - lifecycle = Some(Lifecycle.Production), - description = Some( - PlainText( - "User-Tweet scores based on the user's recent engagements" - )) - ) - - override def fetch(key: Key, view: View): Stitch[Result[Value]] = - scorer - .get(key._1, key._2) - .map(found(_)) - .handle { - case stitch.NotFound => missing - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD deleted file mode 100644 index 018cef9eb..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD +++ /dev/null @@ -1,9 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "decider/src/main/scala", - "src/scala/com/twitter/simclusters_v2/common", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx new file mode 100644 index 000000000..e153443a2 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx new file mode 100644 index 000000000..5ba2973a9 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala deleted file mode 100644 index 838835616..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/DeciderConstants.scala +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.representationscorer - -object DeciderConstants { - val enableSimClustersEmbeddingStoreTimeouts = "enable_sim_clusters_embedding_store_timeouts" - val simClustersEmbeddingStoreTimeoutValueMillis = - "sim_clusters_embedding_store_timeout_value_millis" -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx new file mode 100644 index 000000000..b1fbcd983 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala deleted file mode 100644 index 5aa4b4f2c..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/RepresentationScorerDecider.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.representationscorer.common - -import com.twitter.decider.Decider -import com.twitter.decider.RandomRecipient -import com.twitter.decider.Recipient -import com.twitter.simclusters_v2.common.DeciderGateBuilderWithIdHashing -import javax.inject.Inject -import javax.inject.Singleton - -@Singleton -case class RepresentationScorerDecider @Inject() (decider: Decider) { - - val deciderGateBuilder = new DeciderGateBuilderWithIdHashing(decider) - - def isAvailable(feature: String, recipient: Option[Recipient]): Boolean = { - decider.isAvailable(feature, recipient) - } - - /** - * When useRandomRecipient is set to false, the decider is either completely on or off. - * When useRandomRecipient is set to true, the decider is on for the specified % of traffic. - */ - def isAvailable(feature: String, useRandomRecipient: Boolean = true): Boolean = { - if (useRandomRecipient) isAvailable(feature, Some(RandomRecipient)) - else isAvailable(feature, None) - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx new file mode 100644 index 000000000..317943c84 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala deleted file mode 100644 index c5bf9c60a..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/common/package.scala +++ /dev/null @@ -1,6 +0,0 @@ -package com.twitter.representationscorer - -package object common { - type UserId = Long - type TweetId = Long -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD deleted file mode 100644 index c73f2a68e..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle/finagle-stats", - "finatra/inject/inject-core/src/main/scala", - "representation-manager/client/src/main/scala/com/twitter/representation_manager", - "representation-manager/client/src/main/scala/com/twitter/representation_manager/config", - "representation-manager/server/src/main/scala/com/twitter/representation_manager/migration", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common", - "servo/util", - "src/scala/com/twitter/simclusters_v2/stores", - "src/scala/com/twitter/storehaus_internal/memcache", - "src/scala/com/twitter/storehaus_internal/util", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx new file mode 100644 index 000000000..893395a81 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx new file mode 100644 index 000000000..a8f23b243 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala deleted file mode 100644 index b8b815872..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/CacheModule.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.representationscorer.modules - -import com.google.inject.Provides -import com.twitter.finagle.memcached.Client -import javax.inject.Singleton -import com.twitter.conversions.DurationOps._ -import com.twitter.inject.TwitterModule -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.storehaus_internal.memcache.MemcacheStore -import com.twitter.storehaus_internal.util.ClientName -import com.twitter.storehaus_internal.util.ZkEndPoint - -object CacheModule extends TwitterModule { - - private val cacheDest = flag[String]("cache_module.dest", "Path to memcache service") - private val timeout = flag[Int]("memcache.timeout", "Memcache client timeout") - private val retries = flag[Int]("memcache.retries", "Memcache timeout retries") - - @Singleton - @Provides - def providesCache( - serviceIdentifier: ServiceIdentifier, - stats: StatsReceiver - ): Client = - MemcacheStore.memcachedClient( - name = ClientName("memcache_representation_manager"), - dest = ZkEndPoint(cacheDest()), - timeout = timeout().milliseconds, - retries = retries(), - statsReceiver = stats.scope("cache_client"), - serviceIdentifier = serviceIdentifier - ) -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx new file mode 100644 index 000000000..b0b62f21c Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala deleted file mode 100644 index bff5d491c..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/EmbeddingStoreModule.scala +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.representationscorer.modules - -import com.google.inject.Provides -import com.twitter.decider.Decider -import com.twitter.finagle.memcached.{Client => MemcachedClient} -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.finagle.thrift.ClientId -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.inject.TwitterModule -import com.twitter.relevance_platform.common.readablestore.ReadableStoreWithTimeout -import com.twitter.representation_manager.migration.LegacyRMS -import com.twitter.representationscorer.DeciderConstants -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Timer -import javax.inject.Singleton - -object EmbeddingStoreModule extends TwitterModule { - @Singleton - @Provides - def providesEmbeddingStore( - memCachedClient: MemcachedClient, - serviceIdentifier: ServiceIdentifier, - clientId: ClientId, - timer: Timer, - decider: Decider, - stats: StatsReceiver - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val cacheHashKeyPrefix: String = "RMS" - val embeddingStoreClient = new LegacyRMS( - serviceIdentifier, - memCachedClient, - stats, - decider, - clientId, - timer, - cacheHashKeyPrefix - ) - - val underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = Map( - // Tweet Embeddings - ( - LogFavBasedTweet, - Model20m145k2020) -> embeddingStoreClient.logFavBased20M145K2020TweetEmbeddingStore, - ( - LogFavLongestL2EmbeddingTweet, - Model20m145k2020) -> embeddingStoreClient.logFavBasedLongestL2Tweet20M145K2020EmbeddingStore, - // InterestedIn Embeddings - ( - LogFavBasedUserInterestedInFromAPE, - Model20m145k2020) -> embeddingStoreClient.LogFavBasedInterestedInFromAPE20M145K2020Store, - ( - FavBasedUserInterestedIn, - Model20m145k2020) -> embeddingStoreClient.favBasedUserInterestedIn20M145K2020Store, - // Author Embeddings - ( - FavBasedProducer, - Model20m145k2020) -> embeddingStoreClient.favBasedProducer20M145K2020EmbeddingStore, - // Entity Embeddings - ( - LogFavBasedKgoApeTopic, - Model20m145k2020) -> embeddingStoreClient.logFavBasedApeEntity20M145K2020EmbeddingCachedStore, - (FavTfgTopic, Model20m145k2020) -> embeddingStoreClient.favBasedTfgTopicEmbedding2020Store, - ) - - val simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - val underlying: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = - SimClustersEmbeddingStore.buildWithDecider( - underlyingStores = underlyingStores, - decider = decider, - statsReceiver = stats.scope("simClusters_embeddings_store_deciderable") - ) - - val underlyingWithTimeout: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = - new ReadableStoreWithTimeout( - rs = underlying, - decider = decider, - enableTimeoutDeciderKey = DeciderConstants.enableSimClustersEmbeddingStoreTimeouts, - timeoutValueKey = DeciderConstants.simClustersEmbeddingStoreTimeoutValueMillis, - timer = timer, - statsReceiver = stats.scope("simClusters_embedding_store_timeouts") - ) - - ObservedReadableStore( - store = underlyingWithTimeout - )(stats.scope("simClusters_embeddings_store")) - } - simClustersEmbeddingStore - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx new file mode 100644 index 000000000..77ee14dbd Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala deleted file mode 100644 index 08ac0cb93..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/RMSConfigModule.scala +++ /dev/null @@ -1,63 +0,0 @@ -package com.twitter.representationscorer.modules - -import com.google.inject.Provides -import com.twitter.conversions.DurationOps._ -import com.twitter.inject.TwitterModule -import com.twitter.representation_manager.config.ClientConfig -import com.twitter.representation_manager.config.EnabledInMemoryCacheParams -import com.twitter.representation_manager.config.InMemoryCacheParams -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import javax.inject.Singleton - -object RMSConfigModule extends TwitterModule { - def getCacheName(embedingType: EmbeddingType, modelVersion: ModelVersion): String = - s"${embedingType.name}_${modelVersion.name}_in_mem_cache" - - @Singleton - @Provides - def providesRMSClientConfig: ClientConfig = { - val cacheParamsMap: Map[ - (EmbeddingType, ModelVersion), - InMemoryCacheParams - ] = Map( - // Tweet Embeddings - (LogFavBasedTweet, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 10.minutes, - maxKeys = 1048575, // 800MB - cacheName = getCacheName(LogFavBasedTweet, Model20m145k2020)), - (LogFavLongestL2EmbeddingTweet, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 5.minute, - maxKeys = 1048575, // 800MB - cacheName = getCacheName(LogFavLongestL2EmbeddingTweet, Model20m145k2020)), - // User - KnownFor Embeddings - (FavBasedProducer, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 1.day, - maxKeys = 500000, // 400MB - cacheName = getCacheName(FavBasedProducer, Model20m145k2020)), - // User - InterestedIn Embeddings - (LogFavBasedUserInterestedInFromAPE, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 6.hours, - maxKeys = 262143, - cacheName = getCacheName(LogFavBasedUserInterestedInFromAPE, Model20m145k2020)), - (FavBasedUserInterestedIn, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 6.hours, - maxKeys = 262143, - cacheName = getCacheName(FavBasedUserInterestedIn, Model20m145k2020)), - // Topic Embeddings - (FavTfgTopic, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 12.hours, - maxKeys = 262143, // 200MB - cacheName = getCacheName(FavTfgTopic, Model20m145k2020)), - (LogFavBasedKgoApeTopic, Model20m145k2020) -> EnabledInMemoryCacheParams( - ttl = 6.hours, - maxKeys = 262143, - cacheName = getCacheName(LogFavBasedKgoApeTopic, Model20m145k2020)), - ) - - new ClientConfig(inMemCacheParamsOverrides = cacheParamsMap) - } - -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx new file mode 100644 index 000000000..d5c0e25b2 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala deleted file mode 100644 index b425d516a..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/modules/TimerModule.scala +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.representationscorer.modules - -import com.google.inject.Provides -import com.twitter.finagle.util.DefaultTimer -import com.twitter.inject.TwitterModule -import com.twitter.util.Timer -import javax.inject.Singleton - -object TimerModule extends TwitterModule { - @Singleton - @Provides - def providesTimer: Timer = DefaultTimer -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD deleted file mode 100644 index 3c259cfc4..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD +++ /dev/null @@ -1,19 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/util", - "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/injection", - "representation-manager/client/src/main/scala/com/twitter/representation_manager", - "representation-manager/client/src/main/scala/com/twitter/representation_manager/config", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common", - "src/scala/com/twitter/simclusters_v2/score", - "src/scala/com/twitter/topic_recos/common", - "src/scala/com/twitter/topic_recos/stores", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "src/thrift/com/twitter/topic_recos:topic_recos-thrift-scala", - "stitch/stitch-storehaus", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx new file mode 100644 index 000000000..4cb511845 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx new file mode 100644 index 000000000..ad63f4db0 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala deleted file mode 100644 index db7cbefa9..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/ScoreStore.scala +++ /dev/null @@ -1,168 +0,0 @@ -package com.twitter.representationscorer.scorestore - -import com.twitter.bijection.scrooge.BinaryScalaCodec -import com.twitter.conversions.DurationOps._ -import com.twitter.finagle.memcached.Client -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.hashing.KeyHasher -import com.twitter.hermit.store.common.ObservedCachedReadableStore -import com.twitter.hermit.store.common.ObservedMemcachedReadableStore -import com.twitter.hermit.store.common.ObservedReadableStore -import com.twitter.relevance_platform.common.injection.LZ4Injection -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.score.ScoreFacadeStore -import com.twitter.simclusters_v2.score.SimClustersEmbeddingPairScoreStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType.FavTfgTopic -import com.twitter.simclusters_v2.thriftscala.EmbeddingType.LogFavBasedKgoApeTopic -import com.twitter.simclusters_v2.thriftscala.EmbeddingType.LogFavBasedTweet -import com.twitter.simclusters_v2.thriftscala.ModelVersion.Model20m145kUpdated -import com.twitter.simclusters_v2.thriftscala.Score -import com.twitter.simclusters_v2.thriftscala.ScoreId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.stitch.storehaus.StitchOfReadableStore -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.{Client => StratoClient} -import com.twitter.topic_recos.stores.CertoTweetTopicScoresStore -import javax.inject.Inject -import javax.inject.Singleton - -@Singleton() -class ScoreStore @Inject() ( - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - stratoClient: StratoClient, - representationScorerCacheClient: Client, - stats: StatsReceiver) { - - private val keyHasher = KeyHasher.FNV1A_64 - private val statsReceiver = stats.scope("score_store") - - /** ** Score Store *****/ - private val simClustersEmbeddingCosineSimilarityScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildCosineSimilarityStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_cosine_similarity_score_store")) - - private val simClustersEmbeddingDotProductScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildDotProductStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_dot_product_score_store")) - - private val simClustersEmbeddingJaccardSimilarityScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildJaccardSimilarityStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_jaccard_similarity_score_store")) - - private val simClustersEmbeddingEuclideanDistanceScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildEuclideanDistanceStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_euclidean_distance_score_store")) - - private val simClustersEmbeddingManhattanDistanceScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildManhattanDistanceStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_manhattan_distance_score_store")) - - private val simClustersEmbeddingLogCosineSimilarityScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildLogCosineSimilarityStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_log_cosine_similarity_score_store")) - - private val simClustersEmbeddingExpScaledCosineSimilarityScoreStore = - ObservedReadableStore( - SimClustersEmbeddingPairScoreStore - .buildExpScaledCosineSimilarityStore(simClustersEmbeddingStore) - .toThriftStore - )(statsReceiver.scope("simClusters_embedding_exp_scaled_cosine_similarity_score_store")) - - // Use the default setting - private val topicTweetRankingScoreStore = - TopicTweetRankingScoreStore.buildTopicTweetRankingStore( - FavTfgTopic, - LogFavBasedKgoApeTopic, - LogFavBasedTweet, - Model20m145kUpdated, - consumerEmbeddingMultiplier = 1.0, - producerEmbeddingMultiplier = 1.0 - ) - - private val topicTweetsCortexThresholdStore = TopicTweetsCosineSimilarityAggregateStore( - TopicTweetsCosineSimilarityAggregateStore.DefaultScoreKeys, - statsReceiver.scope("topic_tweets_cortex_threshold_store") - ) - - val topicTweetCertoScoreStore: ObservedCachedReadableStore[ScoreId, Score] = { - val underlyingStore = ObservedReadableStore( - TopicTweetCertoScoreStore(CertoTweetTopicScoresStore.prodStore(stratoClient)) - )(statsReceiver.scope("topic_tweet_certo_score_store")) - - val memcachedStore = ObservedMemcachedReadableStore - .fromCacheClient( - backingStore = underlyingStore, - cacheClient = representationScorerCacheClient, - ttl = 10.minutes - )( - valueInjection = LZ4Injection.compose(BinaryScalaCodec(Score)), - statsReceiver = statsReceiver.scope("topic_tweet_certo_store_memcache"), - keyToString = { k: ScoreId => - s"certocs:${keyHasher.hashKey(k.toString.getBytes)}" - } - ) - - ObservedCachedReadableStore.from[ScoreId, Score]( - memcachedStore, - ttl = 5.minutes, - maxKeys = 1000000, - cacheName = "topic_tweet_certo_store_cache", - windowSize = 10000L - )(statsReceiver.scope("topic_tweet_certo_store_cache")) - } - - val uniformScoringStore: ReadableStore[ScoreId, Score] = - ScoreFacadeStore.buildWithMetrics( - readableStores = Map( - ScoringAlgorithm.PairEmbeddingCosineSimilarity -> - simClustersEmbeddingCosineSimilarityScoreStore, - ScoringAlgorithm.PairEmbeddingDotProduct -> - simClustersEmbeddingDotProductScoreStore, - ScoringAlgorithm.PairEmbeddingJaccardSimilarity -> - simClustersEmbeddingJaccardSimilarityScoreStore, - ScoringAlgorithm.PairEmbeddingEuclideanDistance -> - simClustersEmbeddingEuclideanDistanceScoreStore, - ScoringAlgorithm.PairEmbeddingManhattanDistance -> - simClustersEmbeddingManhattanDistanceScoreStore, - ScoringAlgorithm.PairEmbeddingLogCosineSimilarity -> - simClustersEmbeddingLogCosineSimilarityScoreStore, - ScoringAlgorithm.PairEmbeddingExpScaledCosineSimilarity -> - simClustersEmbeddingExpScaledCosineSimilarityScoreStore, - // Certo normalized cosine score between topic-tweet pairs - ScoringAlgorithm.CertoNormalizedCosineScore - -> topicTweetCertoScoreStore, - // Certo normalized dot-product score between topic-tweet pairs - ScoringAlgorithm.CertoNormalizedDotProductScore - -> topicTweetCertoScoreStore - ), - aggregatedStores = Map( - ScoringAlgorithm.WeightedSumTopicTweetRanking -> - topicTweetRankingScoreStore, - ScoringAlgorithm.CortexTopicTweetLabel -> - topicTweetsCortexThresholdStore, - ), - statsReceiver = stats - ) - - val uniformScoringStoreStitch: ScoreId => com.twitter.stitch.Stitch[Score] = - StitchOfReadableStore(uniformScoringStore) -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx new file mode 100644 index 000000000..3e88f4109 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala deleted file mode 100644 index b6216985f..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetCertoScoreStore.scala +++ /dev/null @@ -1,106 +0,0 @@ -package com.twitter.representationscorer.scorestore - -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId.GenericPairScoreId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CertoNormalizedDotProductScore -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CertoNormalizedCosineScore -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.TopicId -import com.twitter.simclusters_v2.thriftscala.{Score => ThriftScore} -import com.twitter.simclusters_v2.thriftscala.{ScoreId => ThriftScoreId} -import com.twitter.storehaus.FutureOps -import com.twitter.storehaus.ReadableStore -import com.twitter.topic_recos.thriftscala.Scores -import com.twitter.topic_recos.thriftscala.TopicToScores -import com.twitter.util.Future - -/** - * Score store to get Certo scores. - * Currently, the store supports two Scoring Algorithms (i.e., two types of Certo scores): - * 1. NormalizedDotProduct - * 2. NormalizedCosine - * Querying with corresponding scoring algorithms results in different Certo scores. - */ -case class TopicTweetCertoScoreStore(certoStratoStore: ReadableStore[TweetId, TopicToScores]) - extends ReadableStore[ThriftScoreId, ThriftScore] { - - override def multiGet[K1 <: ThriftScoreId](ks: Set[K1]): Map[K1, Future[Option[ThriftScore]]] = { - val tweetIds = - ks.map(_.internalId).collect { - case GenericPairScoreId(scoreId) => - ((scoreId.id1, scoreId.id2): @annotation.nowarn( - "msg=may not be exhaustive|max recursion depth")) match { - case (InternalId.TweetId(tweetId), _) => tweetId - case (_, InternalId.TweetId(tweetId)) => tweetId - } - } - - val result = for { - certoScores <- Future.collect(certoStratoStore.multiGet(tweetIds)) - } yield { - ks.map { k => - (k.algorithm, k.internalId) match { - case (CertoNormalizedDotProductScore, GenericPairScoreId(scoreId)) => - (scoreId.id1, scoreId.id2) match { - case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) => - ( - k, - extractScore( - tweetId, - topicId, - certoScores, - _.followerL2NormalizedDotProduct8HrHalfLife)) - case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) => - ( - k, - extractScore( - tweetId, - topicId, - certoScores, - _.followerL2NormalizedDotProduct8HrHalfLife)) - case _ => (k, None) - } - case (CertoNormalizedCosineScore, GenericPairScoreId(scoreId)) => - (scoreId.id1, scoreId.id2) match { - case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) => - ( - k, - extractScore( - tweetId, - topicId, - certoScores, - _.followerL2NormalizedCosineSimilarity8HrHalfLife)) - case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) => - ( - k, - extractScore( - tweetId, - topicId, - certoScores, - _.followerL2NormalizedCosineSimilarity8HrHalfLife)) - case _ => (k, None) - } - case _ => (k, None) - } - }.toMap - } - FutureOps.liftValues(ks, result) - } - - /** - * Given tweetToCertoScores, extract certain Certo score between the given tweetId and topicId. - * The Certo score of interest is specified using scoreExtractor. - */ - def extractScore( - tweetId: TweetId, - topicId: TopicId, - tweetToCertoScores: Map[TweetId, Option[TopicToScores]], - scoreExtractor: Scores => Double - ): Option[ThriftScore] = { - tweetToCertoScores.get(tweetId).flatMap { - case Some(topicToScores) => - topicToScores.topicToScores.flatMap(_.get(topicId).map(scoreExtractor).map(ThriftScore(_))) - case _ => Some(ThriftScore(0.0)) - } - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx new file mode 100644 index 000000000..228aa2528 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala deleted file mode 100644 index 9ff502fd6..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetRankingScoreStore.scala +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.representationscorer.scorestore - -import com.twitter.simclusters_v2.score.WeightedSumAggregatedScoreStore -import com.twitter.simclusters_v2.score.WeightedSumAggregatedScoreStore.WeightedSumAggregatedScoreParameter -import com.twitter.simclusters_v2.thriftscala.{EmbeddingType, ModelVersion, ScoringAlgorithm} - -object TopicTweetRankingScoreStore { - val producerEmbeddingScoreMultiplier = 1.0 - val consumerEmbeddingScoreMultiplier = 1.0 - - /** - * Build the scoring store for TopicTweet Ranking based on Default Multipliers. - * If you want to compare the ranking between different multipliers, register a new - * ScoringAlgorithm and let the upstream uses different scoringAlgorithm by params. - */ - def buildTopicTweetRankingStore( - consumerEmbeddingType: EmbeddingType, - producerEmbeddingType: EmbeddingType, - tweetEmbeddingType: EmbeddingType, - modelVersion: ModelVersion, - consumerEmbeddingMultiplier: Double = consumerEmbeddingScoreMultiplier, - producerEmbeddingMultiplier: Double = producerEmbeddingScoreMultiplier - ): WeightedSumAggregatedScoreStore = { - WeightedSumAggregatedScoreStore( - List( - WeightedSumAggregatedScoreParameter( - ScoringAlgorithm.PairEmbeddingCosineSimilarity, - consumerEmbeddingMultiplier, - WeightedSumAggregatedScoreStore.genericPairScoreIdToSimClustersEmbeddingPairScoreId( - consumerEmbeddingType, - tweetEmbeddingType, - modelVersion - ) - ), - WeightedSumAggregatedScoreParameter( - ScoringAlgorithm.PairEmbeddingCosineSimilarity, - producerEmbeddingMultiplier, - WeightedSumAggregatedScoreStore.genericPairScoreIdToSimClustersEmbeddingPairScoreId( - producerEmbeddingType, - tweetEmbeddingType, - modelVersion - ) - ) - ) - ) - } - -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx new file mode 100644 index 000000000..d024c0dbe Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala deleted file mode 100644 index f835158b8..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore/TopicTweetsCosineSimilarityAggregateStore.scala +++ /dev/null @@ -1,148 +0,0 @@ -package com.twitter.representationscorer.scorestore - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.util.StatsUtil -import com.twitter.representationscorer.scorestore.TopicTweetsCosineSimilarityAggregateStore.ScoreKey -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.score.AggregatedScoreStore -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId.GenericPairScoreId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm.CortexTopicTweetLabel -import com.twitter.simclusters_v2.thriftscala.{ - EmbeddingType, - InternalId, - ModelVersion, - ScoreInternalId, - ScoringAlgorithm, - SimClustersEmbeddingId, - TopicId, - Score => ThriftScore, - ScoreId => ThriftScoreId, - SimClustersEmbeddingPairScoreId => ThriftSimClustersEmbeddingPairScoreId -} -import com.twitter.storehaus.ReadableStore -import com.twitter.topic_recos.common.Configs.{DefaultModelVersion, MinCosineSimilarityScore} -import com.twitter.topic_recos.common._ -import com.twitter.util.Future - -/** - * Calculates the cosine similarity scores of arbitrary combinations of TopicEmbeddings and - * TweetEmbeddings. - * The class has 2 uses: - * 1. For internal uses. TSP will call this store to fetch the raw scores for (topic, tweet) with - * all available embedding types. We calculate all the scores here, so the caller can do filtering - * & score caching on their side. This will make it possible to DDG different embedding scores. - * - * 2. For external calls from Cortex. We return true (or 1.0) for any given (topic, tweet) if their - * cosine similarity passes the threshold for any of the embedding types. - * The expected input type is - * ScoreId( - * PairEmbeddingCosineSimilarity, - * GenericPairScoreId(TopicId, TweetId) - * ) - */ -case class TopicTweetsCosineSimilarityAggregateStore( - scoreKeys: Seq[ScoreKey], - statsReceiver: StatsReceiver) - extends AggregatedScoreStore { - - def toCortexScore(scoresMap: Map[ScoreKey, Double]): Double = { - val passThreshold = scoresMap.exists { - case (_, score) => score >= MinCosineSimilarityScore - } - if (passThreshold) 1.0 else 0.0 - } - - /** - * To be called by Cortex through Unified Score API ONLY. Calculates all possible (topic, tweet), - * return 1.0 if any of the embedding scores passes the minimum threshold. - * - * Expect a GenericPairScoreId(PairEmbeddingCosineSimilarity, (TopicId, TweetId)) as input - */ - override def get(k: ThriftScoreId): Future[Option[ThriftScore]] = { - StatsUtil.trackOptionStats(statsReceiver) { - (k.algorithm, k.internalId) match { - case (CortexTopicTweetLabel, GenericPairScoreId(genericPairScoreId)) => - (genericPairScoreId.id1, genericPairScoreId.id2) match { - case (InternalId.TopicId(topicId), InternalId.TweetId(tweetId)) => - TopicTweetsCosineSimilarityAggregateStore - .getRawScoresMap(topicId, tweetId, scoreKeys, scoreFacadeStore) - .map { scoresMap => Some(ThriftScore(toCortexScore(scoresMap))) } - case (InternalId.TweetId(tweetId), InternalId.TopicId(topicId)) => - TopicTweetsCosineSimilarityAggregateStore - .getRawScoresMap(topicId, tweetId, scoreKeys, scoreFacadeStore) - .map { scoresMap => Some(ThriftScore(toCortexScore(scoresMap))) } - case _ => - Future.None - // Do not accept other InternalId combinations - } - case _ => - // Do not accept other Id types for now - Future.None - } - } - } -} - -object TopicTweetsCosineSimilarityAggregateStore { - - val TopicEmbeddingTypes: Seq[EmbeddingType] = - Seq( - EmbeddingType.FavTfgTopic, - EmbeddingType.LogFavBasedKgoApeTopic - ) - - // Add the new embedding types if want to test the new Tweet embedding performance. - val TweetEmbeddingTypes: Seq[EmbeddingType] = Seq(EmbeddingType.LogFavBasedTweet) - - val ModelVersions: Seq[ModelVersion] = - Seq(DefaultModelVersion) - - val DefaultScoreKeys: Seq[ScoreKey] = { - for { - modelVersion <- ModelVersions - topicEmbeddingType <- TopicEmbeddingTypes - tweetEmbeddingType <- TweetEmbeddingTypes - } yield { - ScoreKey( - topicEmbeddingType = topicEmbeddingType, - tweetEmbeddingType = tweetEmbeddingType, - modelVersion = modelVersion - ) - } - } - case class ScoreKey( - topicEmbeddingType: EmbeddingType, - tweetEmbeddingType: EmbeddingType, - modelVersion: ModelVersion) - - def getRawScoresMap( - topicId: TopicId, - tweetId: TweetId, - scoreKeys: Seq[ScoreKey], - uniformScoringStore: ReadableStore[ThriftScoreId, ThriftScore] - ): Future[Map[ScoreKey, Double]] = { - val scoresMapFut = scoreKeys.map { key => - val scoreInternalId = ScoreInternalId.SimClustersEmbeddingPairScoreId( - ThriftSimClustersEmbeddingPairScoreId( - buildTopicEmbedding(topicId, key.topicEmbeddingType, key.modelVersion), - SimClustersEmbeddingId( - key.tweetEmbeddingType, - key.modelVersion, - InternalId.TweetId(tweetId)) - )) - val scoreFut = uniformScoringStore - .get( - ThriftScoreId( - algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, // Hard code as cosine sim - internalId = scoreInternalId - )) - key -> scoreFut - }.toMap - - Future - .collect(scoresMapFut).map(_.collect { - case (key, Some(ThriftScore(score))) => - (key, score) - }) - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD deleted file mode 100644 index 1c617e9a0..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/github/ben-manes/caffeine", - "finatra/inject/inject-core/src/main/scala", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/common", - "representation-scorer/server/src/main/scala/com/twitter/representationscorer/scorestore", - "representation-scorer/server/src/main/thrift:thrift-scala", - "src/thrift/com/twitter/twistly:twistly-scala", - "stitch/stitch-core", - "stitch/stitch-core:cache", - "strato/config/columns/recommendations/twistly:twistly-strato-client", - "strato/config/columns/recommendations/user-signal-service:user-signal-service-strato-client", - "strato/src/main/scala/com/twitter/strato/client", - "user-signal-service/thrift/src/main/thrift:thrift-scala", - "util/util-core", - ], -) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx new file mode 100644 index 000000000..536e20f40 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/BUILD.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx new file mode 100644 index 000000000..c49d547d2 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala deleted file mode 100644 index 2da828ce6..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Engagements.scala +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.representationscorer.twistlyfeatures - -import com.twitter.conversions.DurationOps._ -import com.twitter.util.Duration -import com.twitter.util.Time - -case class Engagements( - favs7d: Seq[UserSignal] = Nil, - retweets7d: Seq[UserSignal] = Nil, - follows30d: Seq[UserSignal] = Nil, - shares7d: Seq[UserSignal] = Nil, - replies7d: Seq[UserSignal] = Nil, - originalTweets7d: Seq[UserSignal] = Nil, - videoPlaybacks7d: Seq[UserSignal] = Nil, - block30d: Seq[UserSignal] = Nil, - mute30d: Seq[UserSignal] = Nil, - report30d: Seq[UserSignal] = Nil, - dontlike30d: Seq[UserSignal] = Nil, - seeFewer30d: Seq[UserSignal] = Nil) { - - import Engagements._ - - private val now = Time.now - private val oneDayAgo = (now - OneDaySpan).inMillis - private val sevenDaysAgo = (now - SevenDaysSpan).inMillis - - // All ids from the signals grouped by type (tweetIds, userIds, etc) - val tweetIds: Seq[Long] = - (favs7d ++ retweets7d ++ shares7d - ++ replies7d ++ originalTweets7d ++ videoPlaybacks7d - ++ report30d ++ dontlike30d ++ seeFewer30d) - .map(_.targetId) - val authorIds: Seq[Long] = (follows30d ++ block30d ++ mute30d).map(_.targetId) - - // Tweet signals - val dontlike7d: Seq[UserSignal] = dontlike30d.filter(_.timestamp > sevenDaysAgo) - val seeFewer7d: Seq[UserSignal] = seeFewer30d.filter(_.timestamp > sevenDaysAgo) - - val favs1d: Seq[UserSignal] = favs7d.filter(_.timestamp > oneDayAgo) - val retweets1d: Seq[UserSignal] = retweets7d.filter(_.timestamp > oneDayAgo) - val shares1d: Seq[UserSignal] = shares7d.filter(_.timestamp > oneDayAgo) - val replies1d: Seq[UserSignal] = replies7d.filter(_.timestamp > oneDayAgo) - val originalTweets1d: Seq[UserSignal] = originalTweets7d.filter(_.timestamp > oneDayAgo) - val videoPlaybacks1d: Seq[UserSignal] = videoPlaybacks7d.filter(_.timestamp > oneDayAgo) - val dontlike1d: Seq[UserSignal] = dontlike7d.filter(_.timestamp > oneDayAgo) - val seeFewer1d: Seq[UserSignal] = seeFewer7d.filter(_.timestamp > oneDayAgo) - - // User signals - val follows7d: Seq[UserSignal] = follows30d.filter(_.timestamp > sevenDaysAgo) - val block7d: Seq[UserSignal] = block30d.filter(_.timestamp > sevenDaysAgo) - val mute7d: Seq[UserSignal] = mute30d.filter(_.timestamp > sevenDaysAgo) - val report7d: Seq[UserSignal] = report30d.filter(_.timestamp > sevenDaysAgo) - - val block1d: Seq[UserSignal] = block7d.filter(_.timestamp > oneDayAgo) - val mute1d: Seq[UserSignal] = mute7d.filter(_.timestamp > oneDayAgo) - val report1d: Seq[UserSignal] = report7d.filter(_.timestamp > oneDayAgo) -} - -object Engagements { - val OneDaySpan: Duration = 1.days - val SevenDaysSpan: Duration = 7.days - val ThirtyDaysSpan: Duration = 30.days -} - -case class UserSignal(targetId: Long, timestamp: Long) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx new file mode 100644 index 000000000..b43f972ee Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala deleted file mode 100644 index 71df34a19..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/ScoreResult.scala +++ /dev/null @@ -1,3 +0,0 @@ -package com.twitter.representationscorer.twistlyfeatures - -case class ScoreResult(id: Long, score: Option[Double]) diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx new file mode 100644 index 000000000..b82e4eb84 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala deleted file mode 100644 index 731412d0a..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/Scorer.scala +++ /dev/null @@ -1,474 +0,0 @@ -package com.twitter.representationscorer.twistlyfeatures - -import com.twitter.finagle.stats.Counter -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.representationscorer.common.TweetId -import com.twitter.representationscorer.common.UserId -import com.twitter.representationscorer.scorestore.ScoreStore -import com.twitter.representationscorer.thriftscala.SimClustersRecentEngagementSimilarities -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ScoreId -import com.twitter.simclusters_v2.thriftscala.ScoreInternalId -import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId -import com.twitter.stitch.Stitch -import javax.inject.Inject - -class Scorer @Inject() ( - fetchEngagementsFromUSS: Long => Stitch[Engagements], - scoreStore: ScoreStore, - stats: StatsReceiver) { - - import Scorer._ - - private val scoreStats = stats.scope("score") - private val scoreCalculationStats = scoreStats.scope("calculation") - private val scoreResultStats = scoreStats.scope("result") - - private val scoresNonEmptyCounter = scoreResultStats.scope("all").counter("nonEmpty") - private val scoresNonZeroCounter = scoreResultStats.scope("all").counter("nonZero") - - private val tweetScoreStats = scoreCalculationStats.scope("tweetScore").stat("latency") - private val userScoreStats = scoreCalculationStats.scope("userScore").stat("latency") - - private val favNonZero = scoreResultStats.scope("favs").counter("nonZero") - private val favNonEmpty = scoreResultStats.scope("favs").counter("nonEmpty") - - private val retweetsNonZero = scoreResultStats.scope("retweets").counter("nonZero") - private val retweetsNonEmpty = scoreResultStats.scope("retweets").counter("nonEmpty") - - private val followsNonZero = scoreResultStats.scope("follows").counter("nonZero") - private val followsNonEmpty = scoreResultStats.scope("follows").counter("nonEmpty") - - private val sharesNonZero = scoreResultStats.scope("shares").counter("nonZero") - private val sharesNonEmpty = scoreResultStats.scope("shares").counter("nonEmpty") - - private val repliesNonZero = scoreResultStats.scope("replies").counter("nonZero") - private val repliesNonEmpty = scoreResultStats.scope("replies").counter("nonEmpty") - - private val originalTweetsNonZero = scoreResultStats.scope("originalTweets").counter("nonZero") - private val originalTweetsNonEmpty = scoreResultStats.scope("originalTweets").counter("nonEmpty") - - private val videoViewsNonZero = scoreResultStats.scope("videoViews").counter("nonZero") - private val videoViewsNonEmpty = scoreResultStats.scope("videoViews").counter("nonEmpty") - - private val blockNonZero = scoreResultStats.scope("block").counter("nonZero") - private val blockNonEmpty = scoreResultStats.scope("block").counter("nonEmpty") - - private val muteNonZero = scoreResultStats.scope("mute").counter("nonZero") - private val muteNonEmpty = scoreResultStats.scope("mute").counter("nonEmpty") - - private val reportNonZero = scoreResultStats.scope("report").counter("nonZero") - private val reportNonEmpty = scoreResultStats.scope("report").counter("nonEmpty") - - private val dontlikeNonZero = scoreResultStats.scope("dontlike").counter("nonZero") - private val dontlikeNonEmpty = scoreResultStats.scope("dontlike").counter("nonEmpty") - - private val seeFewerNonZero = scoreResultStats.scope("seeFewer").counter("nonZero") - private val seeFewerNonEmpty = scoreResultStats.scope("seeFewer").counter("nonEmpty") - - private def getTweetScores( - candidateTweetId: TweetId, - sourceTweetIds: Seq[TweetId] - ): Stitch[Seq[ScoreResult]] = { - val getScoresStitch = Stitch.traverse(sourceTweetIds) { sourceTweetId => - scoreStore - .uniformScoringStoreStitch(getTweetScoreId(sourceTweetId, candidateTweetId)) - .liftNotFoundToOption - .map(score => ScoreResult(sourceTweetId, score.map(_.score))) - } - - Stitch.time(getScoresStitch).flatMap { - case (tryResult, duration) => - tweetScoreStats.add(duration.inMillis) - Stitch.const(tryResult) - } - } - - private def getUserScores( - tweetId: TweetId, - authorIds: Seq[UserId] - ): Stitch[Seq[ScoreResult]] = { - val getScoresStitch = Stitch.traverse(authorIds) { authorId => - scoreStore - .uniformScoringStoreStitch(getAuthorScoreId(authorId, tweetId)) - .liftNotFoundToOption - .map(score => ScoreResult(authorId, score.map(_.score))) - } - - Stitch.time(getScoresStitch).flatMap { - case (tryResult, duration) => - userScoreStats.add(duration.inMillis) - Stitch.const(tryResult) - } - } - - /** - * Get the [[SimClustersRecentEngagementSimilarities]] result containing the similarity - * features for the given userId-TweetId. - */ - def get( - userId: UserId, - tweetId: TweetId - ): Stitch[SimClustersRecentEngagementSimilarities] = { - get(userId, Seq(tweetId)).map(x => x.head) - } - - /** - * Get a list of [[SimClustersRecentEngagementSimilarities]] results containing the similarity - * features for the given tweets of the user Id. - * Guaranteed to be the same number/order as requested. - */ - def get( - userId: UserId, - tweetIds: Seq[TweetId] - ): Stitch[Seq[SimClustersRecentEngagementSimilarities]] = { - fetchEngagementsFromUSS(userId) - .flatMap(engagements => { - // For each tweet received in the request, compute the similarity scores between them - // and the user signals fetched from USS. - Stitch - .join( - Stitch.traverse(tweetIds)(id => getTweetScores(id, engagements.tweetIds)), - Stitch.traverse(tweetIds)(id => getUserScores(id, engagements.authorIds)), - ) - .map { - case (tweetScoresSeq, userScoreSeq) => - // All seq have = size because when scores don't exist, they are returned as Option - (tweetScoresSeq, userScoreSeq).zipped.map { (tweetScores, userScores) => - computeSimilarityScoresPerTweet( - engagements, - tweetScores.groupBy(_.id), - userScores.groupBy(_.id)) - } - } - }) - } - - /** - * - * Computes the [[SimClustersRecentEngagementSimilarities]] - * using the given tweet-tweet and user-tweet scores in TweetScoresMap - * and the user signals in [[Engagements]]. - */ - private def computeSimilarityScoresPerTweet( - engagements: Engagements, - tweetScores: Map[TweetId, Seq[ScoreResult]], - authorScores: Map[UserId, Seq[ScoreResult]] - ): SimClustersRecentEngagementSimilarities = { - val favs7d = engagements.favs7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val favs1d = engagements.favs1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val retweets7d = engagements.retweets7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val retweets1d = engagements.retweets1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val follows30d = engagements.follows30d.view - .flatMap(s => authorScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val follows7d = engagements.follows7d.view - .flatMap(s => authorScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val shares7d = engagements.shares7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val shares1d = engagements.shares1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val replies7d = engagements.replies7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val replies1d = engagements.replies1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val originalTweets7d = engagements.originalTweets7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val originalTweets1d = engagements.originalTweets1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val videoViews7d = engagements.videoPlaybacks7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val videoViews1d = engagements.videoPlaybacks1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val block30d = engagements.block30d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val block7d = engagements.block7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val block1d = engagements.block1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val mute30d = engagements.mute30d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val mute7d = engagements.mute7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val mute1d = engagements.mute1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val report30d = engagements.report30d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val report7d = engagements.report7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val report1d = engagements.report1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val dontlike30d = engagements.dontlike30d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val dontlike7d = engagements.dontlike7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val dontlike1d = engagements.dontlike1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val seeFewer30d = engagements.seeFewer30d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val seeFewer7d = engagements.seeFewer7d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val seeFewer1d = engagements.seeFewer1d.view - .flatMap(s => tweetScores.get(s.targetId)) - .flatten.flatMap(_.score) - .force - - val result = SimClustersRecentEngagementSimilarities( - fav1dLast10Max = max(favs1d), - fav1dLast10Avg = avg(favs1d), - fav7dLast10Max = max(favs7d), - fav7dLast10Avg = avg(favs7d), - retweet1dLast10Max = max(retweets1d), - retweet1dLast10Avg = avg(retweets1d), - retweet7dLast10Max = max(retweets7d), - retweet7dLast10Avg = avg(retweets7d), - follow7dLast10Max = max(follows7d), - follow7dLast10Avg = avg(follows7d), - follow30dLast10Max = max(follows30d), - follow30dLast10Avg = avg(follows30d), - share1dLast10Max = max(shares1d), - share1dLast10Avg = avg(shares1d), - share7dLast10Max = max(shares7d), - share7dLast10Avg = avg(shares7d), - reply1dLast10Max = max(replies1d), - reply1dLast10Avg = avg(replies1d), - reply7dLast10Max = max(replies7d), - reply7dLast10Avg = avg(replies7d), - originalTweet1dLast10Max = max(originalTweets1d), - originalTweet1dLast10Avg = avg(originalTweets1d), - originalTweet7dLast10Max = max(originalTweets7d), - originalTweet7dLast10Avg = avg(originalTweets7d), - videoPlayback1dLast10Max = max(videoViews1d), - videoPlayback1dLast10Avg = avg(videoViews1d), - videoPlayback7dLast10Max = max(videoViews7d), - videoPlayback7dLast10Avg = avg(videoViews7d), - block1dLast10Max = max(block1d), - block1dLast10Avg = avg(block1d), - block7dLast10Max = max(block7d), - block7dLast10Avg = avg(block7d), - block30dLast10Max = max(block30d), - block30dLast10Avg = avg(block30d), - mute1dLast10Max = max(mute1d), - mute1dLast10Avg = avg(mute1d), - mute7dLast10Max = max(mute7d), - mute7dLast10Avg = avg(mute7d), - mute30dLast10Max = max(mute30d), - mute30dLast10Avg = avg(mute30d), - report1dLast10Max = max(report1d), - report1dLast10Avg = avg(report1d), - report7dLast10Max = max(report7d), - report7dLast10Avg = avg(report7d), - report30dLast10Max = max(report30d), - report30dLast10Avg = avg(report30d), - dontlike1dLast10Max = max(dontlike1d), - dontlike1dLast10Avg = avg(dontlike1d), - dontlike7dLast10Max = max(dontlike7d), - dontlike7dLast10Avg = avg(dontlike7d), - dontlike30dLast10Max = max(dontlike30d), - dontlike30dLast10Avg = avg(dontlike30d), - seeFewer1dLast10Max = max(seeFewer1d), - seeFewer1dLast10Avg = avg(seeFewer1d), - seeFewer7dLast10Max = max(seeFewer7d), - seeFewer7dLast10Avg = avg(seeFewer7d), - seeFewer30dLast10Max = max(seeFewer30d), - seeFewer30dLast10Avg = avg(seeFewer30d), - ) - trackStats(result) - result - } - - private def trackStats(result: SimClustersRecentEngagementSimilarities): Unit = { - val scores = Seq( - result.fav7dLast10Max, - result.retweet7dLast10Max, - result.follow30dLast10Max, - result.share1dLast10Max, - result.share7dLast10Max, - result.reply7dLast10Max, - result.originalTweet7dLast10Max, - result.videoPlayback7dLast10Max, - result.block30dLast10Max, - result.mute30dLast10Max, - result.report30dLast10Max, - result.dontlike30dLast10Max, - result.seeFewer30dLast10Max - ) - - val nonEmpty = scores.exists(_.isDefined) - val nonZero = scores.exists { case Some(score) if score > 0 => true; case _ => false } - - if (nonEmpty) { - scoresNonEmptyCounter.incr() - } - - if (nonZero) { - scoresNonZeroCounter.incr() - } - - // We use the largest window of a given type of score, - // because the largest window is inclusive of smaller windows. - trackSignalStats(favNonEmpty, favNonZero, result.fav7dLast10Avg) - trackSignalStats(retweetsNonEmpty, retweetsNonZero, result.retweet7dLast10Avg) - trackSignalStats(followsNonEmpty, followsNonZero, result.follow30dLast10Avg) - trackSignalStats(sharesNonEmpty, sharesNonZero, result.share7dLast10Avg) - trackSignalStats(repliesNonEmpty, repliesNonZero, result.reply7dLast10Avg) - trackSignalStats(originalTweetsNonEmpty, originalTweetsNonZero, result.originalTweet7dLast10Avg) - trackSignalStats(videoViewsNonEmpty, videoViewsNonZero, result.videoPlayback7dLast10Avg) - trackSignalStats(blockNonEmpty, blockNonZero, result.block30dLast10Avg) - trackSignalStats(muteNonEmpty, muteNonZero, result.mute30dLast10Avg) - trackSignalStats(reportNonEmpty, reportNonZero, result.report30dLast10Avg) - trackSignalStats(dontlikeNonEmpty, dontlikeNonZero, result.dontlike30dLast10Avg) - trackSignalStats(seeFewerNonEmpty, seeFewerNonZero, result.seeFewer30dLast10Avg) - } - - private def trackSignalStats(nonEmpty: Counter, nonZero: Counter, score: Option[Double]): Unit = { - if (score.nonEmpty) { - nonEmpty.incr() - - if (score.get > 0) - nonZero.incr() - } - } -} - -object Scorer { - def avg(s: Traversable[Double]): Option[Double] = - if (s.isEmpty) None else Some(s.sum / s.size) - def max(s: Traversable[Double]): Option[Double] = - if (s.isEmpty) None else Some(s.foldLeft(0.0D) { (curr, _max) => math.max(curr, _max) }) - - private def getAuthorScoreId( - userId: UserId, - tweetId: TweetId - ) = { - ScoreId( - algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingId( - internalId = InternalId.UserId(userId), - modelVersion = ModelVersion.Model20m145k2020, - embeddingType = EmbeddingType.FavBasedProducer - ), - SimClustersEmbeddingId( - internalId = InternalId.TweetId(tweetId), - modelVersion = ModelVersion.Model20m145k2020, - embeddingType = EmbeddingType.LogFavBasedTweet - ) - )) - ) - } - - private def getTweetScoreId( - sourceTweetId: TweetId, - candidateTweetId: TweetId - ) = { - ScoreId( - algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity, - internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingPairScoreId( - SimClustersEmbeddingId( - internalId = InternalId.TweetId(sourceTweetId), - modelVersion = ModelVersion.Model20m145k2020, - embeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet - ), - SimClustersEmbeddingId( - internalId = InternalId.TweetId(candidateTweetId), - modelVersion = ModelVersion.Model20m145k2020, - embeddingType = EmbeddingType.LogFavBasedTweet - ) - )) - ) - } -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx new file mode 100644 index 000000000..74f054937 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala deleted file mode 100644 index fb09c1e57..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClient.scala +++ /dev/null @@ -1,155 +0,0 @@ -package com.twitter.representationscorer.twistlyfeatures - -import com.twitter.decider.SimpleRecipient -import com.twitter.finagle.stats.Stat -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.representationscorer.common._ -import com.twitter.representationscorer.twistlyfeatures.Engagements._ -import com.twitter.simclusters_v2.common.SimClustersEmbeddingId.LongInternalId -import com.twitter.stitch.Stitch -import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn -import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn.Value -import com.twitter.usersignalservice.thriftscala.BatchSignalRequest -import com.twitter.usersignalservice.thriftscala.SignalRequest -import com.twitter.usersignalservice.thriftscala.SignalType -import com.twitter.util.Time -import scala.collection.mutable.ArrayBuffer -import com.twitter.usersignalservice.thriftscala.ClientIdentifier - -class UserSignalServiceRecentEngagementsClient( - stratoClient: SignalsClientColumn, - decider: RepresentationScorerDecider, - stats: StatsReceiver) { - - import UserSignalServiceRecentEngagementsClient._ - - private val signalStats = stats.scope("user-signal-service", "signal") - private val signalTypeStats: Map[SignalType, Stat] = - SignalType.list.map(s => (s, signalStats.scope(s.name).stat("size"))).toMap - - def get(userId: UserId): Stitch[Engagements] = { - val request = buildRequest(userId) - stratoClient.fetcher.fetch(request).map(_.v).lowerFromOption().map { response => - val now = Time.now - val sevenDaysAgo = now - SevenDaysSpan - val thirtyDaysAgo = now - ThirtyDaysSpan - - Engagements( - favs7d = getUserSignals(response, SignalType.TweetFavorite, sevenDaysAgo), - retweets7d = getUserSignals(response, SignalType.Retweet, sevenDaysAgo), - follows30d = getUserSignals(response, SignalType.AccountFollowWithDelay, thirtyDaysAgo), - shares7d = getUserSignals(response, SignalType.TweetShareV1, sevenDaysAgo), - replies7d = getUserSignals(response, SignalType.Reply, sevenDaysAgo), - originalTweets7d = getUserSignals(response, SignalType.OriginalTweet, sevenDaysAgo), - videoPlaybacks7d = - getUserSignals(response, SignalType.VideoView90dPlayback50V1, sevenDaysAgo), - block30d = getUserSignals(response, SignalType.AccountBlock, thirtyDaysAgo), - mute30d = getUserSignals(response, SignalType.AccountMute, thirtyDaysAgo), - report30d = getUserSignals(response, SignalType.TweetReport, thirtyDaysAgo), - dontlike30d = getUserSignals(response, SignalType.TweetDontLike, thirtyDaysAgo), - seeFewer30d = getUserSignals(response, SignalType.TweetSeeFewer, thirtyDaysAgo), - ) - } - } - - private def getUserSignals( - response: Value, - signalType: SignalType, - earliestValidTimestamp: Time - ): Seq[UserSignal] = { - val signals = response.signalResponse - .getOrElse(signalType, Seq.empty) - .view - .filter(_.timestamp > earliestValidTimestamp.inMillis) - .map(s => s.targetInternalId.collect { case LongInternalId(id) => (id, s.timestamp) }) - .collect { case Some((id, engagedAt)) => UserSignal(id, engagedAt) } - .take(EngagementsToScore) - .force - - signalTypeStats(signalType).add(signals.size) - signals - } - - private def buildRequest(userId: Long) = { - val recipient = Some(SimpleRecipient(userId)) - - // Signals RSX always fetches - val requestSignals = ArrayBuffer( - SignalRequestFav, - SignalRequestRetweet, - SignalRequestFollow - ) - - // Signals under experimentation. We use individual deciders to disable them if necessary. - // If experiments are successful, they will become permanent. - if (decider.isAvailable(FetchSignalShareDeciderKey, recipient)) - requestSignals.append(SignalRequestShare) - - if (decider.isAvailable(FetchSignalReplyDeciderKey, recipient)) - requestSignals.append(SignalRequestReply) - - if (decider.isAvailable(FetchSignalOriginalTweetDeciderKey, recipient)) - requestSignals.append(SignalRequestOriginalTweet) - - if (decider.isAvailable(FetchSignalVideoPlaybackDeciderKey, recipient)) - requestSignals.append(SignalRequestVideoPlayback) - - if (decider.isAvailable(FetchSignalBlockDeciderKey, recipient)) - requestSignals.append(SignalRequestBlock) - - if (decider.isAvailable(FetchSignalMuteDeciderKey, recipient)) - requestSignals.append(SignalRequestMute) - - if (decider.isAvailable(FetchSignalReportDeciderKey, recipient)) - requestSignals.append(SignalRequestReport) - - if (decider.isAvailable(FetchSignalDontlikeDeciderKey, recipient)) - requestSignals.append(SignalRequestDontlike) - - if (decider.isAvailable(FetchSignalSeeFewerDeciderKey, recipient)) - requestSignals.append(SignalRequestSeeFewer) - - BatchSignalRequest(userId, requestSignals, Some(ClientIdentifier.RepresentationScorerHome)) - } -} - -object UserSignalServiceRecentEngagementsClient { - val FetchSignalShareDeciderKey = "representation_scorer_fetch_signal_share" - val FetchSignalReplyDeciderKey = "representation_scorer_fetch_signal_reply" - val FetchSignalOriginalTweetDeciderKey = "representation_scorer_fetch_signal_original_tweet" - val FetchSignalVideoPlaybackDeciderKey = "representation_scorer_fetch_signal_video_playback" - val FetchSignalBlockDeciderKey = "representation_scorer_fetch_signal_block" - val FetchSignalMuteDeciderKey = "representation_scorer_fetch_signal_mute" - val FetchSignalReportDeciderKey = "representation_scorer_fetch_signal_report" - val FetchSignalDontlikeDeciderKey = "representation_scorer_fetch_signal_dont_like" - val FetchSignalSeeFewerDeciderKey = "representation_scorer_fetch_signal_see_fewer" - - val EngagementsToScore = 10 - private val engagementsToScoreOpt: Option[Long] = Some(EngagementsToScore) - - val SignalRequestFav: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.TweetFavorite) - val SignalRequestRetweet: SignalRequest = SignalRequest(engagementsToScoreOpt, SignalType.Retweet) - val SignalRequestFollow: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.AccountFollowWithDelay) - // New experimental signals - val SignalRequestShare: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.TweetShareV1) - val SignalRequestReply: SignalRequest = SignalRequest(engagementsToScoreOpt, SignalType.Reply) - val SignalRequestOriginalTweet: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.OriginalTweet) - val SignalRequestVideoPlayback: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.VideoView90dPlayback50V1) - - // Negative signals - val SignalRequestBlock: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.AccountBlock) - val SignalRequestMute: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.AccountMute) - val SignalRequestReport: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.TweetReport) - val SignalRequestDontlike: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.TweetDontLike) - val SignalRequestSeeFewer: SignalRequest = - SignalRequest(engagementsToScoreOpt, SignalType.TweetSeeFewer) -} diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx new file mode 100644 index 000000000..8b8376276 Binary files /dev/null and b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.docx differ diff --git a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala b/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala deleted file mode 100644 index ee9f61df4..000000000 --- a/representation-scorer/server/src/main/scala/com/twitter/representationscorer/twistlyfeatures/UserSignalServiceRecentEngagementsClientModule.scala +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.representationscorer.twistlyfeatures - -import com.github.benmanes.caffeine.cache.Caffeine -import com.twitter.stitch.cache.EvictingCache -import com.google.inject.Provides -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.inject.TwitterModule -import com.twitter.representationscorer.common.RepresentationScorerDecider -import com.twitter.stitch.Stitch -import com.twitter.stitch.cache.ConcurrentMapCache -import com.twitter.stitch.cache.MemoizeQuery -import com.twitter.strato.client.Client -import com.twitter.strato.generated.client.recommendations.user_signal_service.SignalsClientColumn -import java.util.concurrent.ConcurrentMap -import java.util.concurrent.TimeUnit -import javax.inject.Singleton - -object UserSignalServiceRecentEngagementsClientModule extends TwitterModule { - - @Singleton - @Provides - def provide( - client: Client, - decider: RepresentationScorerDecider, - statsReceiver: StatsReceiver - ): Long => Stitch[Engagements] = { - val stratoClient = new SignalsClientColumn(client) - - /* - This cache holds a users recent engagements for a short period of time, such that batched requests - for multiple (userid, tweetid) pairs don't all need to fetch them. - - [1] Caffeine cache keys/values must be objects, so we cannot use the `Long` primitive directly. - The boxed java.lang.Long works as a key, since it is an object. In most situations the compiler - can see where auto(un)boxing can occur. However, here we seem to need some wrapper functions - with explicit types to allow the boxing to happen. - */ - val mapCache: ConcurrentMap[java.lang.Long, Stitch[Engagements]] = - Caffeine - .newBuilder() - .expireAfterWrite(5, TimeUnit.SECONDS) - .maximumSize( - 1000 // We estimate 5M unique users in a 5m period - with 2k RSX instances, assume that one will see < 1k in a 5s period - ) - .build[java.lang.Long, Stitch[Engagements]] - .asMap - - statsReceiver.provideGauge("ussRecentEngagementsClient", "cache_size") { mapCache.size.toFloat } - - val engagementsClient = - new UserSignalServiceRecentEngagementsClient(stratoClient, decider, statsReceiver) - - val f = (l: java.lang.Long) => engagementsClient.get(l) // See note [1] above - val cachedCall = MemoizeQuery(f, EvictingCache.lazily(new ConcurrentMapCache(mapCache))) - (l: Long) => cachedCall(l) // see note [1] above - } -} diff --git a/representation-scorer/server/src/main/thrift/BUILD b/representation-scorer/server/src/main/thrift/BUILD deleted file mode 100644 index f7ea37675..000000000 --- a/representation-scorer/server/src/main/thrift/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -create_thrift_libraries( - base_name = "thrift", - sources = [ - "com/twitter/representationscorer/service.thrift", - ], - platform = "java8", - tags = [ - "bazel-compatible", - ], - dependency_roots = [ - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift", - ], - generate_languages = [ - "java", - "scala", - "strato", - ], - provides_java_name = "representationscorer-service-thrift-java", - provides_scala_name = "representationscorer-service-thrift-scala", -) diff --git a/representation-scorer/server/src/main/thrift/BUILD.docx b/representation-scorer/server/src/main/thrift/BUILD.docx new file mode 100644 index 000000000..87fb52cdd Binary files /dev/null and b/representation-scorer/server/src/main/thrift/BUILD.docx differ diff --git a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx new file mode 100644 index 000000000..6b8d943a3 Binary files /dev/null and b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.docx differ diff --git a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift b/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift deleted file mode 100644 index 0e2f23a31..000000000 --- a/representation-scorer/server/src/main/thrift/com/twitter/representationscorer/service.thrift +++ /dev/null @@ -1,106 +0,0 @@ -namespace java com.twitter.representationscorer.thriftjava -#@namespace scala com.twitter.representationscorer.thriftscala -#@namespace strato com.twitter.representationscorer - -include "com/twitter/simclusters_v2/identifier.thrift" -include "com/twitter/simclusters_v2/online_store.thrift" -include "com/twitter/simclusters_v2/score.thrift" - -struct SimClustersRecentEngagementSimilarities { - // All scores computed using cosine similarity - // 1 - 1000 Positive Signals - 1: optional double fav1dLast10Max // max score from last 10 faves in the last 1 day - 2: optional double fav1dLast10Avg // avg score from last 10 faves in the last 1 day - 3: optional double fav7dLast10Max // max score from last 10 faves in the last 7 days - 4: optional double fav7dLast10Avg // avg score from last 10 faves in the last 7 days - 5: optional double retweet1dLast10Max // max score from last 10 retweets in the last 1 days - 6: optional double retweet1dLast10Avg // avg score from last 10 retweets in the last 1 days - 7: optional double retweet7dLast10Max // max score from last 10 retweets in the last 7 days - 8: optional double retweet7dLast10Avg // avg score from last 10 retweets in the last 7 days - 9: optional double follow7dLast10Max // max score from the last 10 follows in the last 7 days - 10: optional double follow7dLast10Avg // avg score from the last 10 follows in the last 7 days - 11: optional double follow30dLast10Max // max score from the last 10 follows in the last 30 days - 12: optional double follow30dLast10Avg // avg score from the last 10 follows in the last 30 days - 13: optional double share1dLast10Max // max score from last 10 shares in the last 1 day - 14: optional double share1dLast10Avg // avg score from last 10 shares in the last 1 day - 15: optional double share7dLast10Max // max score from last 10 shares in the last 7 days - 16: optional double share7dLast10Avg // avg score from last 10 shares in the last 7 days - 17: optional double reply1dLast10Max // max score from last 10 replies in the last 1 day - 18: optional double reply1dLast10Avg // avg score from last 10 replies in the last 1 day - 19: optional double reply7dLast10Max // max score from last 10 replies in the last 7 days - 20: optional double reply7dLast10Avg // avg score from last 10 replies in the last 7 days - 21: optional double originalTweet1dLast10Max // max score from last 10 original tweets in the last 1 day - 22: optional double originalTweet1dLast10Avg // avg score from last 10 original tweets in the last 1 day - 23: optional double originalTweet7dLast10Max // max score from last 10 original tweets in the last 7 days - 24: optional double originalTweet7dLast10Avg // avg score from last 10 original tweets in the last 7 days - 25: optional double videoPlayback1dLast10Max // max score from last 10 video playback50 in the last 1 day - 26: optional double videoPlayback1dLast10Avg // avg score from last 10 video playback50 in the last 1 day - 27: optional double videoPlayback7dLast10Max // max score from last 10 video playback50 in the last 7 days - 28: optional double videoPlayback7dLast10Avg // avg score from last 10 video playback50 in the last 7 days - - // 1001 - 2000 Implicit Signals - - // 2001 - 3000 Negative Signals - // Block Series - 2001: optional double block1dLast10Avg - 2002: optional double block1dLast10Max - 2003: optional double block7dLast10Avg - 2004: optional double block7dLast10Max - 2005: optional double block30dLast10Avg - 2006: optional double block30dLast10Max - // Mute Series - 2101: optional double mute1dLast10Avg - 2102: optional double mute1dLast10Max - 2103: optional double mute7dLast10Avg - 2104: optional double mute7dLast10Max - 2105: optional double mute30dLast10Avg - 2106: optional double mute30dLast10Max - // Report Series - 2201: optional double report1dLast10Avg - 2202: optional double report1dLast10Max - 2203: optional double report7dLast10Avg - 2204: optional double report7dLast10Max - 2205: optional double report30dLast10Avg - 2206: optional double report30dLast10Max - // Dontlike - 2301: optional double dontlike1dLast10Avg - 2302: optional double dontlike1dLast10Max - 2303: optional double dontlike7dLast10Avg - 2304: optional double dontlike7dLast10Max - 2305: optional double dontlike30dLast10Avg - 2306: optional double dontlike30dLast10Max - // SeeFewer - 2401: optional double seeFewer1dLast10Avg - 2402: optional double seeFewer1dLast10Max - 2403: optional double seeFewer7dLast10Avg - 2404: optional double seeFewer7dLast10Max - 2405: optional double seeFewer30dLast10Avg - 2406: optional double seeFewer30dLast10Max -}(persisted='true', hasPersonalData = 'true') - -/* - * List score API - */ -struct ListScoreId { - 1: required score.ScoringAlgorithm algorithm - 2: required online_store.ModelVersion modelVersion - 3: required identifier.EmbeddingType targetEmbeddingType - 4: required identifier.InternalId targetId - 5: required identifier.EmbeddingType candidateEmbeddingType - 6: required list candidateIds -}(hasPersonalData = 'true') - -struct ScoreResult { - // This api does not communicate why a score is missing. For example, it may be unavailable - // because the referenced entities do not exist (e.g. the embedding was not found) or because - // timeouts prevented us from calculating it. - 1: optional double score -} - -struct ListScoreResponse { - 1: required list scores // Guaranteed to be the same number/order as requested -} - -struct RecentEngagementSimilaritiesResponse { - 1: required list results // Guaranteed to be the same number/order as requested -} diff --git a/science/search/ingester/config/README.docx b/science/search/ingester/config/README.docx new file mode 100644 index 000000000..d230edd25 Binary files /dev/null and b/science/search/ingester/config/README.docx differ diff --git a/science/search/ingester/config/README.md b/science/search/ingester/config/README.md deleted file mode 100644 index 34f69d6e6..000000000 --- a/science/search/ingester/config/README.md +++ /dev/null @@ -1,2 +0,0 @@ -## Ingester Configs -This directory contains pipeline configurations for the tweet ingesters (realtime, protected and realtime_cg) and the user-updates ingester. The pipeline configurations define an ordered sequence of stages that the tweet or user update goes through before reaching Earlybird. Source code for the various stages referenced in the configs can be found at src/java/com/twitter/search/ingester/pipeline/twitter. \ No newline at end of file diff --git a/science/search/ingester/config/pipeline-indexer.userupdates.docx b/science/search/ingester/config/pipeline-indexer.userupdates.docx new file mode 100644 index 000000000..fe11d55a5 Binary files /dev/null and b/science/search/ingester/config/pipeline-indexer.userupdates.docx differ diff --git a/science/search/ingester/config/pipeline-indexer.userupdates.xml b/science/search/ingester/config/pipeline-indexer.userupdates.xml deleted file mode 100644 index f422b511d..000000000 --- a/science/search/ingester/config/pipeline-indexer.userupdates.xml +++ /dev/null @@ -1,30 +0,0 @@ - - - - - - - - - - - - - - - diff --git a/science/search/ingester/config/pipeline-ingester.protected.docx b/science/search/ingester/config/pipeline-ingester.protected.docx new file mode 100644 index 000000000..7ff79d6fc Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.protected.docx differ diff --git a/science/search/ingester/config/pipeline-ingester.protected.xml b/science/search/ingester/config/pipeline-ingester.protected.xml deleted file mode 100644 index 434a621c2..000000000 --- a/science/search/ingester/config/pipeline-ingester.protected.xml +++ /dev/null @@ -1,202 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/science/search/ingester/config/pipeline-ingester.realtime.docx b/science/search/ingester/config/pipeline-ingester.realtime.docx new file mode 100644 index 000000000..57381fe03 Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.realtime.docx differ diff --git a/science/search/ingester/config/pipeline-ingester.realtime.xml b/science/search/ingester/config/pipeline-ingester.realtime.xml deleted file mode 100644 index 65700bed2..000000000 --- a/science/search/ingester/config/pipeline-ingester.realtime.xml +++ /dev/null @@ -1,240 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/science/search/ingester/config/pipeline-ingester.realtime_cg.docx b/science/search/ingester/config/pipeline-ingester.realtime_cg.docx new file mode 100644 index 000000000..b9dc5feb3 Binary files /dev/null and b/science/search/ingester/config/pipeline-ingester.realtime_cg.docx differ diff --git a/science/search/ingester/config/pipeline-ingester.realtime_cg.xml b/science/search/ingester/config/pipeline-ingester.realtime_cg.xml deleted file mode 100644 index 617af252e..000000000 --- a/science/search/ingester/config/pipeline-ingester.realtime_cg.xml +++ /dev/null @@ -1,199 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/simclusters-ann/BUILD.bazel b/simclusters-ann/BUILD.bazel deleted file mode 100644 index 1624a57d4..000000000 --- a/simclusters-ann/BUILD.bazel +++ /dev/null @@ -1 +0,0 @@ -# This prevents SQ query from grabbing //:all since it traverses up once to find a BUILD diff --git a/simclusters-ann/BUILD.docx b/simclusters-ann/BUILD.docx new file mode 100644 index 000000000..99307005e Binary files /dev/null and b/simclusters-ann/BUILD.docx differ diff --git a/simclusters-ann/README.docx b/simclusters-ann/README.docx new file mode 100644 index 000000000..407662cc7 Binary files /dev/null and b/simclusters-ann/README.docx differ diff --git a/simclusters-ann/README.md b/simclusters-ann/README.md deleted file mode 100644 index 69ff6cffa..000000000 --- a/simclusters-ann/README.md +++ /dev/null @@ -1,99 +0,0 @@ -# SimClusters ANN - -SimClusters ANN is a service that returns tweet candidate recommendations given a SimClusters embedding. The service implements tweet recommendations based on the Approximate Cosine Similarity algorithm. - -The cosine similarity between two Tweet SimClusters Embedding represents the relevance level of two tweets in SimCluster space. The traditional algorithm for calculating cosine similarity is expensive and hard to support by the existing infrastructure. Therefore, the Approximate Cosine Similarity algorithm is introduced to save response time by reducing I/O operations. - -## Background -SimClusters V2 runtime infra introduces the SimClusters and its online and offline approaches. A heron job builds the mapping between SimClusters and Tweets. The job saves top 400 Tweets for a SimClusters and top 100 SimClusters for a Tweet. Favorite score and follow score are two types of tweet score. In the document, the top 100 SimClusters based on the favorite score for a Tweet stands for the Tweet SimClusters Embedding. - -The cosine similarity between two Tweet SimClusters Embedding presents the relevant level of two tweets in SimCluster space. The score varies from 0 to 1. The high cosine similarity score(>= 0.7 in Prod) means that the users who like two tweets share the same SimClusters. - - -SimClusters from the Linear Algebra Perspective discussed the difference between the dot-product and cosine similarity in SimCluster space. We believe the cosine similarity approach is better because it avoids the bias of tweet popularity. - - However, calculating the cosine similarity between two Tweets is pretty expensive in Tweet candidate generation. In TWISTLY, we scan at most 15,000 (6 source tweets * 25 clusters * 100 tweets per clusters) tweet candidates for every Home Timeline request. The traditional algorithm needs to make API calls to fetch 15,000 tweet SimCluster embeddings. Consider that we need to process over 6,000 RPS, it’s hard to support by the existing infrastructure. - - -## SimClusters Approximate Cosine Similarity Core Algorithm - -1. Provide a source SimCluster Embedding *SV*, *SV = [(SC1, Score), (SC2, Score), (SC3, Score) …]* - -2. Fetch top *M* tweets for each Top *N* SimClusters based on SV. In Prod, *M = 400*, *N = 50*. Tweets may appear in multiple SimClusters. - -| | | | | -|---|---|---|---| -| SC1 | T1:Score | T2: Score | ... | -| SC2 | T3: Score | T4: Score | ... | - - -3. Based on the previous table, generate an *(M x N) x N* Matrix *R*. The *R* represents the approximate SimCluster embeddings for *MxN* tweets. The embedding only contains top *N* SimClusters from *SV*. Only top *M* tweets from each SimCluster have the score. Others are 0. - -| | SC1 | SC2 | ... | -|---|---|---|---| -| T1 | Score | 0 | ... | -| T2 | Score | 0 | ... | -| T3 | 0 | Score | ... | - -4. Compute the dot product between source vector and the approximate vectors for each tweet. (Calculate *R • SV^T*). Take top *X* tweets. In Prod, *X = 200* - -5. Fetch *X* tweet SimClusters Embedding, Calculate Cosine Similarity between *X* tweets and *SV*, Return top *Y* above a certain threshold *Z*. - -Approximate Cosine Similarity is an approximate algorithm. Instead of fetching *M * N* tweets embedding, it only fetches *X* tweets embedding. In prod, *X / M * N * 100% = 6%*. Based on the metrics during TWISTLY development, most of the response time is consumed by I/O operation. The Approximate Cosine Similarity is a good approach to save a large amount of response time. - -The idea of the approximate algorithm is based on the assumption that the higher dot-product between source tweets’ SimCluster embedding and candidate tweet’s limited SimCluster Embedding, the possibility that these two tweets are relevant is higher. Additional Cosine Similarity filter is to guarantee that the results are not affected by popularity bias. - -Adjusting the M, N, X, Y, Z is able to balance the precision and recall for different products. The implementation of approximate cosine similarity is used by TWISTLY, Interest-based tweet recommendation, Similar Tweet in RUX, and Author based recommendation. This algorithm is also suitable for future user or entity recommendation based on SimClusters Embedding. - - -# ------------------------------- -# Build and Test -# ------------------------------- -Compile the service - - $ ./bazel build simclusters-ann/server:bin - -Unit tests - - $ ./bazel test simclusters-ann/server:bin - -# ------------------------------- -# Deploy -# ------------------------------- - -## Prerequisite for devel deployments -First of all, you need to generate Service to Service certificates for use while developing locally. This only needs to be done ONCE: - -To add cert files to Aurora (if you want to deploy to DEVEL): -``` -$ developer-cert-util --env devel --job simclusters-ann -``` - -## Deploying to devel/staging from a local build -Reference - - - $ ./simclusters-ann/bin/deploy.sh --help - -Use the script to build the service in your local branch, upload it to packer and deploy in devel aurora: - - $ ./simclusters-ann/bin/deploy.sh atla $USER devel simclusters-ann - -You can also deploy to staging with this script. E.g. to deploy to instance 1: - - $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann staging simclusters-ann - -## Deploying to production - -Production deploys should be managed by Workflows. -_Do not_ deploy to production unless it is an emergency and you have approval from oncall. - -##### It is not recommended to deploy from Command Lines into production environments, unless 1) you're testing a small change in Canary shard [0,9]. 2) Tt is an absolute emergency. Be sure to make oncalls aware of the changes you're deploying. - - $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann prod simclusters-ann -In the case of multiple instances, - - $ ./simclusters-ann/bin/deploy.sh atla simclusters-ann prod simclusters-ann - - -## Checking Deployed Version and Rolling Back - -Wherever possible, roll back using Workflows by finding an earlier good version and clicking the "rollback" button in the UI. This is the safest and least error-prone method. diff --git a/simclusters-ann/server/BUILD b/simclusters-ann/server/BUILD deleted file mode 100644 index 9a62359c3..000000000 --- a/simclusters-ann/server/BUILD +++ /dev/null @@ -1,23 +0,0 @@ -jvm_binary( - name = "bin", - basename = "simclusters-ann", - main = "com.twitter.simclustersann.SimClustersAnnServerMain", - runtime_platform = "java11", - tags = ["bazel-compatible"], - dependencies = [ - "finagle/finagle-zipkin-scribe/src/main/scala", - "finatra/inject/inject-logback/src/main/scala", - "loglens/loglens-logback/src/main/scala/com/twitter/loglens/logback", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann", - "twitter-server-internal/src/main/scala", - "twitter-server/logback-classic/src/main/scala", - ], -) - -# Aurora Workflows build phase convention requires a jvm_app named with ${project-name}-app -jvm_app( - name = "simclusters-ann-app", - archive = "zip", - binary = ":bin", - tags = ["bazel-compatible"], -) diff --git a/simclusters-ann/server/BUILD.docx b/simclusters-ann/server/BUILD.docx new file mode 100644 index 000000000..32d1c77d1 Binary files /dev/null and b/simclusters-ann/server/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/resources/BUILD b/simclusters-ann/server/src/main/resources/BUILD deleted file mode 100644 index b3a752276..000000000 --- a/simclusters-ann/server/src/main/resources/BUILD +++ /dev/null @@ -1,7 +0,0 @@ -resources( - sources = [ - "*.xml", - "config/*.yml", - ], - tags = ["bazel-compatible"], -) diff --git a/simclusters-ann/server/src/main/resources/BUILD.docx b/simclusters-ann/server/src/main/resources/BUILD.docx new file mode 100644 index 000000000..5bff15e03 Binary files /dev/null and b/simclusters-ann/server/src/main/resources/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/resources/config/decider.docx b/simclusters-ann/server/src/main/resources/config/decider.docx new file mode 100644 index 000000000..5d9d7f8c6 Binary files /dev/null and b/simclusters-ann/server/src/main/resources/config/decider.docx differ diff --git a/simclusters-ann/server/src/main/resources/config/decider.yml b/simclusters-ann/server/src/main/resources/config/decider.yml deleted file mode 100644 index 80469028a..000000000 --- a/simclusters-ann/server/src/main/resources/config/decider.yml +++ /dev/null @@ -1,95 +0,0 @@ -# SimClusters embedding store enable / disable decider values - -# ---------- Dark Traffic Proxy ---------- -dark_traffic_filter: - comment: Proportion of the requests that are forwarded as dark traffic to the proxy - default_availability: 0 - -# Tweet embeddings -enable_LogFavBasedTweet_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavLongestL2EmbeddingTweet_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -# Entity embeddings -enable_FavTfgTopic_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - - -enable_LogFavBasedKgoApeTopic_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -# KnownFor embeddings -enable_FavBasedProducer_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_FollowBasedProducer_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_RelaxedAggregatableLogFavBasedProducer_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -# InterestedIn embeddings -enable_LogFavBasedUserInterestedInFromAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_FollowBasedUserInterestedInFromAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_FavBasedUserInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_FollowBasedUserInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_FilteredUserInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_UnfilteredUserInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedAverageAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 - -enable_UserNextInterestedIn_Model20m145k2020: - comment: "Enable the read traffic to (embeddingType, modelVersion) from 0% to 100%. 0 means return EMPTY for all requests." - default_availability: 10000 diff --git a/simclusters-ann/server/src/main/resources/logback.docx b/simclusters-ann/server/src/main/resources/logback.docx new file mode 100644 index 000000000..90e89f629 Binary files /dev/null and b/simclusters-ann/server/src/main/resources/logback.docx differ diff --git a/simclusters-ann/server/src/main/resources/logback.xml b/simclusters-ann/server/src/main/resources/logback.xml deleted file mode 100644 index 0bb0d6646..000000000 --- a/simclusters-ann/server/src/main/resources/logback.xml +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - - - - - - - - - - - - true - - - - - - - - - - - ${log.service.output} - - - ${log.service.output}.%d.gz - - 3GB - - 21 - true - - - %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n - - - - - - ${log.access.output} - - - ${log.access.output}.%d.gz - - 100MB - - 7 - true - - - ${DEFAULT_ACCESS_PATTERN}%n - - - - - - true - ${log.lens.category} - ${log.lens.index} - ${log.lens.tag}/service - - %msg - - - - - - true - ${log.lens.category} - ${log.lens.index} - ${log.lens.tag}/access - - %msg - - - - - - allow_listed_pipeline_executions.log - - - allow_listed_pipeline_executions.log.%d.gz - - 100MB - - 7 - true - - - %date %.-3level ${DEFAULT_SERVICE_PATTERN}%n - - - - - - - - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - ${async_queue_size} - ${async_max_flush_time} - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD deleted file mode 100644 index 00aefb800..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD +++ /dev/null @@ -1,31 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/javax/inject:javax.inject", - "3rdparty/jvm/net/codingwell:scala-guice", - "finagle/finagle-core/src/main", - "finagle/finagle-http/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "finatra-internal/decider/src/main/scala", - "finatra-internal/mtls-thriftmux/src/main/scala", - "finatra/inject/inject-app/src/main/scala", - "finatra/inject/inject-core/src/main/scala", - "finatra/inject/inject-server/src/main/scala", - "finatra/inject/inject-thrift-client/src/main/scala", - "finatra/inject/inject-utils/src/main/scala", - "finatra/utils/src/main/java/com/twitter/finatra/annotations", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/exceptions", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/filters", - "simclusters-ann/server/src/main/resources", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules", - "simclusters-ann/thrift/src/main/thrift:thrift-scala", - "src/thrift/com/twitter/search:earlybird-scala", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms/view", - "twitter-server/server/src/main/scala", - "util/util-app/src/main/scala", - "util/util-core:scala", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx new file mode 100644 index 000000000..0fa5c117d Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx new file mode 100644 index 000000000..e3b5d0b93 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala deleted file mode 100644 index 6168a871c..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnServer.scala +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.simclustersann - -import com.google.inject.Module -import com.twitter.finatra.decider.modules.DeciderModule -import com.twitter.finatra.mtls.thriftmux.Mtls -import com.twitter.finatra.thrift.ThriftServer -import com.twitter.finatra.thrift.filters._ -import com.twitter.finatra.thrift.routing.ThriftRouter -import com.twitter.inject.thrift.modules.ThriftClientIdModule -import com.twitter.relevance_platform.common.exceptions._ -import com.twitter.simclustersann.controllers.SimClustersANNController -import com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantExceptionMapper -import com.twitter.simclustersann.modules._ -import com.twitter.simclustersann.thriftscala.SimClustersANNService -import com.twitter.finagle.Filter -import com.twitter.finatra.annotations.DarkTrafficFilterType -import com.twitter.inject.annotations.Flags -import com.twitter.relevance_platform.common.filters.DarkTrafficFilterModule -import com.twitter.relevance_platform.common.filters.ClientStatsFilter -import com.twitter.simclustersann.common.FlagNames.DisableWarmup - -object SimClustersAnnServerMain extends SimClustersAnnServer - -class SimClustersAnnServer extends ThriftServer with Mtls { - flag( - name = DisableWarmup, - default = false, - help = "If true, no warmup will be run." - ) - - override val name = "simclusters-ann-server" - - override val modules: Seq[Module] = Seq( - CacheModule, - ServiceNameMapperModule, - ClusterConfigMapperModule, - ClusterConfigModule, - ClusterTweetIndexProviderModule, - DeciderModule, - EmbeddingStoreModule, - FlagsModule, - FuturePoolProvider, - RateLimiterModule, - SimClustersANNCandidateSourceModule, - StratoClientProviderModule, - ThriftClientIdModule, - new CustomMtlsThriftWebFormsModule[SimClustersANNService.MethodPerEndpoint](this), - new DarkTrafficFilterModule[SimClustersANNService.ReqRepServicePerEndpoint]() - ) - - def configureThrift(router: ThriftRouter): Unit = { - router - .filter[LoggingMDCFilter] - .filter[TraceIdMDCFilter] - .filter[ThriftMDCFilter] - .filter[ClientStatsFilter] - .filter[ExceptionMappingFilter] - .filter[Filter.TypeAgnostic, DarkTrafficFilterType] - .exceptionMapper[InvalidRequestForSimClustersAnnVariantExceptionMapper] - .exceptionMapper[DeadlineExceededExceptionMapper] - .exceptionMapper[UnhandledExceptionMapper] - .add[SimClustersANNController] - } - - override protected def warmup(): Unit = { - if (!injector.instance[Boolean](Flags.named(DisableWarmup))) { - handle[SimclustersAnnWarmupHandler]() - } - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx new file mode 100644 index 000000000..c57be5136 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala deleted file mode 100644 index ca1078b75..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/SimclustersAnnWarmupHandler.scala +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.simclustersann - -import com.twitter.inject.Logging -import com.twitter.inject.utils.Handler -import javax.inject.Inject -import scala.util.control.NonFatal -import com.google.common.util.concurrent.RateLimiter -import com.twitter.conversions.DurationOps.richDurationFromInt -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Await -import com.twitter.util.ExecutorServiceFuturePool -import com.twitter.util.Future - -class SimclustersAnnWarmupHandler @Inject() ( - clusterTweetCandidatesStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]], - futurePool: ExecutorServiceFuturePool, - rateLimiter: RateLimiter, - statsReceiver: StatsReceiver) - extends Handler - with Logging { - - private val stats = statsReceiver.scope(this.getClass.getName) - - private val scopedStats = stats.scope("fetchFromCache") - private val clusters = scopedStats.counter("clusters") - private val fetchedKeys = scopedStats.counter("keys") - private val failures = scopedStats.counter("failures") - private val success = scopedStats.counter("success") - - private val SimclustersNumber = 144428 - - override def handle(): Unit = { - try { - val clusterIds = List.range(1, SimclustersNumber) - val futures: Seq[Future[Unit]] = clusterIds - .map { clusterId => - clusters.incr() - futurePool { - rateLimiter.acquire() - - Await.result( - clusterTweetCandidatesStore - .get(clusterId) - .onSuccess { _ => - success.incr() - } - .handle { - case NonFatal(e) => - failures.incr() - }, - timeout = 10.seconds - ) - fetchedKeys.incr() - } - } - - Await.result(Future.collect(futures), timeout = 10.minutes) - - } catch { - case NonFatal(e) => error(e.getMessage, e) - } finally { - try { - futurePool.executor.shutdown() - } catch { - case NonFatal(_) => - } - info("Warmup done.") - } - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx new file mode 100644 index 000000000..f29a575d6 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala deleted file mode 100644 index b5264f0bb..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ApproximateCosineSimilarity.scala +++ /dev/null @@ -1,129 +0,0 @@ -package com.twitter.simclustersann.candidate_source - -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclustersann.thriftscala.ScoringAlgorithm -import com.twitter.simclustersann.thriftscala.SimClustersANNConfig -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.util.Duration -import com.twitter.util.Time -import scala.collection.mutable - -/** - * This store looks for tweets whose similarity is close to a Source SimClustersEmbeddingId. - * - * Approximate cosine similarity is the core algorithm to drive this store. - * - * Step 1 - 4 are in "fetchCandidates" method. - * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId - * 2. Fetch top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index). - * 3. Calculate all the tweet candidates' dot-product or approximate cosine similarity to source tweets. - * 4. Take top M tweet candidates by the step 3's score - */ -trait ApproximateCosineSimilarity { - type ScoredTweet = (Long, Double) - def apply( - sourceEmbedding: SimClustersEmbedding, - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - candidateScoresStat: Int => Unit, - clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]], - clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty - ): Seq[ScoredTweet] -} - -object ApproximateCosineSimilarity extends ApproximateCosineSimilarity { - - final val InitialCandidateMapSize = 16384 - val MaxNumResultsUpperBound = 1000 - final val MaxTweetCandidateAgeUpperBound = 175200 - - private class HashMap[A, B](initSize: Int) extends mutable.HashMap[A, B] { - override def initialSize: Int = initSize // 16 - by default - } - - private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = { - embeddingId.internalId match { - case InternalId.TweetId(tweetId) => - Some(tweetId) - case _ => - None - } - } - - override def apply( - sourceEmbedding: SimClustersEmbedding, - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - candidateScoresStat: Int => Unit, - clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty, - clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty - ): Seq[ScoredTweet] = { - val now = Time.now - val earliestTweetId = - if (config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound) - 0L // Disable max tweet age filter - else - SnowflakeId.firstIdFor(now - Duration.fromHours(config.maxTweetCandidateAgeHours)) - val latestTweetId = - SnowflakeId.firstIdFor(now - Duration.fromHours(config.minTweetCandidateAgeHours)) - - // Use Mutable map to optimize performance. The method is thread-safe. - - // Set initial map size to around p75 of map size distribution to avoid too many copying - // from extending the size of the mutable hashmap - val candidateScoresMap = - new HashMap[TweetId, Double](InitialCandidateMapSize) - val candidateNormalizationMap = - new HashMap[TweetId, Double](InitialCandidateMapSize) - - clusterTweetsMap.foreach { - case (clusterId, Some(tweetScores)) if sourceEmbedding.contains(clusterId) => - val sourceClusterScore = sourceEmbedding.getOrElse(clusterId) - - for (i <- 0 until Math.min(tweetScores.size, config.maxTopTweetsPerCluster)) { - val (tweetId, score) = tweetScores(i) - - if (!parseTweetId(sourceEmbeddingId).contains(tweetId) && - tweetId >= earliestTweetId && tweetId <= latestTweetId) { - candidateScoresMap.put( - tweetId, - candidateScoresMap.getOrElse(tweetId, 0.0) + score * sourceClusterScore) - candidateNormalizationMap - .put(tweetId, candidateNormalizationMap.getOrElse(tweetId, 0.0) + score * score) - } - } - case _ => () - } - - candidateScoresStat(candidateScoresMap.size) - - // Re-Rank the candidate by configuration - val processedCandidateScores: Seq[(TweetId, Double)] = candidateScoresMap.map { - case (candidateId, score) => - // Enable Partial Normalization - val processedScore = { - // We applied the "log" version of partial normalization when we rank candidates - // by log cosine similarity - config.annAlgorithm match { - case ScoringAlgorithm.LogCosineSimilarity => - score / sourceEmbedding.logNorm / math.log(1 + candidateNormalizationMap(candidateId)) - case ScoringAlgorithm.CosineSimilarity => - score / sourceEmbedding.l2norm / math.sqrt(candidateNormalizationMap(candidateId)) - case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization => - score / math.sqrt(candidateNormalizationMap(candidateId)) - case ScoringAlgorithm.DotProduct => score - } - } - candidateId -> processedScore - }.toSeq - - processedCandidateScores - .filter(_._2 >= config.minScore) - .sortBy(-_._2) - .take(Math.min(config.maxNumResults, MaxNumResultsUpperBound)) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD deleted file mode 100644 index 21411b854..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -scala_library( - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/twitter/storehaus:core", - "frigate/frigate-common:base", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/base", - "simclusters-ann/thrift/src/main/thrift:thrift-scala", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/simclusters_v2/summingbird/stores", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - "util/util-stats/src/main/scala/com/twitter/finagle/stats", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx new file mode 100644 index 000000000..98ca7c8a3 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx new file mode 100644 index 000000000..e5b329072 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala deleted file mode 100644 index 7be2728f6..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/ExperimentalApproximateCosineSimilarity.scala +++ /dev/null @@ -1,131 +0,0 @@ -package com.twitter.simclustersann.candidate_source - -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclustersann.thriftscala.ScoringAlgorithm -import com.twitter.simclustersann.thriftscala.SimClustersANNConfig -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.util.Duration -import com.twitter.util.Time -import com.google.common.collect.Comparators -import com.twitter.simclusters_v2.common.ClusterId - -/** - * A modified version of OptimizedApproximateCosineSimilarity which uses more java streams to avoid - * materializing intermediate collections. Its performance is still under investigation. - */ -object ExperimentalApproximateCosineSimilarity extends ApproximateCosineSimilarity { - - final val InitialCandidateMapSize = 16384 - val MaxNumResultsUpperBound = 1000 - final val MaxTweetCandidateAgeUpperBound = 175200 - - private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = { - embeddingId.internalId match { - case InternalId.TweetId(tweetId) => - Some(tweetId) - case _ => - None - } - } - private val CompareByScore: java.util.Comparator[(Long, Double)] = - new java.util.Comparator[(Long, Double)] { - override def compare(o1: (Long, Double), o2: (Long, Double)): Int = { - java.lang.Double.compare(o1._2, o2._2) - } - } - class Scores(var score: Double, var norm: Double) - - override def apply( - sourceEmbedding: SimClustersEmbedding, - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - candidateScoresStat: Int => Unit, - clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty, - clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty - ): Seq[ScoredTweet] = { - val now = Time.now - val earliestTweetId = - if (config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound) - 0L // Disable max tweet age filter - else - SnowflakeId.firstIdFor(now - Duration.fromHours(config.maxTweetCandidateAgeHours)) - val latestTweetId = - SnowflakeId.firstIdFor(now - Duration.fromHours(config.minTweetCandidateAgeHours)) - - val candidateScoresMap = new java.util.HashMap[Long, Scores](InitialCandidateMapSize) - val sourceTweetId = parseTweetId(sourceEmbeddingId).getOrElse(0L) - - clusterTweetsMap.foreach { - case (clusterId, Some(tweetScores)) => - val sourceClusterScore = sourceEmbedding.getOrElse(clusterId) - - for (i <- 0 until Math.min(tweetScores.size, config.maxTopTweetsPerCluster)) { - val (tweetId, score) = tweetScores(i) - - if (tweetId >= earliestTweetId && - tweetId <= latestTweetId && - tweetId != sourceTweetId) { - - val scores = candidateScoresMap.get(tweetId) - if (scores == null) { - val scorePair = new Scores( - score = score * sourceClusterScore, - norm = score * score - ) - candidateScoresMap.put(tweetId, scorePair) - } else { - scores.score = scores.score + (score * sourceClusterScore) - scores.norm = scores.norm + (score * score) - } - } - } - case _ => () - } - - candidateScoresStat(candidateScoresMap.size) - - val normFn: (Long, Scores) => (Long, Double) = config.annAlgorithm match { - case ScoringAlgorithm.LogCosineSimilarity => - (candidateId: Long, score: Scores) => - ( - candidateId, - score.score / sourceEmbedding.logNorm / math.log(1 + score.norm) - ) - case ScoringAlgorithm.CosineSimilarity => - (candidateId: Long, score: Scores) => - ( - candidateId, - score.score / sourceEmbedding.l2norm / math.sqrt(score.norm) - ) - case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization => - (candidateId: Long, score: Scores) => - ( - candidateId, - score.score / math.sqrt(score.norm) - ) - case ScoringAlgorithm.DotProduct => - (candidateId: Long, score: Scores) => - ( - candidateId, - score.score - ) - } - - import scala.collection.JavaConverters._ - - val topKCollector = Comparators.greatest( - Math.min(config.maxNumResults, MaxNumResultsUpperBound), - CompareByScore - ) - - candidateScoresMap - .entrySet().stream() - .map[(Long, Double)]((e: java.util.Map.Entry[Long, Scores]) => normFn(e.getKey, e.getValue)) - .filter((s: (Long, Double)) => s._2 >= config.minScore) - .collect(topKCollector) - .asScala - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx new file mode 100644 index 000000000..41dfc75d1 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala deleted file mode 100644 index db2e7613e..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/OptimizedApproximateCosineSimilarity.scala +++ /dev/null @@ -1,112 +0,0 @@ -package com.twitter.simclustersann.candidate_source - -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclustersann.thriftscala.ScoringAlgorithm -import com.twitter.simclustersann.thriftscala.SimClustersANNConfig -import com.twitter.snowflake.id.SnowflakeId -import com.twitter.util.Duration -import com.twitter.util.Time - -/** - * Compared with ApproximateCosineSimilarity, this implementation: - * - moves some computation aroudn to reduce allocations - * - uses a single hashmap to store both scores and normalization coefficients - * - uses some java collections in place of scala ones - * Testing is still in progress, but this implementation shows significant (> 2x) improvements in - * CPU utilization and allocations with 800 tweets per cluster. - */ -object OptimizedApproximateCosineSimilarity extends ApproximateCosineSimilarity { - - final val InitialCandidateMapSize = 16384 - val MaxNumResultsUpperBound = 1000 - final val MaxTweetCandidateAgeUpperBound = 175200 - - private def parseTweetId(embeddingId: SimClustersEmbeddingId): Option[TweetId] = { - embeddingId.internalId match { - case InternalId.TweetId(tweetId) => - Some(tweetId) - case _ => - None - } - } - - override def apply( - sourceEmbedding: SimClustersEmbedding, - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig, - candidateScoresStat: Int => Unit, - clusterTweetsMap: Map[ClusterId, Option[Seq[(TweetId, Double)]]] = Map.empty, - clusterTweetsMapArray: Map[ClusterId, Option[Array[(TweetId, Double)]]] = Map.empty - ): Seq[ScoredTweet] = { - val now = Time.now - val earliestTweetId = - if (config.maxTweetCandidateAgeHours >= MaxTweetCandidateAgeUpperBound) - 0L // Disable max tweet age filter - else - SnowflakeId.firstIdFor(now - Duration.fromHours(config.maxTweetCandidateAgeHours)) - val latestTweetId = - SnowflakeId.firstIdFor(now - Duration.fromHours(config.minTweetCandidateAgeHours)) - - val candidateScoresMap = new java.util.HashMap[Long, (Double, Double)](InitialCandidateMapSize) - - val sourceTweetId = parseTweetId(sourceEmbeddingId).getOrElse(0L) - - clusterTweetsMap.foreach { - case (clusterId, Some(tweetScores)) if sourceEmbedding.contains(clusterId) => - val sourceClusterScore = sourceEmbedding.getOrElse(clusterId) - - for (i <- 0 until Math.min(tweetScores.size, config.maxTopTweetsPerCluster)) { - val (tweetId, score) = tweetScores(i) - - if (tweetId >= earliestTweetId && - tweetId <= latestTweetId && - tweetId != sourceTweetId) { - - val scores = candidateScoresMap.getOrDefault(tweetId, (0.0, 0.0)) - val newScores = ( - scores._1 + score * sourceClusterScore, - scores._2 + score * score, - ) - candidateScoresMap.put(tweetId, newScores) - } - } - case _ => () - } - - candidateScoresStat(candidateScoresMap.size) - - val normFn: (Long, (Double, Double)) => (Long, Double) = config.annAlgorithm match { - case ScoringAlgorithm.LogCosineSimilarity => - (candidateId: Long, score: (Double, Double)) => - candidateId -> score._1 / sourceEmbedding.logNorm / math.log(1 + score._2) - case ScoringAlgorithm.CosineSimilarity => - (candidateId: Long, score: (Double, Double)) => - candidateId -> score._1 / sourceEmbedding.l2norm / math.sqrt(score._2) - case ScoringAlgorithm.CosineSimilarityNoSourceEmbeddingNormalization => - (candidateId: Long, score: (Double, Double)) => - candidateId -> score._1 / math.sqrt(score._2) - case ScoringAlgorithm.DotProduct => - (candidateId: Long, score: (Double, Double)) => (candidateId, score._1) - } - - val scoredTweets: java.util.ArrayList[(Long, Double)] = - new java.util.ArrayList(candidateScoresMap.size) - - val it = candidateScoresMap.entrySet().iterator() - while (it.hasNext) { - val mapEntry = it.next() - val normedScore = normFn(mapEntry.getKey, mapEntry.getValue) - if (normedScore._2 >= config.minScore) - scoredTweets.add(normedScore) - } - import scala.collection.JavaConverters._ - - scoredTweets.asScala - .sortBy(-_._2) - .take(Math.min(config.maxNumResults, MaxNumResultsUpperBound)) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx new file mode 100644 index 000000000..92cfb4438 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala deleted file mode 100644 index bb12a54f1..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source/SimClustersANNCandidateSource.scala +++ /dev/null @@ -1,102 +0,0 @@ -package com.twitter.simclustersann.candidate_source - -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.frigate.common.base.Stats -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.simclustersann.thriftscala.SimClustersANNConfig -import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate -import com.twitter.storehaus.ReadableStore -import com.twitter.util.Future - -/** - * This store looks for tweets whose similarity is close to a Source SimClustersEmbeddingId. - * - * Approximate cosine similarity is the core algorithm to drive this store. - * - * Step 1 - 4 are in "fetchCandidates" method. - * 1. Retrieve the SimClusters Embedding by the SimClustersEmbeddingId - * 2. Fetch top N clusters' top tweets from the clusterTweetCandidatesStore (TopTweetsPerCluster index). - * 3. Calculate all the tweet candidates' dot-product or approximate cosine similarity to source tweets. - * 4. Take top M tweet candidates by the step 3's score - */ -case class SimClustersANNCandidateSource( - approximateCosineSimilarity: ApproximateCosineSimilarity, - clusterTweetCandidatesStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]], - simClustersEmbeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - statsReceiver: StatsReceiver) { - private val stats = statsReceiver.scope(this.getClass.getName) - private val fetchSourceEmbeddingStat = stats.scope("fetchSourceEmbedding") - private val fetchCandidatesStat = stats.scope("fetchCandidates") - private val candidateScoresStat = stats.stat("candidateScoresMap") - - def get( - query: SimClustersANNCandidateSource.Query - ): Future[Option[Seq[SimClustersANNTweetCandidate]]] = { - val sourceEmbeddingId = query.sourceEmbeddingId - val config = query.config - for { - maybeSimClustersEmbedding <- Stats.track(fetchSourceEmbeddingStat) { - simClustersEmbeddingStore.get(query.sourceEmbeddingId) - } - maybeFilteredCandidates <- maybeSimClustersEmbedding match { - case Some(sourceEmbedding) => - for { - candidates <- Stats.trackSeq(fetchCandidatesStat) { - fetchCandidates(sourceEmbeddingId, sourceEmbedding, config) - } - } yield { - fetchCandidatesStat - .stat(sourceEmbeddingId.embeddingType.name, sourceEmbeddingId.modelVersion.name).add( - candidates.size) - Some(candidates) - } - case None => - fetchCandidatesStat - .stat(sourceEmbeddingId.embeddingType.name, sourceEmbeddingId.modelVersion.name).add(0) - Future.None - } - } yield { - maybeFilteredCandidates - } - } - - private def fetchCandidates( - sourceEmbeddingId: SimClustersEmbeddingId, - sourceEmbedding: SimClustersEmbedding, - config: SimClustersANNConfig - ): Future[Seq[SimClustersANNTweetCandidate]] = { - - val clusterIds = - sourceEmbedding - .truncate(config.maxScanClusters).getClusterIds() - .toSet - - Future - .collect { - clusterTweetCandidatesStore.multiGet(clusterIds) - }.map { clusterTweetsMap => - approximateCosineSimilarity( - sourceEmbedding = sourceEmbedding, - sourceEmbeddingId = sourceEmbeddingId, - config = config, - candidateScoresStat = (i: Int) => candidateScoresStat.add(i), - clusterTweetsMap = clusterTweetsMap - ).map { - case (tweetId, score) => - SimClustersANNTweetCandidate( - tweetId = tweetId, - score = score - ) - } - } - } -} - -object SimClustersANNCandidateSource { - case class Query( - sourceEmbeddingId: SimClustersEmbeddingId, - config: SimClustersANNConfig) -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD deleted file mode 100644 index 75d63312d..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD +++ /dev/null @@ -1,5 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx new file mode 100644 index 000000000..80f91312e Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx new file mode 100644 index 000000000..7db6c0a70 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala deleted file mode 100644 index ae2c36177..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common/FlagNames.scala +++ /dev/null @@ -1,31 +0,0 @@ -package com.twitter.simclustersann.common - -object FlagNames { - - /** - * Global Settings - */ - final val ServiceTimeout = "service.timeout" - final val DarkTrafficFilterDeciderKey = "thrift.dark.traffic.filter.decider_key" - - /** - * Cache Setting - */ - final val CacheDest = "cache_module.dest" - final val CacheTimeout = "cache_module.timeout" - // Only turn on the async update when the SANN Cluster has the production taffic. - final val CacheAsyncUpdate = "cache_module.async_update" - - /** - * Warmup Settings - */ - final val DisableWarmup = "warmup.disable" - final val NumberOfThreads = "warmup.thread_number" - final val RateLimiterQPS = "warmup.rate_limiter_qps" - - /** - * Algorithm Parameters - */ - final val MaxTopTweetPerCluster = "sim_clusters.ann.max_top_tweets_per_cluster" - -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD deleted file mode 100644 index 69ccce158..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD +++ /dev/null @@ -1,29 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/javax/inject:javax.inject", - "3rdparty/jvm/net/codingwell:scala-guice", - "decider/src/main/scala", - "finagle/finagle-core/src/main", - "finatra/inject/inject-core/src/main/scala", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift:controller", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/exceptions", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/filters", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/modules", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/response", - "finatra/thrift/src/main/scala/com/twitter/finatra/thrift/routing", - "representation-manager/server/src/main/scala/com/twitter/representation_manager/migration", - "scrooge/scrooge-core/src/main/scala", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters", - "simclusters-ann/thrift/src/main/thrift:thrift-scala", - "src/scala/com/twitter/simclusters_v2/candidate_source", - "twitter-server/server/src/main/scala", - "util/util-core:scala", - "util/util-slf4j-api/src/main/scala", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx new file mode 100644 index 000000000..af1649ad3 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx new file mode 100644 index 000000000..766c26227 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala deleted file mode 100644 index 459972b32..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/controllers/SimClustersANNController.scala +++ /dev/null @@ -1,80 +0,0 @@ -package com.twitter.simclustersann.controllers - -import com.twitter.conversions.DurationOps._ -import com.twitter.finatra.thrift.Controller -import com.twitter.simclustersann.thriftscala.SimClustersANNService.GetTweetCandidates -import com.twitter.simclustersann.thriftscala.SimClustersANNService -import com.twitter.simclustersann.thriftscala.Query -import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate -import com.twitter.scrooge.Request -import com.twitter.scrooge.Response -import javax.inject.Inject -import com.twitter.finagle.Service -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.inject.annotations.Flag -import com.twitter.simclustersann.candidate_source.{ - SimClustersANNCandidateSource => SANNSimClustersANNCandidateSource -} -import com.twitter.simclustersann.common.FlagNames -import com.twitter.simclustersann.filters.GetTweetCandidatesResponseStatsFilter -import com.twitter.simclustersann.filters.SimClustersAnnVariantFilter -import com.twitter.util.Future -import com.twitter.util.JavaTimer -import com.twitter.util.Timer - -class SimClustersANNController @Inject() ( - @Flag(FlagNames.ServiceTimeout) serviceTimeout: Int, - variantFilter: SimClustersAnnVariantFilter, - getTweetCandidatesResponseStatsFilter: GetTweetCandidatesResponseStatsFilter, - sannCandidateSource: SANNSimClustersANNCandidateSource, - globalStats: StatsReceiver) - extends Controller(SimClustersANNService) { - - import SimClustersANNController._ - - private val stats: StatsReceiver = globalStats.scope(this.getClass.getCanonicalName) - private val timer: Timer = new JavaTimer(true) - - val filteredService: Service[Request[GetTweetCandidates.Args], Response[ - Seq[SimClustersANNTweetCandidate] - ]] = { - variantFilter - .andThen(getTweetCandidatesResponseStatsFilter) - .andThen(Service.mk(handler)) - } - - handle(GetTweetCandidates).withService(filteredService) - - private def handler( - request: Request[GetTweetCandidates.Args] - ): Future[Response[Seq[SimClustersANNTweetCandidate]]] = { - val query: Query = request.args.query - val simClustersANNCandidateSourceQuery = SANNSimClustersANNCandidateSource.Query( - sourceEmbeddingId = query.sourceEmbeddingId, - config = query.config - ) - - val result = sannCandidateSource - .get(simClustersANNCandidateSourceQuery).map { - case Some(tweetCandidatesSeq) => - Response(tweetCandidatesSeq.map { tweetCandidate => - SimClustersANNTweetCandidate( - tweetId = tweetCandidate.tweetId, - score = tweetCandidate.score - ) - }) - case None => - DefaultResponse - } - - result.raiseWithin(serviceTimeout.milliseconds)(timer).rescue { - case e: Throwable => - stats.scope("failures").counter(e.getClass.getCanonicalName).incr() - Future.value(DefaultResponse) - } - } -} - -object SimClustersANNController { - val DefaultResponse: Response[Seq[SimClustersANNTweetCandidate]] = Response(Seq.empty) -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD deleted file mode 100644 index c557c50ac..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD +++ /dev/null @@ -1,12 +0,0 @@ -scala_library( - sources = ["*.scala"], - compiler_option_sets = ["fatal_warnings"], - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "finagle/finagle-core/src/main", - "finatra-internal/mtls-thriftmux/src/main/scala", - "finatra-internal/thrift/src/main/thrift:thrift-scala", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx new file mode 100644 index 000000000..7cbb7d855 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx new file mode 100644 index 000000000..2ab5a22a2 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala deleted file mode 100644 index c9b046253..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantException.scala +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.simclustersann.exceptions - -import com.twitter.finagle.RequestException -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.ModelVersion - -case class InvalidRequestForSimClustersAnnVariantException( - modelVersion: ModelVersion, - embeddingType: EmbeddingType, - actualServiceName: String, - expectedServiceName: Option[String]) - extends RequestException( - s"Request with model version ($modelVersion) and embedding type ($embeddingType) cannot be " + - s"processed by service variant ($actualServiceName)." + - s" Expected service variant: $expectedServiceName.", - null) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx new file mode 100644 index 000000000..273f1f983 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala deleted file mode 100644 index fecca048e..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/InvalidRequestForSimClustersAnnVariantExceptionMapper.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.simclustersann.exceptions - -import com.twitter.finatra.thrift.exceptions.ExceptionMapper -import com.twitter.finatra.thrift.thriftscala.ClientError -import com.twitter.finatra.thrift.thriftscala.ClientErrorCause -import com.twitter.util.Future -import com.twitter.util.logging.Logging -import javax.inject.Singleton - -/** - * An exception mapper designed to handle - * [[com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantException]] - * by returning a Thrift IDL defined Client Error. - */ -@Singleton -class InvalidRequestForSimClustersAnnVariantExceptionMapper - extends ExceptionMapper[InvalidRequestForSimClustersAnnVariantException, Nothing] - with Logging { - - override def handleException( - throwable: InvalidRequestForSimClustersAnnVariantException - ): Future[Nothing] = { - error("Invalid Request For SimClusters Ann Variant Exception", throwable) - - Future.exception(ClientError(ClientErrorCause.BadRequest, throwable.getMessage())) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx new file mode 100644 index 000000000..4ff9bf941 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala deleted file mode 100644 index c5fd16d8c..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions/MissingClusterConfigForSimClustersAnnVariantException.scala +++ /dev/null @@ -1,6 +0,0 @@ -package com.twitter.simclustersann.exceptions - -case class MissingClusterConfigForSimClustersAnnVariantException(sannServiceName: String) - extends IllegalStateException( - s"No cluster configuration found for service ($sannServiceName)", - null) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD deleted file mode 100644 index cb28d02b4..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD +++ /dev/null @@ -1,13 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [ - "finagle/finagle-core/src/main", - "finatra/inject/inject-app/src/main/java/com/twitter/inject/annotations", - "finatra/inject/inject-core/src/main/scala", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/simclustersann/multicluster", - "scrooge/scrooge-core/src/main/scala", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions", - "simclusters-ann/thrift/src/main/thrift:thrift-scala", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx new file mode 100644 index 000000000..9fe5641a5 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx new file mode 100644 index 000000000..222e06539 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala deleted file mode 100644 index f9c9a354f..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/GetTweetCandidatesResponseStatsFilter.scala +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.simclustersann.filters - -import com.twitter.finagle.Service -import com.twitter.finagle.SimpleFilter -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.scrooge.Request -import com.twitter.scrooge.Response -import com.twitter.simclustersann.thriftscala.SimClustersANNService -import com.twitter.util.Future -import javax.inject.Inject -import javax.inject.Singleton - -@Singleton -class GetTweetCandidatesResponseStatsFilter @Inject() ( - statsReceiver: StatsReceiver) - extends SimpleFilter[Request[SimClustersANNService.GetTweetCandidates.Args], Response[ - SimClustersANNService.GetTweetCandidates.SuccessType - ]] { - - private[this] val stats = statsReceiver.scope("method_response_stats").scope("getTweetCandidates") - private[this] val candidateScoreStats = stats.stat("candidate_score_x1000") - private[this] val emptyResponseCounter = stats.counter("empty") - private[this] val nonEmptyResponseCounter = stats.counter("non_empty") - override def apply( - request: Request[SimClustersANNService.GetTweetCandidates.Args], - service: Service[Request[SimClustersANNService.GetTweetCandidates.Args], Response[ - SimClustersANNService.GetTweetCandidates.SuccessType - ]] - ): Future[Response[SimClustersANNService.GetTweetCandidates.SuccessType]] = { - val response = service(request) - - response.onSuccess { successResponse => - if (successResponse.value.size == 0) - emptyResponseCounter.incr() - else - nonEmptyResponseCounter.incr() - successResponse.value.foreach { candidate => - candidateScoreStats.add(candidate.score.toFloat * 1000) - } - } - response - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx new file mode 100644 index 000000000..67a82917a Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala deleted file mode 100644 index 8cfa088dd..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/filters/SimClustersAnnVariantFilter.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.simclustersann.filters - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.Service -import com.twitter.finagle.SimpleFilter -import com.twitter.relevance_platform.simclustersann.multicluster.ServiceNameMapper -import com.twitter.scrooge.Request -import com.twitter.scrooge.Response -import com.twitter.simclustersann.exceptions.InvalidRequestForSimClustersAnnVariantException -import com.twitter.simclustersann.thriftscala.SimClustersANNService -import com.twitter.util.Future -import javax.inject.Inject -import javax.inject.Singleton - -@Singleton -class SimClustersAnnVariantFilter @Inject() ( - serviceNameMapper: ServiceNameMapper, - serviceIdentifier: ServiceIdentifier, -) extends SimpleFilter[Request[SimClustersANNService.GetTweetCandidates.Args], Response[ - SimClustersANNService.GetTweetCandidates.SuccessType - ]] { - override def apply( - request: Request[SimClustersANNService.GetTweetCandidates.Args], - service: Service[Request[SimClustersANNService.GetTweetCandidates.Args], Response[ - SimClustersANNService.GetTweetCandidates.SuccessType - ]] - ): Future[Response[SimClustersANNService.GetTweetCandidates.SuccessType]] = { - - validateRequest(request) - service(request) - } - - private def validateRequest( - request: Request[SimClustersANNService.GetTweetCandidates.Args] - ): Unit = { - val modelVersion = request.args.query.sourceEmbeddingId.modelVersion - val embeddingType = request.args.query.config.candidateEmbeddingType - - val actualServiceName = serviceIdentifier.service - - val expectedServiceName = serviceNameMapper.getServiceName(modelVersion, embeddingType) - - expectedServiceName match { - case Some(name) if name == actualServiceName => () - case _ => - throw InvalidRequestForSimClustersAnnVariantException( - modelVersion, - embeddingType, - actualServiceName, - expectedServiceName) - } - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD deleted file mode 100644 index dcca09b7f..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD +++ /dev/null @@ -1,24 +0,0 @@ -scala_library( - compiler_option_sets = ["fatal_warnings"], - tags = ["bazel-compatible"], - dependencies = [ - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle/finagle-stats", - "finatra/inject/inject-core/src/main/scala", - "frigate/frigate-common/src/main/scala/com/twitter/frigate/common/store/strato", - "hermit/hermit-core/src/main/scala/com/twitter/hermit/store/common", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/injection", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/common/readablestore", - "relevance-platform/src/main/scala/com/twitter/relevance_platform/simclustersann/multicluster", - "representation-manager/client/src/main/scala/com/twitter/representation_manager", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/candidate_source", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/common", - "simclusters-ann/server/src/main/scala/com/twitter/simclustersann/exceptions", - "simclusters-ann/thrift/src/main/thrift:thrift-scala", - "src/scala/com/twitter/simclusters_v2/common", - "src/scala/com/twitter/simclusters_v2/summingbird", - "src/scala/com/twitter/storehaus_internal/memcache", - "src/scala/com/twitter/storehaus_internal/util", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift-scala", - ], -) diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx new file mode 100644 index 000000000..86f66e894 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/BUILD.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx new file mode 100644 index 000000000..a501617a1 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala deleted file mode 100644 index 6abc37b8d..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CacheModule.scala +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.finagle.memcached.Client -import javax.inject.Singleton -import com.twitter.conversions.DurationOps._ -import com.twitter.inject.TwitterModule -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.inject.annotations.Flag -import com.twitter.simclustersann.common.FlagNames -import com.twitter.storehaus_internal.memcache.MemcacheStore -import com.twitter.storehaus_internal.util.ClientName -import com.twitter.storehaus_internal.util.ZkEndPoint - -object CacheModule extends TwitterModule { - - @Singleton - @Provides - def providesCache( - @Flag(FlagNames.CacheDest) cacheDest: String, - @Flag(FlagNames.CacheTimeout) cacheTimeout: Int, - serviceIdentifier: ServiceIdentifier, - stats: StatsReceiver - ): Client = - MemcacheStore.memcachedClient( - name = ClientName("memcache_simclusters_ann"), - dest = ZkEndPoint(cacheDest), - timeout = cacheTimeout.milliseconds, - retries = 0, - statsReceiver = stats.scope("cache_client"), - serviceIdentifier = serviceIdentifier - ) -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx new file mode 100644 index 000000000..0bfeece9b Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala deleted file mode 100644 index 84fec3974..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigMapperModule.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.inject.TwitterModule -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfigMapper -import javax.inject.Singleton - -object ClusterConfigMapperModule extends TwitterModule { - @Singleton - @Provides - def providesClusterConfigMapper( - ): ClusterConfigMapper = { - ClusterConfigMapper - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx new file mode 100644 index 000000000..de8920c79 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala deleted file mode 100644 index ae4092760..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterConfigModule.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.inject.TwitterModule -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfig -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfigMapper -import com.twitter.simclustersann.exceptions.MissingClusterConfigForSimClustersAnnVariantException -import javax.inject.Singleton - -object ClusterConfigModule extends TwitterModule { - @Singleton - @Provides - def providesClusterConfig( - serviceIdentifier: ServiceIdentifier, - clusterConfigMapper: ClusterConfigMapper - ): ClusterConfig = { - val serviceName = serviceIdentifier.service - - clusterConfigMapper.getClusterConfig(serviceName) match { - case Some(config) => config - case None => throw MissingClusterConfigForSimClustersAnnVariantException(serviceName) - } - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx new file mode 100644 index 000000000..45b832db1 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala deleted file mode 100644 index 34281fa22..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ClusterTweetIndexProviderModule.scala +++ /dev/null @@ -1,95 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.conversions.DurationOps._ -import com.twitter.decider.Decider -import com.twitter.finagle.memcached.Client -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.hermit.store.common.ObservedCachedReadableStore -import com.twitter.hermit.store.common.ObservedMemcachedReadableStore -import com.twitter.inject.TwitterModule -import com.twitter.inject.annotations.Flag -import com.twitter.relevance_platform.common.injection.LZ4Injection -import com.twitter.relevance_platform.common.injection.SeqObjectInjection -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterConfig -import com.twitter.relevance_platform.simclustersann.multicluster.ClusterTweetIndexStoreConfig -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.ModelVersions -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.summingbird.stores.ClusterKey -import com.twitter.simclusters_v2.summingbird.stores.TopKTweetsForClusterKeyReadableStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclustersann.common.FlagNames -import com.twitter.storehaus.ReadableStore - -import javax.inject.Singleton - -object ClusterTweetIndexProviderModule extends TwitterModule { - - @Singleton - @Provides - // Provides ClusterTweetIndex Store based on different maxResults settings on the same store - // Create a different provider if index is in a different store - def providesClusterTweetIndex( - @Flag(FlagNames.MaxTopTweetPerCluster) maxTopTweetPerCluster: Int, - @Flag(FlagNames.CacheAsyncUpdate) asyncUpdate: Boolean, - clusterConfig: ClusterConfig, - serviceIdentifier: ServiceIdentifier, - stats: StatsReceiver, - decider: Decider, - simClustersANNCacheClient: Client - ): ReadableStore[ClusterId, Seq[(TweetId, Double)]] = { - // Build the underling cluster-to-tweet store - val topTweetsForClusterStore = clusterConfig.clusterTweetIndexStoreConfig match { - // If the config returns Manhattan tweet index config, we read from a RO MH store - case manhattanConfig: ClusterTweetIndexStoreConfig.Manhattan => - TopKTweetsForClusterKeyReadableStore.getClusterToTopKTweetsStoreFromManhattanRO( - maxTopTweetPerCluster, - manhattanConfig, - serviceIdentifier) - case memCacheConfig: ClusterTweetIndexStoreConfig.Memcached => - TopKTweetsForClusterKeyReadableStore.getClusterToTopKTweetsStoreFromMemCache( - maxTopTweetPerCluster, - memCacheConfig, - serviceIdentifier) - case _ => - // Bad instance - ReadableStore.empty - } - - val embeddingType: EmbeddingType = clusterConfig.candidateTweetEmbeddingType - val modelVersion: String = ModelVersions.toKnownForModelVersion(clusterConfig.modelVersion) - - val store: ReadableStore[ClusterId, Seq[(TweetId, Double)]] = - topTweetsForClusterStore.composeKeyMapping { id: ClusterId => - ClusterKey(id, modelVersion, embeddingType) - } - - val memcachedTopTweetsForClusterStore = - ObservedMemcachedReadableStore.fromCacheClient( - backingStore = store, - cacheClient = simClustersANNCacheClient, - ttl = 15.minutes, - asyncUpdate = asyncUpdate - )( - valueInjection = LZ4Injection.compose(SeqObjectInjection[(Long, Double)]()), - statsReceiver = stats.scope("cluster_tweet_index_mem_cache"), - keyToString = { k => - // prod cache key : SimClusters_LZ4/cluster_to_tweet/clusterId_embeddingType_modelVersion - s"scz:c2t:${k}_${embeddingType}_${modelVersion}_$maxTopTweetPerCluster" - } - ) - - val cachedStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]] = { - ObservedCachedReadableStore.from[ClusterId, Seq[(TweetId, Double)]]( - memcachedTopTweetsForClusterStore, - ttl = 10.minute, - maxKeys = 150000, - cacheName = "cluster_tweet_index_cache", - windowSize = 10000L - )(stats.scope("cluster_tweet_index_store")) - } - cachedStore - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx new file mode 100644 index 000000000..561cdb1e6 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala deleted file mode 100644 index 678943d2a..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/CustomMtlsThriftWebFormsModule.scala +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.twitter.finatra.mtls.thriftmux.modules.MtlsThriftWebFormsModule -import com.twitter.finatra.thrift.ThriftServer -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.InternalId -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.thriftwebforms.MethodOptions -import com.twitter.thriftwebforms.view.ServiceResponseView -import com.twitter.util.Future -import com.twitter.simclustersann.thriftscala.SimClustersANNTweetCandidate -import com.twitter.simclustersann.thriftscala.Query -import com.twitter.simclustersann.thriftscala.SimClustersANNConfig -import com.twitter.simclustersann.thriftscala.ScoringAlgorithm -import com.twitter.thriftwebforms.MethodOptions.Access -import scala.reflect.ClassTag -import com.twitter.simclustersann.thriftscala.SimClustersANNService -import scala.collection.mutable - -class CustomMtlsThriftWebFormsModule[T: ClassTag](server: ThriftServer) - extends MtlsThriftWebFormsModule[T](server: ThriftServer) { - - private val Nbsp = " " - private val LdapGroups = Seq("recosplat-sensitive-data-medium", "simclusters-ann-admins") - - override protected def methodOptions: Map[String, MethodOptions] = { - val tweetId = 1568796529690902529L - val sannDefaultQuery = SimClustersANNService.GetTweetCandidates.Args( - query = Query( - sourceEmbeddingId = SimClustersEmbeddingId( - embeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet, - modelVersion = ModelVersion.Model20m145k2020, - internalId = InternalId.TweetId(tweetId) - ), - config = SimClustersANNConfig( - maxNumResults = 10, - minScore = 0.0, - candidateEmbeddingType = EmbeddingType.LogFavBasedTweet, - maxTopTweetsPerCluster = 400, - maxScanClusters = 50, - maxTweetCandidateAgeHours = 24, - minTweetCandidateAgeHours = 0, - annAlgorithm = ScoringAlgorithm.CosineSimilarity - ) - )) - - Seq("getTweetCandidates") - .map( - _ -> MethodOptions( - defaultRequestValue = Some(sannDefaultQuery), - responseRenderers = Seq(renderTimeline), - allowedAccessOverride = Some(Access.ByLdapGroup(LdapGroups)) - )).toMap - } - - val FullAccessLdapGroups: Seq[String] = - Seq( - "recosplat-sensitive-data-medium", - "simclusters-ann-admins", - "recos-platform-admins" - ) - - override protected def defaultMethodAccess: MethodOptions.Access = { - MethodOptions.Access.ByLdapGroup(FullAccessLdapGroups) - } - - def renderTimeline(r: AnyRef): Future[ServiceResponseView] = { - val simClustersANNTweetCandidates = r match { - case response: Iterable[_] => - response.map(x => x.asInstanceOf[SimClustersANNTweetCandidate]).toSeq - case _ => Seq() - } - renderTweets(simClustersANNTweetCandidates) - } - - private def renderTweets( - simClustersANNTweetCandidates: Seq[SimClustersANNTweetCandidate] - ): Future[ServiceResponseView] = { - val htmlSb = new mutable.StringBuilder() - val headerHtml = s"""

Tweet Candidates

""" - val tweetsHtml = simClustersANNTweetCandidates.map { simClustersANNTweetCandidate => - val tweetId = simClustersANNTweetCandidate.tweetId - val score = simClustersANNTweetCandidate.score - s""" score: $score

""" - }.mkString - - htmlSb ++= headerHtml - htmlSb ++= Nbsp - htmlSb ++= tweetsHtml - Future.value( - ServiceResponseView( - "SimClusters ANN Tweet Candidates", - htmlSb.toString(), - Seq("//platform.twitter.com/widgets.js") - ) - ) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx new file mode 100644 index 000000000..a07233084 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala deleted file mode 100644 index 7111501fe..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/EmbeddingStoreModule.scala +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.decider.Decider -import com.twitter.finagle.memcached.{Client => MemcachedClient} -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.inject.TwitterModule -import com.twitter.representation_manager.StoreBuilder -import com.twitter.representation_manager.config.{ - DefaultClientConfig => RepresentationManagerDefaultClientConfig -} -import com.twitter.representation_manager.thriftscala.SimClustersEmbeddingView -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.stores.SimClustersEmbeddingStore -import com.twitter.simclusters_v2.thriftscala.EmbeddingType -import com.twitter.simclusters_v2.thriftscala.EmbeddingType._ -import com.twitter.simclusters_v2.thriftscala.ModelVersion -import com.twitter.simclusters_v2.thriftscala.ModelVersion._ -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storehaus.ReadableStore -import com.twitter.strato.client.{Client => StratoClient} -import javax.inject.Singleton - -object EmbeddingStoreModule extends TwitterModule { - - val TweetEmbeddings: Set[SimClustersEmbeddingView] = Set( - SimClustersEmbeddingView(LogFavLongestL2EmbeddingTweet, Model20m145kUpdated), - SimClustersEmbeddingView(LogFavLongestL2EmbeddingTweet, Model20m145k2020) - ) - - val UserEmbeddings: Set[SimClustersEmbeddingView] = Set( - // KnownFor - SimClustersEmbeddingView(FavBasedProducer, Model20m145kUpdated), - SimClustersEmbeddingView(FavBasedProducer, Model20m145k2020), - SimClustersEmbeddingView(FollowBasedProducer, Model20m145k2020), - SimClustersEmbeddingView(AggregatableLogFavBasedProducer, Model20m145k2020), - // InterestedIn - SimClustersEmbeddingView(UnfilteredUserInterestedIn, Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedAverageAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedBooktypeMaxpoolingAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedLargestDimMaxpoolingAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedLouvainMaxpoolingAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView( - LogFavBasedUserInterestedConnectedMaxpoolingAddressBookFromIIAPE, - Model20m145k2020), - SimClustersEmbeddingView(UserNextInterestedIn, Model20m145k2020), - SimClustersEmbeddingView(LogFavBasedUserInterestedInFromAPE, Model20m145k2020) - ) - - @Singleton - @Provides - def providesEmbeddingStore( - stratoClient: StratoClient, - memCachedClient: MemcachedClient, - decider: Decider, - stats: StatsReceiver - ): ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] = { - - val rmsStoreBuilder = new StoreBuilder( - clientConfig = RepresentationManagerDefaultClientConfig, - stratoClient = stratoClient, - memCachedClient = memCachedClient, - globalStats = stats, - ) - - val underlyingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = { - val tweetEmbeddingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = TweetEmbeddings - .map(embeddingView => - ( - (embeddingView.embeddingType, embeddingView.modelVersion), - rmsStoreBuilder - .buildSimclustersTweetEmbeddingStoreWithEmbeddingIdAsKey(embeddingView))).toMap - - val userEmbeddingStores: Map[ - (EmbeddingType, ModelVersion), - ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding] - ] = UserEmbeddings - .map(embeddingView => - ( - (embeddingView.embeddingType, embeddingView.modelVersion), - rmsStoreBuilder - .buildSimclustersUserEmbeddingStoreWithEmbeddingIdAsKey(embeddingView))).toMap - - tweetEmbeddingStores ++ userEmbeddingStores - } - - SimClustersEmbeddingStore.buildWithDecider( - underlyingStores = underlyingStores, - decider = decider, - statsReceiver = stats - ) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx new file mode 100644 index 000000000..ea8d535b8 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala deleted file mode 100644 index ebcaeca27..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FlagsModule.scala +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.twitter.inject.TwitterModule -import com.twitter.simclustersann.common.FlagNames - -object FlagsModule extends TwitterModule { - - flag[Int]( - name = FlagNames.ServiceTimeout, - default = 40, - help = "The threshold of Request Timeout" - ) - - flag[String]( - name = FlagNames.DarkTrafficFilterDeciderKey, - default = "dark_traffic_filter", - help = "Dark traffic filter decider key" - ) - - flag[String]( - name = FlagNames.CacheDest, - default = "/s/cache/content_recommender_unified_v2", - help = "Path to memcache service. Currently using CR uniform scoring cache" - ) - - flag[Int]( - name = FlagNames.CacheTimeout, - default = 15, - help = "The threshold of MemCache Timeout" - ) - - flag[Boolean]( - name = FlagNames.CacheAsyncUpdate, - default = false, - help = "Whether to enable the async update for the MemCache" - ) - - flag[Int]( - name = FlagNames.MaxTopTweetPerCluster, - default = 200, - help = "Maximum number of tweets to take per each simclusters" - ) - -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx new file mode 100644 index 000000000..eb0df8ab4 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala deleted file mode 100644 index c66ade392..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/FuturePoolProvider.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.inject.TwitterModule -import com.twitter.inject.annotations.Flag -import com.twitter.simclustersann.common.FlagNames.NumberOfThreads -import com.twitter.util.ExecutorServiceFuturePool -import java.util.concurrent.Executors -import javax.inject.Singleton -object FuturePoolProvider extends TwitterModule { - flag[Int]( - name = NumberOfThreads, - default = 20, - help = "The number of threads in the future pool." - ) - - @Singleton - @Provides - def providesFuturePool( - @Flag(NumberOfThreads) numberOfThreads: Int - ): ExecutorServiceFuturePool = { - val threadPool = Executors.newFixedThreadPool(numberOfThreads) - new ExecutorServiceFuturePool(threadPool) { - override def toString: String = s"warmup-future-pool-$executor)" - } - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx new file mode 100644 index 000000000..eb9494f72 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala deleted file mode 100644 index 66e26d4f5..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/RateLimiterModule.scala +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.common.util.concurrent.RateLimiter -import com.google.inject.Provides -import com.twitter.inject.TwitterModule -import com.twitter.inject.annotations.Flag -import com.twitter.simclustersann.common.FlagNames.RateLimiterQPS -import javax.inject.Singleton - -object RateLimiterModule extends TwitterModule { - flag[Int]( - name = RateLimiterQPS, - default = 1000, - help = "The QPS allowed by the rate limiter." - ) - - @Singleton - @Provides - def providesRateLimiter( - @Flag(RateLimiterQPS) rateLimiterQps: Int - ): RateLimiter = - RateLimiter.create(rateLimiterQps) -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx new file mode 100644 index 000000000..7f5d9e6c0 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala deleted file mode 100644 index 91a38f2a1..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/ServiceNameMapperModule.scala +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.inject.TwitterModule -import com.twitter.relevance_platform.simclustersann.multicluster.ServiceNameMapper -import javax.inject.Singleton - -object ServiceNameMapperModule extends TwitterModule { - @Singleton - @Provides - def providesServiceNameMapper( - ): ServiceNameMapper = { - ServiceNameMapper - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx new file mode 100644 index 000000000..aa7172646 Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala deleted file mode 100644 index b5f9ee5da..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/SimClustersANNCandidateSourceModule.scala +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import com.twitter.finagle.stats.StatsReceiver -import com.twitter.inject.TwitterModule -import com.twitter.simclusters_v2.common.ClusterId -import com.twitter.simclusters_v2.common.SimClustersEmbedding -import com.twitter.simclusters_v2.common.TweetId -import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId -import com.twitter.storehaus.ReadableStore -import javax.inject.Singleton -import com.twitter.simclustersann.candidate_source.ApproximateCosineSimilarity -import com.twitter.simclustersann.candidate_source.ExperimentalApproximateCosineSimilarity -import com.twitter.simclustersann.candidate_source.OptimizedApproximateCosineSimilarity -import com.twitter.simclustersann.candidate_source.SimClustersANNCandidateSource - -object SimClustersANNCandidateSourceModule extends TwitterModule { - - val acsFlag = flag[String]( - name = "approximate_cosine_similarity", - default = "original", - help = - "Select different implementations of the approximate cosine similarity algorithm, for testing optimizations", - ) - @Singleton - @Provides - def provides( - embeddingStore: ReadableStore[SimClustersEmbeddingId, SimClustersEmbedding], - cachedClusterTweetIndexStore: ReadableStore[ClusterId, Seq[(TweetId, Double)]], - statsReceiver: StatsReceiver - ): SimClustersANNCandidateSource = { - - val approximateCosineSimilarity = acsFlag() match { - case "original" => ApproximateCosineSimilarity - case "optimized" => OptimizedApproximateCosineSimilarity - case "experimental" => ExperimentalApproximateCosineSimilarity - case _ => ApproximateCosineSimilarity - } - - new SimClustersANNCandidateSource( - approximateCosineSimilarity = approximateCosineSimilarity, - clusterTweetCandidatesStore = cachedClusterTweetIndexStore, - simClustersEmbeddingStore = embeddingStore, - statsReceiver = statsReceiver.scope("simClustersANNCandidateSource") - ) - } -} diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx new file mode 100644 index 000000000..113da2abc Binary files /dev/null and b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.docx differ diff --git a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala b/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala deleted file mode 100644 index 0766c70a7..000000000 --- a/simclusters-ann/server/src/main/scala/com/twitter/simclustersann/modules/StratoClientProviderModule.scala +++ /dev/null @@ -1,20 +0,0 @@ -package com.twitter.simclustersann.modules - -import com.google.inject.Provides -import javax.inject.Singleton -import com.twitter.inject.TwitterModule -import com.twitter.finagle.mtls.authentication.ServiceIdentifier -import com.twitter.strato.client.Client -import com.twitter.strato.client.Strato - -object StratoClientProviderModule extends TwitterModule { - - @Singleton - @Provides - def providesCache( - serviceIdentifier: ServiceIdentifier, - ): Client = Strato.client - .withMutualTls(serviceIdentifier) - .build() - -} diff --git a/simclusters-ann/thrift/src/main/thrift/BUILD b/simclusters-ann/thrift/src/main/thrift/BUILD deleted file mode 100644 index fce3b9c8c..000000000 --- a/simclusters-ann/thrift/src/main/thrift/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -create_thrift_libraries( - base_name = "thrift", - sources = ["**/*.thrift"], - platform = "java8", - tags = ["bazel-compatible"], - dependency_roots = [ - "finatra-internal/thrift/src/main/thrift", - "src/thrift/com/twitter/simclusters_v2:simclusters_v2-thrift", - ], - generate_languages = [ - "java", - "scala", - ], - provides_java_name = "simclusters-ann-thrift-java", - provides_scala_name = "simclusters-ann-thrift-scala", -) diff --git a/simclusters-ann/thrift/src/main/thrift/BUILD.docx b/simclusters-ann/thrift/src/main/thrift/BUILD.docx new file mode 100644 index 000000000..267769796 Binary files /dev/null and b/simclusters-ann/thrift/src/main/thrift/BUILD.docx differ diff --git a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx new file mode 100644 index 000000000..0311cada9 Binary files /dev/null and b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.docx differ diff --git a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift b/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift deleted file mode 100644 index 9c327febe..000000000 --- a/simclusters-ann/thrift/src/main/thrift/simClustersAnn.thrift +++ /dev/null @@ -1,59 +0,0 @@ -namespace java com.twitter.simclustersann.thriftjava -#@namespace scala com.twitter.simclustersann.thriftscala - -include "finatra-thrift/finatra_thrift_exceptions.thrift" -include "com/twitter/simclusters_v2/identifier.thrift" -include "com/twitter/simclusters_v2/score.thrift" - -struct Query { - 1: required identifier.SimClustersEmbeddingId sourceEmbeddingId; - 2: required SimClustersANNConfig config; -} - -struct SimClustersANNTweetCandidate { - 1: required i64 tweetId (personalDataType = 'TweetId'); - 2: required double score; -} - -struct SimClustersANNConfig { - 1: required i32 maxNumResults; - 2: required double minScore; - 3: required identifier.EmbeddingType candidateEmbeddingType; - 4: required i32 maxTopTweetsPerCluster; - 5: required i32 maxScanClusters; - 6: required i32 maxTweetCandidateAgeHours; - 7: required i32 minTweetCandidateAgeHours; - 8: required ScoringAlgorithm annAlgorithm; -} - -/** - * The algorithm type to identify the score algorithm. - **/ -enum ScoringAlgorithm { - DotProduct = 1, - CosineSimilarity = 2, - LogCosineSimilarity = 3, - CosineSimilarityNoSourceEmbeddingNormalization = 4, // Score = (Source dot Candidate) / candidate_l2_norm -}(hasPersonalData = 'false') - -enum InvalidResponseParameter { - INVALID_EMBEDDING_TYPE = 1, - INVALID_MODEL_VERSION = 2, -} - -exception InvalidResponseParameterException { - 1: required InvalidResponseParameter errorCode, - 2: optional string message // failure reason -} - -service SimClustersANNService { - - list getTweetCandidates( - 1: required Query query; - ) throws ( - 1: InvalidResponseParameterException e; - 2: finatra_thrift_exceptions.ServerError serverError; - 3: finatra_thrift_exceptions.ClientError clientError; - ); - -} diff --git a/src/java/com/twitter/search/README.docx b/src/java/com/twitter/search/README.docx new file mode 100644 index 000000000..014caa211 Binary files /dev/null and b/src/java/com/twitter/search/README.docx differ diff --git a/src/java/com/twitter/search/README.md b/src/java/com/twitter/search/README.md deleted file mode 100644 index f92a9bdf3..000000000 --- a/src/java/com/twitter/search/README.md +++ /dev/null @@ -1,50 +0,0 @@ -# Tweet Search System (Earlybird) -> **TL;DR** Tweet Search System (Earlybird) find tweets from people you follow, rank them, and serve the tweets to Home. - -## What is Tweet Search System (Earlybird)? -[Earlybird](http://notes.stephenholiday.com/Earlybird.pdf) is a **real-time search system** based on [Apache Lucene](https://lucene.apache.org/) to support the high volume of queries and content updates. The major use cases are Relevance Search (specifically, Text search) and Timeline In-network Tweet retrieval (or UserID based search). It is designed to enable the efficient indexing and querying of billions of tweets, and to provide low-latency search results, even with heavy query loads. - -## How it is related to the Home Timeline Recommendation Algorithm - -![in-network](img/in-network.png) - -At Twitter, we use Tweet Search System (Earlybird) to do Home Timeline In-network Tweet retrieval: given a list of following users, find their recently posted tweets. Earlybird (Search Index) is the major candidate source for in-network tweets across Following tab and For You tab. - - -## High-level architecture -We split our entire tweet search index into three clusters: a **realtime** cluster indexing all public tweets posted in about the last 7 days, a **protected** cluster indexing all protected tweets for the same timeframe; and an **archive** cluster indexing all tweets ever posted, up to about two days ago. - -Earlybird addresses the challenges of scaling real-time search by splitting each cluster across multiple **partitions**, each responsible for a portion of the index. The architecture uses a distributed *inverted index* that is sharded and replicated. This design allows for efficient index updates and query processing. - -The system also employs an incremental indexing approach, enabling it to process and index new tweets in real-time as they arrive. With single writer, multiple reader structure, Earlybird can handle a large number of real-time updates and queries concurrently while maintaining low query latency. The system can achieve high query throughput and low query latency while maintaining a high degree of index freshness. - - -### Indexing -* Ingesters read tweets and user modifications from kafka topics, extract fields and features from them and write the extracted data to intermediate kafka topics for Earlybirds to consume, index and serve. -* Feature Update Service feeds feature updates such as up-to-date engagement (like, retweets, replies) counts to Earlybird. -![indexing](img/indexing.png) - -### Serving -Earlybird roots fanout requests to different Earlybird clusters or partitions. Upon receiving responses from the clusters or partitions, roots merge the responses before finally returning the merged response to the client. -![serving](img/serving.png) - -## Use cases - -1. Tweet Search - * Top search - * Latest search - -![top](img/top-search.png) - -2. Candidate generation - * Timeline (For You Tab, Following Tab) - * Notifications - -![home](img/foryou.png) - -## References -* "Earlybird: Real-Time Search at Twitter" (http://notes.stephenholiday.com/Earlybird.pdf) -* "Reducing search indexing latency to one second" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2020/reducing-search-indexing-latency-to-one-second) -* "Omnisearch index formats" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2016/omnisearch-index-formats) - - diff --git a/src/java/com/twitter/search/common/README.docx b/src/java/com/twitter/search/common/README.docx new file mode 100644 index 000000000..8227df958 Binary files /dev/null and b/src/java/com/twitter/search/common/README.docx differ diff --git a/src/java/com/twitter/search/common/README.md b/src/java/com/twitter/search/common/README.md deleted file mode 100644 index c7f2e38bb..000000000 --- a/src/java/com/twitter/search/common/README.md +++ /dev/null @@ -1 +0,0 @@ -Contains code that is common to multiple earlybird services (ingesters, roots and earlybird). \ No newline at end of file diff --git a/src/java/com/twitter/search/common/converter/earlybird/BUILD b/src/java/com/twitter/search/common/converter/earlybird/BUILD deleted file mode 100644 index a5d4ea4ae..000000000 --- a/src/java/com/twitter/search/common/converter/earlybird/BUILD +++ /dev/null @@ -1,57 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/joda-time", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/httpcomponents:httpcore", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "decider/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:token-util", - "src/java/com/twitter/common_internal/text:text-penguin7", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/debug", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance:entities_and_filters", - "src/java/com/twitter/search/common/relevance:text", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util:longintconverter", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/lang", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/text/regex", - "src/java/com/twitter/search/common/util/thrift:thrift-utils", - "src/java/com/twitter/search/common/util/url", - "src/java/com/twitter/search/ingester/model", - "src/thrift/com/twitter/search/common:constants-java", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/search/common/debug:debug-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - ], -) diff --git a/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx b/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx new file mode 100644 index 000000000..35120101d Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx new file mode 100644 index 000000000..bc5b8c6da Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.docx differ diff --git a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java deleted file mode 100644 index afde8a84e..000000000 --- a/src/java/com/twitter/search/common/converter/earlybird/BasicIndexingConverter.java +++ /dev/null @@ -1,647 +0,0 @@ -package com.twitter.search.common.converter.earlybird; - -import java.io.IOException; -import java.util.Date; -import java.util.List; -import java.util.Optional; -import javax.annotation.concurrent.NotThreadSafe; - -import com.google.common.base.Preconditions; - -import org.apache.commons.collections.CollectionUtils; -import org.joda.time.DateTime; -import org.joda.time.DateTimeZone; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.converter.earlybird.EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures; -import com.twitter.search.common.indexing.thriftjava.Place; -import com.twitter.search.common.indexing.thriftjava.PotentialLocation; -import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.relevance.entities.GeoObject; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.entities.TwitterQuotedMessage; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.common.util.spatial.GeoUtil; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.tweetypie.thriftjava.ComposerSource; - -/** - * Converts a TwitterMessage into a ThriftVersionedEvents. This is only responsible for data that - * is available immediately when a Tweet is created. Some data, like URL data, isn't available - * immediately, and so it is processed later, in the DelayedIndexingConverter and sent as an - * update. In order to achieve this we create the document in 2 passes: - * - * 1. BasicIndexingConverter builds thriftVersionedEvents with the fields that do not require - * external services. - * - * 2. DelayedIndexingConverter builds all the document fields depending on external services, once - * those services have processed the relevant Tweet and we have retrieved that data. - */ -@NotThreadSafe -public class BasicIndexingConverter { - private static final Logger LOG = LoggerFactory.getLogger(BasicIndexingConverter.class); - - private static final SearchCounter NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS = - SearchCounter.export("num_nullcast_feature_flag_set_tweets"); - private static final SearchCounter NUM_NULLCAST_TWEETS = - SearchCounter.export("num_nullcast_tweets"); - private static final SearchCounter NUM_NON_NULLCAST_TWEETS = - SearchCounter.export("num_non_nullcast_tweets"); - private static final SearchCounter ADJUSTED_BAD_CREATED_AT_COUNTER = - SearchCounter.export("adjusted_incorrect_created_at_timestamp"); - private static final SearchCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS = - SearchCounter.export("inconsistent_tweet_id_and_created_at_ms"); - private static final SearchCounter NUM_SELF_THREAD_TWEETS = - SearchCounter.export("num_self_thread_tweets"); - private static final SearchCounter NUM_EXCLUSIVE_TWEETS = - SearchCounter.export("num_exclusive_tweets"); - - // If a tweet carries a timestamp smaller than this timestamp, we consider the timestamp invalid, - // because twitter does not even exist back then before: Sun, 01 Jan 2006 00:00:00 GMT - private static final long VALID_CREATION_TIME_THRESHOLD_MILLIS = - new DateTime(2006, 1, 1, 0, 0, 0, DateTimeZone.UTC).getMillis(); - - private final EncodedFeatureBuilder featureBuilder; - private final Schema schema; - private final EarlybirdCluster cluster; - - public BasicIndexingConverter(Schema schema, EarlybirdCluster cluster) { - this.featureBuilder = new EncodedFeatureBuilder(); - this.schema = schema; - this.cluster = cluster; - } - - /** - * This function converts TwitterMessage to ThriftVersionedEvents, which is a generic data - * structure that can be consumed by Earlybird directly. - */ - public ThriftVersionedEvents convertMessageToThrift( - TwitterMessage message, - boolean strict, - List penguinVersions) throws IOException { - Preconditions.checkNotNull(message); - Preconditions.checkNotNull(penguinVersions); - - ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents() - .setId(message.getId()); - - ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot(); - - for (PenguinVersion penguinVersion : penguinVersions) { - ThriftDocument document = - buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion); - - ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent() - .setDocument(document) - .setEventType(ThriftIndexingEventType.INSERT) - .setSortId(message.getId()); - message.getFromUserTwitterId().map(thriftIndexingEvent::setUid); - versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent); - } - - return versionedEvents; - } - - private ThriftDocument buildDocumentForPenguinVersion( - ImmutableSchemaInterface schemaSnapshot, - TwitterMessage message, - boolean strict, - PenguinVersion penguinVersion) throws IOException { - TweetFeatureWithEncodeFeatures tweetFeature = - featureBuilder.createTweetFeaturesFromTwitterMessage( - message, penguinVersion, schemaSnapshot); - - EarlybirdThriftDocumentBuilder builder = - buildBasicFields(message, schemaSnapshot, cluster, tweetFeature); - - buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion); - buildGeoFields(builder, message, tweetFeature.versionedFeatures); - buildRetweetAndReplyFields(builder, message, strict); - buildQuotesFields(builder, message); - buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures); - buildAnnotationFields(builder, message); - buildNormalizedMinEngagementFields(builder, tweetFeature.encodedFeatures, cluster); - buildDirectedAtFields(builder, message); - - builder.withSpaceIdFields(message.getSpaceIds()); - - return builder.build(); - } - - /** - * Build the basic fields for a tweet. - */ - public static EarlybirdThriftDocumentBuilder buildBasicFields( - TwitterMessage message, - ImmutableSchemaInterface schemaSnapshot, - EarlybirdCluster cluster, - TweetFeatureWithEncodeFeatures tweetFeature) { - EarlybirdEncodedFeatures extendedEncodedFeatures = tweetFeature.extendedEncodedFeatures; - if (extendedEncodedFeatures == null && EarlybirdCluster.isTwitterMemoryFormatCluster(cluster)) { - extendedEncodedFeatures = EarlybirdEncodedFeatures.newEncodedTweetFeatures( - schemaSnapshot, EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD); - } - EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder( - tweetFeature.encodedFeatures, - extendedEncodedFeatures, - new EarlybirdFieldConstants(), - schemaSnapshot); - - builder.withID(message.getId()); - - final Date createdAt = message.getDate(); - long createdAtMs = createdAt == null ? 0L : createdAt.getTime(); - - createdAtMs = fixCreatedAtTimeStampIfNecessary(message.getId(), createdAtMs); - - if (createdAtMs > 0L) { - builder.withCreatedAt((int) (createdAtMs / 1000)); - } - - builder.withTweetSignature(tweetFeature.versionedFeatures.getTweetSignature()); - - if (message.getConversationId() > 0) { - long conversationId = message.getConversationId(); - builder.withLongField( - EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(), conversationId); - // We only index conversation ID when it is different from the tweet ID. - if (message.getId() != conversationId) { - builder.withLongField( - EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName(), conversationId); - } - } - - if (message.getComposerSource().isPresent()) { - ComposerSource composerSource = message.getComposerSource().get(); - builder.withIntField( - EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName(), composerSource.getValue()); - if (composerSource == ComposerSource.CAMERA) { - builder.withCameraComposerSourceFlag(); - } - } - - EarlybirdEncodedFeatures encodedFeatures = tweetFeature.encodedFeatures; - if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) { - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM); - } - if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG)) { - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM); - } - - if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) { - builder.withOffensiveFlag(); - } - - if (message.getNullcast()) { - NUM_NULLCAST_TWEETS.increment(); - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.NULLCAST_FILTER_TERM); - } else { - NUM_NON_NULLCAST_TWEETS.increment(); - } - if (encodedFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG)) { - NUM_NULLCAST_FEATURE_FLAG_SET_TWEETS.increment(); - } - if (message.isSelfThread()) { - builder.addFilterInternalFieldTerm( - EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM); - NUM_SELF_THREAD_TWEETS.increment(); - } - - if (message.isExclusive()) { - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.EXCLUSIVE_FILTER_TERM); - builder.withLongField( - EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(), - message.getExclusiveConversationAuthorId()); - NUM_EXCLUSIVE_TWEETS.increment(); - } - - builder.withLanguageCodes(message.getLanguage(), message.getBCP47LanguageTag()); - - return builder; - } - - /** - * Build the user fields. - */ - public static void buildUserFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - VersionedTweetFeatures versionedTweetFeatures, - PenguinVersion penguinVersion) { - // 1. Set all the from user fields. - if (message.getFromUserTwitterId().isPresent()) { - builder.withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), - message.getFromUserTwitterId().get()) - // CSF - .withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), - message.getFromUserTwitterId().get()); - } else { - LOG.warn("fromUserTwitterId is not set in TwitterMessage! Status id: " + message.getId()); - } - - if (message.getFromUserScreenName().isPresent()) { - String fromUser = message.getFromUserScreenName().get(); - String normalizedFromUser = - NormalizerHelper.normalizeWithUnknownLocale(fromUser, penguinVersion); - - builder - .withWhiteSpaceTokenizedScreenNameField( - EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), - normalizedFromUser) - .withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), - normalizedFromUser); - - if (message.getTokenizedFromUserScreenName().isPresent()) { - builder.withCamelCaseTokenizedScreenNameField( - EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(), - fromUser, - normalizedFromUser, - message.getTokenizedFromUserScreenName().get()); - } - } - - Optional toUserScreenName = message.getToUserLowercasedScreenName(); - if (toUserScreenName.isPresent() && !toUserScreenName.get().isEmpty()) { - builder.withStringField( - EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), - NormalizerHelper.normalizeWithUnknownLocale(toUserScreenName.get(), penguinVersion)); - } - - if (versionedTweetFeatures.isSetUserDisplayNameTokenStreamText()) { - builder.withTokenStreamField(EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), - versionedTweetFeatures.getUserDisplayNameTokenStreamText(), - versionedTweetFeatures.getUserDisplayNameTokenStream()); - } - } - - /** - * Build the geo fields. - */ - public static void buildGeoFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - VersionedTweetFeatures versionedTweetFeatures) { - double lat = GeoUtil.ILLEGAL_LATLON; - double lon = GeoUtil.ILLEGAL_LATLON; - if (message.getGeoLocation() != null) { - GeoObject location = message.getGeoLocation(); - builder.withGeoField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), - location.getLatitude(), location.getLongitude(), location.getAccuracy()); - - if (location.getSource() != null) { - builder.withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstants.formatGeoType(location.getSource())); - } - - if (GeoUtil.validateGeoCoordinates(location.getLatitude(), location.getLongitude())) { - lat = location.getLatitude(); - lon = location.getLongitude(); - } - } - - // See SEARCH-14317 for investigation on how much space geo filed is used in archive cluster. - // In lucene archives, this CSF is needed regardless of whether geoLocation is set. - builder.withLatLonCSF(lat, lon); - - if (versionedTweetFeatures.isSetTokenizedPlace()) { - Place place = versionedTweetFeatures.getTokenizedPlace(); - Preconditions.checkArgument(place.isSetId(), "Place ID not set for tweet " - + message.getId()); - Preconditions.checkArgument(place.isSetFullName(), - "Place full name not set for tweet " + message.getId()); - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName()); - builder - .withStringField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName(), place.getId()) - .withStringField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), - place.getFullName()); - if (place.isSetCountryCode()) { - builder.withStringField(EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName(), - place.getCountryCode()); - } - } - - if (versionedTweetFeatures.isSetTokenizedProfileGeoEnrichment()) { - ProfileGeoEnrichment profileGeoEnrichment = - versionedTweetFeatures.getTokenizedProfileGeoEnrichment(); - Preconditions.checkArgument( - profileGeoEnrichment.isSetPotentialLocations(), - "ProfileGeoEnrichment.potentialLocations not set for tweet " - + message.getId()); - List potentialLocations = profileGeoEnrichment.getPotentialLocations(); - Preconditions.checkArgument( - !potentialLocations.isEmpty(), - "Found tweet with an empty ProfileGeoEnrichment.potentialLocations: " - + message.getId()); - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.PROFILE_GEO_FILTER_TERM); - for (PotentialLocation potentialLocation : potentialLocations) { - if (potentialLocation.isSetCountryCode()) { - builder.withStringField( - EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName(), - potentialLocation.getCountryCode()); - } - if (potentialLocation.isSetRegion()) { - builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(), - potentialLocation.getRegion()); - } - if (potentialLocation.isSetLocality()) { - builder.withStringField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(), - potentialLocation.getLocality()); - } - } - } - - builder.withPlacesField(message.getPlaces()); - } - - /** - * Build the retweet and reply fields. - */ - public static void buildRetweetAndReplyFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - boolean strict) { - long retweetUserIdVal = -1; - long sharedStatusIdVal = -1; - if (message.getRetweetMessage() != null) { - if (message.getRetweetMessage().getSharedId() != null) { - sharedStatusIdVal = message.getRetweetMessage().getSharedId(); - } - if (message.getRetweetMessage().hasSharedUserTwitterId()) { - retweetUserIdVal = message.getRetweetMessage().getSharedUserTwitterId(); - } - } - - long inReplyToStatusIdVal = -1; - long inReplyToUserIdVal = -1; - if (message.isReply()) { - if (message.getInReplyToStatusId().isPresent()) { - inReplyToStatusIdVal = message.getInReplyToStatusId().get(); - } - if (message.getToUserTwitterId().isPresent()) { - inReplyToUserIdVal = message.getToUserTwitterId().get(); - } - } - - buildRetweetAndReplyFields( - retweetUserIdVal, - sharedStatusIdVal, - inReplyToStatusIdVal, - inReplyToUserIdVal, - strict, - builder); - } - - /** - * Build the quotes fields. - */ - public static void buildQuotesFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message) { - if (message.getQuotedMessage() != null) { - TwitterQuotedMessage quoted = message.getQuotedMessage(); - if (quoted != null && quoted.getQuotedStatusId() > 0 && quoted.getQuotedUserId() > 0) { - builder.withQuote(quoted.getQuotedStatusId(), quoted.getQuotedUserId()); - } - } - } - - /** - * Build directed at field. - */ - public static void buildDirectedAtFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message) { - if (message.getDirectedAtUserId().isPresent() && message.getDirectedAtUserId().get() > 0) { - builder.withDirectedAtUser(message.getDirectedAtUserId().get()); - builder.addFilterInternalFieldTerm(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM); - } - } - - /** - * Build the versioned features for a tweet. - */ - public static void buildVersionedFeatureFields( - EarlybirdThriftDocumentBuilder builder, - VersionedTweetFeatures versionedTweetFeatures) { - builder - .withHashtagsField(versionedTweetFeatures.getHashtags()) - .withMentionsField(versionedTweetFeatures.getMentions()) - .withStocksFields(versionedTweetFeatures.getStocks()) - .withResolvedLinksText(versionedTweetFeatures.getNormalizedResolvedUrlText()) - .withTokenStreamField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - versionedTweetFeatures.getTweetTokenStreamText(), - versionedTweetFeatures.isSetTweetTokenStream() - ? versionedTweetFeatures.getTweetTokenStream() : null) - .withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(), - versionedTweetFeatures.getSource()) - .withStringField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(), - versionedTweetFeatures.getNormalizedSource()); - - // Internal fields for smileys and question marks - if (versionedTweetFeatures.hasPositiveSmiley) { - builder.withStringField( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.HAS_POSITIVE_SMILEY); - } - if (versionedTweetFeatures.hasNegativeSmiley) { - builder.withStringField( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY); - } - if (versionedTweetFeatures.hasQuestionMark) { - builder.withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - EarlybirdThriftDocumentBuilder.QUESTION_MARK); - } - } - - /** - * Build the escherbird annotations for a tweet. - */ - public static void buildAnnotationFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message) { - List escherbirdAnnotations = - message.getEscherbirdAnnotations(); - if (CollectionUtils.isEmpty(escherbirdAnnotations)) { - return; - } - - builder.addFacetSkipList(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName()); - - for (TwitterMessage.EscherbirdAnnotation annotation : escherbirdAnnotations) { - String groupDomainEntity = String.format("%d.%d.%d", - annotation.groupId, annotation.domainId, annotation.entityId); - String domainEntity = String.format("%d.%d", annotation.domainId, annotation.entityId); - String entity = String.format("%d", annotation.entityId); - - builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), - groupDomainEntity); - builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), - domainEntity); - builder.withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), - entity); - } - } - - /** - * Build the correct ThriftIndexingEvent's fields based on retweet and reply status. - */ - public static void buildRetweetAndReplyFields( - long retweetUserIdVal, - long sharedStatusIdVal, - long inReplyToStatusIdVal, - long inReplyToUserIdVal, - boolean strict, - EarlybirdThriftDocumentBuilder builder) { - Optional retweetUserId = Optional.of(retweetUserIdVal).filter(x -> x > 0); - Optional sharedStatusId = Optional.of(sharedStatusIdVal).filter(x -> x > 0); - Optional inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0); - Optional inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0); - - // We have six combinations here. A Tweet can be - // 1) a reply to another tweet (then it has both in-reply-to-user-id and - // in-reply-to-status-id set), - // 2) directed-at a user (then it only has in-reply-to-user-id set), - // 3) not a reply at all. - // Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and - // retweet-status-id set). - // - // We want to set some fields unconditionally, and some fields (reference-author-id and - // shared-status-id) depending on the reply/retweet combination. - // - // 1. Normal tweet (not a reply, not a retweet). None of the fields should be set. - // - // 2. Reply to a tweet (both in-reply-to-user-id and in-reply-to-status-id set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // SHARED_STATUS_ID_CSF should be set to in-reply-to-status-id - // IS_REPLY_FLAG should be set - // - // 3. Directed-at a user (only in-reply-to-user-id is set). - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should be set - // - // 4. Retweet of a normal tweet (retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // - // 5. Retweet of a reply (both in-reply-to-user-id and in-reply-to-status-id set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id (retweet beats reply!) - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // 6. Retweet of a directed-at tweet (only in-reply-to-user-id is set, - // retweet-user-id and retweet-status-id are set). - // RETWEET_SOURCE_USER_ID_FIELD should be set to retweet-user-id - // SHARED_STATUS_ID_CSF should be set to retweet-status-id - // IS_RETWEET_FLAG should be set - // IN_REPLY_TO_USER_ID_FIELD should be set to in-reply-to-user-id - // IS_REPLY_FLAG should NOT be set - // - // In other words: - // SHARED_STATUS_ID_CSF logic: if this is a retweet SHARED_STATUS_ID_CSF should be set to - // retweet-status-id, otherwise if it's a reply to a tweet, it should be set to - // in-reply-to-status-id. - - Preconditions.checkState(retweetUserId.isPresent() == sharedStatusId.isPresent()); - - if (retweetUserId.isPresent()) { - builder.withNativeRetweet(retweetUserId.get(), sharedStatusId.get()); - - if (inReplyToUserId.isPresent()) { - // Set IN_REPLY_TO_USER_ID_FIELD even if this is a retweet of a reply. - builder.withInReplyToUserID(inReplyToUserId.get()); - } - } else { - // If this is a retweet of a reply, we don't want to mark it as a reply, or override fields - // set by the retweet logic. - // If we are in this branch, this is not a retweet. Potentially, we set the reply flag, - // and override shared-status-id and reference-author-id. - - if (inReplyToStatusId.isPresent()) { - if (strict) { - // Enforcing that if this is a reply to a tweet, then it also has a replied-to user. - Preconditions.checkState(inReplyToUserId.isPresent()); - } - builder.withReplyFlag(); - builder.withLongField( - EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), - inReplyToStatusId.get()); - builder.withLongField( - EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName(), - inReplyToStatusId.get()); - } - if (inReplyToUserId.isPresent()) { - builder.withReplyFlag(); - builder.withInReplyToUserID(inReplyToUserId.get()); - } - } - } - - /** - * Build the engagement fields. - */ - public static void buildNormalizedMinEngagementFields( - EarlybirdThriftDocumentBuilder builder, - EarlybirdEncodedFeatures encodedFeatures, - EarlybirdCluster cluster) throws IOException { - if (EarlybirdCluster.isArchive(cluster)) { - int favoriteCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT); - int retweetCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT); - int replyCount = encodedFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT); - builder - .withNormalizedMinEngagementField( - EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - favoriteCount); - builder - .withNormalizedMinEngagementField( - EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - retweetCount); - builder - .withNormalizedMinEngagementField( - EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - replyCount); - } - } - - /** - * As seen in SEARCH-5617, we sometimes have incorrect createdAt. This method tries to fix them - * by extracting creation time from snowflake when possible. - */ - public static long fixCreatedAtTimeStampIfNecessary(long id, long createdAtMs) { - if (createdAtMs < VALID_CREATION_TIME_THRESHOLD_MILLIS - && id > SnowflakeIdParser.SNOWFLAKE_ID_LOWER_BOUND) { - // This tweet has a snowflake ID, and we can extract timestamp from the ID. - ADJUSTED_BAD_CREATED_AT_COUNTER.increment(); - return SnowflakeIdParser.getTimestampFromTweetId(id); - } else if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(id, createdAtMs)) { - LOG.error( - "Found inconsistent tweet ID and created at timestamp: [statusID={}], [createdAtMs={}]", - id, createdAtMs); - INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS.increment(); - } - - return createdAtMs; - } -} diff --git a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx new file mode 100644 index 000000000..fe5e99466 Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.docx differ diff --git a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java deleted file mode 100644 index 1ed40bcd4..000000000 --- a/src/java/com/twitter/search/common/converter/earlybird/CombinedIndexingConverter.java +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.search.common.converter.earlybird; - -import java.io.IOException; -import java.util.List; - -import javax.annotation.concurrent.NotThreadSafe; - -import com.google.common.base.Preconditions; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; - -/** - * CombinedIndexingConverter builds objects from TwitterMessage to ThriftVersionedEvent. - * - * It is used in tests and in offline jobs, so all data is available on the TwitterMessage. This - * means that we don't need to split up the ThriftVersionedEvents into basic events and update - * events, like we do in the realtime pipeline using the BasicIndexingConverter and the - * DelayedIndexingConverter. - */ -@NotThreadSafe -public class CombinedIndexingConverter { - private final EncodedFeatureBuilder featureBuilder; - private final Schema schema; - private final EarlybirdCluster cluster; - - public CombinedIndexingConverter(Schema schema, EarlybirdCluster cluster) { - this.featureBuilder = new EncodedFeatureBuilder(); - this.schema = schema; - this.cluster = cluster; - } - - /** - * Converts a TwitterMessage to a Thrift representation. - */ - public ThriftVersionedEvents convertMessageToThrift( - TwitterMessage message, - boolean strict, - List penguinVersions) throws IOException { - Preconditions.checkNotNull(message); - Preconditions.checkNotNull(penguinVersions); - - ThriftVersionedEvents versionedEvents = new ThriftVersionedEvents() - .setId(message.getId()); - - ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot(); - - for (PenguinVersion penguinVersion : penguinVersions) { - ThriftDocument document = - buildDocumentForPenguinVersion(schemaSnapshot, message, strict, penguinVersion); - - ThriftIndexingEvent thriftIndexingEvent = new ThriftIndexingEvent() - .setDocument(document) - .setEventType(ThriftIndexingEventType.INSERT) - .setSortId(message.getId()); - message.getFromUserTwitterId().map(thriftIndexingEvent::setUid); - versionedEvents.putToVersionedEvents(penguinVersion.getByteValue(), thriftIndexingEvent); - } - - return versionedEvents; - } - - private ThriftDocument buildDocumentForPenguinVersion( - ImmutableSchemaInterface schemaSnapshot, - TwitterMessage message, - boolean strict, - PenguinVersion penguinVersion) throws IOException { - EncodedFeatureBuilder.TweetFeatureWithEncodeFeatures tweetFeature = - featureBuilder.createTweetFeaturesFromTwitterMessage( - message, penguinVersion, schemaSnapshot); - - EarlybirdThriftDocumentBuilder builder = - BasicIndexingConverter.buildBasicFields(message, schemaSnapshot, cluster, tweetFeature); - - BasicIndexingConverter - .buildUserFields(builder, message, tweetFeature.versionedFeatures, penguinVersion); - BasicIndexingConverter.buildGeoFields(builder, message, tweetFeature.versionedFeatures); - DelayedIndexingConverter.buildURLFields(builder, message, tweetFeature.encodedFeatures); - BasicIndexingConverter.buildRetweetAndReplyFields(builder, message, strict); - BasicIndexingConverter.buildQuotesFields(builder, message); - BasicIndexingConverter.buildVersionedFeatureFields(builder, tweetFeature.versionedFeatures); - DelayedIndexingConverter.buildCardFields(builder, message, penguinVersion); - BasicIndexingConverter.buildAnnotationFields(builder, message); - BasicIndexingConverter.buildNormalizedMinEngagementFields( - builder, tweetFeature.encodedFeatures, cluster); - DelayedIndexingConverter.buildNamedEntityFields(builder, message); - BasicIndexingConverter.buildDirectedAtFields(builder, message); - - return builder.build(); - } -} diff --git a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx new file mode 100644 index 000000000..5e7f3dfc1 Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.docx differ diff --git a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java b/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java deleted file mode 100644 index 0ed3ac134..000000000 --- a/src/java/com/twitter/search/common/converter/earlybird/DelayedIndexingConverter.java +++ /dev/null @@ -1,594 +0,0 @@ -package com.twitter.search.common.converter.earlybird; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import javax.annotation.Nullable; - -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringUtils; -import org.apache.http.annotation.NotThreadSafe; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.token.TokenizedCharSequenceStream; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntity; -import com.twitter.decider.Decider; -import com.twitter.search.common.constants.SearchCardType; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.indexing.thriftjava.SearchCard2; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.entities.TwitterMessageUser; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftFieldData; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.common.util.lang.ThriftLanguageUtil; -import com.twitter.search.common.util.text.LanguageIdentifierHelper; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.common.util.text.TokenizerHelper; -import com.twitter.search.common.util.text.TokenizerResult; -import com.twitter.search.common.util.text.TweetTokenStreamSerializer; -import com.twitter.service.spiderduck.gen.MediaTypes; -import com.twitter.search.common.metrics.SearchCounter; - -/** - * Create and populate ThriftVersionedEvents from the URL data, card data, and named entities - * contained in a TwitterMessage. This data is delayed because these services take a few seconds - * to process tweets, and we want to send the basic data available in the BasicIndexingConverter as - * soon as possible, so we send the additional data a few seconds later, as an update. - * - * Prefer to add data and processing to the BasicIndexingConverter when possible. Only add data here - * if your data source _requires_ data from an external service AND the external service takes at - * least a few seconds to process new tweets. - */ -@NotThreadSafe -public class DelayedIndexingConverter { - private static final SearchCounter NUM_TWEETS_WITH_CARD_URL = - SearchCounter.export("tweets_with_card_url"); - private static final SearchCounter NUM_TWEETS_WITH_NUMERIC_CARD_URI = - SearchCounter.export("tweets_with_numeric_card_uri"); - private static final SearchCounter NUM_TWEETS_WITH_INVALID_CARD_URI = - SearchCounter.export("tweets_with_invalid_card_uri"); - private static final SearchCounter TOTAL_URLS = - SearchCounter.export("total_urls_on_tweets"); - private static final SearchCounter MEDIA_URLS_ON_TWEETS = - SearchCounter.export("media_urls_on_tweets"); - private static final SearchCounter NON_MEDIA_URLS_ON_TWEETS = - SearchCounter.export("non_media_urls_on_tweets"); - public static final String INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER = - "index_url_description_and_title"; - - private static class ThriftDocumentWithEncodedTweetFeatures { - private final ThriftDocument document; - private final EarlybirdEncodedFeatures encodedFeatures; - - public ThriftDocumentWithEncodedTweetFeatures(ThriftDocument document, - EarlybirdEncodedFeatures encodedFeatures) { - this.document = document; - this.encodedFeatures = encodedFeatures; - } - - public ThriftDocument getDocument() { - return document; - } - - public EarlybirdEncodedFeatures getEncodedFeatures() { - return encodedFeatures; - } - } - - // The list of all the encoded_tweet_features flags that might be updated by this converter. - // No extended_encoded_tweet_features are updated (otherwise they should be in this list too). - private static final List UPDATED_FLAGS = - Lists.newArrayList( - EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, - EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE, - EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE, - EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG, - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG - ); - - private static final Logger LOG = LoggerFactory.getLogger(DelayedIndexingConverter.class); - private static final String AMPLIFY_CARD_NAME = "amplify"; - private static final String PLAYER_CARD_NAME = "player"; - - private final EncodedFeatureBuilder featureBuilder = new EncodedFeatureBuilder(); - - private final Schema schema; - private final Decider decider; - - public DelayedIndexingConverter(Schema schema, Decider decider) { - this.schema = schema; - this.decider = decider; - } - - /** - * Converts the given message to two ThriftVersionedEvents instances: the first one is a feature - * update event for all link and card related flags, and the second one is the append event that - * might contain updates to all link and card related fields. - * - * We need to split the updates to fields and flags into two separate events because: - * - When a tweet is created, earlybirds get the "main" event, which does not have resolved URLs. - * - Then the earlybirds might get a feature update from the signal ingesters, marking the tweet - * as spam. - * - Then the ingesters resolve the URLs and send an update event. At this point, the ingesters - * need to send updates for link-related flags too (HAS_LINK_FLAG, etc.). And there are a few - * ways to do this: - * 1. Encode these flags into encoded_tweet_features and extended_encoded_tweet_features and - * add these fields to the update event. The problem is that earlybirds will then override - * the encoded_tweet_features ane extended_encoded_tweet_features fields in the index for - * this tweet, which will override the feature update the earlybirds got earlier, which - * means that a spammy tweet might no longer be marked as spam in the index. - * 2. Send updates only for the flags that might've been updated by this converter. Since - * ThriftIndexingEvent already has a map of field -> value, it seems like the natural place - * to add these updates to. However, earlybirds can correctly process flag updates only if - * they come in a feature update event (PARTIAL_UPDATE). So we need to send the field - * updates in an OUT_OF_ORDER_UPDATE event, and the flag updates in a PARTIAL_UPDATE event. - * - * We need to send the feature update event before the append event to avoid issues like the one - * in SEARCH-30919 where tweets were returned from the card name field index before the HAS_CARD - * feature was updated to true. - * - * @param message The TwitterMessage to convert. - * @param penguinVersions The Penguin versions for which ThriftIndexingEvents should be created. - * @return An out of order update event for all link- and card-related fields and a feature update - * event for all link- and card-related flags. - */ - public List convertMessageToOutOfOrderAppendAndFeatureUpdate( - TwitterMessage message, List penguinVersions) { - Preconditions.checkNotNull(message); - Preconditions.checkNotNull(penguinVersions); - - ThriftVersionedEvents featureUpdateVersionedEvents = new ThriftVersionedEvents(); - ThriftVersionedEvents outOfOrderAppendVersionedEvents = new ThriftVersionedEvents(); - ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot(); - - for (PenguinVersion penguinVersion : penguinVersions) { - ThriftDocumentWithEncodedTweetFeatures documentWithEncodedFeatures = - buildDocumentForPenguinVersion(schemaSnapshot, message, penguinVersion); - - ThriftIndexingEvent featureUpdateThriftIndexingEvent = new ThriftIndexingEvent(); - featureUpdateThriftIndexingEvent.setEventType(ThriftIndexingEventType.PARTIAL_UPDATE); - featureUpdateThriftIndexingEvent.setUid(message.getId()); - featureUpdateThriftIndexingEvent.setDocument( - buildFeatureUpdateDocument(documentWithEncodedFeatures.getEncodedFeatures())); - featureUpdateVersionedEvents.putToVersionedEvents( - penguinVersion.getByteValue(), featureUpdateThriftIndexingEvent); - - ThriftIndexingEvent outOfOrderAppendThriftIndexingEvent = new ThriftIndexingEvent(); - outOfOrderAppendThriftIndexingEvent.setDocument(documentWithEncodedFeatures.getDocument()); - outOfOrderAppendThriftIndexingEvent.setEventType(ThriftIndexingEventType.OUT_OF_ORDER_APPEND); - message.getFromUserTwitterId().ifPresent(outOfOrderAppendThriftIndexingEvent::setUid); - outOfOrderAppendThriftIndexingEvent.setSortId(message.getId()); - outOfOrderAppendVersionedEvents.putToVersionedEvents( - penguinVersion.getByteValue(), outOfOrderAppendThriftIndexingEvent); - } - - featureUpdateVersionedEvents.setId(message.getId()); - outOfOrderAppendVersionedEvents.setId(message.getId()); - - return Lists.newArrayList(featureUpdateVersionedEvents, outOfOrderAppendVersionedEvents); - } - - private ThriftDocument buildFeatureUpdateDocument(EarlybirdEncodedFeatures encodedFeatures) { - ThriftDocument document = new ThriftDocument(); - for (EarlybirdFieldConstants.EarlybirdFieldConstant flag : UPDATED_FLAGS) { - ThriftField field = new ThriftField(); - field.setFieldConfigId(flag.getFieldId()); - field.setFieldData(new ThriftFieldData().setIntValue(encodedFeatures.getFeatureValue(flag))); - document.addToFields(field); - } - return document; - } - - private ThriftDocumentWithEncodedTweetFeatures buildDocumentForPenguinVersion( - ImmutableSchemaInterface schemaSnapshot, - TwitterMessage message, - PenguinVersion penguinVersion) { - - EarlybirdEncodedFeatures encodedFeatures = featureBuilder.createTweetFeaturesFromTwitterMessage( - message, penguinVersion, schemaSnapshot).encodedFeatures; - - EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder( - encodedFeatures, - null, - new EarlybirdFieldConstants(), - schemaSnapshot); - - builder.setAddLatLonCSF(false); - builder.withID(message.getId()); - buildFieldsFromUrlInfo(builder, message, penguinVersion, encodedFeatures); - buildCardFields(builder, message, penguinVersion); - buildNamedEntityFields(builder, message); - builder.withTweetSignature(message.getTweetSignature(penguinVersion)); - - buildSpaceAdminAndTitleFields(builder, message, penguinVersion); - - builder.setAddEncodedTweetFeatures(false); - - return new ThriftDocumentWithEncodedTweetFeatures(builder.build(), encodedFeatures); - } - - public static void buildNamedEntityFields( - EarlybirdThriftDocumentBuilder builder, TwitterMessage message) { - for (NamedEntity namedEntity : message.getNamedEntities()) { - builder.withNamedEntity(namedEntity); - } - } - - private void buildFieldsFromUrlInfo( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - PenguinVersion penguinVersion, - EarlybirdEncodedFeatures encodedFeatures) { - // We need to update the RESOLVED_LINKS_TEXT_FIELD, since we might have new resolved URLs. - // Use the same logic as in EncodedFeatureBuilder.java. - TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion); - String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); - builder.withResolvedLinksText(resolvedUrlsText); - - buildURLFields(builder, message, encodedFeatures); - buildAnalyzedURLFields(builder, message, penguinVersion); - } - - private void buildAnalyzedURLFields( - EarlybirdThriftDocumentBuilder builder, TwitterMessage message, PenguinVersion penguinVersion - ) { - TOTAL_URLS.add(message.getExpandedUrls().size()); - if (DeciderUtil.isAvailableForRandomRecipient( - decider, - INDEX_URL_DESCRIPTION_AND_TITLE_DECIDER)) { - for (ThriftExpandedUrl expandedUrl : message.getExpandedUrls()) { - /* - Consumer Media URLs are added to the expanded URLs in - TweetEventParserHelper.addMediaEntitiesToMessage. These Twitter.com media URLs contain - the tweet text as the description and the title is " on Twitter". This is - redundant information at best and misleading at worst. We will ignore these URLs to avoid - polluting the url_description and url_title field as well as saving space. - */ - if (!expandedUrl.isSetConsumerMedia() || !expandedUrl.isConsumerMedia()) { - NON_MEDIA_URLS_ON_TWEETS.increment(); - if (expandedUrl.isSetDescription()) { - buildTweetTokenizerTokenizedField(builder, - EarlybirdFieldConstants.EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(), - expandedUrl.getDescription(), - penguinVersion); - } - if (expandedUrl.isSetTitle()) { - buildTweetTokenizerTokenizedField(builder, - EarlybirdFieldConstants.EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(), - expandedUrl.getTitle(), - penguinVersion); - } - } else { - MEDIA_URLS_ON_TWEETS.increment(); - } - } - } - } - - /** - * Build the URL based fields from a tweet. - */ - public static void buildURLFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - EarlybirdEncodedFeatures encodedFeatures - ) { - Map expandedUrlMap = message.getExpandedUrlMap(); - - for (ThriftExpandedUrl expandedUrl : expandedUrlMap.values()) { - if (expandedUrl.getMediaType() == MediaTypes.NATIVE_IMAGE) { - EncodedFeatureBuilder.addPhotoUrl(message, expandedUrl.getCanonicalLastHopUrl()); - } - } - - // now add all twitter photos links that came with the tweet's payload - Map photos = message.getPhotoUrls(); - List photoURLs = new ArrayList<>(); - if (photos != null) { - for (Map.Entry entry : photos.entrySet()) { - TwitterPhotoUrl photo = new TwitterPhotoUrl(entry.getKey()); - String mediaUrl = entry.getValue(); - if (mediaUrl != null) { - photo.setMediaUrl(mediaUrl); - } - photoURLs.add(photo); - } - } - - try { - builder - .withURLs(Lists.newArrayList(expandedUrlMap.values())) - .withTwimgURLs(photoURLs); - } catch (IOException ioe) { - LOG.error("URL field creation threw an IOException", ioe); - } - - - if (encodedFeatures.isFlagSet( - EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) { - builder.withOffensiveFlag(); - } - if (encodedFeatures.isFlagSet( - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG)) { - builder.addFilterInternalFieldTerm( - EarlybirdFieldConstants.EarlybirdFieldConstant.CONSUMER_VIDEO_FILTER_TERM); - } - if (encodedFeatures.isFlagSet( - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG)) { - builder.addFilterInternalFieldTerm( - EarlybirdFieldConstants.EarlybirdFieldConstant.PRO_VIDEO_FILTER_TERM); - } - if (encodedFeatures.isFlagSet(EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG)) { - builder.addFilterInternalFieldTerm( - EarlybirdFieldConstants.EarlybirdFieldConstant.VINE_FILTER_TERM); - } - if (encodedFeatures.isFlagSet( - EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG)) { - builder.addFilterInternalFieldTerm( - EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_FILTER_TERM); - } - } - - /** - * Build the card information inside ThriftIndexingEvent's fields. - */ - static void buildCardFields(EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - PenguinVersion penguinVersion) { - if (message.hasCard()) { - SearchCard2 card = buildSearchCardFromTwitterMessage( - message, - TweetTokenStreamSerializer.getTweetTokenStreamSerializer(), - penguinVersion); - buildCardFeatures(message.getId(), builder, card); - } - } - - private static SearchCard2 buildSearchCardFromTwitterMessage( - TwitterMessage message, - TokenStreamSerializer streamSerializer, - PenguinVersion penguinVersion) { - SearchCard2 card = new SearchCard2(); - card.setCardName(message.getCardName()); - if (message.getCardDomain() != null) { - card.setCardDomain(message.getCardDomain()); - } - if (message.getCardLang() != null) { - card.setCardLang(message.getCardLang()); - } - if (message.getCardUrl() != null) { - card.setCardUrl(message.getCardUrl()); - } - - if (message.getCardTitle() != null && !message.getCardTitle().isEmpty()) { - String normalizedTitle = NormalizerHelper.normalize( - message.getCardTitle(), message.getLocale(), penguinVersion); - TokenizerResult result = TokenizerHelper.tokenizeTweet( - normalizedTitle, message.getLocale(), penguinVersion); - TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream(); - tokenSeqStream.reset(result.tokenSequence); - try { - card.setCardTitleTokenStream(streamSerializer.serialize(tokenSeqStream)); - card.setCardTitleTokenStreamText(result.tokenSequence.toString()); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize card title: " - + result.tokenSequence); - card.unsetCardTitleTokenStream(); - card.unsetCardTitleTokenStreamText(); - } - } - if (message.getCardDescription() != null && !message.getCardDescription().isEmpty()) { - String normalizedDesc = NormalizerHelper.normalize( - message.getCardDescription(), message.getLocale(), penguinVersion); - TokenizerResult result = TokenizerHelper.tokenizeTweet( - normalizedDesc, message.getLocale(), penguinVersion); - TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream(); - tokenSeqStream.reset(result.tokenSequence); - try { - card.setCardDescriptionTokenStream(streamSerializer.serialize(tokenSeqStream)); - card.setCardDescriptionTokenStreamText(result.tokenSequence.toString()); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize card description: " - + result.tokenSequence); - card.unsetCardDescriptionTokenStream(); - card.unsetCardDescriptionTokenStreamText(); - } - } - - return card; - } - - /** - * Builds card features. - */ - private static void buildCardFeatures( - long tweetId, EarlybirdThriftDocumentBuilder builder, SearchCard2 card) { - if (card == null) { - return; - } - builder - .withTokenStreamField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), - card.getCardTitleTokenStreamText(), - card.isSetCardTitleTokenStream() ? card.getCardTitleTokenStream() : null) - .withTokenStreamField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), - card.getCardDescriptionTokenStreamText(), - card.isSetCardDescriptionTokenStream() ? card.getCardDescriptionTokenStream() : null) - .withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), - card.getCardName()) - .withIntField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(), - SearchCardType.cardTypeFromStringName(card.getCardName()).getByteValue()); - - if (card.getCardLang() != null) { - builder.withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG.getFieldName(), - card.getCardLang()).withIntField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(), - ThriftLanguageUtil.getThriftLanguageOf(card.getCardLang()).getValue()); - } - if (card.getCardDomain() != null) { - builder.withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(), - card.getCardDomain()); - } - if (card.getCardUrl() != null) { - NUM_TWEETS_WITH_CARD_URL.increment(); - if (card.getCardUrl().startsWith("card://")) { - String suffix = card.getCardUrl().replace("card://", ""); - if (StringUtils.isNumeric(suffix)) { - NUM_TWEETS_WITH_NUMERIC_CARD_URI.increment(); - builder.withLongField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(), - Long.parseLong(suffix)); - LOG.debug(String.format( - "Good card URL for tweet %s: %s", - tweetId, - card.getCardUrl())); - } else { - NUM_TWEETS_WITH_INVALID_CARD_URI.increment(); - LOG.debug(String.format( - "Card URL starts with \"card://\" but followed by non-numeric for tweet %s: %s", - tweetId, - card.getCardUrl())); - } - } - } - if (isCardVideo(card)) { - // Add into "internal" field so that this tweet is returned by filter:videos. - builder.addFacetSkipList( - EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName()); - } - } - - /** - * Determines if a card is a video. - */ - private static boolean isCardVideo(@Nullable SearchCard2 card) { - if (card == null) { - return false; - } - return AMPLIFY_CARD_NAME.equalsIgnoreCase(card.getCardName()) - || PLAYER_CARD_NAME.equalsIgnoreCase(card.getCardName()); - } - - private void buildSpaceAdminAndTitleFields( - EarlybirdThriftDocumentBuilder builder, - TwitterMessage message, - PenguinVersion penguinVersion) { - - buildSpaceAdminFields(builder, message.getSpaceAdmins(), penguinVersion); - - // build the space title field. - buildTweetTokenizerTokenizedField( - builder, - EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(), - message.getSpaceTitle(), - penguinVersion); - } - - private void buildSpaceAdminFields( - EarlybirdThriftDocumentBuilder builder, - Set spaceAdmins, - PenguinVersion penguinVersion) { - - for (TwitterMessageUser spaceAdmin : spaceAdmins) { - if (spaceAdmin.getScreenName().isPresent()) { - // build screen name (aka handle) fields. - String screenName = spaceAdmin.getScreenName().get(); - String normalizedScreenName = - NormalizerHelper.normalizeWithUnknownLocale(screenName, penguinVersion); - - builder.withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(), - normalizedScreenName); - builder.withWhiteSpaceTokenizedScreenNameField( - EarlybirdFieldConstants - .EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), - normalizedScreenName); - - if (spaceAdmin.getTokenizedScreenName().isPresent()) { - builder.withCamelCaseTokenizedScreenNameField( - EarlybirdFieldConstants - .EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), - screenName, - normalizedScreenName, - spaceAdmin.getTokenizedScreenName().get()); - } - } - - if (spaceAdmin.getDisplayName().isPresent()) { - buildTweetTokenizerTokenizedField( - builder, - EarlybirdFieldConstants - .EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(), - spaceAdmin.getDisplayName().get(), - penguinVersion); - } - } - } - - private void buildTweetTokenizerTokenizedField( - EarlybirdThriftDocumentBuilder builder, - String fieldName, - String text, - PenguinVersion penguinVersion) { - - if (StringUtils.isNotEmpty(text)) { - Locale locale = LanguageIdentifierHelper - .identifyLanguage(text); - String normalizedText = NormalizerHelper.normalize( - text, locale, penguinVersion); - TokenizerResult result = TokenizerHelper - .tokenizeTweet(normalizedText, locale, penguinVersion); - TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream(); - tokenSeqStream.reset(result.tokenSequence); - TokenStreamSerializer streamSerializer = - TweetTokenStreamSerializer.getTweetTokenStreamSerializer(); - try { - builder.withTokenStreamField( - fieldName, - result.tokenSequence.toString(), - streamSerializer.serialize(tokenSeqStream)); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " + text); - } - } - } -} diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx new file mode 100644 index 000000000..e5e13f68f Binary files /dev/null and b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.docx differ diff --git a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java b/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java deleted file mode 100644 index c5d6b1c76..000000000 --- a/src/java/com/twitter/search/common/converter/earlybird/EncodedFeatureBuilder.java +++ /dev/null @@ -1,531 +0,0 @@ -package com.twitter.search.common.converter.earlybird; - -import java.io.IOException; -import java.util.HashSet; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import java.util.stream.Collectors; - -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.token.TokenizedCharSequence; -import com.twitter.common.text.token.TokenizedCharSequenceStream; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.Place; -import com.twitter.search.common.indexing.thriftjava.PotentialLocation; -import com.twitter.search.common.indexing.thriftjava.ProfileGeoEnrichment; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.relevance.entities.PotentialLocationObject; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.FeatureSink; -import com.twitter.search.common.relevance.features.MutableFeatureNormalizers; -import com.twitter.search.common.relevance.features.RelevanceSignalConstants; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.relevance.features.TweetTextQuality; -import com.twitter.search.common.relevance.features.TweetUserFeatures; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.util.lang.ThriftLanguageUtil; -import com.twitter.search.common.util.text.LanguageIdentifierHelper; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.common.util.text.SourceNormalizer; -import com.twitter.search.common.util.text.TokenizerHelper; -import com.twitter.search.common.util.text.TokenizerResult; -import com.twitter.search.common.util.text.TweetTokenStreamSerializer; -import com.twitter.search.common.util.url.LinkVisibilityUtils; -import com.twitter.search.common.util.url.NativeVideoClassificationUtils; -import com.twitter.search.ingester.model.VisibleTokenRatioUtil; - -/** - * EncodedFeatureBuilder helps to build encoded features for TwitterMessage. - * - * This is stateful so should only be used one tweet at a time - */ -public class EncodedFeatureBuilder { - private static final Logger LOG = LoggerFactory.getLogger(EncodedFeatureBuilder.class); - - private static final SearchCounter NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL = - SearchCounter.export("tweets_with_invalid_tweet_id_in_photo_url"); - - // TwitterTokenStream for converting TokenizedCharSequence into a stream for serialization - // This is stateful so should only be used one tweet at a time - private final TokenizedCharSequenceStream tokenSeqStream = new TokenizedCharSequenceStream(); - - // SUPPRESS CHECKSTYLE:OFF LineLength - private static final Pattern TWITTER_PHOTO_PERMA_LINK_PATTERN = - Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:\\?[^#]+)?(?:#!?\\/?)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)"); - - private static final Pattern TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN = - Pattern.compile("(?i:^(?:(?:https?\\:\\/\\/)?(?:www\\.)?)?twitter\\.com\\/(?:#!?\\/)?\\w{1,20}\\/status\\/(\\d+)\\/photo\\/\\d*$)"); - // SUPPRESS CHECKSTYLE:ON LineLength - - private static final VisibleTokenRatioUtil VISIBLE_TOKEN_RATIO = new VisibleTokenRatioUtil(); - - private static final Map SERIALIZE_FAILURE_COUNTERS_MAP = - Maps.newEnumMap(PenguinVersion.class); - static { - for (PenguinVersion penguinVersion : PenguinVersion.values()) { - SERIALIZE_FAILURE_COUNTERS_MAP.put( - penguinVersion, - SearchCounter.export( - "tokenstream_serialization_failure_" + penguinVersion.name().toLowerCase())); - } - } - - public static class TweetFeatureWithEncodeFeatures { - public final VersionedTweetFeatures versionedFeatures; - public final EarlybirdEncodedFeatures encodedFeatures; - public final EarlybirdEncodedFeatures extendedEncodedFeatures; - - public TweetFeatureWithEncodeFeatures( - VersionedTweetFeatures versionedFeatures, - EarlybirdEncodedFeatures encodedFeatures, - EarlybirdEncodedFeatures extendedEncodedFeatures) { - this.versionedFeatures = versionedFeatures; - this.encodedFeatures = encodedFeatures; - this.extendedEncodedFeatures = extendedEncodedFeatures; - } - } - - /** - * Create tweet text features and the encoded features. - * - * @param message the tweet message - * @param penguinVersion the based penguin version to create the features - * @param schemaSnapshot the schema associated with the features - * @return the text features and the encoded features - */ - public TweetFeatureWithEncodeFeatures createTweetFeaturesFromTwitterMessage( - TwitterMessage message, - PenguinVersion penguinVersion, - ImmutableSchemaInterface schemaSnapshot) { - VersionedTweetFeatures versionedTweetFeatures = new VersionedTweetFeatures(); - - // Write extendedPackedFeatures. - EarlybirdEncodedFeatures extendedEncodedFeatures = - createExtendedEncodedFeaturesFromTwitterMessage(message, penguinVersion, schemaSnapshot); - if (extendedEncodedFeatures != null) { - extendedEncodedFeatures - .writeExtendedFeaturesToVersionedTweetFeatures(versionedTweetFeatures); - } - - setSourceAndNormalizedSource( - message.getStrippedSource(), versionedTweetFeatures, penguinVersion); - - TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion); - - /////////////////////////////// - // Add hashtags and mentions - textFeatures.getHashtags().forEach(versionedTweetFeatures::addToHashtags); - textFeatures.getMentions().forEach(versionedTweetFeatures::addToMentions); - - /////////////////////////////// - // Extract some extra information from the message text. - // Index stock symbols with $ prepended - textFeatures.getStocks().stream() - .filter(stock -> stock != null) - .forEach(stock -> versionedTweetFeatures.addToStocks(stock.toLowerCase())); - - // Question marks - versionedTweetFeatures.setHasQuestionMark(textFeatures.hasQuestionMark()); - // Smileys - versionedTweetFeatures.setHasPositiveSmiley(textFeatures.hasPositiveSmiley()); - versionedTweetFeatures.setHasNegativeSmiley(textFeatures.hasNegativeSmiley()); - - TokenStreamSerializer streamSerializer = - TweetTokenStreamSerializer.getTweetTokenStreamSerializer(); - TokenizedCharSequence tokenSeq = textFeatures.getTokenSequence(); - tokenSeqStream.reset(tokenSeq); - int tokenPercent = VISIBLE_TOKEN_RATIO.extractAndNormalizeTokenPercentage(tokenSeqStream); - tokenSeqStream.reset(tokenSeq); - - // Write packedFeatures. - EarlybirdEncodedFeatures encodedFeatures = createEncodedFeaturesFromTwitterMessage( - message, penguinVersion, schemaSnapshot, tokenPercent); - encodedFeatures.writeFeaturesToVersionedTweetFeatures(versionedTweetFeatures); - - try { - versionedTweetFeatures.setTweetTokenStream(streamSerializer.serialize(tokenSeqStream)); - versionedTweetFeatures.setTweetTokenStreamText(tokenSeq.toString()); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " - + tokenSeq.toString()); - SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); - versionedTweetFeatures.unsetTweetTokenStream(); - versionedTweetFeatures.unsetTweetTokenStreamText(); - } - - // User name features - if (message.getFromUserDisplayName().isPresent()) { - Locale locale = LanguageIdentifierHelper - .identifyLanguage(message.getFromUserDisplayName().get()); - String normalizedDisplayName = NormalizerHelper.normalize( - message.getFromUserDisplayName().get(), locale, penguinVersion); - TokenizerResult result = TokenizerHelper - .tokenizeTweet(normalizedDisplayName, locale, penguinVersion); - tokenSeqStream.reset(result.tokenSequence); - try { - versionedTweetFeatures.setUserDisplayNameTokenStream( - streamSerializer.serialize(tokenSeqStream)); - versionedTweetFeatures.setUserDisplayNameTokenStreamText(result.tokenSequence.toString()); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! Could not serialize: " - + message.getFromUserDisplayName().get()); - SERIALIZE_FAILURE_COUNTERS_MAP.get(penguinVersion).increment(); - versionedTweetFeatures.unsetUserDisplayNameTokenStream(); - versionedTweetFeatures.unsetUserDisplayNameTokenStreamText(); - } - } - - String resolvedUrlsText = Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); - versionedTweetFeatures.setNormalizedResolvedUrlText(resolvedUrlsText); - - addPlace(message, versionedTweetFeatures, penguinVersion); - addProfileGeoEnrichment(message, versionedTweetFeatures, penguinVersion); - - versionedTweetFeatures.setTweetSignature(message.getTweetSignature(penguinVersion)); - - return new TweetFeatureWithEncodeFeatures( - versionedTweetFeatures, encodedFeatures, extendedEncodedFeatures); - } - - - protected static void setSourceAndNormalizedSource( - String strippedSource, - VersionedTweetFeatures versionedTweetFeatures, - PenguinVersion penguinVersion) { - - if (strippedSource != null && !strippedSource.isEmpty()) { - // normalize source for searchable field - replaces whitespace with underscores (???). - versionedTweetFeatures.setNormalizedSource( - SourceNormalizer.normalize(strippedSource, penguinVersion)); - - // source facet has simpler normalization. - Locale locale = LanguageIdentifierHelper.identifyLanguage(strippedSource); - versionedTweetFeatures.setSource(NormalizerHelper.normalizeKeepCase( - strippedSource, locale, penguinVersion)); - } - } - - /** - * Adds the given photo url to the thrift status if it is a twitter photo permalink. - * Returns true, if this was indeed a twitter photo, false otherwise. - */ - public static boolean addPhotoUrl(TwitterMessage message, String photoPermalink) { - Matcher matcher = TWITTER_PHOTO_COPY_PASTE_LINK_PATTERN.matcher(photoPermalink); - if (!matcher.matches() || matcher.groupCount() < 1) { - matcher = TWITTER_PHOTO_PERMA_LINK_PATTERN.matcher(photoPermalink); - } - - if (matcher.matches() && matcher.groupCount() == 1) { - // this is a native photo url which we need to store in a separate field - String idStr = matcher.group(1); - if (idStr != null) { - // idStr should be a valid tweet ID (and therefore, should fit into a Long), but we have - // tweets for which idStr is a long sequence of digits that does not fit into a Long. - try { - long photoStatusId = Long.parseLong(idStr); - message.addPhotoUrl(photoStatusId, null); - } catch (NumberFormatException e) { - LOG.warn("Found a tweet with a photo URL with an invalid tweet ID: " + message); - NUM_TWEETS_WITH_INVALID_TWEET_ID_IN_PHOTO_URL.increment(); - } - } - return true; - } - return false; - } - - private void addPlace(TwitterMessage message, - VersionedTweetFeatures versionedTweetFeatures, - PenguinVersion penguinVersion) { - String placeId = message.getPlaceId(); - if (placeId == null) { - return; - } - - // Tweet.Place.id and Tweet.Place.full_name are both required fields. - String placeFullName = message.getPlaceFullName(); - Preconditions.checkNotNull(placeFullName, "Tweet.Place without full_name."); - - Locale placeFullNameLocale = LanguageIdentifierHelper.identifyLanguage(placeFullName); - String normalizedPlaceFullName = - NormalizerHelper.normalize(placeFullName, placeFullNameLocale, penguinVersion); - String tokenizedPlaceFullName = StringUtils.join( - TokenizerHelper.tokenizeQuery(normalizedPlaceFullName, placeFullNameLocale, penguinVersion), - " "); - - Place place = new Place(placeId, tokenizedPlaceFullName); - String placeCountryCode = message.getPlaceCountryCode(); - if (placeCountryCode != null) { - Locale placeCountryCodeLocale = LanguageIdentifierHelper.identifyLanguage(placeCountryCode); - place.setCountryCode( - NormalizerHelper.normalize(placeCountryCode, placeCountryCodeLocale, penguinVersion)); - } - - versionedTweetFeatures.setTokenizedPlace(place); - } - - private void addProfileGeoEnrichment(TwitterMessage message, - VersionedTweetFeatures versionedTweetFeatures, - PenguinVersion penguinVersion) { - List potentialLocations = message.getPotentialLocations(); - if (potentialLocations.isEmpty()) { - return; - } - - List thriftPotentialLocations = Lists.newArrayList(); - for (PotentialLocationObject potentialLocation : potentialLocations) { - thriftPotentialLocations.add(potentialLocation.toThriftPotentialLocation(penguinVersion)); - } - versionedTweetFeatures.setTokenizedProfileGeoEnrichment( - new ProfileGeoEnrichment(thriftPotentialLocations)); - } - - /** Returns the encoded features. */ - public static EarlybirdEncodedFeatures createEncodedFeaturesFromTwitterMessage( - TwitterMessage message, - PenguinVersion penguinVersion, - ImmutableSchemaInterface schema, - int normalizedTokenPercentBucket) { - FeatureSink sink = new FeatureSink(schema); - - // Static features - sink.setBooleanValue(EarlybirdFieldConstant.IS_RETWEET_FLAG, message.isRetweet()) - .setBooleanValue(EarlybirdFieldConstant.IS_REPLY_FLAG, message.isReply()) - .setBooleanValue( - EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG, message.isUserVerified()) - .setBooleanValue( - EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG, message.isUserBlueVerified()) - .setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, message.isSensitiveContent()); - - TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion); - if (textFeatures != null) { - final FeatureConfiguration featureConfigNumHashtags = schema.getFeatureConfigurationByName( - EarlybirdFieldConstant.NUM_HASHTAGS.getFieldName()); - final FeatureConfiguration featureConfigNumMentions = schema.getFeatureConfigurationByName( - EarlybirdFieldConstant.NUM_MENTIONS.getFieldName()); - - sink.setNumericValue( - EarlybirdFieldConstant.NUM_HASHTAGS, - Math.min(textFeatures.getHashtagsSize(), featureConfigNumHashtags.getMaxValue())) - .setNumericValue( - EarlybirdFieldConstant.NUM_MENTIONS, - Math.min(textFeatures.getMentionsSize(), featureConfigNumMentions.getMaxValue())) - .setBooleanValue( - EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG, - TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures)) - .setBooleanValue( - EarlybirdFieldConstant.HAS_TREND_FLAG, - textFeatures.getTrendingTermsSize() > 0); - } - - TweetTextQuality textQuality = message.getTweetTextQuality(penguinVersion); - if (textQuality != null) { - sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE, textQuality.getTextScore()); - sink.setBooleanValue( - EarlybirdFieldConstant.IS_OFFENSIVE_FLAG, - textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE) - || textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE_USER) - // Note: if json message "possibly_sensitive" flag is set, we consider the tweet - // sensitive and is currently filtered out in safe search mode via a hacky setup: - // earlybird does not create _filter_sensitive_content field, only - // _is_offensive field is created, and used in filter:safe operator - || textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE)); - if (textQuality.hasBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE)) { - sink.setBooleanValue(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT, true); - } - } else { - // we don't have text score, for whatever reason, set to sentinel value so we won't be - // skipped by scoring function - sink.setNumericValue(EarlybirdFieldConstant.TEXT_SCORE, - RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL); - } - - if (message.isSetLocale()) { - sink.setNumericValue(EarlybirdFieldConstant.LANGUAGE, - ThriftLanguageUtil.getThriftLanguageOf(message.getLocale()).getValue()); - } - - // User features - TweetUserFeatures userFeatures = message.getTweetUserFeatures(penguinVersion); - if (userFeatures != null) { - sink.setBooleanValue(EarlybirdFieldConstant.IS_USER_SPAM_FLAG, userFeatures.isSpam()) - .setBooleanValue(EarlybirdFieldConstant.IS_USER_NSFW_FLAG, userFeatures.isNsfw()) - .setBooleanValue(EarlybirdFieldConstant.IS_USER_BOT_FLAG, userFeatures.isBot()); - } - if (message.getUserReputation() != TwitterMessage.DOUBLE_FIELD_NOT_PRESENT) { - sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION, - (byte) message.getUserReputation()); - } else { - sink.setNumericValue(EarlybirdFieldConstant.USER_REPUTATION, - RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL); - } - - sink.setBooleanValue(EarlybirdFieldConstant.IS_NULLCAST_FLAG, message.getNullcast()); - - // Realtime Ingestion does not write engagement features. Updater does that. - if (message.getNumFavorites() > 0) { - sink.setNumericValue(EarlybirdFieldConstant.FAVORITE_COUNT, - MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumFavorites())); - } - if (message.getNumRetweets() > 0) { - sink.setNumericValue(EarlybirdFieldConstant.RETWEET_COUNT, - MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumRetweets())); - } - if (message.getNumReplies() > 0) { - sink.setNumericValue(EarlybirdFieldConstant.REPLY_COUNT, - MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(message.getNumReplies())); - } - - sink.setNumericValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO, normalizedTokenPercentBucket); - - EarlybirdEncodedFeatures encodedFeatures = - (EarlybirdEncodedFeatures) sink.getFeaturesForBaseField( - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName()); - updateLinkEncodedFeatures(encodedFeatures, message); - return encodedFeatures; - } - - /** - * Returns the extended encoded features. - */ - public static EarlybirdEncodedFeatures createExtendedEncodedFeaturesFromTwitterMessage( - TwitterMessage message, - PenguinVersion penguinVersion, - ImmutableSchemaInterface schema) { - FeatureSink sink = new FeatureSink(schema); - - TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion); - - if (textFeatures != null) { - setExtendedEncodedFeatureIntValue(sink, schema, - EarlybirdFieldConstant.NUM_HASHTAGS_V2, textFeatures.getHashtagsSize()); - setExtendedEncodedFeatureIntValue(sink, schema, - EarlybirdFieldConstant.NUM_MENTIONS_V2, textFeatures.getMentionsSize()); - setExtendedEncodedFeatureIntValue(sink, schema, - EarlybirdFieldConstant.NUM_STOCKS, textFeatures.getStocksSize()); - } - - Optional referenceAuthorId = message.getReferenceAuthorId(); - if (referenceAuthorId.isPresent()) { - setEncodedReferenceAuthorId(sink, referenceAuthorId.get()); - } - - return (EarlybirdEncodedFeatures) sink.getFeaturesForBaseField( - EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName()); - } - - /** - * Updates all URL-related features, based on the values stored in the given message. - * - * @param encodedFeatures The features to be updated. - * @param message The message. - */ - public static void updateLinkEncodedFeatures( - EarlybirdEncodedFeatures encodedFeatures, TwitterMessage message) { - if (message.getLinkLocale() != null) { - encodedFeatures.setFeatureValue( - EarlybirdFieldConstant.LINK_LANGUAGE, - ThriftLanguageUtil.getThriftLanguageOf(message.getLinkLocale()).getValue()); - } - - if (message.hasCard()) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CARD_FLAG); - } - - // Set HAS_IMAGE HAS_NEWS HAS_VIDEO etc. flags for expanded urls. - if (message.getExpandedUrlMapSize() > 0) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG); - - for (ThriftExpandedUrl url : message.getExpandedUrlMap().values()) { - if (url.isSetMediaType()) { - switch (url.getMediaType()) { - case NATIVE_IMAGE: - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG); - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG); - break; - case IMAGE: - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG); - break; - case VIDEO: - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG); - break; - case NEWS: - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG); - break; - case UNKNOWN: - break; - default: - throw new IllegalStateException("Unexpected enum value: " + url.getMediaType()); - } - } - } - } - - Set canonicalLastHopUrlsStrings = message.getCanonicalLastHopUrls(); - Set expandedUrlsStrings = message.getExpandedUrls() - .stream() - .map(ThriftExpandedUrl::getExpandedUrl) - .collect(Collectors.toSet()); - Set expandedAndLastHopUrlsStrings = new HashSet<>(); - expandedAndLastHopUrlsStrings.addAll(expandedUrlsStrings); - expandedAndLastHopUrlsStrings.addAll(canonicalLastHopUrlsStrings); - // Check both expanded and last hop url for consumer videos as consumer video urls are - // sometimes redirected to the url of the tweets containing the videos (SEARCH-42612). - if (NativeVideoClassificationUtils.hasConsumerVideo(expandedAndLastHopUrlsStrings)) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG); - } - if (NativeVideoClassificationUtils.hasProVideo(canonicalLastHopUrlsStrings)) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG); - } - if (NativeVideoClassificationUtils.hasVine(canonicalLastHopUrlsStrings)) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VINE_FLAG); - } - if (NativeVideoClassificationUtils.hasPeriscope(canonicalLastHopUrlsStrings)) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG); - } - if (LinkVisibilityUtils.hasVisibleLink(message.getExpandedUrls())) { - encodedFeatures.setFlag(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG); - } - } - - private static void setExtendedEncodedFeatureIntValue( - FeatureSink sink, - ImmutableSchemaInterface schema, - EarlybirdFieldConstant field, - int value) { - boolean fieldInSchema = schema.hasField(field.getFieldName()); - if (fieldInSchema) { - FeatureConfiguration featureConfig = - schema.getFeatureConfigurationByName(field.getFieldName()); - sink.setNumericValue(field, Math.min(value, featureConfig.getMaxValue())); - } - } - - private static void setEncodedReferenceAuthorId(FeatureSink sink, long referenceAuthorId) { - LongIntConverter.IntegerRepresentation ints = - LongIntConverter.convertOneLongToTwoInt(referenceAuthorId); - sink.setNumericValue( - EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT, ints.leastSignificantInt); - sink.setNumericValue( - EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT, ints.mostSignificantInt); - } -} diff --git a/src/java/com/twitter/search/common/encoding/docvalues/BUILD b/src/java/com/twitter/search/common/encoding/docvalues/BUILD deleted file mode 100644 index bc4756173..000000000 --- a/src/java/com/twitter/search/common/encoding/docvalues/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -# Java library for docvalues and common stride field encoding utilities. -java_library( - sources = ["*.java"], - platform = "java8", - provides = artifact( - org = "com.twitter.search.common", - name = "encoding-docvalues", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/java/com/twitter/search/common/schema/base", - "src/thrift/com/twitter/search/common:schema-java", - ], -) diff --git a/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx b/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx new file mode 100644 index 000000000..effd5061e Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/docvalues/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx new file mode 100644 index 000000000..5f1fa4979 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.docx differ diff --git a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java b/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java deleted file mode 100644 index 1d6d2c0bb..000000000 --- a/src/java/com/twitter/search/common/encoding/docvalues/CSFTypeUtil.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.search.common.encoding.docvalues; - -public final class CSFTypeUtil { - private CSFTypeUtil() { - } - - /** - * Convert a long into a byte array, stored into dest. - */ - public static void convertToBytes(byte[] dest, int valueIndex, int value) { - int offset = valueIndex * Integer.BYTES; - dest[offset] = (byte) (value >>> 24); - dest[offset + 1] = (byte) (value >>> 16); - dest[offset + 2] = (byte) (value >>> 8); - dest[offset + 3] = (byte) value; - } - - /** - * Convert bytes into a long value. Inverse function of convertToBytes. - */ - public static int convertFromBytes(byte[] data, int startOffset, int valueIndex) { - // This should rarely happen, eg. when we get a corrupt ThriftIndexingEvent, we insert a new - // Document which is blank. Such a document results in a length 0 BytesRef. - if (data.length == 0) { - return 0; - } - - int offset = startOffset + valueIndex * Integer.BYTES; - return ((data[offset] & 0xFF) << 24) - | ((data[offset + 1] & 0xFF) << 16) - | ((data[offset + 2] & 0xFF) << 8) - | (data[offset + 3] & 0xFF); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/BUILD b/src/java/com/twitter/search/common/encoding/features/BUILD deleted file mode 100644 index 93b13c03f..000000000 --- a/src/java/com/twitter/search/common/encoding/features/BUILD +++ /dev/null @@ -1,17 +0,0 @@ -# Java library for feature encoding and decoding utilities. -java_library( - sources = ["*.java"], - platform = "java8", - provides = artifact( - org = "com.twitter.search.common", - name = "encoding-features", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/java/com/twitter/search/common/schema/base", - "src/thrift/com/twitter/search/common:indexing-java", - ], -) diff --git a/src/java/com/twitter/search/common/encoding/features/BUILD.docx b/src/java/com/twitter/search/common/encoding/features/BUILD.docx new file mode 100644 index 000000000..ca8bbf26f Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx new file mode 100644 index 000000000..9a27e4894 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java deleted file mode 100644 index 36abc323e..000000000 --- a/src/java/com/twitter/search/common/encoding/features/BinByteNormalizer.java +++ /dev/null @@ -1,73 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import java.util.Map; -import java.util.SortedSet; -import java.util.TreeMap; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -/** - * Normalizes values to predefined bins. - * If the value to normalize is lower than the lowest bin defined, normalizes to Byte.MIN_VALUE. - */ -public class BinByteNormalizer extends ByteNormalizer { - - private final TreeMap bins = Maps.newTreeMap(); - private final TreeMap reverseBins = Maps.newTreeMap(); - - /** - * Constructs a normalizer using predefined bins. - * @param bins A mapping between the upper bound of a value and the bin it should normalize to. - * For example providing a map with 2 entries, {5=>1, 10=>2} will normalize as follows: - * values under 5: Byte.MIN_VALUE - * values between 5 and 10: 1 - * values over 10: 2 - */ - public BinByteNormalizer(final Map bins) { - Preconditions.checkNotNull(bins); - Preconditions.checkArgument(!bins.isEmpty(), "No bins provided"); - Preconditions.checkArgument(hasIncreasingValues(bins)); - this.bins.putAll(bins); - for (Map.Entry entry : bins.entrySet()) { - reverseBins.put(entry.getValue(), entry.getKey()); - } - } - - /** - * check that if key1 > key2 then val1 > val2 in the {@code map}. - */ - private static boolean hasIncreasingValues(final Map map) { - SortedSet orderedKeys = Sets.newTreeSet(map.keySet()); - byte prev = Byte.MIN_VALUE; - for (Double key : orderedKeys) { // save the unboxing - byte cur = map.get(key); - if (cur <= prev) { - return false; - } - prev = cur; - } - return true; - } - - @Override - public byte normalize(double val) { - Map.Entry lowerBound = bins.floorEntry(val); - return lowerBound == null - ? Byte.MIN_VALUE - : lowerBound.getValue(); - } - - @Override - public double unnormLowerBound(byte norm) { - return reverseBins.get(reverseBins.floorKey(norm)); - } - - @Override - public double unnormUpperBound(byte norm) { - return norm == reverseBins.lastKey() - ? Double.POSITIVE_INFINITY - : reverseBins.get(reverseBins.floorKey((byte) (1 + norm))); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx new file mode 100644 index 000000000..21b2ef632 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java deleted file mode 100644 index 6a6845a12..000000000 --- a/src/java/com/twitter/search/common/encoding/features/ByteNormalizer.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.twitter.search.common.encoding.features; - -/** - * Interface for compressing unbounded float values to a signed byte. It includes both - * normalization of values and encoding of values in a byte. - */ -public abstract class ByteNormalizer { - public static byte intToUnsignedByte(int i) { - return (byte) i; - } - - public static int unsignedByteToInt(byte b) { - return (int) b & 0xFF; - } - - /** - * Returns the byte-compressed value of {@code val}. - */ - public abstract byte normalize(double val); - - /** - * Returns a lower bound to the unnormalized range of {@code norm}. - */ - public abstract double unnormLowerBound(byte norm); - - /** - * Returns an upper bound to the unnormalized range of {@code norm}. - */ - public abstract double unnormUpperBound(byte norm); - - /** - * Returns true if the normalized value of {@code val} is different than the normalized value of - * {@code val - 1} - */ - public boolean changedNorm(double val) { - return normalize(val) != normalize(val - 1); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx new file mode 100644 index 000000000..cd370fcff Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java deleted file mode 100644 index ec1d3faa9..000000000 --- a/src/java/com/twitter/search/common/encoding/features/ClampByteNormalizer.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import com.google.common.base.Preconditions; - -/** - * A byte normalizer that restricts the values to the given range before normalizing them. - */ -public class ClampByteNormalizer extends ByteNormalizer { - private final int minUnnormalizedValue; - private final int maxUnnormalizedValue; - - /** - * Creates a new ClampByteNormalizer instance. - * - * @param minValue The smallest allowed unnormalized value. - * @param maxValue The largest allowed unnormalized value. - */ - public ClampByteNormalizer(int minUnnormalizedValue, int maxUnnormalizedValue) { - Preconditions.checkState(minUnnormalizedValue <= maxUnnormalizedValue); - Preconditions.checkState(minUnnormalizedValue >= 0); - Preconditions.checkState(maxUnnormalizedValue <= 255); - this.minUnnormalizedValue = minUnnormalizedValue; - this.maxUnnormalizedValue = maxUnnormalizedValue; - } - - @Override - public byte normalize(double val) { - int adjustedValue = (int) val; - if (adjustedValue < minUnnormalizedValue) { - adjustedValue = minUnnormalizedValue; - } - if (adjustedValue > maxUnnormalizedValue) { - adjustedValue = maxUnnormalizedValue; - } - return ByteNormalizer.intToUnsignedByte(adjustedValue); - } - - @Override - public double unnormLowerBound(byte norm) { - return ByteNormalizer.unsignedByteToInt(norm); - } - - @Override - public double unnormUpperBound(byte norm) { - return ByteNormalizer.unsignedByteToInt(norm) + 1; - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx new file mode 100644 index 000000000..ff1c6fabf Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java b/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java deleted file mode 100644 index f6d9b16bb..000000000 --- a/src/java/com/twitter/search/common/encoding/features/EncodedFeatures.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.common.encoding.features; - -/** - * Encodes multiple values (bytes or bits) into an integer. - */ -public class EncodedFeatures { - private int value; - - public final void setSerializedValue(int val) { - this.value = val; - } - - public final int getSerializedValue() { - return value; - } - - // setByte is agnostic to signed / unsigned bytes. - protected final EncodedFeatures setByte(byte count, int bitshift, long inverseMask) { - value = (int) ((value & inverseMask) | ((count & 0xffL) << bitshift)); - return this; - } - - /** - * Sets the value but only if greater. setByteIfGreater assumes unsigned bytes. - */ - public final EncodedFeatures setByteIfGreater(byte newCount, int bitshift, long inversemask) { - if ((getByte(bitshift) & 0xff) < (newCount & 0xff)) { - setByte(newCount, bitshift, inversemask); - } - return this; - } - - protected final int getByte(int bitshift) { - return (int) (((value & 0xffffffffL) >>> bitshift) & 0xffL); - } - - protected final int getByteMasked(int bitshift, long mask) { - return (int) (((value & mask) >>> bitshift) & 0xffL); - } - - protected final EncodedFeatures setBit(int bit, boolean flag) { - if (flag) { - value |= bit; - } else { - value &= ~bit; - } - return this; - } - - protected final boolean getBit(int bit) { - return (value & bit) != 0; - } - - @Override - public String toString() { - return String.format("%x", value); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx new file mode 100644 index 000000000..da11c3a0f Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java b/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java deleted file mode 100644 index 0a2477e46..000000000 --- a/src/java/com/twitter/search/common/encoding/features/IntNormalizer.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.search.common.encoding.features; - -/** - * Interface for processing different feature values into an int. It provides a one-way translation - * of encoding using com.twitter.search.common.encoding.features.ByteNormalizer and supports all the - * old normalizers. The difference is that we directly return the normalized int value - * (instead of converting from byte). - */ -public interface IntNormalizer { - /** - * Returns the normalized value of {@code val}. - * The value may be byte-compressed or as-is depending on the normalizer type - */ - int normalize(double val); -} diff --git a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx new file mode 100644 index 000000000..e6a1e3107 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java b/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java deleted file mode 100644 index a86e079c3..000000000 --- a/src/java/com/twitter/search/common/encoding/features/IntegerEncodedFeatures.java +++ /dev/null @@ -1,159 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import java.util.List; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import com.twitter.search.common.indexing.thriftjava.PackedFeatures; -import com.twitter.search.common.schema.base.FeatureConfiguration; - -/** - * Class used to read/write integers encoded according to - * {@link com.twitter.search.common.schema.base.FeatureConfiguration} - * - * Implementations must override {@link #getInt(int pos)} and {@link #setInt(int pos, int value)}. - */ -public abstract class IntegerEncodedFeatures { - /** - * Returns the value at the given position. - */ - public abstract int getInt(int pos); - - /** - * Sets the given value at the given position. - */ - public abstract void setInt(int pos, int value); - - /** - * Get the maximum number of integers to hold features. - * @return the number of integers to represent all features. - */ - public abstract int getNumInts(); - - /** - * Test to see if the given feature is true or non-zero. Useful for one bit features. - * @param feature feature to examine - * @return true if feature is non-zero - */ - public boolean isFlagSet(FeatureConfiguration feature) { - return (getInt(feature.getValueIndex()) & feature.getBitMask()) != 0; - } - - public IntegerEncodedFeatures setFlag(FeatureConfiguration feature) { - setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) | feature.getBitMask()); - return this; - } - - public IntegerEncodedFeatures clearFlag(FeatureConfiguration feature) { - setInt(feature.getValueIndex(), getInt(feature.getValueIndex()) & feature.getInverseBitMask()); - return this; - } - - /** - * Sets a boolean flag. - */ - public IntegerEncodedFeatures setFlagValue(FeatureConfiguration feature, boolean value) { - if (value) { - setFlag(feature); - } else { - clearFlag(feature); - } - return this; - } - - /** - * Get feature value - * @param feature feature to get - * @return the value of the feature - */ - public int getFeatureValue(FeatureConfiguration feature) { - return (getInt(feature.getValueIndex()) & feature.getBitMask()) - >>> feature.getBitStartPosition(); - } - - /** - * Set feature value - * @param feature feature to modify - * @param value value to set. - */ - public IntegerEncodedFeatures setFeatureValue(FeatureConfiguration feature, int value) { - Preconditions.checkState( - value <= feature.getMaxValue(), - "Feature value, %s, is greater than the max value allowed for this feature. " - + "Feature: %s, Max value: %s", - value, feature.getName(), feature.getMaxValue()); - - // Clear the value of the given feature in its int. - int temp = getInt(feature.getValueIndex()) & feature.getInverseBitMask(); - - // Set the new feature value. Applying the bit mask here ensures that other features in the - // same int are not modified by mistake. - temp |= (value << feature.getBitStartPosition()) & feature.getBitMask(); - - setInt(feature.getValueIndex(), temp); - return this; - } - - /** - * Sets feature value if greater than current value - * @param feature feature to modify - * @param value new value - */ - public IntegerEncodedFeatures setFeatureValueIfGreater(FeatureConfiguration feature, int value) { - if (value > getFeatureValue(feature)) { - setFeatureValue(feature, value); - } - return this; - } - - /** - * Increment a feature if its not at its maximum value. - * @return whether the feature is incremented. - */ - public boolean incrementIfNotMaximum(FeatureConfiguration feature) { - int newValue = getFeatureValue(feature) + 1; - if (newValue <= feature.getMaxValue()) { - setFeatureValue(feature, newValue); - return true; - } else { - return false; - } - } - - /** - * Copy these encoded features to a new PackedFeatures thrift struct. - */ - public PackedFeatures copyToPackedFeatures() { - return copyToPackedFeatures(new PackedFeatures()); - } - - /** - * Copy these encoded features to a PackedFeatures thrift struct. - */ - public PackedFeatures copyToPackedFeatures(PackedFeatures packedFeatures) { - Preconditions.checkNotNull(packedFeatures); - final List integers = Lists.newArrayListWithCapacity(getNumInts()); - for (int i = 0; i < getNumInts(); i++) { - integers.add(getInt(i)); - } - packedFeatures.setDeprecated_featureConfigurationVersion(0); - packedFeatures.setFeatures(integers); - return packedFeatures; - } - - /** - * Copy features from a packed features struct. - */ - public void readFromPackedFeatures(PackedFeatures packedFeatures) { - Preconditions.checkNotNull(packedFeatures); - List ints = packedFeatures.getFeatures(); - for (int i = 0; i < getNumInts(); i++) { - if (i < ints.size()) { - setInt(i, ints.get(i)); - } else { - setInt(i, 0); - } - } - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx new file mode 100644 index 000000000..58f7c5c54 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java b/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java deleted file mode 100644 index 0124d0be3..000000000 --- a/src/java/com/twitter/search/common/encoding/features/LogByteNormalizer.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import com.google.common.base.Preconditions; - -/** - * Normalizes values as follows: - * Positive numbers normalize to (1 + round(log_baseN(value))). - * Negative numbers throw. - * 0 will normalize to 0. - * The log base is 2 by default. - */ -public class LogByteNormalizer extends ByteNormalizer { - - private static final double DEFAULT_BASE = 2; - private final double base; - private final double logBase; - - public LogByteNormalizer(double base) { - Preconditions.checkArgument(base > 0); - this.base = base; - logBase = Math.log(base); - } - - public LogByteNormalizer() { - this(DEFAULT_BASE); - } - - @Override - public byte normalize(double val) { - if (val < 0) { - throw new IllegalArgumentException("Can't log-normalize negative value " + val); - } else if (val == 0) { - return 0; - } else { - long logVal = 1 + (long) Math.floor(Math.log(val) / logBase); - return logVal > Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte) logVal; - } - } - - @Override - public double unnormLowerBound(byte norm) { - return norm < 0 - ? Double.NEGATIVE_INFINITY - : Math.floor(Math.pow(base, norm - 1)); - } - - @Override - public double unnormUpperBound(byte norm) { - return norm == Byte.MAX_VALUE - ? Double.POSITIVE_INFINITY - : Math.floor(Math.pow(base, norm)); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx new file mode 100644 index 000000000..bdac26d34 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java b/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java deleted file mode 100644 index e02519f08..000000000 --- a/src/java/com/twitter/search/common/encoding/features/PredictionScoreNormalizer.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import com.google.common.base.Preconditions; - -/** - * A normalizer that normalizes the prediction score from a machine learning classifier, which - * ranges within [0.0, 1.0], to an integer value by multiplying by (10 ^ precision), and returns - * the rounded value. The lower the precision, the less amount of bits it takes to encode the score. - * @see #precision - * - * This normalizer also could denormalize the normalized value from integer back to double using the - * same precision. - */ -public class PredictionScoreNormalizer { - - private final int precision; - private final double normalizingBase; - - public PredictionScoreNormalizer(int precision) { - this.precision = precision; - this.normalizingBase = Math.pow(10, this.precision); - } - - /** - * Returns the normalized int value for prediction score {@code score} by multiplying - * by {@code normalizingBase}, and round the result. - * @throws IllegalArgumentException when parameter {@code score} is not within [0.0, 1.0] - */ - public int normalize(double score) { - Preconditions.checkArgument(isScoreWithinRange(score)); - return (int) Math.round(score * this.normalizingBase); - } - - /** - * Converts the normalized int value back to a double score by dividing by {@code normalizingBase} - * @throws IllegalStateException when the denormalized value is not within [0.0, 1.0] - */ - public double denormalize(int normalizedScore) { - double denormalizedValue = normalizedScore / this.normalizingBase; - if (!isScoreWithinRange(denormalizedValue)) { - throw new IllegalStateException( - String.format("The denormalized value %s is not within [0.0, 1.0]", denormalizedValue) - ); - } - return denormalizedValue; - } - - private static boolean isScoreWithinRange(double score) { - return 0.0 <= score && score <= 1.0; - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx new file mode 100644 index 000000000..2992c9239 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java deleted file mode 100644 index 32acc5048..000000000 --- a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatNormalizer.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.common.encoding.features; - -/** - * Normalizes using the logic described in {@link SingleBytePositiveFloatUtil}. - */ -public class SingleBytePositiveFloatNormalizer extends ByteNormalizer { - - @Override - public byte normalize(double val) { - return SingleBytePositiveFloatUtil.toSingleBytePositiveFloat((float) val); - } - - @Override - public double unnormLowerBound(byte norm) { - return SingleBytePositiveFloatUtil.toJavaFloat(norm); - } - - /** - * Get the upper bound of the raw value for a normalized byte. - * @deprecated This is wrongly implemented, always use unnormLowerBound(), - * or use SmartIntegerNormalizer. - */ - @Override @Deprecated - public double unnormUpperBound(byte norm) { - return 1 + SingleBytePositiveFloatUtil.toJavaFloat(norm); - } - - /** - * Return the the post-log2 unnormalized value. This is only used for some legacy Earlybird - * features and scoring functions. - */ - public double unnormAndLog2(byte norm) { - return SingleBytePositiveFloatUtil.toLog2Double(norm); - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx new file mode 100644 index 000000000..4cf5adb92 Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java b/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java deleted file mode 100644 index 2894241e8..000000000 --- a/src/java/com/twitter/search/common/encoding/features/SingleBytePositiveFloatUtil.java +++ /dev/null @@ -1,164 +0,0 @@ -package com.twitter.search.common.encoding.features; - -/** - * Util used to: - * - Encode a positive Java float into a single byte float - * - Decode a single byte into a positive Java float - * - * Configuration: - * - Exponent: higher 4 bits, base 10. - * - Mantissa: lower 4 bit, representing 1.0 to 9.0 - * - Exponent bias is 1. - * - * Formula: - * Max(Mantissa, 9) * 10 ^ (Exponent - 1) - * - * Smallest float: 0.0 (0000 0000) - * Smallest positive float: 1.0 * 10^-1 (0000 0001) - * Largest float: 9.0 * 10^13 (1110 1111) - * Infinity: (1111 0000) - * NaN: (1111 1000) - */ -public final class SingleBytePositiveFloatUtil { - private SingleBytePositiveFloatUtil() { } - - // 4 bits mantissa. Range [1.0, 10.0) is divided into 16 steps - public static final byte MAX_BYTE_VALUE = (byte) 0xEF; - public static final byte INFINITY = (byte) 0xF0; - public static final byte NOT_A_NUMBER = (byte) 0xF8; - private static final float STEP_SIZE = 1.0f; - private static final int EXPONENT_BIAS = 1; - private static final byte MIN_EXPONENT = -EXPONENT_BIAS; - private static final int MAX_EXPONENT = 14 - EXPONENT_BIAS; - private static final byte MANTISSA_MASK = 0x0F; - - /** - * Converts the given float into a single byte floating point number. - * This is used in the updater and OK to be a bit slow. - */ - public static byte toSingleBytePositiveFloat(float f) { - if (f < 0) { - throw new UnsupportedOperationException( - "Cannot encode negative floats into SingleBytePostiveFloat."); - } - - if (Float.compare(f, Float.POSITIVE_INFINITY) == 0) { - return INFINITY; - } - - if (Float.compare(f, Float.NaN) == 0) { - return NOT_A_NUMBER; - } - - int mantissa = 0; - int exponent = (int) Math.floor(Math.log10(f)); - // Overflow (Number too large), just return the largest possible value - if (exponent > MAX_EXPONENT) { - return MAX_BYTE_VALUE; - } - - // Underflow (Number too small), just return 0 - if (exponent < MIN_EXPONENT) { - return 0; - } - - int frac = Math.round(f / (float) Math.pow(10.0f, exponent) / STEP_SIZE); - mantissa = fractionToMantissaTable[frac]; - - return (byte) (((exponent + EXPONENT_BIAS) << 4) | mantissa); - } - - /** - * Called in Earlybird per hit and needs to be fast. - */ - public static float toJavaFloat(byte b) { - return BYTE_TO_FLOAT_CONVERSION_TABLE[b & 0xff]; - } - - // Table used for converting mantissa into a significant - private static float[] mantissaToFractionTable = { - // Decimal Matisa value - STEP_SIZE * 0, // 0000 - STEP_SIZE * 1, // 0001 - STEP_SIZE * 1, // 0010 - STEP_SIZE * 2, // 0011 - STEP_SIZE * 2, // 0100 - STEP_SIZE * 3, // 0101 - STEP_SIZE * 3, // 0110 - STEP_SIZE * 4, // 0111 - STEP_SIZE * 4, // 1000 - STEP_SIZE * 5, // 1001 - STEP_SIZE * 5, // 1010 - STEP_SIZE * 6, // 1011 - STEP_SIZE * 6, // 1100 - STEP_SIZE * 7, // 1101 - STEP_SIZE * 8, // 1110 - STEP_SIZE * 9 // 1111 - }; - - // Table used for converting fraction into mantissa. - // Reverse operation of the above - private static int[] fractionToMantissaTable = { - 0, // 0 - 1, // 1 - 3, // 2 - 5, // 3 - 7, // 4 - 9, // 5 - 11, // 6 - 13, // 7 - 14, // 8 - 15, // 9 - 15, // 10 (Edge case: because we round the fraction, we can get 10 here.) - }; - - public static final byte LARGEST_FRACTION_UNDER_ONE = (byte) (toSingleBytePositiveFloat(1f) - 1); - - /** - * Converts the given byte to java float. - */ - private static float toJavaFloatSlow(byte b) { - if (b == INFINITY) { - return Float.POSITIVE_INFINITY; - } - - if ((b & 0xff) > (INFINITY & 0xff)) { - return Float.NaN; - } - - int exponent = ((b & 0xff) >>> 4) - EXPONENT_BIAS; - int mantissa = b & MANTISSA_MASK; - return mantissaToFractionTable[mantissa] * (float) Math.pow(10.0f, exponent); - } - - // Cached results from byte to float conversion - private static final float[] BYTE_TO_FLOAT_CONVERSION_TABLE = new float[256]; - private static final double[] BYTE_TO_LOG2_CONVERSION_TABLE = new double[256]; - private static final byte[] OLD_TO_NEW_BYTE_CONVERSION_TABLE = new byte[256]; - - static { - LogByteNormalizer normalizer = new LogByteNormalizer(); - for (int i = 0; i < 256; i++) { - byte b = (byte) i; - BYTE_TO_FLOAT_CONVERSION_TABLE[i] = toJavaFloatSlow(b); - BYTE_TO_LOG2_CONVERSION_TABLE[i] = - 0xff & normalizer.normalize(BYTE_TO_FLOAT_CONVERSION_TABLE[i]); - if (b == 0) { - OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = 0; - } else if (b > 0) { - OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = - toSingleBytePositiveFloat((float) normalizer.unnormLowerBound(b)); - } else { - // should not get here. - OLD_TO_NEW_BYTE_CONVERSION_TABLE[i] = MAX_BYTE_VALUE; - } - } - } - - /** - * Convert a normalized byte to the log2() version of its original value - */ - static double toLog2Double(byte b) { - return BYTE_TO_LOG2_CONVERSION_TABLE[b & 0xff]; - } -} diff --git a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx new file mode 100644 index 000000000..23d98fd0d Binary files /dev/null and b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java b/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java deleted file mode 100644 index f2655e294..000000000 --- a/src/java/com/twitter/search/common/encoding/features/SmartIntegerNormalizer.java +++ /dev/null @@ -1,150 +0,0 @@ -package com.twitter.search.common.encoding.features; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -/** - * A smart integer normalizer that converts an integer of a known range to a small integer up to - * 8 bits long. This normalizer generates a boundary value array in the constructor as the buckets - * for different values. - *

- * The normalized value has a nice properties: - * 1) it maintains the order of original value: if a > b, then normalize(a) > normalize(b). - * 2) the value 0 is always normalized to byte 0. - * 3) the normalized values are (almost) evenly distributed on the log scale - * 4) no waste in code space, all possible values representable by normalized bits are used, - * each corresponding to a different value. - */ -public class SmartIntegerNormalizer extends ByteNormalizer { - // The max value we want to support in this normalizer. If the input is larger than this value, - // it's normalized as if it's the maxValue. - private final int maxValue; - // Number of bits used for normalized value, the largest normalized value - // would be (1 << numBits) - 1. - private final int numBits; - // The inclusive lower bounds of all buckets. A normalized value k corresponds to original values - // in the inclusive-exclusive range - // [ boundaryValues[k], boundaryValues[k+1] ) - private final int[] boundaryValues; - // The length of the boundaryValues array, or the number of buckets. - private final int length; - - /** - * Construct a normalizer. - * - * @param maxValue max value it supports, must be larger than minValue. Anything larger than this - * would be treated as maxValue. - * @param numBits number of bits you want to use for this normalization, between 1 and 8. - * higher resolution for the lower numbers. - */ - public SmartIntegerNormalizer(int maxValue, int numBits) { - Preconditions.checkArgument(maxValue > 0); - Preconditions.checkArgument(numBits > 0 && numBits <= 8); - - this.maxValue = maxValue; - this.numBits = numBits; - - this.length = 1 << numBits; - this.boundaryValues = new int[length]; - - - int index; - for (index = length - 1; index >= 0; --index) { - // values are evenly distributed on the log scale - int boundary = (int) Math.pow(maxValue, (double) index / length); - // we have more byte slots left than we have possible boundary values (buckets), - // just give consecutive boundary values to all remaining slots, starting from 0. - if (boundary <= index) { - break; - } - boundaryValues[index] = boundary; - } - if (index >= 0) { - for (int i = 1; i <= index; ++i) { - boundaryValues[i] = i; - } - } - boundaryValues[0] = 0; // the first one is always 0. - } - - @Override - public byte normalize(double val) { - int intVal = (int) (val > maxValue ? maxValue : val); - return intToUnsignedByte(binarySearch(intVal, boundaryValues)); - } - - /** - * Return the lower bound of the bucket represent by norm. This simply returns the boundary - * value indexed by current norm. - */ - @Override - public double unnormLowerBound(byte norm) { - return boundaryValues[unsignedByteToInt(norm)]; - } - - /** - * Return the upper bound of the bucket represent by norm. This returns the next boundary value - * minus 1. If norm represents the last bucket, it returns the maxValue. - */ - @Override - public double unnormUpperBound(byte norm) { - // if it's already the last possible normalized value, just return the corresponding last - // boundary value. - int intNorm = unsignedByteToInt(norm); - if (intNorm == length - 1) { - return maxValue; - } - return boundaryValues[intNorm + 1] - 1; - } - - /** - * Do a binary search on array and find the index of the item that's no bigger than value. - */ - private static int binarySearch(int value, int[] array) { - // corner cases - if (value <= array[0]) { - return 0; - } else if (value >= array[array.length - 1]) { - return array.length - 1; - } - int left = 0; - int right = array.length - 1; - int pivot = (left + right) >> 1; - do { - int midVal = array[pivot]; - if (value == midVal) { - break; - } else if (value > midVal) { - left = pivot; - } else { - right = pivot; - } - pivot = (left + right) >> 1; - } while (pivot != left); - return pivot; - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(String.format( - "Smart Integer Normalizer (numBits = %d, max = %d)\n", - this.numBits, this.maxValue)); - for (int i = 0; i < this.length; i++) { - sb.append(String.format( - "[%2d] boundary = %6d, range [ %6d, %6d ), norm: %4d | %4d | %4d %s\n", - i, boundaryValues[i], - (int) unnormLowerBound(intToUnsignedByte(i)), - (int) unnormUpperBound(intToUnsignedByte(i)), - unsignedByteToInt(normalize(boundaryValues[i] - 1)), - unsignedByteToInt(normalize(boundaryValues[i])), - unsignedByteToInt(normalize(boundaryValues[i] + 1)), - i == boundaryValues[i] ? "*" : "")); - } - return sb.toString(); - } - - @VisibleForTesting - int[] getBoundaryValues() { - return boundaryValues; - } -} diff --git a/src/java/com/twitter/search/common/query/BUILD b/src/java/com/twitter/search/common/query/BUILD deleted file mode 100644 index 5c4cd6330..000000000 --- a/src/java/com/twitter/search/common/query/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/queryparser", - "src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/java/com/twitter/search/queryparser/query/search:search-query-nodes", - ], -) diff --git a/src/java/com/twitter/search/common/query/BUILD.docx b/src/java/com/twitter/search/common/query/BUILD.docx new file mode 100644 index 000000000..5ff203847 Binary files /dev/null and b/src/java/com/twitter/search/common/query/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/query/BoostUtils.docx b/src/java/com/twitter/search/common/query/BoostUtils.docx new file mode 100644 index 000000000..570a5b18b Binary files /dev/null and b/src/java/com/twitter/search/common/query/BoostUtils.docx differ diff --git a/src/java/com/twitter/search/common/query/BoostUtils.java b/src/java/com/twitter/search/common/query/BoostUtils.java deleted file mode 100644 index 10ae55942..000000000 --- a/src/java/com/twitter/search/common/query/BoostUtils.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.search.common.query; - -import org.apache.lucene.search.BoostQuery; -import org.apache.lucene.search.Query; - -/** - * A class of utilities related to query boosts. - */ -public final class BoostUtils { - private BoostUtils() { - } - - /** - * Wraps the given query into a BoostQuery, if {@code boost} is not equal to 1.0f. - * - * @param query The query. - * @param boost The boost. - * @return If {@code boost} is equal to 1.0f, then {@code query} is returned; otherwise, - * {@code query} is wrapped into a {@code BoostQuery} instance with the given boost. - */ - public static Query maybeWrapInBoostQuery(Query query, float boost) { - if (boost == 1.0f) { - return query; - } - return new BoostQuery(query, boost); - } -} diff --git a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx new file mode 100644 index 000000000..47ad1ea6c Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.docx differ diff --git a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java b/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java deleted file mode 100644 index 457ace646..000000000 --- a/src/java/com/twitter/search/common/query/CollectAnnotationsVisitor.java +++ /dev/null @@ -1,92 +0,0 @@ -package com.twitter.search.common.query; - - -import java.util.Map; -import java.util.Set; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Operator; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.QueryVisitor; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.annotation.Annotation; - -/** - * Collect the nodes with a specified annotation type in the given query. - */ -public class CollectAnnotationsVisitor extends QueryVisitor { - - protected final Annotation.Type type; - - protected final Map nodeToTypeMap = Maps.newIdentityHashMap(); - - public CollectAnnotationsVisitor(Annotation.Type type) { - this.type = Preconditions.checkNotNull(type); - } - - @Override - public Boolean visit(Disjunction disjunction) throws QueryParserException { - return visitBooleanQuery(disjunction); - } - - @Override - public Boolean visit(Conjunction conjunction) throws QueryParserException { - return visitBooleanQuery(conjunction); - } - - @Override - public Boolean visit(Phrase phrase) throws QueryParserException { - return visitQuery(phrase); - } - - @Override - public Boolean visit(Term term) throws QueryParserException { - return visitQuery(term); - } - - @Override - public Boolean visit(Operator operator) throws QueryParserException { - return visitQuery(operator); - } - - @Override - public Boolean visit(SpecialTerm special) throws QueryParserException { - return visitQuery(special); - } - - protected boolean visitQuery(Query query) throws QueryParserException { - if (query.hasAnnotationType(type)) { - collectNode(query); - return true; - } - return false; - } - - protected void collectNode(Query query) { - nodeToTypeMap.put(query, true); - } - - protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { - boolean found = false; - if (query.hasAnnotationType(type)) { - collectNode(query); - found = true; - } - for (Query child : query.getChildren()) { - found |= child.accept(this); - } - return found; - } - - public Set getNodes() { - return nodeToTypeMap.keySet(); - } -} diff --git a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx new file mode 100644 index 000000000..71c5ef367 Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.docx differ diff --git a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java b/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java deleted file mode 100644 index 0e135991e..000000000 --- a/src/java/com/twitter/search/common/query/CollectQueryTypeVisitor.java +++ /dev/null @@ -1,89 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Maps; - -import com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Operator; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.QueryVisitor; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; - -/** - * Collects the nodes with a specified query type in the given query. - */ -public class CollectQueryTypeVisitor extends QueryVisitor { - - protected final Query.QueryType queryType; - - protected final Map nodeToTypeMap = Maps.newIdentityHashMap(); - - public CollectQueryTypeVisitor(Query.QueryType queryType) { - this.queryType = queryType; - } - - @Override - public Boolean visit(Disjunction disjunction) throws QueryParserException { - return visitBooleanQuery(disjunction); - } - - @Override - public Boolean visit(Conjunction conjunction) throws QueryParserException { - return visitBooleanQuery(conjunction); - } - - @Override - public Boolean visit(Phrase phrase) throws QueryParserException { - return visitQuery(phrase); - } - - @Override - public Boolean visit(Term term) throws QueryParserException { - return visitQuery(term); - } - - @Override - public Boolean visit(Operator operator) throws QueryParserException { - return visitQuery(operator); - } - - @Override - public Boolean visit(SpecialTerm special) throws QueryParserException { - return visitQuery(special); - } - - public Set getCollectedNodes() { - return nodeToTypeMap.keySet(); - } - - protected boolean visitQuery(Query query) throws QueryParserException { - if (query.isTypeOf(queryType)) { - collectNode(query); - return true; - } - return false; - } - - protected void collectNode(Query query) { - nodeToTypeMap.put(query, true); - } - - protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { - boolean found = false; - if (query.isTypeOf(queryType)) { - collectNode(query); - found = true; - } - for (Query child : query.getChildren()) { - found |= child.accept(this); - } - return found; - } -} diff --git a/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx b/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx new file mode 100644 index 000000000..39b0d59b7 Binary files /dev/null and b/src/java/com/twitter/search/common/query/CollectVariantVisitor.docx differ diff --git a/src/java/com/twitter/search/common/query/CollectVariantVisitor.java b/src/java/com/twitter/search/common/query/CollectVariantVisitor.java deleted file mode 100644 index a66961d7f..000000000 --- a/src/java/com/twitter/search/common/query/CollectVariantVisitor.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.search.common.query; - -import com.twitter.search.queryparser.query.annotation.Annotation; - - -/** - * A visitor that collects the nodes that have :v annotation - */ -public class CollectVariantVisitor extends CollectAnnotationsVisitor { - public CollectVariantVisitor() { - super(Annotation.Type.VARIANT); - } -} diff --git a/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx b/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx new file mode 100644 index 000000000..334bd8563 Binary files /dev/null and b/src/java/com/twitter/search/common/query/DefaultFilterWeight.docx differ diff --git a/src/java/com/twitter/search/common/query/DefaultFilterWeight.java b/src/java/com/twitter/search/common/query/DefaultFilterWeight.java deleted file mode 100644 index 5fcc14433..000000000 --- a/src/java/com/twitter/search/common/query/DefaultFilterWeight.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.Set; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -/** - * An abstract Weight implementation that can be used by all "filter" classes (Query instances that - * should not contribute to the overall query score). - */ -public abstract class DefaultFilterWeight extends Weight { - public DefaultFilterWeight(Query query) { - super(query); - } - - @Override - public void extractTerms(Set terms) { - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context); - if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) { - return Explanation.match(0f, "Match on id " + doc); - } - return Explanation.match(0f, "No match on id " + doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - DocIdSetIterator disi = getDocIdSetIterator(context); - if (disi == null) { - return null; - } - - return new ConstantScoreScorer(this, 0.0f, ScoreMode.COMPLETE_NO_SCORES, disi); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return false; - } - - /** - * Returns the DocIdSetIterator over which the scorers created by this weight need to iterate. - * - * @param context The LeafReaderContext instance used to create the scorer. - */ - protected abstract DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) - throws IOException; -} diff --git a/src/java/com/twitter/search/common/query/DocIdFilter.docx b/src/java/com/twitter/search/common/query/DocIdFilter.docx new file mode 100644 index 000000000..ee2adcc7d Binary files /dev/null and b/src/java/com/twitter/search/common/query/DocIdFilter.docx differ diff --git a/src/java/com/twitter/search/common/query/DocIdFilter.java b/src/java/com/twitter/search/common/query/DocIdFilter.java deleted file mode 100644 index fed309f86..000000000 --- a/src/java/com/twitter/search/common/query/DocIdFilter.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.Set; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -/** - * Lucene filter on top of a known docid - * - */ -public class DocIdFilter extends Query { - private final int docid; - - public DocIdFilter(int docid) { - this.docid = docid; - } - - @Override - public Weight createWeight( - IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - return new Weight(this) { - @Override - public void extractTerms(Set terms) { - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context); - if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) { - return Explanation.match(0f, "Match on id " + doc); - } - return Explanation.match(0f, "No match on id " + doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - return new ConstantScoreScorer(this, 0.0f, scoreMode, new SingleDocDocIdSetIterator(docid)); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return true; - } - }; - } - - @Override - public int hashCode() { - return docid; - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof DocIdFilter)) { - return false; - } - - return docid == DocIdFilter.class.cast(obj).docid; - } - - @Override - public String toString(String field) { - return "DOC_ID_FILTER[docId=" + docid + " + ]"; - } -} diff --git a/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx b/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx new file mode 100644 index 000000000..ee86bfbb0 Binary files /dev/null and b/src/java/com/twitter/search/common/query/FieldRankHitInfo.docx differ diff --git a/src/java/com/twitter/search/common/query/FieldRankHitInfo.java b/src/java/com/twitter/search/common/query/FieldRankHitInfo.java deleted file mode 100644 index f7d509719..000000000 --- a/src/java/com/twitter/search/common/query/FieldRankHitInfo.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.common.query; - -/** - * When a hit (on a part of the query tree) occurs, this class is passed to HitAttributeCollector - * for collection. - * - * This implementation carries the following info: - *

    - *
  • The field that matched (the field ID is recorded)
  • - *
  • The query node that matched (the query node rank is recorded)
  • - *
  • The ID of the last doc that matched this query
  • - *
- * - * Each IdentifiableQuery should be associated with one FieldRankHitInfo, which is passed to a - * HitAttributeCollector when a hit occurs. - */ -public class FieldRankHitInfo { - protected static final int UNSET_DOC_ID = -1; - - private final int fieldId; - private final int rank; - private int docId = UNSET_DOC_ID; - - public FieldRankHitInfo(int fieldId, int rank) { - this.fieldId = fieldId; - this.rank = rank; - } - - public int getFieldId() { - return fieldId; - } - - public int getRank() { - return rank; - } - - public int getDocId() { - return docId; - } - - public void setDocId(int docId) { - this.docId = docId; - } - - public void resetDocId() { - this.docId = UNSET_DOC_ID; - } -} diff --git a/src/java/com/twitter/search/common/query/FieldWeightUtil.docx b/src/java/com/twitter/search/common/query/FieldWeightUtil.docx new file mode 100644 index 000000000..ff2adac3d Binary files /dev/null and b/src/java/com/twitter/search/common/query/FieldWeightUtil.docx differ diff --git a/src/java/com/twitter/search/common/query/FieldWeightUtil.java b/src/java/com/twitter/search/common/query/FieldWeightUtil.java deleted file mode 100644 index dcb7d08a8..000000000 --- a/src/java/com/twitter/search/common/query/FieldWeightUtil.java +++ /dev/null @@ -1,205 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.Collections; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.annotation.Nullable; - -import com.google.common.base.Enums; -import com.google.common.base.Function; -import com.google.common.base.Functions; -import com.google.common.base.Predicates; -import com.google.common.collect.FluentIterable; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.annotation.FieldAnnotationUtils; -import com.twitter.search.queryparser.query.annotation.FieldNameWithBoost; - -public final class FieldWeightUtil { - private static final Logger LOG = LoggerFactory.getLogger(FieldWeightUtil.class); - private FieldWeightUtil() { - } - - /** - * Combines default field weight configuration with field annotations and returns a - * field-to-weight map. - * - * @param query The query whose annotations we will look into - * @param defaultFieldWeightMap field-to-FieldWeightDefault map - * @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from - * the field-to-FieldWeightDefault map - * @param fieldNameToTyped A function that can turn string field name to typed field - * @param The typed field - */ - public static ImmutableMap combineDefaultWithAnnotation( - Query query, - Map defaultFieldWeightMap, - Map enabledFieldWeightMap, - Function fieldNameToTyped) throws QueryParserException { - return combineDefaultWithAnnotation( - query, - defaultFieldWeightMap, - enabledFieldWeightMap, - fieldNameToTyped, - Collections.emptyMap(), - Functions.forMap(Collections.emptyMap(), "")); - } - - /** - * Combines default field weight configuration with field annotations and returns a - * field-to-weight map. Also maps generic mappable fields to field weight boosts and resolves them - * - * @param query The query whose annotations we will look into - * @param defaultFieldWeightMap field-to-FieldWeightDefault map - * @param enabledFieldWeightMap for optimization, this is the field-to-weight map inferred from - * the field-to-FieldWeightDefault map - * @param fieldNameToTyped A function that can turn a string field name to typed field - * @param mappableFieldMap mapping of mappable fields to the corresponding typed fields - * @param typedToFieldName A function that can turn a typed field into a string field name - * @param The typed field - * - * Note: As a result of discussion on SEARCH-24029, we now allow replace and remove annotations - * on a single term. See http://go/fieldweight for info on field weight annotations. - */ - public static ImmutableMap combineDefaultWithAnnotation( - Query query, - Map defaultFieldWeightMap, - Map enabledFieldWeightMap, - Function fieldNameToTyped, - Map mappableFieldMap, - Function typedToFieldName) throws QueryParserException { - List fieldAnnotations = query.getAllAnnotationsOf(Annotation.Type.FIELD); - List mappableFieldAnnotations = - query.getAllAnnotationsOf(Annotation.Type.MAPPABLE_FIELD); - - if (fieldAnnotations.isEmpty() && mappableFieldAnnotations.isEmpty()) { - return ImmutableMap.copyOf(enabledFieldWeightMap); - } - - // Convert mapped fields to field annotations - Iterable fieldAnnotationsForMappedFields = - FluentIterable.from(mappableFieldAnnotations) - .transform(FieldWeightUtil.fieldAnnotationForMappableField(mappableFieldMap, - typedToFieldName)) - .filter(Predicates.notNull()); - - Iterable annotations = - Iterables.concat(fieldAnnotationsForMappedFields, fieldAnnotations); - - // Sanitize the field annotations first, remove the ones we don't know - // for REPLACE and REMOVE. - List sanitizedFields = Lists.newArrayList(); - Set seenModifierTypes = - EnumSet.noneOf(FieldNameWithBoost.FieldModifier.class); - - for (Annotation annotation : annotations) { - FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) annotation.getValue(); - T typedField = fieldNameToTyped.apply(fieldNameWithBoost.getFieldName()); - FieldNameWithBoost.FieldModifier modifier = fieldNameWithBoost.getFieldModifier(); - if (defaultFieldWeightMap.containsKey(typedField)) { - seenModifierTypes.add(modifier); - sanitizedFields.add(fieldNameWithBoost); - } - } - - // Even if there is no mapping for a mapped annotation, if a query is replaced by an unknown - // mapping, it should not map to other fields, so we need to detect a REPLACE annotation - if (seenModifierTypes.isEmpty() - && FieldAnnotationUtils.hasReplaceAnnotation(mappableFieldAnnotations)) { - seenModifierTypes.add(FieldNameWithBoost.FieldModifier.REPLACE); - } - - boolean onlyHasReplace = seenModifierTypes.size() == 1 - && seenModifierTypes.contains(FieldNameWithBoost.FieldModifier.REPLACE); - - // If we only have replace, start with an empty map, otherwise, start with all enabled fields. - Map actualMap = onlyHasReplace - ? Maps.newLinkedHashMap() - : Maps.newLinkedHashMap(enabledFieldWeightMap); - - // Go over all field annotations and apply them. - for (FieldNameWithBoost fieldAnnotation : sanitizedFields) { - T typedField = fieldNameToTyped.apply(fieldAnnotation.getFieldName()); - FieldNameWithBoost.FieldModifier modifier = fieldAnnotation.getFieldModifier(); - switch (modifier) { - case REMOVE: - actualMap.remove(typedField); - break; - - case ADD: - case REPLACE: - if (fieldAnnotation.getBoost().isPresent()) { - actualMap.put(typedField, fieldAnnotation.getBoost().get()); - } else { - // When annotation does not specify weight, use default weight - actualMap.put( - typedField, - defaultFieldWeightMap.get(typedField).getWeight()); - } - break; - default: - throw new QueryParserException("Unknown field annotation type: " + fieldAnnotation); - } - } - - return ImmutableMap.copyOf(actualMap); - } - - public static ImmutableMap combineDefaultWithAnnotation( - Query query, - Map defaultFieldWeightMap, - Map enabledFieldWeightMap) throws QueryParserException { - - return combineDefaultWithAnnotation( - query, defaultFieldWeightMap, enabledFieldWeightMap, Functions.identity()); - } - - /** - * Create an annotation of the FIELD type from annotations of the MAPPED_FIELD type - * @param mappableFieldMap mapping of mappable fields to the corresponding typed fields - * @param typedToFieldName A function that can turn a typed field into a string field name - * @param The typed field - * @return an Annotation with the same modifier and boost for a FIELD as the incoming MAPPED_FIELD - * annotation - */ - private static Function fieldAnnotationForMappableField( - final Map mappableFieldMap, - final Function typedToFieldName) { - return new Function() { - @Nullable - @Override - public Annotation apply(Annotation mappableAnnotation) { - FieldNameWithBoost fieldNameWithBoost = (FieldNameWithBoost) mappableAnnotation.getValue(); - MappableField mappedField = - Enums.getIfPresent( - MappableField.class, - fieldNameWithBoost.getFieldName().toUpperCase()).orNull(); - T typedFieldName = mappableFieldMap.get(mappedField); - Annotation fieldAnnotation = null; - if (typedFieldName != null) { - String fieldName = typedToFieldName.apply(typedFieldName); - FieldNameWithBoost mappedFieldBoost = - new FieldNameWithBoost( - fieldName, - fieldNameWithBoost.getBoost(), - fieldNameWithBoost.getFieldModifier()); - fieldAnnotation = Annotation.Type.FIELD.newInstance(mappedFieldBoost); - } - return fieldAnnotation; - } - }; - } -} diff --git a/src/java/com/twitter/search/common/query/FilteredQuery.docx b/src/java/com/twitter/search/common/query/FilteredQuery.docx new file mode 100644 index 000000000..94d0f2c98 Binary files /dev/null and b/src/java/com/twitter/search/common/query/FilteredQuery.docx differ diff --git a/src/java/com/twitter/search/common/query/FilteredQuery.java b/src/java/com/twitter/search/common/query/FilteredQuery.java deleted file mode 100644 index a4740970b..000000000 --- a/src/java/com/twitter/search/common/query/FilteredQuery.java +++ /dev/null @@ -1,225 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.Set; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -/** - * A pairing of a query and a filter. The hits traversal is driven by the query's DocIdSetIterator, - * and the filter is used only to do post-filtering. In other words, the filter is never used to - * find the next doc ID: it's only used to filter out the doc IDs returned by the query's - * DocIdSetIterator. This is useful when we need to have a conjunction between a query that can - * quickly iterate through doc IDs (eg. a posting list), and an expensive filter (eg. a filter based - * on the values stored in a CSF). - * - * For example, let say we want to build a query that returns all docs that have at least 100 faves. - * 1. One option is to go with the [min_faves 100] query. This would be very expensive though, - * because this query would have to walk through every doc in the segment and for each one of - * them it would have to extract the number of faves from the forward index. - * 2. Another option is to go with a conjunction between this query and the HAS_ENGAGEMENT filter: - * (+[min_faves 100] +[cached_filter has_engagements]). The HAS_ENGAGEMENT filter could - * traverse the doc ID space faster (if it's backed by a posting list). But this approach would - * still be slow, because as soon as the HAS_ENGAGEMENT filter finds a doc ID, the conjunction - * scorer would trigger an advance(docID) call on the min_faves part of the query, which has - * the same problem as the first option. - * 3. Finally, a better option for this particular case would be to drive by the HAS_ENGAGEMENT - * filter (because it can quickly jump over all docs that do not have any engagement), and use - * the min_faves filter as a post-processing step, on a much smaller set of docs. - */ -public class FilteredQuery extends Query { - /** - * A doc ID predicate that determines if the given doc ID should be accepted. - */ - @FunctionalInterface - public static interface DocIdFilter { - /** - * Determines if the given doc ID should be accepted. - */ - boolean accept(int docId) throws IOException; - } - - /** - * A factory for creating DocIdFilter instances based on a given LeafReaderContext instance. - */ - @FunctionalInterface - public static interface DocIdFilterFactory { - /** - * Returns a DocIdFilter instance for the given LeafReaderContext instance. - */ - DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException; - } - - private static class FilteredQueryDocIdSetIterator extends DocIdSetIterator { - private final DocIdSetIterator queryScorerIterator; - private final DocIdFilter docIdFilter; - - public FilteredQueryDocIdSetIterator( - DocIdSetIterator queryScorerIterator, DocIdFilter docIdFilter) { - this.queryScorerIterator = Preconditions.checkNotNull(queryScorerIterator); - this.docIdFilter = Preconditions.checkNotNull(docIdFilter); - } - - @Override - public int docID() { - return queryScorerIterator.docID(); - } - - @Override - public int nextDoc() throws IOException { - int docId; - do { - docId = queryScorerIterator.nextDoc(); - } while (docId != NO_MORE_DOCS && !docIdFilter.accept(docId)); - return docId; - } - - @Override - public int advance(int target) throws IOException { - int docId = queryScorerIterator.advance(target); - if (docId == NO_MORE_DOCS || docIdFilter.accept(docId)) { - return docId; - } - return nextDoc(); - } - - @Override - public long cost() { - return queryScorerIterator.cost(); - } - } - - private static class FilteredQueryScorer extends Scorer { - private final Scorer queryScorer; - private final DocIdFilter docIdFilter; - - public FilteredQueryScorer(Weight weight, Scorer queryScorer, DocIdFilter docIdFilter) { - super(weight); - this.queryScorer = Preconditions.checkNotNull(queryScorer); - this.docIdFilter = Preconditions.checkNotNull(docIdFilter); - } - - @Override - public int docID() { - return queryScorer.docID(); - } - - @Override - public float score() throws IOException { - return queryScorer.score(); - } - - @Override - public DocIdSetIterator iterator() { - return new FilteredQueryDocIdSetIterator(queryScorer.iterator(), docIdFilter); - } - - @Override - public float getMaxScore(int upTo) throws IOException { - return queryScorer.getMaxScore(upTo); - } - } - - private static class FilteredQueryWeight extends Weight { - private final Weight queryWeight; - private final DocIdFilterFactory docIdFilterFactory; - - public FilteredQueryWeight( - FilteredQuery query, Weight queryWeight, DocIdFilterFactory docIdFilterFactory) { - super(query); - this.queryWeight = Preconditions.checkNotNull(queryWeight); - this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory); - } - - @Override - public void extractTerms(Set terms) { - queryWeight.extractTerms(terms); - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - return queryWeight.explain(context, doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - Scorer queryScorer = queryWeight.scorer(context); - if (queryScorer == null) { - return null; - } - - return new FilteredQueryScorer(this, queryScorer, docIdFilterFactory.getDocIdFilter(context)); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return queryWeight.isCacheable(ctx); - } - } - - private final Query query; - private final DocIdFilterFactory docIdFilterFactory; - - public FilteredQuery(Query query, DocIdFilterFactory docIdFilterFactory) { - this.query = Preconditions.checkNotNull(query); - this.docIdFilterFactory = Preconditions.checkNotNull(docIdFilterFactory); - } - - public Query getQuery() { - return query; - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - Query rewrittenQuery = query.rewrite(reader); - if (rewrittenQuery != query) { - return new FilteredQuery(rewrittenQuery, docIdFilterFactory); - } - return this; - } - - @Override - public int hashCode() { - return query.hashCode() * 13 + docIdFilterFactory.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof FilteredQuery)) { - return false; - } - - FilteredQuery filteredQuery = FilteredQuery.class.cast(obj); - return query.equals(filteredQuery.query) - && docIdFilterFactory.equals(filteredQuery.docIdFilterFactory); - } - - @Override - public String toString(String field) { - StringBuilder sb = new StringBuilder(); - sb.append("FilteredQuery(") - .append(query) - .append(" -> ") - .append(docIdFilterFactory) - .append(")"); - return sb.toString(); - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) - throws IOException { - Weight queryWeight = Preconditions.checkNotNull(query.createWeight(searcher, scoreMode, boost)); - return new FilteredQueryWeight(this, queryWeight, docIdFilterFactory); - } -} diff --git a/src/java/com/twitter/search/common/query/FilteredScorer.docx b/src/java/com/twitter/search/common/query/FilteredScorer.docx new file mode 100644 index 000000000..1bce7324a Binary files /dev/null and b/src/java/com/twitter/search/common/query/FilteredScorer.docx differ diff --git a/src/java/com/twitter/search/common/query/FilteredScorer.java b/src/java/com/twitter/search/common/query/FilteredScorer.java deleted file mode 100644 index 41d9032f6..000000000 --- a/src/java/com/twitter/search/common/query/FilteredScorer.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; - -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; - -public class FilteredScorer extends Scorer { - protected final Scorer inner; - - public FilteredScorer(Weight weight, Scorer inner) { - super(weight); - this.inner = inner; - } - - @Override - public float score() throws IOException { - return inner.score(); - } - - @Override - public int docID() { - return inner.docID(); - } - - @Override - public DocIdSetIterator iterator() { - return inner.iterator(); - } - - @Override - public float getMaxScore(int upTo) throws IOException { - return inner.getMaxScore(upTo); - } -} diff --git a/src/java/com/twitter/search/common/query/HitAttributeCollector.docx b/src/java/com/twitter/search/common/query/HitAttributeCollector.docx new file mode 100644 index 000000000..71b286bdd Binary files /dev/null and b/src/java/com/twitter/search/common/query/HitAttributeCollector.docx differ diff --git a/src/java/com/twitter/search/common/query/HitAttributeCollector.java b/src/java/com/twitter/search/common/query/HitAttributeCollector.java deleted file mode 100644 index 21844aa71..000000000 --- a/src/java/com/twitter/search/common/query/HitAttributeCollector.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.List; -import java.util.Map; -import java.util.function.BiFunction; -import java.util.function.Function; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.Query; - -/** - * Not threadsafe, but should be reused across different queries unless the size of the existing - * one is too small for a new huge serialized query. - */ -public class HitAttributeCollector { - private final List hitInfos = Lists.newArrayList(); - private final BiFunction hitInfoSupplier; - - private int docBase = 0; - - public HitAttributeCollector() { - this.hitInfoSupplier = FieldRankHitInfo::new; - } - - /** - * Constructs a new {@code HitAttributionCollector} with the specified {@code FieldRankHitInfo} - * supplier. - * - * @param hitInfoSupplier function to supply a {@code FieldRankHitInfo} instance - */ - public HitAttributeCollector(BiFunction hitInfoSupplier) { - this.hitInfoSupplier = hitInfoSupplier; - } - - /** - * Creates a new IdentifiableQuery for the given query, fieldId and rank, and "registers" - * the fieldId and the rank with this collector. - * - * @param query the query to be wrapped. - * @param fieldId the ID of the field to be searched. - * @param rank The rank of this query. - * @return A new IdentifiableQuery instance for the given query, fieldId and rank. - */ - public IdentifiableQuery newIdentifiableQuery(Query query, int fieldId, int rank) { - FieldRankHitInfo fieldRankHitInfo = hitInfoSupplier.apply(fieldId, rank); - hitInfos.add(fieldRankHitInfo); - return new IdentifiableQuery(query, fieldRankHitInfo, this); - } - - public void clearHitAttributions(LeafReaderContext ctx, FieldRankHitInfo hitInfo) { - docBase = ctx.docBase; - hitInfo.resetDocId(); - } - - public void collectScorerAttribution(int docId, FieldRankHitInfo hitInfo) { - hitInfo.setDocId(docId + docBase); - } - - /** - * This method should be called when a global hit occurs. - * This method returns hit attribution summary for the whole query tree. - * This supports getting hit attribution for only the curDoc. - * - * @param docId docId passed in for checking against curDoc. - * @return Returns a map from node rank to a set of matching field IDs. This map does not contain - * entries for ranks that did not hit at all. - */ - public Map> getHitAttribution(int docId) { - return getHitAttribution(docId, (fieldId) -> fieldId); - } - - /** - * This method should be called when a global hit occurs. - * This method returns hit attribution summary for the whole query tree. - * This supports getting hit attribution for only the curDoc. - * - * @param docId docId passed in for checking against curDoc. - * @param fieldIdFunc The mapping of field IDs to objects of type T. - * @return Returns a map from node rank to a set of matching objects (usually field IDs or names). - * This map does not contain entries for ranks that did not hit at all. - */ - public Map> getHitAttribution(int docId, Function fieldIdFunc) { - int key = docId + docBase; - Map> hitMap = Maps.newHashMap(); - - // Manually iterate through all hitInfos elements. It's slightly faster than using an Iterator. - for (FieldRankHitInfo hitInfo : hitInfos) { - if (hitInfo.getDocId() == key) { - int rank = hitInfo.getRank(); - List rankHits = hitMap.computeIfAbsent(rank, k -> Lists.newArrayList()); - T fieldDescription = fieldIdFunc.apply(hitInfo.getFieldId()); - rankHits.add(fieldDescription); - } - } - - return hitMap; - } -} diff --git a/src/java/com/twitter/search/common/query/HitAttributeHelper.docx b/src/java/com/twitter/search/common/query/HitAttributeHelper.docx new file mode 100644 index 000000000..40cab5283 Binary files /dev/null and b/src/java/com/twitter/search/common/query/HitAttributeHelper.docx differ diff --git a/src/java/com/twitter/search/common/query/HitAttributeHelper.java b/src/java/com/twitter/search/common/query/HitAttributeHelper.java deleted file mode 100644 index 572f7b855..000000000 --- a/src/java/com/twitter/search/common/query/HitAttributeHelper.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.List; -import java.util.Map; -import java.util.function.Function; - -import com.google.common.collect.Maps; - -import com.twitter.search.queryparser.query.Query; - -import static com.twitter.search.common.query.FieldRankHitInfo.UNSET_DOC_ID; - -/** - * Generic helper class containing the data needed to set up and collect field hit attributions. - */ -public class HitAttributeHelper implements HitAttributeProvider { - private final HitAttributeCollector collector; - private final Function fieldIdsToFieldNames; - - // This is a mapping of type T query nodes to rank id - private final Map nodeToRankMap; - - // This is meant to expand individual Query nodes into multiple ranks, - // for example, expanding a multi_term_disjunction to include a rank for each disjunction value. - private final Map> expandedNodeToRankMap; - - // A single-entry cache for hit attribution, so we can reuse the immediate result. Will be used - // only when lastDocId matches - private ThreadLocal>> lastHitAttrHolder = new ThreadLocal<>(); - private ThreadLocal lastDocIdHolder = ThreadLocal.withInitial(() -> UNSET_DOC_ID); - - protected HitAttributeHelper( - HitAttributeCollector collector, - Function fieldIdsToFieldNames, - Map nodeToRankMap, - Map> expandedNodeToRankMap) { - this.collector = collector; - this.fieldIdsToFieldNames = fieldIdsToFieldNames; - this.nodeToRankMap = nodeToRankMap; - this.expandedNodeToRankMap = expandedNodeToRankMap; - } - - /** - * Constructs a new {@code HitAttributeHelper} with the specified {@code HitAttributeCollector} - * instance and fields. - * - * @param collector a collector instance - * @param fieldIdsToFieldNames a list of field names indexed by id - */ - public HitAttributeHelper(HitAttributeCollector collector, String[] fieldIdsToFieldNames) { - this(collector, - (fieldId) -> fieldIdsToFieldNames[fieldId], - Maps.newHashMap(), - Maps.newHashMap()); - } - - public HitAttributeCollector getFieldRankHitAttributeCollector() { - return collector; - } - - /** - * Returns hit attribution information indexed by node rank - * - * @param docId the document id - * @return a mapping from the query's node rank to a list of field names that were hit. - */ - public Map> getHitAttribution(int docId) { - // check cache first so we don't have to recompute the same thing. - if (lastDocIdHolder.get() == docId) { - return lastHitAttrHolder.get(); - } - - lastDocIdHolder.set(docId); - Map> hitAttribution = - collector.getHitAttribution(docId, fieldIdsToFieldNames); - lastHitAttrHolder.set(hitAttribution); - return hitAttribution; - } - - /** - * Adds a new node and its respective rank to the helper's node-to-rank map - * Will throw an exception if attempting to add/update an existing node - * - * @param node the query node - * @param rank the rank associated with the node - */ - public void addNodeRank(Query node, int rank) { - // if there are two of the same terms, just map them to the first rank, they should get the same - // hits back - if (!nodeToRankMap.containsKey(node)) { - nodeToRankMap.put(node, rank); - } - } - - public Map getNodeToRankMap() { - return nodeToRankMap; - } - - public Map> getExpandedNodeToRankMap() { - return expandedNodeToRankMap; - } -} diff --git a/src/java/com/twitter/search/common/query/HitAttributeProvider.docx b/src/java/com/twitter/search/common/query/HitAttributeProvider.docx new file mode 100644 index 000000000..4c898985c Binary files /dev/null and b/src/java/com/twitter/search/common/query/HitAttributeProvider.docx differ diff --git a/src/java/com/twitter/search/common/query/HitAttributeProvider.java b/src/java/com/twitter/search/common/query/HitAttributeProvider.java deleted file mode 100644 index bcdcea90c..000000000 --- a/src/java/com/twitter/search/common/query/HitAttributeProvider.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.List; -import java.util.Map; - -/** - * The interface for objects that can provide hit attributes for a document. - */ -public interface HitAttributeProvider { - /** Returns the hit attributes for the given document. */ - Map> getHitAttribution(int docId); -} diff --git a/src/java/com/twitter/search/common/query/IDDisjunctionQuery.docx b/src/java/com/twitter/search/common/query/IDDisjunctionQuery.docx new file mode 100644 index 000000000..3874d7b44 Binary files /dev/null and b/src/java/com/twitter/search/common/query/IDDisjunctionQuery.docx differ diff --git a/src/java/com/twitter/search/common/query/IDDisjunctionQuery.java b/src/java/com/twitter/search/common/query/IDDisjunctionQuery.java deleted file mode 100644 index e6ac8afe1..000000000 --- a/src/java/com/twitter/search/common/query/IDDisjunctionQuery.java +++ /dev/null @@ -1,378 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Iterator; -import java.util.List; -import java.util.Objects; -import java.util.Set; -import java.util.stream.Collectors; - -import org.apache.lucene.index.FilteredTermsEnum; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermState; -import org.apache.lucene.index.TermStates; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BulkScorer; -import org.apache.lucene.search.ConstantScoreQuery; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.ConstantScoreWeight; -import org.apache.lucene.search.DocIdSet; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.Weight; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.DocIdSetBuilder; - -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.IndexedNumericFieldSettings; -import com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl; -import com.twitter.search.queryparser.query.QueryParserException; - -/** - * An extension of Lucene's MultiTermQuery which creates a disjunction of - * long ID terms. Lucene tries to rewrite the Query depending on the number - * of clauses to perform as efficiently as possible. - */ -public class IDDisjunctionQuery extends MultiTermQuery { - private final List ids; - private final boolean useOrderPreservingEncoding; - - /** Creates a new IDDisjunctionQuery instance. */ - public IDDisjunctionQuery(List ids, String field, ImmutableSchemaInterface schemaSnapshot) - throws QueryParserException { - super(field); - this.ids = ids; - - setRewriteMethod(new Rewrite()); - - if (!schemaSnapshot.hasField(field)) { - throw new QueryParserException( - "Tried to search a field which does not exist in schema: " + field); - } - - IndexedNumericFieldSettings numericFieldSettings = - schemaSnapshot.getFieldInfo(field).getFieldType().getNumericFieldSettings(); - - if (numericFieldSettings == null) { - throw new QueryParserException("Requested id field is not numerical: " + field); - } - - this.useOrderPreservingEncoding = numericFieldSettings.isUseSortableEncoding(); - } - - /** - * Work around for an issue where LongTerms are not valid utf8, so calling - * toString on any TermQuery containing a LongTerm may cause exceptions. - */ - private class Rewrite extends RewriteMethod { - @Override - public Query rewrite(IndexReader reader, MultiTermQuery query) throws IOException { - Query result = new MultiTermQueryConstantScoreWrapper( - (IDDisjunctionQuery) query, useOrderPreservingEncoding); - return result; - } - } - - @Override - protected TermsEnum getTermsEnum(final Terms terms, AttributeSource atts) throws IOException { - final Iterator it = this.ids.iterator(); - final TermsEnum termsEnum = terms.iterator(); - - return new FilteredTermsEnum(termsEnum) { - private final BytesRef term = useOrderPreservingEncoding - ? SortableLongTermAttributeImpl.newBytesRef() - : LongTermAttributeImpl.newBytesRef(); - - @Override protected AcceptStatus accept(BytesRef term) throws IOException { - return AcceptStatus.YES; - } - - @Override public BytesRef next() throws IOException { - while (it.hasNext()) { - Long longTerm = it.next(); - if (useOrderPreservingEncoding) { - SortableLongTermAttributeImpl.copyLongToBytesRef(term, longTerm); - } else { - LongTermAttributeImpl.copyLongToBytesRef(term, longTerm); - } - if (termsEnum.seekExact(term)) { - return term; - } - } - - return null; - } - }; - } - - @Override - public String toString(String field) { - StringBuilder builder = new StringBuilder(); - builder.append("IDDisjunction[").append(this.field).append(":"); - for (Long id : this.ids) { - builder.append(id); - builder.append(","); - } - builder.setLength(builder.length() - 1); - builder.append("]"); - return builder.toString(); - } - - private static class TermQueryWithToString extends TermQuery { - private final boolean useOrderPreservingEncoding; - - public TermQueryWithToString(Term t, TermStates states, boolean useOrderPreservingEncoding) { - super(t, states); - this.useOrderPreservingEncoding = useOrderPreservingEncoding; - } - - @Override - public String toString(String field) { - StringBuilder buffer = new StringBuilder(); - if (!getTerm().field().equals(field)) { - buffer.append(getTerm().field()); - buffer.append(":"); - } - long longTerm; - BytesRef termBytes = getTerm().bytes(); - if (useOrderPreservingEncoding) { - longTerm = SortableLongTermAttributeImpl.copyBytesRefToLong(termBytes); - } else { - longTerm = LongTermAttributeImpl.copyBytesRefToLong(termBytes); - } - buffer.append(longTerm); - return buffer.toString(); - } - } - - /** - * This class provides the functionality behind {@link MultiTermQuery#CONSTANT_SCORE_REWRITE}. - * It tries to rewrite per-segment as a boolean query that returns a constant score and otherwise - * fills a DocIdSet with matches and builds a Scorer on top of this DocIdSet. - */ - static final class MultiTermQueryConstantScoreWrapper extends Query { - // disable the rewrite option which will scan all posting lists sequentially and perform - // the intersection using a temporary DocIdSet. In earlybird this mode is slower than a "normal" - // disjunctive BooleanQuery, due to early termination and the fact that everything is in memory. - private static final int BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD = 3000; - - private static class TermAndState { - private final BytesRef term; - private final TermState state; - private final int docFreq; - private final long totalTermFreq; - - TermAndState(BytesRef term, TermState state, int docFreq, long totalTermFreq) { - this.term = term; - this.state = state; - this.docFreq = docFreq; - this.totalTermFreq = totalTermFreq; - } - } - - private static class WeightOrDocIdSet { - private final Weight weight; - private final DocIdSet docIdSet; - - WeightOrDocIdSet(Weight weight) { - this.weight = Objects.requireNonNull(weight); - this.docIdSet = null; - } - - WeightOrDocIdSet(DocIdSet docIdSet) { - this.docIdSet = docIdSet; - this.weight = null; - } - } - - protected final IDDisjunctionQuery query; - private final boolean useOrderPreservingEncoding; - - /** - * Wrap a {@link MultiTermQuery} as a Filter. - */ - protected MultiTermQueryConstantScoreWrapper( - IDDisjunctionQuery query, - boolean useOrderPreservingEncoding) { - this.query = query; - this.useOrderPreservingEncoding = useOrderPreservingEncoding; - } - - @Override - public String toString(String field) { - // query.toString should be ok for the filter, too, if the query boost is 1.0f - return query.toString(field); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof MultiTermQueryConstantScoreWrapper)) { - return false; - } - - return query.equals(MultiTermQueryConstantScoreWrapper.class.cast(obj).query); - } - - @Override - public int hashCode() { - return query == null ? 0 : query.hashCode(); - } - - /** Returns the field name for this query */ - public String getField() { - return query.getField(); - } - - private List getIDs() { - return query.ids; - } - - @Override - public Weight createWeight( - final IndexSearcher searcher, - final ScoreMode scoreMode, - final float boost) throws IOException { - return new ConstantScoreWeight(this, boost) { - /** Try to collect terms from the given terms enum and return true iff all - * terms could be collected. If {@code false} is returned, the enum is - * left positioned on the next term. */ - private boolean collectTerms(LeafReaderContext context, - TermsEnum termsEnum, - List terms) throws IOException { - final int threshold = Math.min(BOOLEAN_REWRITE_TERM_COUNT_THRESHOLD, - BooleanQuery.getMaxClauseCount()); - for (int i = 0; i < threshold; ++i) { - final BytesRef term = termsEnum.next(); - if (term == null) { - return true; - } - TermState state = termsEnum.termState(); - terms.add(new TermAndState(BytesRef.deepCopyOf(term), - state, - termsEnum.docFreq(), - termsEnum.totalTermFreq())); - } - return termsEnum.next() == null; - } - - /** - * On the given leaf context, try to either rewrite to a disjunction if - * there are few terms, or build a DocIdSet containing matching docs. - */ - private WeightOrDocIdSet rewrite(LeafReaderContext context) - throws IOException { - final Terms terms = context.reader().terms(query.getField()); - if (terms == null) { - // field does not exist - return new WeightOrDocIdSet((DocIdSet) null); - } - - final TermsEnum termsEnum = query.getTermsEnum(terms); - assert termsEnum != null; - - PostingsEnum docs = null; - - final List collectedTerms = new ArrayList<>(); - if (collectTerms(context, termsEnum, collectedTerms)) { - // build a boolean query - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - for (TermAndState t : collectedTerms) { - final TermStates termStates = new TermStates(searcher.getTopReaderContext()); - termStates.register(t.state, context.ord, t.docFreq, t.totalTermFreq); - final Term term = new Term(query.getField(), t.term); - bqBuilder.add( - new TermQueryWithToString(term, termStates, useOrderPreservingEncoding), - Occur.SHOULD); - } - Query q = BoostUtils.maybeWrapInBoostQuery( - new ConstantScoreQuery(bqBuilder.build()), score()); - return new WeightOrDocIdSet( - searcher.rewrite(q).createWeight(searcher, scoreMode, boost)); - } - - // Too many terms: go back to the terms we already collected and start building - // the DocIdSet - DocIdSetBuilder builder = new DocIdSetBuilder(context.reader().maxDoc()); - if (!collectedTerms.isEmpty()) { - TermsEnum termsEnum2 = terms.iterator(); - for (TermAndState t : collectedTerms) { - termsEnum2.seekExact(t.term, t.state); - docs = termsEnum2.postings(docs, PostingsEnum.NONE); - builder.add(docs); - } - } - - // Then keep filling the DocIdSet with remaining terms - do { - docs = termsEnum.postings(docs, PostingsEnum.NONE); - builder.add(docs); - } while (termsEnum.next() != null); - - return new WeightOrDocIdSet(builder.build()); - } - - private Scorer scorer(DocIdSet set) throws IOException { - if (set == null) { - return null; - } - final DocIdSetIterator disi = set.iterator(); - if (disi == null) { - return null; - } - return new ConstantScoreScorer(this, score(), ScoreMode.COMPLETE_NO_SCORES, disi); - } - - @Override - public BulkScorer bulkScorer(LeafReaderContext context) throws IOException { - final WeightOrDocIdSet weightOrDocIdSet = rewrite(context); - if (weightOrDocIdSet.weight != null) { - return weightOrDocIdSet.weight.bulkScorer(context); - } else { - final Scorer scorer = scorer(weightOrDocIdSet.docIdSet); - if (scorer == null) { - return null; - } - return new DefaultBulkScorer(scorer); - } - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - final WeightOrDocIdSet weightOrDocIdSet = rewrite(context); - if (weightOrDocIdSet.weight != null) { - return weightOrDocIdSet.weight.scorer(context); - } else { - return scorer(weightOrDocIdSet.docIdSet); - } - } - - @Override - public void extractTerms(Set terms) { - terms.addAll(getIDs() - .stream() - .map(id -> new Term(getField(), LongTermAttributeImpl.copyIntoNewBytesRef(id))) - .collect(Collectors.toSet())); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return false; - } - }; - } - } -} diff --git a/src/java/com/twitter/search/common/query/IdentifiableQuery.docx b/src/java/com/twitter/search/common/query/IdentifiableQuery.docx new file mode 100644 index 000000000..3fe018ec1 Binary files /dev/null and b/src/java/com/twitter/search/common/query/IdentifiableQuery.docx differ diff --git a/src/java/com/twitter/search/common/query/IdentifiableQuery.java b/src/java/com/twitter/search/common/query/IdentifiableQuery.java deleted file mode 100644 index dbecf88aa..000000000 --- a/src/java/com/twitter/search/common/query/IdentifiableQuery.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -/** - * Query implementation adds attribute collection support for an underlying query. - */ -public class IdentifiableQuery extends Query { - protected final Query inner; - private final FieldRankHitInfo queryId; - private final HitAttributeCollector attrCollector; - - public IdentifiableQuery(Query inner, FieldRankHitInfo queryId, - HitAttributeCollector attrCollector) { - this.inner = Preconditions.checkNotNull(inner); - this.queryId = queryId; - this.attrCollector = Preconditions.checkNotNull(attrCollector); - } - - @Override - public Weight createWeight( - IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - Weight innerWeight = inner.createWeight(searcher, scoreMode, boost); - return new IdentifiableQueryWeight(this, innerWeight, queryId, attrCollector); - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - Query rewritten = inner.rewrite(reader); - if (rewritten != inner) { - return new IdentifiableQuery(rewritten, queryId, attrCollector); - } - return this; - } - - @Override - public int hashCode() { - return inner.hashCode() * 13 + (queryId == null ? 0 : queryId.hashCode()); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof IdentifiableQuery)) { - return false; - } - - IdentifiableQuery identifiableQuery = IdentifiableQuery.class.cast(obj); - return inner.equals(identifiableQuery.inner) - && (queryId == null - ? identifiableQuery.queryId == null - : queryId.equals(identifiableQuery.queryId)); - } - - @Override - public String toString(String field) { - return inner.toString(field); - } - - @VisibleForTesting - public Query getQueryForTest() { - return inner; - } - - @VisibleForTesting - public FieldRankHitInfo getQueryIdForTest() { - return queryId; - } -} diff --git a/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.docx b/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.docx new file mode 100644 index 000000000..a4a135617 Binary files /dev/null and b/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.docx differ diff --git a/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java b/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java deleted file mode 100644 index 98c8340eb..000000000 --- a/src/java/com/twitter/search/common/query/IdentifiableQueryScorer.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; - -/** - * Scorer implementation that adds attribute collection support for an underlying query. - * Meant to be used in conjunction with {@link IdentifiableQuery}. - */ -public class IdentifiableQueryScorer extends FilteredScorer { - private final FieldRankHitInfo queryId; - private final HitAttributeCollector attrCollector; - - public IdentifiableQueryScorer(Weight weight, Scorer inner, FieldRankHitInfo queryId, - HitAttributeCollector attrCollector) { - super(weight, inner); - this.queryId = queryId; - this.attrCollector = Preconditions.checkNotNull(attrCollector); - } - - @Override - public DocIdSetIterator iterator() { - final DocIdSetIterator superDISI = super.iterator(); - - return new DocIdSetIterator() { - @Override - public int docID() { - return superDISI.docID(); - } - - @Override - public int nextDoc() throws IOException { - int docid = superDISI.nextDoc(); - if (docid != NO_MORE_DOCS) { - attrCollector.collectScorerAttribution(docid, queryId); - } - return docid; - } - - @Override - public int advance(int target) throws IOException { - int docid = superDISI.advance(target); - if (docid != NO_MORE_DOCS) { - attrCollector.collectScorerAttribution(docid, queryId); - } - return docid; - } - - @Override - public long cost() { - return superDISI.cost(); - } - }; - } -} diff --git a/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.docx b/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.docx new file mode 100644 index 000000000..8053b5beb Binary files /dev/null and b/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.docx differ diff --git a/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.java b/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.java deleted file mode 100644 index 5daba7517..000000000 --- a/src/java/com/twitter/search/common/query/IdentifiableQueryWeight.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.Set; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; - -/** - * Weight implementation that adds attribute collection support for an underlying query. - * Meant to be used in conjunction with {@link IdentifiableQuery}. - */ -public class IdentifiableQueryWeight extends Weight { - private final Weight inner; - private final FieldRankHitInfo queryId; - private final HitAttributeCollector attrCollector; - - /** Creates a new IdentifiableQueryWeight instance. */ - public IdentifiableQueryWeight(IdentifiableQuery query, Weight inner, FieldRankHitInfo queryId, - HitAttributeCollector attrCollector) { - super(query); - this.inner = inner; - this.queryId = queryId; - this.attrCollector = Preconditions.checkNotNull(attrCollector); - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) - throws IOException { - return inner.explain(context, doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - attrCollector.clearHitAttributions(context, queryId); - Scorer innerScorer = inner.scorer(context); - if (innerScorer != null) { - return new IdentifiableQueryScorer(this, innerScorer, queryId, attrCollector); - } else { - return null; - } - } - - @Override - public void extractTerms(Set terms) { - inner.extractTerms(terms); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return inner.isCacheable(ctx); - } -} diff --git a/src/java/com/twitter/search/common/query/MappableField.docx b/src/java/com/twitter/search/common/query/MappableField.docx new file mode 100644 index 000000000..95e75fad5 Binary files /dev/null and b/src/java/com/twitter/search/common/query/MappableField.docx differ diff --git a/src/java/com/twitter/search/common/query/MappableField.java b/src/java/com/twitter/search/common/query/MappableField.java deleted file mode 100644 index 53905472c..000000000 --- a/src/java/com/twitter/search/common/query/MappableField.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.search.common.query; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -/** - * The indices may map the fields declared here to fields internally without exposing their schemas - * to other services. This can be used, for example, to set boosts for URL-like fields in Earlybird - * without direct knowledge of the internal Earlybird field name - */ -public enum MappableField { - REFERRAL, - URL; - - static { - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (MappableField mappableField : MappableField.values()) { - builder.put(mappableField, mappableField.toString().toLowerCase()); - } - MAPPABLE_FIELD_TO_NAME_MAP = Maps.immutableEnumMap(builder.build()); - } - - private static final ImmutableMap MAPPABLE_FIELD_TO_NAME_MAP; - - /** Returns the name of the given MappableField. */ - public static String mappableFieldName(MappableField mappableField) { - return MAPPABLE_FIELD_TO_NAME_MAP.get(mappableField); - } - - /** Returns the name of this MappableField. */ - public String getName() { - return MAPPABLE_FIELD_TO_NAME_MAP.get(this); - } -} diff --git a/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.docx b/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.docx new file mode 100644 index 000000000..2900c3ecb Binary files /dev/null and b/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.docx differ diff --git a/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.java b/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.java deleted file mode 100644 index 1f54b0671..000000000 --- a/src/java/com/twitter/search/common/query/MultiTermDisjunctionQuery.java +++ /dev/null @@ -1,61 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; -import java.util.Iterator; -import java.util.Set; - -import org.apache.lucene.index.FilteredTermsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.MultiTermQuery; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; - - -public class MultiTermDisjunctionQuery extends MultiTermQuery { - - private final Set values; - - /** Creates a new MultiTermDisjunctionQuery instance. */ - public MultiTermDisjunctionQuery(String field, Set values) { - super(field); - this.values = values; - } - - @Override - protected TermsEnum getTermsEnum(Terms terms, AttributeSource atts) - throws IOException { - final TermsEnum termsEnum = terms.iterator(); - final Iterator it = values.iterator(); - - return new FilteredTermsEnum(termsEnum) { - @Override protected AcceptStatus accept(BytesRef term) throws IOException { - return AcceptStatus.YES; - } - - @Override public BytesRef next() throws IOException { - while (it.hasNext()) { - BytesRef termRef = it.next(); - if (termsEnum.seekExact(termRef)) { - return termRef; - } - } - - return null; - } - }; - } - - @Override - public String toString(String field) { - StringBuilder builder = new StringBuilder(); - builder.append("MultiTermDisjunctionQuery["); - for (BytesRef termVal : this.values) { - builder.append(termVal); - builder.append(","); - } - builder.setLength(builder.length() - 1); - builder.append("]"); - return builder.toString(); - } -} diff --git a/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.docx b/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.docx new file mode 100644 index 000000000..ba53e2833 Binary files /dev/null and b/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.docx differ diff --git a/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.java b/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.java deleted file mode 100644 index e9db5beac..000000000 --- a/src/java/com/twitter/search/common/query/QueryCommonFieldHitsVisitor.java +++ /dev/null @@ -1,160 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; - -import com.google.common.collect.Sets; - -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.search.Link; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchQueryVisitor; - -/** - * Visitor to track the fields hits of each node - * Returns the common fields among conjunctions and the union of the fields amongst disjunctions - */ -public final class QueryCommonFieldHitsVisitor extends SearchQueryVisitor> { - - private static final Logger LOG = Logger.getLogger(QueryCommonFieldHitsVisitor.class.getName()); - - private Map nodeToRankMap; - private Map> hitFieldsByRank; - - /** - * Find query term hit intersections based on hitmap given by HitAttributeHelper - * - * @param hitAttributeHelper the HitAttributeHelper - * @param docID documentID - * @param query the query searched - * @return a set of hit fields in String representation - */ - public static Set findIntersection( - HitAttributeHelper hitAttributeHelper, - int docID, - Query query) { - return findIntersection(hitAttributeHelper.getNodeToRankMap(), - hitAttributeHelper.getHitAttribution(docID), - query); - } - - /** - * Find query term hit intersections based on hitmap given by HitAttributeHelper - * - * @param nodeToRankMap the map of query node to its integer rank value - * @param hitFieldsByRank map of rank to list of hit fields in String representation - * @param query the query searched - * @return a set of hit fields in String representation - */ - public static Set findIntersection( - Map nodeToRankMap, - Map> hitFieldsByRank, - Query query) { - QueryCommonFieldHitsVisitor visitor = - new QueryCommonFieldHitsVisitor(nodeToRankMap, hitFieldsByRank); - try { - Set returnSet = query.accept(visitor); - return returnSet; - } catch (QueryParserException e) { - LOG.log(Level.SEVERE, "Could not find intersection for query [" + query + "]: ", e); - return Collections.emptySet(); - } - } - - private QueryCommonFieldHitsVisitor(Map nodeToRankMap, - Map> hitFieldsByRank) { - this.nodeToRankMap = nodeToRankMap; - this.hitFieldsByRank = hitFieldsByRank; - } - - @Override - public Set visit(Disjunction disjunction) throws QueryParserException { - Set fieldHitIntersections = Sets.newHashSet(); - for (Query child : disjunction.getChildren()) { - fieldHitIntersections.addAll(child.accept(this)); - } - return fieldHitIntersections; - } - - @Override - public Set visit(Conjunction conjunction) throws QueryParserException { - List children = conjunction.getChildren(); - if (!children.isEmpty()) { - boolean initializedIntersections = false; - Set fieldHitIntersections = Sets.newHashSet(); - for (Query child : children) { - Set hits = child.accept(this); - if (hits.isEmpty()) { - // if it is empty, it means this query node is not of term type - // and we do not include these in the field intersection - // eg. cache filters, proximity groups - continue; - } - if (!initializedIntersections) { - fieldHitIntersections.addAll(hits); - initializedIntersections = true; - } else { - fieldHitIntersections.retainAll(hits); - } - } - return fieldHitIntersections; - } - return Collections.emptySet(); - } - - @Override - public Set visit(Term term) throws QueryParserException { - Set fieldHitIntersections = Sets.newHashSet(); - Integer rank = nodeToRankMap.get(term); - if (rank != null) { - List fields = hitFieldsByRank.get(rank); - // for disjunction cases where a term may not have any hits - if (fields != null) { - fieldHitIntersections.addAll(fields); - } - } - return fieldHitIntersections; - } - - @Override - public Set visit(SpecialTerm specialTerm) throws QueryParserException { - // This is way of splitting @mentions ensures consistency with way the lucene query is built in - // expertsearch - if (specialTerm.getType() == SpecialTerm.Type.MENTION && specialTerm.getValue().contains("_")) { - Phrase phrase = new Phrase(specialTerm.getValue().split("_")); - return phrase.accept(this); - } - return specialTerm.toTermOrPhrase().accept(this); - } - - @Override - public Set visit(SearchOperator operator) throws QueryParserException { - return Collections.emptySet(); - } - - @Override - public Set visit(Link link) throws QueryParserException { - return link.toPhrase().accept(this); - } - - @Override - public Set visit(Phrase phrase) throws QueryParserException { - // All terms in the phrase should return the same hits fields, just check the first one - List terms = phrase.getTerms(); - if (!terms.isEmpty()) { - Term term = new Term(phrase.getTerms().get(0)); - return term.accept(this); - } - return Collections.emptySet(); - } -} diff --git a/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.docx b/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.docx new file mode 100644 index 000000000..7c2792219 Binary files /dev/null and b/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.docx differ diff --git a/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.java b/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.java deleted file mode 100644 index 1a4ad07ad..000000000 --- a/src/java/com/twitter/search/common/query/QueryHitAttributeHelper.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.Collections; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Map; -import java.util.function.Function; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.visitors.MultiTermDisjunctionRankVisitor; -import com.twitter.search.queryparser.visitors.NodeRankAnnotator; -import com.twitter.search.queryparser.visitors.QueryTreeIndex; - -/** - * A helper class to collect field and query node hit attributions. - */ -public class QueryHitAttributeHelper extends HitAttributeHelper { - private final Query annotatedQuery; - - protected QueryHitAttributeHelper(HitAttributeCollector collector, - Function fieldIdsToFieldNames, - IdentityHashMap nodeToRankMap, - Query annotatedQuery, - Map> expandedRanksMap) { - super(collector, fieldIdsToFieldNames, nodeToRankMap, expandedRanksMap); - this.annotatedQuery = annotatedQuery; - } - - /** - * Constructor specific for com.twitter.search.queryParser.query.Query - * - * This helper visits a parsed query to construct a node-to-rank mapping, - * and uses a schema to determine all of the possible fields to be tracked. - * A collector is then created. - * - * @param query the query for which we will collect hit attribution. - * @param schema the indexing schema. - */ - public static QueryHitAttributeHelper from(Query query, final Schema schema) - throws QueryParserException { - IdentityHashMap nodeToRankMap; - Query annotatedQuery; - - // First see if the query already has node rank annotations on it. If so, we'll just use those - // to identify query nodes. - // We enforce that all provided ranks are in the range of [0, N-1] so not to blow up the size - // of the collection array. - QueryRankVisitor rankVisitor = new QueryRankVisitor(); - if (query.accept(rankVisitor)) { - nodeToRankMap = rankVisitor.getNodeToRankMap(); - annotatedQuery = query; - } else { - // Otherwise, we will assign all nodes in-order ranks, and use those to track per-node hit - // attribution - QueryTreeIndex queryTreeIndex = QueryTreeIndex.buildFor(query); - NodeRankAnnotator annotator = new NodeRankAnnotator(queryTreeIndex.getNodeToIndexMap()); - annotatedQuery = query.accept(annotator); - nodeToRankMap = annotator.getUpdatedNodeToRankMap(); - } - - // Extract ranks for multi_term_disjunction operators - MultiTermDisjunctionRankVisitor multiTermDisjunctionRankVisitor = - new MultiTermDisjunctionRankVisitor(Collections.max(nodeToRankMap.values())); - annotatedQuery.accept(multiTermDisjunctionRankVisitor); - Map> expandedRanksMap = - multiTermDisjunctionRankVisitor.getMultiTermDisjunctionRankExpansionsMap(); - - return new QueryHitAttributeHelper( - new HitAttributeCollector(), - (fieldId) -> schema.getFieldName(fieldId), - nodeToRankMap, - annotatedQuery, - expandedRanksMap); - } - - public Query getAnnotatedQuery() { - return annotatedQuery; - } -} diff --git a/src/java/com/twitter/search/common/query/QueryRankVisitor.docx b/src/java/com/twitter/search/common/query/QueryRankVisitor.docx new file mode 100644 index 000000000..a4d14a2e1 Binary files /dev/null and b/src/java/com/twitter/search/common/query/QueryRankVisitor.docx differ diff --git a/src/java/com/twitter/search/common/query/QueryRankVisitor.java b/src/java/com/twitter/search/common/query/QueryRankVisitor.java deleted file mode 100644 index e6f657f6a..000000000 --- a/src/java/com/twitter/search/common/query/QueryRankVisitor.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.IdentityHashMap; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.visitors.DetectAnnotationVisitor; - -/** - * A visitor that collects node ranks from :r annotation in the query - */ -public class QueryRankVisitor extends DetectAnnotationVisitor { - private final IdentityHashMap nodeToRankMap = Maps.newIdentityHashMap(); - - public QueryRankVisitor() { - super(Annotation.Type.NODE_RANK); - } - - @Override - protected boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { - if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { - collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); - } - - boolean found = false; - for (Query child : query.getChildren()) { - found |= child.accept(this); - } - return found; - } - - @Override - protected boolean visitQuery(Query query) throws QueryParserException { - if (query.hasAnnotationType(Annotation.Type.NODE_RANK)) { - collectNodeRank(query.getAnnotationOf(Annotation.Type.NODE_RANK).get(), query); - return true; - } - - return false; - } - - private void collectNodeRank(Annotation anno, Query query) { - Preconditions.checkArgument(anno.getType() == Annotation.Type.NODE_RANK); - int rank = (Integer) anno.getValue(); - nodeToRankMap.put(query, rank); - } - - public IdentityHashMap getNodeToRankMap() { - return nodeToRankMap; - } -} diff --git a/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.docx b/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.docx new file mode 100644 index 000000000..f26d505af Binary files /dev/null and b/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.docx differ diff --git a/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.java b/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.java deleted file mode 100644 index f68438b22..000000000 --- a/src/java/com/twitter/search/common/query/SingleDocDocIdSetIterator.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.common.query; - -import java.io.IOException; - -import org.apache.lucene.search.DocIdSetIterator; - -public class SingleDocDocIdSetIterator extends DocIdSetIterator { - - // the only docid in the list - private final int doc; - - private int docid = -1; - - public SingleDocDocIdSetIterator(int doc) { - this.doc = doc; - } - - @Override - public int docID() { - return docid; - } - - @Override - public int nextDoc() throws IOException { - if (docid == -1) { - docid = doc; - } else { - docid = NO_MORE_DOCS; - } - return docid; - } - - @Override - public int advance(int target) throws IOException { - if (docid == NO_MORE_DOCS) { - return docid; - } else if (doc < target) { - docid = NO_MORE_DOCS; - return docid; - } else { - docid = doc; - } - return docid; - } - - @Override - public long cost() { - return 1; - } - -} diff --git a/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.docx b/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.docx new file mode 100644 index 000000000..917912fa8 Binary files /dev/null and b/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.docx differ diff --git a/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.java b/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.java deleted file mode 100644 index 4ea8e53ba..000000000 --- a/src/java/com/twitter/search/common/query/StaticHitAttributeProvider.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.common.query; - -import java.util.Collections; -import java.util.List; -import java.util.Map; - -/** - * A hit attribute provider based on the static data - */ -public class StaticHitAttributeProvider implements HitAttributeProvider { - private int currentDocId; - private Map> currentHitAttr; - - public StaticHitAttributeProvider() { - } - - /** - * Set a fake last doc id and hit attribution, this is only used to generate explanation. - */ - public void setCurrentHitAttr(int docId, Map> hitAttr) { - this.currentDocId = docId; - this.currentHitAttr = hitAttr; - } - - @Override - public Map> getHitAttribution(int docId) { - if (docId == currentDocId) { - return currentHitAttr; - } - return Collections.EMPTY_MAP; - } -} diff --git a/src/java/com/twitter/search/common/relevance/BUILD b/src/java/com/twitter/search/common/relevance/BUILD deleted file mode 100644 index 118eea883..000000000 --- a/src/java/com/twitter/search/common/relevance/BUILD +++ /dev/null @@ -1,257 +0,0 @@ -java_library( - name = "utils", - sources = ["utils/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/tweetypie", - "src/thrift/com/twitter/search:earlybird-java", - "src/thrift/com/twitter/search/common:schema-java", - ], -) - -java_library( - name = "ranking", - sources = ["ranking/**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":utils", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/relevance/features", - "src/thrift/com/twitter/search:earlybird-java", - ], -) - -TRENDS_DATA_SERVICE_SOURCES = [ - "TrendsThriftDataServiceManager.java", - "NGramCache.java", -] - -java_library( - name = "trends-data-service", - sources = TRENDS_DATA_SERVICE_SOURCES, - platform = "java8", - provides = artifact( - org = "com.twitter.search.common.relevance", - name = "trends-data-service", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/client", - "finagle/finagle-core/src/main", - "finagle/finagle-thrift/src/main/java", - "finagle/finagle-thriftmux/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/penguin/search/filter", - "src/java/com/twitter/search/common/metrics", - "src/thrift/com/twitter/trends/plus:trends-plus-java", - "src/thrift/com/twitter/trends/service/gen:trends_service-java", - "src/thrift/com/twitter/trends/trending_content:trending-content-service-java", - "trends/trends_metadata/thrift/src/main/thrift/com/twitter/trends/trends_metadata:thrift-java", - "twitter-server-internal/src/main/scala", - "util/util-core:scala", - "util/util-stats/src/main/scala", - ], -) - -java_library( - name = "feature-update-reader", - sources = ["readers/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/thrift/com/twitter/search/common:schema-java", - ], -) - -target( - dependencies = [ - ":feature-update-reader", - ":trends-data-service", - "src/java/com/twitter/search/common/relevance/features", - ], -) - -java_library( - name = "config", - sources = ["config/**/*.java"], - platform = "java8", - provides = artifact( - org = "com.twitter.search.common.relevance", - name = "config", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "src/java/com/twitter/search/common/config", - "src/resources/com/twitter/search/common/relevance/config", - ], -) - -java_library( - name = "classifiers", - sources = ["classifiers/**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":config", - ":entities_and_filters", - ":trends-data-service", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/transformer", - "src/java/com/twitter/common_internal/text:text-penguin7", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/util/io/periodic", - "src/java/com/twitter/search/common/util/text", - "twitter-text/lib/java/src/main/java/com/twitter/twittertext", - ], -) - -java_library( - name = "text", - sources = ["text/**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":entities_and_filters", - "3rdparty/jvm/com/google/guava", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:char-seq-util", - "src/java/com/twitter/common_internal/text:text-penguin7", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/text/regex", - "src/thrift/com/twitter/search/common:indexing-java", - ], -) - -java_library( - name = "scorers", - sources = ["scorers/**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":classifiers", - ":config", - ":entities_and_filters", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - ], -) - -java_library( - name = "entities_and_filters", - sources = [ - "entities/**/*.java", - "filters/**/*.java", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/org/apache/commons:commons-lang3", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/slf4j:slf4j-api", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "decider/src/main/scala", - "src/java/com/twitter/common/text/extractor", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common/text/pipeline", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/transformer", - "src/java/com/twitter/common_internal/text:text-penguin7", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/text", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/tweetypie:tweet-java", - "util/util-core:scala", - ], -) - -java_library( - name = "scores", - sources = ["scores/**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - ], -) diff --git a/src/java/com/twitter/search/common/relevance/BUILD.docx b/src/java/com/twitter/search/common/relevance/BUILD.docx new file mode 100644 index 000000000..22b4d7cf4 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/relevance/NGramCache.docx b/src/java/com/twitter/search/common/relevance/NGramCache.docx new file mode 100644 index 000000000..7100d6faf Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/NGramCache.docx differ diff --git a/src/java/com/twitter/search/common/relevance/NGramCache.java b/src/java/com/twitter/search/common/relevance/NGramCache.java deleted file mode 100644 index 41a3478bd..000000000 --- a/src/java/com/twitter/search/common/relevance/NGramCache.java +++ /dev/null @@ -1,152 +0,0 @@ -package com.twitter.search.common.relevance; - -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.cache.CacheBuilder; -import com.google.common.collect.ImmutableList; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.penguin.search.filter.StringMatchFilter; -import com.twitter.util.Duration; - -/** - * the Cache for Trends - */ -public class NGramCache { - private static final int DEFAULT_MAX_CACHE_SIZE = 5000; - private static final long DEFAULT_CACHE_ITEM_TTL_SEC = 24 * 3600; // 1 day - - private final PenguinVersion penguinVersion; - - // Keys are trends. Values are empty strings. - private final Map trendsCache; - - private volatile StringMatchFilter trendsMatcher = null; - - /** - * Extract Trends from a list of normalized tokens - */ - public List extractTrendsFromNormalized(List tokens) { - if (trendsMatcher == null) { - return Collections.emptyList(); - } - - ImmutableList.Builder trends = ImmutableList.builder(); - for (String trend : trendsMatcher.extractNormalized(tokens)) { - if (trendsCache.containsKey(trend)) { - trends.add(trend); - } - } - - return trends.build(); - } - - /** - * Extract Trends from a list of tokens - */ - public List extractTrendsFrom(List tokens, Locale language) { - if (trendsMatcher == null) { - return Collections.emptyList(); - } - return trendsMatcher.extract(language, tokens); - } - - /** - * Extract Trends from a given CharSequence - */ - public List extractTrendsFrom(CharSequence text, Locale language) { - if (trendsMatcher == null) { - return Collections.emptyList(); - } - - ImmutableList.Builder trends = ImmutableList.builder(); - for (String trend : trendsMatcher.extract(language, text)) { - if (trendsCache.containsKey(trend)) { - trends.add(trend); - } - } - - return trends.build(); - } - - public long numTrendingTerms() { - return trendsCache.size(); - } - - public Set getTrends() { - return trendsCache.keySet(); - } - - public void clear() { - trendsCache.clear(); - trendsMatcher = null; - } - - /** Adds all trends to this NGramCache. */ - public void addAll(Iterable trends) { - for (String trend : trends) { - trendsCache.put(trend, ""); - } - - trendsMatcher = new StringMatchFilter(trendsCache.keySet(), penguinVersion); - } - - public static Builder builder() { - return new Builder(); - } - - public static class Builder { - private int maxCacheSize = DEFAULT_MAX_CACHE_SIZE; - private long cacheItemTTLSecs = DEFAULT_CACHE_ITEM_TTL_SEC; // 1 day - private PenguinVersion penguinVersion = PenguinVersion.PENGUIN_4; - - public Builder maxCacheSize(int cacheSize) { - this.maxCacheSize = cacheSize; - return this; - } - - public Builder cacheItemTTL(long cacheItemTTL) { - this.cacheItemTTLSecs = cacheItemTTL; - return this; - } - - public Builder penguinVersion(PenguinVersion newPenguinVersion) { - this.penguinVersion = Preconditions.checkNotNull(newPenguinVersion); - return this; - } - - /** Builds an NGramCache instance. */ - public NGramCache build() { - return new NGramCache( - maxCacheSize, - Duration.apply(cacheItemTTLSecs, TimeUnit.SECONDS), - penguinVersion); - } - } - - // Should be used only in tests that want to mock out this class. - @VisibleForTesting - public NGramCache() { - this(DEFAULT_MAX_CACHE_SIZE, - Duration.apply(DEFAULT_CACHE_ITEM_TTL_SEC, TimeUnit.SECONDS), - PenguinVersion.PENGUIN_4); - } - - private NGramCache(int maxCacheSize, Duration cacheItemTTL, PenguinVersion penguinVersion) { - // we only have 1 refresher thread that writes to the cache - this.trendsCache = CacheBuilder.newBuilder() - .concurrencyLevel(1) - .expireAfterWrite(cacheItemTTL.inSeconds(), TimeUnit.SECONDS) - .maximumSize(maxCacheSize) - .build() - .asMap(); - this.penguinVersion = penguinVersion; - } -} diff --git a/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.docx b/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.docx new file mode 100644 index 000000000..b1d8d9ea7 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.docx differ diff --git a/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.java b/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.java deleted file mode 100644 index 62bbd9890..000000000 --- a/src/java/com/twitter/search/common/relevance/TrendsThriftDataServiceManager.java +++ /dev/null @@ -1,353 +0,0 @@ -package com.twitter.search.common.relevance; - -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import java.util.stream.Collectors; - -import scala.runtime.BoxedUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Sets; -import com.google.common.util.concurrent.ThreadFactoryBuilder; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.Service; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.builder.ClientConfig; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.finagle.mtls.client.MtlsClientBuilder; -import com.twitter.finagle.stats.DefaultStatsReceiver; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.search.common.metrics.RelevanceStats; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.trends.plus.Module; -import com.twitter.trends.plus.TrendsPlusRequest; -import com.twitter.trends.plus.TrendsPlusResponse; -import com.twitter.trends.service.gen.Location; -import com.twitter.trends.trending_content.thriftjava.TrendingContentService; -import com.twitter.trends.trends_metadata.thriftjava.TrendsMetadataService; -import com.twitter.util.Duration; -import com.twitter.util.Future; -import com.twitter.util.Try; - -/** - * Manages trends data retrieved from trends thrift API and perform automatic refresh. - */ -public final class TrendsThriftDataServiceManager { - private static final Logger LOG = - LoggerFactory.getLogger(TrendsThriftDataServiceManager.class.getName()); - - private static final int DEFAULT_TIME_TO_KILL_SEC = 60; - - @VisibleForTesting - protected static final Map DEFAULT_TRENDS_PARAMS_MAP = ImmutableMap.of( - "MAX_ITEMS_TO_RETURN", "10"); // we only take top 10 for each woeid. - - @VisibleForTesting - protected static final int MAX_TRENDS_PER_WOEID = 10; - - private final Duration requestTimeout; - private final Duration refreshDelayDuration; - private final Duration reloadIntervalDuration; - private final int numRetries; - - // a list of trends cache we want to update - private final List trendsCacheList; - - private final SearchCounter getAvailableSuccessCounter = - RelevanceStats.exportLong("trends_extractor_get_available_success"); - private final SearchCounter getAvailableFailureCounter = - RelevanceStats.exportLong("trends_extractor_get_available_failure"); - private final SearchCounter getTrendsSuccessCounter = - RelevanceStats.exportLong("trends_extractor_success_fetch"); - private final SearchCounter getTrendsFailureCounter = - RelevanceStats.exportLong("trends_extractor_failed_fetch"); - private final SearchCounter updateFailureCounter = - RelevanceStats.exportLong("trends_extractor_failed_update"); - - private final ServiceIdentifier serviceIdentifier; - private ScheduledExecutorService scheduler; - - - @VisibleForTesting - protected Service contentService; - protected TrendingContentService.ServiceToClient contentClient; - protected Service metadataService; - protected TrendsMetadataService.ServiceToClient metadataClient; - - @VisibleForTesting - protected TrendsUpdater trendsUpdater; - - /** - * Returns an instance of TrendsThriftDataServiceManager. - * @param serviceIdentifier The service that wants to call - * into Trend's services. - * @param numRetries The number of retries in the event of - * request failures. - * @param requestTimeout The amount of time we wait before we consider a - * a request as failed. - * @param initTrendsCacheDelay How long to wait before the initial - * filling of the Trends cache in milliseconds. - * @param reloadInterval How often to refresh the cache with updated trends. - * @param trendsCacheList The cache of trends. - * @return An instance of TrendsThriftDataServiceManager configured - * with respect to the params provided. - */ - public static TrendsThriftDataServiceManager newInstance( - ServiceIdentifier serviceIdentifier, - int numRetries, - Duration requestTimeout, - Duration initTrendsCacheDelay, - Duration reloadInterval, - List trendsCacheList) { - return new TrendsThriftDataServiceManager( - serviceIdentifier, - numRetries, - requestTimeout, - initTrendsCacheDelay, - reloadInterval, - trendsCacheList); - } - - /** - * Resume auto refresh. Always called in constructor. Can be invoked after a - * stopAuthRefresh call to resume auto refreshing. Invoking it after shutDown is undefined. - */ - public synchronized void startAutoRefresh() { - if (scheduler == null) { - scheduler = Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder().setDaemon(true).setNameFormat( - "trends-data-refresher[%d]").build()); - scheduler.scheduleAtFixedRate( - trendsUpdater, - refreshDelayDuration.inSeconds(), - reloadIntervalDuration.inSeconds(), - TimeUnit.SECONDS); - } - } - - /** - * Stop auto refresh. Wait for the current execution thread to finish. - * This is a blocking call. - */ - public synchronized void stopAutoRefresh() { - if (scheduler != null) { - scheduler.shutdown(); // Disable new tasks from being submitted - try { - // Wait a while for existing tasks to terminate - if (!scheduler.awaitTermination(DEFAULT_TIME_TO_KILL_SEC, TimeUnit.SECONDS)) { - scheduler.shutdownNow(); // Cancel currently executing tasks - // Wait a while for tasks to respond to being cancelled - if (!scheduler.awaitTermination(DEFAULT_TIME_TO_KILL_SEC, TimeUnit.SECONDS)) { - LOG.info("Executor thread pool did not terminate."); - } - } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - scheduler.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); - } - scheduler = null; - } - } - - /** Shuts down the manager. */ - public void shutDown() { - stopAutoRefresh(); - // clear the cache - for (NGramCache cache : trendsCacheList) { - cache.clear(); - } - - if (contentService != null) { - contentService.close(); - } - - if (metadataService != null) { - metadataService.close(); - } - } - - private TrendsThriftDataServiceManager( - ServiceIdentifier serviceIdentifier, - int numRetries, - Duration requestTimeoutMS, - Duration refreshDelayDuration, - Duration reloadIntervalDuration, - List trendsCacheList) { - this.numRetries = numRetries; - this.requestTimeout = requestTimeoutMS; - this.refreshDelayDuration = refreshDelayDuration; - this.reloadIntervalDuration = reloadIntervalDuration; - this.serviceIdentifier = serviceIdentifier; - this.trendsCacheList = Preconditions.checkNotNull(trendsCacheList); - trendsUpdater = new TrendsUpdater(); - metadataService = buildMetadataService(); - metadataClient = buildMetadataClient(metadataService); - contentService = buildContentService(); - contentClient = buildContentClient(contentService); - } - - @VisibleForTesting - protected Service buildContentService() { - ClientBuilder< - ThriftClientRequest, - byte[], ClientConfig.Yes, - ClientConfig.Yes, - ClientConfig.Yes - > - builder = ClientBuilder.get() - .stack(ThriftMux.client()) - .name("trends_thrift_data_service_manager_content") - .dest("") - .retries(numRetries) - .reportTo(DefaultStatsReceiver.get()) - .tcpConnectTimeout(requestTimeout) - .requestTimeout(requestTimeout); - ClientBuilder mtlsBuilder = - new MtlsClientBuilder.MtlsClientBuilderSyntax<>(builder).mutualTls(serviceIdentifier); - - return ClientBuilder.safeBuild(mtlsBuilder); - } - - @VisibleForTesting - protected TrendingContentService.ServiceToClient buildContentClient( - Service service) { - return new TrendingContentService.ServiceToClient(service); - } - - @VisibleForTesting - protected Service buildMetadataService() { - ClientBuilder< - ThriftClientRequest, - byte[], - ClientConfig.Yes, - ClientConfig.Yes, - ClientConfig.Yes - > - builder = ClientBuilder.get() - .stack(ThriftMux.client()) - .name("trends_thrift_data_service_manager_metadata") - .dest("") - .retries(numRetries) - .reportTo(DefaultStatsReceiver.get()) - .tcpConnectTimeout(requestTimeout) - .requestTimeout(requestTimeout); - ClientBuilder mtlsBuilder = - new MtlsClientBuilder.MtlsClientBuilderSyntax<>(builder).mutualTls(serviceIdentifier); - - return ClientBuilder.safeBuild(mtlsBuilder); - } - - @VisibleForTesting - protected TrendsMetadataService.ServiceToClient buildMetadataClient( - Service service) { - return new TrendsMetadataService.ServiceToClient(service); - } - - /** - * Updater that fetches available woeids and corresponding trending terms. - */ - @VisibleForTesting - protected class TrendsUpdater implements Runnable { - @Override - public void run() { - populateCacheFromTrendsService(); - } - - private Future populateCacheFromTrendsService() { - long startTime = System.currentTimeMillis(); - AtomicLong numTrendsReceived = new AtomicLong(0); - return metadataClient.getAvailable().flatMap(locations -> { - if (locations == null) { - getAvailableFailureCounter.increment(); - LOG.warn("Failed to get woeids from trends."); - return Future.value(BoxedUnit.UNIT); - } - getAvailableSuccessCounter.increment(); - return populateCacheFromTrendLocations(locations, numTrendsReceived); - }).onFailure(throwable -> { - LOG.info("Update failed", throwable); - updateFailureCounter.increment(); - return BoxedUnit.UNIT; - }).ensure(() -> { - logRefreshStatus(startTime, numTrendsReceived); - return BoxedUnit.UNIT; - }); - } - - private Future populateCacheFromTrendLocations( - List locations, - AtomicLong numTrendsReceived) { - List> trendsPlusFutures = locations.stream() - .map(location -> makeTrendsPlusRequest(location)) - .collect(Collectors.toList()); - - Future>> trendsPlusFuture = - Future.collectToTry(trendsPlusFutures); - return trendsPlusFuture.map(tryResponses -> { - populateCacheFromResponses(tryResponses, numTrendsReceived); - return BoxedUnit.UNIT; - }); - } - - private Future makeTrendsPlusRequest(Location location) { - TrendsPlusRequest request = new TrendsPlusRequest() - .setWoeid(location.getWoeid()) - .setMaxTrends(MAX_TRENDS_PER_WOEID); - long startTime = System.currentTimeMillis(); - return contentClient.getTrendsPlus(request) - .onSuccess(response -> { - getTrendsSuccessCounter.increment(); - return BoxedUnit.UNIT; - }).onFailure(throwable -> { - getTrendsFailureCounter.increment(); - return BoxedUnit.UNIT; - }); - } - - private void populateCacheFromResponses( - List> tryResponses, - AtomicLong numTrendsReceived) { - Set trendStrings = Sets.newHashSet(); - - for (Try tryResponse : tryResponses) { - if (tryResponse.isThrow()) { - LOG.warn("Failed to fetch trends:" + tryResponse.toString()); - continue; - } - - TrendsPlusResponse trendsPlusResponse = tryResponse.get(); - numTrendsReceived.addAndGet(trendsPlusResponse.modules.size()); - for (Module module : trendsPlusResponse.modules) { - trendStrings.add(module.getTrend().name); - } - } - - for (NGramCache cache : trendsCacheList) { - cache.addAll(trendStrings); - } - } - } - - private void logRefreshStatus(long startTime, AtomicLong numTrendsReceived) { - LOG.info(String.format("Refresh done in [%dms] :\nfetchSuccess[%d] fetchFailure[%d] " - + "updateFailure[%d] num trends received [%d]", - System.currentTimeMillis() - startTime, - getTrendsSuccessCounter.get(), - getTrendsFailureCounter.get(), - updateFailureCounter.get(), - numTrendsReceived.get())); - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.docx new file mode 100644 index 000000000..10603a143 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.java deleted file mode 100644 index 16210eec8..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetClassifier.java +++ /dev/null @@ -1,118 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.relevance.entities.TwitterMessage; - -/** - * Interface to perform feature classification for a single - * @TwitterMessage object, or a group of them. - * - * Classification includes two steps: feature extraction, and - * quality evaluation. During feature extraction, any interesting - * feature that is deemed useful for subsequent quality analysis - * is extracted from the @TwitterMessage object. Quality evaluation - * is then done by a group of @TweetEvaluator objects associated - * with the classifier, by using the various features extracted in the - * previous step. - * - * Feature extraction and quality evaluation results are stored in - * @TweetFeatures field of the @TwitterMessage object, which is defined - * in src/main/thrift/classifier.thrift. - */ -public abstract class TweetClassifier { - /** - * A list of TweetQualityEvaluators which are invoked after - * feature extraction is done. If null, no quality evaluation - * is done. - */ - protected Iterable qualityEvaluators = null; - - /** - * Passed in TwitterMessage is examined and any extractable - * features are saved in TweetFeatures field of TwitterMessage. - * Then TweetQualityEvaluators are applied to compute various - * quality values. - * - * @param tweet TwitterMessage to perform classification on. - */ - public void classifyTweet(final TwitterMessage tweet) { - Preconditions.checkNotNull(tweet); - - // extract features - extractFeatures(tweet); - - // compute quality - evaluate(tweet); - } - - /** - * Classify a group of TwitterMessages and store features in their corresponding - * TweetFeatures fields. - * - * This default implementation just iterates through the map and classifies each - * individual tweet. Batching for better performance, if applicable, can be implemented by - * concrete subclasses. - * - * @param tweets TwitterMessages to perform classification on. - */ - public void classifyTweets(final Iterable tweets) { - extractFeatures(tweets); - evaluate(tweets); - } - - /** - * Use the specified list of TweetQualityEvaluators for this classifier. - * - * @param evaluators list of TweetQualityEvaluators to be used with this classifier. - */ - protected void setQualityEvaluators(final Iterable qualityEvaluators) { - Preconditions.checkNotNull(qualityEvaluators); - this.qualityEvaluators = qualityEvaluators; - } - - - /** - * Extract interesting features from a single TwitterMessage for classification. - * - * @param tweet TwitterMessage to extract interesting features for - */ - protected abstract void extractFeatures(final TwitterMessage tweet); - - /** - * Extract interesting features from a list of TwitterMessages for classification. - * @param tweets list of TwitterMessages to extract interesting features for - */ - protected void extractFeatures(final Iterable tweets) { - for (TwitterMessage tweet: tweets) { - extractFeatures(tweet); - } - } - - /** - * Given a TwitterMessage which already has its features extracted, - * perform quality evaluation. - * - * @param tweet TwitterMessage to perform quality evaluation for - */ - protected void evaluate(final TwitterMessage tweet) { - if (qualityEvaluators == null) { - return; - } - for (TweetEvaluator evaluator : qualityEvaluators) { - evaluator.evaluate(tweet); - } - } - - /** - * Given a list of TwitterMessages which already have their features extracted, - * perform quality evaluation. - * - * @param tweets list of TwitterMessages to perform quality evaluation for - */ - protected void evaluate(final Iterable tweets) { - for (TwitterMessage tweet: tweets) { - evaluate(tweet); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.docx new file mode 100644 index 000000000..becdf8103 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.java deleted file mode 100644 index e582e97d9..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetEvaluator.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.relevance.entities.TwitterMessage; - -/** - * Interface to perform quality evaluation for a single @TwitterMessage - * object or a group of them. - * - */ -public abstract class TweetEvaluator { - /** - * Passed in TwitterMessage is examined and any extractable - * features are stored in TweetFeatures field of TwitterMessage. - * - * @param tweet TwitterMessage to perform classification on. - */ - public abstract void evaluate(final TwitterMessage tweet); - - /** - * Classify a group of TwitterMessages and store the features in their corresponding - * TweetFeatures fields. - * - * This default implementation just iterates through the map and classifies each - * individual tweet. Batching for better performance, if applicable, can be implemented by - * concrete subclasses. - * - * @param tweets TwitterMessages to perform classification on. - */ - public void evaluate(final Iterable tweets) { - Preconditions.checkNotNull(tweets); - for (TwitterMessage tweet: tweets) { - evaluate(tweet); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.docx new file mode 100644 index 000000000..6983201ee Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java deleted file mode 100644 index 2de2bc3b5..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetOffensiveEvaluator.java +++ /dev/null @@ -1,260 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import java.io.File; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.atomic.AtomicReference; - -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.io.ByteSource; -import com.google.common.util.concurrent.ThreadFactoryBuilder; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.language.LocaleUtil; -import com.twitter.common.text.token.TokenizedCharSequence; -import com.twitter.common.text.token.attribute.TokenType; -import com.twitter.common.util.Clock; -import com.twitter.common_internal.text.pipeline.TwitterNgramGenerator; -import com.twitter.common_internal.text.topic.BlacklistedTopics; -import com.twitter.common_internal.text.topic.BlacklistedTopics.FilterMode; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.metrics.RelevanceStats; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.relevance.features.TweetTextQuality; -import com.twitter.search.common.util.io.periodic.PeriodicFileLoader; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.common.util.text.TokenizerHelper; - -/** - * Determines if tweet text or username contains potentially offensive language. - */ -public class TweetOffensiveEvaluator extends TweetEvaluator { - private static final Logger LOG = LoggerFactory.getLogger(TweetOffensiveEvaluator.class); - - private static final int MAX_OFFENSIVE_TERMS = 2; - - private final File filterDirectory; - private static final File DEFAULT_FILTER_DIR = new File(""); - private static final String ADULT_TOKEN_FILE_NAME = "adult_tokens.txt"; - private static final String OFFENSIVE_TOPIC_FILE_NAME = "offensive_topics.txt"; - private static final String OFFENSIVE_SUBSTRING_FILE_NAME = "offensive_substrings.txt"; - - private static final ThreadLocal NGRAM_GENERATOR_HOLDER = - new ThreadLocal() { - @Override - protected TwitterNgramGenerator initialValue() { - // It'll generate ngrams from TokenizedCharSequence, which contains tokenization results, - // so it doesn't matter which Penguin version to use here. - return new TwitterNgramGenerator.Builder(PenguinVersion.PENGUIN_6) - .setSize(1, MAX_OFFENSIVE_TERMS) - .build(); - } - }; - - private final AtomicReference offensiveTopics = - new AtomicReference<>(); - private final AtomicReference offensiveUsersTopics = - new AtomicReference<>(); - - private final AtomicReference adultTokenFileContents = new AtomicReference<>(); - private final AtomicReference offensiveTokenFileContents = new AtomicReference<>(); - private final AtomicReference offensiveSubstringFileContents = new - AtomicReference<>(); - - private final SearchCounter sensitiveTextCounter = - RelevanceStats.exportLong("num_sensitive_text"); - - public TweetOffensiveEvaluator() { - this(DEFAULT_FILTER_DIR); - } - - public TweetOffensiveEvaluator( - File filterDirectory - ) { - this.filterDirectory = filterDirectory; - adultTokenFileContents.set(BlacklistedTopics.getResource( - BlacklistedTopics.DATA_PREFIX + ADULT_TOKEN_FILE_NAME)); - offensiveTokenFileContents.set(BlacklistedTopics.getResource( - BlacklistedTopics.DATA_PREFIX + OFFENSIVE_TOPIC_FILE_NAME)); - offensiveSubstringFileContents.set(BlacklistedTopics.getResource( - BlacklistedTopics.DATA_PREFIX + OFFENSIVE_SUBSTRING_FILE_NAME)); - - try { - rebuildBlacklistedTopics(); - } catch (IOException e) { - throw new RuntimeException(e); - } - - ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setNameFormat("offensive-evaluator-blacklist-reloader") - .setDaemon(true) - .build()); - initPeriodicFileLoader(adultTokenFileContents, ADULT_TOKEN_FILE_NAME, executor); - initPeriodicFileLoader(offensiveTokenFileContents, OFFENSIVE_TOPIC_FILE_NAME, executor); - initPeriodicFileLoader(offensiveSubstringFileContents, OFFENSIVE_SUBSTRING_FILE_NAME, executor); - } - - private void initPeriodicFileLoader( - AtomicReference byteSource, - String fileName, - ScheduledExecutorService executor) { - File file = new File(filterDirectory, fileName); - try { - PeriodicFileLoader loader = new PeriodicFileLoader( - "offensive-evaluator-" + fileName, - file.getPath(), - executor, - Clock.SYSTEM_CLOCK) { - @Override - protected void accept(InputStream stream) throws IOException { - byteSource.set(ByteSource.wrap(IOUtils.toByteArray(stream))); - rebuildBlacklistedTopics(); - } - }; - loader.init(); - } catch (Exception e) { - // Not the end of the world if we couldn't load the file, we already loaded the resource. - LOG.error("Could not load offensive topic filter " + fileName + " from ConfigBus", e); - } - } - - private void rebuildBlacklistedTopics() throws IOException { - offensiveTopics.set(new BlacklistedTopics.Builder(false) - .loadFilterFromSource(adultTokenFileContents.get(), FilterMode.EXACT) - .loadFilterFromSource(offensiveSubstringFileContents.get(), FilterMode.SUBSTRING) - .build()); - - offensiveUsersTopics.set(new BlacklistedTopics.Builder(false) - .loadFilterFromSource(offensiveTokenFileContents.get(), FilterMode.EXACT) - .loadFilterFromSource(offensiveSubstringFileContents.get(), FilterMode.SUBSTRING) - .build()); - } - - @Override - public void evaluate(final TwitterMessage tweet) { - BlacklistedTopics offensiveFilter = this.offensiveTopics.get(); - BlacklistedTopics offensiveUsersFilter = this.offensiveUsersTopics.get(); - - if (offensiveFilter == null || offensiveUsersFilter == null) { - return; - } - - if (tweet.isSensitiveContent()) { - sensitiveTextCounter.increment(); - } - - // Check for user name. - Preconditions.checkState(tweet.getFromUserScreenName().isPresent(), - "Missing from-user screen name"); - - for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) { - TweetTextQuality textQuality = tweet.getTweetTextQuality(penguinVersion); - - if (tweet.isSensitiveContent()) { - textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.SENSITIVE); - } - - // Check if username has an offensive term - if (isUserNameOffensive( - tweet.getFromUserScreenName().get(), offensiveUsersFilter, penguinVersion)) { - SearchRateCounter offensiveUserCounter = RelevanceStats.exportRate( - "num_offensive_user_" + penguinVersion.name().toLowerCase()); - offensiveUserCounter.increment(); - textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE_USER); - } - - // Check if tweet has an offensive term - if (isTweetOffensive(tweet, offensiveFilter, penguinVersion)) { - SearchRateCounter offensiveTextCounter = RelevanceStats.exportRate( - "num_offensive_text_" + penguinVersion.name().toLowerCase()); - offensiveTextCounter.increment(); - textQuality.addBoolQuality(TweetTextQuality.BooleanQualityType.OFFENSIVE); - } - } - } - - private boolean isUserNameOffensive(String userName, - BlacklistedTopics offensiveUsersFilter, - PenguinVersion penguinVersion) { - String normalizedUserName = NormalizerHelper.normalizeKeepCase( - userName, LocaleUtil.UNKNOWN, penguinVersion); - List termsToCheck = new ArrayList(TokenizerHelper.getSubtokens(normalizedUserName)); - termsToCheck.add(normalizedUserName.toLowerCase()); - - for (String userNameToken : termsToCheck) { - if (!StringUtils.isBlank(userNameToken) && offensiveUsersFilter.filter(userNameToken)) { - return true; - } - } - return false; - } - - private boolean isTweetOffensive(final TwitterMessage tweet, - BlacklistedTopics offensiveFilter, - PenguinVersion penguinVersion) { - TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); - - boolean tweetHasOffensiveTerm = false; - - // Check for tweet text. - List ngrams = - NGRAM_GENERATOR_HOLDER.get().generateNgramsAsTokenizedCharSequence( - textFeatures.getTokenSequence(), tweet.getLocale()); - for (TokenizedCharSequence ngram : ngrams) { - // skip URL ngram - if (!ngram.getTokensOf(TokenType.URL).isEmpty()) { - continue; - } - String ngramStr = ngram.toString(); - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - - // Due to some strangeness in Penguin, we don't get ngrams for tokens around "\n-" or "-\n" - // in the original string, this made us miss some offensive words this way. Here we do another - // pass of check using just the tokens generated by the tokenizer. (See SEARCHQUAL-8907) - if (!tweetHasOffensiveTerm) { - for (String ngramStr : textFeatures.getTokens()) { - // skip URLs - if (ngramStr.startsWith("http://") || ngramStr.startsWith("https://")) { - continue; - } - if (!StringUtils.isBlank(ngramStr) && offensiveFilter.filter(ngramStr)) { - tweetHasOffensiveTerm = true; - break; - } - } - } - - if (!tweetHasOffensiveTerm) { - // check for resolved URLs - String resolvedUrlsText = - Joiner.on(" ").skipNulls().join(textFeatures.getResolvedUrlTokens()); - List ngramStrs = NGRAM_GENERATOR_HOLDER.get().generateNgramsAsString( - resolvedUrlsText, LocaleUtil.UNKNOWN); - for (String ngram : ngramStrs) { - if (!StringUtils.isBlank(ngram) && offensiveFilter.filter(ngram)) { - tweetHasOffensiveTerm = true; - break; - } - } - } - - return tweetHasOffensiveTerm; - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.docx new file mode 100644 index 000000000..e64cdb356 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.java deleted file mode 100644 index 5aefd9cb8..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetQualityFeatureExtractor.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import java.io.IOException; -import java.util.Set; - -import com.google.common.base.Preconditions; - -import com.twitter.common.text.transformer.RegexTransformer; -import com.twitter.common.text.transformer.RtRemovalTransformer; -import com.twitter.common.text.transformer.Transformer; -import com.twitter.common.text.transformer.TransformerChain; -import com.twitter.common_internal.text.duplicate.RandomSubstringExtractor; -import com.twitter.common_internal.text.duplicate.SignatureGenerator; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.twittertext.Regex; - -/** - * Given a tweet text, extract useful text features. - */ -public class TweetQualityFeatureExtractor { - private static final Transformer STATUS_TEXT_CLEANER = - TransformerChain.of( - // remove @reply as defined in twitter-text - new RegexTransformer.Builder() - .setRegexPattern(Regex.VALID_REPLY) - .setReplaceString("") - .setTriggeringChar('@') - .build(), - // remove the old style retweet, eg RT: @mention or via @mention - new RtRemovalTransformer() - ); - - // for signature generation - private static final int MIN_NUM_FEATURES = 2; - private final SignatureGenerator signatureGenerator = new SignatureGenerator( - new RandomSubstringExtractor( - TweetIntegerShingleSignature.NUM_SHINGLES, // number of signatures - MIN_NUM_FEATURES, // each signature is generated by taking this number of features/tokens - // from text - false, // do not consider full tweet text as a feature - false)); // do not do early termination - - /** - * Given TwitterMessage, extract all interesting tweet text features and store in - * the returned TweetTextFeatures object. - * - * @param tweet TwitterMessage to extract features from - * @throws IOException - */ - public void extractTweetTextFeatures(final TwitterMessage tweet) { - Preconditions.checkNotNull(tweet); - - for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) { - // Get basic features. - TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); - - extractCharLength(textFeatures); - - // Signature that hashes on text with resolved urls, aggressively remove RT tags, which - // accounts for more than 50% of neardups, also remove @mentions. - // we use resolved urls for signature since they are what matters. - CharSequence strippedText = tweet.getTextReplacedWithResolvedURLs(); - strippedText = strippedText == null ? "" : strippedText; - strippedText = STATUS_TEXT_CLEANER.transform(strippedText); - - // Generate the signature. - // will lower case, use penguin - String normalizedSignatureText = - NormalizerHelper.normalize(strippedText, tweet.getLocale(), penguinVersion); - if (normalizedSignatureText != null && !normalizedSignatureText.isEmpty()) { - Set rawSignature = - signatureGenerator.generateSignatureByteArray(normalizedSignatureText); - textFeatures.setSignature((new TweetIntegerShingleSignature(rawSignature)).serialize()); - } - } - } - - /** - * Compute number of letters in stripped tweet text, also records unsupported char counts. - * - * @param textFeatures TweetTextFeatures object to store letter length, unsupported chars, etc. - */ - private static void extractCharLength(final TweetTextFeatures textFeatures) { - Preconditions.checkNotNull(textFeatures); - int length = 0; - int caps = 0; - String strippedText = textFeatures.getNormalizedStrippedText(); - if (strippedText != null && !strippedText.isEmpty()) { - for (char c : strippedText.toCharArray()) { - if (Character.isLetter(c)) { - length++; - if (Character.isUpperCase(c)) { - caps++; - } - } - } - } - textFeatures.setLength(length); - textFeatures.setCaps(caps); - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.docx new file mode 100644 index 000000000..54caf26b2 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.java deleted file mode 100644 index d45d18e11..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextClassifier.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import java.util.List; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.relevance.config.TweetProcessingConfig; -import com.twitter.search.common.relevance.entities.TwitterMessage; - -/** - * Classifier that focuses on tweet text features and their corresponding - * quality. - */ -public class TweetTextClassifier extends TweetClassifier { - private TweetQualityFeatureExtractor featureExtractor = new TweetQualityFeatureExtractor(); - private TweetTrendsExtractor trendsExtractor = null; - - /** - * Constructor. Requires a list of TweetQualityEvaluator objects. - * @param evaluators list of TweetQualityEvaluator objects responsible for quality evaluation. - * @param serviceIdentifier The identifier of the calling service. - * @param supportedPenguinVersions A list of supported penguin versions. - */ - public TweetTextClassifier( - final Iterable evaluators, - ServiceIdentifier serviceIdentifier, - List supportedPenguinVersions) { - Preconditions.checkNotNull(evaluators); - setQualityEvaluators(evaluators); - TweetProcessingConfig.init(); - - if (TweetProcessingConfig.getBool("extract_trends", false)) { - trendsExtractor = new TweetTrendsExtractor(serviceIdentifier, supportedPenguinVersions); - } - } - - /** - * Extract text features for the specified TwitterMessage. - * - * @param tweet TwitterMessage to extract features from. - */ - @Override - protected void extractFeatures(TwitterMessage tweet) { - extractFeatures(Lists.newArrayList(tweet)); - } - - /** - * Extract text features for the specified list of TwitterMessages. - * - * @param tweets list of TwitterMessages to extract interesting features for - */ - @Override - protected void extractFeatures(Iterable tweets) { - Preconditions.checkNotNull(tweets); - for (TwitterMessage tweet : tweets) { - featureExtractor.extractTweetTextFeatures(tweet); - } - - // Optionally try to annotate trends for all the tweets. - if (TweetProcessingConfig.getBool("extract_trends", false) && trendsExtractor != null) { - trendsExtractor.extractTrends(tweets); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.docx new file mode 100644 index 000000000..ced2ebd74 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.java deleted file mode 100644 index db70c6c29..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetTextEvaluator.java +++ /dev/null @@ -1,54 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import java.util.List; -import java.util.Map; -import java.util.function.Function; -import java.util.stream.Collectors; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.relevance.features.TweetTextQuality; - -/** - * Calculates entropy of tweet text based on tokens. - */ -public class TweetTextEvaluator extends TweetEvaluator { - - @Override - public void evaluate(final TwitterMessage tweet) { - for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) { - TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); - TweetTextQuality textQuality = tweet.getTweetTextQuality(penguinVersion); - - double readability = 0; - int numKeptWords = textFeatures.getStrippedTokensSize(); - for (String token : textFeatures.getStrippedTokens()) { - readability += token.length(); - } - if (numKeptWords > 0) { - readability = readability * Math.log(numKeptWords) / numKeptWords; - } - textQuality.setReadability(readability); - textQuality.setEntropy(entropy(textFeatures.getStrippedTokens())); - textQuality.setShout(textFeatures.getCaps() / Math.max(textFeatures.getLength(), 1.0d)); - } - } - - private static double entropy(List tokens) { - Map tokenCounts = - tokens.stream().collect(Collectors.groupingBy(Function.identity(), Collectors.counting())); - int numItems = tokens.size(); - - double entropy = 0; - for (long count : tokenCounts.values()) { - double prob = (double) count / numItems; - entropy -= prob * log2(prob); - } - return entropy; - } - - private static double log2(double n) { - return Math.log(n) / Math.log(2); - } -} diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.docx b/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.docx new file mode 100644 index 000000000..57ca9e558 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.docx differ diff --git a/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.java b/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.java deleted file mode 100644 index a600c1697..000000000 --- a/src/java/com/twitter/search/common/relevance/classifiers/TweetTrendsExtractor.java +++ /dev/null @@ -1,165 +0,0 @@ -package com.twitter.search.common.relevance.classifiers; - -import java.util.List; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; -import com.twitter.search.common.metrics.RelevanceStats; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.relevance.NGramCache; -import com.twitter.search.common.relevance.TrendsThriftDataServiceManager; -import com.twitter.search.common.relevance.config.TweetProcessingConfig; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.util.Duration; - -/** - * Determines if tweets contains trending terms. - * Sets corresponding bits and fields to TweetTextFeatures. - */ -public class TweetTrendsExtractor { - - // The amount of time before filling the trends cache for the first time. - private static final long INIT_TRENDS_CACHE_DELAY = 0; - - private static final Logger LOG = LoggerFactory.getLogger(TweetTrendsExtractor.class.getName()); - - private static final int LOGGING_INTERVAL = 100000; - - // Singleton trends data service. This is the default service used unless a different - // instance is injected in the constructor. - private static volatile TrendsThriftDataServiceManager trendsDataServiceSingleton; - - // trends cache used for extracting trends from tweets - private static volatile ImmutableMap trendsCaches; - - private static synchronized void initTrendsDataServiceInstance( - ServiceIdentifier serviceIdentifier, - List supportedPenguinVersions) { - if (trendsDataServiceSingleton == null) { - TweetProcessingConfig.init(); - if (trendsCaches == null) { - ImmutableMap.Builder trendsCachesBuilder = - ImmutableMap.builder(); - for (PenguinVersion penguinVersion : supportedPenguinVersions) { - NGramCache cache = NGramCache.builder() - .maxCacheSize( - TweetProcessingConfig.getInt("trends_extractor_num_trends_to_cache", 5000)) - .penguinVersion(penguinVersion) - .build(); - trendsCachesBuilder.put(penguinVersion, cache); - } - trendsCaches = trendsCachesBuilder.build(); - } - long rawTimeout = TweetProcessingConfig.getLong("trends_extractor_timeout_msec", 200); - long rawInterval = - TweetProcessingConfig.getLong("trends_extractor_reload_interval_sec", 600L); - trendsDataServiceSingleton = - TrendsThriftDataServiceManager.newInstance( - serviceIdentifier, - TweetProcessingConfig.getInt("trends_extractor_retry", 2), - Duration.apply(rawTimeout, TimeUnit.MILLISECONDS), - Duration.apply(INIT_TRENDS_CACHE_DELAY, TimeUnit.SECONDS), - Duration.apply(rawInterval, TimeUnit.SECONDS), - trendsCaches.values().asList() - ); - trendsDataServiceSingleton.startAutoRefresh(); - LOG.info("Started trend extractor."); - } - } - - public TweetTrendsExtractor( - ServiceIdentifier serviceIdentifier, - List supportedPenguinVersions) { - initTrendsDataServiceInstance(serviceIdentifier, supportedPenguinVersions); - } - - /** - * Extract trending terms from the specified tweet. - * @param tweet the specified tweet - */ - public void extractTrends(TwitterMessage tweet) { - extractTrends(ImmutableList.of(tweet)); - } - - /** - * Extract trending terms from the specified list of tweets. - * @param tweets a list of tweets - */ - public void extractTrends(Iterable tweets) { - Preconditions.checkNotNull(tweets); - - for (TwitterMessage tweet : tweets) { - for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) { - NGramCache trendsCache = trendsCaches.get(penguinVersion); - if (trendsCache == null) { - LOG.info("Trends cache for Penguin version " + penguinVersion + " is null."); - continue; - } else if (trendsCache.numTrendingTerms() == 0) { - LOG.info("Trends cache for Penguin version " + penguinVersion + " is empty."); - continue; - } - - List trendsInTweet = trendsCache.extractTrendsFrom( - tweet.getTokenizedCharSequence(penguinVersion), tweet.getLocale()); - - TweetTextFeatures textFeatures = tweet.getTweetTextFeatures(penguinVersion); - if (textFeatures == null || textFeatures.getTokens() == null) { - continue; - } - - textFeatures.getTrendingTerms().addAll(trendsInTweet); - - updateTrendsStats( - tweet, - textFeatures, - penguinVersion, - RelevanceStats.exportLong( - "trends_extractor_has_trends_" + penguinVersion.name().toLowerCase()), - RelevanceStats.exportLong( - "trends_extractor_no_trends_" + penguinVersion.name().toLowerCase()), - RelevanceStats.exportLong( - "trends_extractor_too_many_trends_" + penguinVersion.name().toLowerCase())); - } - } - } - - private void updateTrendsStats(TwitterMessage tweet, - TweetTextFeatures textFeatures, - PenguinVersion penguinVersion, - SearchCounter hasTrendsCounterToUpdate, - SearchCounter noTrendsCounterToUpdate, - SearchCounter tooManyTrendsCounterToUpdate) { - int numTrendingTerms = textFeatures.getTrendingTerms().size(); - if (numTrendingTerms == 0) { - noTrendsCounterToUpdate.increment(); - } else { - if (numTrendingTerms > 1) { - tooManyTrendsCounterToUpdate.increment(); - } - hasTrendsCounterToUpdate.increment(); - } - - long counter = noTrendsCounterToUpdate.get(); - if (counter % LOGGING_INTERVAL == 0) { - long hasTrends = hasTrendsCounterToUpdate.get(); - long noTrends = noTrendsCounterToUpdate.get(); - long tooManyTrends = tooManyTrendsCounterToUpdate.get(); - double ratio = 100.0d * hasTrends / (hasTrends + noTrends + 1); - double tooManyTrendsRatio = 100.0d * tooManyTrends / (hasTrends + 1); - LOG.info(String.format( - "Has trends %d, no trends %d, ratio %.2f, too many trends %.2f," - + " sample tweet id [%d] matching terms [%s] penguin version [%s]", - hasTrends, noTrends, ratio, tooManyTrendsRatio, tweet.getId(), - textFeatures.getTrendingTerms(), penguinVersion)); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.docx b/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.docx new file mode 100644 index 000000000..0327d1fb2 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.docx differ diff --git a/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.java b/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.java deleted file mode 100644 index e09472c3a..000000000 --- a/src/java/com/twitter/search/common/relevance/config/TweetProcessingConfig.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.common.relevance.config; - -import java.io.InputStream; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.config.ConfigFile; - -/** - * Config file for relevance computation. - */ -public final class TweetProcessingConfig { - private static final Logger LOG = LoggerFactory.getLogger(TweetProcessingConfig.class); - private static final String SCORER_CONFIG_DIR = "common/relevance/config"; - public static final String DEFAULT_CONFIG_FILE = "relevance.yml"; - private static ConfigFile relevanceConfig = null; - - private TweetProcessingConfig() { - } - - /** Initializes this instance from the given config file. */ - public static void init(String configFile) { - if (relevanceConfig == null) { - synchronized (TweetProcessingConfig.class) { - if (relevanceConfig == null) { - String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile; - relevanceConfig = new ConfigFile(SCORER_CONFIG_DIR, file); - } - } - } - } - - /** Initializes this instance from the given input stream. */ - public static void init(InputStream inputStream, String configType) { - if (relevanceConfig == null) { - synchronized (TweetProcessingConfig.class) { - if (relevanceConfig == null) { - relevanceConfig = new ConfigFile(inputStream, configType); - } - } - } - } - - /** Initializes this instance. */ - public static void init() { - init(null); - } - - /** - * Returns the value of the given property as a double value. - * - * @param property The property. - * @param defaultValue The default value to return if the property is not present in the config. - */ - public static double getDouble(String property, double defaultValue) { - return relevanceConfig.getDouble(property, defaultValue); - } - - /** - * Returns the value of the given property as a string value. - * - * @param property The property. - * @param defaultValue The default value to return if the property is not present in the config. - */ - public static String getString(String property, String defaultValue) { - return relevanceConfig.getString(property, defaultValue); - } - - /** - * Returns the value of the given property as an integer value. - * - * @param property The property. - * @param defaultValue The default value to return if the property is not present in the config. - */ - public static int getInt(String property, int defaultValue) { - return relevanceConfig.getInt(property, defaultValue); - } - - /** - * Returns the value of the given property as a long value. - * - * @param property The property. - * @param defaultValue The default value to return if the property is not present in the config. - */ - public static long getLong(String property, long defaultValue) { - return relevanceConfig.getLong(property, defaultValue); - } - - /** - * Returns the value of the given property as a boolean value. - * - * @param property The property. - * @param defaultValue The default value to return if the property is not present in the config. - */ - public static boolean getBool(String property, boolean defaultValue) { - return relevanceConfig.getBool(property, defaultValue); - } - - /** - * Returns the value of the given property as a string. - * - * @param property The property. - * @throws ConfigurationException If the given property is not found in the config. - */ - public static String getString(String property) { - try { - return relevanceConfig.getString(property); - } catch (ConfigurationException e) { - LOG.error("Fatal error: could not get config string " + property, e); - throw new RuntimeException(e); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/GeoObject.docx b/src/java/com/twitter/search/common/relevance/entities/GeoObject.docx new file mode 100644 index 000000000..d9fff1bbf Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/GeoObject.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/GeoObject.java b/src/java/com/twitter/search/common/relevance/entities/GeoObject.java deleted file mode 100644 index ef49c98a6..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/GeoObject.java +++ /dev/null @@ -1,201 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.util.List; -import java.util.Optional; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.indexing.thriftjava.ThriftGeoTags; -import com.twitter.tweetypie.thriftjava.GeoCoordinates; -import com.twitter.tweetypie.thriftjava.Place; - -import geo.google.datamodel.GeoAddressAccuracy; - -/** - * A GeoObject, extending a GeoCoordinate to include radius and accuracy - */ -public class GeoObject { - - public static final int INT_FIELD_NOT_PRESENT = -1; - public static final double DOUBLE_FIELD_NOT_PRESENT = -1.0; - - private double latitude = DOUBLE_FIELD_NOT_PRESENT; - private double longitude = DOUBLE_FIELD_NOT_PRESENT; - private double radius = DOUBLE_FIELD_NOT_PRESENT; - - private final ThriftGeoLocationSource source; - - // Valid range is 0-9. With 0 being unknown and 9 being most accurate. - // If this GeoObject is valid, this should be set to INT_FIELD_NOT_PRESENT - private int accuracy = 0; - - /** Creates a new GeoObject instance. */ - public GeoObject(double lat, double lon, ThriftGeoLocationSource source) { - this(lat, lon, 0, source); - } - - /** Creates a new GeoObject instance. */ - public GeoObject(double lat, double lon, int acc, ThriftGeoLocationSource source) { - latitude = lat; - longitude = lon; - accuracy = acc; - this.source = source; - } - - /** Creates a new GeoObject instance. */ - public GeoObject(ThriftGeoLocationSource source) { - this.source = source; - } - - /** - * Tries to create a {@code GeoObject} instance from a given TweetyPie {@code Place} struct based - * on its bounding box coordinates. - * - * @param place - * @return {@code Optional} instance with {@code GeoObject} if bounding box coordinates are - * available, or an empty {@code Optional}. - */ - public static Optional fromPlace(Place place) { - // Can't use place.centroid: from the sample of data, centroid seems to always be null - // (as of May 17 2016). - if (place.isSetBounding_box() && place.getBounding_boxSize() > 0) { - int pointsCount = place.getBounding_boxSize(); - - if (pointsCount == 1) { - GeoCoordinates point = place.getBounding_box().get(0); - return Optional.of(createForIngester(point.getLatitude(), point.getLongitude())); - } else { - double sumLatitude = 0.0; - double sumLongitude = 0.0; - - List box = place.getBounding_box(); - - // Drop the last point if it's the same as the first point. - // The same logic is present in several other classes dealing with places. - // See e.g. birdherd/src/main/scala/com/twitter/birdherd/tweetypie/TweetyPiePlace.scala - if (box.get(pointsCount - 1).equals(box.get(0))) { - pointsCount--; - } - - for (int i = 0; i < pointsCount; i++) { - GeoCoordinates coords = box.get(i); - sumLatitude += coords.getLatitude(); - sumLongitude += coords.getLongitude(); - } - - double averageLatitude = sumLatitude / pointsCount; - double averageLongitude = sumLongitude / pointsCount; - return Optional.of(GeoObject.createForIngester(averageLatitude, averageLongitude)); - } - } - return Optional.empty(); - } - - public void setRadius(double radius) { - this.radius = radius; - } - - public Double getRadius() { - return radius; - } - - public void setLatitude(double latitude) { - this.latitude = latitude; - } - - public Double getLatitude() { - return latitude; - } - - public void setLongitude(double longitude) { - this.longitude = longitude; - } - - public Double getLongitude() { - return longitude; - } - - public int getAccuracy() { - return accuracy; - } - - public void setAccuracy(int accuracy) { - this.accuracy = accuracy; - } - - public ThriftGeoLocationSource getSource() { - return source; - } - - /** Convers this GeoObject instance to a ThriftGeoTags instance. */ - public ThriftGeoTags toThriftGeoTags(long twitterMessageId) { - ThriftGeoTags geoTags = new ThriftGeoTags(); - geoTags.setStatusId(twitterMessageId); - geoTags.setLatitude(getLatitude()); - geoTags.setLongitude(getLongitude()); - geoTags.setAccuracy(accuracy); - geoTags.setGeoLocationSource(source); - return geoTags; - } - - private static final double COORDS_EQUALITY_THRESHOLD = 1e-7; - - /** - * Performs an approximate comparison between the two GeoObject instances. - * - * @deprecated This code is not performant and should not be used in - * production code. Use only for tests. See SEARCH-5148. - */ - @Deprecated - @VisibleForTesting - public static boolean approxEquals(GeoObject a, GeoObject b) { - if (a == null && b == null) { - return true; - } - if ((a == null && b != null) || (a != null && b == null)) { - return false; - } - - if (a.accuracy != b.accuracy) { - return false; - } - if (Math.abs(a.latitude - b.latitude) > COORDS_EQUALITY_THRESHOLD) { - return false; - } - if (Math.abs(a.longitude - b.longitude) > COORDS_EQUALITY_THRESHOLD) { - return false; - } - if (Double.compare(a.radius, b.radius) != 0) { - return false; - } - if (a.source != b.source) { - return false; - } - - return true; - } - - @Override - public String toString() { - return "GeoObject{" - + "latitude=" + latitude - + ", longitude=" + longitude - + ", radius=" + radius - + ", source=" + source - + ", accuracy=" + accuracy - + '}'; - } - - /** - * Convenience factory method for ingester purposes. - */ - public static GeoObject createForIngester(double latitude, double longitude) { - return new GeoObject( - latitude, - longitude, - // store with highest level of accuracy: POINT_LEVEL - GeoAddressAccuracy.POINT_LEVEL.getCode(), - ThriftGeoLocationSource.GEOTAG); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.docx b/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.docx new file mode 100644 index 000000000..b9f921b6c Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.java b/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.java deleted file mode 100644 index 5547e7d5d..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/PotentialLocationObject.java +++ /dev/null @@ -1,122 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.util.Locale; - -import com.google.common.base.Preconditions; - -import org.apache.commons.lang.StringUtils; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.PotentialLocation; -import com.twitter.search.common.util.text.LanguageIdentifierHelper; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.common.util.text.TokenizerHelper; - -/** - * An immutable tuple to wrap a country code, region and locality. Based on the PotentialLocation - * struct in status.thrift. - */ -public class PotentialLocationObject { - private final String countryCode; - private final String region; - private final String locality; - - /** - * Creates a new PotentialLocationObject instance. - * - * @param countryCode The country code. - * @param region The region. - * @param locality The locality. - */ - public PotentialLocationObject(String countryCode, String region, String locality) { - this.countryCode = countryCode; - this.region = region; - this.locality = locality; - } - - public String getCountryCode() { - return countryCode; - } - - public String getRegion() { - return region; - } - - public String getLocality() { - return locality; - } - - /** - * Converts this PotentialLocationObject instance to a PotentialLocation thrift struct. - * - * @param penguinVersion The penguin version to use for normalization and tokenization. - */ - public PotentialLocation toThriftPotentialLocation(PenguinVersion penguinVersion) { - Preconditions.checkNotNull(penguinVersion); - - String normalizedCountryCode = null; - if (countryCode != null) { - Locale countryCodeLocale = LanguageIdentifierHelper.identifyLanguage(countryCode); - normalizedCountryCode = - NormalizerHelper.normalize(countryCode, countryCodeLocale, penguinVersion); - } - - String tokenizedRegion = null; - if (region != null) { - Locale regionLocale = LanguageIdentifierHelper.identifyLanguage(region); - String normalizedRegion = NormalizerHelper.normalize(region, regionLocale, penguinVersion); - tokenizedRegion = StringUtils.join( - TokenizerHelper.tokenizeQuery(normalizedRegion, regionLocale, penguinVersion), " "); - } - - String tokenizedLocality = null; - if (locality != null) { - Locale localityLocale = LanguageIdentifierHelper.identifyLanguage(locality); - String normalizedLocality = - NormalizerHelper.normalize(locality, localityLocale, penguinVersion); - tokenizedLocality = - StringUtils.join(TokenizerHelper.tokenizeQuery( - normalizedLocality, localityLocale, penguinVersion), " "); - } - - return new PotentialLocation() - .setCountryCode(normalizedCountryCode) - .setRegion(tokenizedRegion) - .setLocality(tokenizedLocality); - } - - @Override - public int hashCode() { - return ((countryCode == null) ? 0 : countryCode.hashCode()) - + 13 * ((region == null) ? 0 : region.hashCode()) - + 19 * ((locality == null) ? 0 : locality.hashCode()); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof PotentialLocationObject)) { - return false; - } - - PotentialLocationObject entry = (PotentialLocationObject) obj; - return (countryCode == null - ? entry.countryCode == null - : countryCode.equals(entry.countryCode)) - && (region == null - ? entry.region == null - : region.equals(entry.region)) - && (locality == null - ? entry.locality == null - : locality.equals(entry.locality)); - } - - @Override - public String toString() { - return new StringBuilder("PotentialLocationObject {") - .append("countryCode=").append(countryCode) - .append(", region=").append(region) - .append(", locality=").append(locality) - .append("}") - .toString(); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.docx b/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.docx new file mode 100644 index 000000000..745bf1f4b Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.java b/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.java deleted file mode 100644 index 524c558b2..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/TwitterMessage.java +++ /dev/null @@ -1,1267 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.text.DateFormat; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.apache.commons.lang.StringUtils; -import org.apache.commons.lang3.builder.EqualsBuilder; -import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.commons.lang3.builder.ToStringBuilder; -import org.apache.lucene.analysis.TokenStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.language.LocaleUtil; -import com.twitter.common.text.pipeline.TwitterLanguageIdentifier; -import com.twitter.common.text.token.TokenizedCharSequence; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntity; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.relevance.features.TweetFeatures; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.relevance.features.TweetTextQuality; -import com.twitter.search.common.relevance.features.TweetUserFeatures; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.service.spiderduck.gen.MediaTypes; -import com.twitter.tweetypie.thriftjava.ComposerSource; -import com.twitter.util.TwitterDateFormat; - -/** - * A representation of tweets used as an intermediate object during ingestion. As we proceed - * in ingestion, we fill this object with data. We then convert it to ThriftVersionedEvents (which - * itself represents a single tweet too, in different penguin versions potentially). - */ -public class TwitterMessage { - private static final Logger LOG = LoggerFactory.getLogger(TwitterMessage.class); - - public static class EscherbirdAnnotation implements Comparable { - public final long groupId; - public final long domainId; - public final long entityId; - - public EscherbirdAnnotation(long groupId, long domainId, long entityId) { - this.groupId = groupId; - this.domainId = domainId; - this.entityId = entityId; - } - - @Override - public boolean equals(Object o2) { - if (o2 instanceof EscherbirdAnnotation) { - EscherbirdAnnotation a2 = (EscherbirdAnnotation) o2; - return groupId == a2.groupId && domainId == a2.domainId && entityId == a2.entityId; - } - return false; - } - - @Override - public int hashCode() { - return new HashCodeBuilder() - .append(groupId) - .append(domainId) - .append(entityId) - .toHashCode(); - } - - @Override - public int compareTo(EscherbirdAnnotation o) { - return ComparisonChain.start() - .compare(this.groupId, o.groupId) - .compare(this.domainId, o.domainId) - .compare(this.entityId, o.entityId) - .result(); - } - } - - private final List escherbirdAnnotations = Lists.newArrayList(); - - // tweet features for multiple penguin versions - private static class VersionedTweetFeatures { - // TweetFeatures populated by relevance classifiers, structure defined - // in src/main/thrift/classifier.thrift. - private TweetFeatures tweetFeatures = new TweetFeatures(); - private TokenizedCharSequence tokenizedCharSequence = null; - private Set normalizedHashtags = Sets.newHashSet(); - - public TweetFeatures getTweetFeatures() { - return this.tweetFeatures; - } - - public void setTweetFeatures(final TweetFeatures tweetFeatures) { - this.tweetFeatures = tweetFeatures; - } - - public TweetTextQuality getTweetTextQuality() { - return this.tweetFeatures.getTweetTextQuality(); - } - - public TweetTextFeatures getTweetTextFeatures() { - return this.tweetFeatures.getTweetTextFeatures(); - } - - public TweetUserFeatures getTweetUserFeatures() { - return this.tweetFeatures.getTweetUserFeatures(); - } - - public TokenizedCharSequence getTokenizedCharSequence() { - return this.tokenizedCharSequence; - } - - public void setTokenizedCharSequence(TokenizedCharSequence sequence) { - this.tokenizedCharSequence = sequence; - } - - public Set getNormalizedHashtags() { - return this.normalizedHashtags; - } - - public void addNormalizedHashtags(String normalizedHashtag) { - this.normalizedHashtags.add(normalizedHashtag); - } - } - - public static final int INT_FIELD_NOT_PRESENT = -1; - public static final long LONG_FIELD_NOT_PRESENT = -1; - public static final double DOUBLE_FIELD_NOT_PRESENT = -1; - public static final int MAX_USER_REPUTATION = 100; - - private final long tweetId; - - private String text; - - private Date date; - @Nonnull - private Optional optionalFromUser = Optional.empty(); - @Nonnull - private Optional optionalToUser = Optional.empty(); - private Locale locale = null; - private Locale linkLocale = null; - - // Original source text. - private String origSource; - // Source with HTML tags removed and truncated. - private String strippedSource; - - // Original location text. - private String origLocation; - - // Location truncated for mysql field-width reasons (see TwitterMessageUtil.java). - private String truncatedNormalizedLocation; - - // User's country - private String fromUserLocCountry; - - private Integer followersCount = INT_FIELD_NOT_PRESENT; - private boolean deleted = false; - - // Fields extracted from entities (in the JSON object) - private List mentions = new ArrayList<>(); - private Set hashtags = Sets.newHashSet(); - // Lat/lon and region accuracy tuples extracted from tweet text, or null. - private GeoObject geoLocation = null; - private boolean uncodeableLocation = false; - // This is set if the tweet is geotagged. (i.e. "geo" or "coordinate" section is present - // in the json) - // This field has only a getter but no setter --- it is filled in when the json is parsed. - private GeoObject geoTaggedLocation = null; - - private double userReputation = DOUBLE_FIELD_NOT_PRESENT; - private boolean geocodeRequired = false; - private boolean sensitiveContent = false; - private boolean userProtected; - private boolean userVerified; - private boolean userBlueVerified; - private TwitterRetweetMessage retweetMessage; - private TwitterQuotedMessage quotedMessage; - private List places; - // maps from original url (the t.co url) to ThriftExpandedUrl, which contains the - // expanded url and the spiderduck response (canoicalLastHopUrl and mediatype) - private final Map expandedUrls; - // maps the photo status id to the media url - private Map photoUrls; - private Optional inReplyToStatusId = Optional.empty(); - private Optional directedAtUserId = Optional.empty(); - - private long conversationId = -1; - - // True if tweet is nullcasted. - private boolean nullcast = false; - - // True if tweet is a self-threaded tweet - private boolean selfThread = false; - - // If the tweet is a part of an exclusive conversation, the author who started - // that conversation. - private Optional exclusiveConversationAuthorId = Optional.empty(); - - // tweet features map for multiple versions of penguin - private Map versionedTweetFeaturesMap; - - // Engagments count: favorites, retweets and replies - private int numFavorites = 0; - private int numRetweets = 0; - private int numReplies = 0; - - // Card information - private String cardName; - private String cardDomain; - private String cardTitle; - private String cardDescription; - private String cardLang; - private String cardUrl; - - private String placeId; - private String placeFullName; - private String placeCountryCode; - - private Set namedEntities = Sets.newHashSet(); - - // Spaces data - private Set spaceIds = Sets.newHashSet(); - private Set spaceAdmins = Sets.newHashSet(); - private String spaceTitle; - - private Optional composerSource = Optional.empty(); - - private final List potentialLocations = Lists.newArrayList(); - - // one or two penguin versions supported by this system - private final List supportedPenguinVersions; - - public TwitterMessage(Long tweetId, List supportedPenguinVersions) { - this.tweetId = tweetId; - this.places = new ArrayList<>(); - this.expandedUrls = new LinkedHashMap<>(); - // make sure we support at least one, but no more than two versions of penguin - this.supportedPenguinVersions = supportedPenguinVersions; - this.versionedTweetFeaturesMap = getVersionedTweetFeaturesMap(); - Preconditions.checkArgument(this.supportedPenguinVersions.size() <= 2 - && this.supportedPenguinVersions.size() > 0); - } - - /** - * Replace to-user with in-reply-to user if needed. - */ - public void replaceToUserWithInReplyToUserIfNeeded( - String inReplyToScreenName, long inReplyToUserId) { - if (shouldUseReplyUserAsToUser(optionalToUser, inReplyToUserId)) { - TwitterMessageUser replyUser = - TwitterMessageUser.createWithNamesAndId(inReplyToScreenName, "", inReplyToUserId); - optionalToUser = Optional.of(replyUser); - } - } - - // To-user could have been inferred from the mention at the position 0. - // But if there is an explicit in-reply-to user, we might need to use it as to-user instead. - private static boolean shouldUseReplyUserAsToUser( - Optional currentToUser, - long inReplyToUserId) { - if (!currentToUser.isPresent()) { - // There is no mention in the tweet that qualifies as to-user. - return true; - } - - // We already have a mention in the tweet that qualifies as to-user. - TwitterMessageUser toUser = currentToUser.get(); - if (!toUser.getId().isPresent()) { - // The to-user from the mention is a stub. - return true; - } - - long toUserId = toUser.getId().get(); - if (toUserId != inReplyToUserId) { - // The to-user from the mention is different that the in-reply-to user, - // use in-reply-to user instead. - return true; - } - - return false; - } - - public double getUserReputation() { - return userReputation; - } - - /** - * Sets the user reputation. - */ - public TwitterMessage setUserReputation(double newUserReputation) { - if (newUserReputation > MAX_USER_REPUTATION) { - LOG.warn("Out of bounds user reputation {} for status id {}", newUserReputation, tweetId); - this.userReputation = (float) MAX_USER_REPUTATION; - } else { - this.userReputation = newUserReputation; - } - return this; - } - - public String getText() { - return text; - } - - public Optional getOptionalToUser() { - return optionalToUser; - } - - public void setOptionalToUser(Optional optionalToUser) { - this.optionalToUser = optionalToUser; - } - - public void setText(String text) { - this.text = text; - } - - public Date getDate() { - return date; - } - - public void setDate(Date date) { - this.date = date; - } - - public void setFromUser(@Nonnull TwitterMessageUser fromUser) { - Preconditions.checkNotNull(fromUser, "Don't set a null fromUser"); - optionalFromUser = Optional.of(fromUser); - } - - public Optional getFromUserScreenName() { - return optionalFromUser.isPresent() - ? optionalFromUser.get().getScreenName() - : Optional.empty(); - } - - /** - * Sets the fromUserScreenName. - */ - public void setFromUserScreenName(@Nonnull String fromUserScreenName) { - TwitterMessageUser newFromUser = optionalFromUser.isPresent() - ? optionalFromUser.get().copyWithScreenName(fromUserScreenName) - : TwitterMessageUser.createWithScreenName(fromUserScreenName); - - optionalFromUser = Optional.of(newFromUser); - } - - public Optional getTokenizedFromUserScreenName() { - return optionalFromUser.flatMap(TwitterMessageUser::getTokenizedScreenName); - } - - public Optional getFromUserDisplayName() { - return optionalFromUser.flatMap(TwitterMessageUser::getDisplayName); - } - - /** - * Sets the fromUserDisplayName. - */ - public void setFromUserDisplayName(@Nonnull String fromUserDisplayName) { - TwitterMessageUser newFromUser = optionalFromUser.isPresent() - ? optionalFromUser.get().copyWithDisplayName(fromUserDisplayName) - : TwitterMessageUser.createWithDisplayName(fromUserDisplayName); - - optionalFromUser = Optional.of(newFromUser); - } - - public Optional getFromUserTwitterId() { - return optionalFromUser.flatMap(TwitterMessageUser::getId); - } - - /** - * Sets the fromUserId. - */ - public void setFromUserId(long fromUserId) { - TwitterMessageUser newFromUser = optionalFromUser.isPresent() - ? optionalFromUser.get().copyWithId(fromUserId) - : TwitterMessageUser.createWithId(fromUserId); - - optionalFromUser = Optional.of(newFromUser); - } - - public long getConversationId() { - return conversationId; - } - - public void setConversationId(long conversationId) { - this.conversationId = conversationId; - } - - public boolean isUserProtected() { - return this.userProtected; - } - - public void setUserProtected(boolean userProtected) { - this.userProtected = userProtected; - } - - public boolean isUserVerified() { - return this.userVerified; - } - - public void setUserVerified(boolean userVerified) { - this.userVerified = userVerified; - } - - public boolean isUserBlueVerified() { - return this.userBlueVerified; - } - - public void setUserBlueVerified(boolean userBlueVerified) { - this.userBlueVerified = userBlueVerified; - } - - public void setIsSensitiveContent(boolean isSensitiveContent) { - this.sensitiveContent = isSensitiveContent; - } - - public boolean isSensitiveContent() { - return this.sensitiveContent; - } - - public Optional getToUserObject() { - return optionalToUser; - } - - public void setToUserObject(@Nonnull TwitterMessageUser user) { - Preconditions.checkNotNull(user, "Don't set a null to-user"); - optionalToUser = Optional.of(user); - } - - public Optional getToUserTwitterId() { - return optionalToUser.flatMap(TwitterMessageUser::getId); - } - - /** - * Sets toUserId. - */ - public void setToUserTwitterId(long toUserId) { - TwitterMessageUser newToUser = optionalToUser.isPresent() - ? optionalToUser.get().copyWithId(toUserId) - : TwitterMessageUser.createWithId(toUserId); - - optionalToUser = Optional.of(newToUser); - } - - public Optional getToUserLowercasedScreenName() { - return optionalToUser.flatMap(TwitterMessageUser::getScreenName).map(String::toLowerCase); - } - - public Optional getToUserScreenName() { - return optionalToUser.flatMap(TwitterMessageUser::getScreenName); - } - - /** - * Sets toUserScreenName. - */ - public void setToUserScreenName(@Nonnull String screenName) { - Preconditions.checkNotNull(screenName, "Don't set a null to-user screenname"); - - TwitterMessageUser newToUser = optionalToUser.isPresent() - ? optionalToUser.get().copyWithScreenName(screenName) - : TwitterMessageUser.createWithScreenName(screenName); - - optionalToUser = Optional.of(newToUser); - } - - // to use from TweetEventParseHelper - public void setDirectedAtUserId(Optional directedAtUserId) { - this.directedAtUserId = directedAtUserId; - } - - @VisibleForTesting - public Optional getDirectedAtUserId() { - return directedAtUserId; - } - - /** - * Returns the referenceAuthorId. - */ - public Optional getReferenceAuthorId() { - // The semantics of reference-author-id: - // - if the tweet is a retweet, it should be the user id of the author of the original tweet - // - else, if the tweet is directed at a user, it should be the id of the user it's directed at. - // - else, if the tweet is a reply in a root self-thread, directed-at is not set, so it's - // the id of the user who started the self-thread. - // - // For definitive info on replies and directed-at, take a look at go/replies. To view these - // for a certain tweet, use http://go/t. - // - // Note that if directed-at is set, reply is always set. - // If reply is set, directed-at is not necessarily set. - if (isRetweet() && retweetMessage.hasSharedUserTwitterId()) { - long retweetedUserId = retweetMessage.getSharedUserTwitterId(); - return Optional.of(retweetedUserId); - } else if (directedAtUserId.isPresent()) { - // Why not replace directedAtUserId with reply and make this function depend - // on the "reply" field of TweetCoreData? - // Well, verified by counters, it seems for ~1% of tweets, which contain both directed-at - // and reply, directed-at-user is different than the reply-to-user id. This happens in the - // following case: - // - // author / reply-to / directed-at - // T1 A - - - // T2 B A A - // T3 B B A - // - // T2 is a reply to T1, T3 is a reply to T2. - // - // It's up to us to decide who this tweet is "referencing", but with the current code, - // we choose that T3 is referencing user A. - return directedAtUserId; - } else { - // This is the case of a root self-thread reply. directed-at is not set. - Optional fromUserId = this.getFromUserTwitterId(); - Optional toUserId = this.getToUserTwitterId(); - - if (fromUserId.isPresent() && fromUserId.equals(toUserId)) { - return fromUserId; - } - } - return Optional.empty(); - } - - public void setNumFavorites(int numFavorites) { - this.numFavorites = numFavorites; - } - - public void setNumRetweets(int numRetweets) { - this.numRetweets = numRetweets; - } - - public void setNumReplies(int numRepliess) { - this.numReplies = numRepliess; - } - - public void addEscherbirdAnnotation(EscherbirdAnnotation annotation) { - escherbirdAnnotations.add(annotation); - } - - public List getEscherbirdAnnotations() { - return escherbirdAnnotations; - } - - public List getPotentialLocations() { - return potentialLocations; - } - - public void setPotentialLocations(Collection potentialLocations) { - this.potentialLocations.clear(); - this.potentialLocations.addAll(potentialLocations); - } - - @Override - public String toString() { - return ToStringBuilder.reflectionToString(this); - } - - // Tweet language related getters and setters. - - /** - * Returns the locale. - *

- * Note the getLocale() will never return null, this is for the convenience of text related - * processing in the ingester. If you want the real locale, you need to check isSetLocale() - * first to see if we really have any information about the locale of this tweet. - */ - public Locale getLocale() { - if (locale == null) { - return TwitterLanguageIdentifier.UNKNOWN; - } else { - return locale; - } - } - - public void setLocale(Locale locale) { - this.locale = locale; - } - - /** - * Determines if the locate is set. - */ - public boolean isSetLocale() { - return locale != null; - } - - /** - * Returns the language of the locale. E.g. zh - */ - public String getLanguage() { - if (isSetLocale()) { - return getLocale().getLanguage(); - } else { - return null; - } - } - - /** - * Returns the IETF BCP 47 Language Tag of the locale. E.g. zh-CN - */ - public String getBCP47LanguageTag() { - if (isSetLocale()) { - return getLocale().toLanguageTag(); - } else { - return null; - } - } - - public void setLanguage(String language) { - if (language != null) { - locale = LocaleUtil.getLocaleOf(language); - } - } - - // Tweet link language related getters and setters. - public Locale getLinkLocale() { - return linkLocale; - } - - public void setLinkLocale(Locale linkLocale) { - this.linkLocale = linkLocale; - } - - /** - * Returns the language of the link locale. - */ - public String getLinkLanguage() { - if (this.linkLocale == null) { - return null; - } else { - return this.linkLocale.getLanguage(); - } - } - - public String getOrigSource() { - return origSource; - } - - public void setOrigSource(String origSource) { - this.origSource = origSource; - } - - public String getStrippedSource() { - return strippedSource; - } - - public void setStrippedSource(String strippedSource) { - this.strippedSource = strippedSource; - } - - public String getOrigLocation() { - return origLocation; - } - - public String getLocation() { - return truncatedNormalizedLocation; - } - - public void setOrigLocation(String origLocation) { - this.origLocation = origLocation; - } - - public void setTruncatedNormalizedLocation(String truncatedNormalizedLocation) { - this.truncatedNormalizedLocation = truncatedNormalizedLocation; - } - - public boolean hasFromUserLocCountry() { - return fromUserLocCountry != null; - } - - public String getFromUserLocCountry() { - return fromUserLocCountry; - } - - public void setFromUserLocCountry(String fromUserLocCountry) { - this.fromUserLocCountry = fromUserLocCountry; - } - - public String getTruncatedNormalizedLocation() { - return truncatedNormalizedLocation; - } - - public Integer getFollowersCount() { - return followersCount; - } - - public void setFollowersCount(Integer followersCount) { - this.followersCount = followersCount; - } - - public boolean hasFollowersCount() { - return followersCount != INT_FIELD_NOT_PRESENT; - } - - public boolean isDeleted() { - return deleted; - } - - public void setDeleted(boolean deleted) { - this.deleted = deleted; - } - - public boolean hasCard() { - return !StringUtils.isBlank(getCardName()); - } - - @Override - public int hashCode() { - return ((Long) getId()).hashCode(); - } - - /** - * Parses the given date using the TwitterDateFormat. - */ - public static Date parseDate(String date) { - DateFormat parser = TwitterDateFormat.apply("EEE MMM d HH:mm:ss Z yyyy"); - try { - return parser.parse(date); - } catch (Exception e) { - return null; - } - } - - public boolean hasGeoLocation() { - return geoLocation != null; - } - - public void setGeoLocation(GeoObject location) { - this.geoLocation = location; - } - - public GeoObject getGeoLocation() { - return geoLocation; - } - - public String getPlaceId() { - return placeId; - } - - public void setPlaceId(String placeId) { - this.placeId = placeId; - } - - public String getPlaceFullName() { - return placeFullName; - } - - public void setPlaceFullName(String placeFullName) { - this.placeFullName = placeFullName; - } - - public String getPlaceCountryCode() { - return placeCountryCode; - } - - public void setPlaceCountryCode(String placeCountryCode) { - this.placeCountryCode = placeCountryCode; - } - - public void setGeoTaggedLocation(GeoObject geoTaggedLocation) { - this.geoTaggedLocation = geoTaggedLocation; - } - - public GeoObject getGeoTaggedLocation() { - return geoTaggedLocation; - } - - public void setLatLon(double latitude, double longitude) { - geoLocation = new GeoObject(latitude, longitude, null); - } - - public Double getLatitude() { - return hasGeoLocation() ? geoLocation.getLatitude() : null; - } - - public Double getLongitude() { - return hasGeoLocation() ? geoLocation.getLongitude() : null; - } - - public boolean isUncodeableLocation() { - return uncodeableLocation; - } - - public void setUncodeableLocation() { - uncodeableLocation = true; - } - - public void setGeocodeRequired() { - this.geocodeRequired = true; - } - - public boolean isGeocodeRequired() { - return geocodeRequired; - } - - public Map getPhotoUrls() { - return photoUrls; - } - - /** - * Associates the given mediaUrl with the given photoStatusId. - */ - public void addPhotoUrl(long photoStatusId, String mediaUrl) { - if (photoUrls == null) { - photoUrls = new LinkedHashMap<>(); - } - photoUrls.putIfAbsent(photoStatusId, mediaUrl); - } - - public Map getExpandedUrlMap() { - return expandedUrls; - } - - public int getExpandedUrlMapSize() { - return expandedUrls.size(); - } - - /** - * Associates the given originalUrl with the given expanderUrl. - */ - public void addExpandedUrl(String originalUrl, ThriftExpandedUrl expandedUrl) { - this.expandedUrls.put(originalUrl, expandedUrl); - } - - /** - * Replaces urls with resolved ones. - */ - public String getTextReplacedWithResolvedURLs() { - String retText = text; - for (Map.Entry entry : expandedUrls.entrySet()) { - ThriftExpandedUrl urlInfo = entry.getValue(); - String resolvedUrl; - String canonicalLastHopUrl = urlInfo.getCanonicalLastHopUrl(); - String expandedUrl = urlInfo.getExpandedUrl(); - if (canonicalLastHopUrl != null) { - resolvedUrl = canonicalLastHopUrl; - LOG.debug("{} has canonical last hop url set", urlInfo); - } else if (expandedUrl != null) { - LOG.debug("{} has no canonical last hop url set, using expanded url instead", urlInfo); - resolvedUrl = expandedUrl; - } else { - LOG.debug("{} has no canonical last hop url or expanded url set, skipping", urlInfo); - continue; - } - retText = retText.replace(entry.getKey(), resolvedUrl); - } - return retText; - } - - public long getId() { - return tweetId; - } - - public boolean isRetweet() { - return retweetMessage != null; - } - - public boolean hasQuote() { - return quotedMessage != null; - } - - public boolean isReply() { - return getToUserScreenName().isPresent() - || getToUserTwitterId().isPresent() - || getInReplyToStatusId().isPresent(); - } - - public boolean isReplyToTweet() { - return getInReplyToStatusId().isPresent(); - } - - public TwitterRetweetMessage getRetweetMessage() { - return retweetMessage; - } - - public void setRetweetMessage(TwitterRetweetMessage retweetMessage) { - this.retweetMessage = retweetMessage; - } - - public TwitterQuotedMessage getQuotedMessage() { - return quotedMessage; - } - - public void setQuotedMessage(TwitterQuotedMessage quotedMessage) { - this.quotedMessage = quotedMessage; - } - - public List getPlaces() { - return places; - } - - public void addPlace(String place) { - // Places are used for earlybird serialization - places.add(place); - } - - public Optional getInReplyToStatusId() { - return inReplyToStatusId; - } - - public void setInReplyToStatusId(long inReplyToStatusId) { - Preconditions.checkArgument(inReplyToStatusId > 0, "In-reply-to status ID should be positive"); - this.inReplyToStatusId = Optional.of(inReplyToStatusId); - } - - public boolean getNullcast() { - return nullcast; - } - - public void setNullcast(boolean nullcast) { - this.nullcast = nullcast; - } - - public List getSupportedPenguinVersions() { - return supportedPenguinVersions; - } - - private VersionedTweetFeatures getVersionedTweetFeatures(PenguinVersion penguinVersion) { - VersionedTweetFeatures versionedTweetFeatures = versionedTweetFeaturesMap.get(penguinVersion); - return Preconditions.checkNotNull(versionedTweetFeatures); - } - - public TweetFeatures getTweetFeatures(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTweetFeatures(); - } - - @VisibleForTesting - // only used in Tests - public void setTweetFeatures(PenguinVersion penguinVersion, TweetFeatures tweetFeatures) { - versionedTweetFeaturesMap.get(penguinVersion).setTweetFeatures(tweetFeatures); - } - - public int getTweetSignature(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures().getSignature(); - } - - public TweetTextQuality getTweetTextQuality(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTweetTextQuality(); - } - - public TweetTextFeatures getTweetTextFeatures(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTweetTextFeatures(); - } - - public TweetUserFeatures getTweetUserFeatures(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTweetUserFeatures(); - } - - public TokenizedCharSequence getTokenizedCharSequence(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getTokenizedCharSequence(); - } - - public void setTokenizedCharSequence(PenguinVersion penguinVersion, - TokenizedCharSequence sequence) { - getVersionedTweetFeatures(penguinVersion).setTokenizedCharSequence(sequence); - } - - // True if the features contain multiple hash tags or multiple trends. - // This is intended as an anti-trend-spam measure. - public static boolean hasMultipleHashtagsOrTrends(TweetTextFeatures textFeatures) { - // Allow at most 1 trend and 2 hashtags. - return textFeatures.getTrendingTermsSize() > 1 || textFeatures.getHashtagsSize() > 2; - } - - /** - * Returns the expanded URLs. - */ - public Collection getExpandedUrls() { - return expandedUrls.values(); - } - - /** - * Returns the canonical last hop URLs. - */ - public Set getCanonicalLastHopUrls() { - Set result = new HashSet<>(expandedUrls.size()); - for (ThriftExpandedUrl url : expandedUrls.values()) { - result.add(url.getCanonicalLastHopUrl()); - } - return result; - } - - public String getCardName() { - return cardName; - } - - public void setCardName(String cardName) { - this.cardName = cardName; - } - - public String getCardDomain() { - return cardDomain; - } - - public void setCardDomain(String cardDomain) { - this.cardDomain = cardDomain; - } - - public String getCardTitle() { - return cardTitle; - } - - public void setCardTitle(String cardTitle) { - this.cardTitle = cardTitle; - } - - public String getCardDescription() { - return cardDescription; - } - - public void setCardDescription(String cardDescription) { - this.cardDescription = cardDescription; - } - - public String getCardLang() { - return cardLang; - } - - public void setCardLang(String cardLang) { - this.cardLang = cardLang; - } - - public String getCardUrl() { - return cardUrl; - } - - public void setCardUrl(String cardUrl) { - this.cardUrl = cardUrl; - } - - public List getMentions() { - return this.mentions; - } - - public void setMentions(List mentions) { - this.mentions = mentions; - } - - public List getLowercasedMentions() { - return Lists.transform(getMentions(), user -> { - // This condition is also checked in addUserToMentions(). - Preconditions.checkState(user.getScreenName().isPresent(), "Invalid mention"); - return user.getScreenName().get().toLowerCase(); - }); - } - - public Set getHashtags() { - return this.hashtags; - } - - public Set getNormalizedHashtags(PenguinVersion penguinVersion) { - return getVersionedTweetFeatures(penguinVersion).getNormalizedHashtags(); - } - - public void addNormalizedHashtag(String normalizedHashtag, PenguinVersion penguinVersion) { - getVersionedTweetFeatures(penguinVersion).addNormalizedHashtags(normalizedHashtag); - } - - public Optional getComposerSource() { - return composerSource; - } - - public void setComposerSource(ComposerSource composerSource) { - Preconditions.checkNotNull(composerSource, "composerSource should not be null"); - this.composerSource = Optional.of(composerSource); - } - - public boolean isSelfThread() { - return selfThread; - } - - public void setSelfThread(boolean selfThread) { - this.selfThread = selfThread; - } - - public boolean isExclusive() { - return exclusiveConversationAuthorId.isPresent(); - } - - public long getExclusiveConversationAuthorId() { - return exclusiveConversationAuthorId.get(); - } - - public void setExclusiveConversationAuthorId(long exclusiveConversationAuthorId) { - this.exclusiveConversationAuthorId = Optional.of(exclusiveConversationAuthorId); - } - - /** - * Adds an expanded media url based on the given parameters. - */ - public void addExpandedMediaUrl(String originalUrl, - String expandedUrl, - @Nullable MediaTypes mediaType) { - if (!StringUtils.isBlank(originalUrl) && !StringUtils.isBlank(expandedUrl)) { - ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl(); - if (mediaType != null) { - thriftExpandedUrl.setMediaType(mediaType); - } - thriftExpandedUrl.setOriginalUrl(originalUrl); - thriftExpandedUrl.setExpandedUrl(expandedUrl); // This will be tokenized and indexed - // Note that the mediaURL is not indexed. We could also index it, but it is not indexed - // to reduce RAM usage. - thriftExpandedUrl.setCanonicalLastHopUrl(expandedUrl); // This will be tokenized and indexed - addExpandedUrl(originalUrl, thriftExpandedUrl); - thriftExpandedUrl.setConsumerMedia(true); - } - } - - /** - * Adds an expanded non-media url based on the given parameters. - */ - public void addExpandedNonMediaUrl(String originalUrl, String expandedUrl) { - if (!StringUtils.isBlank(originalUrl)) { - ThriftExpandedUrl thriftExpandedUrl = new ThriftExpandedUrl(originalUrl); - if (!StringUtils.isBlank(expandedUrl)) { - thriftExpandedUrl.setExpandedUrl(expandedUrl); - } - addExpandedUrl(originalUrl, thriftExpandedUrl); - thriftExpandedUrl.setConsumerMedia(false); - } - } - - /** - * Only used in tests. - * - * Simulates resolving compressed URLs, which is usually done by ResolveCompressedUrlsStage. - */ - @VisibleForTesting - public void replaceUrlsWithResolvedUrls(Map resolvedUrls) { - for (Map.Entry urlEntry : expandedUrls.entrySet()) { - String tcoUrl = urlEntry.getKey(); - if (resolvedUrls.containsKey(tcoUrl)) { - ThriftExpandedUrl expandedUrl = urlEntry.getValue(); - expandedUrl.setCanonicalLastHopUrl(resolvedUrls.get(tcoUrl)); - } - } - } - - /** - * Adds a mention for a user with the given screen name. - */ - public void addMention(String screenName) { - TwitterMessageUser user = TwitterMessageUser.createWithScreenName(screenName); - addUserToMentions(user); - } - - /** - * Adds the given user to mentions. - */ - public void addUserToMentions(TwitterMessageUser user) { - Preconditions.checkArgument(user.getScreenName().isPresent(), "Don't add invalid mentions"); - this.mentions.add(user); - } - - /** - * Adds the given hashtag. - */ - public void addHashtag(String hashtag) { - this.hashtags.add(hashtag); - for (PenguinVersion penguinVersion : supportedPenguinVersions) { - addNormalizedHashtag(NormalizerHelper.normalize(hashtag, getLocale(), penguinVersion), - penguinVersion); - } - } - - private Map getVersionedTweetFeaturesMap() { - Map versionedMap = - Maps.newEnumMap(PenguinVersion.class); - for (PenguinVersion penguinVersion : getSupportedPenguinVersions()) { - versionedMap.put(penguinVersion, new VersionedTweetFeatures()); - } - - return versionedMap; - } - - public int getNumFavorites() { - return numFavorites; - } - - public int getNumRetweets() { - return numRetweets; - } - - public int getNumReplies() { - return numReplies; - } - - public Set getNamedEntities() { - return namedEntities; - } - - public void addNamedEntity(NamedEntity namedEntity) { - namedEntities.add(namedEntity); - } - - public Set getSpaceIds() { - return spaceIds; - } - - public void setSpaceIds(Set spaceIds) { - this.spaceIds = Sets.newHashSet(spaceIds); - } - - public Set getSpaceAdmins() { - return spaceAdmins; - } - - public void addSpaceAdmin(TwitterMessageUser admin) { - spaceAdmins.add(admin); - } - - public String getSpaceTitle() { - return spaceTitle; - } - - public void setSpaceTitle(String spaceTitle) { - this.spaceTitle = spaceTitle; - } - - private static boolean equals(List l1, List l2) { - EscherbirdAnnotation[] arr1 = l1.toArray(new EscherbirdAnnotation[l1.size()]); - Arrays.sort(arr1); - EscherbirdAnnotation[] arr2 = l1.toArray(new EscherbirdAnnotation[l2.size()]); - Arrays.sort(arr2); - return Arrays.equals(arr1, arr2); - } - - /** - * Compares the given messages using reflection and determines if they're approximately equal. - */ - public static boolean reflectionApproxEquals( - TwitterMessage a, - TwitterMessage b, - List additionalExcludeFields) { - List excludeFields = Lists.newArrayList( - "versionedTweetFeaturesMap", - "geoLocation", - "geoTaggedLocation", - "escherbirdAnnotations" - ); - excludeFields.addAll(additionalExcludeFields); - - return EqualsBuilder.reflectionEquals(a, b, excludeFields) - && GeoObject.approxEquals(a.getGeoLocation(), b.getGeoLocation()) - && GeoObject.approxEquals(a.getGeoTaggedLocation(), b.getGeoTaggedLocation()) - && equals(a.getEscherbirdAnnotations(), b.getEscherbirdAnnotations()); - } - - public static boolean reflectionApproxEquals(TwitterMessage a, TwitterMessage b) { - return reflectionApproxEquals(a, b, Collections.emptyList()); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.docx b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.docx new file mode 100644 index 000000000..0d053e0d3 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.java b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.java deleted file mode 100644 index 6ecd5efd7..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUser.java +++ /dev/null @@ -1,231 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.util.Optional; -import javax.annotation.Nonnull; - -import com.google.common.base.Preconditions; - -import org.apache.commons.lang3.builder.EqualsBuilder; -import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.lucene.analysis.TokenStream; - -import com.twitter.search.common.util.text.TokenizerHelper; - -// Represents from-user, to-user, mentions and audioSpace admins in TwitterMessage. -public final class TwitterMessageUser { - - @Nonnull private final Optional screenName; // a.k.a. user handle or username - @Nonnull private final Optional displayName; - - @Nonnull private Optional tokenizedScreenName; - - @Nonnull private final Optional id; // twitter ID - - public static final class Builder { - @Nonnull private Optional screenName = Optional.empty(); - @Nonnull private Optional displayName = Optional.empty(); - @Nonnull private Optional tokenizedScreenName = Optional.empty(); - @Nonnull private Optional id = Optional.empty(); - - public Builder() { - } - - /** - * Initialized Builder based on an existing TwitterMessageUser - */ - public Builder(TwitterMessageUser user) { - this.screenName = user.screenName; - this.displayName = user.displayName; - this.tokenizedScreenName = user.tokenizedScreenName; - this.id = user.id; - } - - /** - * Initialized Builder screen name (handle/the name following the "@") and do tokenization - * for it. - */ - public Builder withScreenName(Optional newScreenName) { - this.screenName = newScreenName; - if (newScreenName.isPresent()) { - this.tokenizedScreenName = Optional.of( - TokenizerHelper.getNormalizedCamelcaseTokenStream(newScreenName.get())); - } - return this; - } - - /** - * Initialized Builder display name - */ - public Builder withDisplayName(Optional newDisplayName) { - this.displayName = newDisplayName; - return this; - } - - public Builder withId(Optional newId) { - this.id = newId; - return this; - } - - public TwitterMessageUser build() { - return new TwitterMessageUser( - screenName, displayName, tokenizedScreenName, id); - } - } - - /** Creates a TwitterMessageUser instance with the given screen name. */ - public static TwitterMessageUser createWithScreenName(@Nonnull String screenName) { - Preconditions.checkNotNull(screenName, "Don't set a null screen name"); - return new Builder() - .withScreenName(Optional.of(screenName)) - .build(); - } - - /** Creates a TwitterMessageUser instance with the given display name. */ - public static TwitterMessageUser createWithDisplayName(@Nonnull String displayName) { - Preconditions.checkNotNull(displayName, "Don't set a null display name"); - return new Builder() - .withDisplayName(Optional.of(displayName)) - .build(); - } - - /** Creates a TwitterMessageUser instance with the given ID. */ - public static TwitterMessageUser createWithId(long id) { - Preconditions.checkArgument(id >= 0, "Don't sent a negative user ID"); - return new Builder() - .withId(Optional.of(id)) - .build(); - } - - /** Creates a TwitterMessageUser instance with the given parameters. */ - public static TwitterMessageUser createWithNamesAndId( - @Nonnull String screenName, - @Nonnull String displayName, - long id) { - Preconditions.checkNotNull(screenName, "Use another method instead of passing null name"); - Preconditions.checkNotNull(displayName, "Use another method instead of passing null name"); - Preconditions.checkArgument(id >= 0, "Use another method instead of passing negative ID"); - return new Builder() - .withScreenName(Optional.of(screenName)) - .withDisplayName(Optional.of(displayName)) - .withId(Optional.of(id)) - .build(); - } - - /** Creates a TwitterMessageUser instance with the given parameters. */ - public static TwitterMessageUser createWithNames( - @Nonnull String screenName, - @Nonnull String displayName) { - Preconditions.checkNotNull(screenName, "Use another method instead of passing null name"); - Preconditions.checkNotNull(displayName, "Use another method instead of passing null name"); - return new Builder() - .withScreenName(Optional.of(screenName)) - .withDisplayName(Optional.of(displayName)) - .build(); - } - - /** Creates a TwitterMessageUser instance with the given parameters. */ - public static TwitterMessageUser createWithOptionalNamesAndId( - @Nonnull Optional optScreenName, - @Nonnull Optional optDisplayName, - @Nonnull Optional optId) { - Preconditions.checkNotNull(optScreenName, "Pass Optional.absent() instead of null"); - Preconditions.checkNotNull(optDisplayName, "Pass Optional.absent() instead of null"); - Preconditions.checkNotNull(optId, "Pass Optional.absent() instead of null"); - return new Builder() - .withScreenName(optScreenName) - .withDisplayName(optDisplayName) - .withId(optId) - .build(); - } - - private TwitterMessageUser( - @Nonnull Optional screenName, - @Nonnull Optional displayName, - @Nonnull Optional tokenizedScreenName, - @Nonnull Optional id) { - this.screenName = screenName; - this.displayName = displayName; - this.tokenizedScreenName = tokenizedScreenName; - this.id = id; - } - - /** Creates a copy of this TwitterMessageUser instance, with the given screen name. */ - public TwitterMessageUser copyWithScreenName(@Nonnull String newScreenName) { - Preconditions.checkNotNull(newScreenName, "Don't set a null screen name"); - return new Builder(this) - .withScreenName(Optional.of(newScreenName)) - .build(); - } - - /** Creates a copy of this TwitterMessageUser instance, with the given display name. */ - public TwitterMessageUser copyWithDisplayName(@Nonnull String newDisplayName) { - Preconditions.checkNotNull(newDisplayName, "Don't set a null display name"); - return new Builder(this) - .withDisplayName(Optional.of(newDisplayName)) - .build(); - } - - /** Creates a copy of this TwitterMessageUser instance, with the given ID. */ - public TwitterMessageUser copyWithId(long newId) { - Preconditions.checkArgument(newId >= 0, "Don't set a negative user ID"); - return new Builder(this) - .withId(Optional.of(newId)) - .build(); - } - - public Optional getScreenName() { - return screenName; - } - - public Optional getDisplayName() { - return displayName; - } - - public Optional getTokenizedScreenName() { - return tokenizedScreenName; - } - - public Optional getId() { - return id; - } - - @Override - public String toString() { - return "[" + screenName + ", " + displayName + ", " + id + "]"; - } - - /** - * Compares this TwitterMessageUser instance to the given object. - * - * @deprecated deprecated. - */ - @Deprecated - @Override - public boolean equals(Object o) { - if (o == null) { - return false; - } - if (o == this) { - return true; - } - if (o.getClass() != getClass()) { - return false; - } - TwitterMessageUser other = (TwitterMessageUser) o; - return new EqualsBuilder() - .append(screenName, other.screenName) - .append(displayName, other.displayName) - .isEquals(); - } - - /** - * Returns a hash code for this TwitterMessageUser instance. - * - * @deprecated deprecated. - */ - @Deprecated - @Override - public int hashCode() { - return HashCodeBuilder.reflectionHashCode(this); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.docx b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.docx new file mode 100644 index 000000000..f536d2c90 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.java b/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.java deleted file mode 100644 index 7437de7fd..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/TwitterMessageUtil.java +++ /dev/null @@ -1,444 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.text.Normalizer; -import java.util.Map; -import java.util.NavigableMap; -import java.util.Set; -import java.util.TreeMap; -import java.util.concurrent.ConcurrentMap; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.apache.commons.lang.StringUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.transformer.HTMLTagRemovalTransformer; -import com.twitter.common_internal.text.extractor.EmojiExtractor; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; - -public final class TwitterMessageUtil { - private static final Logger LOG = LoggerFactory.getLogger(TwitterMessageUtil.class); - - private TwitterMessageUtil() { - } - - @VisibleForTesting - static final ConcurrentMap COUNTERS_MAP = Maps.newConcurrentMap(); - // We truncate the location string because we used to use a MySQL table to store the geocoding - // information. In the MySQL table, the location string was fix width of 30 characters. - // We have migrated to Manhattan and the location string is no longer limited to 30 character. - // However, in order to correctly lookup location geocode from Manhattan, we still need to - // truncate the location just like we did before. - private static final int MAX_LOCATION_LEN = 30; - - // Note: we strip tags to index source, as typically source contains tags. - // Sometimes we get a source where stripping fails, as the URL in the tag was - // excessively long. We drop these sources, as there is little reason to index them. - private static final int MAX_SOURCE_LEN = 64; - - private static HTMLTagRemovalTransformer tagRemovalTransformer = new HTMLTagRemovalTransformer(); - - private static final String STAT_PREFIX = "twitter_message_"; - - public enum Field { - FROM_USER_DISPLAY_NAME, - NORMALIZED_LOCATION, - ORIG_LOCATION, - ORIG_SOURCE, - SHARED_USER_DISPLAY_NAME, - SOURCE, - TEXT, - TO_USER_SCREEN_NAME; - - public String getNameForStats() { - return name().toLowerCase(); - } - } - - @VisibleForTesting - static class Counters { - private final SearchRateCounter truncatedCounter; - private final SearchRateCounter tweetsWithStrippedSupplementaryCharsCounter; - private final SearchRateCounter strippedSupplementaryCharsCounter; - private final SearchRateCounter nonStrippedEmojiCharsCounter; - private final SearchRateCounter emojisAtTruncateBoundaryCounter; - - Counters(Field field) { - String fieldNameForStats = field.getNameForStats(); - truncatedCounter = SearchRateCounter.export( - STAT_PREFIX + "truncated_" + fieldNameForStats); - tweetsWithStrippedSupplementaryCharsCounter = SearchRateCounter.export( - STAT_PREFIX + "tweets_with_stripped_supplementary_chars_" + fieldNameForStats); - strippedSupplementaryCharsCounter = SearchRateCounter.export( - STAT_PREFIX + "stripped_supplementary_chars_" + fieldNameForStats); - nonStrippedEmojiCharsCounter = SearchRateCounter.export( - STAT_PREFIX + "non_stripped_emoji_chars_" + fieldNameForStats); - emojisAtTruncateBoundaryCounter = SearchRateCounter.export( - STAT_PREFIX + "emojis_at_truncate_boundary_" + fieldNameForStats); - } - - SearchRateCounter getTruncatedCounter() { - return truncatedCounter; - } - - SearchRateCounter getTweetsWithStrippedSupplementaryCharsCounter() { - return tweetsWithStrippedSupplementaryCharsCounter; - } - - SearchRateCounter getStrippedSupplementaryCharsCounter() { - return strippedSupplementaryCharsCounter; - } - - SearchRateCounter getNonStrippedEmojiCharsCounter() { - return nonStrippedEmojiCharsCounter; - } - - SearchRateCounter getEmojisAtTruncateBoundaryCounter() { - return emojisAtTruncateBoundaryCounter; - } - } - - static { - for (Field field : Field.values()) { - COUNTERS_MAP.put(field, new Counters(field)); - } - } - - // Note: the monorail enforces a limit of 15 characters for screen names, - // but some users with up to 20 character names were grandfathered-in. To allow - // those users to be searchable, support up to 20 chars. - private static final int MAX_SCREEN_NAME_LEN = 20; - - // Note: we expect the current limit to be 10K. Also, all supplementary unicode characters (with - // the exception of emojis, maybe) will be removed and not counted as total length. Added alert - // for text truncation rate as well. SEARCH-9512 - private static final int MAX_TWEET_TEXT_LEN = 10000; - - @VisibleForTesting - static final SearchRateCounter FILTERED_NO_STATUS_ID = - SearchRateCounter.export(STAT_PREFIX + "filtered_no_status_id"); - @VisibleForTesting - static final SearchRateCounter FILTERED_NO_FROM_USER = - SearchRateCounter.export(STAT_PREFIX + "filtered_no_from_user"); - @VisibleForTesting - static final SearchRateCounter FILTERED_LONG_SCREEN_NAME = - SearchRateCounter.export(STAT_PREFIX + "filtered_long_screen_name"); - @VisibleForTesting - static final SearchRateCounter FILTERED_NO_TEXT = - SearchRateCounter.export(STAT_PREFIX + "filtered_no_text"); - @VisibleForTesting - static final SearchRateCounter FILTERED_NO_DATE = - SearchRateCounter.export(STAT_PREFIX + "filtered_no_date"); - @VisibleForTesting - static final SearchRateCounter NULLCAST_TWEET = - SearchRateCounter.export(STAT_PREFIX + "filter_nullcast_tweet"); - @VisibleForTesting - static final SearchRateCounter NULLCAST_TWEET_ACCEPTED = - SearchRateCounter.export(STAT_PREFIX + "nullcast_tweet_accepted"); - @VisibleForTesting - static final SearchRateCounter INCONSISTENT_TWEET_ID_AND_CREATED_AT = - SearchRateCounter.export(STAT_PREFIX + "inconsistent_tweet_id_and_created_at_ms"); - - /** Strips the given source from the message with the given ID. */ - private static String stripSource(String source, Long messageId) { - if (source == null) { - return null; - } - // Always strip emojis from sources: they don't really make sense in this field. - String strippedSource = stripSupplementaryChars( - tagRemovalTransformer.transform(source).toString(), Field.SOURCE, true); - if (strippedSource.length() > MAX_SOURCE_LEN) { - LOG.warn("Message " - + messageId - + " contains stripped source that exceeds MAX_SOURCE_LEN. Removing: " - + strippedSource); - COUNTERS_MAP.get(Field.SOURCE).getTruncatedCounter().increment(); - return null; - } - return strippedSource; - } - - /** - * Strips and truncates the location of the message with the given ID. - * - */ - private static String stripAndTruncateLocation(String location) { - // Always strip emojis from locations: they don't really make sense in this field. - String strippedLocation = stripSupplementaryChars(location, Field.NORMALIZED_LOCATION, true); - return truncateString(strippedLocation, MAX_LOCATION_LEN, Field.NORMALIZED_LOCATION, true); - } - - /** - * Sets the origSource and strippedSource fields on a TwitterMessage - * - */ - public static void setSourceOnMessage(TwitterMessage message, String modifiedDeviceSource) { - // Always strip emojis from sources: they don't really make sense in this field. - message.setOrigSource(stripSupplementaryChars(modifiedDeviceSource, Field.ORIG_SOURCE, true)); - message.setStrippedSource(stripSource(modifiedDeviceSource, message.getId())); - } - - /** - * Sets the origLocation to the stripped location, and sets - * the truncatedNormalizedLocation to the truncated and normalized location. - */ - public static void setAndTruncateLocationOnMessage( - TwitterMessage message, - String newOrigLocation) { - // Always strip emojis from locations: they don't really make sense in this field. - message.setOrigLocation(stripSupplementaryChars(newOrigLocation, Field.ORIG_LOCATION, true)); - - // Locations in the new locations table require additional normalization. It can also change - // the length of the string, so we must do this before truncation. - if (newOrigLocation != null) { - String normalized = - Normalizer.normalize(newOrigLocation, Normalizer.Form.NFKC).toLowerCase().trim(); - message.setTruncatedNormalizedLocation(stripAndTruncateLocation(normalized)); - } else { - message.setTruncatedNormalizedLocation(null); - } - } - - /** - * Validates the given TwitterMessage. - * - * @param message The message to validate. - * @param stripEmojisForFields The set of fields for which emojis should be stripped. - * @param acceptNullcastMessage Determines if this message should be accepted, if it's a nullcast - * message. - * @return {@code true} if the given message is valid; {@code false} otherwise. - */ - public static boolean validateTwitterMessage( - TwitterMessage message, - Set stripEmojisForFields, - boolean acceptNullcastMessage) { - if (message.getNullcast()) { - NULLCAST_TWEET.increment(); - if (!acceptNullcastMessage) { - LOG.info("Dropping nullcasted message " + message.getId()); - return false; - } - NULLCAST_TWEET_ACCEPTED.increment(); - } - - if (!message.getFromUserScreenName().isPresent() - || StringUtils.isBlank(message.getFromUserScreenName().get())) { - LOG.error("Message " + message.getId() + " contains no from user. Skipping."); - FILTERED_NO_FROM_USER.increment(); - return false; - } - String fromUserScreenName = message.getFromUserScreenName().get(); - - if (fromUserScreenName.length() > MAX_SCREEN_NAME_LEN) { - LOG.warn("Message " + message.getId() + " has a user screen name longer than " - + MAX_SCREEN_NAME_LEN + " characters: " + message.getFromUserScreenName() - + ". Skipping."); - FILTERED_LONG_SCREEN_NAME.increment(); - return false; - } - - // Remove supplementary characters and truncate these text fields. - if (message.getFromUserDisplayName().isPresent()) { - message.setFromUserDisplayName(stripSupplementaryChars( - message.getFromUserDisplayName().get(), - Field.FROM_USER_DISPLAY_NAME, - stripEmojisForFields.contains(Field.FROM_USER_DISPLAY_NAME))); - } - if (message.getToUserScreenName().isPresent()) { - String strippedToUserScreenName = stripSupplementaryChars( - message.getToUserLowercasedScreenName().get(), - Field.TO_USER_SCREEN_NAME, - stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME)); - message.setToUserScreenName( - truncateString( - strippedToUserScreenName, - MAX_SCREEN_NAME_LEN, - Field.TO_USER_SCREEN_NAME, - stripEmojisForFields.contains(Field.TO_USER_SCREEN_NAME))); - } - - String strippedText = stripSupplementaryChars( - message.getText(), - Field.TEXT, - stripEmojisForFields.contains(Field.TEXT)); - message.setText(truncateString( - strippedText, - MAX_TWEET_TEXT_LEN, - Field.TEXT, - stripEmojisForFields.contains(Field.TEXT))); - - if (StringUtils.isBlank(message.getText())) { - FILTERED_NO_TEXT.increment(); - return false; - } - - if (message.getDate() == null) { - LOG.error("Message " + message.getId() + " contains no date. Skipping."); - FILTERED_NO_DATE.increment(); - return false; - } - - if (message.isRetweet()) { - return validateRetweetMessage(message.getRetweetMessage(), stripEmojisForFields); - } - - // Track if both the snowflake ID and created at timestamp are consistent. - if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(message.getId(), message.getDate())) { - LOG.error("Found inconsistent tweet ID and created at timestamp: [messageID=" - + message.getId() + "], [messageDate=" + message.getDate() + "]."); - INCONSISTENT_TWEET_ID_AND_CREATED_AT.increment(); - } - - return true; - } - - private static boolean validateRetweetMessage( - TwitterRetweetMessage message, Set stripEmojisForFields) { - if (message.getSharedId() == null || message.getRetweetId() == null) { - LOG.error("Retweet Message contains a null twitter id. Skipping."); - FILTERED_NO_STATUS_ID.increment(); - return false; - } - - if (message.getSharedDate() == null) { - LOG.error("Retweet Message " + message.getRetweetId() + " contains no date. Skipping."); - return false; - } - - // Remove supplementary characters from these text fields. - message.setSharedUserDisplayName(stripSupplementaryChars( - message.getSharedUserDisplayName(), - Field.SHARED_USER_DISPLAY_NAME, - stripEmojisForFields.contains(Field.SHARED_USER_DISPLAY_NAME))); - - return true; - } - - /** - * Strips non indexable chars from the text. - * - * Returns the resulting string, which may be the same object as the text argument when - * no stripping or truncation is necessary. - * - * Non-indexed characters are "supplementary unicode" that are not emojis. Note that - * supplementary unicode are still characters that seem worth indexing, as many characters - * in CJK languages are supplementary. However this would make the size of our index - * explode (~186k supplementary characters exist), so it's not feasible. - * - * @param text The text to strip - * @param field The field this text is from - * @param stripSupplementaryEmojis Whether or not to strip supplementary emojis. Note that this - * parameter name isn't 100% accurate. This parameter is meant to replicate behavior prior to - * adding support for *not* stripping supplementary emojis. The prior behavior would turn an - * emoji such as a keycap "1\uFE0F\u20E3" (http://www.iemoji.com/view/emoji/295/symbols/keycap-1) - * into just '1'. So the keycap emoji is not completely stripped, only the portion after the '1'. - * - */ - @VisibleForTesting - public static String stripSupplementaryChars( - String text, - Field field, - boolean stripSupplementaryEmojis) { - if (text == null || text.isEmpty()) { - return text; - } - - // Initialize an empty map so that if we choose not to strip emojis, - // then no emojipositions will be found and we don't need a null - // check before checking if an emoji is at a certain spot. - NavigableMap emojiPositions = new TreeMap<>(); - - if (!stripSupplementaryEmojis) { - emojiPositions = EmojiExtractor.getEmojiPositions(text); - } - - StringBuilder strippedTextBuilder = new StringBuilder(); - int sequenceStart = 0; - int i = 0; - while (i < text.length()) { - if (Character.isSupplementaryCodePoint(text.codePointAt(i))) { - // Check if this supplementary character is an emoji - if (!emojiPositions.containsKey(i)) { - // It's not an emoji, or we want to strip emojis, so strip it - - // text[i] and text[i + 1] are part of a supplementary code point. - strippedTextBuilder.append(text.substring(sequenceStart, i)); - sequenceStart = i + 2; // skip 2 chars - i = sequenceStart; - COUNTERS_MAP.get(field).getStrippedSupplementaryCharsCounter().increment(); - } else { - // It's an emoji, keep it - i += emojiPositions.get(i); - COUNTERS_MAP.get(field).getNonStrippedEmojiCharsCounter().increment(); - } - } else { - ++i; - } - } - if (sequenceStart < text.length()) { - strippedTextBuilder.append(text.substring(sequenceStart)); - } - - String strippedText = strippedTextBuilder.toString(); - if (strippedText.length() < text.length()) { - COUNTERS_MAP.get(field).getTweetsWithStrippedSupplementaryCharsCounter().increment(); - } - return strippedText; - } - - /** - * Truncates the given string to the given length. - * - * Note that we are truncating based on the # of UTF-16 characters a given emoji takes up. - * So if a single emoji takes up 4 UTF-16 characters, that counts as 4 for the truncation, - * not just 1. - * - * @param text The text to truncate - * @param maxLength The maximum length of the string after truncation - * @param field The field from which this string cames - * @param splitEmojisAtMaxLength If true, don't worry about emojis and just truncate at maxLength, - * potentially splitting them. If false, truncate before the emoji if truncating at maxLength - * would cause the emoji to be split. - */ - @VisibleForTesting - static String truncateString( - String text, - int maxLength, - Field field, - boolean splitEmojisAtMaxLength) { - Preconditions.checkArgument(maxLength > 0); - - if ((text == null) || (text.length() <= maxLength)) { - return text; - } - - int truncatePoint = maxLength; - NavigableMap emojiPositions; - // If we want to consider emojis we should not strip on an emoji boundary. - if (!splitEmojisAtMaxLength) { - emojiPositions = EmojiExtractor.getEmojiPositions(text); - - // Get the last emoji before maxlength. - Map.Entry lastEmojiBeforeMaxLengthEntry = - emojiPositions.lowerEntry(maxLength); - - if (lastEmojiBeforeMaxLengthEntry != null) { - int lowerEmojiEnd = lastEmojiBeforeMaxLengthEntry.getKey() - + lastEmojiBeforeMaxLengthEntry.getValue(); - - // If the last emoji would be truncated, truncate before the last emoji. - if (lowerEmojiEnd > truncatePoint) { - truncatePoint = lastEmojiBeforeMaxLengthEntry.getKey(); - COUNTERS_MAP.get(field).getEmojisAtTruncateBoundaryCounter().increment(); - } - } - } - - COUNTERS_MAP.get(field).getTruncatedCounter().increment(); - return text.substring(0, truncatePoint); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.docx b/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.docx new file mode 100644 index 000000000..820720093 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.java b/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.java deleted file mode 100644 index 4e9f9b88f..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/TwitterQuotedMessage.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import org.apache.commons.lang3.builder.EqualsBuilder; -import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.commons.lang3.builder.ToStringBuilder; - -/** - * The object for quoted message - */ -public class TwitterQuotedMessage { - private final long quotedStatusId; - private final long quotedUserId; - - public TwitterQuotedMessage(long quotedStatusId, long quotedUserId) { - this.quotedStatusId = quotedStatusId; - this.quotedUserId = quotedUserId; - } - - public long getQuotedStatusId() { - return quotedStatusId; - } - - public long getQuotedUserId() { - return quotedUserId; - } - - @Override - public boolean equals(Object o) { - return EqualsBuilder.reflectionEquals(this, o); - } - - @Override - public int hashCode() { - return HashCodeBuilder.reflectionHashCode(this); - } - - @Override - public String toString() { - return ToStringBuilder.reflectionToString(this); - } -} diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.docx b/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.docx new file mode 100644 index 000000000..85a259f91 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.docx differ diff --git a/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.java b/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.java deleted file mode 100644 index e2aac7bc2..000000000 --- a/src/java/com/twitter/search/common/relevance/entities/TwitterRetweetMessage.java +++ /dev/null @@ -1,80 +0,0 @@ -package com.twitter.search.common.relevance.entities; - -import java.util.Date; - -import org.apache.commons.lang3.builder.EqualsBuilder; -import org.apache.commons.lang3.builder.HashCodeBuilder; -import org.apache.commons.lang3.builder.ToStringBuilder; - -public class TwitterRetweetMessage { - // based on original tweet - private Long sharedId; - - // TwitterMessageUtil checks them - private String sharedUserDisplayName; - private Long sharedUserTwitterId = TwitterMessage.LONG_FIELD_NOT_PRESENT; - - private Date sharedDate = null; - - // based on retweet - private Long retweetId; - - public Long getRetweetId() { - return retweetId; - } - - public void setRetweetId(Long retweetId) { - this.retweetId = retweetId; - } - - public Long getSharedId() { - return sharedId; - } - - public void setSharedId(Long sharedId) { - this.sharedId = sharedId; - } - - public String getSharedUserDisplayName() { - return sharedUserDisplayName; - } - - public void setSharedUserDisplayName(String sharedUserDisplayName) { - this.sharedUserDisplayName = sharedUserDisplayName; - } - - public Long getSharedUserTwitterId() { - return sharedUserTwitterId; - } - - public boolean hasSharedUserTwitterId() { - return sharedUserTwitterId != TwitterMessage.LONG_FIELD_NOT_PRESENT; - } - - public void setSharedUserTwitterId(Long sharedUserTwitterId) { - this.sharedUserTwitterId = sharedUserTwitterId; - } - - public Date getSharedDate() { - return sharedDate; - } - - public void setSharedDate(Date sharedDate) { - this.sharedDate = sharedDate; - } - - @Override - public boolean equals(Object o) { - return EqualsBuilder.reflectionEquals(this, o); - } - - @Override - public int hashCode() { - return HashCodeBuilder.reflectionHashCode(this); - } - - @Override - public String toString() { - return ToStringBuilder.reflectionToString(this); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/AgeDecay.docx b/src/java/com/twitter/search/common/relevance/features/AgeDecay.docx new file mode 100644 index 000000000..d2c7133df Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/AgeDecay.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/AgeDecay.java b/src/java/com/twitter/search/common/relevance/features/AgeDecay.java deleted file mode 100644 index 910eaae40..000000000 --- a/src/java/com/twitter/search/common/relevance/features/AgeDecay.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; - -/** - * Utility to compute an age decay multiplier based on a sigmoid function. - */ -public class AgeDecay { - public static final double SLOPE_COEFF = 4.0; - public static final double LN_HALF = Math.log(0.5); - public final double halflife; - public final double maxBoost; - public final double base; - public final double slope; - - /** Creates a new AgeDecay instance. */ - public AgeDecay(double base, double maxBoost, double halflife, double slope) { - this.maxBoost = maxBoost; - this.base = base; - this.halflife = halflife; - this.slope = slope; - } - - /** Creates a new AgeDecay instance. */ - public AgeDecay(double base, double halflife, double slope) { - this(base, 1.0, halflife, slope); - } - - /** - * Compute the age decay, using the provided halflife. - * - * @param tweetAge The tweet age. - * @param unit The unit of the tweetAge parameter. - */ - public double getAgeDecayMultiplier(long tweetAge, TimeUnit unit) { - return getAgeDecayMultiplier(TimeUnit.SECONDS.convert(tweetAge, unit)); - } - - /** - * Compute the age decay, assuming the halflife in the constructor is in minutes. - * @param ageInSeconds the age in seconds - */ - public double getAgeDecayMultiplier(long ageInSeconds) { - long minutesSinceTweet = TimeUnit.MINUTES.convert(ageInSeconds, TimeUnit.SECONDS); - return compute(minutesSinceTweet); - } - - /** - * Compute age decay given an age, the age has to be in the same unit as halflife, which you - * construct the object with. - */ - public double compute(double age) { - return compute(base, maxBoost, halflife, slope, age); - } - - /** - * Compute the age decay given all parameters. Use this if you don't need to reuse an AgeDecay - * object. - */ - public static double compute( - double base, double maxBoost, double halflife, double slope, double age) { - return base + ((maxBoost - base) / (1 + Math.exp(slope * (age - halflife)))); - } - - public static double compute( - double base, double maxBoost, double halflife, double age) { - Preconditions.checkArgument(halflife != 0); - return compute(base, maxBoost, halflife, SLOPE_COEFF / halflife, age); - } - - /** - * Another nicer exponential decay function. Returns a value in (0, 1] - */ - public static double computeExponential(double halflife, double exp, double age) { - return Math.exp(LN_HALF * Math.pow(age, exp) / Math.pow(halflife, exp)); - } - - /** - * Exponential decay with remapping of the value from (0,1] to (min,max] - */ - public static double computeExponential(double halflife, double exp, double age, - double minBoost, double maxBoost) { - double decay = computeExponential(halflife, exp, age); // in (0, 1] - return (maxBoost - minBoost) * decay + minBoost; - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/BUILD b/src/java/com/twitter/search/common/relevance/features/BUILD deleted file mode 100644 index f93592fd9..000000000 --- a/src/java/com/twitter/search/common/relevance/features/BUILD +++ /dev/null @@ -1,24 +0,0 @@ -# Java library for tweet features and utilities. -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/lang", - "src/thrift/com/twitter/search/common:constants-java", - "src/thrift/com/twitter/search/common:features-java", - "src/thrift/com/twitter/search/common:schema-java", - ], -) diff --git a/src/java/com/twitter/search/common/relevance/features/BUILD.docx b/src/java/com/twitter/search/common/relevance/features/BUILD.docx new file mode 100644 index 000000000..6ce8ecf05 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.docx b/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.docx new file mode 100644 index 000000000..71b689710 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.java b/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.java deleted file mode 100644 index 79afe8d2f..000000000 --- a/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.java +++ /dev/null @@ -1,232 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.io.IOException; -import java.util.Map; -import java.util.function.Function; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType; - -public class EarlybirdDocumentFeatures { - private static final Map FEATURE_CONFIG_IS_NULL_MAP = Maps.newHashMap(); - private static final Map FEATURE_OUTPUT_TYPE_IS_NULL_MAP = - Maps.newHashMap(); - private static final Map NO_SCHEMA_FIELD_FOR_FEATURE_MAP = - Maps.newHashMap(); - private static final String FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN = - "null_feature_config_for_feature_id_%d"; - private static final String FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN = - "null_output_type_for_feature_id_%d"; - private static final String NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN = - "no_schema_field_for_feature_id_%d"; - private static final SearchCounter UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER = - SearchCounter.export("unknown_feature_output_type"); - - private final Map numericDocValues = Maps.newHashMap(); - private final LeafReader leafReader; - private int docId = -1; - - /** - * Creates a new EarlybirdDocumentFeatures instance that will return feature values based on the - * NumericDocValues stored in the given LeafReader for the given document. - */ - public EarlybirdDocumentFeatures(LeafReader leafReader) { - this.leafReader = Preconditions.checkNotNull(leafReader); - } - - /** - * Advances this instance to the given doc ID. The new doc ID must be greater than or equal to the - * current doc ID stored in this instance. - */ - public void advance(int target) { - Preconditions.checkArgument( - target >= 0, - "Target (%s) cannot be negative.", - target); - Preconditions.checkArgument( - target >= docId, - "Target (%s) smaller than current doc ID (%s).", - target, - docId); - Preconditions.checkArgument( - target < leafReader.maxDoc(), - "Target (%s) cannot be greater than or equal to the max doc ID (%s).", - target, - leafReader.maxDoc()); - docId = target; - } - - /** - * Returns the feature value for the given field. - */ - public long getFeatureValue(EarlybirdFieldConstant field) throws IOException { - // The index might not have a NumericDocValues instance for this feature. - // This might happen if we dynamically update the feature schema, for example. - // - // Cache the NumericDocValues instances for all accessed features, even if they're null. - String fieldName = field.getFieldName(); - NumericDocValues docValues; - if (numericDocValues.containsKey(fieldName)) { - docValues = numericDocValues.get(fieldName); - } else { - docValues = leafReader.getNumericDocValues(fieldName); - numericDocValues.put(fieldName, docValues); - } - return docValues != null && docValues.advanceExact(docId) ? docValues.longValue() : 0L; - } - - /** - * Determines if the given flag is set. - */ - public boolean isFlagSet(EarlybirdFieldConstant field) throws IOException { - return getFeatureValue(field) != 0; - } - - /** - * Returns the unnormalized value for the given field. - */ - public double getUnnormalizedFeatureValue(EarlybirdFieldConstant field) throws IOException { - long featureValue = getFeatureValue(field); - ThriftFeatureNormalizationType normalizationType = field.getFeatureNormalizationType(); - if (normalizationType == null) { - normalizationType = ThriftFeatureNormalizationType.NONE; - } - switch (normalizationType) { - case NONE: - return featureValue; - case LEGACY_BYTE_NORMALIZER: - return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormLowerBound((byte) featureValue); - case LEGACY_BYTE_NORMALIZER_WITH_LOG2: - return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormAndLog2((byte) featureValue); - case SMART_INTEGER_NORMALIZER: - return MutableFeatureNormalizers.SMART_INTEGER_NORMALIZER.unnormUpperBound( - (byte) featureValue); - case PREDICTION_SCORE_NORMALIZER: - return IntNormalizers.PREDICTION_SCORE_NORMALIZER.denormalize((int) featureValue); - default: - throw new IllegalArgumentException( - "Unsupported normalization type " + normalizationType + " for feature " - + field.getFieldName()); - } - } - - /** - * Creates a ThriftSearchResultFeatures instance populated with values for all available features - * that have a non-zero value set. - */ - public ThriftSearchResultFeatures getSearchResultFeatures(ImmutableSchemaInterface schema) - throws IOException { - return getSearchResultFeatures(schema, (featureId) -> true); - } - - /** - * Creates a ThriftSearchResultFeatures instance populated with values for all available features - * that have a non-zero value set. - * - * @param schema The schema. - * @param shouldCollectFeatureId A predicate that determines which features should be collected. - */ - public ThriftSearchResultFeatures getSearchResultFeatures( - ImmutableSchemaInterface schema, - Function shouldCollectFeatureId) throws IOException { - Map boolValues = Maps.newHashMap(); - Map doubleValues = Maps.newHashMap(); - Map intValues = Maps.newHashMap(); - Map longValues = Maps.newHashMap(); - - Map idToFeatureConfigMap = schema.getFeatureIdToFeatureConfig(); - for (int featureId : schema.getSearchFeatureSchema().getEntries().keySet()) { - if (!shouldCollectFeatureId.apply(featureId)) { - continue; - } - - FeatureConfiguration featureConfig = idToFeatureConfigMap.get(featureId); - if (featureConfig == null) { - FEATURE_CONFIG_IS_NULL_MAP.computeIfAbsent( - featureId, - (fId) -> SearchCounter.export( - String.format(FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN, fId))).increment(); - continue; - } - - ThriftCSFType outputType = featureConfig.getOutputType(); - if (outputType == null) { - FEATURE_OUTPUT_TYPE_IS_NULL_MAP.computeIfAbsent( - featureId, - (fId) -> SearchCounter.export( - String.format(FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN, fId))).increment(); - continue; - } - - if (!EarlybirdFieldConstants.hasFieldConstant(featureId)) { - // Should only happen for features that were dynamically added to the schema. - NO_SCHEMA_FIELD_FOR_FEATURE_MAP.computeIfAbsent( - featureId, - (fId) -> SearchCounter.export( - String.format(NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN, fId))).increment(); - continue; - } - - EarlybirdFieldConstant field = EarlybirdFieldConstants.getFieldConstant(featureId); - switch (outputType) { - case BOOLEAN: - if (isFlagSet(field)) { - boolValues.put(featureId, true); - } - break; - case BYTE: - // It's unclear why we don't add this feature to a separate byteValues map... - byte byteFeatureValue = (byte) getFeatureValue(field); - if (byteFeatureValue != 0) { - intValues.put(featureId, (int) byteFeatureValue); - } - break; - case INT: - int intFeatureValue = (int) getFeatureValue(field); - if (intFeatureValue != 0) { - intValues.put(featureId, intFeatureValue); - } - break; - case LONG: - long longFeatureValue = getFeatureValue(field); - if (longFeatureValue != 0) { - longValues.put(featureId, longFeatureValue); - } - break; - case FLOAT: - // It's unclear why we don't add this feature to a separate floatValues map... - float floatFeatureValue = (float) getFeatureValue(field); - if (floatFeatureValue != 0) { - doubleValues.put(featureId, (double) floatFeatureValue); - } - break; - case DOUBLE: - double doubleFeatureValue = getUnnormalizedFeatureValue(field); - if (doubleFeatureValue != 0) { - doubleValues.put(featureId, doubleFeatureValue); - } - break; - default: - UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER.increment(); - } - } - - return new ThriftSearchResultFeatures() - .setBoolValues(boolValues) - .setIntValues(intValues) - .setLongValues(longValues) - .setDoubleValues(doubleValues); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/FeatureSink.docx b/src/java/com/twitter/search/common/relevance/features/FeatureSink.docx new file mode 100644 index 000000000..71143f941 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/FeatureSink.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/FeatureSink.java b/src/java/com/twitter/search/common/relevance/features/FeatureSink.java deleted file mode 100644 index 63be4bdad..000000000 --- a/src/java/com/twitter/search/common/relevance/features/FeatureSink.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Map; - -import com.google.common.collect.Maps; - -import com.twitter.search.common.encoding.features.IntegerEncodedFeatures; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; - -/** - * FeatureSink is used to write features based on feature configuration or feature name. After - * all feature is written, the class can return the base field integer array values. - * - * This class is not thread-safe. - */ -public class FeatureSink { - private ImmutableSchemaInterface schema; - private final Map encodedFeatureMap; - - /** Creates a new FeatureSink instance. */ - public FeatureSink(ImmutableSchemaInterface schema) { - this.schema = schema; - this.encodedFeatureMap = Maps.newHashMap(); - } - - private IntegerEncodedFeatures getFeatures(String baseFieldName) { - IntegerEncodedFeatures features = encodedFeatureMap.get(baseFieldName); - if (features == null) { - features = EarlybirdEncodedFeatures.newEncodedTweetFeatures(schema, baseFieldName); - encodedFeatureMap.put(baseFieldName, features); - } - return features; - } - - /** Sets the given numeric value for the field. */ - public FeatureSink setNumericValue(EarlybirdFieldConstant field, int value) { - return setNumericValue(field.getFieldName(), value); - } - - /** Sets the given numeric value for the feature with the given name. */ - public FeatureSink setNumericValue(String featureName, int value) { - final FeatureConfiguration featureConfig = schema.getFeatureConfigurationByName(featureName); - if (featureConfig != null) { - getFeatures(featureConfig.getBaseField()).setFeatureValue(featureConfig, value); - } - return this; - } - - /** Sets the given boolean value for the given field. */ - public FeatureSink setBooleanValue(EarlybirdFieldConstant field, boolean value) { - return setBooleanValue(field.getFieldName(), value); - } - - /** Sets the given boolean value for the feature with the given name. */ - public FeatureSink setBooleanValue(String featureName, boolean value) { - final FeatureConfiguration featureConfig = schema.getFeatureConfigurationByName(featureName); - if (featureConfig != null) { - getFeatures(featureConfig.getBaseField()).setFlagValue(featureConfig, value); - } - return this; - } - - /** Returns the features for the given base field. */ - public IntegerEncodedFeatures getFeaturesForBaseField(EarlybirdFieldConstant baseField) { - return getFeaturesForBaseField(baseField.getFieldName()); - } - - /** Returns the features for the given base field. */ - public IntegerEncodedFeatures getFeaturesForBaseField(String baseFieldName) { - return encodedFeatureMap.get(baseFieldName); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/IntNormalizers.docx b/src/java/com/twitter/search/common/relevance/features/IntNormalizers.docx new file mode 100644 index 000000000..0685378f2 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/IntNormalizers.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/IntNormalizers.java b/src/java/com/twitter/search/common/relevance/features/IntNormalizers.java deleted file mode 100644 index 5dc3d5ddd..000000000 --- a/src/java/com/twitter/search/common/relevance/features/IntNormalizers.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.concurrent.TimeUnit; - -import com.twitter.search.common.encoding.features.ByteNormalizer; -import com.twitter.search.common.encoding.features.IntNormalizer; -import com.twitter.search.common.encoding.features.PredictionScoreNormalizer; - -/** - * Int value normalizers used to push feature values into earlybird db. For the - * 8-bit feature types, this class wraps the - * com.twitter.search.common.relevance.features.MutableFeatureNormalizers - */ -public final class IntNormalizers { - private IntNormalizers() { - } - - public static final IntNormalizer LEGACY_NORMALIZER = - val -> ByteNormalizer.unsignedByteToInt( - MutableFeatureNormalizers.BYTE_NORMALIZER.normalize(val)); - - public static final IntNormalizer SMART_INTEGER_NORMALIZER = - val -> ByteNormalizer.unsignedByteToInt( - MutableFeatureNormalizers.SMART_INTEGER_NORMALIZER.normalize(val)); - - // The PARUS_SCORE feature is deprecated and is never set in our indexes. However, we still need - // this normalizer for now, because some models do not work properly with "missing" features, so - // for now we still need to set the PARUS_SCORE feature to 0. - public static final IntNormalizer PARUS_SCORE_NORMALIZER = val -> 0; - - public static final IntNormalizer BOOLEAN_NORMALIZER = - val -> val == 0 ? 0 : 1; - - public static final IntNormalizer TIMESTAMP_SEC_TO_HR_NORMALIZER = - val -> (int) TimeUnit.SECONDS.toHours((long) val); - - public static final PredictionScoreNormalizer PREDICTION_SCORE_NORMALIZER = - new PredictionScoreNormalizer(3); -} diff --git a/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.docx b/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.docx new file mode 100644 index 000000000..d6c788c49 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.java b/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.java deleted file mode 100644 index b44414ea3..000000000 --- a/src/java/com/twitter/search/common/relevance/features/MutableFeatureNormalizers.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import com.twitter.search.common.encoding.features.ByteNormalizer; -import com.twitter.search.common.encoding.features.SingleBytePositiveFloatNormalizer; -import com.twitter.search.common.encoding.features.SmartIntegerNormalizer; - -/** - * Byte value normalizers used to push feature values into earlybird db. - */ -public abstract class MutableFeatureNormalizers { - // The max value we support in SMART_INTEGER_NORMALIZER below, this should be enough for all kinds - // of engagements we see on Twitter, anything larger than this would be represented as the same - // value (255, if using a byte). - private static final int MAX_COUNTER_VALUE_SUPPORTED = 50000000; - - // Avoid using this normalizer for procesing any new data, always use SmartIntegerNormalizer - // below. - public static final SingleBytePositiveFloatNormalizer BYTE_NORMALIZER = - new SingleBytePositiveFloatNormalizer(); - - public static final ByteNormalizer SMART_INTEGER_NORMALIZER = - new SmartIntegerNormalizer(MAX_COUNTER_VALUE_SUPPORTED, 8); -} diff --git a/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.docx b/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.docx new file mode 100644 index 000000000..928bebd45 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.java b/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.java deleted file mode 100644 index d46c183fa..000000000 --- a/src/java/com/twitter/search/common/relevance/features/QueryFeatureType.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.twitter.search.common.relevance.features; - -/** - * An enum to hold different types of query-specific features (these are not indexed in Earlybird) - */ -public enum QueryFeatureType { - SOCIAL_ENGAGEMENTS, - CLICKS -} diff --git a/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.docx b/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.docx new file mode 100644 index 000000000..ed04469d7 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.java b/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.java deleted file mode 100644 index abae2e9a8..000000000 --- a/src/java/com/twitter/search/common/relevance/features/RelevanceSignalConstants.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.common.relevance.features; - -/** - * Defines relevance related constants that are used at both ingestion time and - * earlybird scoring time. - */ -public final class RelevanceSignalConstants { - // user reputation - public static final byte UNSET_REPUTATION_SENTINEL = Byte.MIN_VALUE; - public static final byte MAX_REPUTATION = 100; - public static final byte MIN_REPUTATION = 0; - // below overall CDF of ~10%, default value for new users, - // given as a goodwill value in case it is unset - public static final byte GOODWILL_REPUTATION = 17; - - // text score - public static final byte UNSET_TEXT_SCORE_SENTINEL = Byte.MIN_VALUE; - // roughly at overall CDF of ~10%, given as a goodwill value in case it is unset - public static final byte GOODWILL_TEXT_SCORE = 19; - - private RelevanceSignalConstants() { - } - - // check whether the specified user rep value is valid - public static boolean isValidUserReputation(int userRep) { - return userRep != UNSET_REPUTATION_SENTINEL - && userRep >= MIN_REPUTATION - && userRep < MAX_REPUTATION; - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/ScoringUtils.docx b/src/java/com/twitter/search/common/relevance/features/ScoringUtils.docx new file mode 100644 index 000000000..6c9d3a956 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/ScoringUtils.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/ScoringUtils.java b/src/java/com/twitter/search/common/relevance/features/ScoringUtils.java deleted file mode 100644 index 7fc7a502f..000000000 --- a/src/java/com/twitter/search/common/relevance/features/ScoringUtils.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import com.google.common.base.Preconditions; - -/** - * Scoring utilities - */ -public final class ScoringUtils { - private ScoringUtils() { } - - /** - * normalize a positive value of arbitrary range to [0.0, 1.0], with a slop - * @param value the value to normalize. - * @param halfval a reference value that will be normalized to 0.5 - * @param exp an exponential parameter (must be positive) to control the converging speed, - * the smaller the value the faster it reaches the halfval but slower it reaches the maximum. - * @return a normalized value - */ - public static float normalize(float value, double halfval, double exp) { - Preconditions.checkArgument(exp > 0.0 && exp <= 1.0); - return (float) (Math.pow(value, exp) / (Math.pow(value, exp) + Math.pow(halfval, exp))); - } - -} diff --git a/src/java/com/twitter/search/common/relevance/features/TermVector.docx b/src/java/com/twitter/search/common/relevance/features/TermVector.docx new file mode 100644 index 000000000..023fc55be Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TermVector.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TermVector.java b/src/java/com/twitter/search/common/relevance/features/TermVector.java deleted file mode 100644 index 75e7982e2..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TermVector.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Map; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import com.twitter.common.base.Function; - -/** - * Class to keep String-Double of term vectors - * It can calculate magnitude, dot product, and cosine similarity - */ -public class TermVector { - private static final double MIN_MAGNITUDE = 0.00001; - private final double magnitude; - private final ImmutableMap termWeights; - - /** Creates a new TermVector instance. */ - public TermVector(Map termWeights) { - this.termWeights = ImmutableMap.copyOf(termWeights); - double sum = 0.0; - for (Map.Entry entry : termWeights.entrySet()) { - double value = entry.getValue(); - sum += value * value; - } - magnitude = Math.sqrt(sum); - } - - public ImmutableMap getTermWeights() { - return termWeights; - } - - public double getMagnitude() { - return magnitude; - } - - /** - * Normalize term vector into unit magnitude - * @return the unit normalized TermVector with magnitude equals 1 - * return null if magnitude is very low - */ - public TermVector getUnitNormalized() { - if (magnitude < MIN_MAGNITUDE) { - return null; - } - return new TermVector( - Maps.transformValues(termWeights, (Function) weight -> weight / magnitude)); - } - - /** - * Calculate the dot product with another term vector - * @param other the other term vector - * @return the dot product of the two vectors - */ - public double getDotProduct(TermVector other) { - double sum = 0.0; - for (Map.Entry entry : termWeights.entrySet()) { - Double value2 = other.termWeights.get(entry.getKey()); - if (value2 != null) { - sum += entry.getValue() * value2; - } - } - return sum; - } - - /** - * Calculate the cosine similarity of with another term vector - * @param other the other term vector - * @return the cosine similarity. - * if either has very small magnitude, it returns 0 (dotProduct close to 0) - */ - public double getCosineSimilarity(TermVector other) { - if (magnitude < MIN_MAGNITUDE || other.magnitude < MIN_MAGNITUDE) { - return 0; - } - return getDotProduct(other) / (magnitude * other.magnitude); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.docx b/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.docx new file mode 100644 index 000000000..3ba470349 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.java b/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.java deleted file mode 100644 index 22b610e4c..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetEngagementFeatures.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import com.twitter.search.common.encoding.features.EncodedFeatures; - -/** - * Holds engagement features for a particular tweet and encodes them as a single int. - * The features are: retweet count, favorite count, itweet score, reply count. - */ -public class TweetEngagementFeatures extends EncodedFeatures { - private static final int RETWEET_COUNT_BIT_SHIFT = 0; - private static final long RETWEET_COUNT_INVERSE_BIT_MASK = 0xffffff00L; - - private static final int ITWEET_SCORE_BIT_SHIFT = 8; - private static final long ITWEET_SCORE_INVERSE_BIT_MASK = 0xffff00ffL; - - private static final int FAV_COUNT_BIT_SHIFT = 16; - private static final long FAV_COUNT_INVERSE_BIT_MASK = 0xff00ffffL; - - private static final int REPLY_COUNT_BIT_SHIFT = 24; - private static final long REPLY_COUNT_INVERSE_BIT_MASK = 0x00ffffffL; - - public TweetEngagementFeatures setRetweetCount(byte count) { - setByteIfGreater(count, RETWEET_COUNT_BIT_SHIFT, RETWEET_COUNT_INVERSE_BIT_MASK); - return this; - } - - public int getRetweetCount() { - return getByte(RETWEET_COUNT_BIT_SHIFT); - } - - public TweetEngagementFeatures setITweetScore(byte score) { - setByteIfGreater(score, ITWEET_SCORE_BIT_SHIFT, ITWEET_SCORE_INVERSE_BIT_MASK); - return this; - } - - public int getITweetScore() { - return getByte(ITWEET_SCORE_BIT_SHIFT); - } - - public TweetEngagementFeatures setFavCount(byte count) { - setByteIfGreater(count, FAV_COUNT_BIT_SHIFT, FAV_COUNT_INVERSE_BIT_MASK); - return this; - } - - public int getFavCount() { - return getByte(FAV_COUNT_BIT_SHIFT); - } - - public TweetEngagementFeatures setReplyCount(byte count) { - setByteIfGreater(count, REPLY_COUNT_BIT_SHIFT, REPLY_COUNT_INVERSE_BIT_MASK); - return this; - } - - public int getReplyCount() { - return getByte(REPLY_COUNT_BIT_SHIFT); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.docx b/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.docx new file mode 100644 index 000000000..895e9ef9e Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.java b/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.java deleted file mode 100644 index 024a14ea4..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.java +++ /dev/null @@ -1,291 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Map; -import java.util.Set; -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; - -import com.twitter.search.common.encoding.features.IntNormalizer; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; - -import static com.twitter.search.common.relevance.features.IntNormalizers.BOOLEAN_NORMALIZER; -import static com.twitter.search.common.relevance.features.IntNormalizers.LEGACY_NORMALIZER; -import static com.twitter.search.common.relevance.features.IntNormalizers.PARUS_SCORE_NORMALIZER; -import static com.twitter.search.common.relevance.features.IntNormalizers.SMART_INTEGER_NORMALIZER; -import static com.twitter.search.common.relevance.features.IntNormalizers.TIMESTAMP_SEC_TO_HR_NORMALIZER; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; - -/** - * An enum to represent all dynamic/realtime feature types we can update in the Signal Ingester. - * It provides information for their normalization and their corresponding earlybird feature fields - * and provides utils both producer (Signal Ingester) and consumer (Earlybird) side. - * - */ -public enum TweetFeatureType { - RETWEET (true, 0, LEGACY_NORMALIZER, - EarlybirdFieldConstant.RETWEET_COUNT), - REPLY (true, 1, LEGACY_NORMALIZER, - EarlybirdFieldConstant.REPLY_COUNT), - FAVORITE (true, 4, LEGACY_NORMALIZER, - EarlybirdFieldConstant.FAVORITE_COUNT), - PARUS_SCORE (false, 3, PARUS_SCORE_NORMALIZER, - EarlybirdFieldConstant.PARUS_SCORE), - EMBEDS_IMP_COUNT (true, 10, LEGACY_NORMALIZER, - EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT), - EMBEDS_URL_COUNT (true, 11, LEGACY_NORMALIZER, - EarlybirdFieldConstant.EMBEDS_URL_COUNT), - VIDEO_VIEW (false, 12, LEGACY_NORMALIZER, - EarlybirdFieldConstant.VIDEO_VIEW_COUNT), - // v2 engagement counters, they will eventually replace v1 counters above - RETWEET_V2 (true, 13, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.RETWEET_COUNT_V2), - REPLY_V2 (true, 14, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.REPLY_COUNT_V2), - FAVORITE_V2 (true, 15, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.FAVORITE_COUNT_V2), - EMBEDS_IMP_COUNT_V2 (true, 16, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2), - EMBEDS_URL_COUNT_V2 (true, 17, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2), - VIDEO_VIEW_V2 (false, 18, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2), - // other new items - QUOTE (true, 19, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.QUOTE_COUNT), - // weighted engagement counters - WEIGHTED_RETWEET (true, 20, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT), - WEIGHTED_REPLY (true, 21, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT), - WEIGHTED_FAVORITE (true, 22, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT), - WEIGHTED_QUOTE (true, 23, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT), - - // tweet-level safety labels - LABEL_ABUSIVE (false, 24, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG), - LABEL_ABUSIVE_HI_RCL (false, 25, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG), - LABEL_DUP_CONTENT (false, 26, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG), - LABEL_NSFW_HI_PRC (false, 27, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG), - LABEL_NSFW_HI_RCL (false, 28, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG), - LABEL_SPAM (false, 29, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_SPAM_FLAG), - LABEL_SPAM_HI_RCL (false, 30, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG), - - PERISCOPE_EXISTS (false, 32, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.PERISCOPE_EXISTS), - PERISCOPE_HAS_BEEN_FEATURED (false, 33, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED), - PERISCOPE_IS_CURRENTLY_FEATURED (false, 34, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED), - PERISCOPE_IS_FROM_QUALITY_SOURCE(false, 35, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE), - PERISCOPE_IS_LIVE (false, 36, BOOLEAN_NORMALIZER, - EarlybirdFieldConstant.PERISCOPE_IS_LIVE), - - // decayed engagement counters - DECAYED_RETWEET (true, 37, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.DECAYED_RETWEET_COUNT), - DECAYED_REPLY (true, 38, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.DECAYED_REPLY_COUNT), - DECAYED_FAVORITE (true, 39, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT), - DECAYED_QUOTE (true, 40, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.DECAYED_QUOTE_COUNT), - - // timestamp of last engagement types - LAST_RETWEET_SINCE_CREATION_HR (false, 41, TIMESTAMP_SEC_TO_HR_NORMALIZER, - EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS), - LAST_REPLY_SINCE_CREATION_HR (false, 42, TIMESTAMP_SEC_TO_HR_NORMALIZER, - EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS), - LAST_FAVORITE_SINCE_CREATION_HR (false, 43, TIMESTAMP_SEC_TO_HR_NORMALIZER, - EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS), - LAST_QUOTE_SINCE_CREATION_HR (false, 44, TIMESTAMP_SEC_TO_HR_NORMALIZER, - EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS), - - // fake engagement counters - FAKE_RETWEET (true, 45, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.FAKE_RETWEET_COUNT), - FAKE_REPLY (true, 46, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.FAKE_REPLY_COUNT), - FAKE_FAVORITE (true, 47, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.FAKE_FAVORITE_COUNT), - FAKE_QUOTE (true, 48, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.FAKE_QUOTE_COUNT), - - // blink engagement counters - BLINK_RETWEET (true, 49, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.BLINK_RETWEET_COUNT), - BLINK_REPLY (true, 50, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.BLINK_REPLY_COUNT), - BLINK_FAVORITE (true, 51, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.BLINK_FAVORITE_COUNT), - BLINK_QUOTE (true, 52, SMART_INTEGER_NORMALIZER, - EarlybirdFieldConstant.BLINK_QUOTE_COUNT), - - /* semicolon in a single line to avoid polluting git blame */; - - private static final Map V2_COUNTER_MAP = - ImmutableMap.builder() - .put(RETWEET, RETWEET_V2) - .put(REPLY, REPLY_V2) - .put(FAVORITE, FAVORITE_V2) - .put(EMBEDS_IMP_COUNT, EMBEDS_IMP_COUNT_V2) - .put(EMBEDS_URL_COUNT, EMBEDS_URL_COUNT_V2) - .put(VIDEO_VIEW, VIDEO_VIEW_V2) - .build(); - - private static final Map WEIGHTED_COUNTER_MAP = - ImmutableMap.builder() - .put(RETWEET, WEIGHTED_RETWEET) - .put(REPLY, WEIGHTED_REPLY) - .put(FAVORITE, WEIGHTED_FAVORITE) - .put(QUOTE, WEIGHTED_QUOTE) - .build(); - - private static final Map DECAYED_COUNTER_MAP = - ImmutableMap.builder() - .put(RETWEET, DECAYED_RETWEET) - .put(REPLY, DECAYED_REPLY) - .put(FAVORITE, DECAYED_FAVORITE) - .put(QUOTE, DECAYED_QUOTE) - .build(); - - private static final Map DECAYED_COUNTER_TO_ELAPSED_TIME = - ImmutableMap.builder() - .put(DECAYED_RETWEET, LAST_RETWEET_SINCE_CREATION_HR) - .put(DECAYED_REPLY, LAST_REPLY_SINCE_CREATION_HR) - .put(DECAYED_FAVORITE, LAST_FAVORITE_SINCE_CREATION_HR) - .put(DECAYED_QUOTE, LAST_QUOTE_SINCE_CREATION_HR) - .build(); - - private static final Set DECAYED_FEATURES = - ImmutableSet.of(DECAYED_RETWEET, DECAYED_REPLY, DECAYED_FAVORITE, DECAYED_QUOTE); - - private static final Set FAKE_ENGAGEMENT_FEATURES = - ImmutableSet.of(FAKE_RETWEET, FAKE_REPLY, FAKE_FAVORITE, FAKE_QUOTE); - - private static final Set BLINK_ENGAGEMENT_FEATURES = - ImmutableSet.of(BLINK_RETWEET, BLINK_REPLY, BLINK_FAVORITE, BLINK_QUOTE); - - @Nullable - public TweetFeatureType getV2Type() { - return V2_COUNTER_MAP.get(this); - } - - @Nullable - public static TweetFeatureType getWeightedType(TweetFeatureType type) { - return WEIGHTED_COUNTER_MAP.get(type); - } - - @Nullable - public static TweetFeatureType getDecayedType(TweetFeatureType type) { - return DECAYED_COUNTER_MAP.get(type); - } - - // Whether this feature is incremental or direct value. - private final boolean incremental; - - // This normalizer is used to (1) normalize the output value in DLIndexEventOutputBolt, - // (2) check value change. - private final IntNormalizer normalizer; - - // value for composing cache key. It has to be unique and in increasing order. - private final int typeInt; - - private final EarlybirdFieldConstants.EarlybirdFieldConstant earlybirdField; - - private final IncrementChecker incrementChecker; - - /** - * Constructing an enum for a type. The earlybirdField can be null if it's not prepared, they - * can be here as placeholders but they can't be outputted. - * The normalizer is null for the timestamp features that do not require normalization - */ - TweetFeatureType(boolean incremental, - int typeInt, - IntNormalizer normalizer, - @Nullable EarlybirdFieldConstant earlybirdField) { - this.incremental = incremental; - this.typeInt = typeInt; - this.normalizer = normalizer; - this.earlybirdField = earlybirdField; - this.incrementChecker = new IncrementChecker(this); - } - - public boolean isIncremental() { - return incremental; - } - - public IntNormalizer getNormalizer() { - return normalizer; - } - - public int getTypeInt() { - return typeInt; - } - - public int normalize(double value) { - return normalizer.normalize(value); - } - - public IncrementChecker getIncrementChecker() { - return incrementChecker; - } - - public EarlybirdFieldConstant getEarlybirdField() { - return Preconditions.checkNotNull(earlybirdField); - } - - public boolean hasEarlybirdField() { - return earlybirdField != null; - } - - public boolean isDecayed() { - return DECAYED_FEATURES.contains(this); - } - - @Nullable - public TweetFeatureType getElapsedTimeFeatureType() { - return DECAYED_COUNTER_TO_ELAPSED_TIME.get(this); - } - - public boolean isFakeEngagement() { - return FAKE_ENGAGEMENT_FEATURES.contains(this); - } - - public boolean isBlinkEngagement() { - return BLINK_ENGAGEMENT_FEATURES.contains(this); - } - - /** - * Check if an increment is eligible for emitting - */ - public static class IncrementChecker { - private final IntNormalizer normalizer; - - public IncrementChecker(IntNormalizer normalizer) { - this.normalizer = normalizer; - } - - IncrementChecker(TweetFeatureType type) { - this(type.getNormalizer()); - } - - /** - * Check if a value change is eligible for output - */ - public boolean eligibleForEmit(int oldValue, int newValue) { - return normalizer.normalize(oldValue) != normalizer.normalize(newValue); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetFeatures.docx b/src/java/com/twitter/search/common/relevance/features/TweetFeatures.docx new file mode 100644 index 000000000..e9df10388 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetFeatures.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetFeatures.java b/src/java/com/twitter/search/common/relevance/features/TweetFeatures.java deleted file mode 100644 index b3eb4600a..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetFeatures.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.common.relevance.features; - -public class TweetFeatures { - private final TweetTextQuality tweetTextQuality = new TweetTextQuality(); - private final TweetTextFeatures tweetTextFeatures = new TweetTextFeatures(); - private final TweetUserFeatures tweetUserFeatures = new TweetUserFeatures(); - - public TweetTextFeatures getTweetTextFeatures() { - return tweetTextFeatures; - } - - public TweetTextQuality getTweetTextQuality() { - return tweetTextQuality; - } - - public TweetUserFeatures getTweetUserFeatures() { - return tweetUserFeatures; - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.docx b/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.docx new file mode 100644 index 000000000..4e058e5d9 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.java b/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.java deleted file mode 100644 index 9caf94e88..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetIntegerShingleSignature.java +++ /dev/null @@ -1,201 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.nio.ByteBuffer; -import java.util.Arrays; - -import com.google.common.base.Preconditions; - -/** - * A TweetIntegerShingleSignature object consists of 4 bytes, each representing the signature of - * a status text sample. The signature bytes are sorted in ascending order and compacted to an - * integer in big endian for serialization. - * - * Fuzzy matching of two TweetIntegerShingleSignature objects is met when the number of matching - * bytes between the two is equal to or above 3. - */ -public class TweetIntegerShingleSignature { - public static final int NUM_SHINGLES = Integer.SIZE / Byte.SIZE; - public static final int DEFAULT_NO_SIGNATURE = 0; - public static final TweetIntegerShingleSignature NO_SIGNATURE_HANDLE = - deserialize(DEFAULT_NO_SIGNATURE); - public static final int DEFAULT_MIN_SHINGLES_MATCH = 3; - private final int minShinglesMatch; - - private final byte[] shingles; - private final int signature; // redundant information, for easier comparison. - - /** - * Construct from a byte array. - */ - public TweetIntegerShingleSignature(byte[] shingles, int minShinglesMatch) { - Preconditions.checkArgument(shingles.length == NUM_SHINGLES); - this.shingles = shingles; - // sort to byte's natural ascending order - Arrays.sort(this.shingles); - this.minShinglesMatch = minShinglesMatch; - this.signature = serializeInternal(shingles); - } - - /** - * Construct from a byte array. - */ - public TweetIntegerShingleSignature(byte[] shingles) { - this(shingles, DEFAULT_MIN_SHINGLES_MATCH); - } - - /** - * Construct from a serialized integer signature. - */ - public TweetIntegerShingleSignature(int signature, int minShinglesMatch) { - this.shingles = deserializeInternal(signature); - // sort to byte's natural ascending order - Arrays.sort(this.shingles); - this.minShinglesMatch = minShinglesMatch; - // now store the sorted shingles into signature field, may be different from what passed in. - this.signature = serializeInternal(shingles); - } - - /** - * Construct from a serialized integer signature. - */ - public TweetIntegerShingleSignature(int signature) { - this(signature, DEFAULT_MIN_SHINGLES_MATCH); - } - - /** - * Used by ingester to generate signature. - * Raw signatures are in byte arrays per sample, and can be more or less - * than what is asked for. - * - * @param rawSignature - */ - public TweetIntegerShingleSignature(Iterable rawSignature) { - byte[] condensedSignature = new byte[NUM_SHINGLES]; - int i = 0; - for (byte[] signatureItem : rawSignature) { - condensedSignature[i++] = signatureItem[0]; - if (i == NUM_SHINGLES) { - break; - } - } - this.shingles = condensedSignature; - Arrays.sort(this.shingles); - this.minShinglesMatch = DEFAULT_MIN_SHINGLES_MATCH; - this.signature = serializeInternal(shingles); - } - - /** - * When used in a hashtable for dup detection, take the first byte of each signature for fast - * pass for majority case of no fuzzy matching. For top queries, this optimization losses about - * only 4% of all fuzzy matches. - * - * @return most significant byte of this signature as its hashcode. - */ - @Override - public int hashCode() { - return shingles[0] & 0xFF; - } - - /** - * Perform fuzzy matching between two TweetIntegerShingleSignature objects. - * - * @param other TweetIntegerShingleSignature object to perform fuzzy match against - * @return true if at least minMatch number of bytes match - */ - @Override - public boolean equals(Object other) { - if (this == other) { - return true; - } - if (other == null) { - return false; - } - if (getClass() != other.getClass()) { - return false; - } - - final TweetIntegerShingleSignature otherSignatureInteger = (TweetIntegerShingleSignature) other; - - int otherSignature = otherSignatureInteger.serialize(); - if (signature == otherSignature) { - // Both serialized signature is the same - return true; - } else if (signature != DEFAULT_NO_SIGNATURE && otherSignature != DEFAULT_NO_SIGNATURE) { - // Neither is NO_SIGNATURE, need to compare shingles. - byte[] otherShingles = otherSignatureInteger.getShingles(); - int numberMatchesNeeded = minShinglesMatch; - // expect bytes are in ascending sorted order - int i = 0; - int j = 0; - while (((numberMatchesNeeded <= (NUM_SHINGLES - i)) // early termination for i - || (numberMatchesNeeded <= (NUM_SHINGLES - j))) // early termination j - && (i < NUM_SHINGLES) && (j < NUM_SHINGLES)) { - if (shingles[i] == otherShingles[j]) { - if (shingles[i] != 0) { // we only consider two shingles equal if they are non zero - numberMatchesNeeded--; - if (numberMatchesNeeded == 0) { - return true; - } - } - i++; - j++; - } else if (shingles[i] < otherShingles[j]) { - i++; - } else if (shingles[i] > otherShingles[j]) { - j++; - } - } - } - // One is NO_SIGNATURE and one is not. - return false; - } - - /** - * Returns the sorted array of signature bytes. - */ - public byte[] getShingles() { - return shingles; - } - - /** - * Serialize 4 sorted signature bytes into an integer in big endian order. - * - * @return compacted int signature - */ - private static int serializeInternal(byte[] shingles) { - ByteBuffer byteBuffer = ByteBuffer.allocate(NUM_SHINGLES); - byteBuffer.put(shingles, 0, NUM_SHINGLES); - return byteBuffer.getInt(0); - } - - /** - * Deserialize an integer into a 4-byte array. - * @param signature The signature integer. - * @return A byte array with 4 elements. - */ - private static byte[] deserializeInternal(int signature) { - return ByteBuffer.allocate(NUM_SHINGLES).putInt(signature).array(); - } - - public int serialize() { - return signature; - } - - public static boolean isFuzzyMatch(int signature1, int signature2) { - return TweetIntegerShingleSignature.deserialize(signature1).equals( - TweetIntegerShingleSignature.deserialize(signature2)); - } - - public static TweetIntegerShingleSignature deserialize(int signature) { - return new TweetIntegerShingleSignature(signature); - } - - public static TweetIntegerShingleSignature deserialize(int signature, int minMatchSingles) { - return new TweetIntegerShingleSignature(signature, minMatchSingles); - } - - @Override - public String toString() { - return String.format("%d %d %d %d", shingles[0], shingles[1], shingles[2], shingles[3]); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.docx b/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.docx new file mode 100644 index 000000000..761c6a09a Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.java b/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.java deleted file mode 100644 index 76bb215db..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetSignatureUtil.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.search.common.relevance.features; - -public final class TweetSignatureUtil { - private TweetSignatureUtil() { - } - - /** Converts the signature in args[0] to a TweetIntegerShingleSignature. */ - public static void main(String[] args) throws Exception { - if (args.length < 1) { - throw new RuntimeException("Please provide signature value."); - } - int signature = Integer.parseInt(args[0]); - System.out.println(TweetIntegerShingleSignature.deserialize(signature).toString()); - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.docx b/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.docx new file mode 100644 index 000000000..e636d7464 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.java b/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.java deleted file mode 100644 index e545edd3f..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetTextFeatures.java +++ /dev/null @@ -1,225 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Collection; -import java.util.List; -import java.util.Set; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Sets; - -import com.twitter.common.text.token.TokenizedCharSequence; - -public class TweetTextFeatures { - // Basic Features, always extracted. - // normalized, lower cased tweet text, w/o resolved urls - private String normalizedText; - - // tokens from normalizedText, w/o resolved urls, lower cased. - private List tokens; - - // tokens from resolved urls, lower cased. - private List resolvedUrlsTokens; - - // tokens in the form of a TokenizedCharSeq, NOT LOWER CASED - private TokenizedCharSequence tokenSequence; - - // strippedTokens above joined with space - private String normalizedStrippedText; - - // normalized, original case tokens, without @mention, #hashtag or urls. - private List strippedTokens; - - // all hash tags, without "#", lower cased - private Set hashtags = Sets.newHashSet(); - - // all mentions, without "@", lower cased - private Set mentions = Sets.newHashSet(); - - // whether this tweet has a question mark that's not in url. - private boolean hasQuestionMark = false; - - private boolean hasPositiveSmiley = false; - private boolean hasNegativeSmiley = false; - - // normalized, original case smileys - private List smileys; - - // lower cased, normalized stock names, without "$" - private List stocks; - - // Extra features for text quality evaluation only. - private int signature = TweetIntegerShingleSignature.DEFAULT_NO_SIGNATURE; - private Set trendingTerms = Sets.newHashSet(); - private int length; - private int caps; - - public String getNormalizedText() { - return normalizedText; - } - - public void setNormalizedText(String normalizedText) { - this.normalizedText = normalizedText; - } - - public List getTokens() { - return tokens; - } - - public int getTokensSize() { - return tokens == null ? 0 : tokens.size(); - } - - public void setTokens(List tokens) { - this.tokens = tokens; - } - - public List getResolvedUrlTokens() { - return resolvedUrlsTokens; - } - - public int getResolvedUrlTokensSize() { - return resolvedUrlsTokens == null ? 0 : resolvedUrlsTokens.size(); - } - - public void setResolvedUrlTokens(List tokensResolvedUrls) { - this.resolvedUrlsTokens = tokensResolvedUrls; - } - - public TokenizedCharSequence getTokenSequence() { - return tokenSequence; - } - - public void setTokenSequence(TokenizedCharSequence tokenSequence) { - this.tokenSequence = tokenSequence; - } - - public String getNormalizedStrippedText() { - return normalizedStrippedText; - } - - public void setNormalizedStrippedText(String normalizedStrippedText) { - this.normalizedStrippedText = normalizedStrippedText; - } - - public List getStrippedTokens() { - return strippedTokens; - } - - public int getStrippedTokensSize() { - return strippedTokens == null ? 0 : strippedTokens.size(); - } - - public void setStrippedTokens(List strippedTokens) { - this.strippedTokens = strippedTokens; - } - - public Set getHashtags() { - return hashtags; - } - - public int getHashtagsSize() { - return hashtags.size(); - } - - public void setHashtags(Collection hashtags) { - this.hashtags = Sets.newHashSet(hashtags); - } - - public Set getMentions() { - return mentions; - } - - public int getMentionsSize() { - return mentions.size(); - } - - public void setMentions(Collection mentions) { - this.mentions = Sets.newHashSet(mentions); - } - - public boolean hasQuestionMark() { - return hasQuestionMark; - } - - public void setHasQuestionMark(boolean hasQuestionMark) { - this.hasQuestionMark = hasQuestionMark; - } - - public boolean hasPositiveSmiley() { - return hasPositiveSmiley; - } - - public void setHasPositiveSmiley(boolean hasPositiveSmiley) { - this.hasPositiveSmiley = hasPositiveSmiley; - } - - public boolean hasNegativeSmiley() { - return hasNegativeSmiley; - } - - public void setHasNegativeSmiley(boolean hasNegativeSmiley) { - this.hasNegativeSmiley = hasNegativeSmiley; - } - - public List getSmileys() { - return smileys; - } - - public int getSmileysSize() { - return smileys == null ? 0 : smileys.size(); - } - - public void setSmileys(List smileys) { - this.smileys = smileys; - } - - public List getStocks() { - return stocks; - } - - public int getStocksSize() { - return stocks == null ? 0 : stocks.size(); - } - - public void setStocks(List stocks) { - this.stocks = stocks; - } - - public int getSignature() { - return signature; - } - - public void setSignature(int signature) { - this.signature = signature; - } - - /** Returns the trending terms. */ - public Set getTrendingTerms() { - return trendingTerms; - } - - public int getTrendingTermsSize() { - return trendingTerms.size(); - } - - @VisibleForTesting - public void setTrendingTerms(Set trendingTerms) { - this.trendingTerms = trendingTerms; - } - - public int getLength() { - return length; - } - - public void setLength(int length) { - this.length = length; - } - - public int getCaps() { - return caps; - } - - public void setCaps(int caps) { - this.caps = caps; - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.docx b/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.docx new file mode 100644 index 000000000..04a9d6987 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.java b/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.java deleted file mode 100644 index 63aa30eeb..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetTextQuality.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Set; - -import com.google.common.collect.Sets; - -public class TweetTextQuality { - - public static enum BooleanQualityType { - OFFENSIVE, // tweet text is offensive - OFFENSIVE_USER, // user name is offensive - HASHTAG_NAME_MATCH, // hashtag matches username - SENSITIVE, // tweet is marked as sensitive when it comes in - } - - public static final double ENTROPY_NOT_SET = Double.MIN_VALUE; - - public static final byte UNSET_TEXT_SCORE = -128; - - private double readability; - private double shout; - private double entropy = ENTROPY_NOT_SET; - private final Set boolQualities = Sets.newHashSet(); - private byte textScore = UNSET_TEXT_SCORE; - - public double getReadability() { - return readability; - } - - public void setReadability(double readability) { - this.readability = readability; - } - - public double getShout() { - return shout; - } - - public void setShout(double shout) { - this.shout = shout; - } - - public double getEntropy() { - return entropy; - } - - public void setEntropy(double entropy) { - this.entropy = entropy; - } - - public void addBoolQuality(BooleanQualityType type) { - boolQualities.add(type); - } - - public boolean hasBoolQuality(BooleanQualityType type) { - return boolQualities.contains(type); - } - - public Set getBoolQualities() { - return boolQualities; - } - - public byte getTextScore() { - return textScore; - } - - public void setTextScore(byte textScore) { - this.textScore = textScore; - } -} diff --git a/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.docx b/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.docx new file mode 100644 index 000000000..052e24bfc Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.docx differ diff --git a/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.java b/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.java deleted file mode 100644 index 89c9b5196..000000000 --- a/src/java/com/twitter/search/common/relevance/features/TweetUserFeatures.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.common.relevance.features; - -import java.util.Map; - -public class TweetUserFeatures { - private String lang; - private double langConfidence; - private int followers; - private int following; - private int reputation; - private int tweets; - private int retweets; - private int retweeted; - private Map knownForTopics; - private boolean isSpam; - private boolean isNsfw; - private boolean isBot; - - public String getLang() { - return lang; - } - - public void setLang(String lang) { - this.lang = lang; - } - - public double getLangConfidence() { - return langConfidence; - } - - public void setLangConfidence(double langConfidence) { - this.langConfidence = langConfidence; - } - - public int getFollowers() { - return followers; - } - - public void setFollowers(int followers) { - this.followers = followers; - } - - public int getFollowing() { - return following; - } - - public void setFollowing(int following) { - this.following = following; - } - - public int getReputation() { - return reputation; - } - - public void setReputation(int reputation) { - this.reputation = reputation; - } - - public int getTweets() { - return tweets; - } - - public void setTweets(int tweets) { - this.tweets = tweets; - } - - public int getRetweets() { - return retweets; - } - - public void setRetweets(int retweets) { - this.retweets = retweets; - } - - public int getRetweeted() { - return retweeted; - } - - public void setRetweeted(int retweeted) { - this.retweeted = retweeted; - } - - public Map getKnownForTopics() { - return knownForTopics; - } - - public void setKnownForTopics(Map knownForTopics) { - this.knownForTopics = knownForTopics; - } - - public boolean isSpam() { - return isSpam; - } - - public void setSpam(boolean spam) { - isSpam = spam; - } - - public boolean isNsfw() { - return isNsfw; - } - - public void setNsfw(boolean nsfw) { - isNsfw = nsfw; - } - - public boolean isBot() { - return isBot; - } - - public void setBot(boolean bot) { - isBot = bot; - } -} diff --git a/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.docx b/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.docx new file mode 100644 index 000000000..f30e867c4 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.docx differ diff --git a/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.java b/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.java deleted file mode 100644 index bd8f55bad..000000000 --- a/src/java/com/twitter/search/common/relevance/scorers/TweetScorer.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.search.common.relevance.scorers; - -import com.twitter.search.common.relevance.classifiers.TweetClassifier; -import com.twitter.search.common.relevance.entities.TwitterMessage; - -/** - * Interface to compute feature scores for a single @TwitterMessage - * object, or a group of them, after they have been processed by - * feature classifiers. - * - * Intentionally kept Scorers separate from Classifiers, since they - * may be run at different stages and in different batching manners. - * Convenience methods are provided to run classification and scoring - * in one call. - */ -public abstract class TweetScorer { - /** - * Compute and store feature score in TwitterMessage based on its - * TweetFeatures. - * - * @param tweet tweet message to compute and store score to. - */ - public abstract void scoreTweet(final TwitterMessage tweet); - - /** - * Score a group of TwitterMessages based on their corresponding TweetFeatures - * and store feature scores in TwitterMessages. - * - * This default implementation just iterates through the map and scores each - * individual tweet. Batching for better performance, if applicable, can be implemented by - * concrete subclasses. - * - * @param tweets TwitterMessages to score. - */ - public void scoreTweets(Iterable tweets) { - for (TwitterMessage tweet: tweets) { - scoreTweet(tweet); - } - } - - /** - * Convenience method. - * Classify tweet using the specified list of classifiers, then compute score. - * - * @param classifier list of classifiers to use for classification. - * @param tweet tweet to classify and score - */ - public void classifyAndScoreTweet(TweetClassifier classifier, TwitterMessage tweet) { - classifier.classifyTweet(tweet); - scoreTweet(tweet); - } - - /** - * Convenience method. - * Classify tweets using the specified list of classifiers, then compute score. - * - * @param classifier classifier to use for classification. - * @param tweets tweets to classify and score - */ - public void classifyAndScoreTweets(TweetClassifier classifier, Iterable tweets) { - for (TwitterMessage tweet: tweets) { - classifyAndScoreTweet(classifier, tweet); - } - } -} diff --git a/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.docx b/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.docx new file mode 100644 index 000000000..23526c8d5 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.docx differ diff --git a/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.java b/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.java deleted file mode 100644 index e682e5614..000000000 --- a/src/java/com/twitter/search/common/relevance/scorers/TweetTextScorer.java +++ /dev/null @@ -1,242 +0,0 @@ -package com.twitter.search.common.relevance.scorers; - -import java.util.Map; -import java.util.concurrent.ConcurrentMap; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.metrics.RelevanceStats; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.relevance.config.TweetProcessingConfig; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetFeatures; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.relevance.features.TweetTextQuality; - -/** - * Compute a text score for TwitterMessage based on its offensiveness, - * shoutness, length, readability and hashtag properties extracted from - * tweet text. - *

- * Formula: - * text_score = offensive_text_damping * offensive_username_damping * - * Sigma(feature_score_weight * feature_score) - *

- * scored features are: length, readability, shout, entropy, links - */ -public class TweetTextScorer extends TweetScorer { - private static final Logger LOG = LoggerFactory.getLogger(TweetTextScorer.class); - - private static final double DEFAULT_OFFENSIVE_TERM_DAMPING = 0.2d; - private static final double DEFAULT_OFFENSIVE_NAME_DAMPING = 0.2d; - - // Sigma of all weights = 1.0d - private static final double DEFAULT_LENGTH_WEIGHT = 0.5d; - private static final double DEFAULT_READABILITY_WEIGHT = 0.1d; - private static final double DEFAULT_SHOUT_WEIGHT = 0.1d; - private static final double DEFAULT_ENTROPY_WEIGHT = 0.25d; - private static final double DEFAULT_LINK_WEIGHT = 0.05d; - - private static final double DEFAULT_NO_DAMPING = 1.0d; - - // Sigmoid alpha values for normalization - private static final double DEFAULT_READABILITY_ALPHA = 0.05d; - private static final double DEFAULT_ENTROPY_ALPHA = 0.5d; - private static final double DEFAULT_LENGTH_ALPHA = 0.03d; - - private static final ConcurrentMap RATE_COUNTERS = - Maps.newConcurrentMap(); - private static final ConcurrentMap> - SCORE_HISTOGRAMS = Maps.newConcurrentMap(); - - private double offensiveTermDamping = DEFAULT_OFFENSIVE_TERM_DAMPING; - private double offensiveNameDamping = DEFAULT_OFFENSIVE_NAME_DAMPING; - - private double lengthWeight = DEFAULT_LENGTH_WEIGHT; - private double readabilityWeight = DEFAULT_READABILITY_WEIGHT; - private double shoutWeight = DEFAULT_SHOUT_WEIGHT; - private double entropyWeight = DEFAULT_ENTROPY_WEIGHT; - private double linkWeight = DEFAULT_LINK_WEIGHT; - - private double readabilityAlpha = DEFAULT_READABILITY_ALPHA; - private double entropyAlpha = DEFAULT_ENTROPY_ALPHA; - private double lengthAlpha = DEFAULT_LENGTH_ALPHA; - - /** Configure from a config file, validate the configuration. */ - public TweetTextScorer(String configFile) { - TweetProcessingConfig.init(configFile); - - // get dampings - checkWeightRange(offensiveTermDamping = TweetProcessingConfig - .getDouble("offensive_term_damping", DEFAULT_OFFENSIVE_TERM_DAMPING)); - checkWeightRange(offensiveNameDamping = TweetProcessingConfig - .getDouble("offensive_name_damping", DEFAULT_OFFENSIVE_NAME_DAMPING)); - - // get weights - checkWeightRange(lengthWeight = TweetProcessingConfig - .getDouble("length_weight", DEFAULT_LENGTH_WEIGHT)); - checkWeightRange(readabilityWeight = TweetProcessingConfig - .getDouble("readability_weight", DEFAULT_READABILITY_WEIGHT)); - checkWeightRange(shoutWeight = TweetProcessingConfig - .getDouble("shout_weight", DEFAULT_SHOUT_WEIGHT)); - checkWeightRange(entropyWeight = TweetProcessingConfig - .getDouble("entropy_weight", DEFAULT_ENTROPY_WEIGHT)); - checkWeightRange(linkWeight = TweetProcessingConfig - .getDouble("link_weight", DEFAULT_LINK_WEIGHT)); - - // check sigma of weights - Preconditions.checkArgument( - lengthWeight + readabilityWeight + shoutWeight + entropyWeight + linkWeight == 1.0d); - - readabilityAlpha = TweetProcessingConfig - .getDouble("readability_alpha", DEFAULT_READABILITY_ALPHA); - entropyAlpha = TweetProcessingConfig.getDouble("entropy_alpha", DEFAULT_ENTROPY_ALPHA); - lengthAlpha = TweetProcessingConfig.getDouble("length_alpha", DEFAULT_LENGTH_ALPHA); - } - - /** Creates a new TweetTextScorer instance. */ - public TweetTextScorer() { - } - - /** Scores the given tweet. */ - public void scoreTweet(final TwitterMessage tweet) { - Preconditions.checkNotNull(tweet); - - for (PenguinVersion penguinVersion : tweet.getSupportedPenguinVersions()) { - TweetFeatures features = Preconditions.checkNotNull(tweet.getTweetFeatures(penguinVersion)); - TweetTextFeatures textFeatures = Preconditions.checkNotNull(features.getTweetTextFeatures()); - TweetTextQuality textQuality = Preconditions.checkNotNull(features.getTweetTextQuality()); - boolean isOffensiveText = textQuality.hasBoolQuality( - TweetTextQuality.BooleanQualityType.OFFENSIVE); - boolean isOffensiveScreenName = textQuality.hasBoolQuality( - TweetTextQuality.BooleanQualityType.OFFENSIVE_USER); - double shoutScore = DEFAULT_NO_DAMPING - textQuality.getShout(); - double lengthScore = normalize(textFeatures.getLength(), lengthAlpha); - double readabilityScore = normalize(textQuality.getReadability(), readabilityAlpha); - double entropyScore = normalize(textQuality.getEntropy(), entropyAlpha); - - double score = (isOffensiveText ? offensiveTermDamping : DEFAULT_NO_DAMPING) - * (isOffensiveScreenName ? offensiveNameDamping : DEFAULT_NO_DAMPING) - * (lengthWeight * lengthScore - + readabilityWeight * readabilityScore - + shoutWeight * shoutScore - + entropyWeight * entropyScore - + linkWeight * (tweet.getExpandedUrlMapSize() > 0 ? 1 : 0)); - - // scale to [0, 100] byte - textQuality.setTextScore((byte) (score * 100)); - - updateStats( - isOffensiveText, - isOffensiveScreenName, - textFeatures, - score, - getRateCounterStat("num_offensive_text_", penguinVersion), - getRateCounterStat("num_offensive_user_", penguinVersion), - getRateCounterStat("num_no_trends_", penguinVersion), - getRateCounterStat("num_has_trends_", penguinVersion), - getRateCounterStat("num_too_many_trends_", penguinVersion), - getRateCounterStat("num_scored_tweets_", penguinVersion), - getScoreHistogram(penguinVersion)); - - if (LOG.isDebugEnabled()) { - LOG.debug(String.format( - "Tweet length [%.2f] weighted length [%.2f], readability [%.2f] " - + "weighted readability [%.2f], shout [%.2f] weighted shout [%.2f], " - + "entropy [%.2f], weighted entropy [%.2f], " - + "score [%.2f], text [%s], penguin version [%s]", - lengthScore, - lengthWeight * lengthScore, - readabilityScore, - readabilityWeight * readabilityScore, - shoutScore, - shoutWeight * shoutScore, - entropyScore, - entropyWeight * entropyScore, - score, - tweet.getText(), - penguinVersion)); - } - } - } - - private void updateStats(boolean isOffensiveText, - boolean isOffensiveScreenName, - TweetTextFeatures textFeatures, - double score, - SearchRateCounter offensiveTextCounter, - SearchRateCounter offensiveUserNameCounter, - SearchRateCounter noTrendsCounter, - SearchRateCounter hasTrendsCounter, - SearchRateCounter tooManyTrendsHashtagsCounter, - SearchRateCounter scoredTweets, - Map scoreHistogram) { - // set stats - if (isOffensiveText) { - offensiveTextCounter.increment(); - } - if (isOffensiveScreenName) { - offensiveUserNameCounter.increment(); - } - if (textFeatures.getTrendingTermsSize() == 0) { - noTrendsCounter.increment(); - } else { - hasTrendsCounter.increment(); - } - if (TwitterMessage.hasMultipleHashtagsOrTrends(textFeatures)) { - tooManyTrendsHashtagsCounter.increment(); - } - scoredTweets.increment(); - - int bucket = (int) Math.floor(score * 10) * 10; - scoreHistogram.get(bucket).increment(); - } - - // normalize the passed in value to smoothed [0, 1.0d] range - private static double normalize(double value, double alpha) { - return 2 * (1.0d / (1.0d + Math.exp(-(alpha * value))) - 0.5); - } - - // Make sure weight values are within the range of [0.0, 1.0] - private void checkWeightRange(double value) { - Preconditions.checkArgument(value >= 0.0d && value <= 1.0d); - } - - private Map getScoreHistogram(PenguinVersion penguinVersion) { - Map scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion); - if (scoreHistogram == null) { - scoreHistogram = Maps.newHashMap(); - String statsName = "num_text_score_%d_%s"; - - for (int i = 0; i <= 100; i += 10) { - scoreHistogram.put(i, RelevanceStats.exportRate( - String.format(statsName, i, penguinVersion.name().toLowerCase()))); - } - - scoreHistogram = SCORE_HISTOGRAMS.putIfAbsent(penguinVersion, scoreHistogram); - if (scoreHistogram == null) { - scoreHistogram = SCORE_HISTOGRAMS.get(penguinVersion); - } - } - - return scoreHistogram; - } - - private SearchRateCounter getRateCounterStat(String statPrefix, PenguinVersion penguinVersion) { - String statName = statPrefix + penguinVersion.name().toLowerCase(); - SearchRateCounter rateCounter = RATE_COUNTERS.get(statName); - if (rateCounter == null) { - // Only one RateCounter instance is created for each stat name. So we don't need to worry - // that another thread might've created this instance in the meantime: we can just create/get - // it, and store it in the map. - rateCounter = RelevanceStats.exportRate(statName); - RATE_COUNTERS.put(statName, rateCounter); - } - return rateCounter; - } -} diff --git a/src/java/com/twitter/search/common/relevance/text/LocationUtils.docx b/src/java/com/twitter/search/common/relevance/text/LocationUtils.docx new file mode 100644 index 000000000..bd0d432a1 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/text/LocationUtils.docx differ diff --git a/src/java/com/twitter/search/common/relevance/text/LocationUtils.java b/src/java/com/twitter/search/common/relevance/text/LocationUtils.java deleted file mode 100644 index 5fb43543e..000000000 --- a/src/java/com/twitter/search/common/relevance/text/LocationUtils.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.common.relevance.text; - -import java.util.regex.Matcher; - -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.util.text.regex.Regex; - -public final class LocationUtils { - private LocationUtils() { - } - - /** - * Extract lat/lon information from a twitter message. - * @param message The twitter message. - * @return A two-element double array for the lat/lon information. - */ - public static double[] extractLatLon(TwitterMessage message) { - // first look in text for L:, then fall back to profile - Matcher loc = Regex.LAT_LON_LOC_PATTERN.matcher(message.getText()); - if (loc.find() || message.getOrigLocation() != null - && (loc = Regex.LAT_LON_PATTERN.matcher(message.getOrigLocation())).find()) { - final double lat = Double.parseDouble(loc.group(2)); - final double lon = Double.parseDouble(loc.group(3)); - - if (Math.abs(lat) > 90.0) { - throw new NumberFormatException("Latitude cannot exceed +-90 degrees: " + lat); - } - if (Math.abs(lon) > 180.0) { - throw new NumberFormatException("Longitude cannot exceed +-180 degrees: " + lon); - } - - // Reject these common "bogus" regions. - if ((lat == 0 && lon == 0) || lat == -1 || lon == -1) { - return null; - } - - return new double[]{lat, lon}; - } - return null; - } -} diff --git a/src/java/com/twitter/search/common/relevance/text/TweetParser.docx b/src/java/com/twitter/search/common/relevance/text/TweetParser.docx new file mode 100644 index 000000000..cc841aa76 Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/text/TweetParser.docx differ diff --git a/src/java/com/twitter/search/common/relevance/text/TweetParser.java b/src/java/com/twitter/search/common/relevance/text/TweetParser.java deleted file mode 100644 index df518ba5f..000000000 --- a/src/java/com/twitter/search/common/relevance/text/TweetParser.java +++ /dev/null @@ -1,190 +0,0 @@ -package com.twitter.search.common.relevance.text; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import java.util.Locale; -import java.util.Set; - -import com.google.common.base.Joiner; -import com.google.common.collect.Sets; - -import com.twitter.common.text.util.CharSequenceUtils; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.relevance.entities.TwitterMessage; -import com.twitter.search.common.relevance.features.TweetTextFeatures; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.common.util.text.Smileys; -import com.twitter.search.common.util.text.TokenizerHelper; -import com.twitter.search.common.util.text.TokenizerResult; - -/** - * A parser to extract very basic information from a tweet. - */ -public class TweetParser { - private static final boolean DO_NOT_REMOVE_WWW = false; - - /** Parses the given TwitterMessage. */ - public void parseTweet(TwitterMessage message) { - parseTweet(message, false, true); - } - - /** Parses the given TwitterMessage. */ - public void parseTweet(TwitterMessage message, - boolean useEntitiesFromTweetText, - boolean parseUrls) { - for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) { - parseTweet(message, useEntitiesFromTweetText, parseUrls, penguinVersion); - } - } - - /** Parses the given TwitterMessage. */ - public void parseTweet(TwitterMessage message, - boolean useEntitiesFromTweetText, - boolean parseUrls, - PenguinVersion penguinVersion) { - TweetTextFeatures textFeatures = message.getTweetTextFeatures(penguinVersion); - String rawText = message.getText(); - Locale locale = message.getLocale(); - - // don't lower case first. - String normalizedText = NormalizerHelper.normalizeKeepCase(rawText, locale, penguinVersion); - String lowercasedNormalizedText = - CharSequenceUtils.toLowerCase(normalizedText, locale).toString(); - - textFeatures.setNormalizedText(lowercasedNormalizedText); - - TokenizerResult result = TokenizerHelper.tokenizeTweet(normalizedText, locale, penguinVersion); - List tokens = new ArrayList<>(result.tokens); - textFeatures.setTokens(tokens); - textFeatures.setTokenSequence(result.tokenSequence); - - if (parseUrls) { - parseUrls(message, textFeatures); - } - - textFeatures.setStrippedTokens(result.strippedDownTokens); - textFeatures.setNormalizedStrippedText(Joiner.on(" ").skipNulls() - .join(result.strippedDownTokens)); - - // Sanity checks, make sure there is no null token list. - if (textFeatures.getTokens() == null) { - textFeatures.setTokens(Collections.emptyList()); - } - if (textFeatures.getResolvedUrlTokens() == null) { - textFeatures.setResolvedUrlTokens(Collections.emptyList()); - } - if (textFeatures.getStrippedTokens() == null) { - textFeatures.setStrippedTokens(Collections.emptyList()); - } - - setHashtagsAndMentions(message, textFeatures, penguinVersion); - textFeatures.setStocks(sanitizeTokenizerResults(result.stocks, '$')); - textFeatures.setHasQuestionMark(findQuestionMark(textFeatures)); - - // Set smiley polarities. - textFeatures.setSmileys(result.smileys); - for (String smiley : textFeatures.getSmileys()) { - if (Smileys.isValidSmiley(smiley)) { - boolean polarity = Smileys.getPolarity(smiley); - if (polarity) { - textFeatures.setHasPositiveSmiley(true); - } else { - textFeatures.setHasNegativeSmiley(true); - } - } - } - message.setTokenizedCharSequence(penguinVersion, result.rawSequence); - - if (useEntitiesFromTweetText) { - takeEntities(message, textFeatures, result, penguinVersion); - } - } - - /** Parse the URLs in the given TwitterMessage. */ - public void parseUrls(TwitterMessage message) { - for (PenguinVersion penguinVersion : message.getSupportedPenguinVersions()) { - parseUrls(message, message.getTweetTextFeatures(penguinVersion)); - } - } - - /** Parse the URLs in the given TwitterMessage. */ - public void parseUrls(TwitterMessage message, TweetTextFeatures textFeatures) { - if (message.getExpandedUrlMap() != null) { - Set urlsToTokenize = Sets.newLinkedHashSet(); - for (ThriftExpandedUrl url : message.getExpandedUrlMap().values()) { - if (url.isSetExpandedUrl()) { - urlsToTokenize.add(url.getExpandedUrl()); - } - if (url.isSetCanonicalLastHopUrl()) { - urlsToTokenize.add(url.getCanonicalLastHopUrl()); - } - } - TokenizerResult resolvedUrlResult = - TokenizerHelper.tokenizeUrls(urlsToTokenize, message.getLocale(), DO_NOT_REMOVE_WWW); - List urlTokens = new ArrayList<>(resolvedUrlResult.tokens); - textFeatures.setResolvedUrlTokens(urlTokens); - } - } - - private void takeEntities(TwitterMessage message, - TweetTextFeatures textFeatures, - TokenizerResult result, - PenguinVersion penguinVersion) { - if (message.getHashtags().isEmpty()) { - // add hashtags to TwitterMessage if it doens't already have them, from - // JSON entities, this happens when we do offline indexing - for (String hashtag : sanitizeTokenizerResults(result.hashtags, '#')) { - message.addHashtag(hashtag); - } - } - - if (message.getMentions().isEmpty()) { - // add mentions to TwitterMessage if it doens't already have them, from - // JSON entities, this happens when we do offline indexing - for (String mention : sanitizeTokenizerResults(result.mentions, '@')) { - message.addMention(mention); - } - } - - setHashtagsAndMentions(message, textFeatures, penguinVersion); - } - - private void setHashtagsAndMentions(TwitterMessage message, - TweetTextFeatures textFeatures, - PenguinVersion penguinVersion) { - textFeatures.setHashtags(message.getNormalizedHashtags(penguinVersion)); - textFeatures.setMentions(message.getLowercasedMentions()); - } - - // The strings in the mentions, hashtags and stocks lists in TokenizerResult should already have - // the leading characters ('@', '#' and '$') stripped. So in most cases, this sanitization is not - // needed. However, sometimes Penguin tokenizes hashtags, cashtags and mentions incorrectly - // (for example, when using the Korean tokenizer for tokens like ~@mention or ?#hashtag -- see - // SEARCHQUAL-11924 for more details). So we're doing this extra sanitization here to try to work - // around these tokenization issues. - private List sanitizeTokenizerResults(List tokens, char tokenSymbol) { - List sanitizedTokens = new ArrayList(); - for (String token : tokens) { - int indexOfTokenSymbol = token.indexOf(tokenSymbol); - if (indexOfTokenSymbol < 0) { - sanitizedTokens.add(token); - } else { - String sanitizedToken = token.substring(indexOfTokenSymbol + 1); - if (!sanitizedToken.isEmpty()) { - sanitizedTokens.add(sanitizedToken); - } - } - } - return sanitizedTokens; - } - - /** Determines if the normalized text of the given features contain a question mark. */ - public static boolean findQuestionMark(TweetTextFeatures textFeatures) { - // t.co links don't contain ?'s, so it's not necessary to subtract ?'s occurring in Urls - // the tweet text always contains t.co, even if the display url is different - // all links on twitter are now wrapped into t.co - return textFeatures.getNormalizedText().contains("?"); - } -} diff --git a/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.docx b/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.docx new file mode 100644 index 000000000..de950769c Binary files /dev/null and b/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.docx differ diff --git a/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.java b/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.java deleted file mode 100644 index d7017448f..000000000 --- a/src/java/com/twitter/search/common/relevance/text/VisibleTokenRatioNormalizer.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.common.relevance.text; - -public class VisibleTokenRatioNormalizer { - - private static final int NORMALIZE_TO_BITS = 4; - private final int normalizeToSize; - - /** - * constructor - */ - public VisibleTokenRatioNormalizer(int normalizeToBits) { - int size = 2 << (normalizeToBits - 1); - // Let's say normalizeSize is set to 16.... - // If you multiply 1.0 * 16, it is 16 - // If you multiply 0.0 * 16, it is 0 - // That would be occupying 17 ints, not 16, so we subtract 1 here... - this.normalizeToSize = size - 1; - } - - /** - * method - */ - public int normalize(double percent) { - if (percent > 1 || percent < 0) { - throw new IllegalArgumentException("percent should be less than 1 and greater than 0"); - } - int bucket = (int) (percent * normalizeToSize); - return normalizeToSize - bucket; - } - - public double denormalize(int reverseBucket) { - int bucket = normalizeToSize - reverseBucket; - return bucket / (double) normalizeToSize; - } - - public static VisibleTokenRatioNormalizer createInstance() { - return new VisibleTokenRatioNormalizer(NORMALIZE_TO_BITS); - } -} diff --git a/src/java/com/twitter/search/common/schema/AnalyzerFactory.docx b/src/java/com/twitter/search/common/schema/AnalyzerFactory.docx new file mode 100644 index 000000000..c255f819a Binary files /dev/null and b/src/java/com/twitter/search/common/schema/AnalyzerFactory.docx differ diff --git a/src/java/com/twitter/search/common/schema/AnalyzerFactory.java b/src/java/com/twitter/search/common/schema/AnalyzerFactory.java deleted file mode 100644 index 36da161f4..000000000 --- a/src/java/com/twitter/search/common/schema/AnalyzerFactory.java +++ /dev/null @@ -1,142 +0,0 @@ -package com.twitter.search.common.schema; - -import java.io.Reader; -import java.text.ParseException; -import java.util.Map; - -import com.google.common.base.Splitter; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.CharArraySet; -import org.apache.lucene.analysis.CharFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.Tokenizer; -import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter; -import org.apache.lucene.analysis.core.WhitespaceAnalyzer; -import org.apache.lucene.analysis.fa.PersianCharFilter; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.util.Version; - -import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer; -import com.twitter.search.common.schema.thriftjava.ThriftClassInstantiater; -import com.twitter.search.common.schema.thriftjava.ThriftCustomAnalyzer; - -public class AnalyzerFactory { - private static final Logger LOG = LoggerFactory.getLogger(AnalyzerFactory.class); - - private static final String MATCH_VERSION_ARG_NAME = "matchVersion"; - private static final String STANDARD_ANALYZER = "StandardAnalyzer"; - private static final String WHITESPACE_ANALYZER = "WhitespaceAnalyzer"; - private static final String SEARCH_WHITESPACE_ANALYZER = "SearchWhitespaceAnalyzer"; - private static final String HTML_STRIP_CHAR_FILTER = "HTMLStripCharFilter"; - private static final String PERSIAN_CHAR_FILTER = "PersianCharFilter"; - - /** - * Return a Lucene Analyzer based on the given ThriftAnalyzer. - */ - public Analyzer getAnalyzer(ThriftAnalyzer analyzer) { - if (analyzer.isSetAnalyzer()) { - return resolveAnalyzerClass(analyzer.getAnalyzer()); - } else if (analyzer.isSetCustomAnalyzer()) { - return buildCustomAnalyzer(analyzer.getCustomAnalyzer()); - } - return new SearchWhitespaceAnalyzer(); - } - - private Analyzer resolveAnalyzerClass(ThriftClassInstantiater classDef) { - Map params = classDef.getParams(); - Version matchVersion = Version.LUCENE_8_5_2; - - String matchVersionName = getArg(params, MATCH_VERSION_ARG_NAME); - if (matchVersionName != null) { - try { - matchVersion = Version.parse(matchVersionName); - } catch (ParseException e) { - // ignore and use default version - LOG.warn("Unable to parse match version: " + matchVersionName - + ". Will use default version of 8.5.2."); - } - } - - if (classDef.getClassName().equals(STANDARD_ANALYZER)) { - String stopwords = getArg(params, "stopwords"); - if (stopwords != null) { - - CharArraySet stopwordSet = new CharArraySet( - Lists.newLinkedList(Splitter.on(",").split(stopwords)), - false); - return new StandardAnalyzer(stopwordSet); - } else { - return new StandardAnalyzer(); - } - } else if (classDef.getClassName().equals(WHITESPACE_ANALYZER)) { - return new WhitespaceAnalyzer(); - } else if (classDef.getClassName().equals(SEARCH_WHITESPACE_ANALYZER)) { - return new SearchWhitespaceAnalyzer(); - } - - return null; - } - - private Analyzer buildCustomAnalyzer(final ThriftCustomAnalyzer customAnalyzer) { - return new Analyzer() { - @Override - protected TokenStreamComponents createComponents(String fieldName) { - final Tokenizer tokenizer = resolveTokenizerClass(customAnalyzer.getTokenizer()); - - TokenStream filter = tokenizer; - - if (customAnalyzer.isSetFilters()) { - for (ThriftClassInstantiater filterClass : customAnalyzer.getFilters()) { - filter = resolveTokenFilterClass(filterClass, filter); - } - } - - return new TokenStreamComponents(tokenizer, filter); - } - }; - } - - private Tokenizer resolveTokenizerClass(ThriftClassInstantiater classDef) { - return null; - } - - private TokenStream resolveTokenFilterClass(ThriftClassInstantiater classDef, TokenStream input) { - return null; - } - - private CharFilter resolveCharFilterClass(ThriftClassInstantiater classDef, Reader input) { - if (classDef.getClassName().equals(HTML_STRIP_CHAR_FILTER)) { - String escapedTags = getArg(classDef.getParams(), "excapedTags"); - if (escapedTags != null) { - return new HTMLStripCharFilter(input, Sets.newHashSet(Splitter.on(",").split(escapedTags))); - } else { - return new HTMLStripCharFilter(input); - } - } else if (classDef.getClassName().equals(PERSIAN_CHAR_FILTER)) { - return new PersianCharFilter(input); - } - - - throw new ClassNotSupportedException("CharFilter", classDef); - } - - private String getArg(Map args, String arg) { - if (args == null) { - return null; - } - - return args.get(arg); - } - - public final class ClassNotSupportedException extends RuntimeException { - private ClassNotSupportedException(String type, ThriftClassInstantiater classDef) { - super(type + " class with name " + classDef.getClassName() + " currently not supported."); - } - } -} diff --git a/src/java/com/twitter/search/common/schema/BUILD b/src/java/com/twitter/search/common/schema/BUILD deleted file mode 100644 index 1eaa7b968..000000000 --- a/src/java/com/twitter/search/common/schema/BUILD +++ /dev/null @@ -1,34 +0,0 @@ -# Library for schema builder and related analysis utilities. -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:token-util", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/io", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/thrift:thrift-utils", - "src/thrift/com/twitter/search/common:features-java", - "src/thrift/com/twitter/search/common:schema-java", - ], -) diff --git a/src/java/com/twitter/search/common/schema/BUILD.docx b/src/java/com/twitter/search/common/schema/BUILD.docx new file mode 100644 index 000000000..537786d00 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/schema/DynamicSchema.docx b/src/java/com/twitter/search/common/schema/DynamicSchema.docx new file mode 100644 index 000000000..a8e46fc8f Binary files /dev/null and b/src/java/com/twitter/search/common/schema/DynamicSchema.docx differ diff --git a/src/java/com/twitter/search/common/schema/DynamicSchema.java b/src/java/com/twitter/search/common/schema/DynamicSchema.java deleted file mode 100644 index ee1063728..000000000 --- a/src/java/com/twitter/search/common/schema/DynamicSchema.java +++ /dev/null @@ -1,214 +0,0 @@ -package com.twitter.search.common.schema; - -import java.util.Collection; -import java.util.Map; -import java.util.concurrent.atomic.AtomicReference; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; -import com.google.common.collect.ImmutableCollection; -import com.google.common.collect.ImmutableMap; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.FieldInfos; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration; - -/** - * A schema implementation that allow minor version increments at run time. - */ -public class DynamicSchema implements Schema { - private static final Logger LOG = LoggerFactory.getLogger(DynamicSchema.class); - - private final AtomicReference schema; - - public DynamicSchema(ImmutableSchema schema) { - this.schema = new AtomicReference<>(schema); - } - - public ImmutableSchemaInterface getSchemaSnapshot() { - return schema.get(); - } - - /** - * Update the schema reference inside this DynamicSchema. - */ - public synchronized void updateSchema(ImmutableSchema newSchema) throws SchemaUpdateException { - ImmutableSchema oldSchema = schema.get(); - if (newSchema.getMajorVersionNumber() != oldSchema.getMajorVersionNumber()) { - throw new SchemaUpdateException("Dynamic major version update is not supported."); - } else { - if (newSchema.getMinorVersionNumber() <= oldSchema.getMinorVersionNumber()) { - throw new SchemaUpdateException("Dynamic backward minor version update is not supported."); - } else { - LOG.info("DynamicSchema accepted update. Old version is {}.{}; new version is {}.{}", - oldSchema.getMajorVersionNumber(), - oldSchema.getMinorVersionNumber(), - newSchema.getMajorVersionNumber(), - newSchema.getMinorVersionNumber()); - schema.set(newSchema); - } - } - } - - public static class SchemaUpdateException extends Exception { - public SchemaUpdateException(String message) { - super(message); - } - } - - // The below are all methods in the Schema interface delegated to the underlying ImmutableSchema. - // The below is generated by IntelliJ, and reviewers can stop reviewing this file here. - // If you are adding logic into this class, please do so above this line. - @Override - public FieldInfos getLuceneFieldInfos( - Predicate acceptedFields) { - return schema.get().getLuceneFieldInfos(acceptedFields); - } - - @Override - public FacetsConfig getFacetsConfig() { - return schema.get().getFacetsConfig(); - } - - @Override - public Analyzer getDefaultAnalyzer( - ThriftAnalyzer override) { - return schema.get().getDefaultAnalyzer(override); - } - - @Override - public ImmutableCollection getFieldInfos() { - return schema.get().getFieldInfos(); - } - - @Override - public boolean hasField(int fieldConfigId) { - return schema.get().hasField(fieldConfigId); - } - - @Override - public boolean hasField(String fieldName) { - return schema.get().hasField(fieldName); - } - - @Override - @Nullable - public FieldInfo getFieldInfo(int fieldConfigId) { - return schema.get().getFieldInfo(fieldConfigId); - } - - @Override - @Nullable - public FieldInfo getFieldInfo(String fieldName) { - return schema.get().getFieldInfo(fieldName); - } - - @Override - public String getFieldName(int fieldConfigId) { - return schema.get().getFieldName(fieldConfigId); - } - - @Override - public FieldInfo getFieldInfo(int fieldConfigId, - ThriftFieldConfiguration override) { - return schema.get().getFieldInfo(fieldConfigId, override); - } - - @Override - public int getNumFacetFields() { - return schema.get().getNumFacetFields(); - } - - @Override - public FieldInfo getFacetFieldByFacetName( - String facetName) { - return schema.get().getFacetFieldByFacetName(facetName); - } - - @Override - public FieldInfo getFacetFieldByFieldName( - String fieldName) { - return schema.get().getFacetFieldByFieldName(fieldName); - } - - @Override - public Collection getFacetFields() { - return schema.get().getFacetFields(); - } - - @Override - public Collection getCsfFacetFields() { - return schema.get().getCsfFacetFields(); - } - - @Override - public String getVersionDescription() { - return schema.get().getVersionDescription(); - } - - @Override - public int getMajorVersionNumber() { - return schema.get().getMajorVersionNumber(); - } - - @Override - public int getMinorVersionNumber() { - return schema.get().getMinorVersionNumber(); - } - - @Override - public boolean isVersionOfficial() { - return schema.get().isVersionOfficial(); - } - - @Override - public Map getFieldWeightMap() { - return schema.get().getFieldWeightMap(); - } - - @Override - public FeatureConfiguration getFeatureConfigurationByName( - String featureName) { - return schema.get().getFeatureConfigurationByName(featureName); - } - - @Override - public FeatureConfiguration getFeatureConfigurationById(int featureFieldId) { - return Preconditions.checkNotNull(schema.get().getFeatureConfigurationById(featureFieldId)); - } - - @Override - @Nullable - public ThriftCSFType getCSFFieldType( - String fieldName) { - return schema.get().getCSFFieldType(fieldName); - } - - @Override - public ThriftSearchFeatureSchema getSearchFeatureSchema() { - return schema.get().getSearchFeatureSchema(); - } - - @Override - public ImmutableMap getFeatureIdToFeatureConfig() { - return schema.get().getFeatureIdToFeatureConfig(); - } - - @Override - public ImmutableMap getFeatureNameToFeatureConfig() { - return schema.get().getFeatureNameToFeatureConfig(); - } -} diff --git a/src/java/com/twitter/search/common/schema/ImmutableSchema.docx b/src/java/com/twitter/search/common/schema/ImmutableSchema.docx new file mode 100644 index 000000000..b224f7f9d Binary files /dev/null and b/src/java/com/twitter/search/common/schema/ImmutableSchema.docx differ diff --git a/src/java/com/twitter/search/common/schema/ImmutableSchema.java b/src/java/com/twitter/search/common/schema/ImmutableSchema.java deleted file mode 100644 index 6285812f0..000000000 --- a/src/java/com/twitter/search/common/schema/ImmutableSchema.java +++ /dev/null @@ -1,904 +0,0 @@ -package com.twitter.search.common.schema; - -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.util.Collection; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.atomic.AtomicLong; -import javax.annotation.Nullable; -import javax.annotation.concurrent.Immutable; -import javax.annotation.concurrent.ThreadSafe; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; -import com.google.common.collect.ImmutableCollection; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.ImmutableSortedMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.IndexOptions; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.features.ExternalTweetFeature; -import com.twitter.search.common.features.SearchResultFeature; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaEntry; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaSpecifier; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureType; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.IndexedNumericFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer; -import com.twitter.search.common.schema.thriftjava.ThriftCSFFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFacetFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration; -import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftIndexedFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftSchema; -import com.twitter.search.common.schema.thriftjava.ThriftSearchFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer; - -/** - * A schema instance that does not change at run time. - */ -@Immutable @ThreadSafe -public class ImmutableSchema implements ImmutableSchemaInterface { - private static final Logger LOG = LoggerFactory.getLogger(ImmutableSchema.class); - private static final ImmutableSet CAN_FACET_ON_CSF_TYPES = - ImmutableSet.builder() - .add(ThriftCSFType.BYTE) - .add(ThriftCSFType.INT) - .add(ThriftCSFType.LONG) - .build(); - - private static final SearchCounter FEATURES_EXISTED_IN_OLD_SCHEMA = - SearchCounter.export("features_existed_in_old_schema"); - - // Currently our index uses 4 bits to store the facet field id. - public static final int MAX_FACET_FIELD_ID = 15; - - public static final String HF_TERM_PAIRS_FIELD = "hf_term_pairs"; - public static final String HF_PHRASE_PAIRS_FIELD = "hf_phrase_pairs"; - - private final ImmutableMap fieldSettingsMapById; - private final ImmutableMap fieldSettingsMapByName; - private final ImmutableMap featureConfigMapByName; - private final ImmutableMap featureConfigMapById; - - @Nullable - private final ThriftAnalyzer defaultAnalyzer; - private final AnalyzerFactory analyzerFactory; - - private final ImmutableMap fieldWeightMap; - private final Map facetNameToFieldMap = Maps.newHashMap(); - private final int numFacetFields; - private final ImmutableSet csfFacetFields; - - // This is the search result feature schema - it has the definition for all the column stride - // view fields. - private final ThriftSearchFeatureSchema searchFeatureSchema; - - private final int majorVersionNumber; - private final int minorVersionNumber; - private final String versionDesc; - private final boolean isVersionOfficial; - - /** - * Construct a Schema instance with the given ThriftSchema and AnalyzerFactory. - */ - public ImmutableSchema(ThriftSchema thriftSchema, - AnalyzerFactory analyzerFactory, - String featureSchemaVersionPrefix) throws SchemaValidationException { - Pair versionPair = parseVersionString(thriftSchema.getVersion()); - this.majorVersionNumber = thriftSchema.getMajorVersionNumber(); - this.minorVersionNumber = thriftSchema.getMinorVersionNumber(); - this.versionDesc = versionPair.getSecond(); - this.isVersionOfficial = thriftSchema.isVersionIsOfficial(); - - this.analyzerFactory = analyzerFactory; - - Map tmpMap = Maps.newLinkedHashMap(); - Set tmpSet = Sets.newHashSet(); - - if (thriftSchema.isSetDefaultAnalyzer()) { - this.defaultAnalyzer = thriftSchema.getDefaultAnalyzer().deepCopy(); - } else { - this.defaultAnalyzer = null; - } - - Map configs = thriftSchema.getFieldConfigs(); - - // Collect all the CSF Views, so that we can later verify that they are appropriately - // configured once we've processed all the other field settings. - Map csfViewFields = Maps.newHashMap(); - boolean requiresHfPairFields = false; - boolean hasHfTermPairField = false; - boolean hasHfPhrasePairField = false; - int numFacets = 0; - for (Map.Entry entry : configs.entrySet()) { - int fieldId = entry.getKey(); - - if (tmpMap.containsKey(fieldId)) { - throw new SchemaValidationException("Duplicate field id " + fieldId); - } - - ThriftFieldConfiguration config = entry.getValue(); - FieldInfo fieldInfo = parseThriftFieldSettings(fieldId, config, csfViewFields); - validate(fieldInfo); - if (fieldInfo.getFieldType().isFacetField()) { - if (numFacets > MAX_FACET_FIELD_ID) { - throw new SchemaValidationException( - "Maximum supported facet field ID is: " + MAX_FACET_FIELD_ID); - } - numFacets++; - facetNameToFieldMap.put(fieldInfo.getFieldType().getFacetName(), fieldInfo); - - if (fieldInfo.getFieldType().isUseCSFForFacetCounting()) { - tmpSet.add(fieldInfo); - } - } - - tmpMap.put(fieldId, fieldInfo); - - if (fieldInfo.getFieldType().isIndexHFTermPairs()) { - requiresHfPairFields = true; - } - if (fieldInfo.getName().equals(HF_TERM_PAIRS_FIELD)) { - hasHfTermPairField = true; - } - if (fieldInfo.getName().equals(HF_PHRASE_PAIRS_FIELD)) { - hasHfPhrasePairField = true; - } - } - - this.numFacetFields = numFacets; - this.csfFacetFields = ImmutableSet.copyOf(tmpSet); - - // If any field requires high frequency term/phrase pair fields, make sure they exist - if (requiresHfPairFields) { - if (!hasHfTermPairField || !hasHfPhrasePairField) { - throw new SchemaValidationException( - "High frequency term/phrase pair fields do not exist in the schema."); - } - } - - this.fieldSettingsMapById = ImmutableMap.copyOf(tmpMap); - - Pair, ImmutableMap> - featureConfigMapPair = buildFeatureMaps(csfViewFields); - this.featureConfigMapByName = featureConfigMapPair.getFirst(); - this.featureConfigMapById = featureConfigMapPair.getSecond(); - - for (ThriftFieldConfiguration csfViewField : csfViewFields.values()) { - SchemaBuilder.verifyCSFViewSettings(configs, csfViewField); - } - - ImmutableMap.Builder builder = ImmutableMap.builder(); - - for (FieldInfo info : fieldSettingsMapById.values()) { - info.getFieldType().freeze(); - builder.put(info.getName(), info); - } - this.fieldSettingsMapByName = builder.build(); - - ImmutableMap.Builder fieldWeightMapBuilder = ImmutableMap.builder(); - - for (FieldInfo fi : getFieldInfos()) { - // CSF fields are not searchable. All other fields are. - if (fi.getFieldType().isIndexedField()) { - fieldWeightMapBuilder.put( - fi.getName(), - new FieldWeightDefault( - fi.getFieldType().isTextSearchableByDefault(), - fi.getFieldType().getTextSearchableFieldWeight())); - } - } - - this.fieldWeightMap = fieldWeightMapBuilder.build(); - // Create features with extra Earlybird derived fields, extra fields won't change the version - // but they do change the checksum. - this.searchFeatureSchema = createSearchResultFeatureSchema( - featureSchemaVersionPrefix, fieldSettingsMapByName, featureConfigMapByName); - } - - /** - * Add a set of features to a schema if they don't exist yet, and update the schema checksum. - * if there's conflict, RuntimeException will be thrown. - * Old map won't be touched, a new map will be returned will old and new data combined. - */ - public static Map appendToFeatureSchema( - Map oldEntryMap, - Set features) throws SchemaValidationException { - if (oldEntryMap == null) { - throw new SchemaValidationException( - "Cannot append features to schema, the entryMap is null"); - } - // make a copy of the existing map - ImmutableMap.Builder builder = - ImmutableSortedMap.naturalOrder() - .putAll(oldEntryMap); - - for (SearchResultFeature feature : features) { - if (oldEntryMap.containsKey(feature.getId())) { - FEATURES_EXISTED_IN_OLD_SCHEMA.increment(); - } else { - builder.put(feature.getId(), new ThriftSearchFeatureSchemaEntry() - .setFeatureName(feature.getName()) - .setFeatureType(feature.getType())); - } - } - return builder.build(); - } - - /** - * Append external features to create a new schema. - * @param oldSchema The old schema to build on top of - * @param features a list of features to be appended to the schema - * @param versionSuffix the version suffix, if not-null, it will be attached to the end of - * original schema's version. - * @return A new schema object with the appended fields - * @throws SchemaValidationException thrown when the checksum cannot be computed - */ - public static ThriftSearchFeatureSchema appendToCreateNewFeatureSchema( - ThriftSearchFeatureSchema oldSchema, - Set features, - @Nullable String versionSuffix) throws SchemaValidationException { - - ThriftSearchFeatureSchema newSchema = new ThriftSearchFeatureSchema(); - // copy over all the entries plus the new ones - newSchema.setEntries(appendToFeatureSchema(oldSchema.getEntries(), features)); - - ThriftSearchFeatureSchemaSpecifier spec = new ThriftSearchFeatureSchemaSpecifier(); - // the version is directly inherited or with a suffix - Preconditions.checkArgument(versionSuffix == null || !versionSuffix.isEmpty()); - spec.setVersion(versionSuffix == null - ? oldSchema.getSchemaSpecifier().getVersion() - : oldSchema.getSchemaSpecifier().getVersion() + versionSuffix); - spec.setChecksum(getChecksum(newSchema.getEntries())); - newSchema.setSchemaSpecifier(spec); - return newSchema; - } - - @Override - public FieldInfos getLuceneFieldInfos(Predicate acceptedFields) { - List acceptedFieldInfos = Lists.newArrayList(); - for (FieldInfo fi : getFieldInfos()) { - if (acceptedFields == null || acceptedFields.apply(fi.getName())) { - acceptedFieldInfos.add(convert(fi.getName(), fi.getFieldId(), fi.getFieldType())); - } - } - return new FieldInfos(acceptedFieldInfos.toArray( - new org.apache.lucene.index.FieldInfo[acceptedFieldInfos.size()])); - } - - private FieldInfo parseThriftFieldSettings(int fieldId, ThriftFieldConfiguration fieldConfig, - Map csfViewFields) - throws SchemaValidationException { - FieldInfo fieldInfo - = new FieldInfo(fieldId, fieldConfig.getFieldName(), new EarlybirdFieldType()); - ThriftFieldSettings fieldSettings = fieldConfig.getSettings(); - - - boolean settingFound = false; - - if (fieldSettings.isSetIndexedFieldSettings()) { - if (fieldSettings.isSetCsfFieldSettings() || fieldSettings.isSetCsfViewSettings()) { - throw new SchemaValidationException("ThriftFieldSettings: Only one of " - + "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set."); - } - - applyIndexedFieldSettings(fieldInfo, fieldSettings.getIndexedFieldSettings()); - settingFound = true; - } - - if (fieldSettings.isSetCsfFieldSettings()) { - if (fieldSettings.isSetIndexedFieldSettings() || fieldSettings.isSetCsfViewSettings()) { - throw new SchemaValidationException("ThriftFieldSettings: Only one of " - + "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set."); - } - - applyCsfFieldSettings(fieldInfo, fieldSettings.getCsfFieldSettings()); - settingFound = true; - } - - if (fieldSettings.isSetFacetFieldSettings()) { - if (!fieldSettings.isSetIndexedFieldSettings() && !(fieldSettings.isSetCsfFieldSettings() - && fieldSettings.getFacetFieldSettings().isUseCSFForFacetCounting() - && CAN_FACET_ON_CSF_TYPES.contains(fieldSettings.getCsfFieldSettings().getCsfType()))) { - throw new SchemaValidationException("ThriftFieldSettings: 'facetFieldSettings' can only be " - + "used in combination with 'indexedFieldSettings' or with 'csfFieldSettings' " - + "where 'isUseCSFForFacetCounting' was set to true and ThriftCSFType is a type that " - + "can be faceted on."); - } - - applyFacetFieldSettings(fieldInfo, fieldSettings.getFacetFieldSettings()); - settingFound = true; - } - - if (fieldSettings.isSetCsfViewSettings()) { - if (fieldSettings.isSetIndexedFieldSettings() || fieldSettings.isSetCsfFieldSettings()) { - throw new SchemaValidationException("ThriftFieldSettings: Only one of " - + "'indexedFieldSettings', 'csfFieldSettings', 'csfViewSettings' can be set."); - } - - // add this field now, but apply settings later to make sure the base field was added properly - // before - csfViewFields.put(fieldId, fieldConfig); - settingFound = true; - } - - if (!settingFound) { - throw new SchemaValidationException("ThriftFieldSettings: One of 'indexedFieldSettings', " - + "'csfFieldSettings' or 'facetFieldSettings' must be set."); - } - - // search field settings are optional - if (fieldSettings.isSetSearchFieldSettings()) { - if (!fieldSettings.isSetIndexedFieldSettings()) { - throw new SchemaValidationException( - "ThriftFieldSettings: 'searchFieldSettings' can only be " - + "used in combination with 'indexedFieldSettings'"); - } - - applySearchFieldSettings(fieldInfo, fieldSettings.getSearchFieldSettings()); - } - - return fieldInfo; - } - - private void applyCsfFieldSettings(FieldInfo fieldInfo, ThriftCSFFieldSettings settings) - throws SchemaValidationException { - // csfType is required - no need to check if it's set - fieldInfo.getFieldType().setDocValuesType(DocValuesType.NUMERIC); - fieldInfo.getFieldType().setCsfType(settings.getCsfType()); - - if (settings.isVariableLength()) { - fieldInfo.getFieldType().setDocValuesType(DocValuesType.BINARY); - fieldInfo.getFieldType().setCsfVariableLength(); - } else { - if (settings.isSetFixedLengthSettings()) { - fieldInfo.getFieldType().setCsfFixedLengthSettings( - settings.getFixedLengthSettings().getNumValuesPerDoc(), - settings.getFixedLengthSettings().isUpdateable()); - if (settings.getFixedLengthSettings().getNumValuesPerDoc() > 1) { - fieldInfo.getFieldType().setDocValuesType(DocValuesType.BINARY); - } - } else { - throw new SchemaValidationException( - "ThriftCSFFieldSettings: Either variableLength should be set to 'true', " - + "or fixedLengthSettings should be set."); - } - } - - fieldInfo.getFieldType().setCsfLoadIntoRam(settings.isLoadIntoRAM()); - if (settings.isSetDefaultValue()) { - fieldInfo.getFieldType().setCsfDefaultValue(settings.getDefaultValue()); - } - } - - private void applyCsfViewFieldSettings(FieldInfo fieldInfo, FieldInfo baseField, - ThriftCSFViewSettings settings) - throws SchemaValidationException { - // csfType is required - no need to check if it's set - fieldInfo.getFieldType().setDocValuesType(DocValuesType.NUMERIC); - fieldInfo.getFieldType().setCsfType(settings.getCsfType()); - - fieldInfo.getFieldType().setCsfFixedLengthSettings(1 /* numValuesPerDoc*/, - false /* updateable*/); - - fieldInfo.getFieldType().setCsfViewSettings(fieldInfo.getName(), settings, baseField); - } - - private void applyFacetFieldSettings(FieldInfo fieldInfo, ThriftFacetFieldSettings settings) { - if (settings.isSetFacetName()) { - fieldInfo.getFieldType().setFacetName(settings.getFacetName()); - } else { - // fall back to field name if no facet name is explicitly provided - fieldInfo.getFieldType().setFacetName(fieldInfo.getName()); - } - fieldInfo.getFieldType().setStoreFacetSkiplist(settings.isStoreSkiplist()); - fieldInfo.getFieldType().setStoreFacetOffensiveCounters(settings.isStoreOffensiveCounters()); - fieldInfo.getFieldType().setUseCSFForFacetCounting(settings.isUseCSFForFacetCounting()); - } - - private void applyIndexedFieldSettings(FieldInfo fieldInfo, ThriftIndexedFieldSettings settings) - throws SchemaValidationException { - fieldInfo.getFieldType().setIndexedField(true); - fieldInfo.getFieldType().setStored(settings.isStored()); - fieldInfo.getFieldType().setTokenized(settings.isTokenized()); - fieldInfo.getFieldType().setStoreTermVectors(settings.isStoreTermVectors()); - fieldInfo.getFieldType().setStoreTermVectorOffsets(settings.isStoreTermVectorOffsets()); - fieldInfo.getFieldType().setStoreTermVectorPositions(settings.isStoreTermVectorPositions()); - fieldInfo.getFieldType().setStoreTermVectorPayloads(settings.isStoreTermVectorPayloads()); - fieldInfo.getFieldType().setOmitNorms(settings.isOmitNorms()); - fieldInfo.getFieldType().setIndexHFTermPairs(settings.isIndexHighFreqTermPairs()); - fieldInfo.getFieldType().setUseTweetSpecificNormalization( - settings.deprecated_performTweetSpecificNormalizations); - - if (settings.isSetIndexOptions()) { - switch (settings.getIndexOptions()) { - case DOCS_ONLY : - fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS); - break; - case DOCS_AND_FREQS : - fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS); - break; - case DOCS_AND_FREQS_AND_POSITIONS : - fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - break; - case DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS : - fieldInfo.getFieldType().setIndexOptions( - IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS); - break; - default: - throw new SchemaValidationException("Unknown value for IndexOptions: " - + settings.getIndexOptions()); - } - } else if (settings.isIndexed()) { - // default for backward-compatibility - fieldInfo.getFieldType().setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS); - } - - fieldInfo.getFieldType().setStorePerPositionPayloads(settings.isStorePerPositionPayloads()); - fieldInfo.getFieldType().setDefaultPayloadLength( - settings.getDefaultPerPositionPayloadLength()); - fieldInfo.getFieldType().setBecomesImmutable(!settings.isSupportOutOfOrderAppends()); - fieldInfo.getFieldType().setSupportOrderedTerms(settings.isSupportOrderedTerms()); - fieldInfo.getFieldType().setSupportTermTextLookup(settings.isSupportTermTextLookup()); - - if (settings.isSetNumericFieldSettings()) { - fieldInfo.getFieldType().setNumericFieldSettings( - new IndexedNumericFieldSettings(settings.getNumericFieldSettings())); - } - - if (settings.isSetTokenStreamSerializer()) { - fieldInfo.getFieldType().setTokenStreamSerializerBuilder( - buildTokenStreamSerializerProvider(settings.getTokenStreamSerializer())); - } - } - - private void applySearchFieldSettings(FieldInfo fieldInfo, ThriftSearchFieldSettings settings) - throws SchemaValidationException { - fieldInfo.getFieldType().setTextSearchableFieldWeight( - (float) settings.getTextSearchableFieldWeight()); - fieldInfo.getFieldType().setTextSearchableByDefault(settings.isTextDefaultSearchable()); - } - - private void validate(FieldInfo fieldInfo) throws SchemaValidationException { - } - - private TokenStreamSerializer.Builder buildTokenStreamSerializerProvider( - final ThriftTokenStreamSerializer settings) { - TokenStreamSerializer.Builder builder = TokenStreamSerializer.builder(); - for (String serializerName : settings.getAttributeSerializerClassNames()) { - try { - builder.add((TokenStreamSerializer.AttributeSerializer) Class.forName(serializerName) - .newInstance()); - } catch (InstantiationException e) { - throw new RuntimeException( - "Unable to instantiate AttributeSerializer for name " + serializerName); - } catch (IllegalAccessException e) { - throw new RuntimeException( - "Unable to instantiate AttributeSerializer for name " + serializerName); - } catch (ClassNotFoundException e) { - throw new RuntimeException( - "Unable to instantiate AttributeSerializer for name " + serializerName); - } - } - return builder; - } - - @Override - public FacetsConfig getFacetsConfig() { - FacetsConfig facetsConfig = new FacetsConfig(); - - for (String facetName : facetNameToFieldMap.keySet()) { - // set multiValued = true as default, since we're using SortedSetDocValues facet, in which, - // there is no difference between multiValued true or false for the real facet, but only the - // checking of the values. - facetsConfig.setMultiValued(facetName, true); - } - - return facetsConfig; - } - - @Override - public Analyzer getDefaultAnalyzer(ThriftAnalyzer override) { - if (override != null) { - return analyzerFactory.getAnalyzer(override); - } - - if (defaultAnalyzer != null) { - return analyzerFactory.getAnalyzer(defaultAnalyzer); - } - - return new SearchWhitespaceAnalyzer(); - } - - @Override - public ImmutableCollection getFieldInfos() { - return fieldSettingsMapById.values(); - } - - /** - * This is the preferred method to check whether a field configuration is in schema. - * One can also use getFieldInfo and do null checks, but should be careful about excessive - * warning logging resulting from looking up fields not in schema. - */ - @Override - public boolean hasField(int fieldConfigId) { - return fieldSettingsMapById.containsKey(fieldConfigId); - } - - /** - * This is the preferred method to check whether a field configuration is in schema. - * One can also use getFieldInfo and do null checks, but should be careful about excessive - * warning logging resulting from looking up fields not in schema. - */ - @Override - public boolean hasField(String fieldName) { - return fieldSettingsMapByName.containsKey(fieldName); - } - - /** - * Get FieldInfo for the given field id. - * If the goal is to check whether a field is in the schema, use {@link #hasField(int)} instead. - * This method logs a warning whenever it returns null. - */ - @Override - @Nullable - public FieldInfo getFieldInfo(int fieldConfigId) { - return getFieldInfo(fieldConfigId, null); - } - - private org.apache.lucene.index.FieldInfo convert(String fieldName, - int index, - EarlybirdFieldType type) { - return new org.apache.lucene.index.FieldInfo( - fieldName, // String name - index, // int number - type.storeTermVectors(), // boolean storeTermVector - type.omitNorms(), // boolean omitNorms - type.isStorePerPositionPayloads(), // boolean storePayloads - type.indexOptions(), // IndexOptions indexOptions - type.docValuesType(), // DocValuesType docValues - -1, // long dvGen - Maps.newHashMap(), // Map attributes - 0, // int pointDataDimensionCount - 0, // int pointIndexDimensionCount - 0, // int pointNumBytes - false); // boolean softDeletesField - } - - /** - * Get FieldInfo for the given field name, or null if the field does not exist. - */ - @Override - @Nullable - public FieldInfo getFieldInfo(String fieldName) { - return fieldSettingsMapByName.get(fieldName); - } - - @Override - public String getFieldName(int fieldConfigId) { - FieldInfo fieldInfo = fieldSettingsMapById.get(fieldConfigId); - return fieldInfo != null ? fieldInfo.getName() : null; - } - - @Override - public FieldInfo getFieldInfo(int fieldConfigId, ThriftFieldConfiguration override) { - FieldInfo fieldInfo = fieldSettingsMapById.get(fieldConfigId); - if (fieldInfo == null) { - // This method is used to check the availability of fields by IDs, - // so no warning is logged here (would be too verbose otherwise). - return null; - } - - if (override != null) { - try { - return merge(fieldConfigId, fieldInfo, override); - } catch (SchemaValidationException e) { - throw new RuntimeException(e); - } - } - - return fieldInfo; - } - - @Override - public int getNumFacetFields() { - return numFacetFields; - } - - @Override - public FieldInfo getFacetFieldByFacetName(String facetName) { - return facetNameToFieldMap.get(facetName); - } - - @Override - public FieldInfo getFacetFieldByFieldName(String fieldName) { - FieldInfo fieldInfo = getFieldInfo(fieldName); - return fieldInfo != null && fieldInfo.getFieldType().isFacetField() ? fieldInfo : null; - } - - @Override - public Collection getFacetFields() { - return facetNameToFieldMap.values(); - } - - @Override - public Collection getCsfFacetFields() { - return csfFacetFields; - } - - @Override - public String getVersionDescription() { - return versionDesc; - } - - @Override - public int getMajorVersionNumber() { - return majorVersionNumber; - } - - @Override - public int getMinorVersionNumber() { - return minorVersionNumber; - } - - @Override - public boolean isVersionOfficial() { - return isVersionOfficial; - } - - /** - * Parses a version string like "16: renamed field x into y" into a version number and - * a string description. - * @return a Pair of the version number and the description - */ - private static Pair parseVersionString(String version) - throws SchemaValidationException { - Preconditions.checkNotNull(version, "Schema must have a version number and description."); - int colonIndex = version.indexOf(':'); - if (colonIndex == -1) { - throw new SchemaValidationException("Malformed version string: " + version); - } - try { - int versionNumber = Integer.parseInt(version.substring(0, colonIndex)); - String versionDesc = version.substring(colonIndex + 1); - return Pair.of(versionNumber, versionDesc); - } catch (Exception e) { - throw new SchemaValidationException("Malformed version string: " + version, e); - } - } - - @Override - public Map getFieldWeightMap() { - return fieldWeightMap; - } - - /** - * Build the feature maps so that we can use feature name to get the feature configuration. - * @return: an immutable map keyed on fieldName. - */ - private Pair, - ImmutableMap> buildFeatureMaps( - final Map csvViewFields) - throws SchemaValidationException { - - final ImmutableMap.Builder featureConfigMapByNameBuilder = - ImmutableMap.builder(); - final ImmutableMap.Builder featureConfigMapByIdBuilder = - ImmutableMap.builder(); - - for (final Map.Entry entry : csvViewFields.entrySet()) { - ThriftFieldSettings fieldSettings = entry.getValue().getSettings(); - FieldInfo fieldInfo = getFieldInfo(entry.getKey()); - FieldInfo baseFieldInfo = - getFieldInfo(fieldSettings.getCsfViewSettings().getBaseFieldConfigId()); - if (baseFieldInfo == null) { - throw new SchemaValidationException("Base field (id=" - + fieldSettings.getCsfViewSettings().getBaseFieldConfigId() + ") not found."); - } - applyCsfViewFieldSettings(fieldInfo, baseFieldInfo, fieldSettings.getCsfViewSettings()); - - FeatureConfiguration featureConfig = fieldInfo.getFieldType() - .getCsfViewFeatureConfiguration(); - if (featureConfig != null) { - featureConfigMapByNameBuilder.put(fieldInfo.getName(), featureConfig); - featureConfigMapByIdBuilder.put(fieldInfo.getFieldId(), featureConfig); - } - } - - return Pair.of(featureConfigMapByNameBuilder.build(), featureConfigMapByIdBuilder.build()); - } - - @Override - public FeatureConfiguration getFeatureConfigurationByName(String featureName) { - return featureConfigMapByName.get(featureName); - } - - @Override - public FeatureConfiguration getFeatureConfigurationById(int featureFieldId) { - return Preconditions.checkNotNull(featureConfigMapById.get(featureFieldId), - "Field ID: " + featureFieldId); - } - - @Override - @Nullable - public ThriftCSFType getCSFFieldType(String fieldName) { - FieldInfo fieldInfo = getFieldInfo(fieldName); - if (fieldInfo == null) { - return null; - } - - EarlybirdFieldType fieldType = fieldInfo.getFieldType(); - if (fieldType.docValuesType() != org.apache.lucene.index.DocValuesType.NUMERIC) { - return null; - } - - return fieldType.getCsfType(); - } - - @Override - public ImmutableSchemaInterface getSchemaSnapshot() { - return this; - } - - private FieldInfo merge(int fieldConfigId, - FieldInfo fieldInfo, - ThriftFieldConfiguration overrideConfig) - throws SchemaValidationException { - - throw new UnsupportedOperationException("Field override config not supported"); - } - - @Override - public ThriftSearchFeatureSchema getSearchFeatureSchema() { - return searchFeatureSchema; - } - - @Override - public ImmutableMap getFeatureIdToFeatureConfig() { - return featureConfigMapById; - } - - @Override - public ImmutableMap getFeatureNameToFeatureConfig() { - return featureConfigMapByName; - } - - private ThriftSearchFeatureSchema createSearchResultFeatureSchema( - String featureSchemaVersionPrefix, - Map allFieldSettings, - Map featureConfigurations) throws SchemaValidationException { - final ImmutableMap.Builder builder = - new ImmutableMap.Builder<>(); - - for (Map.Entry field : allFieldSettings.entrySet()) { - FeatureConfiguration featureConfig = featureConfigurations.get(field.getKey()); - if (featureConfig == null) { - // This is either a not csf related field or a csf field. - continue; - } - - // This is a csfView field. - if (featureConfig.getOutputType() == null) { - LOG.info("Skip unused fieldschemas: {} for search feature schema.", field.getKey()); - continue; - } - - ThriftSearchFeatureType featureType = getResultFeatureType(featureConfig.getOutputType()); - if (featureType != null) { - builder.put( - field.getValue().getFieldId(), - new ThriftSearchFeatureSchemaEntry(field.getKey(), featureType)); - } else { - LOG.error("Invalid CSFType encountered for csf field: {}", field.getKey()); - } - } - Map indexOnlySchemaEntries = builder.build(); - - // Add earlybird derived features, they are defined in ExternalTweetFeatures and used in the - // scoring function. They are no different from those auto-generated index-based features - // viewed from outside Earlybird. - Map entriesWithEBFeatures = - appendToFeatureSchema( - indexOnlySchemaEntries, ExternalTweetFeature.EARLYBIRD_DERIVED_FEATURES); - - // Add other features needed for tweet ranking from EarlybirdRankingDerivedFeature. - Map allSchemaEntries = appendToFeatureSchema( - entriesWithEBFeatures, ExternalTweetFeature.EARLYBIRD_RANKING_DERIVED_FEATURES); - - long schemaEntriesChecksum = getChecksum(allSchemaEntries); - SearchLongGauge.export("feature_schema_checksum", new AtomicLong(schemaEntriesChecksum)); - - String schemaVersion = String.format( - "%s.%d.%d", featureSchemaVersionPrefix, majorVersionNumber, minorVersionNumber); - ThriftSearchFeatureSchemaSpecifier schemaSpecifier = - new ThriftSearchFeatureSchemaSpecifier(schemaVersion, schemaEntriesChecksum); - - ThriftSearchFeatureSchema schema = new ThriftSearchFeatureSchema(); - schema.setSchemaSpecifier(schemaSpecifier); - schema.setEntries(allSchemaEntries); - - return schema; - } - - // Serializes schemaEntries to a byte array, and computes a CRC32 checksum of the array. - // The serialization needs to be stable: if schemaEntries1.equals(schemaEntries2), we want - // this method to produce the same checksum for schemaEntrie1 and schemaEntrie2, even if - // the checksums are computed in different JVMs, etc. - private static long getChecksum(Map schemaEntries) - throws SchemaValidationException { - SortedMap sortedSchemaEntries = - new TreeMap(schemaEntries); - - CRC32OutputStream crc32OutputStream = new CRC32OutputStream(); - ObjectOutputStream objectOutputStream = null; - try { - objectOutputStream = new ObjectOutputStream(crc32OutputStream); - for (Integer fieldId : sortedSchemaEntries.keySet()) { - objectOutputStream.writeObject(fieldId); - ThriftSearchFeatureSchemaEntry schemaEntry = sortedSchemaEntries.get(fieldId); - objectOutputStream.writeObject(schemaEntry.getFeatureName()); - objectOutputStream.writeObject(schemaEntry.getFeatureType()); - } - objectOutputStream.flush(); - return crc32OutputStream.getValue(); - } catch (IOException e) { - throw new SchemaValidationException("Could not serialize feature schema entries.", e); - } finally { - Preconditions.checkNotNull(objectOutputStream); - try { - objectOutputStream.close(); - } catch (IOException e) { - throw new SchemaValidationException("Could not close ObjectOutputStream.", e); - } - } - } - - /** - * Get the search feature type based on the csf type. - * @param csfType the column stride field type for the data - * @return the corresponding search feature type - */ - @VisibleForTesting - public static ThriftSearchFeatureType getResultFeatureType(ThriftCSFType csfType) { - switch (csfType) { - case INT: - case BYTE: - return ThriftSearchFeatureType.INT32_VALUE; - case BOOLEAN: - return ThriftSearchFeatureType.BOOLEAN_VALUE; - case FLOAT: - case DOUBLE: - return ThriftSearchFeatureType.DOUBLE_VALUE; - case LONG: - return ThriftSearchFeatureType.LONG_VALUE; - default: - return null; - } - } -} diff --git a/src/java/com/twitter/search/common/schema/NumericField.docx b/src/java/com/twitter/search/common/schema/NumericField.docx new file mode 100644 index 000000000..49e4a905e Binary files /dev/null and b/src/java/com/twitter/search/common/schema/NumericField.docx differ diff --git a/src/java/com/twitter/search/common/schema/NumericField.java b/src/java/com/twitter/search/common/schema/NumericField.java deleted file mode 100644 index c6c528d55..000000000 --- a/src/java/com/twitter/search/common/schema/NumericField.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.search.common.schema; - -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.IndexOptions; - -/** - * A Lucene numeric field, similar to the LegacyIntField, LegacyLongField, etc. Lucene classes that - * were removed in Lucene 7.0.0. - */ -public final class NumericField extends Field { - private static final FieldType NUMERIC_FIELD_TYPE = new FieldType(); - static { - NUMERIC_FIELD_TYPE.setTokenized(true); - NUMERIC_FIELD_TYPE.setOmitNorms(true); - NUMERIC_FIELD_TYPE.setIndexOptions(IndexOptions.DOCS); - NUMERIC_FIELD_TYPE.freeze(); - } - - /** - * Creates a new integer field with the given name and value. - */ - public static NumericField newIntField(String fieldName, int value) { - NumericField field = new NumericField(fieldName); - field.fieldsData = Integer.valueOf(value); - return field; - } - - /** - * Creates a new long field with the given name and value. - */ - public static NumericField newLongField(String fieldName, long value) { - NumericField field = new NumericField(fieldName); - field.fieldsData = Long.valueOf(value); - return field; - } - - // We could replace the static methods with constructors, but I think that would make it much - // easier to accidentally use NumericField(String, int) instead of NumericField(String, long), - // for example, leading to hard to debug errors. - private NumericField(String fieldName) { - super(fieldName, NUMERIC_FIELD_TYPE); - } -} diff --git a/src/java/com/twitter/search/common/schema/SchemaBuilder.docx b/src/java/com/twitter/search/common/schema/SchemaBuilder.docx new file mode 100644 index 000000000..0e1d8b69d Binary files /dev/null and b/src/java/com/twitter/search/common/schema/SchemaBuilder.docx differ diff --git a/src/java/com/twitter/search/common/schema/SchemaBuilder.java b/src/java/com/twitter/search/common/schema/SchemaBuilder.java deleted file mode 100644 index e12da2a65..000000000 --- a/src/java/com/twitter/search/common/schema/SchemaBuilder.java +++ /dev/null @@ -1,693 +0,0 @@ -package com.twitter.search.common.schema; - -import java.util.Map; -import java.util.Set; -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Sets; - -import com.twitter.common.text.util.CharSequenceTermAttributeSerializer; -import com.twitter.common.text.util.PositionIncrementAttributeSerializer; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.common.text.util.TokenTypeAttributeSerializer; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.thriftjava.ThriftCSFFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFacetFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint; -import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration; -import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFixedLengthCSFSettings; -import com.twitter.search.common.schema.thriftjava.ThriftIndexOptions; -import com.twitter.search.common.schema.thriftjava.ThriftIndexedFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftIndexedNumericFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftNumericType; -import com.twitter.search.common.schema.thriftjava.ThriftSchema; -import com.twitter.search.common.schema.thriftjava.ThriftSearchFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer; -import com.twitter.search.common.util.analysis.CharTermAttributeSerializer; -import com.twitter.search.common.util.analysis.IntTermAttributeSerializer; -import com.twitter.search.common.util.analysis.LongTermAttributeSerializer; -import com.twitter.search.common.util.analysis.PayloadAttributeSerializer; - -public class SchemaBuilder { - - public static final String CSF_VIEW_NAME_SEPARATOR = "."; - protected final ThriftSchema schema = new ThriftSchema(); - protected final FieldNameToIdMapping idMapping; - protected final int tokenStreamSerializerVersion; - - // As of now, we do not allow two fields to share the same field name. - // This set is used to perform this check. - private final Set fieldNameSet = Sets.newHashSet(); - - /** - * Construct a schema builder with the given FieldNameToIdMapper. - * A SchemaBuilder is used to build a ThriftSchema incrementally. - */ - public SchemaBuilder(FieldNameToIdMapping idMapping, - TokenStreamSerializer.Version tokenStreamSerializerVersion) { - this.idMapping = idMapping; - Preconditions.checkArgument( - tokenStreamSerializerVersion == TokenStreamSerializer.Version.VERSION_2); - this.tokenStreamSerializerVersion = tokenStreamSerializerVersion.ordinal(); - } - - /** - * Build ThriftSchema using settings accumulated so far. - */ - public final ThriftSchema build() { - return schema; - } - - /** - * Uses fieldName also as facetName. - */ - public final SchemaBuilder withFacetConfigs(String fieldName, - boolean storeSkipList, - boolean storeOffensiveCounters, - boolean useCSFForFacetCounting) { - return withFacetConfigs( - fieldName, - fieldName, - storeSkipList, - storeOffensiveCounters, - useCSFForFacetCounting); - } - - /** - * Add facet field configuration. - */ - public final SchemaBuilder withFacetConfigs(String fieldName, - String facetName, - boolean storeSkipList, - boolean storeOffensiveCounters, - boolean useCSFForFacetCounting) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFacetFieldSettings facetSettings = new ThriftFacetFieldSettings(); - // As of now, all our facet names are the same as field names - facetSettings.setFacetName(facetName); - facetSettings.setStoreSkiplist(storeSkipList); - facetSettings.setStoreOffensiveCounters(storeOffensiveCounters); - facetSettings.setUseCSFForFacetCounting(useCSFForFacetCounting); - - int fieldId = idMapping.getFieldID(fieldName); - ThriftFieldConfiguration fieldConfiguration = schema.getFieldConfigs().get(fieldId); - Preconditions.checkNotNull(fieldConfiguration, - "In Earlybird, a facet field must be indexed. " - + "No ThriftIndexedFieldSettings found for field " + fieldName); - fieldConfiguration.getSettings().setFacetFieldSettings(facetSettings); - return this; - } - - /** - * Configure the given field ID to be used for partitioning. - */ - public final SchemaBuilder withPartitionFieldId(int partitionFieldId) { - schema.setPartitionFieldId(partitionFieldId); - return this; - } - - /** - * Add a column stride field into schema. - */ - public final SchemaBuilder withColumnStrideField(String fieldName, - ThriftCSFType type, - int numValuesPerDoc, - boolean updatable, - boolean loadIntoRam) { - return withColumnStrideField(fieldName, type, numValuesPerDoc, updatable, loadIntoRam, null); - } - - /** - * Add a column stride field into schema that is variable length. - */ - public final SchemaBuilder withBinaryColumnStrideField(String fieldName, - boolean loadIntoRam) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings(); - csfFieldSettings.setCsfType(ThriftCSFType.BYTE) - .setVariableLength(true) - .setLoadIntoRAM(loadIntoRam); - - ThriftFieldSettings fieldSettings = - new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings); - ThriftFieldConfiguration fieldConf = - new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf); - return this; - } - - /** - * Add a column stride field into schema which has a default value. - */ - public final SchemaBuilder withColumnStrideField(String fieldName, - ThriftCSFType type, - int numValuesPerDoc, - boolean updatable, - boolean loadIntoRam, - Long defaultValue) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings(); - csfFieldSettings.setCsfType(type) - .setVariableLength(false) - .setFixedLengthSettings( - new ThriftFixedLengthCSFSettings() - .setNumValuesPerDoc(numValuesPerDoc) - .setUpdateable(updatable)) - .setLoadIntoRAM(loadIntoRam); - - if (defaultValue != null) { - csfFieldSettings.setDefaultValue(defaultValue); - } - - ThriftFieldSettings fieldSettings = - new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings); - ThriftFieldConfiguration fieldConf = - new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf); - return this; - } - - /** - * Add a CSF view into schema. A view is a portion of another CSF. - */ - public final SchemaBuilder withColumnStrideFieldView( - String fieldName, - ThriftCSFType csfType, - ThriftCSFType outputCSFType, - String baseFieldName, - int valueIndex, - int bitStartPosition, - int bitLength, - ThriftFeatureNormalizationType featureNormalizationType, - @Nullable Set constraints) { - if (!shouldIncludeField(fieldName)) { - return this; - } - - int baseFieldConfigID = idMapping.getFieldID(baseFieldName); - - ThriftCSFViewSettings csfViewSettings = new ThriftCSFViewSettings() - .setBaseFieldConfigId(baseFieldConfigID) - .setCsfType(csfType) - .setValueIndex(valueIndex) - .setBitStartPosition(bitStartPosition) - .setBitLength(bitLength); - if (outputCSFType != null) { - csfViewSettings.setOutputCSFType(outputCSFType); - } - if (featureNormalizationType != ThriftFeatureNormalizationType.NONE) { - csfViewSettings.setNormalizationType(featureNormalizationType); - } - if (constraints != null) { - csfViewSettings.setFeatureUpdateConstraints(constraints); - } - ThriftFieldSettings fieldSettings = new ThriftFieldSettings() - .setCsfViewSettings(csfViewSettings); - ThriftFieldConfiguration fieldConf = new ThriftFieldConfiguration(fieldName) - .setSettings(fieldSettings); - - Map fieldConfigs = schema.getFieldConfigs(); - verifyCSFViewSettings(fieldConfigs, fieldConf); - - putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf); - return this; - } - - /** - * Sanity checks for CSF view settings. - */ - public static void verifyCSFViewSettings(Map fieldConfigs, - ThriftFieldConfiguration fieldConf) { - Preconditions.checkNotNull(fieldConf.getSettings()); - Preconditions.checkNotNull(fieldConf.getSettings().getCsfViewSettings()); - ThriftCSFViewSettings csfViewSettings = fieldConf.getSettings().getCsfViewSettings(); - - if (fieldConfigs != null) { - ThriftFieldConfiguration baseFieldConfig = fieldConfigs.get( - csfViewSettings.getBaseFieldConfigId()); - if (baseFieldConfig != null) { - String baseFieldName = baseFieldConfig.getFieldName(); - String expectedViewNamePrefix = baseFieldName + CSF_VIEW_NAME_SEPARATOR; - if (fieldConf.getFieldName().startsWith(expectedViewNamePrefix)) { - ThriftFieldSettings baseFieldSettings = baseFieldConfig.getSettings(); - ThriftCSFFieldSettings baseFieldCSFSettings = baseFieldSettings.getCsfFieldSettings(); - - if (baseFieldCSFSettings != null) { - if (!baseFieldCSFSettings.isVariableLength() - && baseFieldCSFSettings.getFixedLengthSettings() != null) { - - ThriftCSFType baseCSFType = baseFieldCSFSettings.getCsfType(); - switch (baseCSFType) { - case BYTE: - checkCSFViewPositions(baseFieldCSFSettings, 8, csfViewSettings); - break; - case INT: - checkCSFViewPositions(baseFieldCSFSettings, 32, csfViewSettings); - break; - default: - throw new IllegalStateException("Base field: " + baseFieldName - + " is of a non-supported CSFType: " + baseCSFType); - } - } else { - throw new IllegalStateException("Base field: " + baseFieldName - + " must be a fixed-length CSF field"); - } - } else { - throw new IllegalStateException("Base field: " + baseFieldName + " is not a CSF field"); - } - } else { - throw new IllegalStateException("View field name for baseFieldConfigID: " - + csfViewSettings.getBaseFieldConfigId() + " must start with: '" - + expectedViewNamePrefix + "'"); - } - } else { - throw new IllegalStateException("Can't add a view, no field defined for base fieldID: " - + csfViewSettings.getBaseFieldConfigId()); - } - } else { - throw new IllegalStateException("Can't add a view, no field configs defined."); - } - } - - private static void checkCSFViewPositions(ThriftCSFFieldSettings baseFieldCSFSettings, - int bitsPerValue, - ThriftCSFViewSettings csfViewSettings) { - ThriftFixedLengthCSFSettings fixedLengthCSFSettings = - baseFieldCSFSettings.getFixedLengthSettings(); - Preconditions.checkNotNull(fixedLengthCSFSettings); - - int numValues = fixedLengthCSFSettings.getNumValuesPerDoc(); - Preconditions.checkState(csfViewSettings.getValueIndex() >= 0, - "value index must be positive: " + csfViewSettings.getValueIndex()); - Preconditions.checkState(csfViewSettings.getValueIndex() < numValues, "value index " - + csfViewSettings.getValueIndex() + " must be less than numValues: " + numValues); - - Preconditions.checkState(csfViewSettings.getBitStartPosition() >= 0, - "bitStartPosition must be positive: " + csfViewSettings.getBitStartPosition()); - Preconditions.checkState(csfViewSettings.getBitStartPosition() < bitsPerValue, - "bitStartPosition " + csfViewSettings.getBitStartPosition() - + " must be less than bitsPerValue " + bitsPerValue); - - Preconditions.checkState(csfViewSettings.getBitLength() >= 1, - "bitLength must be positive: " + csfViewSettings.getBitLength()); - - Preconditions.checkState( - csfViewSettings.getBitStartPosition() + csfViewSettings.getBitLength() <= bitsPerValue, - String.format("bitStartPosition (%d) + bitLength (%d) must be less than bitsPerValue (%d)", - csfViewSettings.getBitStartPosition(), csfViewSettings.getBitLength(), bitsPerValue)); - } - - // No position; no freq; not pretokenized; not tokenized. - /** - * Norm is disabled as default. Like Lucene string field, or int/long fields. - */ - public final SchemaBuilder withIndexedNotTokenizedField(String fieldName) { - return withIndexedNotTokenizedField(fieldName, false); - } - - /** - * Add an indexed but not tokenized field. This is similar to Lucene's StringField. - */ - public final SchemaBuilder withIndexedNotTokenizedField(String fieldName, - boolean supportOutOfOrderAppends) { - return withIndexedNotTokenizedField(fieldName, supportOutOfOrderAppends, true); - } - - private final SchemaBuilder withIndexedNotTokenizedField(String fieldName, - boolean supportOutOfOrderAppends, - boolean omitNorms) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = getNoPositionNoFreqSettings(supportOutOfOrderAppends); - settings.getIndexedFieldSettings().setOmitNorms(omitNorms); - ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName) - .setSettings(settings); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), config); - return this; - } - - - /** Makes the given field searchable by default, with the given weight. */ - public final SchemaBuilder withSearchFieldByDefault( - String fieldName, float textSearchableFieldWeight) { - if (!shouldIncludeField(fieldName)) { - return this; - } - - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - settings.setSearchFieldSettings( - new ThriftSearchFieldSettings() - .setTextSearchableFieldWeight(textSearchableFieldWeight) - .setTextDefaultSearchable(true)); - - return this; - } - - /** - * Similar to Lucene's TextField. The string is analyzed using the default/override analyzer. - * @param fieldName - * @param addHfPairIfHfFieldsArePresent Add hfPair fields if they exists in the schema. - * For certain text fields, adding hfPair fields are usually preferred, but they may - * not exist in the schema, in which case the hfPair fields will not be added. - */ - public final SchemaBuilder withTextField(String fieldName, - boolean addHfPairIfHfFieldsArePresent) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName).setSettings( - getDefaultSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS)); - - if (addHfPairIfHfFieldsArePresent) { - // Add hfPair fields only if they exist in the schema for the cluster - boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD) - && shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD); - config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair); - } - - config.getSettings().getIndexedFieldSettings().setTokenized(true); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), config); - return this; - } - - /** - * Marked the given field as having per position payload. - */ - public final SchemaBuilder withPerPositionPayload(String fieldName, int defaultPayloadLength) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - - settings.getIndexedFieldSettings().setStorePerPositionPayloads(true); - settings.getIndexedFieldSettings().setDefaultPerPositionPayloadLength(defaultPayloadLength); - return this; - } - - /** - * Add field into schema that is pre-tokenized and does not have position. - * E.g. hashtags / stocks / card_domain - */ - public final SchemaBuilder withPretokenizedNoPositionField(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName) - .setSettings(getPretokenizedNoPositionFieldSetting()); - // Add hfPair fields only if they exist in the schema for the cluster - boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD) - && shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD); - config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), config); - return this; - } - - /** - * Mark the field to have ordered term dictionary. - * In Lucene, term dictionary is sorted. In Earlybird, term dictionary order is not - * guaranteed unless this is turned on. - */ - public final SchemaBuilder withOrderedTerms(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - - settings.getIndexedFieldSettings().setSupportOrderedTerms(true); - return this; - } - - /** - * Support lookup of term text by term id in the term dictionary. - */ - public final SchemaBuilder withTermTextLookup(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - - settings.getIndexedFieldSettings().setSupportTermTextLookup(true); - return this; - } - - /** - * Add a text field that is pre-tokenized, so not analyzed again in the index (e.g. Earlybird). - * - * Note that the token streams MUST be created using the attributes defined in - * {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}. - */ - public final SchemaBuilder withPretokenizedTextField( - String fieldName, - boolean addHfPairIfHfFieldsArePresent) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName) - .setSettings(getDefaultPretokenizedSettings( - ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS)); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), config); - // Add hfPair fields only if they exist in the schema for the cluster - if (addHfPairIfHfFieldsArePresent) { - // Add hfPair fields only if they exist in the schema for the cluster - boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD) - && shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD); - config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair); - } - return this; - } - - /** - * Add a feature configuration - */ - public final SchemaBuilder withFeatureConfiguration(String baseFieldName, String viewName, - FeatureConfiguration featureConfiguration) { - return withColumnStrideFieldView( - viewName, - // Defaulting all encoded tweet features to int since the underlying encoded tweet features - // are ints. - ThriftCSFType.INT, - featureConfiguration.getOutputType(), - baseFieldName, - featureConfiguration.getValueIndex(), - featureConfiguration.getBitStartPosition(), - featureConfiguration.getBitLength(), - featureConfiguration.getFeatureNormalizationType(), - featureConfiguration.getUpdateConstraints() - ); - } - - /** - * Add a long field in schema. This field uses LongTermAttribute. - */ - private SchemaBuilder addLongTermField(String fieldName, boolean useSortableEncoding) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings longTermSettings = getEarlybirdNumericFieldSettings(); - ThriftTokenStreamSerializer tokenStreamSerializer = - new ThriftTokenStreamSerializer(tokenStreamSerializerVersion); - tokenStreamSerializer.setAttributeSerializerClassNames( - ImmutableList.of(LongTermAttributeSerializer.class.getName())); - longTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(tokenStreamSerializer); - - ThriftIndexedNumericFieldSettings numericFieldSettings = - new ThriftIndexedNumericFieldSettings(true); - numericFieldSettings.setNumericType(ThriftNumericType.LONG); - numericFieldSettings.setUseSortableEncoding(useSortableEncoding); - longTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings); - - putIntoFieldConfigs(idMapping.getFieldID(fieldName), - new ThriftFieldConfiguration(fieldName).setSettings(longTermSettings)); - return this; - } - - public final SchemaBuilder withSortableLongTermField(String fieldName) { - return addLongTermField(fieldName, true); - } - - public final SchemaBuilder withLongTermField(String fieldName) { - return addLongTermField(fieldName, false); - } - - /** - * Add an int field in schema. This field uses IntTermAttribute. - */ - public final SchemaBuilder withIntTermField(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings intTermSettings = getEarlybirdNumericFieldSettings(); - ThriftTokenStreamSerializer attributeSerializer = - new ThriftTokenStreamSerializer(tokenStreamSerializerVersion); - attributeSerializer.setAttributeSerializerClassNames( - ImmutableList.of(IntTermAttributeSerializer.class.getName())); - intTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer); - - ThriftIndexedNumericFieldSettings numericFieldSettings = - new ThriftIndexedNumericFieldSettings(true); - numericFieldSettings.setNumericType(ThriftNumericType.INT); - intTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings); - - putIntoFieldConfigs(idMapping.getFieldID(fieldName), - new ThriftFieldConfiguration(fieldName).setSettings(intTermSettings)); - return this; - } - - /** - * Timeline and ExpertSearch uses - * {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer} to store weighted - * values. - * - * E.g. for the PRODUCED_LANGUAGES and CONSUMED_LANGUAGES fields, they contain not a single, - * value, but instead a list of values with a weight associated with each value. - * - * This method adds an indexed field that uses - * {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer}. - */ - public final SchemaBuilder withCharTermPayloadWeightedField(String fieldName) { - ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName) - .setSettings(getPayloadWeightedSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS)); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), config); - return this; - } - - /** - * Set the version and description of this schema. - */ - public final SchemaBuilder withSchemaVersion( - int majorVersionNumber, - int minorVersionNumber, - String versionDesc, - boolean isOfficial) { - schema.setMajorVersionNumber(majorVersionNumber); - schema.setMinorVersionNumber(minorVersionNumber); - - schema.setVersion(majorVersionNumber + ":" + versionDesc); - schema.setVersionIsOfficial(isOfficial); - - return this; - } - - public final SchemaBuilder withSchemaVersion( - int majorVersionNumber, - String versionDesc, - boolean isOfficial) { - return withSchemaVersion(majorVersionNumber, 0, versionDesc, isOfficial); - } - - protected void putIntoFieldConfigs(int id, ThriftFieldConfiguration config) { - if (schema.getFieldConfigs() != null && schema.getFieldConfigs().containsKey(id)) { - throw new IllegalStateException("Already have a ThriftFieldConfiguration for field id " + id); - } - - if (fieldNameSet.contains(config.getFieldName())) { - throw new IllegalStateException("Already have a ThriftFieldConfiguration for field " - + config.getFieldName()); - } - fieldNameSet.add(config.getFieldName()); - schema.putToFieldConfigs(id, config); - } - - // Default field settings. Most field settings are similar to this. - protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption) { - return getDefaultSettings(indexOption, false); - } - - protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption, - boolean supportOutOfOrderAppends) { - ThriftFieldSettings fieldSettings = new ThriftFieldSettings(); - ThriftIndexedFieldSettings indexedFieldSettings = new ThriftIndexedFieldSettings(); - indexedFieldSettings - .setIndexed(true) - .setStored(false) - .setTokenized(false) - .setStoreTermVectors(false) - .setStoreTermVectorOffsets(false) - .setStoreTermVectorPayloads(false) - .setStoreTermVectorPositions(false) - .setSupportOutOfOrderAppends(supportOutOfOrderAppends) - .setIndexOptions(indexOption) - .setOmitNorms(true); // All Earlybird fields omit norms. - fieldSettings.setIndexedFieldSettings(indexedFieldSettings); - return fieldSettings; - } - - /** - * Default field settings for fields that are pretokenized - * - * The fields that use these settings will need to be tokenized using a serializer with the - * attributes defined in {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}. - */ - protected final ThriftFieldSettings getDefaultPretokenizedSettings( - ThriftIndexOptions indexOption) { - ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption); - fieldSettings.getIndexedFieldSettings().setTokenized(true); - ThriftTokenStreamSerializer attributeSerializer = - new ThriftTokenStreamSerializer(tokenStreamSerializerVersion); - attributeSerializer.setAttributeSerializerClassNames( - ImmutableList.of( - CharSequenceTermAttributeSerializer.class.getName(), - PositionIncrementAttributeSerializer.class.getName(), - TokenTypeAttributeSerializer.class.getName())); - - fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer); - return fieldSettings; - } - - protected final ThriftFieldSettings getPretokenizedNoPositionFieldSetting() { - return getDefaultPretokenizedSettings(ThriftIndexOptions.DOCS_AND_FREQS); - } - - protected final ThriftFieldSettings getNoPositionNoFreqSettings() { - return getNoPositionNoFreqSettings(false); - } - - protected final ThriftFieldSettings getNoPositionNoFreqSettings( - boolean supportOutOfOrderAppends) { - return getDefaultSettings(ThriftIndexOptions.DOCS_ONLY, supportOutOfOrderAppends); - } - - protected final ThriftFieldSettings getEarlybirdNumericFieldSettings() { - // Supposedly numeric fields are not tokenized. - // However, Earlybird uses SingleTokenTokenStream to handle int/long fields. - // So we need to set indexed to true for these fields. - ThriftFieldSettings settings = getNoPositionNoFreqSettings(); - settings.getIndexedFieldSettings().setTokenized(true); - return settings; - } - - private ThriftFieldSettings getPayloadWeightedSettings(ThriftIndexOptions indexOption) { - ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption); - fieldSettings.getIndexedFieldSettings().setTokenized(true); - ThriftTokenStreamSerializer attributeSerializer = - new ThriftTokenStreamSerializer(tokenStreamSerializerVersion); - attributeSerializer.setAttributeSerializerClassNames( - ImmutableList.of(CharTermAttributeSerializer.class.getName(), - PositionIncrementAttributeSerializer.class.getName(), - PayloadAttributeSerializer.class.getName())); - fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer); - return fieldSettings; - } - - protected boolean shouldIncludeField(String fieldName) { - return true; - } -} diff --git a/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.docx b/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.docx new file mode 100644 index 000000000..a18289a7b Binary files /dev/null and b/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.docx differ diff --git a/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.java b/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.java deleted file mode 100644 index 1efb745cc..000000000 --- a/src/java/com/twitter/search/common/schema/SchemaDocumentFactory.java +++ /dev/null @@ -1,433 +0,0 @@ -package com.twitter.search.common.schema; - -import java.io.IOException; -import java.io.StringReader; -import java.util.Collections; -import java.util.List; -import java.util.Set; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Sets; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetField; -import org.apache.lucene.util.BytesRef; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.text.token.TwitterTokenStream; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.IndexedNumericFieldSettings; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftFieldData; -import com.twitter.search.common.schema.thriftjava.ThriftGeoCoordinate; -import com.twitter.search.common.util.analysis.IntTermAttribute; -import com.twitter.search.common.util.analysis.LongTermAttribute; -import com.twitter.search.common.util.analysis.SortableLongTermAttribute; -import com.twitter.search.common.util.spatial.GeoUtil; -import com.twitter.search.common.util.text.HighFrequencyTermPairs; -import com.twitter.search.common.util.text.OmitNormTextField; -import com.twitter.search.common.util.text.SingleTokenStream; - -/** - * A document factory that converts {@link ThriftDocument} into Lucene {@link Document}s - * using the provided {@link com.twitter.search.common.schema.base.Schema}. - */ -public class SchemaDocumentFactory { - private static final Logger LOG = LoggerFactory.getLogger(SchemaDocumentFactory.class); - - private final Schema schema; - private final ImmutableList tokenStreamRewriters; - - /** - * Creates a SchemaDocumentFactory with a schema and the tokenStreamRewriters. - * - * @param tokenStreamRewriters a list of token stream rewriters, which will be applied in order. - */ - public SchemaDocumentFactory( - Schema schema, - List tokenStreamRewriters) { - this.schema = schema; - this.tokenStreamRewriters = ImmutableList.copyOf(tokenStreamRewriters); - } - - /** - * Creates a SchemaDocumentFactory with no tokenStreamRewriters. - */ - public SchemaDocumentFactory(Schema schema) { - this(schema, Collections.EMPTY_LIST); - } - - public final Document newDocument(ThriftDocument document) throws IOException { - return innerNewDocument(document); - } - - /** - * Create a Lucene document from the ThriftDocument. - */ - @VisibleForTesting - public Document innerNewDocument(ThriftDocument document) throws IOException { - Document luceneDocument = new Document(); - Set hfTerms = Sets.newHashSet(); - Set hfPhrases = Sets.newHashSet(); - - Analyzer defaultAnalyzer = schema.getDefaultAnalyzer(document.getDefaultAnalyzerOverride()); - - for (ThriftField field : document.getFields()) { - boolean successful = false; - try { - addLuceneFields(field, defaultAnalyzer, luceneDocument, hfTerms, hfPhrases); - successful = true; - } finally { - if (!successful) { - LOG.warn("Unexpected exception while trying to add field. Field ID: " - + field.getFieldConfigId() + " Field Name: " - + schema.getFieldName(field.getFieldConfigId())); - } - } - } - - for (String token : hfTerms) { - for (String token2 : hfTerms) { - if (token.compareTo(token2) < 0) { - luceneDocument.add(new Field(ImmutableSchema.HF_TERM_PAIRS_FIELD, - HighFrequencyTermPairs.createPair(token, token2), - OmitNormTextField.TYPE_NOT_STORED)); - } - } - } - - for (String phrase : hfPhrases) { - // Tokens in the phrase set are not terms and have already been processed with - // HighFrequencyTermPairs.createPhrasePair. - luceneDocument.add(new Field(ImmutableSchema.HF_PHRASE_PAIRS_FIELD, phrase, - OmitNormTextField.TYPE_NOT_STORED)); - } - - return schema.getFacetsConfig().build(luceneDocument); - } - - private void addLuceneFields(ThriftField field, Analyzer analyzer, Document doc, - Set hfTerms, Set hfPhrases) throws IOException { - Schema.FieldInfo fieldInfo = - schema.getFieldInfo(field.getFieldConfigId(), field.getFieldConfigOverride()); - - if (fieldInfo == null) { - // field not defined in schema - skip it - return; - } - - ThriftFieldData fieldData = field.getFieldData(); - if (fieldInfo.getFieldType().getCsfType() != null) { - addCSFField(doc, fieldInfo, fieldData); - return; - } - - // Checking which data type is set is not sufficient here. We also need to check schema to - // see what the type the field is configured to be. See SEARCH-5173 for more details. - // The problem is that Pig, while converting Tuples to Thrift, sets all primitive type - // fields to 0. (i.e. the isSet calls will return true). - IndexedNumericFieldSettings numericSettings = - fieldInfo.getFieldType().getNumericFieldSettings(); - if (fieldData.isSetTokenStreamValue()) { - addTokenField(doc, hfTerms, hfPhrases, fieldInfo, fieldData); - } else if (fieldData.isSetStringValue()) { - addStringField(analyzer, doc, hfTerms, hfPhrases, fieldInfo, fieldData); - } else if (fieldData.isSetBytesValue()) { - addBytesField(doc, fieldInfo, fieldData); - } else if (fieldData.isSetGeoCoordinate()) { - addGeoField(doc, fieldInfo, fieldData); - } else if (numericSettings != null) { - // handle numeric fields. - switch (numericSettings.getNumericType()) { - case INT: - Preconditions.checkState(fieldData.isSetIntValue(), - "Int field does not have int value set. Field name: %s", fieldInfo.getName()); - addIntField(doc, fieldInfo, fieldData); - break; - case LONG: - Preconditions.checkState(fieldData.isSetLongValue(), - "Long field does not have long value set. Field name: %s", fieldInfo.getName()); - addLongField(doc, fieldInfo, fieldData); - break; - case FLOAT: - Preconditions.checkState(fieldData.isSetFloatValue(), - "Float field does not have float value set. Field name: %s ", fieldInfo.getName()); - addFloatField(); - break; - case DOUBLE: - Preconditions.checkState(fieldData.isSetDoubleValue(), - "Double field does not have double value set. Field name: %s", fieldInfo.getName()); - addDoubleFIeld(); - break; - default: - throw new UnsupportedOperationException("Earlybird does not know how to handle field " - + field.getFieldConfigId() + " " + field); - } - } else { - throw new UnsupportedOperationException("Earlybird does not know how to handle field " - + field.getFieldConfigId() + " " + field); - } - } - - private void addCSFField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) { - if (fieldInfo.getFieldType().getCsfFixedLengthNumValuesPerDoc() > 1) { - - // As an optimization, TBinaryProtocol stores a byte array field as a part of a larger byte - // array field. Must call fieldData.getBytesValue(). fieldData.bytesValue.array() will - // return extraneous data. See: SEARCH-3996 - doc.add(new Field(fieldInfo.getName(), fieldData.getBytesValue(), fieldInfo.getFieldType())); - } else { - doc.add(new CSFField(fieldInfo.getName(), fieldInfo.getFieldType(), fieldData)); - } - } - - private void addTokenField( - Document doc, - Set hfTerms, - Set hfPhrases, - Schema.FieldInfo fieldInfo, - ThriftFieldData fieldData) throws IOException { - TwitterTokenStream twitterTokenStream - = fieldInfo.getFieldType().getTokenStreamSerializer().deserialize( - fieldData.getTokenStreamValue(), fieldData.getStringValue()); - - try { - for (TokenStreamRewriter rewriter : tokenStreamRewriters) { - twitterTokenStream = rewriter.rewrite(fieldInfo, twitterTokenStream); - } - - expandStream(doc, fieldInfo, twitterTokenStream, hfTerms, hfPhrases); - doc.add(new Field(fieldInfo.getName(), twitterTokenStream, fieldInfo.getFieldType())); - } finally { - twitterTokenStream.close(); - } - } - - private void addStringField(Analyzer analyzer, Document doc, Set hfTerms, - Set hfPhrases, Schema.FieldInfo fieldInfo, - ThriftFieldData fieldData) { - doc.add(new Field(fieldInfo.getName(), fieldData.getStringValue(), fieldInfo.getFieldType())); - if (fieldInfo.getFieldType().tokenized()) { - try { - TokenStream tokenStream = analyzer.tokenStream(fieldInfo.getName(), - new StringReader(fieldData.getStringValue())); - try { - expandStream( - doc, - fieldInfo, - tokenStream, - hfTerms, - hfPhrases); - } finally { - tokenStream.close(); - } - } catch (IOException e) { - LOG.error("IOException expanding token stream", e); - } - } else { - addFacetField(doc, fieldInfo, fieldData.getStringValue()); - } - } - - private void addBytesField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) { - doc.add(new Field(fieldInfo.getName(), fieldData.getBytesValue(), fieldInfo.getFieldType())); - } - - private void addIntField(Document doc, Schema.FieldInfo fieldInfo, - ThriftFieldData fieldData) { - int value = fieldData.getIntValue(); - addFacetField(doc, fieldInfo, String.valueOf(value)); - - if (fieldInfo.getFieldType().getNumericFieldSettings() == null) { - // No NumericFieldSettings. Even though the data is numeric, this field is not - // really a numerical field. Just add as a string. - doc.add(new Field(fieldInfo.getName(), String.valueOf(value), fieldInfo.getFieldType())); - } else if (fieldInfo.getFieldType().getNumericFieldSettings().isUseTwitterFormat()) { - addIntTermAttributeField(value, fieldInfo, doc); - } else { - // Use lucene style numerical fields - doc.add(NumericField.newIntField(fieldInfo.getName(), value)); - } - } - - private void addIntTermAttributeField(int value, - Schema.FieldInfo fieldInfo, - Document doc) { - SingleTokenStream singleToken = new SingleTokenStream(); - IntTermAttribute termAtt = singleToken.addAttribute(IntTermAttribute.class); - termAtt.setTerm(value); - doc.add(new Field(fieldInfo.getName(), singleToken, fieldInfo.getFieldType())); - } - - private void addLongField(Document doc, Schema.FieldInfo fieldInfo, - ThriftFieldData fieldData) { - long value = fieldData.getLongValue(); - addFacetField(doc, fieldInfo, String.valueOf(value)); - - if (fieldInfo.getFieldType().getNumericFieldSettings() == null) { - // No NumericFieldSettings. Even though the data is numeric, this field is not - // really a numerical field. Just add as a string. - doc.add(new Field(fieldInfo.getName(), String.valueOf(value), fieldInfo.getFieldType())); - } else if (fieldInfo.getFieldType().getNumericFieldSettings().isUseTwitterFormat()) { - // Twitter style numerical field: use LongTermAttribute - addLongTermAttributeField(value, fieldInfo, doc); - } else { - // Use lucene style numerical fields - doc.add(NumericField.newLongField(fieldInfo.getName(), value)); - } - } - - private void addLongTermAttributeField(long value, - Schema.FieldInfo fieldInfo, - Document doc) { - SingleTokenStream singleToken = new SingleTokenStream(); - boolean useSortableEncoding = - fieldInfo.getFieldType().getNumericFieldSettings().isUseSortableEncoding(); - - if (useSortableEncoding) { - SortableLongTermAttribute termAtt = singleToken.addAttribute(SortableLongTermAttribute.class); - termAtt.setTerm(value); - } else { - LongTermAttribute termAtt = singleToken.addAttribute(LongTermAttribute.class); - termAtt.setTerm(value); - } - doc.add(new Field(fieldInfo.getName(), singleToken, fieldInfo.getFieldType())); - } - - private void addFloatField() { - throw new UnsupportedOperationException("Earlybird does not support float values yet."); - } - - private void addDoubleFIeld() { - throw new UnsupportedOperationException("Earlybird does not support double values yet."); - } - - private void addGeoField(Document doc, Schema.FieldInfo fieldInfo, ThriftFieldData fieldData) { - ThriftGeoCoordinate coord = fieldData.getGeoCoordinate(); - if (GeoUtil.validateGeoCoordinates(coord.getLat(), coord.getLon())) { - GeoUtil.fillGeoFields(doc, fieldInfo.getName(), - coord.getLat(), coord.getLon(), coord.getAccuracy()); - } - } - - private void addFacetField(Document doc, Schema.FieldInfo fieldInfo, String value) { - Preconditions.checkArgument(doc != null); - Preconditions.checkArgument(fieldInfo != null); - Preconditions.checkArgument(value != null); - - if (fieldInfo.getFieldType().getFacetName() != null) { - doc.add(new SortedSetDocValuesFacetField(fieldInfo.getFieldType().getFacetName(), value)); - } - } - - private String getTerm(TermToBytesRefAttribute attr) { - if (attr instanceof CharTermAttribute) { - return ((CharTermAttribute) attr).toString(); - } else if (attr instanceof IntTermAttribute) { - return String.valueOf(((IntTermAttribute) attr).getTerm()); - } else if (attr instanceof LongTermAttribute) { - return String.valueOf(((LongTermAttribute) attr).getTerm()); - } else { - return attr.getBytesRef().utf8ToString(); - } - } - - /** - * Expand the TwitterTokenStream and populate high-frequency terms, phrases and/or facet category paths. - */ - private void expandStream( - Document doc, - Schema.FieldInfo fieldInfo, - TokenStream stream, - Set hfTerms, - Set hfPhrases) throws IOException { - // Checkstyle does not allow assignment to parameters. - Set facetHfTerms = hfTerms; - Set facetHfPhrases = hfPhrases; - - if (!(HighFrequencyTermPairs.INDEX_HF_TERM_PAIRS - && fieldInfo.getFieldType().isIndexHFTermPairs())) { - // high-frequency terms and phrases are not needed - if (fieldInfo.getFieldType().getFacetName() == null) { - // Facets are not needed either, simply return, would do nothing otherwise - return; - } - facetHfTerms = null; - facetHfPhrases = null; - } - - final TermToBytesRefAttribute attr = stream.getAttribute(TermToBytesRefAttribute.class); - stream.reset(); - - String lastHFTerm = null; - while (stream.incrementToken()) { - String term = getTerm(attr); - if (fieldInfo.getFieldType().getFacetName() != null) { - addFacetField(doc, fieldInfo, term); - } - if (HighFrequencyTermPairs.HF_TERM_SET.contains(term)) { - if (facetHfTerms != null) { - facetHfTerms.add(term); - } - if (lastHFTerm != null) { - if (facetHfPhrases != null) { - facetHfPhrases.add(HighFrequencyTermPairs.createPhrasePair(lastHFTerm, term)); - } - } - lastHFTerm = term; - } else { - lastHFTerm = null; - } - } - } - - public static final class CSFField extends Field { - /** - * Create a CSFField with the given fieldType, containing the given field data. - */ - public CSFField(String name, EarlybirdFieldType fieldType, ThriftFieldData data) { - super(name, fieldType); - - if (fieldType.isCsfVariableLength()) { - fieldsData = new BytesRef(data.getBytesValue()); - } else { - switch (fieldType.getCsfType()) { - case BYTE: - fieldsData = Long.valueOf(data.getByteValue()); - break; - case INT: - fieldsData = Long.valueOf(data.getIntValue()); - break; - case LONG: - fieldsData = Long.valueOf(data.getLongValue()); - break; - case FLOAT: - fieldsData = Long.valueOf(Float.floatToRawIntBits((float) data.getFloatValue())); - break; - case DOUBLE: - fieldsData = Long.valueOf(Double.doubleToRawLongBits(data.getDoubleValue())); - break; - default: - throw new IllegalArgumentException("Unknown csf type: " + fieldType.getCsfType()); - } - } - } - } - - public interface TokenStreamRewriter { - /** - * Rewrite the token stream. - */ - TwitterTokenStream rewrite(Schema.FieldInfo fieldInfo, TwitterTokenStream stream); - } -} diff --git a/src/java/com/twitter/search/common/schema/SchemaUtil.docx b/src/java/com/twitter/search/common/schema/SchemaUtil.docx new file mode 100644 index 000000000..410ebf09f Binary files /dev/null and b/src/java/com/twitter/search/common/schema/SchemaUtil.docx differ diff --git a/src/java/com/twitter/search/common/schema/SchemaUtil.java b/src/java/com/twitter/search/common/schema/SchemaUtil.java deleted file mode 100644 index cba903a2b..000000000 --- a/src/java/com/twitter/search/common/schema/SchemaUtil.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.twitter.search.common.schema; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.IndexedNumericFieldSettings; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftNumericType; -import com.twitter.search.common.util.analysis.IntTermAttributeImpl; -import com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl; - -public final class SchemaUtil { - private SchemaUtil() { - } - - /** - * Get the a fixed CSF field's number of values per doc. - * @param schema the Schema for the index - * @param fieldId the field id the CSF field - the field must be of binary integer type and - * in fixed size - * @return the number of values per doc - */ - public static int getCSFFieldFixedLength(ImmutableSchemaInterface schema, int fieldId) { - final Schema.FieldInfo fieldInfo = Preconditions.checkNotNull(schema.getFieldInfo(fieldId)); - return getCSFFieldFixedLength(fieldInfo); - } - - /** - * Get the a fixed CSF field's number of values per doc. - * @param schema the Schema for the index - * @param fieldName the field name of the CSF field - the field must be of binary integer type - * and in fixed size - * @return the number of values per doc - */ - public static int getCSFFieldFixedLength(ImmutableSchemaInterface schema, String fieldName) { - final Schema.FieldInfo fieldInfo = Preconditions.checkNotNull(schema.getFieldInfo(fieldName)); - return getCSFFieldFixedLength(fieldInfo); - } - - /** - * Get the a fixed CSF field's number of values per doc. - * @param fieldInfo the field of the CSF field - the field must be of binary integer type - * and in fixed size - * @return the number of values per doc - */ - public static int getCSFFieldFixedLength(Schema.FieldInfo fieldInfo) { - final EarlybirdFieldType fieldType = fieldInfo.getFieldType(); - Preconditions.checkState(fieldType.docValuesType() == DocValuesType.BINARY - && fieldType.getCsfType() == ThriftCSFType.INT); - return fieldType.getCsfFixedLengthNumValuesPerDoc(); - } - - /** Converts the given value to a BytesRef instance, according to the type of the given field. */ - public static BytesRef toBytesRef(Schema.FieldInfo fieldInfo, String value) { - EarlybirdFieldType fieldType = fieldInfo.getFieldType(); - Preconditions.checkArgument(fieldType.indexOptions() != IndexOptions.NONE); - IndexedNumericFieldSettings numericSetting = fieldType.getNumericFieldSettings(); - if (numericSetting != null) { - if (!numericSetting.isUseTwitterFormat()) { - throw new UnsupportedOperationException( - "Numeric field not using Twitter format: cannot drill down."); - } - - ThriftNumericType numericType = numericSetting.getNumericType(); - switch (numericType) { - case INT: - try { - return IntTermAttributeImpl.copyIntoNewBytesRef(Integer.parseInt(value)); - } catch (NumberFormatException e) { - throw new UnsupportedOperationException( - String.format("Cannot parse value for int field %s: %s", - fieldInfo.getName(), value), - e); - } - case LONG: - try { - return numericSetting.isUseSortableEncoding() - ? SortableLongTermAttributeImpl.copyIntoNewBytesRef(Long.parseLong(value)) - : LongTermAttributeImpl.copyIntoNewBytesRef(Long.parseLong(value)); - } catch (NumberFormatException e) { - throw new UnsupportedOperationException( - String.format("Cannot parse value for long field %s: %s", - fieldInfo.getName(), value), - e); - } - default: - throw new UnsupportedOperationException( - String.format("Unsupported numeric type for field %s: %s", - fieldInfo.getName(), numericType)); - } - } - - return new BytesRef(value); - } -} diff --git a/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.docx b/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.docx new file mode 100644 index 000000000..9d1544dd4 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.docx differ diff --git a/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.java b/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.java deleted file mode 100644 index fd94f0e78..000000000 --- a/src/java/com/twitter/search/common/schema/SearchWhitespaceAnalyzer.java +++ /dev/null @@ -1,27 +0,0 @@ -package com.twitter.search.common.schema; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.core.WhitespaceTokenizer; - -/** - * The majority of the code is copied from Lucene 3.1 analysis.core.WhitespaceAnalyzer. The only - * new code is the getPositionIncrementGap() - */ -public final class SearchWhitespaceAnalyzer extends Analyzer { - @Override - protected TokenStreamComponents createComponents(final String fieldName) { - return new TokenStreamComponents(new WhitespaceTokenizer()); - } - - /** - * Make sure that phrase queries do not match across 2 instances of the text field. - * - * See the Javadoc for Analyzer.getPositionIncrementGap() for a good explanation of how this - * method works. - */ - @Override - public int getPositionIncrementGap(String fieldName) { - // Hard-code "text" here, because we can't depend on EarlybirdFieldConstants. - return "text".equals(fieldName) ? 1 : super.getPositionIncrementGap(fieldName); - } -} diff --git a/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.docx b/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.docx new file mode 100644 index 000000000..16d377885 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.docx differ diff --git a/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.java b/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.java deleted file mode 100644 index 7bec85040..000000000 --- a/src/java/com/twitter/search/common/schema/ThriftDocumentBuilder.java +++ /dev/null @@ -1,228 +0,0 @@ -package com.twitter.search.common.schema; - -import java.io.IOException; -import java.util.List; -import java.util.logging.Level; -import java.util.logging.Logger; - -import javax.annotation.Nullable; - -import com.twitter.common.text.util.PositionIncrementAttributeSerializer; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftFieldData; -import com.twitter.search.common.schema.thriftjava.ThriftGeoCoordinate; -import com.twitter.search.common.util.analysis.CharTermAttributeSerializer; -import com.twitter.search.common.util.analysis.LongTermAttributeSerializer; -import com.twitter.search.common.util.analysis.LongTermsTokenStream; -import com.twitter.search.common.util.analysis.PayloadAttributeSerializer; -import com.twitter.search.common.util.analysis.PayloadWeightedTokenizer; -import com.twitter.search.common.util.spatial.GeoUtil; - -/** - * Builder class for building ThriftDocuments. - */ -public class ThriftDocumentBuilder { - private static final Logger LOG = Logger.getLogger(ThriftDocumentBuilder.class.getName()); - - protected final ThriftDocument doc = new ThriftDocument(); - protected final FieldNameToIdMapping idMapping; - - private static final ThreadLocal PAYLOAD_WEIGHTED_SERIALIZER_PER_THREAD = - new ThreadLocal() { - @Override - protected TokenStreamSerializer initialValue() { - return TokenStreamSerializer.builder() - .add(new CharTermAttributeSerializer()) - .add(new PositionIncrementAttributeSerializer()) - .add(new PayloadAttributeSerializer()) - .build(); - } - }; - - private static final ThreadLocal LONG_TERM_SERIALIZER_PER_THREAD = - new ThreadLocal() { - @Override - protected TokenStreamSerializer initialValue() { - return TokenStreamSerializer.builder() - .add(new LongTermAttributeSerializer()) - .build(); - } - }; - - public ThriftDocumentBuilder(FieldNameToIdMapping idMapping) { - this.idMapping = idMapping; - } - - protected void prepareToBuild() { - // left empty, subclass can override this. - } - - public ThriftDocument build() { - prepareToBuild(); - return doc; - } - - /** - * Add a long field. This is indexed as a - * {@link com.twitter.search.common.util.analysis.LongTermAttribute} - */ - public final ThriftDocumentBuilder withLongField(String fieldName, long value) { - ThriftFieldData fieldData = new ThriftFieldData().setLongValue(value); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add an int field. This is indexed as a - * {@link com.twitter.search.common.util.analysis.IntTermAttribute} - */ - public final ThriftDocumentBuilder withIntField(String fieldName, int value) { - ThriftFieldData fieldData = new ThriftFieldData().setIntValue(value); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add a field whose value is a single byte. - */ - public final ThriftDocumentBuilder withByteField(String fieldName, byte value) { - ThriftFieldData fieldData = new ThriftFieldData().setByteValue(value); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add a field whose value is a byte array. - */ - public final ThriftDocumentBuilder withBytesField(String fieldName, byte[] value) { - ThriftFieldData fieldData = new ThriftFieldData().setBytesValue(value); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add a field whose value is a float. - */ - public final ThriftDocumentBuilder withFloatField(String fieldName, float value) { - ThriftFieldData fieldData = new ThriftFieldData().setFloatValue(value); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Added a field whose value is a Lucene TokenStream. - * The Lucene TokenStream is serialized using Twitter's - * {@link com.twitter.common.text.util.TokenStreamSerializer} - */ - public final ThriftDocumentBuilder withTokenStreamField(String fieldName, - @Nullable String tokenStreamText, - byte[] tokenStream) { - if (tokenStream == null) { - return this; - } - ThriftFieldData fieldData = new ThriftFieldData() - .setStringValue(tokenStreamText).setTokenStreamValue(tokenStream); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add a field whose value is a String. - * @param fieldName Name of the field where the string will be added. - * @param text This string is indexed as is (not analyzed). - */ - public final ThriftDocumentBuilder withStringField(String fieldName, String text) { - if (text == null || text.isEmpty()) { - return this; - } - - ThriftFieldData fieldData = new ThriftFieldData().setStringValue(text); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Add a field whose value is a geo coordinate. - * Earlybird will process the coordinates into geo hashes before indexing. - */ - public final ThriftDocumentBuilder withGeoField(String fieldName, - double lat, double lon, int acc) { - if (!GeoUtil.validateGeoCoordinates(lat, lon)) { - // If the geo coordinates are invalid, don't add any field. - return this; - } - ThriftGeoCoordinate coord = new ThriftGeoCoordinate(); - coord.setLat(lat); - coord.setLon(lon); - coord.setAccuracy(acc); - - ThriftFieldData fieldData = new ThriftFieldData().setGeoCoordinate(coord); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } - - /** - * Added a list of tokens that are weighted. The weights are stored inside payload. - * See {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer} for more details. - */ - public final ThriftDocumentBuilder withPayloadWeightTokenStreamField(String fieldName, - String tokens) { - byte[] serialized; - try { - PayloadWeightedTokenizer tokenizer = new PayloadWeightedTokenizer(tokens); - serialized = PAYLOAD_WEIGHTED_SERIALIZER_PER_THREAD.get().serialize(tokenizer); - tokenizer.close(); - } catch (IOException e) { - LOG.log(Level.WARNING, - "Failed to add PayloadWeightedTokenizer field. Bad token weight list: " + tokens, e); - return this; - } catch (NumberFormatException e) { - LOG.log(Level.WARNING, - "Failed to add PayloadWeightedTokenizer field. Cannot parse token weight: " + tokens, e); - return this; - } - withTokenStreamField(fieldName, tokens, serialized); - return this; - } - - /** - * Add a field whose value is a list of longs. - * Each long is encoded into a LongTermAttribute. - * The field will contain a LongTermTokenStream. - */ - public final ThriftDocumentBuilder withLongIDsField(String fieldName, - List longList) throws IOException { - - if (longList == null || longList.isEmpty()) { - return this; - } - LongTermsTokenStream stream = new LongTermsTokenStream(longList); - stream.reset(); - byte[] serializedStream = LONG_TERM_SERIALIZER_PER_THREAD.get().serialize(stream); - - ThriftFieldData fieldData = new ThriftFieldData().setTokenStreamValue(serializedStream); - ThriftField field = new ThriftField() - .setFieldConfigId(idMapping.getFieldID(fieldName)).setFieldData(fieldData); - doc.addToFields(field); - return this; - } -} diff --git a/src/java/com/twitter/search/common/schema/base/BUILD b/src/java/com/twitter/search/common/schema/base/BUILD deleted file mode 100644 index 8501bb387..000000000 --- a/src/java/com/twitter/search/common/schema/base/BUILD +++ /dev/null @@ -1,25 +0,0 @@ -# Library for Schema.java and other utilities with minimal dependencies. -java_library( - name = "base", - sources = ["*.java"], - platform = "java8", - provides = artifact( - org = "com.twitter.search.common", - name = "schema-base", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/text/util:token-util", - "src/thrift/com/twitter/search/common:features-java", - "src/thrift/com/twitter/search/common:schema-java", - ], -) diff --git a/src/java/com/twitter/search/common/schema/base/BUILD.docx b/src/java/com/twitter/search/common/schema/base/BUILD.docx new file mode 100644 index 000000000..91af7d38e Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.docx b/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.docx new file mode 100644 index 000000000..6a2413d98 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.java b/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.java deleted file mode 100644 index f1e0a501e..000000000 --- a/src/java/com/twitter/search/common/schema/base/EarlybirdFieldType.java +++ /dev/null @@ -1,374 +0,0 @@ -package com.twitter.search.common.schema.base; - -import javax.annotation.Nullable; - -import org.apache.commons.lang.StringUtils; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.IndexOptions; - -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint; - -/** - * An extension of Lucene's {@link FieldType} that contains additional Earlybird-specific settings. - * Lucene IndexingChains can downcast the FieldType object to access these additional settings. - */ -public class EarlybirdFieldType extends FieldType { - - public static final EarlybirdFieldType LONG_CSF_FIELD_TYPE = new EarlybirdFieldType(); - public static final EarlybirdFieldType INT_CSF_FIELD_TYPE = new EarlybirdFieldType(); - public static final EarlybirdFieldType BYTE_CSF_FIELD_TYPE = new EarlybirdFieldType(); - - static { - LONG_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.LONG); - LONG_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC); - LONG_CSF_FIELD_TYPE.setCsfLoadIntoRam(true); - LONG_CSF_FIELD_TYPE.freeze(); - - INT_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.INT); - INT_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC); - INT_CSF_FIELD_TYPE.setCsfLoadIntoRam(true); - INT_CSF_FIELD_TYPE.freeze(); - - BYTE_CSF_FIELD_TYPE.setCsfType(ThriftCSFType.BYTE); - BYTE_CSF_FIELD_TYPE.setDocValuesType(DocValuesType.NUMERIC); - BYTE_CSF_FIELD_TYPE.setCsfLoadIntoRam(true); - BYTE_CSF_FIELD_TYPE.freeze(); - } - - - private boolean storePerPositionPayloads; - private int defaultPayloadLength; - // This is true for fields that become immutable after optimization - private boolean becomesImmutable = true; - private boolean supportOrderedTerms; - private boolean supportTermTextLookup; - private boolean indexHFTermPairs; - - /** - * This flag turns on tweet specific normalizations. - * This turns on the following two token processors: - * {@link com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter} - * {@link com.twitter.search.common.util.text.filter.NormalizedTokenFilter} - * - * HashtagMentionPunctuationSplitter would break a mention or hashtag like @ab_cd or #ab_cd into - * tokens {ab, cd}. - * NormalizedTokenFilter strips out the # @ $ from the tokens. - * - * - * @deprecated we should remove this flag. It is confusing to have Earlybird apply additional - * tokenization on top of what ingester produced. - */ - @Deprecated - private boolean useTweetSpecificNormalization; - - @Nullable - private TokenStreamSerializer.Builder tokenStreamSerializerProvider = null; - - // csf type settings - private ThriftCSFType csfType; - private boolean csfVariableLength; - private int csfFixedLengthNumValuesPerDoc; - private boolean csfFixedLengthUpdateable; - private boolean csfLoadIntoRam; - private boolean csfDefaultValueSet; - private long csfDefaultValue; - // True if this is a CSF field which is a view on top of a different CSF field - private boolean csfViewField; - // If this field is a csf view, this is the ID of the CSF field backing the view - private int csfViewBaseFieldId; - private FeatureConfiguration csfViewFeatureConfiguration; - - // facet field settings - private String facetName; - private boolean storeFacetSkiplist; - private boolean storeFacetOffensiveCounters; - private boolean useCSFForFacetCounting; - - // Determines if this field is indexed - private boolean indexedField = false; - - // search field settings - // whether a field should be searched by default - private boolean textSearchableByDefault = false; - private float textSearchableFieldWeight = 1.0f; - - // For indexed numerical fields - private IndexedNumericFieldSettings numericFieldSettings = null; - - public boolean isStorePerPositionPayloads() { - return storePerPositionPayloads; - } - - public void setStorePerPositionPayloads(boolean storePerPositionPayloads) { - checkIfFrozen(); - this.storePerPositionPayloads = storePerPositionPayloads; - } - - public int getDefaultPayloadLength() { - return defaultPayloadLength; - } - - public void setDefaultPayloadLength(int defaultPayloadLength) { - checkIfFrozen(); - this.defaultPayloadLength = defaultPayloadLength; - } - - public boolean becomesImmutable() { - return becomesImmutable; - } - - public void setBecomesImmutable(boolean becomesImmutable) { - checkIfFrozen(); - this.becomesImmutable = becomesImmutable; - } - - public boolean isSupportOrderedTerms() { - return supportOrderedTerms; - } - - public void setSupportOrderedTerms(boolean supportOrderedTerms) { - checkIfFrozen(); - this.supportOrderedTerms = supportOrderedTerms; - } - - public boolean isSupportTermTextLookup() { - return supportTermTextLookup; - } - - public void setSupportTermTextLookup(boolean supportTermTextLookup) { - this.supportTermTextLookup = supportTermTextLookup; - } - - @Nullable - public TokenStreamSerializer getTokenStreamSerializer() { - return tokenStreamSerializerProvider == null ? null : tokenStreamSerializerProvider.safeBuild(); - } - - public void setTokenStreamSerializerBuilder(TokenStreamSerializer.Builder provider) { - checkIfFrozen(); - this.tokenStreamSerializerProvider = provider; - } - - public ThriftCSFType getCsfType() { - return csfType; - } - - public void setCsfType(ThriftCSFType csfType) { - checkIfFrozen(); - this.csfType = csfType; - } - - public boolean isCsfVariableLength() { - return csfVariableLength; - } - - public int getCsfFixedLengthNumValuesPerDoc() { - return csfFixedLengthNumValuesPerDoc; - } - - public void setCsfVariableLength() { - checkIfFrozen(); - this.csfVariableLength = true; - } - - /** - * Make the field a fixed length CSF, with the given length. - */ - public void setCsfFixedLengthSettings(int csfFixedLengthNumValuesPerDocument, - boolean isCsfFixedLengthUpdateable) { - checkIfFrozen(); - this.csfVariableLength = false; - this.csfFixedLengthNumValuesPerDoc = csfFixedLengthNumValuesPerDocument; - this.csfFixedLengthUpdateable = isCsfFixedLengthUpdateable; - } - - public boolean isCsfFixedLengthUpdateable() { - return csfFixedLengthUpdateable; - } - - public boolean isCsfLoadIntoRam() { - return csfLoadIntoRam; - } - - public void setCsfLoadIntoRam(boolean csfLoadIntoRam) { - checkIfFrozen(); - this.csfLoadIntoRam = csfLoadIntoRam; - } - - public void setCsfDefaultValue(long defaultValue) { - checkIfFrozen(); - this.csfDefaultValue = defaultValue; - this.csfDefaultValueSet = true; - } - - public long getCsfDefaultValue() { - return csfDefaultValue; - } - - public boolean isCsfDefaultValueSet() { - return csfDefaultValueSet; - } - - public String getFacetName() { - return facetName; - } - - public void setFacetName(String facetName) { - checkIfFrozen(); - this.facetName = facetName; - } - - public boolean isStoreFacetSkiplist() { - return storeFacetSkiplist; - } - - public void setStoreFacetSkiplist(boolean storeFacetSkiplist) { - checkIfFrozen(); - this.storeFacetSkiplist = storeFacetSkiplist; - } - - public boolean isStoreFacetOffensiveCounters() { - return storeFacetOffensiveCounters; - } - - public void setStoreFacetOffensiveCounters(boolean storeFacetOffensiveCounters) { - checkIfFrozen(); - this.storeFacetOffensiveCounters = storeFacetOffensiveCounters; - } - - public boolean isUseCSFForFacetCounting() { - return useCSFForFacetCounting; - } - - public void setUseCSFForFacetCounting(boolean useCSFForFacetCounting) { - checkIfFrozen(); - this.useCSFForFacetCounting = useCSFForFacetCounting; - } - - public boolean isFacetField() { - return facetName != null && !StringUtils.isEmpty(facetName); - } - - public boolean isIndexHFTermPairs() { - return indexHFTermPairs; - } - - public void setIndexHFTermPairs(boolean indexHFTermPairs) { - checkIfFrozen(); - this.indexHFTermPairs = indexHFTermPairs; - } - - public boolean acceptPretokenizedField() { - return tokenStreamSerializerProvider != null; - } - - /** - * set this field to use additional twitter specific tokenization. - * @deprecated should avoid doing additional tokenizations on top of what ingester produced. - */ - @Deprecated - public boolean useTweetSpecificNormalization() { - return useTweetSpecificNormalization; - } - - /** - * test whether this field uses additional twitter specific tokenization. - * @deprecated should avoid doing additional tokenizations on top of what ingester produced. - */ - @Deprecated - public void setUseTweetSpecificNormalization(boolean useTweetSpecificNormalization) { - checkIfFrozen(); - this.useTweetSpecificNormalization = useTweetSpecificNormalization; - } - - public boolean isIndexedField() { - return indexedField; - } - - public void setIndexedField(boolean indexedField) { - this.indexedField = indexedField; - } - - public boolean isTextSearchableByDefault() { - return textSearchableByDefault; - } - - public void setTextSearchableByDefault(boolean textSearchableByDefault) { - checkIfFrozen(); - this.textSearchableByDefault = textSearchableByDefault; - } - - public float getTextSearchableFieldWeight() { - return textSearchableFieldWeight; - } - - public void setTextSearchableFieldWeight(float textSearchableFieldWeight) { - checkIfFrozen(); - this.textSearchableFieldWeight = textSearchableFieldWeight; - } - - /** - * Convenience method to find out if this field stores positions. {@link #indexOptions()} can also - * be used to determine the index options for this field. - */ - public final boolean hasPositions() { - return indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS - || indexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS; - } - - public boolean isCsfViewField() { - return csfViewField; - } - - public int getCsfViewBaseFieldId() { - return csfViewBaseFieldId; - } - - public FeatureConfiguration getCsfViewFeatureConfiguration() { - return csfViewFeatureConfiguration; - } - - /** - * Set the CSF view settings. A CSF view is a portion of an another CSF. - */ - public void setCsfViewSettings(String fieldName, - ThriftCSFViewSettings csfViewSettings, - Schema.FieldInfo baseField) { - checkIfFrozen(); - this.csfViewField = true; - this.csfViewBaseFieldId = csfViewSettings.getBaseFieldConfigId(); - FeatureConfiguration.Builder builder = FeatureConfiguration.builder() - .withName(fieldName) - .withType(csfViewSettings.csfType) - .withBitRange(csfViewSettings.getValueIndex(), - csfViewSettings.getBitStartPosition(), - csfViewSettings.getBitLength()) - .withBaseField(baseField.getName()); - if (csfViewSettings.isSetOutputCSFType()) { - builder.withOutputType(csfViewSettings.getOutputCSFType()); - } - if (csfViewSettings.isSetNormalizationType()) { - builder.withFeatureNormalizationType(csfViewSettings.getNormalizationType()); - } - if (csfViewSettings.isSetFeatureUpdateConstraints()) { - for (ThriftFeatureUpdateConstraint c : csfViewSettings.getFeatureUpdateConstraints()) { - builder.withFeatureUpdateConstraint(c); - } - } - - this.csfViewFeatureConfiguration = builder.build(); - } - - public IndexedNumericFieldSettings getNumericFieldSettings() { - return numericFieldSettings; - } - - public void setNumericFieldSettings(IndexedNumericFieldSettings numericFieldSettings) { - checkIfFrozen(); - this.numericFieldSettings = numericFieldSettings; - } -} diff --git a/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.docx b/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.docx new file mode 100644 index 000000000..9ba64d00b Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.java b/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.java deleted file mode 100644 index 74cddf020..000000000 --- a/src/java/com/twitter/search/common/schema/base/FeatureConfiguration.java +++ /dev/null @@ -1,316 +0,0 @@ -package com.twitter.search.common.schema.base; - -import java.util.Set; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Sets; - -import com.twitter.common.base.MorePreconditions; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint; - -// FeatureConfiguration is defined for all the column stride view fields. -public final class FeatureConfiguration { - private final String name; - private final int intIndex; - // Start position in the given int (0-31) - private final int bitStartPos; - // Length in bits of the feature - private final int bitLength; - // precomputed for reuse - private final int bitMask; - private final int inverseBitMask; - private final int maxValue; - - private final ThriftCSFType type; - - // This is the client seen feature type: if this is null, this field is unused. - @Nullable - private final ThriftCSFType outputType; - - private final String baseField; - - private final Set featureUpdateConstraints; - - private final ThriftFeatureNormalizationType featureNormalizationType; - - /** - * Creates a new FeatureConfiguration with a base field. - * - * @param intIndex which integer is the feature in (0 based). - * @param bitStartPos at which bit does the feature start (0-31). - * @param bitLength length in bits of the feature - * @param baseField the CSF this feature is stored within. - */ - private FeatureConfiguration( - String name, - ThriftCSFType type, - ThriftCSFType outputType, - int intIndex, - int bitStartPos, - int bitLength, - String baseField, - Set featureUpdateConstraints, - ThriftFeatureNormalizationType featureNormalizationType) { - Preconditions.checkState(bitStartPos + bitLength <= Integer.SIZE, - "Feature must not cross int boundary."); - this.name = MorePreconditions.checkNotBlank(name); - this.type = Preconditions.checkNotNull(type); - this.outputType = outputType; - this.intIndex = intIndex; - this.bitStartPos = bitStartPos; - this.bitLength = bitLength; - // Technically, int-sized features can use all 32 bits to store a positive value greater than - // Integer.MAX_VALUE. But in practice, we will convert the values of those features to Java ints - // on the read side, so the max value for those features will still be Integer.MAX_VALUE. - this.maxValue = (1 << Math.min(bitLength, Integer.SIZE - 1)) - 1; - this.bitMask = (int) (((1L << bitLength) - 1) << bitStartPos); - this.inverseBitMask = ~bitMask; - this.baseField = baseField; - this.featureUpdateConstraints = featureUpdateConstraints; - this.featureNormalizationType = Preconditions.checkNotNull(featureNormalizationType); - } - - public String getName() { - return name; - } - - public int getMaxValue() { - return maxValue; - } - - @Override - public String toString() { - return new StringBuilder().append(name) - .append(" (").append(intIndex).append(", ") - .append(bitStartPos).append(", ") - .append(bitLength).append(") ").toString(); - } - - public int getValueIndex() { - return intIndex; - } - - public int getBitStartPosition() { - return bitStartPos; - } - - public int getBitLength() { - return bitLength; - } - - public int getBitMask() { - return bitMask; - } - - public int getInverseBitMask() { - return inverseBitMask; - } - - public String getBaseField() { - return baseField; - } - - public ThriftCSFType getType() { - return type; - } - - @Nullable - public ThriftCSFType getOutputType() { - return outputType; - } - - public ThriftFeatureNormalizationType getFeatureNormalizationType() { - return featureNormalizationType; - } - - /** - * Returns the update constraint for the feature. - */ - public Set getUpdateConstraints() { - if (featureUpdateConstraints == null) { - return null; - } - Set constraintSet = Sets.newHashSet(); - for (FeatureConstraint constraint : featureUpdateConstraints) { - constraintSet.add(constraint.getType()); - } - return constraintSet; - } - - /** - * Returns true if the given update satisfies all feature update constraints. - */ - public boolean validateFeatureUpdate(final Number oldValue, final Number newValue) { - if (featureUpdateConstraints != null) { - for (FeatureConstraint contraint : featureUpdateConstraints) { - if (!contraint.apply(oldValue, newValue)) { - return false; - } - } - } - - return true; - } - - @Override - public int hashCode() { - return (name == null ? 0 : name.hashCode()) - + intIndex * 7 - + bitStartPos * 13 - + bitLength * 23 - + bitMask * 31 - + inverseBitMask * 43 - + (int) maxValue * 53 - + (type == null ? 0 : type.hashCode()) * 61 - + (outputType == null ? 0 : outputType.hashCode()) * 71 - + (baseField == null ? 0 : baseField.hashCode()) * 83 - + (featureUpdateConstraints == null ? 0 : featureUpdateConstraints.hashCode()) * 87 - + (featureNormalizationType == null ? 0 : featureNormalizationType.hashCode()) * 97; - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof FeatureConfiguration)) { - return false; - } - - FeatureConfiguration featureConfiguration = FeatureConfiguration.class.cast(obj); - return (name == featureConfiguration.name) - && (bitStartPos == featureConfiguration.bitStartPos) - && (bitLength == featureConfiguration.bitLength) - && (bitMask == featureConfiguration.bitMask) - && (inverseBitMask == featureConfiguration.inverseBitMask) - && (maxValue == featureConfiguration.maxValue) - && (type == featureConfiguration.type) - && (outputType == featureConfiguration.outputType) - && (baseField == featureConfiguration.baseField) - && (featureUpdateConstraints == null - ? featureConfiguration.featureUpdateConstraints == null - : featureUpdateConstraints.equals(featureConfiguration.featureUpdateConstraints)) - && (featureNormalizationType == null - ? featureConfiguration.featureNormalizationType == null - : featureNormalizationType.equals(featureConfiguration.featureNormalizationType)); - } - - private interface FeatureConstraint { - boolean apply(Number oldValue, Number newValue); - ThriftFeatureUpdateConstraint getType(); - } - - public static Builder builder() { - return new Builder(); - } - - public static final class Builder { - private String name; - private ThriftCSFType type; - private ThriftCSFType outputType; - private int intIndex; - // Start position in the given int (0-31) - private int bitStartPos; - // Length in bits of the feature - private int bitLength; - - private String baseField; - - private Set featureUpdateConstraints; - - private ThriftFeatureNormalizationType featureNormalizationType = - ThriftFeatureNormalizationType.NONE; - - public FeatureConfiguration build() { - return new FeatureConfiguration(name, type, outputType, intIndex, bitStartPos, bitLength, - baseField, featureUpdateConstraints, featureNormalizationType); - } - - public Builder withName(String n) { - this.name = n; - return this; - } - - public Builder withType(ThriftCSFType featureType) { - this.type = featureType; - return this; - } - - public Builder withOutputType(ThriftCSFType featureFeatureType) { - this.outputType = featureFeatureType; - return this; - } - - public Builder withFeatureNormalizationType( - ThriftFeatureNormalizationType normalizationType) { - this.featureNormalizationType = Preconditions.checkNotNull(normalizationType); - return this; - } - - /** - * Sets the bit range at the given intIndex, startPos and length. - */ - public Builder withBitRange(int index, int startPos, int length) { - this.intIndex = index; - this.bitStartPos = startPos; - this.bitLength = length; - return this; - } - - public Builder withBaseField(String baseFieldName) { - this.baseField = baseFieldName; - return this; - } - - /** - * Adds a feature update constraint. - */ - public Builder withFeatureUpdateConstraint(final ThriftFeatureUpdateConstraint constraint) { - if (featureUpdateConstraints == null) { - featureUpdateConstraints = Sets.newHashSet(); - } - - switch (constraint) { - case IMMUTABLE: - featureUpdateConstraints.add(new FeatureConstraint() { - @Override public boolean apply(Number oldValue, Number newValue) { - return false; - } - @Override public ThriftFeatureUpdateConstraint getType() { - return ThriftFeatureUpdateConstraint.IMMUTABLE; - } - }); - break; - case INC_ONLY: - featureUpdateConstraints.add(new FeatureConstraint() { - @Override public boolean apply(Number oldValue, Number newValue) { - return newValue.intValue() > oldValue.intValue(); - } - @Override public ThriftFeatureUpdateConstraint getType() { - return ThriftFeatureUpdateConstraint.INC_ONLY; - } - }); - break; - case POSITIVE: - featureUpdateConstraints.add(new FeatureConstraint() { - @Override public boolean apply(Number oldValue, Number newValue) { - return newValue.intValue() >= 0; - } - @Override public ThriftFeatureUpdateConstraint getType() { - return ThriftFeatureUpdateConstraint.POSITIVE; - } - }); - break; - default: - } - - return this; - } - - private Builder() { - - } - } -} - diff --git a/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.docx b/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.docx new file mode 100644 index 000000000..c4ce25c0d Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.java b/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.java deleted file mode 100644 index 4a4db3bab..000000000 --- a/src/java/com/twitter/search/common/schema/base/FieldNameToIdMapping.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.search.common.schema.base; - -import java.util.Map; - -import com.google.common.collect.ImmutableMap; - -/** - * Maps from fieldName to fieldIDs. - */ -public abstract class FieldNameToIdMapping { - /** - * Returns field ID for the given fieldName. - * Can throw unchecked exceptions is the fieldName is not known to Earlybird. - */ - public abstract int getFieldID(String fieldName); - - /** - * Wrap the given map into a fieldNameToIdMapping instance. - */ - public static FieldNameToIdMapping newFieldNameToIdMapping(Map map) { - final ImmutableMap immutableMap = ImmutableMap.copyOf(map); - return new FieldNameToIdMapping() { - @Override public int getFieldID(String fieldName) { - return immutableMap.get(fieldName); - } - }; - } -} diff --git a/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.docx b/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.docx new file mode 100644 index 000000000..e3dd5e171 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.java b/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.java deleted file mode 100644 index ec3842a5e..000000000 --- a/src/java/com/twitter/search/common/schema/base/FieldWeightDefault.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.search.common.schema.base; - -import java.util.LinkedHashMap; -import java.util.Map; - -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import static com.google.common.base.Preconditions.checkNotNull; - -/** - * Records whether a field's enabled for search by default and its default weight. Note that these - * two are decoupled -- a field can have a default weight but not enabled for search by default. - * In a query it can be enabled by an annotation that does not specify a weight (e.g., ":f:foo"), - * which would then use the default weight. - * - * Instances are mutable. - */ -public class FieldWeightDefault { - private final boolean enabled; - private final float weight; - - public FieldWeightDefault(boolean enabled, float weight) { - this.enabled = enabled; - this.weight = weight; - } - - public static FieldWeightDefault fromSignedWeight(float signedValue) { - return new FieldWeightDefault(signedValue >= 0, Math.abs(signedValue)); - } - - /** - * Returns an immutable map from field name to default field weights for only enabled fields. - * Fields that are not enabled for search by default will not be included. - */ - public static ImmutableMap getOnlyEnabled( - Map map) { - - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (Map.Entry entry : map.entrySet()) { - if (entry.getValue().isEnabled()) { - builder.put(entry.getKey(), entry.getValue().getWeight()); - } - } - return builder.build(); - } - - public boolean isEnabled() { - return enabled; - } - - public float getWeight() { - return weight; - } - - /** - * Overlays the base field-weight map with the given one. Since it is an overlay, a - * field that does not exist in the base map will never be added. Also, negative value means - * the field is not enabled for search by default, but if it is, the absolute value would serve as - * the default. - */ - public static ImmutableMap overrideFieldWeightMap( - Map base, - Map fieldWeightMapOverride) { - - checkNotNull(base); - if (fieldWeightMapOverride == null) { - return ImmutableMap.copyOf(base); - } - - LinkedHashMap map = Maps.newLinkedHashMap(base); - for (Map.Entry entry : fieldWeightMapOverride.entrySet()) { - if (base.containsKey(entry.getKey()) - && entry.getValue() >= -Float.MAX_VALUE - && entry.getValue() <= Float.MAX_VALUE) { - - map.put( - entry.getKey(), - FieldWeightDefault.fromSignedWeight(entry.getValue().floatValue())); - } - } - - return ImmutableMap.copyOf(map); - } - - /** - * Creates a field-to-FieldWeightDefault map from the given field-to-weight map, where negative - * weight means the the field is not enabled for search by default, but if it is (e.g., - * by annotation), the absolute value of the weight shall be used. - */ - public static ImmutableMap fromSignedWeightMap( - Map signedWeightMap) { - - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (Map.Entry entry : signedWeightMap.entrySet()) { - // If double to float conversion failed, we will get a float infinity. - // See http://stackoverflow.com/a/10075093/716468 - float floatValue = entry.getValue().floatValue(); - if (floatValue != Float.NEGATIVE_INFINITY - && floatValue != Float.POSITIVE_INFINITY) { - - builder.put( - entry.getKey(), - FieldWeightDefault.fromSignedWeight(floatValue)); - } - } - - return builder.build(); - } -} diff --git a/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.docx b/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.docx new file mode 100644 index 000000000..3e43209ca Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.java b/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.java deleted file mode 100644 index ea04b16e0..000000000 --- a/src/java/com/twitter/search/common/schema/base/ImmutableSchemaInterface.java +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.search.common.schema.base; - -import javax.annotation.concurrent.Immutable; -import javax.annotation.concurrent.ThreadSafe; - -/** - * This interface carries the same signature as Schema with the only difference that this schema - * is immutable. This should be used by short sessions and the class would guarantee the schema - * would not change for the session. A typical usage is like a search query session. - */ -@Immutable -@ThreadSafe -public interface ImmutableSchemaInterface extends Schema { -} diff --git a/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.docx b/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.docx new file mode 100644 index 000000000..78efddd1a Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.java b/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.java deleted file mode 100644 index d436fc189..000000000 --- a/src/java/com/twitter/search/common/schema/base/IndexedNumericFieldSettings.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.search.common.schema.base; - -import com.twitter.search.common.schema.thriftjava.ThriftIndexedNumericFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftNumericType; - -public class IndexedNumericFieldSettings { - private final ThriftNumericType numericType; - private final int numericPrecisionStep; - private final boolean useTwitterFormat; - private final boolean useSortableEncoding; - - /** - * Create a IndexedNumericFieldSettings from a ThriftIndexedNumericFieldSettings - */ - public IndexedNumericFieldSettings(ThriftIndexedNumericFieldSettings numericFieldSettings) { - this.numericType = numericFieldSettings.getNumericType(); - this.numericPrecisionStep = numericFieldSettings.getNumericPrecisionStep(); - this.useTwitterFormat = numericFieldSettings.isUseTwitterFormat(); - this.useSortableEncoding = numericFieldSettings.isUseSortableEncoding(); - } - - public ThriftNumericType getNumericType() { - return numericType; - } - - public int getNumericPrecisionStep() { - return numericPrecisionStep; - } - - public boolean isUseTwitterFormat() { - return useTwitterFormat; - } - - public boolean isUseSortableEncoding() { - return useSortableEncoding; - } -} diff --git a/src/java/com/twitter/search/common/schema/base/Schema.docx b/src/java/com/twitter/search/common/schema/base/Schema.docx new file mode 100644 index 000000000..381375737 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/Schema.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/Schema.java b/src/java/com/twitter/search/common/schema/base/Schema.java deleted file mode 100644 index 51f90bd29..000000000 --- a/src/java/com/twitter/search/common/schema/base/Schema.java +++ /dev/null @@ -1,231 +0,0 @@ -package com.twitter.search.common.schema.base; - -import java.util.Collection; -import java.util.Map; - -import javax.annotation.Nullable; - -import com.google.common.base.Predicate; -import com.google.common.collect.ImmutableCollection; -import com.google.common.collect.ImmutableMap; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.FieldInfos; - -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.schema.thriftjava.ThriftAnalyzer; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration; - -/** - * Search Schema. - */ -public interface Schema { - /** - * Certain Schema implementations can evolve at run time. This call returns a snapshot of - * of the schema which is guaranteed to not change. - */ - ImmutableSchemaInterface getSchemaSnapshot(); - - /** - * Returns a string describing the current schema version. - */ - String getVersionDescription(); - - /** - * Returns whether the schema version is official. Only official segments are uploaded to HDFS. - */ - boolean isVersionOfficial(); - - /** - * Returns the schema's major version. - */ - int getMajorVersionNumber(); - - /** - * Returns the schema's minor version. - */ - int getMinorVersionNumber(); - - /** - * Returns the default analyzer. This analyzer is used when none is specified on the field info. - */ - Analyzer getDefaultAnalyzer(ThriftAnalyzer override); - - /** - * Returns whether the given field is configured in the schema. - */ - boolean hasField(int fieldConfigId); - - /** - * Returns whether the given field is configured in the schema. - */ - boolean hasField(String fieldName); - - /** - * Get the field name corresponding to the given field id. - */ - String getFieldName(int fieldConfigId); - - /** - * Return the FieldInfo of all fields. - */ - ImmutableCollection getFieldInfos(); - - /** - * Get the field info for the given field id. If an override is given, attempt to merge the - * base field info with the override config. - */ - FieldInfo getFieldInfo(int fieldConfigId, ThriftFieldConfiguration override); - - - /** - * Get the field info for the given field id. No override. - */ - @Nullable - FieldInfo getFieldInfo(int fieldConfigId); - - /** - * Get the field info for the given field name. No override. - */ - @Nullable - FieldInfo getFieldInfo(String fieldName); - - /** - * Builds a lucene FieldInfos instance, usually used for indexing. - */ - FieldInfos getLuceneFieldInfos(Predicate acceptedFields); - - /** - * Returns the number of facet fields in this schema. - */ - int getNumFacetFields(); - - /** - * Return facet configurations. - */ - FacetsConfig getFacetsConfig(); - - /** - * Get the facet field's field info by facet name. - */ - FieldInfo getFacetFieldByFacetName(String facetName); - - /** - * Get the facet field's field info by field name. - */ - FieldInfo getFacetFieldByFieldName(String fieldName); - - /** - * Get the field infos for all facet fields. - */ - Collection getFacetFields(); - - /** - * Get the field infos for all facet fields backed by column stride fields. - */ - Collection getCsfFacetFields(); - - /** - * Get the field weight map for text searchable fields. - */ - Map getFieldWeightMap(); - - /** - * Get scoring feature configuration by feature name. - */ - FeatureConfiguration getFeatureConfigurationByName(String featureName); - - /** - * Get scoring feature configuration by feature field id. The feature configuration is - * guaranteed to be not null, or a NullPointerException will be thrown out. - */ - FeatureConfiguration getFeatureConfigurationById(int featureFieldId); - - /** - * Returns the ThriftCSFType for a CSF field. - * Note: for non-CSF field, null will be returned. - */ - @Nullable - ThriftCSFType getCSFFieldType(String fieldName); - - /** - * Get the search result feature schema for all possible features in all search results. - * - * The returned value is not really immutable (because it's a pre-generated thrift struct). - * We want to return it directly because we want to pre-build it once and return with the thrift - * search results as is. - */ - ThriftSearchFeatureSchema getSearchFeatureSchema(); - - /** - * Get the mapping from feature id to feature configuration. - */ - ImmutableMap getFeatureIdToFeatureConfig(); - - /** - * Get the mapping from feature name to feature configuration. - */ - ImmutableMap getFeatureNameToFeatureConfig(); - - /** - * Field configuration for a single field. - */ - final class FieldInfo { - private final int fieldId; - private final String name; - private final EarlybirdFieldType luceneFieldType; - - public FieldInfo(int fieldId, String name, EarlybirdFieldType luceneFieldType) { - this.fieldId = fieldId; - this.name = name; - this.luceneFieldType = luceneFieldType; - } - - public int getFieldId() { - return fieldId; - } - - public String getName() { - return name; - } - - public EarlybirdFieldType getFieldType() { - return luceneFieldType; - } - - public String getDescription() { - return String.format( - "(FieldInfo [fieldId: %d, name: %s, luceneFieldType: %s])", - fieldId, name, luceneFieldType.getFacetName() - ); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof FieldInfo)) { - return false; - } - return fieldId == ((FieldInfo) obj).fieldId; - } - - @Override - public int hashCode() { - return fieldId; - } - } - - /** - * Exception thrown when errors or inconsistences are detected in a search schema. - */ - final class SchemaValidationException extends Exception { - public SchemaValidationException(String msg) { - super(msg); - } - - public SchemaValidationException(String msg, Exception e) { - super(msg, e); - } - } -} diff --git a/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.docx b/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.docx new file mode 100644 index 000000000..a18d6b78c Binary files /dev/null and b/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.docx differ diff --git a/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.java b/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.java deleted file mode 100644 index 03f0e343e..000000000 --- a/src/java/com/twitter/search/common/schema/base/ThriftDocumentUtil.java +++ /dev/null @@ -1,146 +0,0 @@ -package com.twitter.search.common.schema.base; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; - -/** - * Utility APIs for ThriftDocument. - */ -public final class ThriftDocumentUtil { - private ThriftDocumentUtil() { - } - - /** - * Get ThriftField out of a ThriftDocument. - */ - public static ThriftField getField(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - int id = idMap.getFieldID(fieldName); - for (ThriftField field : thriftDoc.getFields()) { - int fieldId = field.getFieldConfigId(); - if (fieldId == id) { - return field; - } - } - - return null; - } - - /** - * Get all fields out of a ThriftDocument that match the given field name. - */ - public static List getFields( - ThriftDocument thriftDoc, String fieldName, FieldNameToIdMapping idMap) { - - int id = idMap.getFieldID(fieldName); - List result = new ArrayList<>(); - - for (ThriftField field : thriftDoc.getFields()) { - int fieldId = field.getFieldConfigId(); - if (fieldId == id) { - result.add(field); - } - } - - return result; - } - - - /** - * Retrieve the long value from a thrift field - */ - public static long getLongValue(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - ThriftField f = getField(thriftDoc, fieldName, idMap); - return f == null ? 0L : f.getFieldData().getLongValue(); - } - - /** - * Retrieve the byte value from a thrift field - */ - public static byte getByteValue(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - ThriftField f = getField(thriftDoc, fieldName, idMap); - return f == null ? (byte) 0 : f.getFieldData().getByteValue(); - } - - /** - * Retrieve the bytes value from a thrift field - */ - public static byte[] getBytesValue(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - ThriftField f = getField(thriftDoc, fieldName, idMap); - return f == null ? null : f.getFieldData().getBytesValue(); - } - - /** - * Retrieve the int value from a thrift field - */ - public static int getIntValue(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - ThriftField f = getField(thriftDoc, fieldName, idMap); - return f == null ? 0 : f.getFieldData().getIntValue(); - } - - /** - * Retrieve the string value from a thrift field - */ - public static String getStringValue(ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - ThriftField f = getField(thriftDoc, fieldName, idMap); - return f == null ? null : f.getFieldData().getStringValue(); - } - - /** - * Retrieve the string values from all thrift fields with the given fieldName. - */ - public static List getStringValues( - ThriftDocument thriftDoc, - String fieldName, - FieldNameToIdMapping idMap) { - List fields = getFields(thriftDoc, fieldName, idMap); - List fieldStrings = new ArrayList<>(); - - for (ThriftField field : fields) { - fieldStrings.add(field.getFieldData().getStringValue()); - } - return fieldStrings; - } - - /** - * Returns whether the specified document has duplicate fields. - */ - public static boolean hasDuplicateFields(ThriftDocument thriftDoc) { - Set seen = new HashSet<>(); - for (ThriftField field : thriftDoc.getFields()) { - if (!seen.add(field.getFieldConfigId())) { - return true; - } - } - return false; - } - - /** - * Get ThriftField out of a ThriftDocument. - */ - public static ThriftField getField(ThriftDocument thriftDoc, int fieldId) { - for (ThriftField field : thriftDoc.getFields()) { - if (field.getFieldConfigId() == fieldId) { - return field; - } - } - - return null; - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/BUILD b/src/java/com/twitter/search/common/schema/earlybird/BUILD deleted file mode 100644 index e7f8ea032..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/BUILD +++ /dev/null @@ -1,93 +0,0 @@ -# Library for earlybird-specific schema. -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/joda-time", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:token-util", - "src/java/com/twitter/common_internal/text", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util:longintconverter", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/text/regex", - "src/java/com/twitter/search/common/util/url", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - ], - exports = [ - "src/thrift/com/twitter/search/common:indexing-java", - ], -) - -java_library( - name = "for-timelines", - sources = [ - "EarlybirdCluster.java", - "EarlybirdFieldConstants.java", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/joda-time", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:token-util", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util:longintconverter", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/text/regex", - "src/java/com/twitter/search/common/util/url", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:schema-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - ], - exports = [ - "src/thrift/com/twitter/search/common:indexing-java", - ], -) diff --git a/src/java/com/twitter/search/common/schema/earlybird/BUILD.docx b/src/java/com/twitter/search/common/schema/earlybird/BUILD.docx new file mode 100644 index 000000000..50ca153ba Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.docx new file mode 100644 index 000000000..42306b35b Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.java deleted file mode 100644 index d956b341d..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdCluster.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import java.util.Set; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableSet; - -/** - * A list of existing Earlybird clusters. - */ -public enum EarlybirdCluster { - /** - * Realtime earlybird cluster. Has 100% of tweet for about 7 days. - */ - REALTIME, - /** - * Protected earlybird cluster. Has only tweets from protected accounts. - */ - PROTECTED, - /** - * Full archive cluster. Has all tweets until about 2 days ago. - */ - FULL_ARCHIVE, - /** - * SuperRoot cluster. Talks to the other clusters instead of talking directly to earlybirds. - */ - SUPERROOT, - - /** - * A dedicated cluster for Candidate Generation use cases based on Earlybird in Home/PushService - */ - REALTIME_CG; - - public String getNameForStats() { - return name().toLowerCase(); - } - - public static boolean isArchive(EarlybirdCluster cluster) { - return isClusterInSet(cluster, ARCHIVE_CLUSTERS); - } - - public static boolean isTwitterMemoryFormatCluster(EarlybirdCluster cluster) { - return isClusterInSet(cluster, TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS); - } - - public static boolean hasEarlybirds(EarlybirdCluster cluster) { - return cluster != SUPERROOT; - } - - private static boolean isClusterInSet(EarlybirdCluster cluster, Set set) { - return set.contains(cluster); - } - - protected static final ImmutableSet ARCHIVE_CLUSTERS = - ImmutableSet.of(FULL_ARCHIVE); - - @VisibleForTesting - public static final ImmutableSet - TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS = - ImmutableSet.of( - REALTIME, - PROTECTED); - - @VisibleForTesting - public static final ImmutableSet TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS = - ImmutableSet.of( - REALTIME, - PROTECTED, - REALTIME_CG); - - /** - * Constant for field used in general purpose clusters, - * Note that GENERAL_PURPOSE_CLUSTERS does not include REALTIME_CG. If you wish to include REALTIME_CG, - * please use ALL_CLUSTERS - */ - protected static final ImmutableSet GENERAL_PURPOSE_CLUSTERS = - ImmutableSet.of( - REALTIME, - PROTECTED, - FULL_ARCHIVE, - SUPERROOT); - - protected static final ImmutableSet ALL_CLUSTERS = - ImmutableSet.of( - REALTIME, - PROTECTED, - FULL_ARCHIVE, - SUPERROOT, - REALTIME_CG); -}