Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-12-22 10:11:52 +01:00)
Merge b3909b73d5 into 72eda9a24f
This change is contained in commit 0989664e98
@@ -18,7 +18,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
  private final PriorityQueue<DistancedItem<T>> queue;
  private final boolean minQueue;
  /**
- * Creates ontainer for items with their distances.
+ * Creates container for items with their distances.
  *
  * @param origin Origin (reference) point
  * @param initial Initial list of elements to add in the structure
@@ -94,7 +94,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
  }

  /**
- * Dequeue all the elements from queueu with ordering mantained
+ * Dequeue all the elements from queue with ordering maintained
  *
  * @return remove all the elements in the order of the queue i.e min/max queue.
  */
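The javadoc corrected in this hunk describes DistancedItemQueue as a container of items keyed by their distance from an origin point, drainable in min/max order. A minimal, hypothetical Java sketch of that idea follows; the names and fields below are illustrative and are not the repository's implementation.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.PriorityQueue;

// Illustrative only; not the repository's DistancedItemQueue implementation.
final class SimpleDistancedQueue<T> {
  static final class Entry<T> {
    final T item;
    final float distance;
    Entry(T item, float distance) { this.item = item; this.distance = distance; }
  }

  private final PriorityQueue<Entry<T>> queue;

  SimpleDistancedQueue(boolean minQueue) {
    Comparator<Entry<T>> byDistance = Comparator.comparingDouble(e -> e.distance);
    this.queue = new PriorityQueue<>(minQueue ? byDistance : byDistance.reversed());
  }

  void enqueue(T item, float distance) {
    queue.add(new Entry<>(item, distance));
  }

  // Drains every element while preserving the queue's min/max ordering.
  List<Entry<T>> dequeueAll() {
    List<Entry<T>> ordered = new ArrayList<>(queue.size());
    while (!queue.isEmpty()) {
      ordered.add(queue.poll());
    }
    return ordered;
  }
}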
@@ -379,7 +379,7 @@ public class HnswIndex<T, Q> {
  * This will reduce the recall.
  * <p>
  * For a full explanation of locking see this document: http://go/hnsw-locking
- * The method returns the closest nearest neighbor (can be used as an enter point)
+ * The method returns the closest nearest neighbours (can be used as an enter point)
  */
  private T mutuallyConnectNewElement(
  final T item,
@@ -532,7 +532,7 @@ public class HnswIndex<T, Q> {
  * @param numOfNeighbours Number of neighbours to search for.
  * @param ef This param controls the accuracy of the search.
  * Bigger the ef better the accuracy on the expense of latency.
- * Keep it atleast number of neighbours to find.
+ * Keep it at least number of neighbours to find.
  * @return Neighbours
  */
  public List<DistancedItem<T>> searchKnn(final Q query, final int numOfNeighbours, final int ef) {
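The searchKnn signature and javadoc above give the contract: ef trades latency for accuracy and should be kept at least as large as numOfNeighbours. A hedged usage sketch follows; only the searchKnn call shape and the HnswIndex/DistancedItem type names come from the hunks, everything else (the wrapper class and the choice of ef) is assumed for illustration, and imports for the ANN types are omitted because the diff does not show their package.

import java.util.List;

// Hedged usage sketch: HnswIndex and DistancedItem are the types shown in the hunks above.
final class SearchKnnExample {
  static <T, Q> List<DistancedItem<T>> topNeighbours(HnswIndex<T, Q> index, Q query, int k) {
    // Per the javadoc: keep ef at least as large as the number of neighbours requested;
    // a larger ef improves accuracy at the expense of latency.
    int ef = Math.max(k, 50); // 50 is an arbitrary illustrative choice
    return index.searchKnn(query, k, ef);
  }
}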
@@ -105,7 +105,7 @@ object SerializableBruteForceIndex {
  }

  /**
- * This is a class that wrapps a BruteForceIndex and provides a method for serialization.
+ * This is a class that wraps a BruteForceIndex and provides a method for serialization.
  *
  * @param bruteForceIndex all queries and updates are sent to this index.
  * @param embeddingInjection injection that can convert embeddings to thrift embeddings.
@@ -229,7 +229,7 @@ object ANNIndexBuilderBeamJob extends ScioBeamJob[ANNOptions] {

  // Generate Index
  processedCollection.saveAsCustomOutput(
- "Serialise to Disk",
+ "Serialize to Disk",
  OutputSink(
  out,
  opts.getAlgo.equals("faiss"),
@@ -18,7 +18,7 @@ object TypedHnswIndex {
  * construction, but better index quality. At some point, increasing
  * ef_construction does not improve the quality of the index. One way to
  * check if the selection of ef_construction was ok is to measure a recall
- * for M nearest neighbor search when ef = ef_constuction: if the recall is
+ * for M nearest neighbour search when ef = ef_construction: if the recall is
  * lower than 0.9, than there is room for improvement.
  * @param maxM The number of bi-directional links created for every new element during construction.
  * Reasonable range for M is 2-100. Higher M work better on datasets with high
@@ -64,7 +64,7 @@ object TypedHnswIndex {
  * construction, but better index quality. At some point, increasing
  * ef_construction does not improve the quality of the index. One way to
  * check if the selection of ef_construction was ok is to measure a recall
- * for M nearest neighbor search when ef = ef_constuction: if the recall is
+ * for M nearest neighbour search when ef = ef_construction: if the recall is
  * lower than 0.9, than there is room for improvement.
  * @param maxM The number of bi-directional links created for every new element during construction.
  * Reasonable range for M is 2-100. Higher M work better on datasets with high
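The javadoc being fixed here suggests a concrete tuning check for ef_construction: run an M nearest neighbour search with ef = ef_construction and compare it against exact results, treating recall below 0.9 as a sign that ef_construction should be raised. A hedged Java sketch of that recall computation follows; the approximate and exact result lists are assumed inputs, and nothing below is the library's API.

import java.util.HashSet;
import java.util.List;
import java.util.Set;

// Illustrative recall check for the ef_construction guidance quoted above.
final class RecallCheck {
  // approximate: ids returned by the HNSW search with ef = ef_construction
  // exact: ids returned by an exhaustive (brute force) search, both of size M
  static double recallAtM(List<Long> approximate, List<Long> exact) {
    Set<Long> truth = new HashSet<>(exact);
    long hits = approximate.stream().filter(truth::contains).count();
    return (double) hits / exact.size();
  }

  public static void main(String[] args) {
    double recall = recallAtM(List.of(1L, 2L, 3L, 5L), List.of(1L, 2L, 3L, 4L));
    // Per the javadoc, a value below 0.9 means there is room to raise ef_construction.
    System.out.println("recall@M = " + recall); // 0.75 in this toy example
  }
}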
@@ -12,7 +12,7 @@ import com.twitter.scalding.UniqueID
  import com.twitter.scalding_internal.job.TwitterExecutionApp

  /**
- * This job reads index embedding data, query embeddings data, and split into index set, query set and true nearest neigbor set
+ * This job reads index embedding data, query embeddings data, and split into index set, query set and true nearest neighbour set
  * from query to index.
  */
  object KnnTruthSetGenerator extends TwitterExecutionApp {
@@ -95,7 +95,7 @@ General arguments (specified as **--profile.{options}**):
  - **num_dimensions** Dimension of embedding in the input data. An exception will be thrown if any entry does not have a number of dimensions equal to this number.
  - **metric** Distance metric (InnerProduct/Cosine/L2)
  - **concurrency_level** Specifies how many parallel inserts happen to the index. This should probably be set to the number of cores on the machine.
- - **algo** The kind of index you want to ouput. The supported options right now are:
+ - **algo** The kind of index you want to output. The supported options right now are:

  1. **hnsw** (Metric supported: Cosine, L2, InnerProduct)

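Taken together, these options describe a typical index-building invocation. Purely as an illustration, a run might pass --profile.num_dimensions=300 --profile.metric=Cosine --profile.concurrency_level=16 --profile.algo=hnsw; the values here are made up, and only the option names and the --profile. prefix come from this README.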
@@ -179,7 +179,7 @@ class InMemoryLoadTestQueryRecorder[T](
  latencyHistogram.add(queryLatency.inMicroseconds)
  counter.incrementAndGet()
  // Requests are assumed to have started around the time time of the first time record was called
- // plus the time it took for that query to hhave completed.
+ // plus the time it took for that query to have completed.
  val (elapsedSinceFirstCall, firstQueryLatency) = elapsedTimeFun.get()
  val durationSoFar = elapsedSinceFirstCall() + firstQueryLatency
  elapsedTime.set(durationSoFar)
@@ -24,7 +24,7 @@ abstract class BaseQueryIndexServer extends ThriftServer with Mtls {
  protected val environment: Flag[String] = flag[String]("environment", "service environment")

  /**
- * Override with method to provide more module to guice.
+ * Override with method to provide more module to guide.
  */
  protected def additionalModules: Seq[Module]

@@ -15,7 +15,7 @@ object IndexBuilderUtils {
  concurrencyLevel: Int
  ): Future[Int] = {
  val count = new AtomicInteger()
- // Async stream allows us to procss at most concurrentLevel futures at a time.
+ // Async stream allows us to process at most concurrentLevel futures at a time.
  Future.Unit.before {
  val stream = AsyncStream.fromSeq(embeddings)
  val appendStream = stream.mapConcurrent(concurrencyLevel) { annEmbedding =>
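The comment fixed in this hunk explains that AsyncStream.mapConcurrent(concurrencyLevel) bounds how many embedding appends are in flight at once. A rough Java analogue of that bounded-concurrency idea using a fixed-size thread pool follows; the method and the counting are illustrative, and this is not the repository's Scala code.

import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

// Illustrative only: at most concurrencyLevel append tasks run at the same time.
final class BoundedAppendExample {
  static int appendAll(List<float[]> embeddings, int concurrencyLevel) throws InterruptedException {
    AtomicInteger count = new AtomicInteger();
    ExecutorService pool = Executors.newFixedThreadPool(concurrencyLevel);
    for (float[] embedding : embeddings) {
      pool.submit(() -> {
        // The real code appends the embedding to the index here; this sketch only counts appends.
        count.incrementAndGet();
      });
    }
    pool.shutdown();
    pool.awaitTermination(1, TimeUnit.HOURS);
    return count.get();
  }
}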
@@ -57,7 +57,7 @@ class UtegTweetCandidateGenerator @Inject() (
  * supported by the any existing Candidate type, so we created TweetWithScoreAndSocialProof
  * instead.
  *
- * However, filters and light ranker expect Candidate-typed param to work. In order to minimise the
+ * However, filters and light ranker expect Candidate-typed param to work. In order to minimize the
  * changes to them, we are doing conversions from/to TweetWithScoreAndSocialProof to/from Candidate
  * in this method.
  */
@@ -59,7 +59,7 @@ case class SimilarityEngineInfo(
  *
  * @param sourceInfoOpt - this is optional as many consumerBased CG does not have a source
  * @param similarityEngineInfo - the similarity engine used in Candidate Generation (eg., TweetBasedUnifiedSE). It can be an atomic SE or an composite SE
- * @param contributingSimilarityEngines - only composite SE will have it (e.g., SANNN, UTG). Otherwise it is an empty Seq. All contributing SEs mst be atomic
+ * @param contributingSimilarityEngines - only composite SE will have it (e.g., SANN, UTG). Otherwise it is an empty Seq. All contributing SEs mst be atomic
  */
  case class CandidateGenerationInfo(
  sourceInfoOpt: Option[SourceInfo],
@@ -45,7 +45,7 @@ object ModelConfig {
  val DebuggerDemo: String = "DebuggerDemo"

  // ColdStartLookalike - this is not really a model name, it is as a placeholder to
- // indicate ColdStartLookalike candidate source, which is currently being pluged into
+ // indicate ColdStartLookalike candidate source, which is currently being plugged into
  // CustomizedRetrievalCandidateGeneration temporarily.
  val ColdStartLookalikeModelName: String = "ConsumersBasedUtgColdStartLookalike20220707"

@@ -63,7 +63,7 @@ case class WalsStats(scope: String, scopedStats: StatsReceiver) {
  }

  // StatsMap maintains a mapping from Model's input signature to a stats receiver
- // The Wals model suports multiple input signature which can run different graphs internally and
+ // The Wals model supports multiple input signature which can run different graphs internally and
  // can have a different performance profile.
  // Invoking StatsReceiver.stat() on each request can create a new stat object and can be expensive
  // in performance critical paths.
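The comment block above motivates caching one stats object per model input signature instead of calling StatsReceiver.stat() on every request. A hedged Java sketch of that memoization pattern follows; the Stat and stats-holder types below are stand-ins, not the Twitter StatsReceiver API.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.LongAdder;

// Illustrative stand-in types; the real code uses Twitter's StatsReceiver and Stat.
final class PerSignatureStats {
  static final class Stat {
    private final LongAdder total = new LongAdder();
    void add(long value) { total.add(value); }
  }

  private final Map<String, Stat> statsBySignature = new ConcurrentHashMap<>();

  // Create the stat object once per input signature; reuse it on the hot path.
  Stat statFor(String inputSignature) {
    return statsBySignature.computeIfAbsent(inputSignature, ignored -> new Stat());
  }
}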
@@ -105,7 +105,7 @@ class RepeatedProfileVisitsSource @Inject() (
  val recommendationThreshold = params.getInt(RepeatedProfileVisitsParams.RecommendationThreshold)
  val bucketingThreshold = params.getInt(RepeatedProfileVisitsParams.BucketingThreshold)

- // Get the list of repeatedly visited profilts. Only keep accounts with >= bucketingThreshold visits.
+ // Get the list of repeatedly visited profiles. Only keep accounts with >= bucketingThreshold visits.
  val repeatedVisitedAccountsStitch: Stitch[Map[Long, Int]] =
  getRepeatedVisitedAccounts(params, userId).map(_.filter(kv => kv._2 >= bucketingThreshold))

@@ -54,7 +54,7 @@ class WeightedCandidateSourceRanker[Target <: HasParams](
  // Note 1: Using map instead mapValue here since mapValue somehow caused infinite loop when used as part of Stream.
  val sortAndShuffledCandidates = input.map {
  case (source, candidates) =>
- // Note 2: toList is required here since candidates is a view, and it will result in infinit loop when used as part of Stream.
+ // Note 2: toList is required here since candidates is a view, and it will result in infinite loop when used as part of Stream.
  // Note 3: there is no real sorting logic here, it assumes the input is already sorted by candidate sources
  val sortedCandidates = candidates.toList
  source -> shuffleFn(sortedCandidates).iterator
@@ -5,7 +5,7 @@ import com.twitter.timelines.configapi.FSParam

  object SamplingTransformParams {

- case object TopKFixed // indicates how many of the fisrt K who-to-follow recommendations are reserved for the candidates with largest K CandidateUser.score where these candidates are sorted in decreasing order of score
+ case object TopKFixed // indicates how many of the first K who-to-follow recommendations are reserved for the candidates with largest K CandidateUser.score where these candidates are sorted in decreasing order of score
  extends FSBoundedParam[Int](
  name = "post_nux_ml_flow_weighted_sampling_top_k_fixed",
  default = 0,
@@ -29,7 +29,7 @@ import com.twitter.timelines.configapi.HasParams
  * - truncating to the top N merged results for ranking
  * - ML ranker
  * - Interleaving ranker for producer-side experiments
- * - impression-based fatigueing
+ * - impression-based fatiguing
  */
  @Singleton
  class PostNuxMlCombinedRankerBuilder[
@@ -125,7 +125,7 @@ object FrsLogger {
  /** The id of the current user. When the user is logged out, this method should return None. */
  override val userId: Option[Long] = clientContext.userId

- /** The id of the guest, which is present in logged-in or loged-out states */
+ /** The id of the guest, which is present in logged-in or logged-out states */
  override val guestId: Option[Long] = clientContext.guestId

  /** The personalization id (pid) of the user, used to personalize Twitter services */
@@ -8,7 +8,7 @@ object FlagsModule extends TwitterModule {
  )
  flag[Boolean](
  name = "interests_opt_out_prod_enabled",
- help = "Whether to fetch intersts opt out data from the prod strato column or not"
+ help = "Whether to fetch interests opt out data from the prod strato column or not"
  )
  flag[Boolean](
  name = "log_results",
@@ -13,7 +13,7 @@ import com.twitter.timelines.configapi.FSParam
  /**
  * Include a clear cache timeline instruction when we satisfy these criteria:
  * - Request Provenance is "pull to refresh"
- * - Atleast N non-ad tweet entries in the response
+ * - At least N non-ad tweet entries in the response
  *
  * This is to ensure that we have sufficient new content to justify jumping users to the
  * top of the new timelines response and don't add unnecessary load to backend systems
@@ -25,7 +25,7 @@ pub fn log_feature_match(
  dr_type: String,
  ) {
  // Note the following algorithm matches features from config using linear search.
- // Also the record source is MinDataRecord. This includes only binary and continous features for now.
+ // Also the record source is MinDataRecord. This includes only binary and continuous features for now.

  for (feature_id, feature_value) in dr.continuous_features.as_ref().unwrap() {
  debug!(
@@ -303,7 +303,7 @@ impl BatchPredictionRequestToTorchTensorConverter {
  }

  // Todo : Refactor, create a generic version with different type and field accessors
- // Example paramterize and then instiantiate the following
+ // Example parametrize and then instantiate the following
  // (FLOAT --> FLOAT, DataRecord.continuous_feature)
  // (BOOL --> INT64, DataRecord.binary_feature)
  // (INT64 --> INT64, DataRecord.discrete_feature)
@@ -77,7 +77,7 @@ public final class SingleBytePositiveFloatUtil {

  // Table used for converting mantissa into a significant
  private static float[] mantissaToFractionTable = {
- // Decimal Matisa value
+ // Decimal Mantissa value
  STEP_SIZE * 0, // 0000
  STEP_SIZE * 1, // 0001
  STEP_SIZE * 1, // 0010
@@ -399,7 +399,7 @@ public final class TwitterMessageUtil {
  *
  * @param text The text to truncate
  * @param maxLength The maximum length of the string after truncation
- * @param field The field from which this string cames
+ * @param field The field from which this string came
  * @param splitEmojisAtMaxLength If true, don't worry about emojis and just truncate at maxLength,
  * potentially splitting them. If false, truncate before the emoji if truncating at maxLength
  * would cause the emoji to be split.
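The javadoc here documents a truncation helper that either cuts blindly at maxLength or backs off so an emoji is not split. A hedged sketch of the back-off behaviour for the simple surrogate-pair case follows; real emoji can also be multi-code-point ZWJ sequences, which this toy version ignores, and it is not the repository's implementation.

// Illustrative only: back off one char when a cut at maxLength would split a surrogate pair.
final class TruncateExample {
  static String truncate(String text, int maxLength, boolean splitEmojisAtMaxLength) {
    if (text.length() <= maxLength) {
      return text;
    }
    int cut = maxLength;
    if (!splitEmojisAtMaxLength
        && cut > 0
        && Character.isHighSurrogate(text.charAt(cut - 1))
        && Character.isLowSurrogate(text.charAt(cut))) {
      cut--; // truncate before the emoji instead of through it
    }
    return text.substring(0, cut);
  }
}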
@@ -13,7 +13,7 @@ public abstract class MutableFeatureNormalizers {
  // value (255, if using a byte).
  private static final int MAX_COUNTER_VALUE_SUPPORTED = 50000000;

- // Avoid using this normalizer for procesing any new data, always use SmartIntegerNormalizer
+ // Avoid using this normalizer for processing any new data, always use SmartIntegerNormalizer
  // below.
  public static final SingleBytePositiveFloatNormalizer BYTE_NORMALIZER =
  new SingleBytePositiveFloatNormalizer();
@@ -4,7 +4,7 @@ import com.twitter.search.common.encoding.features.EncodedFeatures;

  /**
  * Holds engagement features for a particular tweet and encodes them as a single int.
- * The features are: retweet count, favorite count, itweet score, reply count.
+ * The features are: retweet count, favorite count, tweet score, reply count.
  */
  public class TweetEngagementFeatures extends EncodedFeatures {
  private static final int RETWEET_COUNT_BIT_SHIFT = 0;
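The class touched by this hunk packs several engagement counts into a single int using per-field bit shifts (RETWEET_COUNT_BIT_SHIFT is the first of them). A hedged sketch of that encoding idea follows; the field widths and ordering below are invented for illustration and are not the repository's layout.

// Illustrative only: pack two small counts into one int with shift/mask arithmetic.
final class PackedEngagement {
  private static final int RETWEET_COUNT_BIT_SHIFT = 0;   // bits 0-7 (assumed width)
  private static final int FAVORITE_COUNT_BIT_SHIFT = 8;  // bits 8-15 (assumed width)
  private static final int FIELD_MASK = 0xFF;

  static int encode(int retweetCount, int favoriteCount) {
    return ((retweetCount & FIELD_MASK) << RETWEET_COUNT_BIT_SHIFT)
        | ((favoriteCount & FIELD_MASK) << FAVORITE_COUNT_BIT_SHIFT);
  }

  static int retweetCount(int encoded) {
    return (encoded >>> RETWEET_COUNT_BIT_SHIFT) & FIELD_MASK;
  }

  static int favoriteCount(int encoded) {
    return (encoded >>> FAVORITE_COUNT_BIT_SHIFT) & FIELD_MASK;
  }
}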
@@ -133,7 +133,7 @@ public class TweetParser {
  TokenizerResult result,
  PenguinVersion penguinVersion) {
  if (message.getHashtags().isEmpty()) {
- // add hashtags to TwitterMessage if it doens't already have them, from
+ // add hashtags to TwitterMessage if it doesn't already have them, from
  // JSON entities, this happens when we do offline indexing
  for (String hashtag : sanitizeTokenizerResults(result.hashtags, '#')) {
  message.addHashtag(hashtag);
@@ -141,7 +141,7 @@ public class TweetParser {
  }

  if (message.getMentions().isEmpty()) {
- // add mentions to TwitterMessage if it doens't already have them, from
+ // add mentions to TwitterMessage if it doesn't already have them, from
  // JSON entities, this happens when we do offline indexing
  for (String mention : sanitizeTokenizerResults(result.mentions, '@')) {
  message.addMention(mention);
@@ -32,7 +32,7 @@ public class TerminationTracker {
  private final int postTerminationOverheadMillis;

  // We don't check for early termination often enough. Some times requests timeout in between
- // early termination checks. This buffer time is also substracted from deadline.
+ // early termination checks. This buffer time is also subtracted from deadline.
  // To illustrate how this is used, let's use a simple example:
  // If we spent 750ms searching 5 segments, a rough estimate is that we need 150ms to search
  // one segment. If the timeout is set to 800ms, we should not starting searching the next segment.
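The comment's own example (750 ms spent on 5 segments, roughly 150 ms per segment, against an 800 ms timeout) reduces to a simple budget check before starting the next segment. A hedged sketch of that arithmetic follows; the method and parameter names are illustrative, not the tracker's API.

// Illustrative only: decide whether there is enough budget left to search one more segment.
final class SegmentBudgetExample {
  static boolean canSearchNextSegment(long elapsedMillis, int segmentsSearched, long timeoutMillis) {
    long estimatedPerSegmentMillis = elapsedMillis / Math.max(segmentsSearched, 1);
    return elapsedMillis + estimatedPerSegmentMillis <= timeoutMillis;
  }

  public static void main(String[] args) {
    // 750ms over 5 segments (~150ms each) against an 800ms timeout: do not start segment 6.
    System.out.println(canSearchNextSegment(750, 5, 800)); // false
  }
}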
@@ -54,7 +54,7 @@ public final class FacetsResultsUtils {

  /**
  * Prepare facet fields with empty entries and check if we need termStats for filtering.
- * Returns true if termStats filtering is needed (thus the termStats servie call).
+ * Returns true if termStats filtering is needed (thus the termStats service call).
  * @param facetRequest The related facet request.
  * @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet
  * fiels results info.
@@ -53,7 +53,7 @@ public abstract class BaseModelBuilder implements ModelBuilder {
  * <p>
  * Model name (Generated by ML API, but ignored by this class)
  * Feature definition:
- * Name of the feature or definition from the MDL discretizer.
+ * Name of the feature or definition from the MDL discretizer
  * Weight:
  * Weight of the feature using LOGIT scale.
  * <p>
@@ -3,7 +3,7 @@ package com.twitter.search.common.util.ml.prediction_engine;
  import com.google.common.base.Preconditions;

  /**
- * The discretized value range for a continous feature. After discretization a continuous feature
+ * The discretized value range for a continuous feature. After discretization a continuous feature
  * may become multiple discretized binary features, each occupying a range. This class stores this
  * range and a weight for it.
  */
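The javadoc describes how one continuous feature becomes several binary features after MDL discretization, each owning a value range and a weight. A hedged sketch of that idea follows; the range representation and scoring below are illustrative, not the class being documented.

// Illustrative only: one discretized range of a continuous feature, with its learned weight.
final class DiscretizedRangeExample {
  static final class Range {
    final double min;      // inclusive
    final double max;      // exclusive
    final double weight;   // weight applied when the feature value falls in this range
    Range(double min, double max, double weight) {
      this.min = min;
      this.max = max;
      this.weight = weight;
    }
    boolean contains(double value) {
      return value >= min && value < max;
    }
  }

  // A continuous value activates at most one of the ranges it was discretized into.
  static double scoreContribution(double value, Range[] ranges) {
    for (Range range : ranges) {
      if (range.contains(value)) {
        return range.weight;
      }
    }
    return 0.0;
  }
}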
@@ -20,7 +20,7 @@ import com.twitter.search.common.file.AbstractFile;
  *
  * - Only linear models are supported.
  * - Only binary and continuous features (i.e. it doesn't support discrete/categorical features).
- * - It supports the MDL discretizer (but not the one based on trees).
+ * - It supports the MDL discretiser (but not the one based on trees).
  * - It doesn't support feature crossings.
  *
  * Instances of this class should be created using only the load methods (loadFromHdfs and
@@ -60,7 +60,7 @@ public class ModelLoader implements Runnable {
  * ${counterPrefix}_num_models:
  * Number of models currently loaded.
  * ${counterPrefix}_num_loads:
- * Number of succesful model loads.
+ * Number of successful model loads.
  * ${counterPrefix}_num_errors:
  * Number of errors occurred while loading the models.
  */
@@ -156,7 +156,7 @@ public class EarlybirdIndexLoader {
  FlushInfo segmentsFlushInfo = indexInfo.getSubProperties(EarlybirdIndexFlusher.SEGMENTS);
  List<String> segmentNames = Lists.newArrayList(segmentsFlushInfo.getKeyIterator());

- // This should only happen if you're running in stagingN and loading a prod index through
+ // This should only happen if you're running in staging and loading a prod index through
  // the read_index_from_prod_location flag. In this case, we point to a directory that has
  // a lot more than the number of segments we want in staging and we trim this list to the
  // desired number.
@@ -94,7 +94,7 @@ public class EarlybirdFeatureSchemaMerger {
  * @param searchResults the response
  * @param requestContext the request, which should record the client cached feature schemas
  * @param statPrefix the stats prefix string
- * @param successfulResponses all successfull responses from downstream
+ * @param successfulResponses all successful responses from downstream
  */
  public void collectAndSetFeatureSchemaInResponse(
  ThriftSearchResults searchResults,
@@ -149,7 +149,7 @@ public class EarlybirdFeatureSchemaMerger {
  * (This is done inside superroot)
  * @param requestContext the search request context
  * @param mergedResponse the merged result inside the superroot
- * @param realtimeResponse the realtime tier resposne
+ * @param realtimeResponse the realtime tier response
  * @param protectedResponse the protected tier response
  * @param fullArchiveResponse the full archive tier response
  * @param statsPrefix
@@ -43,7 +43,7 @@ public final class QueryParsingUtils {
  *
  * @param request the earlybird request to parse.
  * @return null if the request does not specify a serialized query.
- * @throws QueryParserException if querry parsing fails.
+ * @throws QueryParserException if query parsing fails.
  */
  @Nullable
  static Query getParsedQuery(EarlybirdRequest request) throws QueryParserException {
@@ -131,7 +131,7 @@ public class EarlybirdTimeRangeFilter extends
  // As long as a query overlaps with the tier serving range on either side,
  // the request is not filtered. I.e. we want to be conservative when doing this filtering,
  // because it is just an optimization. We ignore the inclusiveness / exclusiveness of the
- // boundaries. If the tier boundary and the query boundry happen to be the same, we do not
+ // boundaries. If the tier boundary and the query boundary happen to be the same, we do not
  // filter the request.
  return queryRanges.getSinceIDExclusive().or(0L)
  > servingRange.getServingRangeMaxId()
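The comment plus the returned expression describe a conservative overlap test: a request is filtered only when its ID range lies entirely outside the tier's serving range. A hedged sketch of that predicate follows, using plain longs instead of the Optional-style accessors in the real code; the names are illustrative.

// Illustrative only: filter the request only when the query range cannot overlap the serving range.
final class ServingRangeFilterExample {
  static boolean shouldFilter(long queryMinId, long queryMaxId,
                              long servingRangeMinId, long servingRangeMaxId) {
    boolean entirelyAbove = queryMinId > servingRangeMaxId;
    boolean entirelyBelow = queryMaxId < servingRangeMinId;
    // Any overlap on either side keeps the request; boundary in/exclusiveness is ignored on purpose.
    return entirelyAbove || entirelyBelow;
  }
}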
@@ -138,7 +138,7 @@ public abstract class EarlybirdResponseMerger implements EarlyTerminateTierMerge
  // thread_running_future_{i-1} and thread_running_future_i is crossed. This guarantees
  // that thread_running_future_i will see the updates to mergeHelper before it sees the
  // callbacks. (Or thread_running_future_{i-1} == thread_running_future_i, in which case
- // synchronization is not an issue, and correctness is guarateed by the order in which
+ // synchronization is not an issue, and correctness is guaranteed by the order in which
  // things will run.)
  // 4. The same reasoning applies to currentFutureIndex.

@@ -481,8 +481,8 @@ public class RecencyResponseMerger extends EarlybirdResponseMerger {

  /**
  * Trim results based on search range. The search range [x, y] is determined by:
- * x is the maximun of the minimun search IDs;
- * y is the minimun of the maximum search IDs.
+ * x is the maximum of the minimum search IDs;
+ * y is the minimum of the maximum search IDs.
  *
  * Ids out side of this range are removed.
  * If we do not get enough results after the removal, we add IDs back until we get enough results.
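The javadoc spells the trimming rule out: x is the maximum of the per-partition minimum search IDs, y is the minimum of the per-partition maximum search IDs, and IDs outside [x, y] are removed (with back-fill when too few results remain). A hedged sketch of the range computation and filter follows; this is not the merger's actual code.

import java.util.List;
import java.util.stream.Collectors;

// Illustrative only: compute the common search range [x, y] and drop IDs outside it.
final class TrimRangeExample {
  static List<Long> trimToCommonRange(List<Long> ids, List<Long> perPartitionMinIds,
                                      List<Long> perPartitionMaxIds) {
    long x = perPartitionMinIds.stream().mapToLong(Long::longValue).max().orElse(Long.MIN_VALUE);
    long y = perPartitionMaxIds.stream().mapToLong(Long::longValue).min().orElse(Long.MAX_VALUE);
    // IDs outside [x, y] are removed; the real merger adds IDs back if too few results remain.
    return ids.stream().filter(id -> id >= x && id <= y).collect(Collectors.toList());
  }
}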
@@ -212,7 +212,7 @@ public class StrictRecencyResponseMerger extends RecencyResponseMerger {
  // We don't need to worry about the tier bottom when merging partition responses in the full
  // archive cluster: if all partitions were exhausted and we didn't trim the results, then
  // the early-terminated flag on the merged response will be false. If at least one partition
- // is early-terminated, or we trimmed some results, then the ealry-terminated flag on the
+ // is early-terminated, or we trimmed some results, then the early-terminated flag on the
  // merged response will be true, and we should continue getting results from this tier before
  // we move to the next one.
  return false;
@@ -32,7 +32,7 @@ public class FuturePoolModule extends TwitterModule {

  /**
  * Create a future pool backed by executor service, with bounded thread pool and bounded backing
- * queue. ONLY VISIBILE FOR TESTING; don't invoke outside this class.
+ * queue. ONLY VISIBLE FOR TESTING; don't invoke outside this class.
  */
  @VisibleForTesting
  public static ExecutorServiceFuturePool createFuturePool(
@@ -36,7 +36,7 @@ public class TweetypieModule extends TwitterModule {
  ThriftMux.Client thriftMux,
  StatsReceiver statsReceiver) throws InterruptedException {
  // TweetService is TweetService (tweetypie) with different api
- // Since TweetService will be primarly used for interacting with
+ // Since TweetService will be primarily used for interacting with
  // tweetypie's flexible schema (MH), we will increase request
  // timeout and retries but share other settings from TweetService.
  @SuppressWarnings("unchecked")
@@ -44,7 +44,7 @@ public class ClientIdWhitelist extends PeriodicFileLoader {

  /**
  * Creates clock and executor service needed to create a periodic file loading object
- * then returns object that accpets file.
+ * then returns object that accepts file.
  * @param clientWhitelistPath
  * @return ClientIdWhitelist
  * @throws Exception
@@ -5,6 +5,6 @@ There are two types of ingesters:
  1. Tweet ingesters
  2. UserUpdates ingesters

- Tweet ingesters consume raw tweets and extract different fields and features for Earlybird to index. User updates ingester produces user safety information such as whether the user is deactivated, suspended or off-boarded. The user and tweet features produced by ingesters are then used by Earlybird during tweet retieval and ranking.
+ Tweet ingesters consume raw tweets and extract different fields and features for Earlybird to index. User updates ingester produces user safety information such as whether the user is deactivated, suspended or off-boarded. The user and tweet features produced by ingesters are then used by Earlybird during tweet retrieval and ranking.

  Ingesters are made up of a pipeline of stages with each stage performing a different field/feature extraction. The pipeline configuration of the ingesters can be found at science/search/ingester/config
@@ -79,7 +79,7 @@ public abstract class TwitterBatchedBaseStage<T, R> extends
  protected abstract boolean needsToBeBatched(T element);

  /**
- * Tranform from type T to U element.
+ * Transform from type T to U element.
  * T and U might be different types so this function will help with the transformation
  * if the incoming T element is filtered out and is bypass directly to the next stage
  * that takes incoming objects of type U