Merge b3909b73d5c814340cedf2c380ea1d680ad313f3 into 72eda9a24f815f6d566818cbf8518138e29d83e9

This commit is contained in:
Joanie Gannon 2023-07-17 21:42:18 -05:00 committed by GitHub
commit 0989664e98
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
45 changed files with 52 additions and 52 deletions

View File

@ -18,7 +18,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
private final PriorityQueue<DistancedItem<T>> queue;
private final boolean minQueue;
/**
* Creates ontainer for items with their distances.
* Creates container for items with their distances.
*
* @param origin Origin (reference) point
* @param initial Initial list of elements to add in the structure
@ -94,7 +94,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
}
/**
* Dequeue all the elements from queueu with ordering mantained
* Dequeue all the elements from queue with ordering maintained
*
* @return remove all the elements in the order of the queue i.e min/max queue.
*/

View File

@ -379,7 +379,7 @@ public class HnswIndex<T, Q> {
* This will reduce the recall.
* <p>
* For a full explanation of locking see this document: http://go/hnsw-locking
* The method returns the closest nearest neighbor (can be used as an enter point)
* The method returns the closest nearest neighbour (can be used as an enter point)
*/
private T mutuallyConnectNewElement(
final T item,
@ -532,7 +532,7 @@ public class HnswIndex<T, Q> {
* @param numOfNeighbours Number of neighbours to search for.
* @param ef This param controls the accuracy of the search.
* Bigger the ef better the accuracy on the expense of latency.
* Keep it atleast number of neighbours to find.
* Keep it at least number of neighbours to find.
* @return Neighbours
*/
public List<DistancedItem<T>> searchKnn(final Q query, final int numOfNeighbours, final int ef) {

View File

@ -105,7 +105,7 @@ object SerializableBruteForceIndex {
}
/**
* This is a class that wrapps a BruteForceIndex and provides a method for serialization.
* This is a class that wraps a BruteForceIndex and provides a method for serialization.
*
* @param bruteForceIndex all queries and updates are sent to this index.
* @param embeddingInjection injection that can convert embeddings to thrift embeddings.

View File

@ -229,7 +229,7 @@ object ANNIndexBuilderBeamJob extends ScioBeamJob[ANNOptions] {
// Generate Index
processedCollection.saveAsCustomOutput(
"Serialise to Disk",
"Serialize to Disk",
OutputSink(
out,
opts.getAlgo.equals("faiss"),

View File

@ -18,7 +18,7 @@ object TypedHnswIndex {
* construction, but better index quality. At some point, increasing
* ef_construction does not improve the quality of the index. One way to
* check if the selection of ef_construction was ok is to measure a recall
* for M nearest neighbor search when ef = ef_constuction: if the recall is
* for M nearest neighbour search when ef = ef_construction: if the recall is
* lower than 0.9, then there is room for improvement.
* @param maxM The number of bi-directional links created for every new element during construction.
* Reasonable range for M is 2-100. Higher M work better on datasets with high
@ -64,7 +64,7 @@ object TypedHnswIndex {
* construction, but better index quality. At some point, increasing
* ef_construction does not improve the quality of the index. One way to
* check if the selection of ef_construction was ok is to measure a recall
* for M nearest neighbor search when ef = ef_constuction: if the recall is
* for M nearest neighbour search when ef = ef_construction: if the recall is
* lower than 0.9, then there is room for improvement.
* @param maxM The number of bi-directional links created for every new element during construction.
* Reasonable range for M is 2-100. Higher M work better on datasets with high

View File

@ -12,7 +12,7 @@ import com.twitter.scalding.UniqueID
import com.twitter.scalding_internal.job.TwitterExecutionApp
/**
* This job reads index embedding data, query embeddings data, and split into index set, query set and true nearest neigbor set
* This job reads index embedding data, query embeddings data, and split into index set, query set and true nearest neighbour set
* from query to index.
*/
object KnnTruthSetGenerator extends TwitterExecutionApp {

View File

@ -95,7 +95,7 @@ General arguments (specified as **--profile.{options}**):
- **num_dimensions** Dimension of embedding in the input data. An exception will be thrown if any entry does not have a number of dimensions equal to this number.
- **metric** Distance metric (InnerProduct/Cosine/L2)
- **concurrency_level** Specifies how many parallel inserts happen to the index. This should probably be set to the number of cores on the machine.
- **algo** The kind of index you want to ouput. The supported options right now are:
- **algo** The kind of index you want to output. The supported options right now are:
1. **hnsw** (Metric supported: Cosine, L2, InnerProduct)

View File

@ -179,7 +179,7 @@ class InMemoryLoadTestQueryRecorder[T](
latencyHistogram.add(queryLatency.inMicroseconds)
counter.incrementAndGet()
// Requests are assumed to have started around the time of the first time record was called
// plus the time it took for that query to hhave completed.
// plus the time it took for that query to have completed.
val (elapsedSinceFirstCall, firstQueryLatency) = elapsedTimeFun.get()
val durationSoFar = elapsedSinceFirstCall() + firstQueryLatency
elapsedTime.set(durationSoFar)

View File

@ -24,7 +24,7 @@ abstract class BaseQueryIndexServer extends ThriftServer with Mtls {
protected val environment: Flag[String] = flag[String]("environment", "service environment")
/**
* Override with method to provide more module to guice.
* Override with method to provide more module to guice.
*/
protected def additionalModules: Seq[Module]

View File

@ -15,7 +15,7 @@ object IndexBuilderUtils {
concurrencyLevel: Int
): Future[Int] = {
val count = new AtomicInteger()
// Async stream allows us to procss at most concurrentLevel futures at a time.
// Async stream allows us to process at most concurrentLevel futures at a time.
Future.Unit.before {
val stream = AsyncStream.fromSeq(embeddings)
val appendStream = stream.mapConcurrent(concurrencyLevel) { annEmbedding =>

View File

@ -57,7 +57,7 @@ class UtegTweetCandidateGenerator @Inject() (
* supported by the any existing Candidate type, so we created TweetWithScoreAndSocialProof
* instead.
*
* However, filters and light ranker expect Candidate-typed param to work. In order to minimise the
* However, filters and light ranker expect Candidate-typed param to work. In order to minimize the
* changes to them, we are doing conversions from/to TweetWithScoreAndSocialProof to/from Candidate
* in this method.
*/

View File

@ -59,7 +59,7 @@ case class SimilarityEngineInfo(
*
* @param sourceInfoOpt - this is optional as many consumerBased CG does not have a source
* @param similarityEngineInfo - the similarity engine used in Candidate Generation (eg., TweetBasedUnifiedSE). It can be an atomic SE or an composite SE
* @param contributingSimilarityEngines - only composite SE will have it (e.g., SANNN, UTG). Otherwise it is an empty Seq. All contributing SEs mst be atomic
* @param contributingSimilarityEngines - only composite SE will have it (e.g., SANN, UTG). Otherwise it is an empty Seq. All contributing SEs must be atomic
*/
case class CandidateGenerationInfo(
sourceInfoOpt: Option[SourceInfo],

View File

@ -45,7 +45,7 @@ object ModelConfig {
val DebuggerDemo: String = "DebuggerDemo"
// ColdStartLookalike - this is not really a model name, it is as a placeholder to
// indicate ColdStartLookalike candidate source, which is currently being pluged into
// indicate ColdStartLookalike candidate source, which is currently being plugged into
// CustomizedRetrievalCandidateGeneration temporarily.
val ColdStartLookalikeModelName: String = "ConsumersBasedUtgColdStartLookalike20220707"

View File

@ -63,7 +63,7 @@ case class WalsStats(scope: String, scopedStats: StatsReceiver) {
}
// StatsMap maintains a mapping from Model's input signature to a stats receiver
// The Wals model suports multiple input signature which can run different graphs internally and
// The Wals model supports multiple input signature which can run different graphs internally and
// can have a different performance profile.
// Invoking StatsReceiver.stat() on each request can create a new stat object and can be expensive
// in performance critical paths.

View File

@ -105,7 +105,7 @@ class RepeatedProfileVisitsSource @Inject() (
val recommendationThreshold = params.getInt(RepeatedProfileVisitsParams.RecommendationThreshold)
val bucketingThreshold = params.getInt(RepeatedProfileVisitsParams.BucketingThreshold)
// Get the list of repeatedly visited profilts. Only keep accounts with >= bucketingThreshold visits.
// Get the list of repeatedly visited profiles. Only keep accounts with >= bucketingThreshold visits.
val repeatedVisitedAccountsStitch: Stitch[Map[Long, Int]] =
getRepeatedVisitedAccounts(params, userId).map(_.filter(kv => kv._2 >= bucketingThreshold))

View File

@ -54,7 +54,7 @@ class WeightedCandidateSourceRanker[Target <: HasParams](
// Note 1: Using map instead mapValue here since mapValue somehow caused infinite loop when used as part of Stream.
val sortAndShuffledCandidates = input.map {
case (source, candidates) =>
// Note 2: toList is required here since candidates is a view, and it will result in infinit loop when used as part of Stream.
// Note 2: toList is required here since candidates is a view, and it will result in infinite loop when used as part of Stream.
// Note 3: there is no real sorting logic here, it assumes the input is already sorted by candidate sources
val sortedCandidates = candidates.toList
source -> shuffleFn(sortedCandidates).iterator

View File

@ -5,7 +5,7 @@ import com.twitter.timelines.configapi.FSParam
object SamplingTransformParams {
case object TopKFixed // indicates how many of the fisrt K who-to-follow recommendations are reserved for the candidates with largest K CandidateUser.score where these candidates are sorted in decreasing order of score
case object TopKFixed // indicates how many of the first K who-to-follow recommendations are reserved for the candidates with largest K CandidateUser.score where these candidates are sorted in decreasing order of score
extends FSBoundedParam[Int](
name = "post_nux_ml_flow_weighted_sampling_top_k_fixed",
default = 0,

View File

@ -29,7 +29,7 @@ import com.twitter.timelines.configapi.HasParams
* - truncating to the top N merged results for ranking
* - ML ranker
* - Interleaving ranker for producer-side experiments
* - impression-based fatigueing
* - impression-based fatiguing
*/
@Singleton
class PostNuxMlCombinedRankerBuilder[

View File

@ -125,7 +125,7 @@ object FrsLogger {
/** The id of the current user. When the user is logged out, this method should return None. */
override val userId: Option[Long] = clientContext.userId
/** The id of the guest, which is present in logged-in or loged-out states */
/** The id of the guest, which is present in logged-in or logged-out states */
override val guestId: Option[Long] = clientContext.guestId
/** The personalization id (pid) of the user, used to personalize Twitter services */

View File

@ -8,7 +8,7 @@ object FlagsModule extends TwitterModule {
)
flag[Boolean](
name = "interests_opt_out_prod_enabled",
help = "Whether to fetch intersts opt out data from the prod strato column or not"
help = "Whether to fetch interests opt out data from the prod strato column or not"
)
flag[Boolean](
name = "log_results",

View File

@ -13,7 +13,7 @@ import com.twitter.timelines.configapi.FSParam
/**
* Include a clear cache timeline instruction when we satisfy these criteria:
* - Request Provenance is "pull to refresh"
* - Atleast N non-ad tweet entries in the response
* - At least N non-ad tweet entries in the response
*
* This is to ensure that we have sufficient new content to justify jumping users to the
* top of the new timelines response and don't add unnecessary load to backend systems

View File

@ -25,7 +25,7 @@ pub fn log_feature_match(
dr_type: String,
) {
// Note the following algorithm matches features from config using linear search.
// Also the record source is MinDataRecord. This includes only binary and continous features for now.
// Also the record source is MinDataRecord. This includes only binary and continuous features for now.
for (feature_id, feature_value) in dr.continuous_features.as_ref().unwrap() {
debug!(
@ -303,7 +303,7 @@ impl BatchPredictionRequestToTorchTensorConverter {
}
// Todo : Refactor, create a generic version with different type and field accessors
// Example paramterize and then instiantiate the following
// Example parametrize and then instantiate the following
// (FLOAT --> FLOAT, DataRecord.continuous_feature)
// (BOOL --> INT64, DataRecord.binary_feature)
// (INT64 --> INT64, DataRecord.discrete_feature)

View File

@ -77,7 +77,7 @@ public final class SingleBytePositiveFloatUtil {
// Table used for converting mantissa into a significant
private static float[] mantissaToFractionTable = {
// Decimal Matisa value
// Decimal Mantissa value
STEP_SIZE * 0, // 0000
STEP_SIZE * 1, // 0001
STEP_SIZE * 1, // 0010

View File

@ -399,7 +399,7 @@ public final class TwitterMessageUtil {
*
* @param text The text to truncate
* @param maxLength The maximum length of the string after truncation
* @param field The field from which this string cames
* @param field The field from which this string came
* @param splitEmojisAtMaxLength If true, don't worry about emojis and just truncate at maxLength,
* potentially splitting them. If false, truncate before the emoji if truncating at maxLength
* would cause the emoji to be split.

View File

@ -13,7 +13,7 @@ public abstract class MutableFeatureNormalizers {
// value (255, if using a byte).
private static final int MAX_COUNTER_VALUE_SUPPORTED = 50000000;
// Avoid using this normalizer for procesing any new data, always use SmartIntegerNormalizer
// Avoid using this normalizer for processing any new data, always use SmartIntegerNormalizer
// below.
public static final SingleBytePositiveFloatNormalizer BYTE_NORMALIZER =
new SingleBytePositiveFloatNormalizer();

View File

@ -4,7 +4,7 @@ import com.twitter.search.common.encoding.features.EncodedFeatures;
/**
* Holds engagement features for a particular tweet and encodes them as a single int.
* The features are: retweet count, favorite count, itweet score, reply count.
* The features are: retweet count, favorite count, tweet score, reply count.
*/
public class TweetEngagementFeatures extends EncodedFeatures {
private static final int RETWEET_COUNT_BIT_SHIFT = 0;

View File

@ -133,7 +133,7 @@ public class TweetParser {
TokenizerResult result,
PenguinVersion penguinVersion) {
if (message.getHashtags().isEmpty()) {
// add hashtags to TwitterMessage if it doens't already have them, from
// add hashtags to TwitterMessage if it doesn't already have them, from
// JSON entities, this happens when we do offline indexing
for (String hashtag : sanitizeTokenizerResults(result.hashtags, '#')) {
message.addHashtag(hashtag);
@ -141,7 +141,7 @@ public class TweetParser {
}
if (message.getMentions().isEmpty()) {
// add mentions to TwitterMessage if it doens't already have them, from
// add mentions to TwitterMessage if it doesn't already have them, from
// JSON entities, this happens when we do offline indexing
for (String mention : sanitizeTokenizerResults(result.mentions, '@')) {
message.addMention(mention);

View File

@ -32,7 +32,7 @@ public class TerminationTracker {
private final int postTerminationOverheadMillis;
// We don't check for early termination often enough. Some times requests timeout in between
// early termination checks. This buffer time is also substracted from deadline.
// early termination checks. This buffer time is also subtracted from deadline.
// To illustrate how this is used, let's use a simple example:
// If we spent 750ms searching 5 segments, a rough estimate is that we need 150ms to search
// one segment. If the timeout is set to 800ms, we should not start searching the next segment.

View File

@ -54,7 +54,7 @@ public final class FacetsResultsUtils {
/**
* Prepare facet fields with empty entries and check if we need termStats for filtering.
* Returns true if termStats filtering is needed (thus the termStats servie call).
* Returns true if termStats filtering is needed (thus the termStats service call).
* @param facetRequest The related facet request.
* @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet
* fields results info.

View File

@ -53,7 +53,7 @@ public abstract class BaseModelBuilder implements ModelBuilder {
* <p>
* Model name (Generated by ML API, but ignored by this class)
* Feature definition:
* Name of the feature or definition from the MDL discretizer.
* Name of the feature or definition from the MDL discretizer.
* Weight:
* Weight of the feature using LOGIT scale.
* <p>

View File

@ -3,7 +3,7 @@ package com.twitter.search.common.util.ml.prediction_engine;
import com.google.common.base.Preconditions;
/**
* The discretized value range for a continous feature. After discretization a continuous feature
* The discretized value range for a continuous feature. After discretization a continuous feature
* may become multiple discretized binary features, each occupying a range. This class stores this
* range and a weight for it.
*/

View File

@ -20,7 +20,7 @@ import com.twitter.search.common.file.AbstractFile;
*
* - Only linear models are supported.
* - Only binary and continuous features (i.e. it doesn't support discrete/categorical features).
* - It supports the MDL discretizer (but not the one based on trees).
* - It supports the MDL discretizer (but not the one based on trees).
* - It doesn't support feature crossings.
*
* Instances of this class should be created using only the load methods (loadFromHdfs and

View File

@ -60,7 +60,7 @@ public class ModelLoader implements Runnable {
* ${counterPrefix}_num_models:
* Number of models currently loaded.
* ${counterPrefix}_num_loads:
* Number of succesful model loads.
* Number of successful model loads.
* ${counterPrefix}_num_errors:
* Number of errors occurred while loading the models.
*/

View File

@ -156,7 +156,7 @@ public class EarlybirdIndexLoader {
FlushInfo segmentsFlushInfo = indexInfo.getSubProperties(EarlybirdIndexFlusher.SEGMENTS);
List<String> segmentNames = Lists.newArrayList(segmentsFlushInfo.getKeyIterator());
// This should only happen if you're running in stagingN and loading a prod index through
// This should only happen if you're running in staging and loading a prod index through
// the read_index_from_prod_location flag. In this case, we point to a directory that has
// a lot more than the number of segments we want in staging and we trim this list to the
// desired number.

View File

@ -94,7 +94,7 @@ public class EarlybirdFeatureSchemaMerger {
* @param searchResults the response
* @param requestContext the request, which should record the client cached feature schemas
* @param statPrefix the stats prefix string
* @param successfulResponses all successfull responses from downstream
* @param successfulResponses all successful responses from downstream
*/
public void collectAndSetFeatureSchemaInResponse(
ThriftSearchResults searchResults,
@ -149,7 +149,7 @@ public class EarlybirdFeatureSchemaMerger {
* (This is done inside superroot)
* @param requestContext the search request context
* @param mergedResponse the merged result inside the superroot
* @param realtimeResponse the realtime tier resposne
* @param realtimeResponse the realtime tier response
* @param protectedResponse the protected tier response
* @param fullArchiveResponse the full archive tier response
* @param statsPrefix

View File

@ -43,7 +43,7 @@ public final class QueryParsingUtils {
*
* @param request the earlybird request to parse.
* @return null if the request does not specify a serialized query.
* @throws QueryParserException if querry parsing fails.
* @throws QueryParserException if query parsing fails.
*/
@Nullable
static Query getParsedQuery(EarlybirdRequest request) throws QueryParserException {

View File

@ -131,7 +131,7 @@ public class EarlybirdTimeRangeFilter extends
// As long as a query overlaps with the tier serving range on either side,
// the request is not filtered. I.e. we want to be conservative when doing this filtering,
// because it is just an optimization. We ignore the inclusiveness / exclusiveness of the
// boundaries. If the tier boundary and the query boundry happen to be the same, we do not
// boundaries. If the tier boundary and the query boundary happen to be the same, we do not
// filter the request.
return queryRanges.getSinceIDExclusive().or(0L)
> servingRange.getServingRangeMaxId()

View File

@ -138,7 +138,7 @@ public abstract class EarlybirdResponseMerger implements EarlyTerminateTierMerge
// thread_running_future_{i-1} and thread_running_future_i is crossed. This guarantees
// that thread_running_future_i will see the updates to mergeHelper before it sees the
// callbacks. (Or thread_running_future_{i-1} == thread_running_future_i, in which case
// synchronization is not an issue, and correctness is guarateed by the order in which
// synchronization is not an issue, and correctness is guaranteed by the order in which
// things will run.)
// 4. The same reasoning applies to currentFutureIndex.

View File

@ -481,8 +481,8 @@ public class RecencyResponseMerger extends EarlybirdResponseMerger {
/**
* Trim results based on search range. The search range [x, y] is determined by:
* x is the maximun of the minimun search IDs;
* y is the minimun of the maximum search IDs.
* x is the maximum of the minimum search IDs;
* y is the minimum of the maximum search IDs.
*
* Ids outside of this range are removed.
* If we do not get enough results after the removal, we add IDs back until we get enough results.

View File

@ -212,7 +212,7 @@ public class StrictRecencyResponseMerger extends RecencyResponseMerger {
// We don't need to worry about the tier bottom when merging partition responses in the full
// archive cluster: if all partitions were exhausted and we didn't trim the results, then
// the early-terminated flag on the merged response will be false. If at least one partition
// is early-terminated, or we trimmed some results, then the ealry-terminated flag on the
// is early-terminated, or we trimmed some results, then the early-terminated flag on the
// merged response will be true, and we should continue getting results from this tier before
// we move to the next one.
return false;

View File

@ -32,7 +32,7 @@ public class FuturePoolModule extends TwitterModule {
/**
* Create a future pool backed by executor service, with bounded thread pool and bounded backing
* queue. ONLY VISIBILE FOR TESTING; don't invoke outside this class.
* queue. ONLY VISIBLE FOR TESTING; don't invoke outside this class.
*/
@VisibleForTesting
public static ExecutorServiceFuturePool createFuturePool(

View File

@ -36,7 +36,7 @@ public class TweetypieModule extends TwitterModule {
ThriftMux.Client thriftMux,
StatsReceiver statsReceiver) throws InterruptedException {
// TweetService is TweetService (tweetypie) with different api
// Since TweetService will be primarly used for interacting with
// Since TweetService will be primarily used for interacting with
// tweetypie's flexible schema (MH), we will increase request
// timeout and retries but share other settings from TweetService.
@SuppressWarnings("unchecked")

View File

@ -44,7 +44,7 @@ public class ClientIdWhitelist extends PeriodicFileLoader {
/**
* Creates clock and executor service needed to create a periodic file loading object
* then returns object that accpets file.
* then returns object that accepts file.
* @param clientWhitelistPath
* @return ClientIdWhitelist
* @throws Exception

View File

@ -5,6 +5,6 @@ There are two types of ingesters:
1. Tweet ingesters
2. UserUpdates ingesters
Tweet ingesters consume raw tweets and extract different fields and features for Earlybird to index. User updates ingester produces user safety information such as whether the user is deactivated, suspended or off-boarded. The user and tweet features produced by ingesters are then used by Earlybird during tweet retieval and ranking.
Tweet ingesters consume raw tweets and extract different fields and features for Earlybird to index. User updates ingester produces user safety information such as whether the user is deactivated, suspended or off-boarded. The user and tweet features produced by ingesters are then used by Earlybird during tweet retrieval and ranking.
Ingesters are made up of a pipeline of stages with each stage performing a different field/feature extraction. The pipeline configuration of the ingesters can be found at science/search/ingester/config

View File

@ -79,7 +79,7 @@ public abstract class TwitterBatchedBaseStage<T, R> extends
protected abstract boolean needsToBeBatched(T element);
/**
* Tranform from type T to U element.
* Transform from type T to U element.
* T and U might be different types so this function will help with the transformation
* if the incoming T element is filtered out and is bypass directly to the next stage
* that takes incoming objects of type U