Mirror of https://github.com/twitter/the-algorithm.git, synced 2024-06-01 08:48:46 +02:00

Merge 477e4b7756 into fb54d8b549

This commit is contained in: commit 274451cb61
@@ -78,7 +78,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
   /**
    * Return root of the queue
    *
-   * @return root of the queue i.e min/max element depending upon min-max queue
+   * @return root of the queue i.e. min/max element depending upon min-max queue
    */
   public DistancedItem<T> peek() {
     return queue.peek();

@@ -87,7 +87,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
   /**
    * Dequeue root of the queue.
    *
-   * @return remove and return root of the queue i.e min/max element depending upon min-max queue
+   * @return remove and return root of the queue i.e. min/max element depending upon min-max queue
    */
   public DistancedItem<T> dequeue() {
     return queue.poll();

@@ -96,7 +96,7 @@ public class DistancedItemQueue<U, T> implements Iterable<DistancedItem<T>> {
   /**
    * Dequeue all the elements from queueu with ordering mantained
    *
-   * @return remove all the elements in the order of the queue i.e min/max queue.
+   * @return remove all the elements in the order of the queue i.e. min/max queue.
   */
   public List<DistancedItem<T>> dequeueAll() {
     final List<DistancedItem<T>> list = new ArrayList<>(queue.size());

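The three accessors touched above share one contract: `peek` returns the queue root without removing it, `dequeue` removes and returns it, and `dequeueAll` drains the queue in root-first order; in each case the root is the min or max element depending on how the queue was configured. A minimal usage sketch, assuming a min-queue built elsewhere (the construction and element types are illustrative, since the diff shows only the accessors):

```scala
// Hypothetical usage sketch; DistancedItemQueue construction (min vs. max
// mode, origin, distance function) is not shown in this diff.
val queue: DistancedItemQueue[Array[Float], Long] = ??? // assume: built as a min-queue

val root: DistancedItem[Long] = queue.peek()       // min element; stays in the queue
val removed: DistancedItem[Long] = queue.dequeue() // min element; removed from the queue
val drained: java.util.List[DistancedItem[Long]] = queue.dequeueAll() // min-first order
```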
@@ -14,7 +14,7 @@ import com.twitter.search.common.file.{AbstractFile, LocalFile}
 import java.util.TimeZone
 
 /**
- * Generates the nearest neighbour for users and store them in Manhattan format i.e sequence files.
+ * Generates the nearest neighbour for users and store them in Manhattan format i.e. sequence files.
  * See README for oscar usage.
  */
 object KnnOfflineJob extends TwitterExecutionApp {

@@ -91,7 +91,7 @@ General arguments (specified as **--profile.{options}**):
 
 3. **input.{feature_store_args}** For feature store related args like `feature_store_embedding`, `feature_store_major_version`, `date_range`:
 
-- **output_dir** Where to save the produced serialized ann index. Save to HDFS by specifying the full URI. e.g `hdfs://hadoop-dw2-nn.smf1.twitter.com/user/<user>/index_file` or using the default cluster `hdfs:///user/<user>/index_file`.
+- **output_dir** Where to save the produced serialized ann index. Save to HDFS by specifying the full URI. e.g. `hdfs://hadoop-dw2-nn.smf1.twitter.com/user/<user>/index_file` or using the default cluster `hdfs:///user/<user>/index_file`.
 - **num_dimensions** Dimension of embedding in the input data. An exception will be thrown if any entry does not have a number of dimensions equal to this number.
 - **metric** Distance metric (InnerProduct/Cosine/L2)
 - **concurrency_level** Specifies how many parallel inserts happen to the index. This should probably be set to the number of cores on the machine.

@@ -35,7 +35,7 @@ Load test results will be printed to stdout of an aurora job.
 # Loadtest ANN query service with query set
 
 An ANN query service can be load-tested with sample queries drawn from the embeddings dataset.
-For creating sample queries i.e `query_set` refer this [section](#query-set-generator).
+For creating sample queries i.e. `query_set` refer this [section](#query-set-generator).
 
 Test is run with `live` version of loadtest binary that is already available in packer.
 Example script to load test a ANN query service:

@@ -58,7 +58,7 @@ case class SimilarityEngineInfo(
  * to contribute to its final candidate list. We track these Contributing SEs in the contributingSimilarityEngines list
  *
  * @param sourceInfoOpt - this is optional as many consumerBased CG does not have a source
- * @param similarityEngineInfo - the similarity engine used in Candidate Generation (eg., TweetBasedUnifiedSE). It can be an atomic SE or an composite SE
+ * @param similarityEngineInfo - the similarity engine used in Candidate Generation (e.g., TweetBasedUnifiedSE). It can be an atomic SE or an composite SE
  * @param contributingSimilarityEngines - only composite SE will have it (e.g., SANNN, UTG). Otherwise it is an empty Seq. All contributing SEs mst be atomic
  */
 case class CandidateGenerationInfo(

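The doc comment above distinguishes atomic from composite similarity engines. A hedged sketch of how the three documented parameters relate (values are placeholders; the real case-class fields carry more structure than shown here):

```scala
// Placeholder values throughout: the diff documents only the parameter names
// and their atomic/composite semantics, not the full types.
val atomicCG = CandidateGenerationInfo(
  sourceInfoOpt = None,                     // many consumer-based CGs have no source
  similarityEngineInfo = ???,               // an atomic SE
  contributingSimilarityEngines = Seq.empty // always empty for atomic engines
)

val compositeCG = CandidateGenerationInfo(
  sourceInfoOpt = Some(???),                // e.g. a tweet-based source
  similarityEngineInfo = ???,               // a composite SE, e.g. TweetBasedUnifiedSE
  contributingSimilarityEngines = Seq(???, ???) // its atomic contributors, e.g. SANN, UTG
)
```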
@@ -352,7 +352,7 @@ object ProducerBasedUnifiedSimilarityEngine {
   /***
    * Every candidate will have the CG Info with ProducerBasedUnifiedSimilarityEngine
    * as they are generated by a composite of Similarity Engines.
-   * Additionally, we store the contributing SEs (eg., SANN, UTG).
+   * Additionally, we store the contributing SEs (e.g., SANN, UTG).
    */
   private def getProducerBasedUnifiedCGInfo(
     sourceInfoOpt: Option[SourceInfo],

@@ -482,7 +482,7 @@ case class TweetBasedUnifiedSimilarityEngine(
   /***
    * Every candidate will have the CG Info with TweetBasedUnifiedSimilarityEngine
    * as they are generated by a composite of Similarity Engines.
-   * Additionally, we store the contributing SEs (eg., SANN, UTG).
+   * Additionally, we store the contributing SEs (e.g., SANN, UTG).
    */
   private def getTweetBasedUnifiedCGInfo(
     sourceInfoOpt: Option[SourceInfo],

@@ -9,7 +9,7 @@ import com.twitter.util.Future
 
 /***
  * A SourceGraphFetcher is a trait that extends from `SourceFetcher`
- * and is specialized in tackling User Graph (eg., RealGraphOon, FRS) fetch.
+ * and is specialized in tackling User Graph (e.g., RealGraphOon, FRS) fetch.
  *
  * The [[ResultType]] of a SourceGraphFetcher is a `GraphSourceInfo` which contains a userSeedSet.
  * When we pass in userId, the underlying store returns one GraphSourceInfo.

@@ -8,7 +8,7 @@ import com.twitter.util.Future
 
 /***
  * A SourceSignalFetcher is a trait that extends from `SourceFetcher`
- * and is specialized in tackling Signals (eg., USS, FRS) fetch.
+ * and is specialized in tackling Signals (e.g., USS, FRS) fetch.
  * Currently, we define Signals as (but not limited to) a set of past engagements that
  * the user makes, such as RecentFav, RecentFollow, etc.
  *

@@ -121,7 +121,7 @@ object MetricTagUtil {
   /**
    * Special use case: used by Notifications team to generate the UserInterestedIn CRT push copy.
    *
-   * if we have different types of InterestedIn (eg. UserInterestedIn, NextInterestedIn),
+   * if we have different types of InterestedIn (e.g. UserInterestedIn, NextInterestedIn),
    * this if statement will have to be refactored to contain the real UserInterestedIn.
    * @return
    */

@@ -7,7 +7,7 @@ import com.twitter.product_mixer.core.model.common.identifier.CandidateSourceIde
 
 /**
  * Used to keep track of a candidate's source not so much as a feature but for filtering candidate
- * from specific sources (eg. GizmoduckPredicate)
+ * from specific sources (e.g. GizmoduckPredicate)
  */
 trait HasUserCandidateSourceDetails { candidateUser: CandidateUser =>
   def userCandidateSourceDetails: Option[UserCandidateSourceDetails]

@@ -52,7 +52,7 @@ class PromotedAccountsBlender @Inject() (statsReceiver: StatsReceiver)
  * merge a list of positioned users, aka. promoted, and a list of organic
  * users. The positioned promoted users are pre-sorted with regards to their
  * position ascendingly. Only requirement about position is to be within the
- * range, i.e, can not exceed the combined length if merge is successful, ok
+ * range, i.e., can not exceed the combined length if merge is successful, ok
  * to be at the last position, but not beyond.
  * For more detailed description of location position:
  * http://confluence.local.twitter.com/display/ADS/Promoted+Tweets+in+Timeline+Design+Document

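The contract in this comment — promoted users carry pre-sorted ascending positions, and a position is valid up to and including the combined length — can be sketched as a small pure function (illustrative only, not the production blender):

```scala
// Illustrative sketch of the merge contract described in the comment above,
// not the production PromotedAccountsBlender.
def mergePositioned[A](organic: List[A], positioned: Seq[(Int, A)]): Option[List[A]] =
  positioned.foldLeft(Option(organic)) {
    case (Some(acc), (pos, promoted)) if pos >= 0 && pos <= acc.length =>
      // ok to be at the last position (pos == acc.length), but not beyond
      Some(acc.take(pos) ::: promoted :: acc.drop(pos))
    case _ =>
      None // a position beyond the combined length makes the merge fail
  }
```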
@@ -78,7 +78,7 @@ class PostNuxMlCombinedRankerBuilder[
       randomSeed = request.getRandomizationSeed
     ).observe(weightedRankerStats)
 
-    // ranker that takes the first n results (ie truncates output) while merging duplicates
+    // ranker that takes the first n results (i.e. truncates output) while merging duplicates
     val firstNRankerObs = firstNRanker.observe(firstNRankerStats)
     // either ML ranker that uses deepbirdv2 to score or no ranking
     val mainRanker: Ranker[T, CandidateUser] =

@@ -324,7 +324,7 @@ object EarlybirdResponseUtil {
     }
   }
 
-  // Omitting inNetwork features e.g source tweet features and follow graph.
+  // Omitting inNetwork features e.g. source tweet features and follow graph.
   // Can be expanded to include InNetwork in the future.
   def applyOONUserDependentFeatures(
     searcherUserId: Long,

@@ -198,7 +198,7 @@ object TweetMediaFeaturesExtractor {
       }
     }.flatten
 
-  // 3rd party media providers. eg. giphy for gifs
+  // 3rd party media providers. e.g. giphy for gifs
   private def getMediaOriginProviders(mediaEntities: Seq[tp.MediaEntity]): Seq[String] =
     for {
       mediaEntity <- mediaEntities

@@ -61,7 +61,7 @@ message CardinalityOptions {
   enum ComputeLevel {
     CARDINALITY_COMPUTE_UNSPECIFIED = 0;
     // Cardinality will only be computed if it can be determined in a cheap
-    // manner (ie. without reading from file sources). If the cardinality would
+    // manner (i.e. without reading from file sources). If the cardinality would
     // be nontrivial to compute, Cardinality() will return UNKNOWN_CARDINALITY.
     CARDINALITY_COMPUTE_LOW = 1;
     // Moderate effort will be made to determine cardinality, such as reading

@@ -11,7 +11,7 @@ option go_package = "github.com/tensorflow/tensorflow/tensorflow/go/core/framewo
 // Indicates when a distributed variable will be synced.
 enum VariableSynchronization {
   // `AUTO`: Indicates that the synchronization will be determined by the
-  // current `DistributionStrategy` (eg. With `MirroredStrategy` this would be
+  // current `DistributionStrategy` (e.g. With `MirroredStrategy` this would be
   // `ON_WRITE`).
   VARIABLE_SYNCHRONIZATION_AUTO = 0;
   // `NONE`: Indicates that there will only be one copy of the variable, so

@@ -21,7 +21,7 @@ enum VariableSynchronization {
   // every time it is written.
   VARIABLE_SYNCHRONIZATION_ON_WRITE = 2;
   // `ON_READ`: Indicates that the variable will be aggregated across devices
-  // when it is read (eg. when checkpointing or when evaluating an op that uses
+  // when it is read (e.g. when checkpointing or when evaluating an op that uses
   // the variable).
   VARIABLE_SYNCHRONIZATION_ON_READ = 3;
 }

@@ -383,7 +383,7 @@ message RPCOptions {
 
   // Setting num_channels_per_target > 0 allows uses of multiple channels to
   // communicate to the same target. This can be used to improve the aggregate
-  // throughput on high speed links (e.g 100G) where single connection is not
+  // throughput on high speed links (e.g. 100G) where single connection is not
   // sufficient to maximize link utilization. Note that a single RPC only goes
   // on a single channel, this only helps in situations where there are multiple
   // transfers to the same target overlapping in time.

@@ -8,7 +8,7 @@ import com.twitter.product_mixer.core.model.common.UniversalNoun
  * user clicks on a CommerceProduct, they will be taken to the specific product page.
  *
  * @note Both CommerceProduct and CommerceProductGroups (below) can be shown in the same
- *       TimelineModule (i.e Carousel)
+ *       TimelineModule (i.e. Carousel)
  *
  * @note Any additional fields should be added as a [[com.twitter.product_mixer.core.feature.Feature]]
  *       on the candidate's [[com.twitter.product_mixer.core.feature.featuremap.FeatureMap]]. If the

@@ -90,7 +90,7 @@ object CommerceProductCandidate {
  * versions of the top level product.
  *
  * @note Both CommerceProduct (above) and CommerceProductGroups can be shown in the same
- *       TimelineModule (i.e Carousel)
+ *       TimelineModule (i.e. Carousel)
  *
  * @note Any additional fields should be added as a [[com.twitter.product_mixer.core.feature.Feature]]
  *       on the candidate's [[com.twitter.product_mixer.core.feature.featuremap.FeatureMap]]. If the

@@ -139,7 +139,7 @@ object NewPipelineArrowBuilder {
   }
 
   /**
-   * This is a pipeline specific instance of a step, i.e, a generic step with the step identifier
+   * This is a pipeline specific instance of a step, i.e., a generic step with the step identifier
    * within the pipeline and its executor configs.
    * @param stepIdentifier Step identifier of the step within a pipeline
    * @param executorConfig Config to execute the step with

@@ -47,7 +47,7 @@ case class SelectorStep[
     config: Seq[Selector[Query]]
   ): State = input.updateCandidatesWithDetails(executorResult.selectedCandidates)
 
-  // Selection is a bit different to other steps (i.e, other steps, empty means don't change anything)
+  // Selection is a bit different to other steps (i.e., other steps, empty means don't change anything)
   // where an empty selection list drops all candidates.
   override def isEmpty(config: Seq[Selector[Query]]): Boolean = false
 }

@@ -19,7 +19,7 @@ public final class CSFTypeUtil {
    * Convert bytes into a long value. Inverse function of convertToBytes.
    */
   public static int convertFromBytes(byte[] data, int startOffset, int valueIndex) {
-    // This should rarely happen, eg. when we get a corrupt ThriftIndexingEvent, we insert a new
+    // This should rarely happen, e.g. when we get a corrupt ThriftIndexingEvent, we insert a new
     // Document which is blank. Such a document results in a length 0 BytesRef.
     if (data.length == 0) {
       return 0;

@@ -21,7 +21,7 @@ import org.apache.lucene.search.Weight;
  * and the filter is used only to do post-filtering. In other words, the filter is never used to
  * find the next doc ID: it's only used to filter out the doc IDs returned by the query's
  * DocIdSetIterator. This is useful when we need to have a conjunction between a query that can
- * quickly iterate through doc IDs (eg. a posting list), and an expensive filter (eg. a filter based
+ * quickly iterate through doc IDs (e.g. a posting list), and an expensive filter (e.g. a filter based
  * on the values stored in a CSF).
  *
  * For example, let say we want to build a query that returns all docs that have at least 100 faves.

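The comment describes a deliberately asymmetric conjunction: the cheap query drives iteration and the expensive filter only vets candidates, never advancing the iteration itself. Conceptually (a sketch of the idea, not the actual Lucene Query/Weight plumbing):

```scala
// Conceptual sketch of the post-filtering described above: the driving query
// proposes doc IDs in order; the expensive filter is consulted per candidate
// and is never used to find the next doc ID.
def postFilter(queryDocIds: Iterator[Int], expensiveFilter: Int => Boolean): Iterator[Int] =
  queryDocIds.filter(expensiveFilter) // e.g. a per-doc CSF lookup like "faves >= 100"
```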
@@ -97,7 +97,7 @@ public final class QueryCommonFieldHitsVisitor extends SearchQueryVisitor<Set<St
       if (hits.isEmpty()) {
         // if it is empty, it means this query node is not of term type
         // and we do not include these in the field intersection
-        // eg. cache filters, proximity groups
+        // e.g. cache filters, proximity groups
         continue;
       }
       if (!initializedIntersections) {

@@ -136,7 +136,7 @@ public abstract class PartitionManager extends OneTaskScheduledExecutorManager {
   }
 
   /**
-   * Notifies all other threads that the partition manager has become current (ie. has indexed all
+   * Notifies all other threads that the partition manager has become current (i.e. has indexed all
    * available events).
    */
   public void becomeCurrent() {

@@ -86,7 +86,7 @@ public abstract class SimpleStreamIndexer<K, V> {
   }
 
   /**
-   * Consume updates on startup until current (eg. until we've seen a record within 5 seconds
+   * Consume updates on startup until current (e.g. until we've seen a record within 5 seconds
    * of current time.)
    */
   public void readRecordsUntilCurrent() {

@@ -49,7 +49,7 @@ import com.twitter.search.earlybird.partition.SegmentManager;
  * looks at all the createdAt dates for all of the documents in that segment.
  *
  * Also keeps track off an exposes as a stat the number of hours that do not have any tweets in the
- * min/max range of data that IS indexed on this earlybird. i.e if we only have data for
+ * min/max range of data that IS indexed on this earlybird. i.e. if we only have data for
  * 2006/01/01:02 and 2006/01/01:04, it will consider 2006/01/01:03 as a missing hour.
  * Hours before 2006/01/01:02 or after 2006/01/01:04 will not be considered as missing.
  */

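The hour-gap example in this comment translates directly into a small computation. A minimal sketch of the stat it describes, with hour buckets represented as epoch-hours (names illustrative):

```scala
// Minimal sketch of the "missing hours" stat described above: only hours
// strictly inside the min/max range of indexed data count as missing; hours
// before the min or after the max do not.
def missingHourCount(indexedEpochHours: Set[Long]): Long =
  if (indexedEpochHours.size < 2) 0L
  else {
    val lo = indexedEpochHours.min
    val hi = indexedEpochHours.max
    (lo to hi).count(hour => !indexedEpochHours.contains(hour)).toLong
  }

// e.g. missingHourCount(Set(2L, 4L)) == 1, since hour 3 is missing.
```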
@@ -64,7 +64,7 @@ public class ConfigBasedQuotaConfig extends PeriodicFileLoader {
    * @param clientQuotaKey The key that will be used to extract client quotas.
    * @param requireQuotaConfigForClients Determines whether a client can be skipped
    *                                     if the associated object is missing the quota key
-   *                                     (ie a client that is a SuperRoot client but the current service is Archive)
+   *                                     (i.e. a client that is a SuperRoot client but the current service is Archive)
    */
   public static ConfigBasedQuotaConfig newConfigBasedQuotaConfig(
       String quotaConfigPath,

@@ -121,7 +121,7 @@ public class ConfigBasedQuotaConfig extends PeriodicFileLoader {
       JSONObject clientQuota = quotaConfig.getJSONObject(clientId);
 
       // Skip clients that don't send requests to this service.
-      // (ie some SuperRoot clients are not Archive clients)
+      // (i.e. some SuperRoot clients are not Archive clients)
       if (!requireQuotaConfigForClients && !clientQuota.has(clientQuotaKey)) {
         continue;
       }

@@ -1,5 +1,5 @@
 ## Feature Update Service
-Feature update service is a service that sends tweet feature updates e.g number of retweets, replies and favorites to Earlybird. Earlybird then indexes and uses these features to rank in-network Home Timeline tweets.
+Feature update service is a service that sends tweet feature updates e.g. number of retweets, replies and favorites to Earlybird. Earlybird then indexes and uses these features to rank in-network Home Timeline tweets.
 
 
 

@@ -56,7 +56,7 @@ public abstract class TwitterBatchedBaseStage<T, R> extends
   protected abstract Future<Collection<R>> innerProcessBatch(Collection<BatchedElement<T, R>>
       batch);
 
-  // classes that need to update their batch e.g after a decider change
+  // classes that need to update their batch e.g. after a decider change
   // can override this
   protected void updateBatchSize() {
   }

@@ -27,7 +27,7 @@ object FeatureGeneratorUtil {
 
   /**
    * Create vertex feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue)
-   * We will represent non-directional features (eg num_create_tweets) as "outgoing" values.
+   * We will represent non-directional features (e.g., num_create_tweets) as "outgoing" values.
    * @return
    */
   def getVertexFeature(

@@ -87,7 +87,7 @@ object FeatureGeneratorUtil {
 
   /**
    * Create edge feature from InteractionGraphRawInput graph (src, dst, feature name, age, featureValue)
-   * We will exclude all non-directional features (eg num_create_tweets) from all edge aggregates
+   * We will exclude all non-directional features (e.g., num_create_tweets) from all edge aggregates
    */
   def getEdgeFeature(
     input: SCollection[InteractionGraphRawInput]

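Read together, the two comments above state one rule for non-directional features such as num_create_tweets: vertex aggregates keep them (represented as "outgoing" values), edge aggregates drop them. A hedged sketch of that split (the non-directional set shown is illustrative, not the production list):

```scala
// Hedged sketch of the rule described above; the non-directional feature set
// is illustrative, not the production configuration.
val nonDirectional: Set[String] = Set("num_create_tweets")

// Vertex aggregates keep all features, treating non-directional ones as
// "outgoing"; edge aggregates exclude non-directional features entirely.
def includeInEdgeAggregates(featureName: String): Boolean =
  !nonDirectional.contains(featureName)
```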
@@ -8,7 +8,7 @@ import com.twitter.usersource.snapshot.flat.thriftscala.FlatUser
 object UserUtil {
 
   /**
-   * placeholder for the destId when representing vertex features with no dest (eg create tweet)
+   * placeholder for the destId when representing vertex features with no dest (e.g., create tweet)
    * this will only be aggregated and saved in the vertex datasets but not the edge datasets
    */
   val DUMMY_USER_ID = -1L

@@ -5,7 +5,7 @@ User Tweet Entity Graph (UTEG) is a Finalge thrift service built on the GraphJet
 
 ## How is it used on Twitter
 UTEG generates the "XXX Liked" out-of-network tweets seen on Twitter's Home Timeline.
-The core idea behind UTEG is collaborative filtering. UTEG takes a user's weighted follow graph (i.e a list of weighted userIds) as input,
+The core idea behind UTEG is collaborative filtering. UTEG takes a user's weighted follow graph (i.e. a list of weighted userIds) as input,
 performs efficient traversal & aggregation, and returns the top-weighted tweets engaged based on # of users that engaged the tweet, as well as
 the engaged users' weights.
 

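The aggregation this README describes can be captured in a toy sketch: sum the weights of the seed (followed) users who engaged each tweet, then return the top-weighted tweets. This is only the core idea, not the GraphJet implementation, and all names are illustrative:

```scala
// Toy sketch of the collaborative-filtering aggregation described above,
// not the production GraphJet traversal.
def topEngagedTweets(
  weightedFollowGraph: Map[Long, Double], // userId -> weight
  engagements: Seq[(Long, Long)],         // (userId, tweetId) engagement edges
  k: Int
): Seq[(Long, Double)] =
  engagements
    .flatMap { case (userId, tweetId) =>
      weightedFollowGraph.get(userId).map(weight => tweetId -> weight)
    }
    .groupMapReduce { case (tweetId, _) => tweetId } { case (_, w) => w }(_ + _)
    .toSeq
    .sortBy { case (_, totalWeight) => -totalWeight } // heaviest tweets first
    .take(k)
```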
@@ -5,7 +5,7 @@ User User Graph (UUG) is a Finalge thrift service built on the GraphJet framewor
 
 ## How is it used on Twitter
 UUG recommends users to follow based on who your follow graph have recently followed.
-The core idea behind UUG is collaborative filtering. UUG takes a user's weighted follow graph (i.e a list of weighted userIds) as input,
+The core idea behind UUG is collaborative filtering. UUG takes a user's weighted follow graph (i.e. a list of weighted userIds) as input,
 performs efficient traversal & aggregation, and returns the top weighted users basd on # of users that engaged the users, as well as
 the engaging users' weights.
 

@@ -95,7 +95,7 @@ object DataSources {
         case (userId, l1Norm) if l1Norm <= maxNumBlocks =>
           userId
       }
-      // retain only those users who give legit blocks (i.e those users who give less than numBlocks95)
+      // retain only those users who give legit blocks (i.e. those users who give less than numBlocks95)
       userGivingBlocks.filterRows(usersWithLegitBlocks)
     }
   }

@@ -14,7 +14,7 @@ object TopicsForProducersUtils {
   ] =
     Bufferable.injectionOf[(SemanticCoreEntityId, Option[Language], Option[Country])]
 
-  // This function provides the set of 'valid' topics, i.e topics with atleast a certain number of
+  // This function provides the set of 'valid' topics, i.e. topics with atleast a certain number of
   // follows. This helps remove some noisy topic associations to producers in the dataset.
   def getValidTopics(
     topicUsers: TypedPipe[((TopicId, Option[Language], Option[Country]), UserId, Double)],

@@ -39,7 +39,7 @@ object DataSources {
       case KeyVal(userId, clustersUserIsInterestedIn) =>
         val clustersPostFiltering = clustersUserIsInterestedIn.clusterIdToScores.filter {
           case (clusterId, clusterScores) =>
-            // filter out popular clusters (i.e clusters with > 5M users interested in it) from the user embedding
+            // filter out popular clusters (i.e. clusters with > 5M users interested in it) from the user embedding
             clusterScores.numUsersInterestedInThisClusterUpperBound.exists(
               _ < UserInterestedInReadableStore.MaxClusterSizeForUserInterestedInDataset)
         }

@@ -567,7 +567,7 @@ object UpdateKnownForSBFRunner {
 
     val clusterAssignmentWithMaxScore: List[(ClusterId, Float)] =
       if (allClustersWithScores.length > 1) {
-        // if sparseBinaryMatrix z has rows with more than one non-zero column (i.e a user
+        // if sparseBinaryMatrix z has rows with more than one non-zero column (i.e. a user
         // initialized with more than one cluster), and the clustering algorithm doesnot find
         // a better proposal for cluster assignment, the user's multi-cluster membership
        // from the initialization step can continue.

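The branch documented above handles users that still carry several cluster assignments after initialization; judging by the variable name, it keeps only the highest-scoring one. A hedged sketch of that selection (ClusterId simplified to Int; the production code may break ties differently):

```scala
// Hedged sketch of the max-score selection suggested by the variable name
// clusterAssignmentWithMaxScore; types are simplified for illustration.
def keepMaxScoreAssignment(assignments: List[(Int, Float)]): List[(Int, Float)] =
  if (assignments.length > 1)
    List(assignments.maxBy { case (_, score) => score })
  else
    assignments
```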
@@ -1315,7 +1315,7 @@ struct EarlybirdDebugInfo {
   // Requests sent to dependent services. For example, superroot sends to realtime root,
   // archive root, etc.
   4: optional list<EarlybirdRequestResponse> sentRequests;
-  // segment level debug info (eg. hitsPerSegment, max/minSearchedTime etc.)
+  // segment level debug info (e.g. hitsPerSegment, max/minSearchedTime etc.)
   5: optional list<string> collectorDebugInfo
   6: optional list<string> termStatisticsDebugInfo
 }

@@ -117,7 +117,7 @@ object PartiallyHydratedTweet {
         sourceStatusId = retweetSourceTweetId.get,
         sourceUserId = retweetSourceUserId.get,
         parentStatusId =
-          retweetSourceTweetId.get // Not always correct (eg, retweet of a retweet).
+          retweetSourceTweetId.get // Not always correct (e.g., retweet of a retweet).
       )
     )
   } else None

@@ -183,7 +183,7 @@ object TweetMediaFeaturesExtractor {
       }
     }.flatten
 
-  // 3rd party media providers. eg. giphy for gifs
+  // 3rd party media providers. e.g. giphy for gifs
   private def getMediaOriginProviders(mediaEntities: Seq[MediaEntity]): Seq[String] =
     for {
       mediaEntity <- mediaEntities

@@ -42,7 +42,7 @@ class FactorizationMachine(Layer):
     Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will
     make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial
     speed up at training time when input_size is large and optimizer handles sparse gradients
-    correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
+    correctly (e.g. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
     to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will
     be large, so it's better to set it to `True`
   use_binary_values:

@@ -53,7 +53,7 @@ class FullSparse(Layer):
     Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will
     make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial
     speed up at training time when input_size is large and optimizer handles sparse gradients
-    correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
+    correctly (e.g. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
     to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will
     be large, so it's better to set it to `True`
   num_partitions:

@@ -319,7 +319,7 @@ def full_sparse(
     Boolean, if `True` do sparse mat mul with `embedding_lookup_sparse`, which will
     make gradients to weight matrix also sparse in backward pass. This can lead to non-trivial
     speed up at training time when input_size is large and optimizer handles sparse gradients
-    correctly (eg. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
+    correctly (e.g. with SGD or LazyAdamOptimizer). If weight matrix is small, it's recommended
     to set this flag to `False`; for most use cases of FullSparse, however, weight matrix will
     be large, so it's better to set it to `True`
   num_partitions: