mirror of
https://github.com/twitter/the-algorithm.git
synced 2025-06-13 12:58:23 +02:00
[minor] Fix grammar + typo issues
Closes #557, closes #678, closes #748, closes #806, closes #818, closes #842, closes #866, closes #948, closes #1024, closes #1313, closes #1458, closes #1461, closes #1465, closes #1491, closes #1503, closes #1539, closes #1611
This commit is contained in:
@ -513,12 +513,12 @@ public class BasicIndexingConverter {
|
||||
Optional<Long> inReplyToUserId = Optional.of(inReplyToUserIdVal).filter(x -> x > 0);
|
||||
Optional<Long> inReplyToStatusId = Optional.of(inReplyToStatusIdVal).filter(x -> x > 0);
|
||||
|
||||
// We have six combinations here. A tweet can be
|
||||
// We have six combinations here. A Tweet can be
|
||||
// 1) a reply to another tweet (then it has both in-reply-to-user-id and
|
||||
// in-reply-to-status-id set),
|
||||
// 2) directed-at a user (then it only has in-reply-to-user-id set),
|
||||
// 3) not a reply at all.
|
||||
// Additionally, it may or may not be a retweet (if it is, then it has retweet-user-id and
|
||||
// Additionally, it may or may not be a Retweet (if it is, then it has retweet-user-id and
|
||||
// retweet-status-id set).
|
||||
//
|
||||
// We want to set some fields unconditionally, and some fields (reference-author-id and
|
||||
|
@ -22,13 +22,13 @@ import static com.twitter.search.modeling.tweet_ranking.TweetScoringFeatures.Fea
|
||||
/**
|
||||
* Loads the scoring models for tweets and provides access to them.
|
||||
*
|
||||
* This class relies on a list ModelLoader objects to retrieve the objects from them. It will
|
||||
* This class relies on a list of ModelLoader objects to retrieve the objects from them. It will
|
||||
* return the first model found according to the order in the list.
|
||||
*
|
||||
* For production, we load models from 2 sources: classpath and HDFS. If a model is available
|
||||
* from HDFS, we return it, otherwise we use the model from the classpath.
|
||||
*
|
||||
* The models used in for default requests (i.e. not experiments) MUST be present in the
|
||||
* The models used for default requests (i.e. not experiments) MUST be present in the
|
||||
* classpath; this allows us to avoid errors if they can't be loaded from HDFS.
|
||||
* Models for experiments can live only in HDFS, so we don't need to redeploy Earlybird if we
|
||||
* want to test them.
|
||||
|
@ -3,76 +3,81 @@ from twml.feature_config import FeatureConfigBuilder
|
||||
|
||||
|
||||
def get_feature_config(data_spec_path, label):
|
||||
return FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True) \
|
||||
return (
|
||||
FeatureConfigBuilder(data_spec_path=data_spec_path, debug=True)
|
||||
.batch_add_features(
|
||||
[
|
||||
("ebd.author_specific_score", "A"),
|
||||
("ebd.has_diff_lang", "A"),
|
||||
("ebd.has_english_tweet_diff_ui_lang", "A"),
|
||||
("ebd.has_english_ui_diff_tweet_lang", "A"),
|
||||
("ebd.is_self_tweet", "A"),
|
||||
("ebd.tweet_age_in_secs", "A"),
|
||||
("encoded_tweet_features.favorite_count", "A"),
|
||||
("encoded_tweet_features.from_verified_account_flag", "A"),
|
||||
("encoded_tweet_features.has_card_flag", "A"),
|
||||
# ("encoded_tweet_features.has_consumer_video_flag", "A"),
|
||||
("encoded_tweet_features.has_image_url_flag", "A"),
|
||||
("encoded_tweet_features.has_link_flag", "A"),
|
||||
("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"),
|
||||
# ("encoded_tweet_features.has_multiple_media_flag", "A"),
|
||||
("encoded_tweet_features.has_native_image_flag", "A"),
|
||||
("encoded_tweet_features.has_news_url_flag", "A"),
|
||||
("encoded_tweet_features.has_periscope_flag", "A"),
|
||||
("encoded_tweet_features.has_pro_video_flag", "A"),
|
||||
("encoded_tweet_features.has_quote_flag", "A"),
|
||||
("encoded_tweet_features.has_trend_flag", "A"),
|
||||
("encoded_tweet_features.has_video_url_flag", "A"),
|
||||
("encoded_tweet_features.has_vine_flag", "A"),
|
||||
("encoded_tweet_features.has_visible_link_flag", "A"),
|
||||
("encoded_tweet_features.is_offensive_flag", "A"),
|
||||
("encoded_tweet_features.is_reply_flag", "A"),
|
||||
("encoded_tweet_features.is_retweet_flag", "A"),
|
||||
("encoded_tweet_features.is_sensitive_content", "A"),
|
||||
# ("encoded_tweet_features.is_user_new_flag", "A"),
|
||||
("encoded_tweet_features.language", "A"),
|
||||
("encoded_tweet_features.link_language", "A"),
|
||||
("encoded_tweet_features.num_hashtags", "A"),
|
||||
("encoded_tweet_features.num_mentions", "A"),
|
||||
# ("encoded_tweet_features.profile_is_egg_flag", "A"),
|
||||
("encoded_tweet_features.reply_count", "A"),
|
||||
("encoded_tweet_features.retweet_count", "A"),
|
||||
("encoded_tweet_features.text_score", "A"),
|
||||
("encoded_tweet_features.user_reputation", "A"),
|
||||
("extended_encoded_tweet_features.embeds_impression_count", "A"),
|
||||
("extended_encoded_tweet_features.embeds_impression_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.embeds_url_count", "A"),
|
||||
("extended_encoded_tweet_features.embeds_url_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.favorite_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_dup_content_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_spam_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.quote_count", "A"),
|
||||
("extended_encoded_tweet_features.reply_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.retweet_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.weighted_favorite_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_quote_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_reply_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_retweet_count", "A"),
|
||||
]
|
||||
).add_labels([
|
||||
label, # Tensor index: 0
|
||||
"recap.engagement.is_clicked", # Tensor index: 1
|
||||
"recap.engagement.is_favorited", # Tensor index: 2
|
||||
"recap.engagement.is_open_linked", # Tensor index: 3
|
||||
"recap.engagement.is_photo_expanded", # Tensor index: 4
|
||||
"recap.engagement.is_profile_clicked", # Tensor index: 5
|
||||
"recap.engagement.is_replied", # Tensor index: 6
|
||||
"recap.engagement.is_retweeted", # Tensor index: 7
|
||||
"recap.engagement.is_video_playback_50", # Tensor index: 8
|
||||
"timelines.earlybird_score", # Tensor index: 9
|
||||
]) \
|
||||
.define_weight("meta.record_weight/type=earlybird") \
|
||||
[
|
||||
("ebd.author_specific_score", "A"),
|
||||
("ebd.has_diff_lang", "A"),
|
||||
("ebd.has_english_tweet_diff_ui_lang", "A"),
|
||||
("ebd.has_english_ui_diff_tweet_lang", "A"),
|
||||
("ebd.is_self_tweet", "A"),
|
||||
("ebd.tweet_age_in_secs", "A"),
|
||||
("encoded_tweet_features.favorite_count", "A"),
|
||||
("encoded_tweet_features.from_verified_account_flag", "A"),
|
||||
("encoded_tweet_features.has_card_flag", "A"),
|
||||
# ("encoded_tweet_features.has_consumer_video_flag", "A"),
|
||||
("encoded_tweet_features.has_image_url_flag", "A"),
|
||||
("encoded_tweet_features.has_link_flag", "A"),
|
||||
("encoded_tweet_features.has_multiple_hashtags_or_trends_flag", "A"),
|
||||
# ("encoded_tweet_features.has_multiple_media_flag", "A"),
|
||||
("encoded_tweet_features.has_native_image_flag", "A"),
|
||||
("encoded_tweet_features.has_news_url_flag", "A"),
|
||||
("encoded_tweet_features.has_periscope_flag", "A"),
|
||||
("encoded_tweet_features.has_pro_video_flag", "A"),
|
||||
("encoded_tweet_features.has_quote_flag", "A"),
|
||||
("encoded_tweet_features.has_trend_flag", "A"),
|
||||
("encoded_tweet_features.has_video_url_flag", "A"),
|
||||
("encoded_tweet_features.has_vine_flag", "A"),
|
||||
("encoded_tweet_features.has_visible_link_flag", "A"),
|
||||
("encoded_tweet_features.is_offensive_flag", "A"),
|
||||
("encoded_tweet_features.is_reply_flag", "A"),
|
||||
("encoded_tweet_features.is_retweet_flag", "A"),
|
||||
("encoded_tweet_features.is_sensitive_content", "A"),
|
||||
# ("encoded_tweet_features.is_user_new_flag", "A"),
|
||||
("encoded_tweet_features.language", "A"),
|
||||
("encoded_tweet_features.link_language", "A"),
|
||||
("encoded_tweet_features.num_hashtags", "A"),
|
||||
("encoded_tweet_features.num_mentions", "A"),
|
||||
# ("encoded_tweet_features.profile_is_egg_flag", "A"),
|
||||
("encoded_tweet_features.reply_count", "A"),
|
||||
("encoded_tweet_features.retweet_count", "A"),
|
||||
("encoded_tweet_features.text_score", "A"),
|
||||
("encoded_tweet_features.user_reputation", "A"),
|
||||
("extended_encoded_tweet_features.embeds_impression_count", "A"),
|
||||
("extended_encoded_tweet_features.embeds_impression_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.embeds_url_count", "A"),
|
||||
("extended_encoded_tweet_features.embeds_url_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.favorite_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.label_abusive_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_dup_content_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_nsfw_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_spam_flag", "A"),
|
||||
("extended_encoded_tweet_features.label_spam_hi_rcl_flag", "A"),
|
||||
("extended_encoded_tweet_features.quote_count", "A"),
|
||||
("extended_encoded_tweet_features.reply_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.retweet_count_v2", "A"),
|
||||
("extended_encoded_tweet_features.weighted_favorite_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_quote_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_reply_count", "A"),
|
||||
("extended_encoded_tweet_features.weighted_retweet_count", "A"),
|
||||
]
|
||||
)
|
||||
.add_labels(
|
||||
[
|
||||
label, # Tensor index: 0
|
||||
"recap.engagement.is_clicked", # Tensor index: 1
|
||||
"recap.engagement.is_favorited", # Tensor index: 2
|
||||
"recap.engagement.is_open_linked", # Tensor index: 3
|
||||
"recap.engagement.is_photo_expanded", # Tensor index: 4
|
||||
"recap.engagement.is_profile_clicked", # Tensor index: 5
|
||||
"recap.engagement.is_replied", # Tensor index: 6
|
||||
"recap.engagement.is_retweeted", # Tensor index: 7
|
||||
"recap.engagement.is_video_playback_50", # Tensor index: 8
|
||||
"timelines.earlybird_score", # Tensor index: 9
|
||||
]
|
||||
)
|
||||
.define_weight("meta.record_weight/type=earlybird")
|
||||
.build()
|
||||
)
|
||||
|
@ -1,3 +1,5 @@
|
||||
Tweepcred
|
||||
|
||||
Tweepcred is a social network analysis tool that calculates the influence of Twitter users based on their interactions with other users. The tool uses the PageRank algorithm to rank users based on their influence.
|
||||
|
||||
PageRank Algorithm
|
||||
@ -70,4 +72,4 @@ The algorithm tests for convergence by calculating the total difference between
|
||||
|
||||
This is a helper class called Reputation that contains methods for calculating a user's reputation score. The first method called scaledReputation takes a Double parameter raw which represents the user's page rank, and returns a Byte value that represents the user's reputation on a scale of 0 to 100. This method uses a formula that involves converting the logarithm of the page rank to a number between 0 and 100.
|
||||
|
||||
The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank.
|
||||
The second method called adjustReputationsPostCalculation takes three parameters: mass (a Double value representing the user's page rank), numFollowers (an Int value representing the number of followers a user has), and numFollowings (an Int value representing the number of users a user is following). This method reduces the page rank of users who have a low number of followers but a high number of followings. It calculates a division factor based on the ratio of followings to followers, and reduces the user's page rank by dividing it by this factor. The method returns the adjusted page rank.
|
||||
|
@ -1,17 +1,17 @@
|
||||
# UserTweetEntityGraph (UTEG)
|
||||
|
||||
## What is it
|
||||
User Tweet Entity Graph (UTEG) is a Finalge thrift service built on the GraphJet framework. In maintains a graph of user-tweet relationships and serves user recommendations based on traversals in this graph.
|
||||
User Tweet Entity Graph (UTEG) is a Finagle Thrift service built on the GraphJet framework. It maintains a graph of user-tweet relationships and serves user recommendations based on traversals in this graph.
|
||||
|
||||
## How is it used on Twitter
|
||||
UTEG generates the "XXX Liked" out-of-network tweets seen on Twitter's Home Timeline.
|
||||
The core idea behind UTEG is collaborative filtering. UTEG takes a user's weighted follow graph (i.e a list of weighted userIds) as input,
|
||||
performs efficient traversal & aggregation, and returns the top weighted tweets engaged basd on # of users that engaged the tweet, as well as
|
||||
The core idea behind UTEG is collaborative filtering. UTEG takes a user's weighted follow graph (i.e., a list of weighted userIds) as input,
|
||||
performs efficient traversal & aggregation, and returns the top-weighted tweets based on the number of users that engaged with each tweet, as well as
|
||||
the engaged users' weights.
|
||||
|
||||
UTEG is a stateful service and relies on a Kafka stream to ingest & persist states. It maintains an in-memory user engagements over the past
|
||||
24-48 hours. Older events are dropped and GC'ed.
|
||||
UTEG is a stateful service and relies on a Kafka stream to ingest & persist states. It maintains in-memory user engagements over the past
|
||||
24-48 hours. Older events are dropped and GC'ed.
|
||||
|
||||
For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose high performance in-memory storage engine.
|
||||
For full details on storage & processing, please check out our open-sourced project GraphJet, a general-purpose high-performance in-memory storage engine.
|
||||
- https://github.com/twitter/GraphJet
|
||||
- http://www.vldb.org/pvldb/vol9/p1281-sharma.pdf
|
||||
|
@ -78,7 +78,7 @@ sealed trait SimClustersEmbedding extends Equals {
|
||||
CosineSimilarityUtil.applyNormArray(sortedScores, expScaledNorm)
|
||||
|
||||
/**
|
||||
* The Standard Deviation of a Embedding.
|
||||
* The Standard Deviation of an Embedding.
|
||||
*/
|
||||
lazy val std: Double = {
|
||||
if (scores.isEmpty) {
|
||||
|
@ -306,7 +306,7 @@ struct ThriftFacetRankingOptions {
|
||||
// penalty for keyword stuffing
|
||||
60: optional i32 multipleHashtagsOrTrendsPenalty
|
||||
|
||||
// Langauge related boosts, similar to those in relevance ranking options. By default they are
|
||||
// Language related boosts, similar to those in relevance ranking options. By default they are
|
||||
// all 1.0 (no-boost).
|
||||
// When the user language is English, facet language is not
|
||||
11: optional double langEnglishUIBoost = 1.0
|
||||
|
@ -728,7 +728,7 @@ struct ThriftSearchResultMetadata {
|
||||
29: optional double parusScore
|
||||
|
||||
// Extra feature data, all new feature fields you want to return from Earlybird should go into
|
||||
// this one, the outer one is always reaching its limit of the nubmer of fields JVM can
|
||||
// this one, the outer one is always reaching its limit of the number of fields the JVM can
|
||||
// comfortably support!!
|
||||
86: optional ThriftSearchResultExtraMetadata extraMetadata
|
||||
|
||||
@ -831,7 +831,7 @@ struct ThriftSearchResult {
|
||||
12: optional list<hits.ThriftHits> cardTitleHitHighlights
|
||||
13: optional list<hits.ThriftHits> cardDescriptionHitHighlights
|
||||
|
||||
// Expansion types, if expandResult == False, the expasions set should be ignored.
|
||||
// Expansion types, if expandResult == False, the expansions set should be ignored.
|
||||
8: optional bool expandResult = 0
|
||||
9: optional set<expansions.ThriftTweetExpansionType> expansions
|
||||
|
||||
@ -971,7 +971,7 @@ struct ThriftTermStatisticsResults {
|
||||
// The binIds will correspond to the times of the hits matching the driving search query for this
|
||||
// term statistics request.
|
||||
// If there were no hits matching the search query, numBins binIds will be returned, but the
|
||||
// values of the binIds will not meaninfully correspond to anything related to the query, and
|
||||
// values of the binIds will not meaningfully correspond to anything related to the query, and
|
||||
// should not be used. Such cases can be identified by ThriftSearchResults.numHitsProcessed being
|
||||
// set to 0 in the response, and the response not being early terminated.
|
||||
3: optional list<i32> binIds
|
||||
@ -1097,8 +1097,8 @@ struct ThriftSearchResults {
|
||||
// Superroots' schema merge/choose logic when returning results to clients:
|
||||
// . pick the schema based on the order of: realtime > protected > archive
|
||||
// . because of the above ordering, it is possible that archive earlybird schema with a new flush
|
||||
// verion (with new bit features) might be lost to older realtime earlybird schema; this is
|
||||
// considered to to be rare and accetable because one realtime earlybird deploy would fix it
|
||||
// version (with new bit features) might be lost to older realtime earlybird schema; this is
|
||||
// considered to be rare and acceptable because one realtime earlybird deploy would fix it
|
||||
21: optional features.ThriftSearchFeatureSchema featureSchema
|
||||
|
||||
// How long it took to score the results in earlybird (in nanoseconds). The number of results
|
||||
|
@ -29,8 +29,8 @@ struct AdhocSingleSideClusterScores {
|
||||
* we implement will use search abuse reports and impressions. We can build stores for new values
|
||||
* in the future.
|
||||
*
|
||||
* The consumer creates the interactions which the author recieves. For instance, the consumer
|
||||
* creates an abuse report for an author. The consumer scores are related to the interation creation
|
||||
* The consumer creates the interactions which the author receives. For instance, the consumer
|
||||
* creates an abuse report for an author. The consumer scores are related to the interaction creation
|
||||
* behavior of the consumer. The author scores are related to whether the author receives these
|
||||
* interactions.
|
||||
*
|
||||
|
@ -70,7 +70,7 @@ struct TweetTopKTweetsWithScore {
|
||||
/**
|
||||
* The generic SimClustersEmbedding for online long-term storage and real-time calculation.
|
||||
* Use SimClustersEmbeddingId as the only identifier.
|
||||
* Warning: Doesn't include modelversion and embedding type in the value struct.
|
||||
* Warning: Doesn't include model version and embedding type in the value struct.
|
||||
**/
|
||||
struct SimClustersEmbedding {
|
||||
1: required list<SimClusterWithScore> embedding
|
||||
|
@ -50,7 +50,7 @@ struct CandidateTweets {
|
||||
}(hasPersonalData = 'true')
|
||||
|
||||
/**
|
||||
* An encapuslated collection of reference tweets
|
||||
* An encapsulated collection of reference tweets
|
||||
**/
|
||||
struct ReferenceTweets {
|
||||
1: required i64 targetUserId(personalDataType = 'UserId')
|
||||
|
@ -33,12 +33,12 @@ enum EmbeddingType {
|
||||
Pop10000RankDecay11Tweet = 31,
|
||||
OonPop1000RankDecayTweet = 32,
|
||||
|
||||
// [Experimental] Offline generated produciton-like LogFavScore-based Tweet Embedding
|
||||
// [Experimental] Offline generated production-like LogFavScore-based Tweet Embedding
|
||||
OfflineGeneratedLogFavBasedTweet = 40,
|
||||
|
||||
// Reserve 51-59 for Ads Embedding
|
||||
LogFavBasedAdsTweet = 51, // Experimenal embedding for ads tweet candidate
|
||||
LogFavClickBasedAdsTweet = 52, // Experimenal embedding for ads tweet candidate
|
||||
LogFavBasedAdsTweet = 51, // Experimental embedding for ads tweet candidate
|
||||
LogFavClickBasedAdsTweet = 52, // Experimental embedding for ads tweet candidate
|
||||
|
||||
// Reserve 60-69 for Evergreen content
|
||||
LogFavBasedEvergreenTweet = 60,
|
||||
@ -104,7 +104,7 @@ enum EmbeddingType {
|
||||
//Reserved 401 - 500 for Space embedding
|
||||
FavBasedApeSpace = 401 // DEPRECATED
|
||||
LogFavBasedListenerSpace = 402 // DEPRECATED
|
||||
LogFavBasedAPESpeakerSpace = 403 // DEPRCATED
|
||||
LogFavBasedAPESpeakerSpace = 403 // DEPRECATED
|
||||
LogFavBasedUserInterestedInListenerSpace = 404 // DEPRECATED
|
||||
|
||||
// Experimental, internal-only IDs
|
||||
|
Reference in New Issue
Block a user