mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-26 13:06:05 +02:00
![twitter-team](/assets/img/avatar_default.png)
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
260 lines
12 KiB
Thrift
260 lines
12 KiB
Thrift
namespace java com.twitter.simclusters_v2.thriftjava
|
|
namespace py gen.twitter.simclusters_v2.interests
|
|
#@namespace scala com.twitter.simclusters_v2.thriftscala
|
|
#@namespace strato com.twitter.simclusters_v2
|
|
|
|
/**
 * Scores relating one user to one cluster the user is interested in.
 *
 * Every score in this struct presumes that each cluster's knownFor vector has
 * already been scaled to unit L2 norm, i.e. its squared entries sum to 1.
 **/
struct UserToInterestedInClusterScores {
  // Dot product between the user's binary follow vector and this cluster's knownFor vector.
  // TIP: this score (or favScore) is the default one to use.
  1: optional double followScore(personalDataType = 'CountOfFollowersAndFollowees')

  // Start from followScore (field 1), take the L2 norm of the vector of these scores
  // for this cluster, and divide by it.
  // Net effect: the more people are interested in this cluster, the lower the score.
  // TIP: choose this score when clusters that many other users are also interested in
  // should be penalized.
  2: optional double followScoreClusterNormalizedOnly(personalDataType = 'CountOfFollowersAndFollowees')

  // Dot product between the user's producer-normalized follow vector and this cluster's
  // knownFor vector, where the i^th entry of the normalized follow vector is
  // 1.0/sqrt(number of followers of user i).
  // TIP: choose this score when clusters whose known-for users are popular should be
  // penalized.
  3: optional double followScoreProducerNormalizedOnly(personalDataType = 'CountOfFollowersAndFollowees')

  // Start from followScoreProducerNormalizedOnly (field 3), take the L2 norm of the
  // vector of these scores for this cluster, and divide by it.
  // Net effect: the more people are interested in this cluster, the lower the score.
  // TIP: choose this score when both kinds of clusters should be penalized: those that
  // many other users are interested in, and those whose known-for users are popular.
  4: optional double followScoreClusterAndProducerNormalized(personalDataType = 'CountOfFollowersAndFollowees')

  // Dot product between the user's favScoreHalfLife100Days vector and this cluster's
  // knownFor vector.
  // TIP: this score (or followScore) is the default one to use.
  5: optional double favScore(personalDataType = 'EngagementsPublic')

  // Start from favScore (field 5), take the L2 norm of the vector of these scores
  // for this cluster, and divide by it.
  // Net effect: the more people are interested in this cluster, the lower the score.
  // TIP: choose this score when clusters that many other users are also interested in
  // should be penalized.
  6: optional double favScoreClusterNormalizedOnly(personalDataType = 'EngagementsPublic')

  // Dot product between the user's favScoreHalfLife100DaysNormalizedByNeighborFaversL2
  // vector and this cluster's knownFor vector.
  // TIP: choose this score when clusters whose known-for users are popular should be
  // penalized.
  7: optional double favScoreProducerNormalizedOnly(personalDataType = 'EngagementsPublic')

  // Start from favScoreProducerNormalizedOnly (field 7), take the L2 norm of the
  // vector of these scores for this cluster, and divide by it.
  // Net effect: the more people are interested in this cluster, the lower the score.
  // TIP: choose this score when both kinds of clusters should be penalized: those that
  // many other users are interested in, and those whose known-for users are popular.
  8: optional double favScoreClusterAndProducerNormalized(personalDataType = 'EngagementsPublic')

  // Users who are known for this cluster and are also followed by the user.
  9: optional list<i64> usersBeingFollowed(personalDataType = 'UserId')

  // Users who are known for this cluster and were also faved by the user at some point.
  10: optional list<i64> usersThatWereFaved(personalDataType = 'UserId')

  // A fairly tight upper bound on how many users are interested in this cluster.
  // Helps tell a niche community apart from a popular topic.
  11: optional i32 numUsersInterestedInThisClusterUpperBound

  // Dot product between the user's logFavScore vector and this cluster's knownFor vector.
  // TIP: this score is still experimental.
  12: optional double logFavScore(personalDataType = 'EngagementsPublic')

  // Start from logFavScore (field 12), take the L2 norm of the vector of these scores
  // for this cluster, and divide by it.
  // Net effect: the more people are interested in this cluster, the lower the score.
  // TIP: this score is still experimental.
  13: optional double logFavScoreClusterNormalizedOnly(personalDataType = 'EngagementsPublic')

  // Actual count of users who are known for this cluster and are also followed by the user.
  14: optional i32 numUsersBeingFollowed

  // Actual count of users who are known for this cluster and were also faved by the user
  // at some point.
  15: optional i32 numUsersThatWereFaved
}(persisted = 'true', hasPersonalData = 'true')
|
|
|
|
/**
 * A user id together with a knownFor model version and a map from cluster id
 * to that user's UserToInterestedInClusterScores.
 **/
struct UserToInterestedInClusters {
  1: required i64 userId(personalDataType = 'UserId')
  2: required string knownForModelVersion
  // Keyed by cluster id.
  3: required map<i32, UserToInterestedInClusterScores> clusterIdToScores(personalDataTypeKey = 'InferredInterests')
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
/**
 * A language together with a knownFor model version and a map from cluster id
 * to UserToInterestedInClusterScores.
 **/
struct LanguageToClusters {
  1: required string language
  2: required string knownForModelVersion
  // Keyed by cluster id.
  3: required map<i32, UserToInterestedInClusterScores> clusterIdToScores(personalDataTypeKey = 'InferredInterests')
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
/**
 * A knownFor model version plus a map from cluster id to
 * UserToInterestedInClusterScores. Unlike UserToInterestedInClusters, this
 * struct carries no userId field.
 **/
struct ClustersUserIsInterestedIn {
  1: required string knownForModelVersion
  // Keyed by cluster id.
  2: required map<i32, UserToInterestedInClusterScores> clusterIdToScores(personalDataTypeKey = 'InferredInterests')
}(persisted = 'true', hasPersonalData = 'true')
|
|
|
|
/**
 * A user id together with a knownFor model version and a map from cluster id
 * to that user's UserToKnownForClusterScores.
 **/
struct UserToKnownForClusters {
  1: required i64 userId(personalDataType = 'UserId')
  2: required string knownForModelVersion
  // Keyed by cluster id.
  3: required map<i32, UserToKnownForClusterScores> clusterIdToScores(personalDataTypeKey = 'InferredInterests')
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
/** Holds the single, optional knownFor score of a user for one cluster. */
struct UserToKnownForClusterScores {
  1: optional double knownForScore
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * A knownFor model version plus a map from cluster id to
 * UserToKnownForClusterScores. Unlike UserToKnownForClusters, this struct
 * carries no userId field.
 **/
struct ClustersUserIsKnownFor {
  1: required string knownForModelVersion
  // Keyed by cluster id.
  2: required map<i32, UserToKnownForClusterScores> clusterIdToScores(personalDataTypeKey = 'InferredInterests')
}(persisted = 'true', hasPersonalData = 'true')
|
|
|
|
/**
 * Lower/upper bound pair for a quantile, as output by QTreeMonoid in Algebird.
 **/
struct QuantileBounds {
  1: required double lowerBound
  2: required double upperBound
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * Summary of how a set of doubles is distributed. Only the mean is required;
 * the quantiles (p25/p50/p75/p95) are expressed as QuantileBounds.
 **/
struct DistributionDetails {
  1: required double mean
  2: optional double standardDeviation
  3: optional double min
  4: optional QuantileBounds p25
  5: optional QuantileBounds p50
  6: optional QuantileBounds p75
  7: optional QuantileBounds p95
  8: optional double max
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * A neighboring cluster and its similarities. The modelVersion is not stored
 * here; it is specified outside this struct, specifically as part of the key.
 **/
struct ClusterNeighbor {
  1: required i32 clusterId

  /** followCosineSimilarity equals the dot product over followScoreClusterNormalizedOnly,
   * because those scores form a unit vector **/
  2: optional double followCosineSimilarity

  /** favCosineSimilarity equals the dot product over favScoreClusterNormalizedOnly,
   * because those scores form a unit vector **/
  3: optional double favCosineSimilarity

  /** logFavCosineSimilarity equals the dot product over logFavScoreClusterNormalizedOnly,
   * because those scores form a unit vector **/
  4: optional double logFavCosineSimilarity
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * A (userId, score) pair. Handy for storing the list of users known for a cluster.
 **/
struct UserWithScore {
  1: required i64 userId(personalDataType = 'UserId')
  2: required double score
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
// Deprecated. Only referenced by the deprecated_* fields of ClusterQuality.
struct EdgeCut {
  1: required double cutEdges
  2: required double totalVolume
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * Quality measurements for a cluster. Fields 1-3 are deprecated EdgeCut
 * values; the remaining fields are correlation, recall and precision figures.
 **/
struct ClusterQuality {
  // Deprecated.
  1: optional EdgeCut deprecated_unweightedEdgeCut
  // Deprecated.
  2: optional EdgeCut deprecated_edgeWeightedCut
  // Deprecated.
  3: optional EdgeCut deprecated_nodeAndEdgeWeightedCut

  // Correlation between the actual weight of (u, v) and
  // I(u & v in same cluster) * score(u) * score(v).
  4: optional double weightAndProductOfNodeScoresCorrelation

  // Fraction of edges that stay inside the cluster, out of all edges from nodes in the cluster.
  5: optional double unweightedRecall

  // Fraction of edge weight that stays inside the cluster, out of the total edge weight
  // from nodes in the cluster.
  6: optional double weightedRecall

  // Total number of edges from nodes in the cluster.
  7: optional double unweightedRecallDenominator

  // Total edge weight from nodes in the cluster.
  8: optional double weightedRecallDenominator

  // Sum of edge weights inside the cluster / { #nodes * (#nodes - 1) }.
  9: optional double relativePrecisionNumerator

  // relativePrecisionNumerator divided by
  // (sum of edge weights in the total graph / { n * (n - 1) }).
  10: optional double relativePrecision
}(persisted = 'true', hasPersonalData = 'false')
|
|
|
|
/**
 * Value type of the ClusterDetails key-value dataset.
 * The corresponding key is (modelVersion, clusterId).
 **/
struct ClusterDetails {
  1: required i32 numUsersWithAnyNonZeroScore
  2: required i32 numUsersWithNonZeroFollowScore
  3: required i32 numUsersWithNonZeroFavScore
  4: optional DistributionDetails followScoreDistributionDetails
  5: optional DistributionDetails favScoreDistributionDetails
  6: optional list<UserWithScore> knownForUsersAndScores
  7: optional list<ClusterNeighbor> neighborClusters

  // Fraction of this cluster's known-for users that are marked NSFW_User in UserSource.
  8: optional double fractionKnownForMarkedNSFWUser

  // Major languages found in the "language" field (in UserSource) of this cluster's
  // known_fors, together with their fractions.
  9: optional map<string, double> languageToFractionDeviceLanguage

  // Major country codes found in the "account_country_code" field (in UserSource) of
  // this cluster's known_fors, together with their fractions.
  10: optional map<string, double> countryCodeToFractionKnownForWithCountryCode

  11: optional ClusterQuality qualityMeasuredOnSimsGraph
  12: optional DistributionDetails logFavScoreDistributionDetails

  // Fraction of languages this cluster's known_fors produce, according to what the
  // penguin_user_languages dataset infers.
  13: optional map<string, double> languageToFractionInferredLanguage
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
/**
 * An edge identified by a follower id and a followee id, with optional
 * weights and scores attached.
 * NOTE(review): the exact meaning of each optional weight/score is not stated
 * in this file; confirm against the job that produces them.
 **/
struct SampledEdge {
  1: required i64 followerId(personalDataType = 'UserId')
  2: required i64 followeeId(personalDataType = 'UserId')
  3: optional double favWtIfFollowEdge
  4: optional double favWtIfFavEdge
  5: optional double followScoreToCluster
  6: optional double favScoreToCluster
  7: optional double predictedFollowScore
  8: optional double predictedFavScore
}(persisted="true", hasPersonalData = 'true')
|
|
|
|
/**
 * Quality figures for a cluster; the key for this value is (modelVersion, clusterId).
 *
 * Fields 1-12 are follow/fav edge counts and fav-weight sums grouped as
 * in-cluster, outgoing and incoming. The remaining fields hold sizes, a list
 * of SampledEdge, correlations and precision values.
 **/
struct BipartiteClusterQuality {
  // In-cluster edges.
  1: optional double inClusterFollowEdges
  2: optional double inClusterFavEdges
  3: optional double favWtSumOfInClusterFollowEdges
  4: optional double favWtSumOfInClusterFavEdges

  // Outgoing edges.
  5: optional double outgoingFollowEdges
  6: optional double outgoingFavEdges
  7: optional double favWtSumOfOutgoingFollowEdges
  8: optional double favWtSumOfOutgoingFavEdges

  // Incoming edges.
  9: optional double incomingFollowEdges
  10: optional double incomingFavEdges
  11: optional double favWtSumOfIncomingFollowEdges
  12: optional double favWtSumOfIncomingFavEdges

  13: optional i32 interestedInSize
  14: optional list<SampledEdge> sampledEdges
  15: optional i32 knownForSize
  16: optional double correlationOfFavWtIfFollowWithPredictedFollow
  17: optional double correlationOfFavWtIfFavWithPredictedFav
  18: optional double relativePrecisionUsingFavWtIfFav
  19: optional double averagePrecisionOfWholeGraphUsingFavWtIfFav
}(persisted="true", hasPersonalData = 'true')
|