475 lines
17 KiB
Scala
475 lines
17 KiB
Scala
package com.twitter.representationscorer.twistlyfeatures
|
|
|
|
import com.twitter.finagle.stats.Counter
|
|
import com.twitter.finagle.stats.StatsReceiver
|
|
import com.twitter.representationscorer.common.TweetId
|
|
import com.twitter.representationscorer.common.UserId
|
|
import com.twitter.representationscorer.scorestore.ScoreStore
|
|
import com.twitter.representationscorer.thriftscala.SimClustersRecentEngagementSimilarities
|
|
import com.twitter.simclusters_v2.thriftscala.EmbeddingType
|
|
import com.twitter.simclusters_v2.thriftscala.InternalId
|
|
import com.twitter.simclusters_v2.thriftscala.ModelVersion
|
|
import com.twitter.simclusters_v2.thriftscala.ScoreId
|
|
import com.twitter.simclusters_v2.thriftscala.ScoreInternalId
|
|
import com.twitter.simclusters_v2.thriftscala.ScoringAlgorithm
|
|
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingId
|
|
import com.twitter.simclusters_v2.thriftscala.SimClustersEmbeddingPairScoreId
|
|
import com.twitter.stitch.Stitch
|
|
import javax.inject.Inject
|
|
|
|
class Scorer @Inject() (
|
|
fetchEngagementsFromUSS: Long => Stitch[Engagements],
|
|
scoreStore: ScoreStore,
|
|
stats: StatsReceiver) {
|
|
|
|
import Scorer._
|
|
|
|
private val scoreStats = stats.scope("score")
|
|
private val scoreCalculationStats = scoreStats.scope("calculation")
|
|
private val scoreResultStats = scoreStats.scope("result")
|
|
|
|
private val scoresNonEmptyCounter = scoreResultStats.scope("all").counter("nonEmpty")
|
|
private val scoresNonZeroCounter = scoreResultStats.scope("all").counter("nonZero")
|
|
|
|
private val tweetScoreStats = scoreCalculationStats.scope("tweetScore").stat("latency")
|
|
private val userScoreStats = scoreCalculationStats.scope("userScore").stat("latency")
|
|
|
|
private val favNonZero = scoreResultStats.scope("favs").counter("nonZero")
|
|
private val favNonEmpty = scoreResultStats.scope("favs").counter("nonEmpty")
|
|
|
|
private val retweetsNonZero = scoreResultStats.scope("retweets").counter("nonZero")
|
|
private val retweetsNonEmpty = scoreResultStats.scope("retweets").counter("nonEmpty")
|
|
|
|
private val followsNonZero = scoreResultStats.scope("follows").counter("nonZero")
|
|
private val followsNonEmpty = scoreResultStats.scope("follows").counter("nonEmpty")
|
|
|
|
private val sharesNonZero = scoreResultStats.scope("shares").counter("nonZero")
|
|
private val sharesNonEmpty = scoreResultStats.scope("shares").counter("nonEmpty")
|
|
|
|
private val repliesNonZero = scoreResultStats.scope("replies").counter("nonZero")
|
|
private val repliesNonEmpty = scoreResultStats.scope("replies").counter("nonEmpty")
|
|
|
|
private val originalTweetsNonZero = scoreResultStats.scope("originalTweets").counter("nonZero")
|
|
private val originalTweetsNonEmpty = scoreResultStats.scope("originalTweets").counter("nonEmpty")
|
|
|
|
private val videoViewsNonZero = scoreResultStats.scope("videoViews").counter("nonZero")
|
|
private val videoViewsNonEmpty = scoreResultStats.scope("videoViews").counter("nonEmpty")
|
|
|
|
private val blockNonZero = scoreResultStats.scope("block").counter("nonZero")
|
|
private val blockNonEmpty = scoreResultStats.scope("block").counter("nonEmpty")
|
|
|
|
private val muteNonZero = scoreResultStats.scope("mute").counter("nonZero")
|
|
private val muteNonEmpty = scoreResultStats.scope("mute").counter("nonEmpty")
|
|
|
|
private val reportNonZero = scoreResultStats.scope("report").counter("nonZero")
|
|
private val reportNonEmpty = scoreResultStats.scope("report").counter("nonEmpty")
|
|
|
|
private val dontlikeNonZero = scoreResultStats.scope("dontlike").counter("nonZero")
|
|
private val dontlikeNonEmpty = scoreResultStats.scope("dontlike").counter("nonEmpty")
|
|
|
|
private val seeFewerNonZero = scoreResultStats.scope("seeFewer").counter("nonZero")
|
|
private val seeFewerNonEmpty = scoreResultStats.scope("seeFewer").counter("nonEmpty")
|
|
|
|
private def getTweetScores(
|
|
candidateTweetId: TweetId,
|
|
sourceTweetIds: Seq[TweetId]
|
|
): Stitch[Seq[ScoreResult]] = {
|
|
val getScoresStitch = Stitch.traverse(sourceTweetIds) { sourceTweetId =>
|
|
scoreStore
|
|
.uniformScoringStoreStitch(getTweetScoreId(sourceTweetId, candidateTweetId))
|
|
.liftNotFoundToOption
|
|
.map(score => ScoreResult(sourceTweetId, score.map(_.score)))
|
|
}
|
|
|
|
Stitch.time(getScoresStitch).flatMap {
|
|
case (tryResult, duration) =>
|
|
tweetScoreStats.add(duration.inMillis)
|
|
Stitch.const(tryResult)
|
|
}
|
|
}
|
|
|
|
private def getUserScores(
|
|
tweetId: TweetId,
|
|
authorIds: Seq[UserId]
|
|
): Stitch[Seq[ScoreResult]] = {
|
|
val getScoresStitch = Stitch.traverse(authorIds) { authorId =>
|
|
scoreStore
|
|
.uniformScoringStoreStitch(getAuthorScoreId(authorId, tweetId))
|
|
.liftNotFoundToOption
|
|
.map(score => ScoreResult(authorId, score.map(_.score)))
|
|
}
|
|
|
|
Stitch.time(getScoresStitch).flatMap {
|
|
case (tryResult, duration) =>
|
|
userScoreStats.add(duration.inMillis)
|
|
Stitch.const(tryResult)
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Get the [[SimClustersRecentEngagementSimilarities]] result containing the similarity
|
|
* features for the given userId-TweetId.
|
|
*/
|
|
def get(
|
|
userId: UserId,
|
|
tweetId: TweetId
|
|
): Stitch[SimClustersRecentEngagementSimilarities] = {
|
|
get(userId, Seq(tweetId)).map(x => x.head)
|
|
}
|
|
|
|
/**
|
|
* Get a list of [[SimClustersRecentEngagementSimilarities]] results containing the similarity
|
|
* features for the given tweets of the user Id.
|
|
* Guaranteed to be the same number/order as requested.
|
|
*/
|
|
def get(
|
|
userId: UserId,
|
|
tweetIds: Seq[TweetId]
|
|
): Stitch[Seq[SimClustersRecentEngagementSimilarities]] = {
|
|
fetchEngagementsFromUSS(userId)
|
|
.flatMap(engagements => {
|
|
// For each tweet received in the request, compute the similarity scores between them
|
|
// and the user signals fetched from USS.
|
|
Stitch
|
|
.join(
|
|
Stitch.traverse(tweetIds)(id => getTweetScores(id, engagements.tweetIds)),
|
|
Stitch.traverse(tweetIds)(id => getUserScores(id, engagements.authorIds)),
|
|
)
|
|
.map {
|
|
case (tweetScoresSeq, userScoreSeq) =>
|
|
// All seq have = size because when scores don't exist, they are returned as Option
|
|
(tweetScoresSeq, userScoreSeq).zipped.map { (tweetScores, userScores) =>
|
|
computeSimilarityScoresPerTweet(
|
|
engagements,
|
|
tweetScores.groupBy(_.id),
|
|
userScores.groupBy(_.id))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
/**
|
|
*
|
|
* Computes the [[SimClustersRecentEngagementSimilarities]]
|
|
* using the given tweet-tweet and user-tweet scores in TweetScoresMap
|
|
* and the user signals in [[Engagements]].
|
|
*/
|
|
private def computeSimilarityScoresPerTweet(
|
|
engagements: Engagements,
|
|
tweetScores: Map[TweetId, Seq[ScoreResult]],
|
|
authorScores: Map[UserId, Seq[ScoreResult]]
|
|
): SimClustersRecentEngagementSimilarities = {
|
|
val favs7d = engagements.favs7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val favs1d = engagements.favs1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val retweets7d = engagements.retweets7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val retweets1d = engagements.retweets1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val follows30d = engagements.follows30d.view
|
|
.flatMap(s => authorScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val follows7d = engagements.follows7d.view
|
|
.flatMap(s => authorScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val shares7d = engagements.shares7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val shares1d = engagements.shares1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val replies7d = engagements.replies7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val replies1d = engagements.replies1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val originalTweets7d = engagements.originalTweets7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val originalTweets1d = engagements.originalTweets1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val videoViews7d = engagements.videoPlaybacks7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val videoViews1d = engagements.videoPlaybacks1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val block30d = engagements.block30d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val block7d = engagements.block7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val block1d = engagements.block1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val mute30d = engagements.mute30d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val mute7d = engagements.mute7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val mute1d = engagements.mute1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val report30d = engagements.report30d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val report7d = engagements.report7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val report1d = engagements.report1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val dontlike30d = engagements.dontlike30d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val dontlike7d = engagements.dontlike7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val dontlike1d = engagements.dontlike1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val seeFewer30d = engagements.seeFewer30d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val seeFewer7d = engagements.seeFewer7d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val seeFewer1d = engagements.seeFewer1d.view
|
|
.flatMap(s => tweetScores.get(s.targetId))
|
|
.flatten.flatMap(_.score)
|
|
.force
|
|
|
|
val result = SimClustersRecentEngagementSimilarities(
|
|
fav1dLast10Max = max(favs1d),
|
|
fav1dLast10Avg = avg(favs1d),
|
|
fav7dLast10Max = max(favs7d),
|
|
fav7dLast10Avg = avg(favs7d),
|
|
retweet1dLast10Max = max(retweets1d),
|
|
retweet1dLast10Avg = avg(retweets1d),
|
|
retweet7dLast10Max = max(retweets7d),
|
|
retweet7dLast10Avg = avg(retweets7d),
|
|
follow7dLast10Max = max(follows7d),
|
|
follow7dLast10Avg = avg(follows7d),
|
|
follow30dLast10Max = max(follows30d),
|
|
follow30dLast10Avg = avg(follows30d),
|
|
share1dLast10Max = max(shares1d),
|
|
share1dLast10Avg = avg(shares1d),
|
|
share7dLast10Max = max(shares7d),
|
|
share7dLast10Avg = avg(shares7d),
|
|
reply1dLast10Max = max(replies1d),
|
|
reply1dLast10Avg = avg(replies1d),
|
|
reply7dLast10Max = max(replies7d),
|
|
reply7dLast10Avg = avg(replies7d),
|
|
originalTweet1dLast10Max = max(originalTweets1d),
|
|
originalTweet1dLast10Avg = avg(originalTweets1d),
|
|
originalTweet7dLast10Max = max(originalTweets7d),
|
|
originalTweet7dLast10Avg = avg(originalTweets7d),
|
|
videoPlayback1dLast10Max = max(videoViews1d),
|
|
videoPlayback1dLast10Avg = avg(videoViews1d),
|
|
videoPlayback7dLast10Max = max(videoViews7d),
|
|
videoPlayback7dLast10Avg = avg(videoViews7d),
|
|
block1dLast10Max = max(block1d),
|
|
block1dLast10Avg = avg(block1d),
|
|
block7dLast10Max = max(block7d),
|
|
block7dLast10Avg = avg(block7d),
|
|
block30dLast10Max = max(block30d),
|
|
block30dLast10Avg = avg(block30d),
|
|
mute1dLast10Max = max(mute1d),
|
|
mute1dLast10Avg = avg(mute1d),
|
|
mute7dLast10Max = max(mute7d),
|
|
mute7dLast10Avg = avg(mute7d),
|
|
mute30dLast10Max = max(mute30d),
|
|
mute30dLast10Avg = avg(mute30d),
|
|
report1dLast10Max = max(report1d),
|
|
report1dLast10Avg = avg(report1d),
|
|
report7dLast10Max = max(report7d),
|
|
report7dLast10Avg = avg(report7d),
|
|
report30dLast10Max = max(report30d),
|
|
report30dLast10Avg = avg(report30d),
|
|
dontlike1dLast10Max = max(dontlike1d),
|
|
dontlike1dLast10Avg = avg(dontlike1d),
|
|
dontlike7dLast10Max = max(dontlike7d),
|
|
dontlike7dLast10Avg = avg(dontlike7d),
|
|
dontlike30dLast10Max = max(dontlike30d),
|
|
dontlike30dLast10Avg = avg(dontlike30d),
|
|
seeFewer1dLast10Max = max(seeFewer1d),
|
|
seeFewer1dLast10Avg = avg(seeFewer1d),
|
|
seeFewer7dLast10Max = max(seeFewer7d),
|
|
seeFewer7dLast10Avg = avg(seeFewer7d),
|
|
seeFewer30dLast10Max = max(seeFewer30d),
|
|
seeFewer30dLast10Avg = avg(seeFewer30d),
|
|
)
|
|
trackStats(result)
|
|
result
|
|
}
|
|
|
|
private def trackStats(result: SimClustersRecentEngagementSimilarities): Unit = {
|
|
val scores = Seq(
|
|
result.fav7dLast10Max,
|
|
result.retweet7dLast10Max,
|
|
result.follow30dLast10Max,
|
|
result.share1dLast10Max,
|
|
result.share7dLast10Max,
|
|
result.reply7dLast10Max,
|
|
result.originalTweet7dLast10Max,
|
|
result.videoPlayback7dLast10Max,
|
|
result.block30dLast10Max,
|
|
result.mute30dLast10Max,
|
|
result.report30dLast10Max,
|
|
result.dontlike30dLast10Max,
|
|
result.seeFewer30dLast10Max
|
|
)
|
|
|
|
val nonEmpty = scores.exists(_.isDefined)
|
|
val nonZero = scores.exists { case Some(score) if score > 0 => true; case _ => false }
|
|
|
|
if (nonEmpty) {
|
|
scoresNonEmptyCounter.incr()
|
|
}
|
|
|
|
if (nonZero) {
|
|
scoresNonZeroCounter.incr()
|
|
}
|
|
|
|
// We use the largest window of a given type of score,
|
|
// because the largest window is inclusive of smaller windows.
|
|
trackSignalStats(favNonEmpty, favNonZero, result.fav7dLast10Avg)
|
|
trackSignalStats(retweetsNonEmpty, retweetsNonZero, result.retweet7dLast10Avg)
|
|
trackSignalStats(followsNonEmpty, followsNonZero, result.follow30dLast10Avg)
|
|
trackSignalStats(sharesNonEmpty, sharesNonZero, result.share7dLast10Avg)
|
|
trackSignalStats(repliesNonEmpty, repliesNonZero, result.reply7dLast10Avg)
|
|
trackSignalStats(originalTweetsNonEmpty, originalTweetsNonZero, result.originalTweet7dLast10Avg)
|
|
trackSignalStats(videoViewsNonEmpty, videoViewsNonZero, result.videoPlayback7dLast10Avg)
|
|
trackSignalStats(blockNonEmpty, blockNonZero, result.block30dLast10Avg)
|
|
trackSignalStats(muteNonEmpty, muteNonZero, result.mute30dLast10Avg)
|
|
trackSignalStats(reportNonEmpty, reportNonZero, result.report30dLast10Avg)
|
|
trackSignalStats(dontlikeNonEmpty, dontlikeNonZero, result.dontlike30dLast10Avg)
|
|
trackSignalStats(seeFewerNonEmpty, seeFewerNonZero, result.seeFewer30dLast10Avg)
|
|
}
|
|
|
|
private def trackSignalStats(nonEmpty: Counter, nonZero: Counter, score: Option[Double]): Unit = {
|
|
if (score.nonEmpty) {
|
|
nonEmpty.incr()
|
|
|
|
if (score.get > 0)
|
|
nonZero.incr()
|
|
}
|
|
}
|
|
}
|
|
|
|
object Scorer {
|
|
def avg(s: Traversable[Double]): Option[Double] =
|
|
if (s.isEmpty) None else Some(s.sum / s.size)
|
|
def max(s: Traversable[Double]): Option[Double] =
|
|
if (s.isEmpty) None else Some(s.foldLeft(0.0D) { (curr, _max) => math.max(curr, _max) })
|
|
|
|
private def getAuthorScoreId(
|
|
userId: UserId,
|
|
tweetId: TweetId
|
|
) = {
|
|
ScoreId(
|
|
algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
|
|
internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
|
|
SimClustersEmbeddingPairScoreId(
|
|
SimClustersEmbeddingId(
|
|
internalId = InternalId.UserId(userId),
|
|
modelVersion = ModelVersion.Model20m145k2020,
|
|
embeddingType = EmbeddingType.FavBasedProducer
|
|
),
|
|
SimClustersEmbeddingId(
|
|
internalId = InternalId.TweetId(tweetId),
|
|
modelVersion = ModelVersion.Model20m145k2020,
|
|
embeddingType = EmbeddingType.LogFavBasedTweet
|
|
)
|
|
))
|
|
)
|
|
}
|
|
|
|
private def getTweetScoreId(
|
|
sourceTweetId: TweetId,
|
|
candidateTweetId: TweetId
|
|
) = {
|
|
ScoreId(
|
|
algorithm = ScoringAlgorithm.PairEmbeddingCosineSimilarity,
|
|
internalId = ScoreInternalId.SimClustersEmbeddingPairScoreId(
|
|
SimClustersEmbeddingPairScoreId(
|
|
SimClustersEmbeddingId(
|
|
internalId = InternalId.TweetId(sourceTweetId),
|
|
modelVersion = ModelVersion.Model20m145k2020,
|
|
embeddingType = EmbeddingType.LogFavLongestL2EmbeddingTweet
|
|
),
|
|
SimClustersEmbeddingId(
|
|
internalId = InternalId.TweetId(candidateTweetId),
|
|
modelVersion = ModelVersion.Model20m145k2020,
|
|
embeddingType = EmbeddingType.LogFavBasedTweet
|
|
)
|
|
))
|
|
)
|
|
}
|
|
}
|