the-algorithm/src/scala/com/twitter/simclusters_v2/summingbird/storm/PersistentTweetJob.scala

package com.twitter.simclusters_v2.summingbird.storm

import com.twitter.simclusters_v2.common.TweetId
import com.twitter.simclusters_v2.summingbird.common.Implicits
import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingLongestL2NormMonoid
import com.twitter.simclusters_v2.summingbird.common.StatsUtil
import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore.{
  LatestEmbeddingVersion,
  LongestL2EmbeddingVersion,
  PersistentTweetEmbeddingId
}
import com.twitter.simclusters_v2.thriftscala.{
  PersistentSimClustersEmbedding,
  SimClustersEmbedding,
  SimClustersEmbeddingMetadata
}
import com.twitter.summingbird.option.JobId
import com.twitter.summingbird.{Platform, Producer, TailProducer}
import com.twitter.timelineservice.thriftscala.Event
import com.twitter.tweetypie.thriftscala.StatusCounts

/**
 * The job that saves qualified tweet SimClusters embeddings into the Strato store (backed by Manhattan).
 *
 * The steps:
 * 1. Read from the Favorite stream.
 * 2. Join with the Tweet Status Count service.
 * 3. Filter out tweets whose favorite count is below 8; we consider their SimClusters
 *    embeddings too noisy to be trustworthy.
 * 4. Update the SimClusters tweet embedding with timestamp 0L.
 *    0L is reserved for the latest tweet embedding and is also used to maintain the update count.
 * 5. If the SimClusters tweet embedding's update count reaches a power of two 2^N with N >= 3,
 *    persist the embedding with its timestamp as part of the local key (LK), as illustrated below.
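 *
 * Illustrative cadence (hypothetical numbers): a qualified tweet always keeps one row at the
 * reserved timestamp 0L holding its latest embedding; as its accumulated update count crosses
 * 8, 16, 32, ..., an additional snapshot row is written, keyed by the timestamp of the update
 * that crossed that boundary.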
 **/
private[storm] object PersistentTweetJob {
  import StatsUtil._

  private val MinFavoriteCount = 8

  type Timestamp = Long

  val longestL2NormMonoid = new PersistentSimClustersEmbeddingLongestL2NormMonoid()

  def generate[P <: Platform[P]](
    timelineEventSource: Producer[P, Event],
    tweetStatusCountService: P#Service[TweetId, StatusCounts],
    tweetEmbeddingService: P#Service[TweetId, SimClustersEmbedding],
    persistentTweetEmbeddingStoreWithLatestAggregation: P#Store[
      PersistentTweetEmbeddingId,
      PersistentSimClustersEmbedding
    ],
    persistentTweetEmbeddingStoreWithLongestL2NormAggregation: P#Store[
      PersistentTweetEmbeddingId,
      PersistentSimClustersEmbedding
    ]
  )(
    implicit jobId: JobId
  ): TailProducer[P, Any] = {
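    // Step 1: Extract (tweetId, favorite timestamp) pairs from the timeline favorite events.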
    val timelineEvents: Producer[P, (TweetId, Timestamp)] = timelineEventSource
      .collect {
        case Event.Favorite(favoriteEvent) =>
          (favoriteEvent.tweetId, favoriteEvent.eventTimeMs)
      }
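
    // Steps 2 and 3: Join with the tweet status count service, drop tweets below the favorite
    // threshold, then look up the tweet's SimClusters embedding.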
    val filteredEvents = timelineEvents
      .leftJoin[StatusCounts](tweetStatusCountService)
      .filter {
        case (_, (_, Some(statusCounts))) =>
          // Only keep tweets with at least MinFavoriteCount (8) favorites.
          statusCounts.favoriteCount.exists(_ >= MinFavoriteCount)
        case _ =>
          false
      }
      .leftJoin[SimClustersEmbedding](tweetEmbeddingService)
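
    // Steps 4 and 5: Maintain the latest embedding at the reserved timestamp 0L and, whenever the
    // accumulated update count crosses a power-of-two boundary (>= 8), persist an extra snapshot
    // keyed by the update timestamp.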
    val latestAndPersistentEmbeddingProducer = filteredEvents
      .collect {
        case (tweetId, ((eventTimeMs, _), Some(tweetEmbedding))) =>
          (
            // This special timestamp is a reserved space for the latest tweet embedding.
            PersistentTweetEmbeddingId(tweetId, timestampInMs = LatestEmbeddingVersion),
            PersistentSimClustersEmbedding(
              tweetEmbedding,
              SimClustersEmbeddingMetadata(updatedAtMs = Some(eventTimeMs), updatedCount = Some(1))
            ))
      }
      .observe("num_of_embedding_updates")
      .sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)(
        Implicits.persistentSimClustersEmbeddingMonoid)
      .name("latest_embedding_producer")
      .flatMap {
        case (persistentTweetEmbeddingId, (maybeEmbedding, deltaEmbedding)) =>
          lastQualifiedUpdatedCount(
            maybeEmbedding.flatMap(_.metadata.updatedCount),
            deltaEmbedding.metadata.updatedCount
          ).map { newUpdateCount =>
            (
              persistentTweetEmbeddingId.copy(timestampInMs =
                deltaEmbedding.metadata.updatedAtMs.getOrElse(0L)),
              deltaEmbedding.copy(metadata =
                deltaEmbedding.metadata.copy(updatedCount = Some(newUpdateCount)))
            )
          }
      }
      .observe("num_of_extra_embedding")
      .sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)(
        Implicits.persistentSimClustersEmbeddingMonoid)
      .name("persistent_embeddings_producer")
    val longestL2NormEmbeddingProducer = filteredEvents
      .collect {
        case (tweetId, ((eventTimeMs, Some(statusCounts)), Some(tweetEmbedding))) =>
          (
            // This special timestamp is a reserved space for the longest-L2-norm tweet embedding.
            PersistentTweetEmbeddingId(tweetId, timestampInMs = LongestL2EmbeddingVersion),
            PersistentSimClustersEmbedding(
              tweetEmbedding,
              SimClustersEmbeddingMetadata(
                updatedAtMs = Some(eventTimeMs),
                // We're not aggregating the existing embedding, we're replacing it. The count
                // therefore needs to be the absolute fav count for this tweet, not the delta.
                updatedCount = statusCounts.favoriteCount.map(_ + 1)
              )
            ))
      }
      .observe("num_longest_l2_norm_updates")
      .sumByKey(persistentTweetEmbeddingStoreWithLongestL2NormAggregation)(longestL2NormMonoid)
      .name("longest_l2_norm_embedding_producer")

    latestAndPersistentEmbeddingProducer.also(longestL2NormEmbeddingProducer)
  }

  /*
   If this change in counts crosses one or more powers of two (8, 16, 32, ...), return the largest
   boundary that was crossed. When a count delta is large it may skip over a power of two, and
   thus we may not store an embedding snapshot for every boundary 2^(i+3) <= tweetFavCount (i >= 0).
   */
  private def lastQualifiedUpdatedCount(
    existingUpdateCount: Option[Long],
    deltaUpdateCount: Option[Long]
  ): Option[Int] = {
    val existing = existingUpdateCount.getOrElse(0L)
    val sum = existing + deltaUpdateCount.getOrElse(0L)
    // Pick the largest crossed boundary explicitly; qualifiedSet is an unordered Set, so relying
    // on its iteration order via lastOption would not be deterministic.
    qualifiedSet.filter { i => (existing < i) && (i <= sum) }.reduceOption(_ max _)
  }
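
  // Illustrative examples: lastQualifiedUpdatedCount(Some(7L), Some(10L)) sums to 17, crosses the
  // boundaries 8 and 16, and returns Some(16); lastQualifiedUpdatedCount(Some(9L), Some(2L)) sums
  // to 11, crosses no boundary, and returns None.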

  // Only powers of two 2^n with n >= 3 qualify for persistence; the largest boundary is 2^24 = 16,777,216.
  private lazy val qualifiedSet =
    3.until(25).map { i => Math.pow(2, i).toInt }.toSet
}