mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-05-29 07:18:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
152 lines
6.2 KiB
Scala
152 lines
6.2 KiB
Scala
package com.twitter.simclusters_v2.summingbird.storm
|
|
|
|
import com.twitter.simclusters_v2.common.TweetId
|
|
import com.twitter.simclusters_v2.summingbird.common.Implicits
|
|
import com.twitter.simclusters_v2.summingbird.common.Monoids.PersistentSimClustersEmbeddingLongestL2NormMonoid
|
|
import com.twitter.simclusters_v2.summingbird.common.StatsUtil
|
|
import com.twitter.simclusters_v2.summingbird.stores.PersistentTweetEmbeddingStore.{
|
|
LatestEmbeddingVersion,
|
|
LongestL2EmbeddingVersion,
|
|
PersistentTweetEmbeddingId
|
|
}
|
|
import com.twitter.simclusters_v2.thriftscala.{
|
|
PersistentSimClustersEmbedding,
|
|
SimClustersEmbedding,
|
|
SimClustersEmbeddingMetadata
|
|
}
|
|
import com.twitter.summingbird.option.JobId
|
|
import com.twitter.summingbird.{Platform, Producer, TailProducer}
|
|
import com.twitter.timelineservice.thriftscala.Event
|
|
import com.twitter.tweetypie.thriftscala.StatusCounts
|
|
|
|
/**
|
|
* The job to save the qualified tweet SimClustersEmbedding into Strato Store(Back by Manhattan).
|
|
*
|
|
* The steps
|
|
* 1. Read from Favorite Stream.
|
|
* 2. Join with Tweet Status Count Service.
|
|
* 3. Filter out the tweets whose favorite count < 8.
|
|
* We consider these tweets' SimClusters embedding is too noisy and untrustable.
|
|
* 4. Update the SimClusters Tweet embedding with timestamp 0L.
|
|
* 0L is reserved for the latest tweet embedding. It's also used to maintain the tweet count.
|
|
* 5. If the SimClusters Tweet embedding's update count is 2 power N & N >= 3.
|
|
* Persistent the embeddings with the timestamp as part of the LK.
|
|
**/
|
|
private[storm] object PersistentTweetJob {
|
|
import StatsUtil._
|
|
|
|
private val MinFavoriteCount = 8
|
|
type Timestamp = Long
|
|
|
|
val longestL2NormMonoid = new PersistentSimClustersEmbeddingLongestL2NormMonoid()
|
|
|
|
def generate[P <: Platform[P]](
|
|
timelineEventSource: Producer[P, Event],
|
|
tweetStatusCountService: P#Service[TweetId, StatusCounts],
|
|
tweetEmbeddingService: P#Service[TweetId, SimClustersEmbedding],
|
|
persistentTweetEmbeddingStoreWithLatestAggregation: P#Store[
|
|
PersistentTweetEmbeddingId,
|
|
PersistentSimClustersEmbedding
|
|
],
|
|
persistentTweetEmbeddingStoreWithLongestL2NormAggregation: P#Store[
|
|
PersistentTweetEmbeddingId,
|
|
PersistentSimClustersEmbedding
|
|
]
|
|
)(
|
|
implicit jobId: JobId
|
|
): TailProducer[P, Any] = {
|
|
|
|
val timelineEvents: Producer[P, (TweetId, Timestamp)] = timelineEventSource
|
|
.collect {
|
|
case Event.Favorite(favoriteEvent) =>
|
|
(favoriteEvent.tweetId, favoriteEvent.eventTimeMs)
|
|
}
|
|
|
|
val filteredEvents = timelineEvents
|
|
.leftJoin[StatusCounts](tweetStatusCountService)
|
|
.filter {
|
|
case (_, (_, Some(statusCounts))) =>
|
|
// Only consider tweets which has more than 8 favorite
|
|
statusCounts.favoriteCount.exists(_ >= MinFavoriteCount)
|
|
case _ =>
|
|
false
|
|
}
|
|
.leftJoin[SimClustersEmbedding](tweetEmbeddingService)
|
|
|
|
val latestAndPersistentEmbeddingProducer = filteredEvents
|
|
.collect {
|
|
case (tweetId, ((eventTimeMs, _), Some(tweetEmbedding))) =>
|
|
(
|
|
// This special timestamp is a reserved space for the latest tweet embedding.
|
|
PersistentTweetEmbeddingId(tweetId, timestampInMs = LatestEmbeddingVersion),
|
|
PersistentSimClustersEmbedding(
|
|
tweetEmbedding,
|
|
SimClustersEmbeddingMetadata(updatedAtMs = Some(eventTimeMs), updatedCount = Some(1))
|
|
))
|
|
}
|
|
.observe("num_of_embedding_updates")
|
|
.sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)(
|
|
Implicits.persistentSimClustersEmbeddingMonoid)
|
|
.name("latest_embedding_producer")
|
|
.flatMap {
|
|
case (persistentTweetEmbeddingId, (maybeEmbedding, deltaEmbedding)) =>
|
|
lastQualifiedUpdatedCount(
|
|
maybeEmbedding.flatMap(_.metadata.updatedCount),
|
|
deltaEmbedding.metadata.updatedCount
|
|
).map { newUpdateCount =>
|
|
(
|
|
persistentTweetEmbeddingId.copy(timestampInMs =
|
|
deltaEmbedding.metadata.updatedAtMs.getOrElse(0L)),
|
|
deltaEmbedding.copy(metadata =
|
|
deltaEmbedding.metadata.copy(updatedCount = Some(newUpdateCount)))
|
|
)
|
|
}
|
|
}
|
|
.observe("num_of_extra_embedding")
|
|
.sumByKey(persistentTweetEmbeddingStoreWithLatestAggregation)(
|
|
Implicits.persistentSimClustersEmbeddingMonoid)
|
|
.name("persistent_embeddings_producer")
|
|
|
|
val longestL2NormEmbeddingProducer = filteredEvents
|
|
.collect {
|
|
case (tweetId, ((eventTimeMs, Some(statusCounts)), Some(tweetEmbedding))) =>
|
|
(
|
|
// This special timestamp is a reserved space for the latest tweet embedding.
|
|
PersistentTweetEmbeddingId(tweetId, timestampInMs = LongestL2EmbeddingVersion),
|
|
PersistentSimClustersEmbedding(
|
|
tweetEmbedding,
|
|
SimClustersEmbeddingMetadata(
|
|
updatedAtMs = Some(eventTimeMs),
|
|
// We're not aggregating the existing embedding, we're replacing it. The count
|
|
// therefore needs to be the absolute fav count for this tweet, not the delta.
|
|
updatedCount = statusCounts.favoriteCount.map(_ + 1)
|
|
)
|
|
))
|
|
}
|
|
.observe("num_longest_l2_norm_updates")
|
|
.sumByKey(persistentTweetEmbeddingStoreWithLongestL2NormAggregation)(longestL2NormMonoid)
|
|
.name("longest_l2_norm_embedding_producer")
|
|
|
|
latestAndPersistentEmbeddingProducer.also(longestL2NormEmbeddingProducer)
|
|
}
|
|
|
|
/*
|
|
If this change in counts crosses one or more powers of 2 (8,16,32...), return the last boundary
|
|
that was crossed. In the case where a count delta is large, it may skip a power of 2, and
|
|
thus we may not store embeddings for all 2^(i+3) where 0 <= i <= tweetFavCount.
|
|
*/
|
|
private def lastQualifiedUpdatedCount(
|
|
existingUpdateCount: Option[Long],
|
|
deltaUpdateCount: Option[Long]
|
|
): Option[Int] = {
|
|
val existing = existingUpdateCount.getOrElse(0L)
|
|
val sum = existing + deltaUpdateCount.getOrElse(0L)
|
|
qualifiedSet.filter { i => (existing < i) && (i <= sum) }.lastOption
|
|
}
|
|
|
|
// Only 2 Power n while n >= 3 is qualified for Persistent. The max = 16,777,216
|
|
private lazy val qualifiedSet = 3
|
|
.until(25).map { i => Math.pow(2, i).toInt }.toSet
|
|
|
|
}
|