the-algorithm/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/StormAggregateSourceUtils.s...

255 lines
10 KiB
Scala

package com.twitter.timelines.prediction.common.aggregates.real_time
import com.twitter.finagle.stats.Counter
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.DataRecordMerger
import com.twitter.ml.api.Feature
import com.twitter.ml.api.RichDataRecord
import com.twitter.ml.featurestore.catalog.entities.core.Author
import com.twitter.ml.featurestore.catalog.entities.core.Tweet
import com.twitter.ml.featurestore.catalog.entities.core.User
import com.twitter.ml.featurestore.lib.online.FeatureStoreClient
import com.twitter.summingbird.Producer
import com.twitter.summingbird.storm.Storm
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import java.lang.{Long => JLong}
import com.twitter.unified_user_actions.thriftscala.ActionType
import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction
private[real_time] object StormAggregateSourceUtils {
// Aliases for the Long-valued ids that key the feature-store join services built in
// wrapByFeatureStoreClient (Storm#Service[Set[UserId], ...], Set[AuthorId], Set[TweetId]).
type UserId = Long
type AuthorId = Long
type TweetId = Long
/**
* Attaches a [[FeatureStoreClient]] to the underlying [[Producer]]. Depending on the
* `keyedBy*Enabled` flags of `jobConfig`, the records are left-joined, in this order, with
* features looked up by user id, by source author id and by source tweet id.
*
* @param underlyingProducer converts a stream of [[com.twitter.clientapp.thriftscala.LogEvent]]
* to a stream of [[DataRecord]].
* @param jobConfig supplies the service identifier for the feature store client and the flags
* that enable each of the three joins.
* @param scopedStatsReceiver used to create the per-join counters and handed to the feature
* store client.
* @return the producer of the last join stage; when all three joins are disabled this is
* `underlyingProducer` itself.
*/
def wrapByFeatureStoreClient(
underlyingProducer: Producer[Storm, Event[DataRecord]],
jobConfig: RealTimeAggregatesJobConfig,
scopedStatsReceiver: StatsReceiver
): Producer[Storm, Event[DataRecord]] = {
// Counters for the user-keyed join.
lazy val keyDataRecordCounter = scopedStatsReceiver.counter("keyDataRecord")
lazy val keyFeatureCounter = scopedStatsReceiver.counter("keyFeature")
lazy val leftDataRecordCounter = scopedStatsReceiver.counter("leftDataRecord")
lazy val rightDataRecordCounter = scopedStatsReceiver.counter("rightDataRecord")
lazy val mergeNumFeaturesCounter = scopedStatsReceiver.counter("mergeNumFeatures")
// Counters for the author-keyed join.
lazy val authorKeyDataRecordCounter = scopedStatsReceiver.counter("authorKeyDataRecord")
lazy val authorKeyFeatureCounter = scopedStatsReceiver.counter("authorKeyFeature")
lazy val authorLeftDataRecordCounter = scopedStatsReceiver.counter("authorLeftDataRecord")
lazy val authorRightDataRecordCounter = scopedStatsReceiver.counter("authorRightDataRecord")
lazy val authorMergeNumFeaturesCounter = scopedStatsReceiver.counter("authorMergeNumFeatures")
// Counters for the tweet-keyed join.
lazy val tweetKeyDataRecordCounter =
scopedStatsReceiver.counter("tweetKeyDataRecord")
lazy val tweetKeyFeatureCounter = scopedStatsReceiver.counter("tweetKeyFeature")
lazy val tweetLeftDataRecordCounter =
scopedStatsReceiver.counter("tweetLeftDataRecord")
lazy val tweetRightDataRecordCounter =
scopedStatsReceiver.counter("tweetRightDataRecord")
lazy val tweetMergeNumFeaturesCounter =
scopedStatsReceiver.counter("tweetMergeNumFeatures")
// One client shared by the user, author and tweet stores created below.
// NOTE(review): presumably lazy and @transient so the client (and the counters above) are
// built where the topology runs instead of being serialized with it -- confirm.
@transient lazy val featureStoreClient: FeatureStoreClient =
FeatureStoreUtils.mkFeatureStoreClient(
serviceIdentifier = jobConfig.serviceIdentifier,
statsReceiver = scopedStatsReceiver
)
// Stage 1: join on SharedFeatures.USER_ID when enabled, otherwise pass the input through.
lazy val joinUserFeaturesDataRecordProducer =
if (jobConfig.keyedByUserEnabled) {
lazy val keyedByUserFeaturesStormService: Storm#Service[Set[UserId], DataRecord] =
Storm.service(
new UserFeaturesReadableStore(
featureStoreClient = featureStoreClient,
userEntity = User,
userFeaturesAdapter = UserFeaturesAdapter
)
)
leftJoinDataRecordProducer(
keyFeature = SharedFeatures.USER_ID,
leftDataRecordProducer = underlyingProducer,
rightStormService = keyedByUserFeaturesStormService,
keyDataRecordCounter = keyDataRecordCounter,
keyFeatureCounter = keyFeatureCounter,
leftDataRecordCounter = leftDataRecordCounter,
rightDataRecordCounter = rightDataRecordCounter,
mergeNumFeaturesCounter = mergeNumFeaturesCounter
)
} else {
underlyingProducer
}
// Stage 2: join the stage-1 output on SOURCE_AUTHOR_ID when enabled. The author store is a
// UserFeaturesReadableStore configured with the Author entity and AuthorFeaturesAdapter.
lazy val joinAuthorFeaturesDataRecordProducer =
if (jobConfig.keyedByAuthorEnabled) {
lazy val keyedByAuthorFeaturesStormService: Storm#Service[Set[AuthorId], DataRecord] =
Storm.service(
new UserFeaturesReadableStore(
featureStoreClient = featureStoreClient,
userEntity = Author,
userFeaturesAdapter = AuthorFeaturesAdapter
)
)
leftJoinDataRecordProducer(
keyFeature = TimelinesSharedFeatures.SOURCE_AUTHOR_ID,
leftDataRecordProducer = joinUserFeaturesDataRecordProducer,
rightStormService = keyedByAuthorFeaturesStormService,
keyDataRecordCounter = authorKeyDataRecordCounter,
keyFeatureCounter = authorKeyFeatureCounter,
leftDataRecordCounter = authorLeftDataRecordCounter,
rightDataRecordCounter = authorRightDataRecordCounter,
mergeNumFeaturesCounter = authorMergeNumFeaturesCounter
)
} else {
joinUserFeaturesDataRecordProducer
}
// Stage 3: join the stage-2 output on SOURCE_TWEET_ID when enabled.
lazy val joinTweetFeaturesDataRecordProducer = {
if (jobConfig.keyedByTweetEnabled) {
lazy val keyedByTweetFeaturesStormService: Storm#Service[Set[TweetId], DataRecord] =
Storm.service(
new TweetFeaturesReadableStore(
featureStoreClient = featureStoreClient,
tweetEntity = Tweet,
tweetFeaturesAdapter = TweetFeaturesAdapter
)
)
leftJoinDataRecordProducer(
keyFeature = TimelinesSharedFeatures.SOURCE_TWEET_ID,
leftDataRecordProducer = joinAuthorFeaturesDataRecordProducer,
rightStormService = keyedByTweetFeaturesStormService,
keyDataRecordCounter = tweetKeyDataRecordCounter,
keyFeatureCounter = tweetKeyFeatureCounter,
leftDataRecordCounter = tweetLeftDataRecordCounter,
rightDataRecordCounter = tweetRightDataRecordCounter,
mergeNumFeaturesCounter = tweetMergeNumFeaturesCounter
)
} else {
joinAuthorFeaturesDataRecordProducer
}
}
// The tweet stage is the end of the chain; forcing it forces the earlier lazy stages too.
joinTweetFeaturesDataRecordProducer
}
// Single merger instance used by mergeDataRecord; it is created on first use.
private[this] lazy val DataRecordMerger: DataRecordMerger = new DataRecordMerger()
/**
 * Builds the join key for a client event data record.
 *
 * @param keyFeature Feature to extract the join key value from: USER_ID, SOURCE_TWEET_ID, etc.
 * @param record DataRecord the key is read from
 * @param keyDataRecordCounter incremented once for every call
 * @param keyFeatureCounter incremented only when `record` has `keyFeature`
 * @return a single-element set holding the value of `keyFeature`, or an empty set when the
 *         record does not have that feature. The set is the key used in the subsequent
 *         leftJoin operation.
 */
private[this] def mkKey(
  keyFeature: Feature[JLong],
  record: DataRecord,
  keyDataRecordCounter: Counter,
  keyFeatureCounter: Counter
): Set[Long] = {
  keyDataRecordCounter.incr()
  val rich = new RichDataRecord(record)
  if (!rich.hasFeature(keyFeature)) {
    // No key feature on this record: the empty key set means nothing is looked up for it.
    Set.empty[Long]
  } else {
    keyFeatureCounter.incr()
    val keyValue: Long = rich.getFeatureValue(keyFeature).toLong
    Set(keyValue)
  }
}
/**
 * After the leftJoin, folds the joined data record (if any) into the client event data record
 * so that a single data record is used for further aggregation.
 *
 * The merger is invoked for its effect on `leftRecord.event`; its return value is not used and
 * the same `leftRecord` instance is returned.
 *
 * @param leftRecord event carrying the client-side data record
 * @param rightRecordOpt record returned by the join, if there was one
 * @param leftDataRecordCounter incremented once for every call
 * @param rightDataRecordCounter incremented when `rightRecordOpt` is defined
 * @param mergeNumFeaturesCounter incremented by the number of features of `leftRecord.event`
 *                                after a merge
 * @return `leftRecord`
 */
private[this] def mergeDataRecord(
  leftRecord: Event[DataRecord],
  rightRecordOpt: Option[DataRecord],
  leftDataRecordCounter: Counter,
  rightDataRecordCounter: Counter,
  mergeNumFeaturesCounter: Counter
): Event[DataRecord] = {
  leftDataRecordCounter.incr()
  rightRecordOpt match {
    case Some(joinedRecord) =>
      rightDataRecordCounter.incr()
      DataRecordMerger.merge(leftRecord.event, joinedRecord)
      val featuresAfterMerge = new RichDataRecord(leftRecord.event).numFeatures()
      mergeNumFeaturesCounter.incr(featuresAfterMerge)
    case None =>
      // Nothing was joined; the left record is passed on untouched.
      ()
  }
  leftRecord
}
/**
 * Left-joins a stream of data record events with a Storm service and merges every looked-up
 * record into the event it was joined with.
 *
 * Only `HomeEvent`s get a key derived from `keyFeature` (see `mkKey`). `ProfileEvent`,
 * `SearchEvent` and `UuaEvent` records are keyed with an empty set and their key counters are
 * not touched.
 *
 * The counters are by-name parameters and are only referenced inside the map functions.
 *
 * @param keyFeature feature whose value is the join key
 * @param leftDataRecordProducer stream of events to enrich
 * @param rightStormService service the keyed events are left-joined against
 * @param keyDataRecordCounter passed to `mkKey` for every `HomeEvent`
 * @param keyFeatureCounter passed to `mkKey` for every `HomeEvent`
 * @param leftDataRecordCounter passed to `mergeDataRecord`
 * @param rightDataRecordCounter passed to `mergeDataRecord`
 * @param mergeNumFeaturesCounter passed to `mergeDataRecord`
 * @return the stream of events after the join and merge
 */
private[this] def leftJoinDataRecordProducer(
  keyFeature: Feature[JLong],
  leftDataRecordProducer: Producer[Storm, Event[DataRecord]],
  rightStormService: Storm#Service[Set[Long], DataRecord],
  keyDataRecordCounter: => Counter,
  keyFeatureCounter: => Counter,
  leftDataRecordCounter: => Counter,
  rightDataRecordCounter: => Counter,
  mergeNumFeaturesCounter: => Counter
): Producer[Storm, Event[DataRecord]] = {
  // Step 1: pair every event with the key set used for the lookup.
  val keyedEvents: Producer[Storm, (Set[Long], Event[DataRecord])] =
    leftDataRecordProducer.map {
      case homeEvent: HomeEvent[DataRecord] =>
        val joinKey = mkKey(
          keyFeature = keyFeature,
          record = homeEvent.event,
          keyDataRecordCounter = keyDataRecordCounter,
          keyFeatureCounter = keyFeatureCounter
        )
        (joinKey, homeEvent)
      case otherEvent @ (_: ProfileEvent[DataRecord] | _: SearchEvent[DataRecord] |
          _: UuaEvent[DataRecord]) =>
        (Set.empty[Long], otherEvent)
    }

  // Step 2: look the keys up, then fold whatever came back into the original event.
  val joinedEvents = keyedEvents.leftJoin(rightStormService)
  joinedEvents.map {
    case (_, (event, lookedUpOpt)) =>
      mergeDataRecord(
        leftRecord = event,
        rightRecordOpt = lookedUpOpt,
        leftDataRecordCounter = leftDataRecordCounter,
        rightDataRecordCounter = rightDataRecordCounter,
        mergeNumFeaturesCounter = mergeNumFeaturesCounter
      )
  }
}
/**
 * Filter for Unified User Actions events: keeps only the supported impression actions whose
 * breadcrumb views indicate a home timeline visit prior to landing on the page.
 *
 * An event is accepted when:
 *  - its action type is `ClientTweetV2Impression` or `ClientProfileV2Impression` and the
 *    breadcrumb views contain "home", or
 *  - its action type is `ClientTweetVideoFullscreenV2Impression` and the breadcrumb views
 *    contain both "home" and "video".
 *
 * @param event the unified user action to inspect
 * @return true when the event matches one of the cases above; false for every other action
 *         type and whenever `breadcrumbViews` is not set.
 */
def isUuaBCEEventsFromHome(event: UnifiedUserAction): Boolean = {
  // An absent breadcrumbViews value never contains anything.
  def breadcrumbViewsContain(view: String): Boolean =
    event.eventMetadata.breadcrumbViews.exists(_.contains(view))

  event.actionType match {
    case ActionType.ClientTweetV2Impression | ActionType.ClientProfileV2Impression =>
      breadcrumbViewsContain("home")
    case ActionType.ClientTweetVideoFullscreenV2Impression =>
      // Short-circuit: "video" is only checked once "home" is known to be present.
      breadcrumbViewsContain("home") && breadcrumbViewsContain("video")
    case _ => false
  }
}
}