// the-algorithm/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala
package com.twitter.timelines.prediction.common.aggregates.real_time
import com.twitter.clientapp.thriftscala.LogEvent
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.Counter
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.snowflake.id.SnowflakeId
import com.twitter.summingbird._
import com.twitter.summingbird.storm.Storm
import com.twitter.summingbird_internal.sources.AppId
import com.twitter.summingbird_internal.sources.storm.remote.ClientEventSourceScrooge2
import com.twitter.timelines.data_processing.ad_hoc.suggests.common.AllScribeProcessor
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.RealTimeAggregatesJobConfig
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.StormAggregateSource
import com.twitter.timelines.prediction.adapters.client_log_event.ClientLogEventAdapter
import com.twitter.timelines.prediction.adapters.client_log_event.ProfileClientLogEventAdapter
import com.twitter.timelines.prediction.adapters.client_log_event.SearchClientLogEventAdapter
import com.twitter.timelines.prediction.adapters.client_log_event.UuaEventAdapter
import com.twitter.unified_user_actions.client.config.KafkaConfigs
import com.twitter.unified_user_actions.client.summingbird.UnifiedUserActionsSourceScrooge
import com.twitter.unified_user_actions.thriftscala.UnifiedUserAction
import scala.collection.JavaConverters._
/**
 * Storm Producer for client events generated on Home, Profile, and Search.
 *
 * Builds the real-time aggregates input stream ("timelines_rta") by merging four
 * upstream sources -- Home Timeline client events, Profile client events, Search
 * client events, and Unified User Actions (UUA) -- converting each raw event into
 * zero or more [[DataRecord]]s, applying the job's samplers, and finally joining
 * the records against Feature Store features.
 */
class TimelinesStormAggregateSource extends StormAggregateSource {

  override val name = "timelines_rta"
  // Feature the aggregation framework uses as each record's timestamp.
  override val timestampFeature = SharedFeatures.TIMESTAMP

  // Labels attached to the producers below via `.name(...)`; they identify the
  // corresponding nodes in the Summingbird/Storm topology.
  private lazy val TimelinesClientEventSourceName = "TL_EVENTS_SOURCE"
  private lazy val ProfileClientEventSourceName = "PROFILE_EVENTS_SOURCE"
  private lazy val SearchClientEventSourceName = "SEARCH_EVENTS_SOURCE"
  private lazy val UuaEventSourceName = "UUA_EVENTS_SOURCE"
  private lazy val CombinedProducerName = "COMBINED_PRODUCER"
  private lazy val FeatureStoreProducerName = "FEATURE_STORE_PRODUCER"

  /**
   * True when the engaging user's account is less than 30 days old, derived from
   * the creation timestamp embedded in the Snowflake user id (ids that are not
   * Snowflake ids, or events with no userId, yield false).
   *
   * NOTE(review): not referenced anywhere in this file -- the "all users" comment
   * on the search source below suggests a new-user filter was removed. Confirm no
   * other caller exists before deleting.
   */
  private def isNewUserEvent(event: LogEvent): Boolean = {
    event.logBase.flatMap(_.userId).flatMap(SnowflakeId.timeFromIdOpt).exists(_.untilNow < 30.days)
  }

  /**
   * Converts a Home Timeline client event into DataRecords via
   * [[ClientLogEventAdapter]]. Events that are not valid suggest-tweet events are
   * dropped (empty result). Bumps `dataRecordCounter` by the number of records
   * emitted.
   */
  private def mkDataRecords(event: LogEvent, dataRecordCounter: Counter): Seq[DataRecord] = {
    val dataRecords: Seq[DataRecord] =
      if (AllScribeProcessor.isValidSuggestTweetEvent(event)) {
        ClientLogEventAdapter.adaptToDataRecords(event).asScala
      } else {
        Seq.empty[DataRecord]
      }
    dataRecordCounter.incr(dataRecords.size)
    dataRecords
  }

  /**
   * Converts a Profile client event into DataRecords via
   * [[ProfileClientLogEventAdapter]]; bumps `dataRecordCounter` by the number of
   * records emitted.
   */
  private def mkProfileDataRecords(
    event: LogEvent,
    dataRecordCounter: Counter
  ): Seq[DataRecord] = {
    val dataRecords: Seq[DataRecord] =
      ProfileClientLogEventAdapter.adaptToDataRecords(event).asScala
    dataRecordCounter.incr(dataRecords.size)
    dataRecords
  }

  /**
   * Converts a Search client event into DataRecords via
   * [[SearchClientLogEventAdapter]]; bumps `dataRecordCounter` by the number of
   * records emitted.
   */
  private def mkSearchDataRecords(
    event: LogEvent,
    dataRecordCounter: Counter
  ): Seq[DataRecord] = {
    val dataRecords: Seq[DataRecord] =
      SearchClientLogEventAdapter.adaptToDataRecords(event).asScala
    dataRecordCounter.incr(dataRecords.size)
    dataRecords
  }

  /**
   * Converts a Unified User Action event into DataRecords via [[UuaEventAdapter]];
   * bumps `dataRecordCounter` by the number of records emitted.
   */
  private def mkUuaDataRecords(
    event: UnifiedUserAction,
    dataRecordCounter: Counter
  ): Seq[DataRecord] = {
    val dataRecords: Seq[DataRecord] =
      UuaEventAdapter.adaptToDataRecords(event).asScala
    dataRecordCounter.incr(dataRecords.size)
    dataRecords
  }

  /**
   * Wires up the full Storm topology and returns the final DataRecord producer:
   * source -> wrap in per-surface Event -> merge -> adapt to DataRecords ->
   * sample -> Feature Store join -> unwrap.
   *
   * @param statsReceiver stats sink; scoped here to this class's simple name
   * @param jobConfig     supplies the Kafka appId and the per-record samplers
   *                      applied via `sequentiallyTransform`
   */
  override def build(
    statsReceiver: StatsReceiver,
    jobConfig: RealTimeAggregatesJobConfig
  ): Producer[Storm, DataRecord] = {
    lazy val scopedStatsReceiver = statsReceiver.scope(getClass.getSimpleName)
    lazy val dataRecordCounter = scopedStatsReceiver.counter("dataRecord")

    // Home Timeline Engagements
    // Step 1: => LogEvent
    lazy val clientEventProducer: Producer[Storm, HomeEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_suggests",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source.map(HomeEvent[LogEvent]).name(TimelinesClientEventSourceName)

    // Profile Engagements
    // Step 1: => LogEvent
    lazy val profileClientEventProducer: Producer[Storm, ProfileEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_profile_real_time_engagement_metrics",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source
        .map(ProfileEvent[LogEvent])
        .name(ProfileClientEventSourceName)

    // Search Engagements
    // Step 1: => LogEvent
    // Only process events for all users to save resource
    lazy val searchClientEventProducer: Producer[Storm, SearchEvent[LogEvent]] =
      ClientEventSourceScrooge2(
        appId = AppId(jobConfig.appId),
        topic = "julep_client_event_search_real_time_engagement_metrics",
        resumeAtLastReadOffset = false,
        enableTls = true
      ).source
        .map(SearchEvent[LogEvent])
        .name(SearchClientEventSourceName)

    // Unified User Actions (includes Home and other product surfaces)
    // Restricted to behavioral-client-event actions originating from Home.
    lazy val uuaEventProducer: Producer[Storm, UuaEvent[UnifiedUserAction]] =
      UnifiedUserActionsSourceScrooge(
        appId = AppId(jobConfig.appId),
        parallelism = 10,
        kafkaConfig = KafkaConfigs.ProdUnifiedUserActionsEngagementOnly
      ).source
        .filter(StormAggregateSourceUtils.isUuaBCEEventsFromHome(_))
        .map(UuaEvent[UnifiedUserAction])
        .name(UuaEventSourceName)

    // Combined
    // Step 2:
    // (a) Combine
    // (b) Transform LogEvent => Seq[DataRecord]
    // (c) Apply sampler
    //
    // NOTE(review): the type parameters in the matches below (e.g.
    // `HomeEvent[LogEvent]`) are erased at runtime; dispatch relies only on the
    // outer wrapper class. That is sufficient here because every element was
    // wrapped in its surface-specific Event type before the merge.
    lazy val combinedClientEventDataRecordProducer: Producer[Storm, Event[DataRecord]] =
      profileClientEventProducer // This becomes the bottom branch
        .merge(clientEventProducer) // This becomes the middle branch
        .merge(searchClientEventProducer)
        .merge(uuaEventProducer) // This becomes the top
        .flatMap { // LogEvent => Seq[DataRecord]
          case e: HomeEvent[LogEvent] =>
            mkDataRecords(e.event, dataRecordCounter).map(HomeEvent[DataRecord])
          case e: ProfileEvent[LogEvent] =>
            mkProfileDataRecords(e.event, dataRecordCounter).map(ProfileEvent[DataRecord])
          case e: SearchEvent[LogEvent] =>
            mkSearchDataRecords(e.event, dataRecordCounter).map(SearchEvent[DataRecord])
          case e: UuaEvent[UnifiedUserAction] =>
            mkUuaDataRecords(
              e.event,
              dataRecordCounter
            ).map(UuaEvent[DataRecord])
        }
        .flatMap { // Apply sampler
          case e: HomeEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(e.event).map(HomeEvent[DataRecord])
          case e: ProfileEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(e.event).map(ProfileEvent[DataRecord])
          case e: SearchEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(e.event).map(SearchEvent[DataRecord])
          case e: UuaEvent[DataRecord] =>
            jobConfig.sequentiallyTransform(e.event).map(UuaEvent[DataRecord])
        }
        .name(CombinedProducerName)

    // Step 3: Join with Feature Store features, then unwrap the Event envelope
    // so downstream consumers see plain DataRecords.
    lazy val featureStoreDataRecordProducer: Producer[Storm, DataRecord] =
      StormAggregateSourceUtils
        .wrapByFeatureStoreClient(
          underlyingProducer = combinedClientEventDataRecordProducer,
          jobConfig = jobConfig,
          scopedStatsReceiver = scopedStatsReceiver
        ).map(_.event).name(FeatureStoreProducerName)

    featureStoreDataRecordProducer
  }
}