the-algorithm/timelines/data_processing/ml_util/aggregation_framework/heron/RealTimeAggregatesJobBase.scala
twitter-team 197bf2c563 Open-sourcing Timelines Aggregation Framework
Open sourcing Aggregation Framework, a config-driven Summingbird based framework for generating real-time and batch aggregate features to be consumed by ML models.
2023-04-28 14:17:02 -05:00

302 lines
13 KiB
Scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.algebird.Monoid
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.heron.util.CommonMetric
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.Args
import com.twitter.storehaus.algebra.MergeableStore
import com.twitter.storehaus.algebra.StoreAlgebra._
import com.twitter.storehaus_internal.memcache.Memcache
import com.twitter.storehaus_internal.store.CombinedStore
import com.twitter.storehaus_internal.store.ReplicatingWritableStore
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.online.MergeableStoreFactory
import com.twitter.summingbird.online.option._
import com.twitter.summingbird.option.CacheSize
import com.twitter.summingbird.option.JobId
import com.twitter.summingbird.storm.option.FlatMapStormMetrics
import com.twitter.summingbird.storm.option.SummerStormMetrics
import com.twitter.summingbird.storm.Storm
import com.twitter.summingbird.storm.StormMetric
import com.twitter.summingbird.Options
import com.twitter.summingbird._
import com.twitter.summingbird_internal.runner.common.CapTicket
import com.twitter.summingbird_internal.runner.common.JobName
import com.twitter.summingbird_internal.runner.common.TeamEmail
import com.twitter.summingbird_internal.runner.common.TeamName
import com.twitter.summingbird_internal.runner.storm.ProductionStormConfig
import com.twitter.timelines.data_processing.ml_util.aggregation_framework._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.DataRecordFeatureCounter
import org.apache.heron.api.{Config => HeronConfig}
import org.apache.heron.common.basics.ByteAmount
import org.apache.storm.Config
import scala.collection.JavaConverters._
object RealTimeAggregatesJobBase {
lazy val commonMetric: StormMetric[CommonMetric] =
StormMetric(new CommonMetric(), CommonMetric.NAME, CommonMetric.POLL_INTERVAL)
lazy val flatMapMetrics: FlatMapStormMetrics = FlatMapStormMetrics(Iterable(commonMetric))
lazy val summerMetrics: SummerStormMetrics = SummerStormMetrics(Iterable(commonMetric))
}
trait RealTimeAggregatesJobBase extends Serializable {
import RealTimeAggregatesJobBase._
import com.twitter.summingbird_internal.bijection.BatchPairImplicits._
def statsReceiver: StatsReceiver
def aggregatesToCompute: Set[TypedAggregateGroup[_]]
def jobConfigs: RealTimeAggregatesJobConfigs
implicit lazy val dataRecordCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
implicit lazy val monoid: Monoid[DataRecord] = DataRecordAggregationMonoid(aggregatesToCompute)
implicit lazy val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
val clusters: Set[String] = Set("atla", "pdxa")
def buildAggregateStoreToStorm(
isProd: Boolean,
serviceIdentifier: ServiceIdentifier,
jobConfig: RealTimeAggregatesJobConfig
): (AggregateStore => Option[Storm#Store[AggregationKey, DataRecord]]) = {
(store: AggregateStore) =>
store match {
case rtaStore: RealTimeAggregateStore if rtaStore.isProd == isProd => {
lazy val primaryStore: MergeableStore[(AggregationKey, BatchID), DataRecord] =
Memcache.getMemcacheStore[(AggregationKey, BatchID), DataRecord](
rtaStore.online(serviceIdentifier))
lazy val mergeableStore: MergeableStore[(AggregationKey, BatchID), DataRecord] =
if (jobConfig.enableUserReindexingNighthawkBtreeStore
|| jobConfig.enableUserReindexingNighthawkHashStore) {
val reindexingNighthawkBtreeWritableDataRecordStoreList =
if (jobConfig.enableUserReindexingNighthawkBtreeStore) {
lazy val cacheClientNighthawkConfig =
jobConfig.userReindexingNighthawkBtreeStoreConfig.online(serviceIdentifier)
List(
UserReindexingNighthawkWritableDataRecordStore.getBtreeStore(
nighthawkCacheConfig = cacheClientNighthawkConfig,
// Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp)
// keys that are returned on read on the pKey, and we may have duplicate authors and associated records.
targetSize = 512,
statsReceiver = statsReceiver,
// Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on
// 10% of all writes.
trimRate = 0.1
))
} else { Nil }
val reindexingNighthawkHashWritableDataRecordStoreList =
if (jobConfig.enableUserReindexingNighthawkHashStore) {
lazy val cacheClientNighthawkConfig =
jobConfig.userReindexingNighthawkHashStoreConfig.online(serviceIdentifier)
List(
UserReindexingNighthawkWritableDataRecordStore.getHashStore(
nighthawkCacheConfig = cacheClientNighthawkConfig,
// Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp)
// keys that are returned on read on the pKey, and we may have duplicate authors and associated records.
targetSize = 512,
statsReceiver = statsReceiver,
// Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on
// 10% of all writes.
trimRate = 0.1
))
} else { Nil }
lazy val replicatingWritableStore = new ReplicatingWritableStore(
stores = List(primaryStore) ++ reindexingNighthawkBtreeWritableDataRecordStoreList
++ reindexingNighthawkHashWritableDataRecordStoreList
)
lazy val combinedStoreWithReindexing = new CombinedStore(
read = primaryStore,
write = replicatingWritableStore
)
combinedStoreWithReindexing.toMergeable
} else {
primaryStore
}
lazy val storeFactory: MergeableStoreFactory[(AggregationKey, BatchID), DataRecord] =
Storm.store(mergeableStore)(Batcher.unit)
Some(storeFactory)
}
case _ => None
}
}
def buildDataRecordSourceToStorm(
jobConfig: RealTimeAggregatesJobConfig
): (AggregateSource => Option[Producer[Storm, DataRecord]]) = { (source: AggregateSource) =>
{
source match {
case stormAggregateSource: StormAggregateSource =>
Some(stormAggregateSource.build(statsReceiver, jobConfig))
case _ => None
}
}
}
def apply(args: Args): ProductionStormConfig = {
lazy val isProd = args.boolean("production")
lazy val cluster = args.getOrElse("cluster", "")
lazy val isDebug = args.boolean("debug")
lazy val role = args.getOrElse("role", "")
lazy val service =
args.getOrElse(
"service_name",
""
) // don't use the argument service, which is a reserved heron argument
lazy val environment = if (isProd) "prod" else "devel"
lazy val s2sEnabled = args.boolean("s2s")
lazy val keyedByUserEnabled = args.boolean("keyed_by_user")
lazy val keyedByAuthorEnabled = args.boolean("keyed_by_author")
require(clusters.contains(cluster))
if (s2sEnabled) {
require(role.length() > 0)
require(service.length() > 0)
}
lazy val serviceIdentifier = if (s2sEnabled) {
ServiceIdentifier(
role = role,
service = service,
environment = environment,
zone = cluster
)
} else EmptyServiceIdentifier
lazy val jobConfig = {
val jobConfig = if (isProd) jobConfigs.Prod else jobConfigs.Devel
jobConfig.copy(
serviceIdentifier = serviceIdentifier,
keyedByUserEnabled = keyedByUserEnabled,
keyedByAuthorEnabled = keyedByAuthorEnabled)
}
lazy val dataRecordSourceToStorm = buildDataRecordSourceToStorm(jobConfig)
lazy val aggregateStoreToStorm =
buildAggregateStoreToStorm(isProd, serviceIdentifier, jobConfig)
lazy val JaasConfigFlag = "-Djava.security.auth.login.config=resources/jaas.conf"
lazy val JaasDebugFlag = "-Dsun.security.krb5.debug=true"
lazy val JaasConfigString =
if (isDebug) { "%s %s".format(JaasConfigFlag, JaasDebugFlag) }
else JaasConfigFlag
new ProductionStormConfig {
implicit val jobId: JobId = JobId(jobConfig.name)
override val jobName = JobName(jobConfig.name)
override val teamName = TeamName(jobConfig.teamName)
override val teamEmail = TeamEmail(jobConfig.teamEmail)
override val capTicket = CapTicket("n/a")
val configureHeronJvmSettings = {
val heronJvmOptions = new java.util.HashMap[String, AnyRef]()
jobConfig.componentToRamGigaBytesMap.foreach {
case (component, gigabytes) =>
HeronConfig.setComponentRam(
heronJvmOptions,
component,
ByteAmount.fromGigabytes(gigabytes))
}
HeronConfig.setContainerRamRequested(
heronJvmOptions,
ByteAmount.fromGigabytes(jobConfig.containerRamGigaBytes)
)
jobConfig.componentsToKerberize.foreach { component =>
HeronConfig.setComponentJvmOptions(
heronJvmOptions,
component,
JaasConfigString
)
}
jobConfig.componentToMetaSpaceSizeMap.foreach {
case (component, metaspaceSize) =>
HeronConfig.setComponentJvmOptions(
heronJvmOptions,
component,
metaspaceSize
)
}
heronJvmOptions.asScala.toMap ++ AggregatesV2Job
.aggregateNames(aggregatesToCompute).map {
case (prefix, aggNames) => (s"extras.aggregateNames.${prefix}", aggNames)
}
}
override def transformConfig(m: Map[String, AnyRef]): Map[String, AnyRef] = {
super.transformConfig(m) ++ List(
/**
* Disable acking by setting acker executors to 0. Tuples that come off the
* spout will be immediately acked which effectively disables retries on tuple
* failures. This should help topology throughput/availability by relaxing consistency.
*/
Config.TOPOLOGY_ACKER_EXECUTORS -> int2Integer(0),
Config.TOPOLOGY_WORKERS -> int2Integer(jobConfig.topologyWorkers),
HeronConfig.TOPOLOGY_CONTAINER_CPU_REQUESTED -> int2Integer(8),
HeronConfig.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE -> java.lang.Boolean.valueOf(true),
HeronConfig.TOPOLOGY_WORKER_CHILDOPTS -> List(
JaasConfigString,
s"-Dcom.twitter.eventbus.client.zoneName=${cluster}",
"-Dcom.twitter.eventbus.client.EnableKafkaSaslTls=true"
).mkString(" "),
"storm.job.uniqueId" -> jobId.get
) ++ configureHeronJvmSettings
}
override lazy val getNamedOptions: Map[String, Options] = jobConfig.topologyNamedOptions ++
Map(
"DEFAULT" -> Options()
.set(flatMapMetrics)
.set(summerMetrics)
.set(MaxWaitingFutures(1000))
.set(FlushFrequency(30.seconds))
.set(UseAsyncCache(true))
.set(AsyncPoolSize(4))
.set(SourceParallelism(jobConfig.sourceCount))
.set(SummerBatchMultiplier(1000)),
"FLATMAP" -> Options()
.set(FlatMapParallelism(jobConfig.flatMapCount))
.set(CacheSize(0)),
"SUMMER" -> Options()
.set(SummerParallelism(jobConfig.summerCount))
/**
* Sets number of tuples a Summer awaits before aggregation. Set higher
* if you need to lower qps to memcache at the expense of introducing
* some (stable) latency.
*/
.set(CacheSize(jobConfig.cacheSize))
)
val featureCounters: Seq[DataRecordFeatureCounter] =
Seq(DataRecordFeatureCounter.any(Counter(Group("feature_counter"), Name("num_records"))))
override def graph: TailProducer[Storm, Any] = AggregatesV2Job.generateJobGraph[Storm](
aggregateSet = aggregatesToCompute,
aggregateSourceToSummingbird = dataRecordSourceToStorm,
aggregateStoreToSummingbird = aggregateStoreToStorm,
featureCounters = featureCounters
)
}
}
}