twitter-team 197bf2c563 Open-sourcing Timelines Aggregation Framework
Open sourcing Aggregation Framework, a config-driven Summingbird based framework for generating real-time and batch aggregate features to be consumed by ML models.
2023-04-28 14:17:02 -05:00

227 lines
7.8 KiB

package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
/**
 * Names the three features taking part in one click-through-rate (CTR) computation.
 *
 * @param engagementFeature aggregate feature read as the engagement count (the numerator)
 * @param impressionFeature aggregate feature read as the impression count (the denominator)
 * @param outputFeature     feature to which the resulting engagement rate is written
 */
case class CtrDescriptor(
  engagementFeature: Feature[JDouble],
  impressionFeature: Feature[JDouble],
  outputFeature: Feature[JDouble]
)
/**
 * Helper used by the policy builders in this file to derive [[CtrDescriptor]]s
 * from a set of aggregate groups.
 */
object PickTopCtrBuilderHelper {
/**
 * Creates CTR descriptors from the aggregate groups matching `aggregatePrefix`.
 *
 * Every descriptor produced shares the same impression feature; its output feature is a
 * continuous feature named after the engagement feature's dense name plus "." and
 * `outputSuffix`.
 *
 * @param aggregatePrefix     only groups whose `aggregatePrefix` equals this value are used
 * @param engagementLabels    engagement labels to build descriptors for (see the TODO below
 *                            about removing this parameter)
 * @param aggregatesToCompute candidate aggregate groups
 * @param outputSuffix        suffix of the generated output feature names
 * @return the descriptors, one for each aggregate engagement feature that was mapped
 */
def createCtrDescriptors(
aggregatePrefix: String,
engagementLabels: Set[Feature[JBoolean]],
aggregatesToCompute: Set[TypedAggregateGroup[_]],
outputSuffix: String
): Set[CtrDescriptor] = {
// Keep only the groups that belong to the requested prefix.
val aggregateFeatures = aggregatesToCompute
.filter(_.aggregatePrefix == aggregatePrefix)
// Impression side: descriptors whose query has neither a feature nor a label.
val impressionFeature = aggregateFeatures
.flatMap { group =>
.filter(_.query.feature == None)
.filter(_.query.label == None)
// Engagement side: descriptors whose query has no feature, narrowed further by label.
val aggregateEngagementFeatures =
.flatMap { group =>
.filter(_.query.feature == None)
.filter { descriptor =>
//TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
.map { aggregateEngagementFeature =>
engagementFeature = aggregateEngagementFeature,
impressionFeature = impressionFeature,
outputFeature = new Feature.Continuous(
aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
/**
 * Factory for [[PickTopCtrPolicy]] instances configured from aggregate groups.
 */
object PickTopCtrPolicy {

  /**
   * Builds a [[PickTopCtrPolicy]] whose CTR descriptors are derived by
   * [[PickTopCtrBuilderHelper.createCtrDescriptors]].
   *
   * @param aggregatePrefix     prefix handed to the helper to select aggregate groups
   * @param engagementLabels    engagement labels handed to the helper
   * @param aggregatesToCompute aggregate groups handed to the helper
   * @param smoothing           smoothing value given to the policy (default 1.0)
   * @param outputSuffix        suffix of the generated output feature names (default "ratio")
   * @return the configured policy
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    smoothing: Double = 1.0,
    outputSuffix: String = "ratio"
  ): PickTopCtrPolicy = {
    val ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
      aggregatePrefix = aggregatePrefix,
      engagementLabels = engagementLabels,
      aggregatesToCompute = aggregatesToCompute,
      outputSuffix = outputSuffix
    )
    PickTopCtrPolicy(
      ctrDescriptors = ctrDescriptors,
      smoothing = smoothing
    )
  }
}
/**
 * Factory for [[CombinedTopNCtrsByWilsonConfidenceIntervalPolicy]] instances configured
 * from aggregate groups.
 */
object CombinedTopNCtrsByWilsonConfidenceIntervalPolicy {

  /**
   * Builds a [[CombinedTopNCtrsByWilsonConfidenceIntervalPolicy]] whose CTR descriptors are
   * derived by [[PickTopCtrBuilderHelper.createCtrDescriptors]].
   *
   * @param aggregatePrefix     prefix handed to the helper to select aggregate groups
   * @param engagementLabels    engagement labels handed to the helper
   * @param aggregatesToCompute aggregate groups handed to the helper
   * @param outputSuffix        suffix of the generated output feature names
   *                            (default "ratioWithWCI")
   * @param z                   z-score given to the policy (default 1.96)
   * @param topN                number of top CTRs the policy combines (default 1)
   * @return the configured policy
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String = "ratioWithWCI",
    z: Double = 1.96,
    topN: Int = 1
  ): CombinedTopNCtrsByWilsonConfidenceIntervalPolicy = {
    val ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
      aggregatePrefix = aggregatePrefix,
      engagementLabels = engagementLabels,
      aggregatesToCompute = aggregatesToCompute,
      outputSuffix = outputSuffix
    )
    CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
      ctrDescriptors = ctrDescriptors,
      z = z,
      topN = topN
    )
  }
}
/**
 * A merge policy that picks the aggregate features corresponding to
 * the sparse key value with the highest engagement rate (defined
 * as the ratio of two specified features, representing engagements
 * and impressions). Also outputs the engagement rate to the specified
 * outputFeature.
 *
 * This is an abstract class. We can make variants of this policy by overriding
 * the calculateCtr and combineTopNCtrsToSingleScore methods.
 */
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
extends SparseBinaryMergePolicy {
private def getContinuousFeature(
aggregateRecord: DataRecord,
feature: Feature[JDouble]
): Double = {
/**
 * For every provided descriptor, compute the corresponding CTR feature
 * and hydrate only this result onto the provided input record.
 */
override def mergeRecord(
mutableInputRecord: DataRecord,
aggregateRecords: List[DataRecord],
aggregateContext: FeatureContext
): Unit = {
.foreach {
case CtrDescriptor(engagementFeature, impressionFeature, outputFeature) =>
// One CTR per mapped aggregate record, ordered from highest to lowest.
val sortedCtrs =
.map { aggregateRecord =>
// Both counts are read from the same aggregate record.
val impressions = getContinuousFeature(aggregateRecord, impressionFeature)
val engagements = getContinuousFeature(aggregateRecord, engagementFeature)
calculateCtr(impressions, engagements)
// Negating the sort key yields descending order, so the largest CTR comes first.
.sortBy { ctr => -ctr }
.foreach { score =>
// The input record is mutated in place; only the descriptor's output feature is set.
SRichDataRecord(mutableInputRecord).setFeatureValue(outputFeature, score)
/**
 * Computes the engagement rate for a single aggregate record.
 *
 * @param impressions value read from the descriptor's impression feature
 * @param engagements value read from the descriptor's engagement feature
 * @return the CTR used to rank the aggregate records
 */
protected def calculateCtr(impressions: Double, engagements: Double): Double
/**
 * Reduces the CTRs of all aggregate records to the single score to output.
 *
 * @param sortedCtrs CTRs as ordered by mergeRecord, i.e. from highest to lowest
 * @return the score to output, if any (presumably None means nothing is written to the
 *         output feature; confirm against mergeRecord)
 */
protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]
override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
/**
 * Policy that scores each aggregate record with an additively smoothed CTR,
 * engagements / (smoothing + impressions).
 *
 * @param ctrDescriptors descriptors handed to [[PickTopCtrPolicyBase]]
 * @param smoothing      value added to the impression count in the denominator; must be
 *                       strictly positive (default 1.0)
 */
case class PickTopCtrPolicy(ctrDescriptors: Set[CtrDescriptor], smoothing: Double = 1.0)
extends PickTopCtrPolicyBase(ctrDescriptors) {
// Construction fails with an IllegalArgumentException unless smoothing is strictly positive.
require(smoothing > 0.0)
/**
 * Additively smoothed engagement rate: engagements / (smoothing + impressions).
 *
 * @param impressions impression count of the aggregate record
 * @param engagements engagement count of the aggregate record
 * @return the smoothed ratio of engagements to impressions
 */
override def calculateCtr(impressions: Double, engagements: Double): Double = {
  val smoothedImpressions = impressions + smoothing
  engagements / smoothedImpressions
}
override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
/**
 * Policy that scores each aggregate record with the lower bound of the Wilson score
 * interval and combines the topN highest scores into a single value.
 *
 * @param ctrDescriptors descriptors handed to [[PickTopCtrPolicyBase]]
 * @param z              z-score setting the confidence of the interval (default 1.96)
 * @param topN           number of leading CTRs combined into the output score (default 1)
 */
case class CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
ctrDescriptors: Set[CtrDescriptor],
z: Double = 1.96,
topN: Int = 1)
extends PickTopCtrPolicyBase(ctrDescriptors) {
// z-dependent constants computed once per policy instance and reused by calculateCtr.
private val zSquared = z * z
private val zSquaredDiv2 = zSquared / 2.0
private val zSquaredDiv4 = zSquared / 4.0
/**
 * Calculates the lower bound of the Wilson score interval, which roughly says "the actual
 * engagement rate is at least this value" with the confidence designated by the z-score.
 *
 * With n = impressions and p = engagements / n, the bound is
 * (p + z^2 / (2n) - z * sqrt((p * (1 - p) + z^2 / (4n)) / n)) / (1 + z^2 / n).
 *
 * @param rawImpressions impression count as read from the aggregate record
 * @param engagements    engagement count as read from the aggregate record
 * @return the lower bound, or 0.0 when the effective impression count is not positive
 */
override def calculateCtr(rawImpressions: Double, engagements: Double): Double = {
  // just in case engagements happens to be more than impressions...
  val impressions = Math.max(rawImpressions, engagements)
  if (impressions > 0.0) {
    val p = engagements / impressions
    // Numerator of the bound: the observed rate p shifted up by z^2 / (2n) ...
    val center = p + zSquaredDiv2 / impressions
    // ... minus the z-scaled spread term.
    val margin = z * Math.sqrt((p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions)
    (center - margin) / (1.0 + zSquared / impressions)
  } else 0.0
}
/**
 * Takes the topN engagement rates and returns their joint probability, 1.0 - Π(1.0 - p).
 *
 * E.g. say you have a 0.6 chance of clicking on a tweet shared by user A and a 0.3 chance
 * of clicking on a tweet shared by user B. Seeing a tweet shared by both A and B does not
 * give you a 0.9 chance of clicking on it, but you can say you have a 0.4 * 0.7 = 0.28
 * chance of NOT clicking on it, i.e. a 1.0 - 0.28 = 0.72 chance of clicking.
 *
 * The product is accumulated as a sum of logarithms and exponentiated at the end.
 *
 * @param sortedCtrs engagement rates; only the first topN entries are combined
 * @return None when no rates are given, otherwise the combined probability
 */
override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
  if (sortedCtrs.isEmpty) None
  else {
    val logOfNoEngagement = sortedCtrs.take(topN).map(rate => Math.log(1.0 - rate)).sum
    Some(1.0 - Math.exp(logOfNoEngagement))
  }