the-algorithm/timelines/data_processing/ml_util/aggregation_framework/conversion/PickTopCtrPolicy.scala

227 lines
7.8 KiB
Scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
case class CtrDescriptor(
engagementFeature: Feature[JDouble],
impressionFeature: Feature[JDouble],
outputFeature: Feature[JDouble])
object PickTopCtrBuilderHelper {
def createCtrDescriptors(
aggregatePrefix: String,
engagementLabels: Set[Feature[JBoolean]],
aggregatesToCompute: Set[TypedAggregateGroup[_]],
outputSuffix: String
): Set[CtrDescriptor] = {
val aggregateFeatures = aggregatesToCompute
.filter(_.aggregatePrefix == aggregatePrefix)
val impressionFeature = aggregateFeatures
.flatMap { group =>
group.individualAggregateDescriptors
.filter(_.query.feature == None)
.filter(_.query.label == None)
.flatMap(_.outputFeatures)
}
.head
.asInstanceOf[Feature[JDouble]]
val aggregateEngagementFeatures =
aggregateFeatures
.flatMap { group =>
group.individualAggregateDescriptors
.filter(_.query.feature == None)
.filter { descriptor =>
//TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
descriptor.query.label.exists(engagementLabels.contains(_))
}
.flatMap(_.outputFeatures)
}
.map(_.asInstanceOf[Feature[JDouble]])
aggregateEngagementFeatures
.map { aggregateEngagementFeature =>
CtrDescriptor(
engagementFeature = aggregateEngagementFeature,
impressionFeature = impressionFeature,
outputFeature = new Feature.Continuous(
aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
AggregationMetricCommon.derivePersonalDataTypes(
Some(aggregateEngagementFeature),
Some(impressionFeature)
)
)
)
}
}
}
object PickTopCtrPolicy {
def build(
aggregatePrefix: String,
engagementLabels: Set[Feature[JBoolean]],
aggregatesToCompute: Set[TypedAggregateGroup[_]],
smoothing: Double = 1.0,
outputSuffix: String = "ratio"
): PickTopCtrPolicy = {
val ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
aggregatePrefix = aggregatePrefix,
engagementLabels = engagementLabels,
aggregatesToCompute = aggregatesToCompute,
outputSuffix = outputSuffix
)
PickTopCtrPolicy(
ctrDescriptors = ctrDescriptors,
smoothing = smoothing
)
}
}
object CombinedTopNCtrsByWilsonConfidenceIntervalPolicy {
def build(
aggregatePrefix: String,
engagementLabels: Set[Feature[JBoolean]],
aggregatesToCompute: Set[TypedAggregateGroup[_]],
outputSuffix: String = "ratioWithWCI",
z: Double = 1.96,
topN: Int = 1
): CombinedTopNCtrsByWilsonConfidenceIntervalPolicy = {
val ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
aggregatePrefix = aggregatePrefix,
engagementLabels = engagementLabels,
aggregatesToCompute = aggregatesToCompute,
outputSuffix = outputSuffix
)
CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
ctrDescriptors = ctrDescriptors,
z = z,
topN = topN
)
}
}
/*
* A merge policy that picks the aggregate features corresponding to
* the sparse key value with the highest engagement rate (defined
* as the ratio of two specified features, representing engagements
* and impressions). Also outputs the engagement rate to the specified
* outputFeature.
*
* This is an abstract class. We can make variants of this policy by overriding
* the calculateCtr method.
*/
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
extends SparseBinaryMergePolicy {
private def getContinuousFeature(
aggregateRecord: DataRecord,
feature: Feature[JDouble]
): Double = {
Option(SRichDataRecord(aggregateRecord).getFeatureValue(feature))
.map(_.asInstanceOf[JDouble].toDouble)
.getOrElse(0.0)
}
/**
* For every provided descriptor, compute the corresponding CTR feature
* and only hydrate this result to the provided input record.
*/
override def mergeRecord(
mutableInputRecord: DataRecord,
aggregateRecords: List[DataRecord],
aggregateContext: FeatureContext
): Unit = {
ctrDescriptors
.foreach {
case CtrDescriptor(engagementFeature, impressionFeature, outputFeature) =>
val sortedCtrs =
aggregateRecords
.map { aggregateRecord =>
val impressions = getContinuousFeature(aggregateRecord, impressionFeature)
val engagements = getContinuousFeature(aggregateRecord, engagementFeature)
calculateCtr(impressions, engagements)
}
.sortBy { ctr => -ctr }
combineTopNCtrsToSingleScore(sortedCtrs)
.foreach { score =>
SRichDataRecord(mutableInputRecord).setFeatureValue(outputFeature, score)
}
}
}
protected def calculateCtr(impressions: Double, engagements: Double): Double
protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]
override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
ctrDescriptors
.map(_.outputFeature)
.toSet
}
case class PickTopCtrPolicy(ctrDescriptors: Set[CtrDescriptor], smoothing: Double = 1.0)
extends PickTopCtrPolicyBase(ctrDescriptors) {
require(smoothing > 0.0)
override def calculateCtr(impressions: Double, engagements: Double): Double =
(1.0 * engagements) / (smoothing + impressions)
override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
sortedCtrs.headOption
}
case class CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
ctrDescriptors: Set[CtrDescriptor],
z: Double = 1.96,
topN: Int = 1)
extends PickTopCtrPolicyBase(ctrDescriptors) {
private val zSquared = z * z
private val zSquaredDiv2 = zSquared / 2.0
private val zSquaredDiv4 = zSquared / 4.0
/**
* calculates the lower bound of wilson score interval. which roughly says "the actual engagement
* rate is at least this value" with confidence designated by the z-score:
* https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
*/
override def calculateCtr(rawImpressions: Double, engagements: Double): Double = {
// just in case engagements happens to be more than impressions...
val impressions = Math.max(rawImpressions, engagements)
if (impressions > 0.0) {
val p = engagements / impressions
(p
+ zSquaredDiv2 / impressions
- z * Math.sqrt(
(p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions)) / (1.0 + zSquared / impressions)
} else 0.0
}
/**
* takes the topN engagement rates, and returns the joint probability as {1.0 - Π(1.0 - p)}
*
* e.g. let's say you have 0.6 chance of clicking on a tweet shared by the user A.
* you also have 0.3 chance of clicking on a tweet shared by the user B.
* seeing a tweet shared by both A and B will not lead to 0.9 chance of you clicking on it.
* but you could say that you have 0.4*0.7 chance of NOT clicking on that tweet.
*/
override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
if (sortedCtrs.nonEmpty) {
val inverseLogP = sortedCtrs
.take(topN).map { p => Math.log(1.0 - p) }.sum
Some(1.0 - Math.exp(inverseLogP))
} else None
}