the-algorithm/timelines/data_processing/ml_util/aggregation_framework/TypedAggregateGroup.scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import com.twitter.util.Try
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Set => JSet}
import scala.annotation.tailrec
import scala.language.existentials
import scala.collection.JavaConverters._
import scala.util.matching.Regex
/**
* A case class containing precomputed data useful for quickly
* processing operations over an aggregate.
*
* @param query The underlying feature being aggregated
* @param metric The aggregation metric
* @param outputFeatures The output features that aggregation will produce
* @param outputFeatureIds The precomputed hashes of the above outputFeatures
*/
case class PrecomputedAggregateDescriptor[T](
query: AggregateFeature[T],
metric: AggregationMetric[T, _],
outputFeatures: List[Feature[_]],
outputFeatureIds: List[JLong])
object TypedAggregateGroup {
/**
* Recursive function that generates all combinations of value
* assignments for a collection of sparse binary features.
*
* @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
* @return A set of maps, where each map represents one possible assignment of values to ids
*/
def sparseBinaryPermutations(
sparseBinaryIdValues: List[(Long, Set[String])]
): Set[Map[Long, String]] = sparseBinaryIdValues match {
case (id, values) +: rest =>
tailRecSparseBinaryPermutations(
existingPermutations = values.map(value => Map(id -> value)),
remainingIdValues = rest
)
case Nil => Set.empty
}
@tailrec private[this] def tailRecSparseBinaryPermutations(
existingPermutations: Set[Map[Long, String]],
remainingIdValues: List[(Long, Set[String])]
): Set[Map[Long, String]] = remainingIdValues match {
case Nil => existingPermutations
case (id, values) +: rest =>
tailRecSparseBinaryPermutations(
existingPermutations.flatMap { existingIdValueMap =>
values.map(value => existingIdValueMap ++ Map(id -> value))
},
rest
)
}
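/*
* Illustrative sketch (hypothetical ids and values, not taken from any real
* config): sparseBinaryPermutations takes the cross product of the possible
* values of each sparse binary key id. For example,
*
*   sparseBinaryPermutations(List((1L, Set("a", "b")), (2L, Set("x", "y"))))
*   // => Set(
*   //      Map(1L -> "a", 2L -> "x"), Map(1L -> "a", 2L -> "y"),
*   //      Map(1L -> "b", 2L -> "x"), Map(1L -> "b", 2L -> "y")
*   //    )
*
* An empty value set for any id produces an empty result, since no complete
* assignment of values to ids exists in that case.
*/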
val SparseFeatureSuffix = ".member"
def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] =
new Feature.Text(
sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix,
AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
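/*
* For example, given a hypothetical SPARSE_BINARY key feature named
* "words.in.tweet", sparseFeature returns a Text feature named
* "words.in.tweet.member", so that each member string can serve as an
* individual (textual) aggregation key.
*/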
/* Casts obj to U. Note: U is erased at runtime, so the isInstanceOf check below is effectively unchecked. */
private[this] def validate[U](obj: Any): U = {
require(obj.isInstanceOf[U])
obj.asInstanceOf[U]
}
private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] =
Option(SRichDataRecord(dataRecord).getFeatureValue(feature)).map(validate[U](_))
/**
* Get a mapping from feature ids
* (including individual sparse elements of a sparse feature) to values
* from the given data record, for a given feature type.
*
* @param dataRecord Data record to get features from
* @param keysToAggregate key features to get id-value mappings for
* @param featureType Feature type to get id-value maps for
*/
def getKeyFeatureIdValues[U](
dataRecord: DataRecord,
keysToAggregate: Set[Feature[_]],
featureType: FeatureType
): Set[(Long, Option[U])] = {
val featuresOfThisType: Set[Feature[U]] = keysToAggregate
.filter(_.getFeatureType == featureType)
.map(validate[Feature[U]])
featuresOfThisType
.map { feature: Feature[U] =>
val featureId: Long = getDenseFeatureId(feature)
val featureOpt: Option[U] = getFeatureOpt(dataRecord, feature)
(featureId, featureOpt)
}
}
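/*
* Usage sketch (hypothetical feature and value): if keysToAggregate contains a
* DISCRETE feature USER_ID and the record holds USER_ID = 12L, then
*
*   getKeyFeatureIdValues[Long](dataRecord, keysToAggregate, FeatureType.DISCRETE)
*   // => Set((USER_ID.getDenseFeatureId, Some(12L)))
*
* Key features of the requested type that are absent from the record show up
* with a None value.
*/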
// TypedAggregateGroup may transform the aggregate keys for internal use. This method generates
// denseFeatureIds for the transformed feature.
def getDenseFeatureId(feature: Feature[_]): Long =
if (feature.getFeatureType != FeatureType.SPARSE_BINARY) {
feature.getDenseFeatureId
} else {
sparseFeature(feature).getDenseFeatureId
}
/**
* Return denseFeatureIds for the input features after applying the custom transformation that
* TypedAggregateGroup applies to its keysToAggregate.
*
* @param keysToAggregate key features to get id for
*/
def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
keysToAggregate.map(getDenseFeatureId)
def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
featureIdValueMap.forall { case (_, valueOpt) => valueOpt.isDefined }
def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
featureIdValueMap
.flatMap {
case (id, valueOpt) =>
valueOpt.map { value => (id, value) }
}
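/*
* Small sketch: checkIfAllKeysExist(Map(1L -> Some(12L), 2L -> None)) is false,
* while liftOptions(Map(1L -> Some(12L), 2L -> None)) == Map(1L -> 12L). In
* buildAggregationKeys below, records missing any discrete or string key
* feature are therefore dropped rather than partially aggregated.
*/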
val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP
/**
* Builds all valid aggregation keys (for the output store) from
* a data record and a spec listing the keys to aggregate. There
* can be multiple aggregation keys generated from a single data
* record when grouping by sparse binary features, for which multiple
* values can be set within the data record.
*
* @param dataRecord Data record to read values for key features from
* @param keysToAggregate Key features whose values define the aggregation keys
* @return A set of AggregationKeys encoding the values of all keys
*/
def buildAggregationKeys(
dataRecord: DataRecord,
keysToAggregate: Set[Feature[_]]
): Set[AggregationKey] = {
val discreteAggregationKeys = getKeyFeatureIdValues[Long](
dataRecord,
keysToAggregate,
FeatureType.DISCRETE
).toMap
val textAggregationKeys = getKeyFeatureIdValues[String](
dataRecord,
keysToAggregate,
FeatureType.STRING
).toMap
val sparseBinaryIdValues = getKeyFeatureIdValues[JSet[String]](
dataRecord,
keysToAggregate,
FeatureType.SPARSE_BINARY
).map {
case (id, values) =>
(
id,
values
.map(_.asScala.toSet)
.getOrElse(Set.empty[String])
)
}.toList
if (checkIfAllKeysExist(discreteAggregationKeys) &&
checkIfAllKeysExist(textAggregationKeys)) {
if (sparseBinaryIdValues.nonEmpty) {
sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
AggregationKey(
discreteFeaturesById = liftOptions(discreteAggregationKeys),
textFeaturesById = liftOptions(textAggregationKeys) ++ sparseBinaryTextKeys
)
}
} else {
Set(
AggregationKey(
discreteFeaturesById = liftOptions(discreteAggregationKeys),
textFeaturesById = liftOptions(textAggregationKeys)
)
)
}
} else Set.empty[AggregationKey]
}
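/*
* Illustrative sketch (hypothetical features and values): with
* keysToAggregate = Set(USER_ID, WORDS), where USER_ID is DISCRETE and WORDS is
* SPARSE_BINARY, and a record containing USER_ID = 12L and WORDS = {"cat", "dog"},
* buildAggregationKeys emits one AggregationKey per sparse binary member:
*
*   Set(
*     AggregationKey(
*       discreteFeaturesById = Map(USER_ID.getDenseFeatureId -> 12L),
*       textFeaturesById = Map(sparseFeature(WORDS).getDenseFeatureId -> "cat")),
*     AggregationKey(
*       discreteFeaturesById = Map(USER_ID.getDenseFeatureId -> 12L),
*       textFeaturesById = Map(sparseFeature(WORDS).getDenseFeatureId -> "dog"))
*   )
*
* If any discrete or string key feature is missing from the record, the result
* is Set.empty.
*/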
}
/**
* Specifies one or more related aggregate(s) to compute in the summingbird job.
*
* @param inputSource Source to compute this aggregate over
* @param preTransforms Sequence of [[OneToSomeTransform]] that transform
* data records before aggregation (e.g. discretization, renaming)
* @param samplingTransformOpt Optional [[OneToSomeTransform]] that transforms a data
* record into an optional data record (e.g. for sampling) before aggregation
* @param aggregatePrefix Prefix to use for naming resultant aggregate features
* @param keysToAggregate Features to group by when computing the aggregates
* (e.g. USER_ID, AUTHOR_ID)
* @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
* @param labels Labels to cross the features with to make pair features, if any.
* Use Label.All if you don't want to cross with a label.
* @param metrics Aggregation metrics to compute (e.g. count, mean)
* @param halfLives Half lives to use for the aggregations, to be crossed with the above.
* Use Duration.Top for "forever" aggregations over an infinite time window (no decay).
* @param outputStore Store to output this aggregate to
* @param includeAnyFeature Aggregate label counts for any feature value
* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
*
* The overall config for the summingbird job consists of a list of "AggregateGroup"
* case class objects, which get translated into strongly typed "TypedAggregateGroup"
* case class objects. A single TypedAggregateGroup always groups input data records from
* ''inputSource'' by a single set of aggregation keys (''keysToAggregate'').
* Within these groups, we perform a comprehensive cross of:
*
* ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
*
* All the resultant aggregate features are assigned a human-readable feature name
* beginning with ''aggregatePrefix'', and are written to DataRecords that get
* aggregated and written to the store specified by ''outputStore''.
*
* Illustrative example. Suppose we define our spec as follows:
*
* TypedAggregateGroup(
* inputSource = "timelines_recap_daily",
* aggregatePrefix = "user_author_aggregate",
* keysToAggregate = Set(USER_ID, AUTHOR_ID),
* featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
* labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
* metrics = Set(CountMetric, MeanMetric),
* halfLives = Set(7.Days, 30.Days),
* outputStore = "user_author_aggregate_store"
* )
*
* This will process data records from the source named "timelines_recap_daily"
* (see AggregateSource.scala for more details on how to add your own source).
* It will produce a total of 2x2x2x2 = 16 aggregation features, named like:
*
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
*
* ... (and so on)
*
* and all the result features will be stored in DataRecords, summed up, and written
* to the output store defined by the name "user_author_aggregate_store".
* (see AggregateStore.scala for details on how to add your own store).
*
* If you do not want a full cross, split up your config into multiple TypedAggregateGroup
* objects. Splitting is strongly advised to avoid a combinatorial blow-up that creates
* invalid or unnecessary combinations of aggregate features (some combinations are
* useless or invalid, e.g. computing the mean of a binary feature). Splitting also
* costs nothing in terms of real-time performance, because all Aggregate objects in
* the master spec that share the same ''keysToAggregate'', the same ''inputSource''
* and the same ''outputStore'' are grouped together by the summingbird job logic and
* stored in a single DataRecord in the output store. Overlapping aggregates are also
* deduplicated automatically, so overlaps are not a concern.
*/
case class TypedAggregateGroup[T](
inputSource: AggregateSource,
aggregatePrefix: String,
keysToAggregate: Set[Feature[_]],
featuresToAggregate: Set[Feature[T]],
labels: Set[_ <: Feature[JBoolean]],
metrics: Set[AggregationMetric[T, _]],
halfLives: Set[Duration],
outputStore: AggregateStore,
preTransforms: Seq[OneToSomeTransform] = Seq.empty,
includeAnyFeature: Boolean = true,
includeAnyLabel: Boolean = true,
aggExclusionRegex: Seq[String] = Seq.empty) {
import TypedAggregateGroup._
val compiledRegexes = aggExclusionRegex.map(new Regex(_))
// Returns true if the feature should be dropped, false if it should be kept.
def filterOutAggregateFeature(
feature: PrecomputedAggregateDescriptor[_],
regexes: Seq[Regex]
): Boolean = {
if (regexes.nonEmpty)
feature.outputFeatures.exists { feature =>
regexes.exists { re => re.findFirstMatchIn(feature.getDenseFeatureName).nonEmpty }
}
else false
}
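/*
* Usage sketch (hypothetical pattern): with
* aggExclusionRegex = Seq(".*\\.is_replied\\..*"), any descriptor with an output
* feature whose name contains ".is_replied." is dropped from
* individualAggregateDescriptors below. With no regexes configured, nothing is
* filtered out.
*/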
def buildAggregationKeys(
dataRecord: DataRecord
): Set[AggregationKey] = {
TypedAggregateGroup.buildAggregationKeys(dataRecord, keysToAggregate)
}
/**
* This val precomputes descriptors for all individual aggregates in this group
* (of type ''AggregateFeature''). It also precomputes hashes of all aggregation
* "output" features generated by these operators, for faster
* run-time performance (this turns out to be a primary CPU bottleneck).
* E.g. for the mean operator, "sum" and "count" are output features.
*/
val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
/*
* By default, in addition to all feature-label crosses, we also
* compute aggregates over each feature and label without crossing
*/
val labelOptions = labels.map(Option(_)) ++
(if (includeAnyLabel) Set(None) else Set.empty)
val featureOptions = featuresToAggregate.map(Option(_)) ++
(if (includeAnyFeature) Set(None) else Set.empty)
for {
feature <- featureOptions
label <- labelOptions
metric <- metrics
halfLife <- halfLives
} yield {
val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)
val aggregateOutputFeatures = metric.getOutputFeatures(query)
val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
PrecomputedAggregateDescriptor(
query,
metric,
aggregateOutputFeatures,
aggregateOutputFeatureIds
)
}
}.filterNot(filterOutAggregateFeature(_, compiledRegexes))
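/*
* Size sketch: before regex filtering, this yields
* (#featuresToAggregate + 1 if includeAnyFeature) x (#labels + 1 if includeAnyLabel)
* x #metrics x #halfLives descriptors. For the illustrative spec in the class
* scaladoc (2 features, 2 labels, 2 metrics, 2 half lives) with the default
* includeAnyFeature = includeAnyLabel = true, that is 3 x 3 x 2 x 2 = 36
* descriptors, each of which may expose one or more output features.
*/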
/* Precomputes a map from all generated aggregate feature ids to their half lives. */
val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
individualAggregateDescriptors.flatMap { descriptor =>
descriptor.outputFeatures
.flatMap { feature =>
if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
Try(feature.asInstanceOf[Feature[JDouble]]).toOption
.map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
} else None
}
}.toMap
/*
* Sparse binary keys become individual string keys in the output.
* e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
*/
val allOutputKeys: Set[Feature[_]] = keysToAggregate.map { key =>
if (key.getFeatureType == FeatureType.SPARSE_BINARY) sparseFeature(key)
else key
}
val allOutputFeatures: Set[Feature[_]] = individualAggregateDescriptors.flatMap {
case PrecomputedAggregateDescriptor(
query,
metric,
outputFeatures,
outputFeatureIds
) =>
outputFeatures
}
val aggregateContext: FeatureContext = new FeatureContext(allOutputFeatures.toList.asJava)
/**
* Adds all aggregates in this group found in the two input data records
* into a result, mutating the result. Uses a while loop for an
* approximately 10% gain in speed over a for comprehension.
*
* WARNING: mutates ''result''
*
* @param result The output data record to mutate
* @param left The left data record to add
* @param right The right data record to add
*/
def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
val featureIterator = individualAggregateDescriptors.iterator
while (featureIterator.hasNext) {
val descriptor = featureIterator.next
descriptor.metric.mutatePlus(
result,
left,
right,
descriptor.query,
Some(descriptor.outputFeatureIds)
)
}
}
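/*
* Rough semantics sketch (the exact arithmetic lives in each AggregationMetric):
* for a count-style metric with halfLife = Duration.Top (no decay), if ''left''
* holds count = 3.0 and ''right'' holds count = 4.0 for the same aggregate,
* ''result'' ends up holding count = 7.0 after this call; finite half lives
* additionally apply time decay before combining.
*/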
/**
* Apply preTransforms sequentially. If any transform results in a dropped (None)
* DataRecord, then the entire transform sequence will result in a dropped DataRecord.
* Note that preTransforms are order-dependent.
*/
private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
val recordOpt = Option(new DataRecord(dataRecord))
preTransforms.foldLeft(recordOpt) {
case (Some(previousRecord), preTransform) =>
preTransform(previousRecord)
case _ => Option.empty[DataRecord]
}
}
/**
* Given a data record, apply transforms and fetch the incremental contributions to
* each configured aggregate from this data record, and store these in an output data record.
*
* @param dataRecord Input data record to aggregate.
* @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
* AggregationKey indicating what keys we're grouping by, and whose second entry
* is an output data record with incremental contributions to the aggregate value(s)
*/
def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] = {
sequentiallyTransform(dataRecord)
.flatMap { dataRecord =>
val aggregationKeys = buildAggregationKeys(dataRecord)
val increment = new DataRecord
val isNonEmptyIncrement = individualAggregateDescriptors
.map { descriptor =>
descriptor.metric.setIncrement(
output = increment,
input = dataRecord,
query = descriptor.query,
timestampFeature = inputSource.timestampFeature,
aggregateOutputs = Some(descriptor.outputFeatureIds)
)
}
.exists(identity)
if (isNonEmptyIncrement) {
SRichDataRecord(increment).setFeatureValue(
timestampFeature,
getTimestamp(dataRecord, inputSource.timestampFeature)
)
Some(aggregationKeys.map(key => (key, increment)))
} else {
None
}
}
.getOrElse(Set.empty[(AggregationKey, DataRecord)])
}
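/*
* Flow sketch (hypothetical ids and values): for a record keyed by
* (USER_ID, AUTHOR_ID) with USER_ID = 12L and AUTHOR_ID = 99L, this returns
* something like
*
*   Set((AggregationKey(Map(userIdFeatureId -> 12L, authorIdFeatureId -> 99L), Map.empty),
*        increment))
*
* where ''increment'' is a new DataRecord holding this record's incremental
* contribution to every configured aggregate, plus the record's timestamp. The
* same ''increment'' record is shared by all keys produced for the record.
*/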
def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
require(prefix.nonEmpty)
allOutputFeatures.map { feature =>
if (feature.isSetFeatureName) {
val renamedFeatureName = prefix + feature.getDenseFeatureName
val personalDataTypes =
if (feature.getPersonalDataTypes.isPresent) feature.getPersonalDataTypes.get()
else null
val renamedFeature = feature.getFeatureType match {
case FeatureType.BINARY =>
new Feature.Binary(renamedFeatureName, personalDataTypes)
case FeatureType.DISCRETE =>
new Feature.Discrete(renamedFeatureName, personalDataTypes)
case FeatureType.STRING =>
new Feature.Text(renamedFeatureName, personalDataTypes)
case FeatureType.CONTINUOUS =>
new Feature.Continuous(renamedFeatureName, personalDataTypes)
case FeatureType.SPARSE_BINARY =>
new Feature.SparseBinary(renamedFeatureName, personalDataTypes)
case FeatureType.SPARSE_CONTINUOUS =>
new Feature.SparseContinuous(renamedFeatureName, personalDataTypes)
}
feature -> renamedFeature
} else {
feature -> feature
}
}.toMap
}
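/*
* Usage sketch (hypothetical prefix): outputFeaturesToRenamedOutputFeatures("offline.")
* maps every named output feature f to a new feature of the same FeatureType
* named "offline." + f.getDenseFeatureName, with personal data types preserved;
* output features without a set name are mapped to themselves.
*/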
}