package com.twitter.timelines.data_processing.ml_util.aggregation_framework

import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import com.twitter.util.Try
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Set => JSet}
import scala.annotation.tailrec
import scala.language.existentials
import scala.collection.JavaConverters._
import scala.util.matching.Regex

/**
 * A case class containing precomputed data useful to quickly
 * process operations over an aggregate.
 *
 * @param query The underlying feature being aggregated
 * @param metric The aggregation metric
 * @param outputFeatures The output features that aggregation will produce
 * @param outputFeatureIds The precomputed hashes of the above outputFeatures
 */
case class PrecomputedAggregateDescriptor[T](
  query: AggregateFeature[T],
  metric: AggregationMetric[T, _],
  outputFeatures: List[Feature[_]],
  outputFeatureIds: List[JLong])

object TypedAggregateGroup {

  /**
   * Recursive function that generates all combinations of value
   * assignments for a collection of sparse binary features.
   *
   * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
   * @return A set of maps, where each map represents one possible assignment of values to ids
   */
  def sparseBinaryPermutations(
    sparseBinaryIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = sparseBinaryIdValues match {
    case (id, values) +: rest =>
      tailRecSparseBinaryPermutations(
        existingPermutations = values.map(value => Map(id -> value)),
        remainingIdValues = rest
      )
    case Nil => Set.empty
  }
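
  // Illustration (hypothetical ids and values): two sparse binary keys with ids 1L
  // and 2L, taking values {"a", "b"} and {"x"} respectively, expand to every
  // combination of assignments:
  //   sparseBinaryPermutations(List(1L -> Set("a", "b"), 2L -> Set("x")))
  //   == Set(Map(1L -> "a", 2L -> "x"), Map(1L -> "b", 2L -> "x"))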

  @tailrec private[this] def tailRecSparseBinaryPermutations(
    existingPermutations: Set[Map[Long, String]],
    remainingIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = remainingIdValues match {
    case Nil => existingPermutations
    case (id, values) +: rest =>
      tailRecSparseBinaryPermutations(
        existingPermutations.flatMap { existingIdValueMap =>
          values.map(value => existingIdValueMap ++ Map(id -> value))
        },
        rest
      )
  }

  val SparseFeatureSuffix = ".member"
  def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] =
    new Feature.Text(
      sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix,
      AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
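
  // For example, a sparse binary key named "words.in.tweet" is replaced in the
  // output by a string key named "words.in.tweet.member", one per member value.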

  /*
   * Casts obj to U, throwing an IllegalArgumentException if it is not an instance
   * of U. Note that because U is erased at runtime, the isInstanceOf check is
   * unchecked for generic types and guards only against gross type mismatches.
   */
  private[this] def validate[U](obj: Any): U = {
    require(obj.isInstanceOf[U])
    obj.asInstanceOf[U]
  }

  private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] =
    Option(SRichDataRecord(dataRecord).getFeatureValue(feature)).map(validate[U](_))

  /**
   * Get a mapping from feature ids
   * (including individual sparse elements of a sparse feature) to values
   * from the given data record, for a given feature type.
   *
   * @param dataRecord Data record to get features from
   * @param keysToAggregate key features to get id-value mappings for
   * @param featureType Feature type to get id-value maps for
   */
  def getKeyFeatureIdValues[U](
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]],
    featureType: FeatureType
  ): Set[(Long, Option[U])] = {
    val featuresOfThisType: Set[Feature[U]] = keysToAggregate
      .filter(_.getFeatureType == featureType)
      .map(validate[Feature[U]])

    featuresOfThisType
      .map { feature: Feature[U] =>
        val featureId: Long = getDenseFeatureId(feature)
        val featureOpt: Option[U] = getFeatureOpt(dataRecord, feature)
        (featureId, featureOpt)
      }
  }
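
  // Illustration (hypothetical USER_ID feature): for a record where USER_ID = 123L,
  //   getKeyFeatureIdValues[Long](record, Set(USER_ID), FeatureType.DISCRETE)
  //   == Set((USER_ID.getDenseFeatureId, Some(123L)))
  // A key feature missing from the record yields (id, None) instead.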

  // TypedAggregateGroup may transform the aggregate keys for internal use. This method generates
  // denseFeatureIds for the transformed feature.
  def getDenseFeatureId(feature: Feature[_]): Long =
    if (feature.getFeatureType != FeatureType.SPARSE_BINARY) {
      feature.getDenseFeatureId
    } else {
      sparseFeature(feature).getDenseFeatureId
    }

  /**
   * Return denseFeatureIds for the input features after applying the custom transformation that
   * TypedAggregateGroup applies to its keysToAggregate.
   *
   * @param keysToAggregate key features to get ids for
   */
  def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
    keysToAggregate.map(getDenseFeatureId)

  def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
    featureIdValueMap.forall { case (_, valueOpt) => valueOpt.isDefined }

  def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
    featureIdValueMap
      .flatMap {
        case (id, valueOpt) =>
          valueOpt.map { value => (id, value) }
      }
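
  // For example: liftOptions(Map(1L -> Some("a"), 2L -> None)) == Map(1L -> "a"),
  // while checkIfAllKeysExist on the same input is false because one value is None.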

  val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP

  /**
   * Builds all valid aggregation keys (for the output store) from
   * a data record and a spec listing the keys to aggregate. There
   * can be multiple aggregation keys generated from a single data
   * record when grouping by sparse binary features, for which multiple
   * values can be set within the data record.
   *
   * @param dataRecord Data record to read values for key features from
   * @param keysToAggregate Key features whose values are encoded into the keys
   * @return A set of AggregationKeys encoding the values of all keys
   */
  def buildAggregationKeys(
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]]
  ): Set[AggregationKey] = {
    val discreteAggregationKeys = getKeyFeatureIdValues[Long](
      dataRecord,
      keysToAggregate,
      FeatureType.DISCRETE
    ).toMap

    val textAggregationKeys = getKeyFeatureIdValues[String](
      dataRecord,
      keysToAggregate,
      FeatureType.STRING
    ).toMap

    val sparseBinaryIdValues = getKeyFeatureIdValues[JSet[String]](
      dataRecord,
      keysToAggregate,
      FeatureType.SPARSE_BINARY
    ).map {
      case (id, values) =>
        (
          id,
          values
            .map(_.asScala.toSet)
            .getOrElse(Set.empty[String])
        )
    }.toList

    if (checkIfAllKeysExist(discreteAggregationKeys) &&
      checkIfAllKeysExist(textAggregationKeys)) {
      if (sparseBinaryIdValues.nonEmpty) {
        sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
          AggregationKey(
            discreteFeaturesById = liftOptions(discreteAggregationKeys),
            textFeaturesById = liftOptions(textAggregationKeys) ++ sparseBinaryTextKeys
          )
        }
      } else {
        Set(
          AggregationKey(
            discreteFeaturesById = liftOptions(discreteAggregationKeys),
            textFeaturesById = liftOptions(textAggregationKeys)
          )
        )
      }
    } else Set.empty[AggregationKey]
  }
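
  // Illustration (hypothetical keys): grouping by a discrete USER_ID present in the
  // record plus a sparse binary feature currently holding {"foo", "bar"} yields two
  // AggregationKeys, one binding the derived text key to "foo" and one to "bar".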
}

/**
 * Specifies one or more related aggregate(s) to compute in the summingbird job.
 *
 * @param inputSource Source to compute this aggregate over
 * @param preTransforms Sequence of [[com.twitter.ml.api.RichITransform]] that transform
 *        data records pre-aggregation (e.g. discretization, renaming)
 * @param samplingTransformOpt Optional [[OneToSomeTransform]] that transforms a data
 *        record to an optional data record (e.g. for sampling) before aggregation
 * @param aggregatePrefix Prefix to use for naming resultant aggregate features
 * @param keysToAggregate Features to group by when computing the aggregates
 *        (e.g. USER_ID, AUTHOR_ID)
 * @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
 * @param labels Labels to cross the features with to make pair features, if any.
 *        Use Label.All if you don't want to cross with a label.
 * @param metrics Aggregation metrics to compute (e.g. count, mean)
 * @param halfLives Half lives to use for the aggregations, to be crossed with the above.
 *        Use Duration.Top for "forever" aggregations over an infinite time window (no decay).
 * @param outputStore Store to output this aggregate to
 * @param includeAnyFeature Aggregate label counts for any feature value
 * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
 * @param aggExclusionRegex Sequence of regular expressions; any aggregate whose output
 *        feature name matches one of them is excluded
 *
 * The overall config for the summingbird job consists of a list of "AggregateGroup"
 * case class objects, which get translated into strongly typed "TypedAggregateGroup"
 * case class objects. A single TypedAggregateGroup always groups input data records from
 * ''inputSource'' by a single set of aggregation keys (''keysToAggregate'').
 * Within these groups, we perform a comprehensive cross of:
 *
 * ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
 *
 * All the resultant aggregate features are assigned a human-readable feature name
 * beginning with ''aggregatePrefix'', and are written to DataRecords that get
 * aggregated and written to the store specified by ''outputStore''.
 *
 * Illustrative example. Suppose we define our spec as follows:
 *
 * TypedAggregateGroup(
 *   inputSource = "timelines_recap_daily",
 *   aggregatePrefix = "user_author_aggregate",
 *   keysToAggregate = Set(USER_ID, AUTHOR_ID),
 *   featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
 *   labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
 *   metrics = Set(CountMetric, MeanMetric),
 *   halfLives = Set(7.Days, 30.Days),
 *   outputStore = "user_author_aggregate_store"
 * )
 *
 * This will process data records from the source named "timelines_recap_daily"
 * (see AggregateSource.scala for more details on how to add your own source).
 * It will produce a total of 2x2x2x2 = 16 aggregation features for the explicit cross
 * (the default includeAnyFeature and includeAnyLabel settings add further uncrossed
 * variants), named like:
 *
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
 *
 * ... (and so on)
 *
 * and all the result features will be stored in DataRecords, summed up, and written
 * to the output store defined by the name "user_author_aggregate_store"
 * (see AggregateStore.scala for details on how to add your own store).
 *
 * If you do not want a full cross, split up your config into multiple TypedAggregateGroup
 * objects. Splitting is strongly advised to avoid blowing up and creating invalid
 * or unnecessary combinations of aggregate features (note that some combinations
 * are useless or invalid, e.g. computing the mean of a binary feature). Splitting
 * also does not cost anything in terms of real-time performance, because all
 * Aggregate objects in the master spec that share the same ''keysToAggregate'',
 * ''inputSource'' and ''outputStore'' are grouped by the summingbird
 * job logic and stored in a single DataRecord in the output store. Overlapping
 * aggregates will also be deduplicated automatically, so don't worry about overlaps.
 */
case class TypedAggregateGroup[T](
  inputSource: AggregateSource,
  aggregatePrefix: String,
  keysToAggregate: Set[Feature[_]],
  featuresToAggregate: Set[Feature[T]],
  labels: Set[_ <: Feature[JBoolean]],
  metrics: Set[AggregationMetric[T, _]],
  halfLives: Set[Duration],
  outputStore: AggregateStore,
  preTransforms: Seq[OneToSomeTransform] = Seq.empty,
  includeAnyFeature: Boolean = true,
  includeAnyLabel: Boolean = true,
  aggExclusionRegex: Seq[String] = Seq.empty) {
  import TypedAggregateGroup._

  val compiledRegexes: Seq[Regex] = aggExclusionRegex.map(new Regex(_))

  // Returns true if the aggregate feature should be dropped (i.e. one of its output
  // feature names matches an exclusion regex), false if it should be kept.
  def filterOutAggregateFeature(
    feature: PrecomputedAggregateDescriptor[_],
    regexes: Seq[Regex]
  ): Boolean = {
    if (regexes.nonEmpty)
      feature.outputFeatures.exists { outputFeature =>
        regexes.exists { re => re.findFirstMatchIn(outputFeature.getDenseFeatureName).nonEmpty }
      }
    else false
  }
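
  // For example (hypothetical exclusion pattern): aggExclusionRegex = Seq(""".*\.30days$""")
  // drops every descriptor producing an output feature whose name ends in ".30days".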

  def buildAggregationKeys(
    dataRecord: DataRecord
  ): Set[AggregationKey] = {
    TypedAggregateGroup.buildAggregationKeys(dataRecord, keysToAggregate)
  }

  /**
   * This val precomputes descriptors for all individual aggregates in this group
   * (of type ''AggregateFeature''). It also precomputes the hashes of all aggregation
   * "output" features generated by these operators for faster
   * run-time performance (this turns out to be a primary CPU bottleneck).
   * E.g. for the mean operator, "sum" and "count" are output features.
   */
  val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
    /*
     * By default, in addition to all feature-label crosses, also
     * compute aggregates over each feature and label without crossing.
     */
    val labelOptions = labels.map(Option(_)) ++
      (if (includeAnyLabel) Set(None) else Set.empty)
    val featureOptions = featuresToAggregate.map(Option(_)) ++
      (if (includeAnyFeature) Set(None) else Set.empty)
    for {
      feature <- featureOptions
      label <- labelOptions
      metric <- metrics
      halfLife <- halfLives
    } yield {
      val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)

      val aggregateOutputFeatures = metric.getOutputFeatures(query)
      val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
      PrecomputedAggregateDescriptor(
        query,
        metric,
        aggregateOutputFeatures,
        aggregateOutputFeatureIds
      )
    }
  }.filterNot(filterOutAggregateFeature(_, compiledRegexes))
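
  // Illustration: with 2 features, 2 labels, 2 metrics and 2 half lives, and with
  // includeAnyFeature and includeAnyLabel left true, the cross above yields
  // (2 + 1) * (2 + 1) * 2 * 2 = 36 descriptors before regex-based exclusion.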

  /* Precomputes a map from all generated continuous aggregate feature ids to their half lives. */
  val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
    individualAggregateDescriptors.flatMap { descriptor =>
      descriptor.outputFeatures
        .flatMap { feature =>
          if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
            Try(feature.asInstanceOf[Feature[JDouble]]).toOption
              .map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
          } else None
        }
    }.toMap

  /*
   * Sparse binary keys become individual string keys in the output.
   * e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
   */
  val allOutputKeys: Set[Feature[_]] = keysToAggregate.map { key =>
    if (key.getFeatureType == FeatureType.SPARSE_BINARY) sparseFeature(key)
    else key
  }

  val allOutputFeatures: Set[Feature[_]] = individualAggregateDescriptors.flatMap {
    case PrecomputedAggregateDescriptor(_, _, outputFeatures, _) =>
      outputFeatures
  }

  val aggregateContext: FeatureContext = new FeatureContext(allOutputFeatures.toList.asJava)

  /**
   * Adds all aggregates in this group found in the two input data records
   * into a result, mutating the result. Uses a while loop for an
   * approximately 10% gain in speed over a for comprehension.
   *
   * WARNING: mutates ''result''
   *
   * @param result The output data record to mutate
   * @param left The left data record to add
   * @param right The right data record to add
   */
  def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
    val featureIterator = individualAggregateDescriptors.iterator
    while (featureIterator.hasNext) {
      val descriptor = featureIterator.next
      descriptor.metric.mutatePlus(
        result,
        left,
        right,
        descriptor.query,
        Some(descriptor.outputFeatureIds)
      )
    }
  }
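
  // Hypothetical usage, summing two partial aggregates monoid-style:
  //   val result = new DataRecord
  //   group.mutatePlus(result, partialA, partialB)
  //   // result now holds the combined contributions of both partial records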

  /**
   * Applies preTransforms sequentially. If any transform results in a dropped (None)
   * DataRecord, the entire transform sequence results in a dropped DataRecord.
   * Note that preTransforms are order-dependent.
   */
  private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
    val recordOpt = Option(new DataRecord(dataRecord))
    preTransforms.foldLeft(recordOpt) {
      case (Some(previousRecord), preTransform) =>
        preTransform(previousRecord)
      case _ => Option.empty[DataRecord]
    }
  }
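
  // For example, with preTransforms = Seq(t1, t2) (hypothetical transforms): if
  // t1(record) returns None, the remaining transforms are skipped (the fold just
  // propagates None); otherwise t2 is applied to t1's output.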

  /**
   * Given a data record, apply transforms and fetch the incremental contributions to
   * each configured aggregate from this data record, and store these in an output data record.
   *
   * @param dataRecord Input data record to aggregate.
   * @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
   *         AggregationKey indicating what keys we're grouping by, and whose second entry
   *         is an output data record with incremental contributions to the aggregate value(s)
   */
  def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] = {
    sequentiallyTransform(dataRecord)
      .flatMap { dataRecord =>
        val aggregationKeys = buildAggregationKeys(dataRecord)
        val increment = new DataRecord

        val isNonEmptyIncrement = individualAggregateDescriptors
          .map { descriptor =>
            descriptor.metric.setIncrement(
              output = increment,
              input = dataRecord,
              query = descriptor.query,
              timestampFeature = inputSource.timestampFeature,
              aggregateOutputs = Some(descriptor.outputFeatureIds)
            )
          }
          .exists(identity)

        if (isNonEmptyIncrement) {
          SRichDataRecord(increment).setFeatureValue(
            timestampFeature,
            getTimestamp(dataRecord, inputSource.timestampFeature)
          )
          Some(aggregationKeys.map(key => (key, increment)))
        } else {
          None
        }
      }
      .getOrElse(Set.empty[(AggregationKey, DataRecord)])
  }
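
  // Sketch (hypothetical record): a record keyed by USER_ID = 123L that carries a
  // label produces one (AggregationKey, DataRecord) pair, where the key binds
  // USER_ID to 123L and the increment record holds, e.g., a count contribution of 1
  // plus the record's timestamp, ready to be summed into the output store.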

  def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
    require(prefix.nonEmpty)

    allOutputFeatures.map { feature =>
      if (feature.isSetFeatureName) {
        val renamedFeatureName = prefix + feature.getDenseFeatureName
        val personalDataTypes =
          if (feature.getPersonalDataTypes.isPresent) feature.getPersonalDataTypes.get()
          else null

        val renamedFeature = feature.getFeatureType match {
          case FeatureType.BINARY =>
            new Feature.Binary(renamedFeatureName, personalDataTypes)
          case FeatureType.DISCRETE =>
            new Feature.Discrete(renamedFeatureName, personalDataTypes)
          case FeatureType.STRING =>
            new Feature.Text(renamedFeatureName, personalDataTypes)
          case FeatureType.CONTINUOUS =>
            new Feature.Continuous(renamedFeatureName, personalDataTypes)
          case FeatureType.SPARSE_BINARY =>
            new Feature.SparseBinary(renamedFeatureName, personalDataTypes)
          case FeatureType.SPARSE_CONTINUOUS =>
            new Feature.SparseContinuous(renamedFeatureName, personalDataTypes)
        }
        feature -> renamedFeature
      } else {
        feature -> feature
      }
    }.toMap
  }
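
  // Hypothetical usage: outputFeaturesToRenamedOutputFeatures("offline.") maps each
  // named output feature such as "user_author_aggregate...count.7days" to an
  // equivalent feature named "offline.user_author_aggregate...count.7days",
  // preserving its feature type and personal data types.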
}