the-algorithm/timelines/data_processing/ml_util/aggregation_framework/AggregateGroup.scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework

import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import java.lang.{Boolean => JBoolean}
import java.lang.{Long => JLong}
import scala.language.existentials

/**
 * A wrapper for [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]]
 * (see TypedAggregateGroup.scala) with some convenient syntactic sugar that avoids
 * the user having to specify different groups for different types of features.
 * Gets translated into multiple strongly typed TypedAggregateGroup(s)
 * by the buildTypedAggregateGroups() method defined below.
 *
 * @param inputSource Source to compute this aggregate over
 * @param aggregatePrefix Prefix to use for naming resultant aggregate features
 * @param keys Features to group by when computing the aggregates
 * (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary.
 * Grouping by a sparse binary feature is different than grouping by a discrete or string
 * feature. For example, if you have a sparse binary feature WORDS_IN_TWEET which is
 * a set of all words in a tweet, then grouping by this feature generates a
 * separate aggregate mean/count/etc for each value of the feature (each word), and
 * not just a single aggregate count for different "sets of words"
 * @param features Features to aggregate (e.g. blender_score or is_photo).
 * @param labels Labels to cross the features with to make pair features, if any.
 * @param metrics Aggregation metrics to compute (e.g. count, mean)
 * @param halfLives Half lives to use for the aggregations, to be crossed with the above.
 * use Duration.Top for "forever" aggregations over an infinite time window (no decay).
 * @param outputStore Store to output this aggregate to
 * @param preTransforms Sequence of [[OneToSomeTransform]] that is applied to
 * data records pre-aggregation (e.g. discretization, renaming). There is no separate
 * sampling-transform parameter on this class; an earlier revision of this comment
 * documented a `samplingTransformOpt` that is not part of the signature.
 * @param includeAnyFeature Aggregate label counts for any feature value
 * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
 * @param includeTimestampFeature compute max aggregate on timestamp feature
 * @param aggExclusionRegex Sequence of regexes forwarded unchanged to every feature-typed
 * TypedAggregateGroup. NOTE(review): the original description was cut off after
 * "which define features to"; presumably these select aggregate features to exclude --
 * confirm against TypedAggregateGroup.
 */
case class AggregateGroup(
  inputSource: AggregateSource,
  aggregatePrefix: String,
  keys: Set[Feature[_]],
  features: Set[Feature[_]],
  labels: Set[_ <: Feature[JBoolean]],
  metrics: Set[EasyMetric],
  halfLives: Set[Duration],
  outputStore: AggregateStore,
  preTransforms: Seq[OneToSomeTransform] = Seq.empty,
  includeAnyFeature: Boolean = true,
  includeAnyLabel: Boolean = true,
  includeTimestampFeature: Boolean = false,
  aggExclusionRegex: Seq[String] = Seq.empty) {

  /**
   * Builds one TypedAggregateGroup for a set of features that all share `featureType`.
   *
   * @param easyMetrics metrics to resolve for `featureType`; metrics whose
   * `forFeatureType` yields nothing for that type are dropped by the `flatMap`
   * @param featuresOfType features to aggregate; each one is cast to `Feature[T]`
   * @param featureType feature type used to resolve the underlying metrics
   * @tparam T value type of the features in this group. It is not checked at runtime:
   * the cast below is unchecked, so the caller must pass features that really share
   * one feature type.
   * @return the typed group, sharing this AggregateGroup's source, prefix, keys, labels,
   * half lives, store, pre-transforms, include flags and exclusion regexes
   */
  private def toStrongType[T](
    easyMetrics: Set[EasyMetric],
    featuresOfType: Set[Feature[_]],
    featureType: FeatureType
  ): TypedAggregateGroup[_] = {
    val underlyingMetrics: Set[AggregationMetric[T, _]] =
      easyMetrics.flatMap(_.forFeatureType[T](featureType))
    val underlyingFeatures: Set[Feature[T]] =
      featuresOfType.map(_.asInstanceOf[Feature[T]])

    // Every argument is named, matching timestampTypedAggregateGroup below, so the three
    // trailing settings cannot be silently mis-ordered.
    TypedAggregateGroup[T](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = underlyingFeatures,
      labels = labels,
      metrics = underlyingMetrics,
      halfLives = halfLives,
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature = includeAnyFeature,
      includeAnyLabel = includeAnyLabel,
      aggExclusionRegex = aggExclusionRegex
    )
  }

  /**
   * Builds the group that computes a max aggregate over
   * `TypedAggregateGroup.timestampFeature`.
   *
   * Unlike the feature groups, it is crossed with no labels, uses only `Duration.Top`
   * as half life, disables `includeAnyFeature`, enables `includeAnyLabel` and applies
   * no exclusion regexes, regardless of this AggregateGroup's own settings.
   *
   * @throws NoSuchElementException if MaxMetric provides no metric for the timestamp
   * feature's type
   */
  private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = {
    val timestampFeatureType = TypedAggregateGroup.timestampFeature.getFeatureType
    // Same exception type a bare Option.get would throw, but with a message that names
    // the actual problem instead of "None.get".
    val maxMetric: AggregationMetric[JLong, _] =
      MaxMetric
        .forFeatureType[JLong](timestampFeatureType)
        .getOrElse(
          throw new NoSuchElementException(
            s"MaxMetric is not defined for timestamp feature type $timestampFeatureType"))
    val timestampMetrics: Set[AggregationMetric[JLong, _]] = Set(maxMetric)

    TypedAggregateGroup[JLong](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = Set(TypedAggregateGroup.timestampFeature),
      labels = Set.empty,
      metrics = timestampMetrics,
      halfLives = Set(Duration.Top),
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature = false,
      includeAnyLabel = true,
      aggExclusionRegex = Seq.empty
    )
  }

  /**
   * Translates this group into its strongly typed TypedAggregateGroup(s).
   *
   * @return one group per distinct feature type found in `features` (or a single group
   * when `features` is empty), followed by the timestamp group when
   * `includeTimestampFeature` is set
   */
  def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = {
    val featureGroups =
      if (features.isEmpty) {
        // With no features a single group is still emitted; FeatureType.BINARY is only
        // used to resolve the metrics. Presumably this keeps label-only aggregates
        // available -- confirm against TypedAggregateGroup.
        List(toStrongType(metrics, features, FeatureType.BINARY))
      } else {
        features
          .groupBy(_.getFeatureType())
          .toList
          .map {
            case (featureType, featuresOfType) =>
              toStrongType(metrics, featuresOfType, featureType)
          }
      }

    val timestampGroups =
      if (includeTimestampFeature) List(timestampTypedAggregateGroup) else List()

    featureGroups ++ timestampGroups
  }
}
Open-sourcing Timelines Aggregation Framework Open sourcing Aggregation Framework, a config-driven Summingbird based framework for generating real-time and batch aggregate features to be consumed by ML models. 2023-04-27 22:58:07 +02:00			`package com.twitter.timelines.data_processing.ml_util.aggregation_framework`

			`import com.twitter.ml.api._`
			`import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric`
			`import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric`
			`import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric`
			`import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform`
			`import com.twitter.util.Duration`
			`import java.lang.{Boolean => JBoolean}`
			`import java.lang.{Long => JLong}`
			`import scala.language.existentials`

			`/**`
			`* A wrapper for [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]]`
			`* (see TypedAggregateGroup.scala) with some convenient syntactic sugar that avoids`
			`* the user having to specify different groups for different types of features.`
			`* Gets translated into multiple strongly typed TypedAggregateGroup(s)`
			`* by the buildTypedAggregateGroups() method defined below.`
			`*`
			`* @param inputSource Source to compute this aggregate over`
			`* @param preTransforms Sequence of [[ITransform]] that is applied to`
			`* data records pre-aggregation (e.g. discretization, renaming)`
			`* @param samplingTransformOpt Optional [[OneToSomeTransform]] that samples data record`
			`* @param aggregatePrefix Prefix to use for naming resultant aggregate features`
			`* @param keys Features to group by when computing the aggregates`
			`* (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary.`
			`* Grouping by a sparse binary feature is different than grouping by a discrete or string`
			`* feature. For example, if you have a sparse binary feature WORDS_IN_TWEET which is`
			`* a set of all words in a tweet, then grouping by this feature generates a`
			`* separate aggregate mean/count/etc for each value of the feature (each word), and`
			`* not just a single aggregate count for different "sets of words"`
			`* @param features Features to aggregate (e.g. blender_score or is_photo).`
			`* @param labels Labels to cross the features with to make pair features, if any.`
			`* @param metrics Aggregation metrics to compute (e.g. count, mean)`
			`* @param halfLives Half lives to use for the aggregations, to be crossed with the above.`
			`* use Duration.Top for "forever" aggregations over an infinite time window (no decay).`
			`* @param outputStore Store to output this aggregate to`
			`* @param includeAnyFeature Aggregate label counts for any feature value`
			`* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)`
			`* @param includeTimestampFeature compute max aggregate on timestamp feature`
			`* @param aggExclusionRegex Sequence of Regexes, which define features to`
			`*/`
			`case class AggregateGroup(`
			`inputSource: AggregateSource,`
			`aggregatePrefix: String,`
			`keys: Set[Feature[_]],`
			`features: Set[Feature[_]],`
			`labels: Set[_ <: Feature[JBoolean]],`
			`metrics: Set[EasyMetric],`
			`halfLives: Set[Duration],`
			`outputStore: AggregateStore,`
			`preTransforms: Seq[OneToSomeTransform] = Seq.empty,`
			`includeAnyFeature: Boolean = true,`
			`includeAnyLabel: Boolean = true,`
			`includeTimestampFeature: Boolean = false,`
			`aggExclusionRegex: Seq[String] = Seq.empty) {`

			`private def toStrongType[T](`
			`metrics: Set[EasyMetric],`
			`features: Set[Feature[_]],`
			`featureType: FeatureType`
			`): TypedAggregateGroup[_] = {`
			`val underlyingMetrics: Set[AggregationMetric[T, _]] =`
			`metrics.flatMap(_.forFeatureType[T](featureType))`
			`val underlyingFeatures: Set[Feature[T]] = features`
			`.map(_.asInstanceOf[Feature[T]])`

			`TypedAggregateGroup[T](`
			`inputSource = inputSource,`
			`aggregatePrefix = aggregatePrefix,`
			`keysToAggregate = keys,`
			`featuresToAggregate = underlyingFeatures,`
			`labels = labels,`
			`metrics = underlyingMetrics,`
			`halfLives = halfLives,`
			`outputStore = outputStore,`
			`preTransforms = preTransforms,`
			`includeAnyFeature,`
			`includeAnyLabel,`
			`aggExclusionRegex`
			`)`
			`}`

			`private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = {`
			`val metrics: Set[AggregationMetric[JLong, _]] =`
			`Set(MaxMetric.forFeatureType[JLong](TypedAggregateGroup.timestampFeature.getFeatureType).get)`

			`TypedAggregateGroup[JLong](`
			`inputSource = inputSource,`
			`aggregatePrefix = aggregatePrefix,`
			`keysToAggregate = keys,`
			`featuresToAggregate = Set(TypedAggregateGroup.timestampFeature),`
			`labels = Set.empty,`
			`metrics = metrics,`
			`halfLives = Set(Duration.Top),`
			`outputStore = outputStore,`
			`preTransforms = preTransforms,`
			`includeAnyFeature = false,`
			`includeAnyLabel = true,`
			`aggExclusionRegex = Seq.empty`
			`)`
			`}`

			`def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = {`
			`val typedAggregateGroupsList = {`
			`if (features.isEmpty) {`
			`List(toStrongType(metrics, features, FeatureType.BINARY))`
			`} else {`
			`features`
			`.groupBy(_.getFeatureType())`
			`.toList`
			`.map {`
			`case (featureType, features) =>`
			`toStrongType(metrics, features, featureType)`
			`}`
			`}`
			`}`

			`val optionalTimestampTypedAggregateGroup =`
			`if (includeTimestampFeature) List(timestampTypedAggregateGroup) else List()`

			`typedAggregateGroupsList ++ optionalTimestampTypedAggregateGroup`
			`}`
			`}`