package com.twitter.timelines.data_processing.ml_util.aggregation_framework

import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import com.twitter.util.Try
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Set => JSet}
import scala.annotation.tailrec
import scala.language.existentials
import scala.collection.JavaConverters._
import scala.util.matching.Regex

/**
 * A case class containing precomputed data useful to quickly
 * process operations over an aggregate.
 *
 * @param query The underlying feature being aggregated
 * @param metric The aggregation metric
 * @param outputFeatures The output features that aggregation will produce
 * @param outputFeatureIds The precomputed hashes of the above outputFeatures
 */
case class PrecomputedAggregateDescriptor[T](
  query: AggregateFeature[T],
  metric: AggregationMetric[T, _],
  outputFeatures: List[Feature[_]],
  outputFeatureIds: List[JLong])

object TypedAggregateGroup {

  /**
   * Recursive function that generates all combinations of value
   * assignments for a collection of sparse binary features.
   *
   * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
   * @return A set of maps, where each map represents one possible assignment of values to ids
   */
  def sparseBinaryPermutations(
    sparseBinaryIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = sparseBinaryIdValues match {
    case (id, values) +: rest =>
      tailRecSparseBinaryPermutations(
        existingPermutations = values.map(value => Map(id -> value)),
        remainingIdValues = rest
      )
    case Nil => Set.empty
  }
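
  // Illustration (hypothetical ids and values): two sparse binary keys with ids 1L
  // and 2L, taking values {"a", "b"} and {"x"} respectively, expand to every
  // combination of assignments:
  //   sparseBinaryPermutations(List(1L -> Set("a", "b"), 2L -> Set("x")))
  //   == Set(Map(1L -> "a", 2L -> "x"), Map(1L -> "b", 2L -> "x"))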

  @tailrec private[this] def tailRecSparseBinaryPermutations(
    existingPermutations: Set[Map[Long, String]],
    remainingIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] = remainingIdValues match {
    case Nil => existingPermutations
    case (id, values) +: rest =>
      tailRecSparseBinaryPermutations(
        existingPermutations.flatMap { existingIdValueMap =>
          values.map(value => existingIdValueMap ++ Map(id -> value))
        },
        rest
      )
  }

  val SparseFeatureSuffix = ".member"
  def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] =
    new Feature.Text(
      sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix,
      AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
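
  // For example, a sparse binary key named "words.in.tweet" is replaced in the
  // output by a string key named "words.in.tweet.member", one per member value.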

  /*
   * Casts obj to U, throwing an IllegalArgumentException if it is not an instance
   * of U. Note that because U is erased at runtime, the isInstanceOf check is
   * unchecked for generic types and guards only against gross type mismatches.
   */
  private[this] def validate[U](obj: Any): U = {
    require(obj.isInstanceOf[U])
    obj.asInstanceOf[U]
  }

  private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] =
    Option(SRichDataRecord(dataRecord).getFeatureValue(feature)).map(validate[U](_))

  /**
   * Get a mapping from feature ids
   * (including individual sparse elements of a sparse feature) to values
   * from the given data record, for a given feature type.
   *
   * @param dataRecord Data record to get features from
   * @param keysToAggregate key features to get id-value mappings for
   * @param featureType Feature type to get id-value maps for
   */
  def getKeyFeatureIdValues[U](
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]],
    featureType: FeatureType
  ): Set[(Long, Option[U])] = {
    val featuresOfThisType: Set[Feature[U]] = keysToAggregate
      .filter(_.getFeatureType == featureType)
      .map(validate[Feature[U]])

    featuresOfThisType
      .map { feature: Feature[U] =>
        val featureId: Long = getDenseFeatureId(feature)
        val featureOpt: Option[U] = getFeatureOpt(dataRecord, feature)
        (featureId, featureOpt)
      }
  }
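
  // Illustration (hypothetical USER_ID feature): for a record where USER_ID = 123L,
  //   getKeyFeatureIdValues[Long](record, Set(USER_ID), FeatureType.DISCRETE)
  //   == Set((USER_ID.getDenseFeatureId, Some(123L)))
  // A key feature missing from the record yields (id, None) instead.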

  // TypedAggregateGroup may transform the aggregate keys for internal use. This method generates
  // denseFeatureIds for the transformed feature.
  def getDenseFeatureId(feature: Feature[_]): Long =
    if (feature.getFeatureType != FeatureType.SPARSE_BINARY) {
      feature.getDenseFeatureId
    } else {
      sparseFeature(feature).getDenseFeatureId
    }

  /**
   * Return denseFeatureIds for the input features after applying the custom transformation that
   * TypedAggregateGroup applies to its keysToAggregate.
   *
   * @param keysToAggregate key features to get ids for
   */
  def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
    keysToAggregate.map(getDenseFeatureId)

  def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
    featureIdValueMap.forall { case (_, valueOpt) => valueOpt.isDefined }

  def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
    featureIdValueMap
      .flatMap {
        case (id, valueOpt) =>
          valueOpt.map { value => (id, value) }
      }
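
  // For example: liftOptions(Map(1L -> Some("a"), 2L -> None)) == Map(1L -> "a"),
  // while checkIfAllKeysExist on the same input is false because one value is None.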

  val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP

  /**
   * Builds all valid aggregation keys (for the output store) from
   * a data record and a spec listing the keys to aggregate. There
   * can be multiple aggregation keys generated from a single data
   * record when grouping by sparse binary features, for which multiple
   * values can be set within the data record.
   *
   * @param dataRecord Data record to read values for key features from
   * @param keysToAggregate Key features whose values are encoded into the keys
   * @return A set of AggregationKeys encoding the values of all keys
   */
  def buildAggregationKeys(
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]]
  ): Set[AggregationKey] = {
    val discreteAggregationKeys = getKeyFeatureIdValues[Long](
      dataRecord,
      keysToAggregate,
      FeatureType.DISCRETE
    ).toMap

    val textAggregationKeys = getKeyFeatureIdValues[String](
      dataRecord,
      keysToAggregate,
      FeatureType.STRING
    ).toMap

    val sparseBinaryIdValues = getKeyFeatureIdValues[JSet[String]](
      dataRecord,
      keysToAggregate,
      FeatureType.SPARSE_BINARY
    ).map {
      case (id, values) =>
        (
          id,
          values
            .map(_.asScala.toSet)
            .getOrElse(Set.empty[String])
        )
    }.toList

    if (checkIfAllKeysExist(discreteAggregationKeys) &&
      checkIfAllKeysExist(textAggregationKeys)) {
      if (sparseBinaryIdValues.nonEmpty) {
        sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
          AggregationKey(
            discreteFeaturesById = liftOptions(discreteAggregationKeys),
            textFeaturesById = liftOptions(textAggregationKeys) ++ sparseBinaryTextKeys
          )
        }
      } else {
        Set(
          AggregationKey(
            discreteFeaturesById = liftOptions(discreteAggregationKeys),
            textFeaturesById = liftOptions(textAggregationKeys)
          )
        )
      }
    } else Set.empty[AggregationKey]
  }
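
  // Illustration (hypothetical keys): grouping by a discrete USER_ID present in the
  // record plus a sparse binary feature currently holding {"foo", "bar"} yields two
  // AggregationKeys, one binding the derived text key to "foo" and one to "bar".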
}

/**
 * Specifies one or more related aggregate(s) to compute in the summingbird job.
 *
 * @param inputSource Source to compute this aggregate over
 * @param preTransforms Sequence of [[com.twitter.ml.api.RichITransform]] that transform
 *        data records pre-aggregation (e.g. discretization, renaming)
 * @param samplingTransformOpt Optional [[OneToSomeTransform]] that transforms a data
 *        record to an optional data record (e.g. for sampling) before aggregation
 * @param aggregatePrefix Prefix to use for naming resultant aggregate features
 * @param keysToAggregate Features to group by when computing the aggregates
 *        (e.g. USER_ID, AUTHOR_ID)
 * @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
 * @param labels Labels to cross the features with to make pair features, if any.
 *        Use Label.All if you don't want to cross with a label.
 * @param metrics Aggregation metrics to compute (e.g. count, mean)
 * @param halfLives Half lives to use for the aggregations, to be crossed with the above.
 *        Use Duration.Top for "forever" aggregations over an infinite time window (no decay).
 * @param outputStore Store to output this aggregate to
 * @param includeAnyFeature Aggregate label counts for any feature value
 * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
 * @param aggExclusionRegex Sequence of regular expressions; any aggregate whose output
 *        feature name matches one of them is excluded
 *
 * The overall config for the summingbird job consists of a list of "AggregateGroup"
 * case class objects, which get translated into strongly typed "TypedAggregateGroup"
 * case class objects. A single TypedAggregateGroup always groups input data records from
 * ''inputSource'' by a single set of aggregation keys (''keysToAggregate'').
 * Within these groups, we perform a comprehensive cross of:
 *
 * ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
 *
 * All the resultant aggregate features are assigned a human-readable feature name
 * beginning with ''aggregatePrefix'', and are written to DataRecords that get
 * aggregated and written to the store specified by ''outputStore''.
 *
 * Illustrative example. Suppose we define our spec as follows:
 *
 * TypedAggregateGroup(
 *   inputSource = "timelines_recap_daily",
 *   aggregatePrefix = "user_author_aggregate",
 *   keysToAggregate = Set(USER_ID, AUTHOR_ID),
 *   featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
 *   labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
 *   metrics = Set(CountMetric, MeanMetric),
 *   halfLives = Set(7.Days, 30.Days),
 *   outputStore = "user_author_aggregate_store"
 * )
 *
 * This will process data records from the source named "timelines_recap_daily"
 * (see AggregateSource.scala for more details on how to add your own source).
 * It will produce a total of 2x2x2x2 = 16 aggregation features for the explicit cross
 * (the default includeAnyFeature and includeAnyLabel settings add further uncrossed
 * variants), named like:
 *
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
 * user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
 *
 * ... (and so on)
 *
 * and all the result features will be stored in DataRecords, summed up, and written
 * to the output store defined by the name "user_author_aggregate_store"
 * (see AggregateStore.scala for details on how to add your own store).
 *
 * If you do not want a full cross, split up your config into multiple TypedAggregateGroup
 * objects. Splitting is strongly advised to avoid blowing up and creating invalid
 * or unnecessary combinations of aggregate features (note that some combinations
 * are useless or invalid, e.g. computing the mean of a binary feature). Splitting
 * also does not cost anything in terms of real-time performance, because all
 * Aggregate objects in the master spec that share the same ''keysToAggregate'',
 * ''inputSource'' and ''outputStore'' are grouped by the summingbird
 * job logic and stored in a single DataRecord in the output store. Overlapping
 * aggregates will also be deduplicated automatically, so don't worry about overlaps.
 */
case class TypedAggregateGroup[T](
  inputSource: AggregateSource,
  aggregatePrefix: String,
  keysToAggregate: Set[Feature[_]],
  featuresToAggregate: Set[Feature[T]],
  labels: Set[_ <: Feature[JBoolean]],
  metrics: Set[AggregationMetric[T, _]],
  halfLives: Set[Duration],
  outputStore: AggregateStore,
  preTransforms: Seq[OneToSomeTransform] = Seq.empty,
  includeAnyFeature: Boolean = true,
  includeAnyLabel: Boolean = true,
  aggExclusionRegex: Seq[String] = Seq.empty) {
  import TypedAggregateGroup._

  val compiledRegexes: Seq[Regex] = aggExclusionRegex.map(new Regex(_))

  // Returns true if the aggregate feature should be dropped (i.e. one of its output
  // feature names matches an exclusion regex), false if it should be kept.
  def filterOutAggregateFeature(
    feature: PrecomputedAggregateDescriptor[_],
    regexes: Seq[Regex]
  ): Boolean = {
    if (regexes.nonEmpty)
      feature.outputFeatures.exists { outputFeature =>
        regexes.exists { re => re.findFirstMatchIn(outputFeature.getDenseFeatureName).nonEmpty }
      }
    else false
  }
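
  // For example (hypothetical exclusion pattern): aggExclusionRegex = Seq(""".*\.30days$""")
  // drops every descriptor producing an output feature whose name ends in ".30days".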

  def buildAggregationKeys(
    dataRecord: DataRecord
  ): Set[AggregationKey] = {
    TypedAggregateGroup.buildAggregationKeys(dataRecord, keysToAggregate)
  }

  /**
   * This val precomputes descriptors for all individual aggregates in this group
   * (of type ''AggregateFeature''). It also precomputes the hashes of all aggregation
   * "output" features generated by these operators for faster
   * run-time performance (this turns out to be a primary CPU bottleneck).
   * E.g. for the mean operator, "sum" and "count" are output features.
   */
  val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
    /*
     * By default, in addition to all feature-label crosses, also
     * compute aggregates over each feature and label without crossing.
     */
    val labelOptions = labels.map(Option(_)) ++
      (if (includeAnyLabel) Set(None) else Set.empty)
    val featureOptions = featuresToAggregate.map(Option(_)) ++
      (if (includeAnyFeature) Set(None) else Set.empty)
    for {
      feature <- featureOptions
      label <- labelOptions
      metric <- metrics
      halfLife <- halfLives
    } yield {
      val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)

      val aggregateOutputFeatures = metric.getOutputFeatures(query)
      val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
      PrecomputedAggregateDescriptor(
        query,
        metric,
        aggregateOutputFeatures,
        aggregateOutputFeatureIds
      )
    }
  }.filterNot(filterOutAggregateFeature(_, compiledRegexes))
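
  // Illustration: with 2 features, 2 labels, 2 metrics and 2 half lives, and with
  // includeAnyFeature and includeAnyLabel left true, the cross above yields
  // (2 + 1) * (2 + 1) * 2 * 2 = 36 descriptors before regex-based exclusion.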

  /* Precomputes a map from all generated continuous aggregate feature ids to their half lives. */
  val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
    individualAggregateDescriptors.flatMap { descriptor =>
      descriptor.outputFeatures
        .flatMap { feature =>
          if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
            Try(feature.asInstanceOf[Feature[JDouble]]).toOption
              .map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
          } else None
        }
    }.toMap

  /*
   * Sparse binary keys become individual string keys in the output.
   * e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
   */
  val allOutputKeys: Set[Feature[_]] = keysToAggregate.map { key =>
    if (key.getFeatureType == FeatureType.SPARSE_BINARY) sparseFeature(key)
    else key
  }

  val allOutputFeatures: Set[Feature[_]] = individualAggregateDescriptors.flatMap {
    case PrecomputedAggregateDescriptor(_, _, outputFeatures, _) =>
      outputFeatures
  }

  val aggregateContext: FeatureContext = new FeatureContext(allOutputFeatures.toList.asJava)

  /**
   * Adds all aggregates in this group found in the two input data records
   * into a result, mutating the result. Uses a while loop for an
   * approximately 10% gain in speed over a for comprehension.
   *
   * WARNING: mutates ''result''
   *
   * @param result The output data record to mutate
   * @param left The left data record to add
   * @param right The right data record to add
   */
  def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
    val featureIterator = individualAggregateDescriptors.iterator
    while (featureIterator.hasNext) {
      val descriptor = featureIterator.next
      descriptor.metric.mutatePlus(
        result,
        left,
        right,
        descriptor.query,
        Some(descriptor.outputFeatureIds)
      )
    }
  }
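
  // Hypothetical usage, summing two partial aggregates monoid-style:
  //   val result = new DataRecord
  //   group.mutatePlus(result, partialA, partialB)
  //   // result now holds the combined contributions of both partial records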

  /**
   * Applies preTransforms sequentially. If any transform results in a dropped (None)
   * DataRecord, the entire transform sequence results in a dropped DataRecord.
   * Note that preTransforms are order-dependent.
   */
  private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
    val recordOpt = Option(new DataRecord(dataRecord))
    preTransforms.foldLeft(recordOpt) {
      case (Some(previousRecord), preTransform) =>
        preTransform(previousRecord)
      case _ => Option.empty[DataRecord]
    }
  }
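
  // For example, with preTransforms = Seq(t1, t2) (hypothetical transforms): if
  // t1(record) returns None, the remaining transforms are skipped (the fold just
  // propagates None); otherwise t2 is applied to t1's output.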

  /**
   * Given a data record, apply transforms and fetch the incremental contributions to
   * each configured aggregate from this data record, and store these in an output data record.
   *
   * @param dataRecord Input data record to aggregate.
   * @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
   *         AggregationKey indicating what keys we're grouping by, and whose second entry
   *         is an output data record with incremental contributions to the aggregate value(s)
   */
  def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] = {
    sequentiallyTransform(dataRecord)
      .flatMap { dataRecord =>
        val aggregationKeys = buildAggregationKeys(dataRecord)
        val increment = new DataRecord

        val isNonEmptyIncrement = individualAggregateDescriptors
          .map { descriptor =>
            descriptor.metric.setIncrement(
              output = increment,
              input = dataRecord,
              query = descriptor.query,
              timestampFeature = inputSource.timestampFeature,
              aggregateOutputs = Some(descriptor.outputFeatureIds)
            )
          }
          .exists(identity)

        if (isNonEmptyIncrement) {
          SRichDataRecord(increment).setFeatureValue(
            timestampFeature,
            getTimestamp(dataRecord, inputSource.timestampFeature)
          )
          Some(aggregationKeys.map(key => (key, increment)))
        } else {
          None
        }
      }
      .getOrElse(Set.empty[(AggregationKey, DataRecord)])
  }
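
  // Sketch (hypothetical record): a record keyed by USER_ID = 123L that carries a
  // label produces one (AggregationKey, DataRecord) pair, where the key binds
  // USER_ID to 123L and the increment record holds, e.g., a count contribution of 1
  // plus the record's timestamp, ready to be summed into the output store.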

  def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
    require(prefix.nonEmpty)

    allOutputFeatures.map { feature =>
      if (feature.isSetFeatureName) {
        val renamedFeatureName = prefix + feature.getDenseFeatureName
        val personalDataTypes =
          if (feature.getPersonalDataTypes.isPresent) feature.getPersonalDataTypes.get()
          else null

        val renamedFeature = feature.getFeatureType match {
          case FeatureType.BINARY =>
            new Feature.Binary(renamedFeatureName, personalDataTypes)
          case FeatureType.DISCRETE =>
            new Feature.Discrete(renamedFeatureName, personalDataTypes)
          case FeatureType.STRING =>
            new Feature.Text(renamedFeatureName, personalDataTypes)
          case FeatureType.CONTINUOUS =>
            new Feature.Continuous(renamedFeatureName, personalDataTypes)
          case FeatureType.SPARSE_BINARY =>
            new Feature.SparseBinary(renamedFeatureName, personalDataTypes)
          case FeatureType.SPARSE_CONTINUOUS =>
            new Feature.SparseContinuous(renamedFeatureName, personalDataTypes)
        }
        feature -> renamedFeature
      } else {
        feature -> feature
      }
    }.toMap
  }
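
  // Hypothetical usage: outputFeaturesToRenamedOutputFeatures("offline.") maps each
  // named output feature such as "user_author_aggregate...count.7days" to an
  // equivalent feature named "offline.user_author_aggregate...count.7days",
  // preserving its feature type and personal data types.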
}