mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-14 15:18:55 +02:00
197bf2c563
Open sourcing Aggregation Framework, a config-driven Summingbird based framework for generating real-time and batch aggregate features to be consumed by ML models.
51 lines
2.3 KiB
Scala
51 lines
2.3 KiB
Scala
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
|
|
|
import com.twitter.bijection.Bufferable
|
|
import com.twitter.bijection.Injection
|
|
import scala.util.Try
|
|
|
|
/**
 * Case class that represents the "grouping" key for any aggregate feature.
 * Used by Summingbird to output aggregates to the key-value "store" using sumByKey().
 *
 * Example 1: the user aggregate features in aggregatesv1 all group by USER_ID,
 * which is a discrete feature. When storing these features, the key would be:
 *
 * {{{
 * discreteFeaturesById = Map(hash(USER_ID) -> <the actual user id>), textFeaturesById = Map()
 * }}}
 *
 * Example 2: if aggregating grouped by USER_ID, AUTHOR_ID and tweet link url, the key would be:
 *
 * {{{
 * discreteFeaturesById = Map(hash(USER_ID) -> <actual user id>, hash(AUTHOR_ID) -> <actual author id>),
 * textFeaturesById = Map(hash(URL_FEATURE) -> <the link url>)
 * }}}
 *
 * Design note: a DataRecord could have been used for the key, but a case class keeps the
 * key strongly typed and restricts grouping to discrete and string features only.
 *
 * Efficiency note: storing the hash of the feature in addition to the feature value is
 * somewhat less efficient than storing only the feature value in the key, but it adds the
 * flexibility to group multiple types of aggregates in the same output store. If this turns
 * out not to be a good tradeoff, the decision can be reversed/refactored later.
 *
 * @param discreteFeaturesById All discrete feature ids (+ values) that are part of this key
 * @param textFeaturesById All string feature ids (+ values) that are part of this key
 */
case class AggregationKey(
  discreteFeaturesById: Map[Long, Long],
  textFeaturesById: Map[Long, String])
|
|
|
|
/**
 * A custom injection for [[AggregationKey]], so that Summingbird knows how to
 * store it in Manhattan.
 *
 * A key is serialized as the tuple of its two feature maps,
 * `(discreteFeaturesById, textFeaturesById)`, using [[featureMapsInjection]].
 */
object AggregationKeyInjection extends Injection[AggregationKey, Array[Byte]] {
  /* Injection from tuple representation of AggregationKey to Array[Byte] */
  val featureMapsInjection: Injection[(Map[Long, Long], Map[Long, String]), Array[Byte]] =
    Bufferable.injectionOf[(Map[Long, Long], Map[Long, String])]

  /**
   * Serializes the key by encoding its two feature maps as a tuple.
   *
   * The tuple is built directly from the fields rather than through
   * `AggregationKey.unapply(...).get`: this avoids a partial `Option.get` and an
   * intermediate allocation, and does not depend on the shape of the
   * compiler-generated `unapply`.
   *
   * @param aggregationKey the key to serialize
   * @return the bytes produced by [[featureMapsInjection]] for the key's feature maps
   */
  def apply(aggregationKey: AggregationKey): Array[Byte] =
    featureMapsInjection(
      (aggregationKey.discreteFeaturesById, aggregationKey.textFeaturesById))

  /**
   * Attempts to rebuild an [[AggregationKey]] from bytes.
   *
   * @param ab the serialized form, as produced by [[apply]]
   * @return the decoded key, or whatever failure [[featureMapsInjection]] reports
   *         when it cannot invert the bytes
   */
  def invert(ab: Array[Byte]): Try[AggregationKey] =
    featureMapsInjection.invert(ab).map {
      // Explicit constructor call instead of `AggregationKey.tupled`, which relies on
      // the companion object extending Function2.
      case (discreteFeaturesById, textFeaturesById) =>
        AggregationKey(discreteFeaturesById, textFeaturesById)
    }
}
|