the-algorithm/timelines/data_processing/ml_util/aggregation_framework/scalding/AggregatesStoreComparisonJob.scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.algebird.ScMapMonoid
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.ml.api.util.CompactDataRecordConverter
import com.twitter.ml.api.CompactDataRecord
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.Args
import com.twitter.scalding.Days
import com.twitter.scalding.Duration
import com.twitter.scalding.RichDate
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import com.twitter.scalding_internal.job.HasDateRange
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchJob
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._
/**
* The job takes four inputs:
* - The path to an AggregateStore using the DataRecord format.
* - The path to an AggregateStore using the CompactDataRecord format.
* - A version that must be present in both sources.
* - A sink to which the comparison statistics are written.
*
* The job reads the two stores, converts the second one to DataRecords and
* then compares each key to check whether the two stores hold identical
* DataRecords, modulo the precision loss of the Double -> Float -> Double conversion.
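*
* Illustrative invocation (the launcher is omitted and the argument values are
* hypothetical; only the flag names are taken from this job):
* {{{
* ... --firstTime 2023-01-01 \
*     --dataRecordSource /path/to/data_record_store \
*     --compactDataRecordSource /path/to/compact_data_record_store \
*     --version 1682640000000 \
*     --sink /path/to/comparison_stats
* }}}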
*/
class AggregatesStoreComparisonJob(args: Args)
extends AnalyticsBatchJob(args)
with BatchPairImplicits
with HasDateRange {
import AggregatesStoreComparisonJob._
override def batchIncrement: Duration = Days(1)
override def firstTime: RichDate = RichDate(args("firstTime"))
private val dataRecordSourcePath = args("dataRecordSource")
private val compactDataRecordSourcePath = args("compactDataRecordSource")
private val version = args.long("version")
private val statsSink = args("sink")
require(
  dataRecordSourcePath != compactDataRecordSourcePath,
  "The two store paths must be different")
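// Pin both sources to the same explicit version so the comparison reads
// matching snapshots of the two stores.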
private val dataRecordSource =
VersionedKeyValSource[AggregationKey, (BatchID, DataRecord)](
path = dataRecordSourcePath,
sourceVersion = Some(version)
)
private val compactDataRecordSource =
VersionedKeyValSource[AggregationKey, (BatchID, CompactDataRecord)](
path = compactDataRecordSourcePath,
sourceVersion = Some(version)
)
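// Re-key both stores by (AggregationKey, BatchID) so their records can be
// joined; records from the compact store are converted to DataRecords first.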
private val dataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe
.from(dataRecordSource)
.map { case (key, (batchId, record)) => ((key, batchId), record) }
private val compactDataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe
.from(compactDataRecordSource)
.map {
case (key, (batchId, compactRecord)) =>
val record = compactConverter.compactDataRecordToDataRecord(compactRecord)
((key, batchId), record)
}
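// Outer-join the two pipes, compare each pair of records, sum the per-record
// counter maps with the map monoid, and write the totals to the TSV sink.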
dataRecordPipe
.outerJoin(compactDataRecordPipe)
.mapValues { case (leftOpt, rightOpt) => compareDataRecords(leftOpt, rightOpt) }
.values
.sum(mapMonoid)
.flatMap(_.toList)
.write(TypedTsv(statsSink))
}
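/**
* Counter names and record-comparison helpers used by [[AggregatesStoreComparisonJob]].
*/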
object AggregatesStoreComparisonJob {
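// Monoid used to sum the per-record counter maps into job-level totals.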
val mapMonoid: ScMapMonoid[String, Long] = new ScMapMonoid[String, Long]()
implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
implicit private val aggregationKeyOrdering: Ordering[AggregationKey] = AggregationKeyOrdering
implicit private val dataRecordCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
implicit private val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] =
CompactThriftCodec[CompactDataRecord]
private val compactConverter = new CompactDataRecordConverter
val missingRecordFromLeft = "missingRecordFromLeft"
val missingRecordFromRight = "missingRecordFromRight"
val nonContinuousFeaturesDidNotMatch = "nonContinuousFeaturesDidNotMatch"
val missingFeaturesFromLeft = "missingFeaturesFromLeft"
val missingFeaturesFromRight = "missingFeaturesFromRight"
val recordsWithUnmatchedKeys = "recordsWithUnmatchedKeys"
val featureValuesMatched = "featureValuesMatched"
val featureValuesThatDidNotMatch = "featureValuesThatDidNotMatch"
val equalRecords = "equalRecords"
val keyCount = "keyCount"
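/**
* Compares a pair of optional records that share an (AggregationKey, BatchID)
* and returns a map of counter name -> count. After the outer join at least
* one side is always present, so the (None, None) case is unreachable.
*/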
def compareDataRecords(
leftOpt: Option[DataRecord],
rightOpt: Option[DataRecord]
): collection.Map[String, Long] = {
val stats = collection.Map((keyCount, 1L))
(leftOpt, rightOpt) match {
case (Some(left), Some(right)) =>
if (isIdenticalNonContinuousFeatureSet(left, right)) {
getContinuousFeaturesStats(left, right).foldLeft(stats)(mapMonoid.add)
} else {
mapMonoid.add(stats, (nonContinuousFeaturesDidNotMatch, 1L))
}
case (Some(_), None) => mapMonoid.add(stats, (missingRecordFromRight, 1L))
case (None, Some(_)) => mapMonoid.add(stats, (missingRecordFromLeft, 1L))
case (None, None) => throw new IllegalArgumentException("Should never be possible")
}
}
/**
* Compares the continuous features of the two records, which are only expected
* to match approximately, and returns the corresponding counters.
*/
private def getContinuousFeaturesStats(
left: DataRecord,
right: DataRecord
): Seq[(String, Long)] = {
val leftFeatures = Option(left.getContinuousFeatures)
.map(_.asScala.toMap)
.getOrElse(Map.empty[JLong, JDouble])
val rightFeatures = Option(right.getContinuousFeatures)
.map(_.asScala.toMap)
.getOrElse(Map.empty[JLong, JDouble])
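// Count the continuous feature ids present on one side but missing on the other.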
val numMissingFeaturesLeft = (rightFeatures.keySet diff leftFeatures.keySet).size
val numMissingFeaturesRight = (leftFeatures.keySet diff rightFeatures.keySet).size
if (numMissingFeaturesLeft == 0 && numMissingFeaturesRight == 0) {
val Epsilon = 1e-5
val numUnmatchedValues = leftFeatures.map {
case (id, lValue) =>
val rValue = rightFeatures(id)
// The approximate match is to account for the precision loss due to
// the Double -> Float -> Double conversion.
if (math.abs(lValue - rValue) <= Epsilon) 0L else 1L
}.sum
if (numUnmatchedValues == 0) {
Seq(
(equalRecords, 1L),
(featureValuesMatched, leftFeatures.size.toLong)
)
} else {
Seq(
(featureValuesThatDidNotMatch, numUnmatchedValues),
(
featureValuesMatched,
math.max(leftFeatures.size, rightFeatures.size) - numUnmatchedValues)
)
}
} else {
Seq(
(recordsWithUnmatchedKeys, 1L),
(missingFeaturesFromLeft, numMissingFeaturesLeft.toLong),
(missingFeaturesFromRight, numMissingFeaturesRight.toLong)
)
}
}
/**
* For feature types other than Feature.Continuous. We expect these to match
* exactly between the two stores.
*/
private def isIdenticalNonContinuousFeatureSet(left: DataRecord, right: DataRecord): Boolean = {
val booleanMatched = safeEquals(left.binaryFeatures, right.binaryFeatures)
val discreteMatched = safeEquals(left.discreteFeatures, right.discreteFeatures)
val stringMatched = safeEquals(left.stringFeatures, right.stringFeatures)
val sparseBinaryMatched = safeEquals(left.sparseBinaryFeatures, right.sparseBinaryFeatures)
val sparseContinuousMatched =
safeEquals(left.sparseContinuousFeatures, right.sparseContinuousFeatures)
val blobMatched = safeEquals(left.blobFeatures, right.blobFeatures)
val tensorsMatched = safeEquals(left.tensors, right.tensors)
val sparseTensorsMatched = safeEquals(left.sparseTensors, right.sparseTensors)
booleanMatched && discreteMatched && stringMatched && sparseBinaryMatched &&
sparseContinuousMatched && blobMatched && tensorsMatched && sparseTensorsMatched
}
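// Null-safe equality: these Thrift container fields may be null, so wrap both
// sides in Option before comparing.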
def safeEquals[T](l: T, r: T): Boolean = Option(l).equals(Option(r))
}