package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding

import com.twitter.algebird.ScMapMonoid
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.ml.api.util.CompactDataRecordConverter
import com.twitter.ml.api.CompactDataRecord
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.Args
import com.twitter.scalding.Days
import com.twitter.scalding.Duration
import com.twitter.scalding.RichDate
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import com.twitter.scalding_internal.job.HasDateRange
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchJob
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._

/**
 * The job takes four inputs:
 * - The path to an AggregateStore using the DataRecord format.
 * - The path to an AggregateStore using the CompactDataRecord format.
 * - A version that must be present in both sources.
 * - A sink to write the comparison statistics to.
 *
 * The job reads the two stores, converts the second one to DataRecords, and
 * then compares each key to check whether the two stores hold identical
 * DataRecords, modulo the precision lost in converting Double to Float.
 */
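// A sketch of the arguments the job expects. The argument names below come
// from the code; the paths, date, and version values are hypothetical, and
// the launcher command itself is deployment-specific:
//
//   --firstTime 2023-01-01
//   --dataRecordSource /path/to/data_record_store
//   --compactDataRecordSource /path/to/compact_data_record_store
//   --version 1672531200000
//   --sink /path/to/comparison_stats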
class AggregatesStoreComparisonJob(args: Args)
    extends AnalyticsBatchJob(args)
    with BatchPairImplicits
    with HasDateRange {
  import AggregatesStoreComparisonJob._

  override def batchIncrement: Duration = Days(1)
  override def firstTime: RichDate = RichDate(args("firstTime"))

  private val dataRecordSourcePath = args("dataRecordSource")
  private val compactDataRecordSourcePath = args("compactDataRecordSource")
  private val version = args.long("version")
  private val statsSink = args("sink")

  require(dataRecordSourcePath != compactDataRecordSourcePath)

  private val dataRecordSource = VersionedKeyValSource[AggregationKey, (BatchID, DataRecord)](
    path = dataRecordSourcePath,
    sourceVersion = Some(version)
  )
  private val compactDataRecordSource =
    VersionedKeyValSource[AggregationKey, (BatchID, CompactDataRecord)](
      path = compactDataRecordSourcePath,
      sourceVersion = Some(version)
    )

  private val dataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] =
    TypedPipe
      .from(dataRecordSource)
      .map { case (key, (batchId, record)) => ((key, batchId), record) }

  private val compactDataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] =
    TypedPipe
      .from(compactDataRecordSource)
      .map {
        case (key, (batchId, compactRecord)) =>
          val record = compactConverter.compactDataRecordToDataRecord(compactRecord)
          ((key, batchId), record)
      }

  dataRecordPipe
    .outerJoin(compactDataRecordPipe)
    .mapValues { case (leftOpt, rightOpt) => compareDataRecords(leftOpt, rightOpt) }
    .values
    .sum(mapMonoid)
    .flatMap(_.toList)
    .write(TypedTsv(statsSink))
}

object AggregatesStoreComparisonJob {
  val mapMonoid: ScMapMonoid[String, Long] = new ScMapMonoid[String, Long]()

  implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
    AggregationKeyInjection
  implicit private val aggregationKeyOrdering: Ordering[AggregationKey] = AggregationKeyOrdering
  implicit private val dataRecordCodec: Injection[DataRecord, Array[Byte]] =
    CompactThriftCodec[DataRecord]
  implicit private val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] =
    CompactThriftCodec[CompactDataRecord]

  private val compactConverter = new CompactDataRecordConverter

  val missingRecordFromLeft = "missingRecordFromLeft"
  val missingRecordFromRight = "missingRecordFromRight"
  val nonContinuousFeaturesDidNotMatch = "nonContinuousFeaturesDidNotMatch"
  val missingFeaturesFromLeft = "missingFeaturesFromLeft"
  val missingFeaturesFromRight = "missingFeaturesFromRight"
  val recordsWithUnmatchedKeys = "recordsWithUnmatchedKeys"
  val featureValuesMatched = "featureValuesMatched"
  val featureValuesThatDidNotMatch = "featureValuesThatDidNotMatch"
  val equalRecords = "equalRecords"
  val keyCount = "keyCount"

  def compareDataRecords(
    leftOpt: Option[DataRecord],
    rightOpt: Option[DataRecord]
  ): collection.Map[String, Long] = {
    val stats = collection.Map((keyCount, 1L))
    (leftOpt, rightOpt) match {
      case (Some(left), Some(right)) =>
        if (isIdenticalNonContinuousFeatureSet(left, right)) {
          getContinuousFeaturesStats(left, right).foldLeft(stats)(mapMonoid.add)
        } else {
          mapMonoid.add(stats, (nonContinuousFeaturesDidNotMatch, 1L))
        }
      case (Some(_), None) => mapMonoid.add(stats, (missingRecordFromRight, 1L))
      case (None, Some(_)) => mapMonoid.add(stats, (missingRecordFromLeft, 1L))
      case (None, None) => throw new IllegalArgumentException("Should never be possible")
    }
  }
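  // Illustrative outcomes of compareDataRecords (hypothetical records):
  // - Both records present, non-continuous features identical, same continuous
  //   keys, and every continuous value within Epsilon of its counterpart:
  //     Map(keyCount -> 1L, equalRecords -> 1L, featureValuesMatched -> n)
  //   where n is the number of continuous features.
  // - Record present only on the left:
  //     Map(keyCount -> 1L, missingRecordFromRight -> 1L)
  // Summing these maps with mapMonoid across all keys yields the global counts
  // written to the sink.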
  /**
   * For Continuous features.
   */
  private def getContinuousFeaturesStats(
    left: DataRecord,
    right: DataRecord
  ): Seq[(String, Long)] = {
    val leftFeatures = Option(left.getContinuousFeatures)
      .map(_.asScala.toMap)
      .getOrElse(Map.empty[JLong, JDouble])
    val rightFeatures = Option(right.getContinuousFeatures)
      .map(_.asScala.toMap)
      .getOrElse(Map.empty[JLong, JDouble])

    val numMissingFeaturesLeft = (rightFeatures.keySet diff leftFeatures.keySet).size
    val numMissingFeaturesRight = (leftFeatures.keySet diff rightFeatures.keySet).size

    if (numMissingFeaturesLeft == 0 && numMissingFeaturesRight == 0) {
      val Epsilon = 1e-5
      val numUnmatchedValues = leftFeatures.map {
        case (id, lValue) =>
          val rValue = rightFeatures(id)
          // The approximate match is to account for the precision loss due to
          // the Double -> Float -> Double conversion.
          if (math.abs(lValue - rValue) <= Epsilon) 0L else 1L
      }.sum

      if (numUnmatchedValues == 0) {
        Seq(
          (equalRecords, 1L),
          (featureValuesMatched, leftFeatures.size.toLong)
        )
      } else {
        Seq(
          (featureValuesThatDidNotMatch, numUnmatchedValues),
          (
            featureValuesMatched,
            math.max(leftFeatures.size, rightFeatures.size) - numUnmatchedValues)
        )
      }
    } else {
      Seq(
        (recordsWithUnmatchedKeys, 1L),
        (missingFeaturesFromLeft, numMissingFeaturesLeft.toLong),
        (missingFeaturesFromRight, numMissingFeaturesRight.toLong)
      )
    }
  }

  /**
   * For feature types that are not Feature.Continuous. We expect these to
   * match exactly in the two stores.
   */
  private def isIdenticalNonContinuousFeatureSet(left: DataRecord, right: DataRecord): Boolean = {
    val booleanMatched = safeEquals(left.binaryFeatures, right.binaryFeatures)
    val discreteMatched = safeEquals(left.discreteFeatures, right.discreteFeatures)
    val stringMatched = safeEquals(left.stringFeatures, right.stringFeatures)
    val sparseBinaryMatched = safeEquals(left.sparseBinaryFeatures, right.sparseBinaryFeatures)
    val sparseContinuousMatched =
      safeEquals(left.sparseContinuousFeatures, right.sparseContinuousFeatures)
    val blobMatched = safeEquals(left.blobFeatures, right.blobFeatures)
    val tensorsMatched = safeEquals(left.tensors, right.tensors)
    val sparseTensorsMatched = safeEquals(left.sparseTensors, right.sparseTensors)

    booleanMatched && discreteMatched && stringMatched && sparseBinaryMatched &&
    sparseContinuousMatched && blobMatched && tensorsMatched && sparseTensorsMatched
  }

  def safeEquals[T](l: T, r: T): Boolean = Option(l).equals(Option(r))
}
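// safeEquals wraps both sides in Option so that null feature maps compare
// cleanly without NullPointerExceptions (illustrative):
//   safeEquals(null, null) == true   // None == None
//   safeEquals(null, "a")  == false  // None != Some("a")
//   safeEquals("a", "a")   == true   // Some("a") == Some("a")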