the-algorithm/timelines/data_processing/ml_util/aggregation_framework/OfflineAggregateStore.scala

package com.twitter.timelines.data_processing.ml_util.aggregation_framework

import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.DateParser
import com.twitter.scalding.RichDate
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.storehaus_internal.manhattan._
import com.twitter.storehaus_internal.util.ApplicationID
import com.twitter.storehaus_internal.util.DatasetName
import com.twitter.storehaus_internal.util.HDFSPath
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.batch.MillisecondBatcher
import com.twitter.summingbird_internal.runner.store_config._
import java.util.TimeZone

/**
 * Configuration common to all offline aggregate stores.
 *
 * @param outputHdfsPathPrefix HDFS prefix under which all offline aggregate output is stored
 * @param dummyAppId Dummy Manhattan app id required by Summingbird (unused)
 * @param dummyDatasetPrefix Dummy Manhattan dataset prefix required by Summingbird (unused)
 * @param startDate Start date from which the Summingbird job begins computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
  outputHdfsPathPrefix: String,
  dummyAppId: String,
  dummyDatasetPrefix: String,
  startDate: String)

/**
 * A trait inherited by any object that defines
 * an HDFS prefix to write output data to. For example, timelines has its own
 * output prefix for aggregates_v2 results; your team can create
 * its own.
 */
trait OfflineStoreCommonConfig extends Serializable {

  /**
   * @param startDate Date to create the config for
   * @return OfflineAggregateStoreCommonConfig object with all output config details populated
   */
  def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
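
/*
 * A minimal sketch (illustrative only) of how a team might implement
 * OfflineStoreCommonConfig. The object name and all values below are
 * hypothetical placeholders, not values defined by this framework:
 *
 *   object MyTeamOfflineStoreCommonConfig extends OfflineStoreCommonConfig {
 *     override def apply(startDate: String): OfflineAggregateStoreCommonConfig =
 *       OfflineAggregateStoreCommonConfig(
 *         outputHdfsPathPrefix = "/user/my_team/aggregates_v2",
 *         dummyAppId = "MyTeamDummyAppId",
 *         dummyDatasetPrefix = "my_team_dummy",
 *         startDate = startDate
 *       )
 *   }
 */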

/**
 * Base config for an offline aggregate output store.
 *
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the key-value source
 */
abstract class OfflineAggregateStoreBase
    extends OfflineStoreOnlyConfig[ManhattanROConfig]
    with AggregateStore {

  override def name: String
  def startDate: String
  def commonConfig: OfflineStoreCommonConfig
  def batchesToKeep: Int
  def maxKvSourceFailures: Int

  val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig.apply(startDate)

  val manhattan: ManhattanROConfig = ManhattanROConfig(
    /* This is a sample config, will be replaced with production config later */
    HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}"),
    ApplicationID(datedCommonConfig.dummyAppId),
    DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1"),
    com.twitter.storehaus_internal.manhattan.Adama
  )

  // Aggregates are computed in daily (24-hour) batches.
  val batcherSize: Int = 24
  val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)

  val startTime: RichDate =
    RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)

  // Satisfies the OfflineStoreOnlyConfig contract by exposing the Manhattan config.
  val offline: ManhattanROConfig = manhattan
}
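
/*
 * For illustration: with a hypothetical store name "user_aggregates" and an
 * outputHdfsPathPrefix of "/user/timelines/aggregates_v2" (both placeholder
 * values), the config above would write output under
 * "/user/timelines/aggregates_v2/user_aggregates" with dataset name
 * "<dummyDatasetPrefix>_user_aggregates_1", batched in 24-hour increments
 * starting from startDate (interpreted in UTC).
 */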

/**
 * Defines an aggregates store which is composed of DataRecords.
 *
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the key-value source
 */
case class OfflineAggregateDataRecordStore(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val batchesToKeep: Int = 7,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase {

  def toOfflineAggregateDataRecordStoreWithDAL(
    dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
  ): OfflineAggregateDataRecordStoreWithDAL =
    OfflineAggregateDataRecordStoreWithDAL(
      name = name,
      startDate = startDate,
      commonConfig = commonConfig,
      dalDataset = dalDataset,
      maxKvSourceFailures = maxKvSourceFailures
    )
}
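
/*
 * A minimal usage sketch (illustrative only; the store name, start date, and
 * dataset value below are hypothetical placeholders):
 *
 *   val store = OfflineAggregateDataRecordStore(
 *     name = "user_aggregates",
 *     startDate = "2017-03-01",
 *     commonConfig = MyTeamOfflineStoreCommonConfig // assumed defined as sketched above
 *   )
 *   // Convert to a DAL-backed store, given a registered KeyValDALDataset:
 *   val dalStore = store.toOfflineAggregateDataRecordStoreWithDAL(myTeamDalDataset)
 */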

/**
 * Mixin for stores that write their output through a DAL dataset.
 */
trait withDALDataset {
  def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}

/**
 * Defines an aggregates store which is composed of DataRecords and writes using DAL.
 *
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param dalDataset The KeyValDALDataset for this output store
 * @param batchesToKeep Unused, kept for interface compatibility. You must define a separate Oxpecker
 *                      retention policy to maintain the desired number of versions.
 * @param maxKvSourceFailures Maximum number of tolerated failures when reading the key-value source
 */
case class OfflineAggregateDataRecordStoreWithDAL(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
  override val batchesToKeep: Int = -1,
  override val maxKvSourceFailures: Int = 0)
    extends OfflineAggregateStoreBase
    with withDALDataset
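
/*
 * Direct construction sketch (illustrative only; all values are hypothetical):
 *
 *   val dalStore = OfflineAggregateDataRecordStoreWithDAL(
 *     name = "user_aggregates",
 *     startDate = "2017-03-01",
 *     commonConfig = MyTeamOfflineStoreCommonConfig,
 *     dalDataset = myTeamDalDataset
 *   )
 *
 * Note that batchesToKeep is ignored here; retention must be configured via a
 * separate Oxpecker policy, as documented above.
 */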