package com.twitter.timelines.data_processing.ml_util.aggregation_framework
|
|
|
|
import com.twitter.dal.client.dataset.KeyValDALDataset
|
|
import com.twitter.ml.api.DataRecord
|
|
import com.twitter.scalding.DateParser
|
|
import com.twitter.scalding.RichDate
|
|
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
|
|
import com.twitter.storehaus_internal.manhattan._
|
|
import com.twitter.storehaus_internal.util.ApplicationID
|
|
import com.twitter.storehaus_internal.util.DatasetName
|
|
import com.twitter.storehaus_internal.util.HDFSPath
|
|
import com.twitter.summingbird.batch.BatchID
|
|
import com.twitter.summingbird.batch.Batcher
|
|
import com.twitter.summingbird_internal.runner.store_config._
|
|
import java.util.TimeZone
|
|
import com.twitter.summingbird.batch.MillisecondBatcher
|
|
|
|
/**
 * Configuration common to all offline aggregate stores.
 *
 * (Was previously a plain block comment; `/**` is required for Scaladoc
 * to pick up the `@param` tags below.)
 *
 * @param outputHdfsPathPrefix HDFS prefix to store all output aggregate types offline
 * @param dummyAppId Dummy manhattan app id required by summingbird (unused)
 * @param dummyDatasetPrefix Dummy manhattan dataset prefix required by summingbird (unused)
 * @param startDate Start date for summingbird job to begin computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
  outputHdfsPathPrefix: String,
  dummyAppId: String,
  dummyDatasetPrefix: String,
  startDate: String)
/**
 * A trait inherited by any object that defines
 * a HDFS prefix to write output data to. E.g. timelines has its own
 * output prefix to write aggregates_v2 results, your team can create
 * its own.
 */
trait OfflineStoreCommonConfig extends Serializable {
  /**
   * Builds the fully-populated common config for a given start date.
   *
   * @param startDate Date to create config for
   * @return OfflineAggregateStoreCommonConfig object with all config details for output populated
   */
  def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
/**
 * Base class for offline aggregate output stores: wires a store name and
 * start date into a Manhattan read-only config, a daily batcher, and a
 * summingbird start time.
 *
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 */
abstract class OfflineAggregateStoreBase
  extends OfflineStoreOnlyConfig[ManhattanROConfig]
  with AggregateStore {

  override def name: String
  def startDate: String
  def commonConfig: OfflineStoreCommonConfig
  def batchesToKeep: Int
  def maxKvSourceFailures: Int

  // NOTE(review): the vals below dereference abstract members (commonConfig,
  // startDate, name) while this superclass is being constructed. This is only
  // safe because the concrete subclasses in this file supply them as
  // constructor parameters (initialized before the super constructor runs);
  // overriding them as body vals in a new subclass would NPE here — confirm
  // before subclassing differently.
  val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig.apply(startDate)
  val manhattan: ManhattanROConfig = ManhattanROConfig(
    /* This is a sample config, will be replaced with production config later */
    HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}"),
    ApplicationID(datedCommonConfig.dummyAppId),
    DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1"),
    com.twitter.storehaus_internal.manhattan.Adama
  )

  // 24-hour batches, i.e. one batch per day.
  val batcherSize = 24
  val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)

  // Start-of-computation instant, parsed from the configured start date in UTC.
  val startTime: RichDate =
    RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)

  // OfflineStoreOnlyConfig contract: the offline store is the Manhattan config above.
  val offline: ManhattanROConfig = manhattan
}
/**
 * Defines an aggregates store which is composed of DataRecords
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures Max tolerated failures when reading the KV source
 */
case class OfflineAggregateDataRecordStore(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val batchesToKeep: Int = 7,
  override val maxKvSourceFailures: Int = 0)
  extends OfflineAggregateStoreBase {

  /**
   * Builds the DAL-backed equivalent of this store, reusing its name, start
   * date, common config and failure tolerance.
   *
   * Note: batchesToKeep is intentionally NOT forwarded — the DAL variant
   * ignores it (retention is handled by a separate Oxpecker policy; see
   * OfflineAggregateDataRecordStoreWithDAL's doc) and keeps its own -1 default.
   *
   * @param dalDataset the DAL dataset the new store will write to
   * @return a DAL-writing store equivalent to this one
   */
  def toOfflineAggregateDataRecordStoreWithDAL(
    dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
  ): OfflineAggregateDataRecordStoreWithDAL =
    OfflineAggregateDataRecordStoreWithDAL(
      name = name,
      startDate = startDate,
      commonConfig = commonConfig,
      dalDataset = dalDataset,
      maxKvSourceFailures = maxKvSourceFailures
    )
}
// NOTE(review): name breaks the UpperCamelCase convention for traits, but
// renaming would break external implementors — left as-is.
/**
 * Mixin for stores that write their aggregate output through a DAL dataset.
 */
trait withDALDataset {
  /** The KeyVal DAL dataset this store writes to. */
  def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}
/**
 * Defines an aggregates store which is composed of DataRecords and writes using DAL.
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param dalDataset The KeyValDALDataset for this output store
 * @param batchesToKeep Unused, kept for interface compatibility. You must define a separate Oxpecker
 *                      retention policy to maintain the desired number of versions.
 * @param maxKvSourceFailures Max tolerated failures when reading the KV source
 */
case class OfflineAggregateDataRecordStoreWithDAL(
  override val name: String,
  override val startDate: String,
  override val commonConfig: OfflineStoreCommonConfig,
  override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
  // -1 sentinel marks the field as unused in this DAL variant (see class doc).
  override val batchesToKeep: Int = -1,
  override val maxKvSourceFailures: Int = 0)
  extends OfflineAggregateStoreBase
  with withDALDataset