Delete timelines/data_processing directory

This commit is contained in:
dogemanttv 2024-01-10 17:09:03 -06:00 committed by GitHub
parent 502f1f9d11
commit 425ea8f3de
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
87 changed files with 0 additions and 8122 deletions

View File

@ -1,8 +0,0 @@
# Aggregate target: depending on "earlybird_ranking" pulls in the common,
# model_evaluation and training_data_generation targets listed below.
target(
name = "earlybird_ranking",
dependencies = [
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/ad_hoc/earlybird_ranking/model_evaluation",
"timelines/data_processing/ad_hoc/earlybird_ranking/training_data_generation",
],
)

View File

@ -1,24 +0,0 @@
# Scala library "common": compiles every *.scala file in this directory for
# the java8 platform. Dependencies cover the ML API (java/scala/thrift), the
# search tweet_ranking module, and the timelines prediction feature packages.
scala_library(
name = "common",
sources = ["*.scala"],
platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
dependencies = [
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/ml/api/transform",
"src/java/com/twitter/search/modeling/tweet_ranking",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/itl",
"src/scala/com/twitter/timelines/prediction/features/real_graph",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/scala/com/twitter/timelines/prediction/features/request_context",
"src/scala/com/twitter/timelines/prediction/features/time_features",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:transform-java",
],
)

View File

@ -1,271 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.ITransform
import com.twitter.ml.api.transform.CascadeTransform
import com.twitter.ml.api.transform.TransformFactory
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.search.common.features.SearchResultFeature
import com.twitter.search.common.features.ExternalTweetFeature
import com.twitter.search.common.features.TweetFeature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import com.twitter.timelines.prediction.features.request_context.RequestContextFeatures
import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import com.twitter.timelines.prediction.features.real_graph.RealGraphDataRecordFeatures
import scala.collection.JavaConverters._
import java.lang.{Boolean => JBoolean}
/**
 * Sampling/weighting metadata for one class of training examples.
 *
 * @param name               label name (e.g. "favorited", or "negative" for negatives)
 * @param downsampleFraction sampling rate attached to examples carrying this label
 * @param importance         weight attached to this label
 */
case class LabelInfo(name: String, downsampleFraction: Double, importance: Double)
/**
 * Pairs a [[LabelInfo]] with the boolean feature that marks the label in a record.
 *
 * @param info    sampling/weighting metadata for the label
 * @param feature boolean-valued feature carrying the label
 */
case class LabelInfoWithFeature(info: LabelInfo, feature: Feature[JBoolean])
trait EarlybirdTrainingConfiguration {
// Label name -> binary label feature; must be supplied by each concrete configuration.
// LabelInfos asserts that these keys are exactly the keys of `weights`.
protected def labels: Map[String, Feature.Binary]
// Label name -> importance. makeLabelInfoWithFeature looks each label up here to
// fill LabelInfo.importance, so every key of `labels` must also be present.
protected def weights: Map[String, Double] = Map(
"detail_expanded" -> 0.3,
"favorited" -> 1.0,
"open_linked" -> 0.1,
"photo_expanded" -> 0.03,
"profile_clicked" -> 1.0,
"replied" -> 9.0,
"retweeted" -> 1.0,
"video_playback50" -> 0.01
)
// We basically should not downsample any of the precious positive data, hence the
// default rate of 1.0 (subclasses may override it).
// Importances (see `weights`) are currently set to match the full model's weights.
protected def PositiveSamplingRate: Double = 1.0
// Negatives are sampled at 8% of whatever the (possibly overridden) positive rate is.
private def NegativeSamplingRate: Double = PositiveSamplingRate * 0.08
/**
 * One [[LabelInfoWithFeature]] per configured label name.
 *
 * Evaluated lazily; fails an assertion if `labels` and `weights` do not cover exactly
 * the same set of label names.
 */
final lazy val LabelInfos: List[LabelInfoWithFeature] = {
  val labelNames = labels.keySet
  assert(labelNames == weights.keySet)
  labelNames.map(name => makeLabelInfoWithFeature(name)).toList
}
/**
 * Builds the label descriptor for `labelName`.
 *
 * The label is sampled at `PositiveSamplingRate` and weighted by `weights(labelName)`.
 * Throws `NoSuchElementException` if the name is missing from `weights` or `labels`.
 */
def makeLabelInfoWithFeature(labelName: String): LabelInfoWithFeature = {
  val info = LabelInfo(
    name = labelName,
    downsampleFraction = PositiveSamplingRate,
    importance = weights(labelName)
  )
  LabelInfoWithFeature(info = info, feature = labels(labelName))
}
/** Descriptor for negative examples: sampled at `NegativeSamplingRate`, importance 1.0. */
final lazy val NegativeInfo: LabelInfo =
  LabelInfo(name = "negative", downsampleFraction = NegativeSamplingRate, importance = 1.0)
/**
 * Maps each source (timelines) feature to the search-result feature it is renamed to.
 *
 * `EarlybirdFeatureRenamer` turns every entry into a rename from the key to a feature
 * named after the value, so each target should appear at most once.
 */
// example of features available in schema based namespace:
protected def featureToSearchResultFeatureMap: Map[Feature[_], SearchResultFeature] = Map(
  RecapFeatures.TEXT_SCORE -> TweetFeature.TEXT_SCORE,
  RecapFeatures.REPLY_COUNT -> TweetFeature.REPLY_COUNT,
  RecapFeatures.RETWEET_COUNT -> TweetFeature.RETWEET_COUNT,
  RecapFeatures.FAV_COUNT -> TweetFeature.FAVORITE_COUNT,
  RecapFeatures.HAS_CARD -> TweetFeature.HAS_CARD_FLAG,
  RecapFeatures.HAS_CONSUMER_VIDEO -> TweetFeature.HAS_CONSUMER_VIDEO_FLAG,
  RecapFeatures.HAS_PRO_VIDEO -> TweetFeature.HAS_PRO_VIDEO_FLAG,
  // no corresponding HAS_NATIVE_VIDEO feature in TweetFeature
  RecapFeatures.HAS_VINE -> TweetFeature.HAS_VINE_FLAG,
  RecapFeatures.HAS_PERISCOPE -> TweetFeature.HAS_PERISCOPE_FLAG,
  RecapFeatures.HAS_NATIVE_IMAGE -> TweetFeature.HAS_NATIVE_IMAGE_FLAG,
  RecapFeatures.HAS_IMAGE -> TweetFeature.HAS_IMAGE_URL_FLAG,
  RecapFeatures.HAS_NEWS -> TweetFeature.HAS_NEWS_URL_FLAG,
  RecapFeatures.HAS_VIDEO -> TweetFeature.HAS_VIDEO_URL_FLAG,
  RecapFeatures.HAS_TREND -> TweetFeature.HAS_TREND_FLAG,
  RecapFeatures.HAS_MULTIPLE_HASHTAGS_OR_TRENDS -> TweetFeature.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
  RecapFeatures.IS_OFFENSIVE -> TweetFeature.IS_OFFENSIVE_FLAG,
  RecapFeatures.IS_REPLY -> TweetFeature.IS_REPLY_FLAG,
  RecapFeatures.IS_RETWEET -> TweetFeature.IS_RETWEET_FLAG,
  RecapFeatures.IS_AUTHOR_BOT -> TweetFeature.IS_USER_BOT_FLAG,
  RecapFeatures.FROM_VERIFIED_ACCOUNT -> TweetFeature.FROM_VERIFIED_ACCOUNT_FLAG,
  RecapFeatures.USER_REP -> TweetFeature.USER_REPUTATION,
  RecapFeatures.EMBEDS_IMPRESSION_COUNT -> TweetFeature.EMBEDS_IMPRESSION_COUNT,
  RecapFeatures.EMBEDS_URL_COUNT -> TweetFeature.EMBEDS_URL_COUNT,
  // RecapFeatures.VIDEO_VIEW_COUNT deprecated
  RecapFeatures.FAV_COUNT_V2 -> TweetFeature.FAVORITE_COUNT_V2,
  RecapFeatures.RETWEET_COUNT_V2 -> TweetFeature.RETWEET_COUNT_V2,
  RecapFeatures.REPLY_COUNT_V2 -> TweetFeature.REPLY_COUNT_V2,
  RecapFeatures.IS_SENSITIVE -> TweetFeature.IS_SENSITIVE_CONTENT,
  RecapFeatures.HAS_MULTIPLE_MEDIA -> TweetFeature.HAS_MULTIPLE_MEDIA_FLAG,
  RecapFeatures.IS_AUTHOR_PROFILE_EGG -> TweetFeature.PROFILE_IS_EGG_FLAG,
  RecapFeatures.IS_AUTHOR_NEW -> TweetFeature.IS_USER_NEW_FLAG,
  RecapFeatures.NUM_MENTIONS -> TweetFeature.NUM_MENTIONS,
  RecapFeatures.NUM_HASHTAGS -> TweetFeature.NUM_HASHTAGS,
  RecapFeatures.HAS_VISIBLE_LINK -> TweetFeature.HAS_VISIBLE_LINK_FLAG,
  RecapFeatures.HAS_LINK -> TweetFeature.HAS_LINK_FLAG,
  // note: DISCRETE features are not supported by the modelInterpreter tool.
  // for the following features, we will create separate CONTINUOUS features instead of renaming
  // RecapFeatures.LINK_LANGUAGE
  // RecapFeatures.LANGUAGE
  TimelinesSharedFeatures.HAS_QUOTE -> TweetFeature.HAS_QUOTE_FLAG,
  TimelinesSharedFeatures.QUOTE_COUNT -> TweetFeature.QUOTE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_FAV_COUNT -> TweetFeature.WEIGHTED_FAVORITE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_QUOTE_COUNT -> TweetFeature.WEIGHTED_QUOTE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_REPLY_COUNT -> TweetFeature.WEIGHTED_REPLY_COUNT,
  TimelinesSharedFeatures.WEIGHTED_RETWEET_COUNT -> TweetFeature.WEIGHTED_RETWEET_COUNT,
  TimelinesSharedFeatures.DECAYED_FAVORITE_COUNT -> TweetFeature.DECAYED_FAVORITE_COUNT,
  TimelinesSharedFeatures.DECAYED_RETWEET_COUNT -> TweetFeature.DECAYED_RETWEET_COUNT,
  // Fixed: this entry previously pointed at DECAYED_RETWEET_COUNT (copy-paste slip),
  // making the reply and retweet counts rename to the same target feature.
  TimelinesSharedFeatures.DECAYED_REPLY_COUNT -> TweetFeature.DECAYED_REPLY_COUNT,
  TimelinesSharedFeatures.DECAYED_QUOTE_COUNT -> TweetFeature.DECAYED_QUOTE_COUNT,
  TimelinesSharedFeatures.FAKE_FAVORITE_COUNT -> TweetFeature.FAKE_FAVORITE_COUNT,
  TimelinesSharedFeatures.FAKE_RETWEET_COUNT -> TweetFeature.FAKE_RETWEET_COUNT,
  TimelinesSharedFeatures.FAKE_REPLY_COUNT -> TweetFeature.FAKE_REPLY_COUNT,
  TimelinesSharedFeatures.FAKE_QUOTE_COUNT -> TweetFeature.FAKE_QUOTE_COUNT,
  TimelinesSharedFeatures.EMBEDS_IMPRESSION_COUNT_V2 -> TweetFeature.EMBEDS_IMPRESSION_COUNT_V2,
  TimelinesSharedFeatures.EMBEDS_URL_COUNT_V2 -> TweetFeature.EMBEDS_URL_COUNT_V2,
  TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG -> TweetFeature.LABEL_ABUSIVE_FLAG,
  TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG -> TweetFeature.LABEL_ABUSIVE_HI_RCL_FLAG,
  TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG -> TweetFeature.LABEL_DUP_CONTENT_FLAG,
  TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG -> TweetFeature.LABEL_NSFW_HI_PRC_FLAG,
  TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG -> TweetFeature.LABEL_NSFW_HI_RCL_FLAG,
  TimelinesSharedFeatures.LABEL_SPAM_FLAG -> TweetFeature.LABEL_SPAM_FLAG,
  TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG -> TweetFeature.LABEL_SPAM_HI_RCL_FLAG
)
/**
 * Builds a transform that adds derived features to a record in place.
 *
 * Derived features: author-specific score (copied from the real-graph weight), three
 * language-mismatch flags, a self-tweet flag, tweet age in seconds, and CONTINUOUS copies
 * of the DISCRETE link-language / language features. Each derived feature is only written
 * when the source feature(s) it needs are present on the record.
 */
protected def derivedFeaturesAdder: ITransform =
  new ITransform {
    private val hasEnglishTweetDiffUiLangFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.HAS_ENGLISH_TWEET_DIFF_UI_LANG)
        .asInstanceOf[Feature.Binary]
    private val hasEnglishUiDiffTweetLangFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.HAS_ENGLISH_UI_DIFF_TWEET_LANG)
        .asInstanceOf[Feature.Binary]
    private val hasDiffLangFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.HAS_DIFF_LANG)
        .asInstanceOf[Feature.Binary]
    private val isSelfTweetFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.IS_SELF_TWEET)
        .asInstanceOf[Feature.Binary]
    private val tweetAgeInSecsFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.TWEET_AGE_IN_SECS)
        .asInstanceOf[Feature.Continuous]
    private val authorSpecificScoreFeature =
      featureInstanceFromSearchResultFeature(ExternalTweetFeature.AUTHOR_SPECIFIC_SCORE)
        .asInstanceOf[Feature.Continuous]

    // DISCRETE features cannot be renamed for the model interpreter, so CONTINUOUS
    // features carrying the same names are created instead.
    private val linkLanguageFeature = new Feature.Continuous(TweetFeature.LINK_LANGUAGE.getName)
    private val languageFeature = new Feature.Continuous(TweetFeature.LANGUAGE.getName)

    /** Registers every derived feature on the feature context. */
    override def transformContext(featureContext: FeatureContext): FeatureContext =
      featureContext.addFeatures(
        authorSpecificScoreFeature,
        // used when training against the full score; see EarlybirdModelEvaluationJob.scala
        // TimelinesSharedFeatures.PREDICTED_SCORE_LOG,
        hasEnglishTweetDiffUiLangFeature,
        hasEnglishUiDiffTweetLangFeature,
        hasDiffLangFeature,
        isSelfTweetFeature,
        tweetAgeInSecsFeature,
        linkLanguageFeature,
        languageFeature
      )

    /** Writes the derived feature values onto `record` (mutates it in place). */
    override def transform(record: DataRecord): Unit = {
      val srecord = SRichDataRecord(record)

      // foreach rather than map: these calls exist only for their side effect.
      srecord.getFeatureValueOpt(RealGraphDataRecordFeatures.WEIGHT).foreach { realgraphWeight =>
        srecord.setFeatureValue(authorSpecificScoreFeature, realgraphWeight)
      }

      // use this when training against the log of the full score
      // srecord.getFeatureValueOpt(TimelinesSharedFeatures.PREDICTED_SCORE).map { score =>
      // if (score > 0.0) {
      // srecord.setFeatureValue(TimelinesSharedFeatures.PREDICTED_SCORE_LOG, Math.log(score))
      // }
      // }

      if (srecord.hasFeature(RequestContextFeatures.LANGUAGE_CODE) &&
        srecord.hasFeature(RecapFeatures.LANGUAGE)) {
        val uilangIsEnglish =
          srecord.getFeatureValue(RequestContextFeatures.LANGUAGE_CODE).toString == "en"
        // Language id 5 is treated as English here.
        val tweetIsEnglish = srecord.getFeatureValue(RecapFeatures.LANGUAGE) == 5
        srecord.setFeatureValue(
          hasEnglishTweetDiffUiLangFeature,
          tweetIsEnglish && !uilangIsEnglish
        )
        srecord.setFeatureValue(
          hasEnglishUiDiffTweetLangFeature,
          uilangIsEnglish && !tweetIsEnglish
        )
      }

      srecord.getFeatureValueOpt(RecapFeatures.MATCH_UI_LANG).foreach { matchUiLang =>
        srecord.setFeatureValue(
          hasDiffLangFeature,
          !matchUiLang
        )
      }

      for {
        authorId <- srecord.getFeatureValueOpt(SharedFeatures.AUTHOR_ID)
        userId <- srecord.getFeatureValueOpt(SharedFeatures.USER_ID)
      } srecord.setFeatureValue(
        isSelfTweetFeature,
        authorId == userId
      )

      // Divides by 1000 to obtain seconds (source is presumably in milliseconds -- TODO confirm).
      srecord.getFeatureValueOpt(TimeDataRecordFeatures.TIME_SINCE_TWEET_CREATION).foreach {
        timeSinceTweetCreation =>
          srecord.setFeatureValue(
            tweetAgeInSecsFeature,
            timeSinceTweetCreation / 1000.0
          )
      }

      srecord.getFeatureValueOpt(RecapFeatures.LINK_LANGUAGE).foreach { linkLanguage =>
        srecord.setFeatureValue(
          linkLanguageFeature,
          linkLanguage.toDouble
        )
      }

      srecord.getFeatureValueOpt(RecapFeatures.LANGUAGE).foreach { language =>
        srecord.setFeatureValue(
          languageFeature,
          language.toDouble
        )
      }
    }
  }
/**
 * Creates an ML-API feature named after the given search-result feature.
 *
 * BOOLEAN_VALUE features become `Feature.Binary`; DOUBLE_VALUE and INT32_VALUE features
 * become `Feature.Continuous`. Discrete features and any other value type are rejected
 * with an `IllegalArgumentException` (via `require`).
 */
protected def featureInstanceFromSearchResultFeature(
  tweetFeature: SearchResultFeature
): Feature[_] = {
  import com.twitter.search.common.features.thrift.ThriftSearchFeatureType

  val featureType = tweetFeature.getType
  val featureName = tweetFeature.getName

  val isBoolean = featureType == ThriftSearchFeatureType.BOOLEAN_VALUE
  val isNumeric =
    featureType == ThriftSearchFeatureType.DOUBLE_VALUE ||
      featureType == ThriftSearchFeatureType.INT32_VALUE

  require(!tweetFeature.isDiscrete && (isBoolean || isNumeric))

  if (isBoolean) new Feature.Binary(featureName)
  else new Feature.Continuous(featureName)
}
/**
 * Two-stage transform: first `derivedFeaturesAdder`, then a rename of every key in
 * `featureToSearchResultFeatureMap` to a feature named after its search-result counterpart.
 */
lazy val EarlybirdFeatureRenamer: ITransform = {
  val renameTargets: Map[Feature[_], Feature[_]] =
    featureToSearchResultFeatureMap.map {
      case (sourceFeature, searchFeature) =>
        sourceFeature -> featureInstanceFromSearchResultFeature(searchFeature)
    }.toMap

  val addDerived = derivedFeaturesAdder
  val renameSpec = TransformFactory.produceFeatureRenameTransformSpec(renameTargets.asJava)
  val rename = TransformFactory.produceTransform(renameSpec)

  new CascadeTransform(List(addDerived, rename).asJava)
}
}

View File

@ -1,17 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.Feature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
/**
 * Training configuration whose label features come from [[RecapFeatures]].
 *
 * Only `labels` is overridden; weights, sampling rates and feature mappings are the
 * defaults inherited from [[EarlybirdTrainingConfiguration]].
 */
class EarlybirdTrainingRecapConfiguration extends EarlybirdTrainingConfiguration {
// Keys must match the keys of the inherited `weights` map (asserted by LabelInfos).
// Note that "detail_expanded" is backed by the IS_CLICKED feature.
override val labels: Map[String, Feature.Binary] = Map(
"detail_expanded" -> RecapFeatures.IS_CLICKED,
"favorited" -> RecapFeatures.IS_FAVORITED,
"open_linked" -> RecapFeatures.IS_OPEN_LINKED,
"photo_expanded" -> RecapFeatures.IS_PHOTO_EXPANDED,
"profile_clicked" -> RecapFeatures.IS_PROFILE_CLICKED,
"replied" -> RecapFeatures.IS_REPLIED,
"retweeted" -> RecapFeatures.IS_RETWEETED,
"video_playback50" -> RecapFeatures.IS_VIDEO_PLAYBACK_50
)
}

View File

@ -1,100 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.ITransform
import com.twitter.ml.api.transform.CascadeTransform
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.search.common.features.SearchResultFeature
import com.twitter.search.common.features.TweetFeature
import com.twitter.timelines.prediction.features.itl.ITLFeatures._
import scala.collection.JavaConverters._
class EarlybirdTrainingRectweetConfiguration extends EarlybirdTrainingConfiguration {
// Label name -> ITLFeatures label feature (names come from the ITLFeatures._ import).
// Keys must match the keys of the inherited `weights` map (asserted by LabelInfos).
override val labels: Map[String, Feature.Binary] = Map(
"detail_expanded" -> IS_CLICKED,
"favorited" -> IS_FAVORITED,
"open_linked" -> IS_OPEN_LINKED,
"photo_expanded" -> IS_PHOTO_EXPANDED,
"profile_clicked" -> IS_PROFILE_CLICKED,
"replied" -> IS_REPLIED,
"retweeted" -> IS_RETWEETED,
"video_playback50" -> IS_VIDEO_PLAYBACK_50
)
// Positives are sampled at 0.5 here instead of the trait default of 1.0; the negative
// rate is derived from this value (PositiveSamplingRate * 0.08), so it becomes 0.04.
override val PositiveSamplingRate: Double = 0.5
// Extends the inherited mapping with the ITLFeatures counterparts of the same
// search-result features. Entries left commented out below are features that exist
// for recap but are not available in rectweet.
override def featureToSearchResultFeatureMap: Map[Feature[_], SearchResultFeature] =
super.featureToSearchResultFeatureMap ++ Map(
TEXT_SCORE -> TweetFeature.TEXT_SCORE,
REPLY_COUNT -> TweetFeature.REPLY_COUNT,
RETWEET_COUNT -> TweetFeature.RETWEET_COUNT,
FAV_COUNT -> TweetFeature.FAVORITE_COUNT,
HAS_CARD -> TweetFeature.HAS_CARD_FLAG,
HAS_CONSUMER_VIDEO -> TweetFeature.HAS_CONSUMER_VIDEO_FLAG,
HAS_PRO_VIDEO -> TweetFeature.HAS_PRO_VIDEO_FLAG,
HAS_VINE -> TweetFeature.HAS_VINE_FLAG,
HAS_PERISCOPE -> TweetFeature.HAS_PERISCOPE_FLAG,
HAS_NATIVE_IMAGE -> TweetFeature.HAS_NATIVE_IMAGE_FLAG,
HAS_IMAGE -> TweetFeature.HAS_IMAGE_URL_FLAG,
HAS_NEWS -> TweetFeature.HAS_NEWS_URL_FLAG,
HAS_VIDEO -> TweetFeature.HAS_VIDEO_URL_FLAG,
// some features that exist for recap are not available in rectweet
// HAS_TREND
// HAS_MULTIPLE_HASHTAGS_OR_TRENDS
// IS_OFFENSIVE
// IS_REPLY
// IS_RETWEET
IS_AUTHOR_BOT -> TweetFeature.IS_USER_BOT_FLAG,
IS_AUTHOR_SPAM -> TweetFeature.IS_USER_SPAM_FLAG,
IS_AUTHOR_NSFW -> TweetFeature.IS_USER_NSFW_FLAG,
// FROM_VERIFIED_ACCOUNT
USER_REP -> TweetFeature.USER_REPUTATION,
// EMBEDS_IMPRESSION_COUNT
// EMBEDS_URL_COUNT
// VIDEO_VIEW_COUNT
FAV_COUNT_V2 -> TweetFeature.FAVORITE_COUNT_V2,
RETWEET_COUNT_V2 -> TweetFeature.RETWEET_COUNT_V2,
REPLY_COUNT_V2 -> TweetFeature.REPLY_COUNT_V2,
IS_SENSITIVE -> TweetFeature.IS_SENSITIVE_CONTENT,
HAS_MULTIPLE_MEDIA -> TweetFeature.HAS_MULTIPLE_MEDIA_FLAG,
IS_AUTHOR_PROFILE_EGG -> TweetFeature.PROFILE_IS_EGG_FLAG,
IS_AUTHOR_NEW -> TweetFeature.IS_USER_NEW_FLAG,
NUM_MENTIONS -> TweetFeature.NUM_MENTIONS,
NUM_HASHTAGS -> TweetFeature.NUM_HASHTAGS,
HAS_VISIBLE_LINK -> TweetFeature.HAS_VISIBLE_LINK_FLAG,
HAS_LINK -> TweetFeature.HAS_LINK_FLAG
)
/**
 * Runs the inherited derived-feature transform, then additionally copies the ITL
 * `LINK_LANGUAGE` value (when present) into a CONTINUOUS feature named after
 * `TweetFeature.LINK_LANGUAGE`.
 */
override def derivedFeaturesAdder: CascadeTransform = {
  // only LINK_LANGUAGE available in rectweet. no LANGUAGE feature
  val linkLanguageTransform = new ITransform {
    private val linkLanguageFeature = new Feature.Continuous(TweetFeature.LINK_LANGUAGE.getName)

    override def transformContext(featureContext: FeatureContext): FeatureContext =
      featureContext.addFeatures(
        linkLanguageFeature
      )

    override def transform(record: DataRecord): Unit = {
      val srecord = SRichDataRecord(record)
      // foreach rather than map: the call exists only for its side effect.
      srecord.getFeatureValueOpt(LINK_LANGUAGE).foreach { linkLanguage =>
        srecord.setFeatureValue(
          linkLanguageFeature,
          linkLanguage.toDouble
        )
      }
    }
  }

  new CascadeTransform(
    List(
      super.derivedFeaturesAdder,
      linkLanguageTransform
    ).asJava
  )
}
}

View File

@ -1,36 +0,0 @@
# Scala library "model_evaluation": compiles every *.scala file in this directory
# for java8, with strict_deps disabled. Depends on the sibling "common" target.
scala_library(
name = "model_evaluation",
sources = ["*.scala"],
platform = "java8",
strict_deps = False,
dependencies = [
"3rdparty/src/jvm/com/twitter/scalding:json",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/prediction_engine",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/timelines/prediction/adapters/recap",
"src/scala/com/twitter/timelines/prediction/features/recap",
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/util:rich-request",
"timelines/data_processing/util/example",
"timelines/data_processing/util/execution",
"twadoop_config/configuration/log_categories/group/timelines:timelineservice_injection_request_log-scala",
],
)
# Hadoop deploy binary for the model evaluation job; the entry point is
# EarlybirdModelEvaluationJob and the only dependency is the library above.
hadoop_binary(
name = "bin",
basename = "earlybird_model_evaluation-deploy",
main = "com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation.EarlybirdModelEvaluationJob",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":model_evaluation",
],
)

View File

@ -1,203 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation
import scala.collection.GenTraversableOnce
/**
 * One scored candidate tweet within a single request.
 *
 * @param tweetId    id of the candidate tweet
 * @param fullScore  score from the "full" prediction model
 * @param earlyScore score from the "light" (Earlybird) model or scoring function under test
 * @param served     whether the tweet id was among the tweets served for the request
 */
case class CandidateRecord(tweetId: Long, fullScore: Double, earlyScore: Double, served: Boolean)
/**
 * A metric that compares scores generated by a "full" prediction
 * model to a "light" (Earlybird) model. The metric is calculated for candidates
 * from a single request.
 *
 * The trait is sealed, so all implementations live in this file.
 */
sealed trait EarlybirdEvaluationMetric {
// Name under which the metric value is reported.
def name: String
// Metric value for one request's candidates, or None when the metric is undefined
// for that request (implementations return None e.g. for too few candidates).
def apply(candidates: Seq[CandidateRecord]): Option[Double]
}
/**
 * Recall of the light model's top `k` candidates against the full model's top `k`:
 * the number of tweet ids present in both top-`k` lists, divided by `k`.
 *
 * Requests with `k` or fewer candidates are either dropped (`filterFewerThanK = true`,
 * result `None`) or counted with a trivial recall of 1.0 (`filterFewerThanK = false`).
 */
case class TopKRecall(k: Int, filterFewerThanK: Boolean) extends EarlybirdEvaluationMetric {
  override val name: String = s"top_${k}_recall${if (filterFewerThanK) "_filtered" else ""}"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val fullTopIds = candidates.sortBy(-_.fullScore).take(k).map(_.tweetId)
      val lightTopIds = candidates.sortBy(-_.earlyScore).take(k).map(_.tweetId)
      val sharedCount = fullTopIds.intersect(lightTopIds).size.toDouble
      Some(sharedCount / k.toDouble)
    } else if (filterFewerThanK) {
      None
    } else {
      Some(1.0)
    }
}
/**
 * Fraction of ordered candidate pairs that the full and earlybird models rank
 * consistently, i.e. `left.fullScore > right.fullScore` agrees with
 * `left.earlyScore > right.earlyScore`.
 *
 * Every unordered pair is visited in both directions. A pair strictly ordered the same
 * way by both models counts twice; a pair tied in one model but not the other counts
 * once. A constant model therefore scores 0.5, roughly what a random model gets.
 * Requests with fewer than two candidates yield `None`.
 */
case object ProbabilityOfCorrectOrdering extends EarlybirdEvaluationMetric {

  /** Share of elements in `trav` satisfying `p`; 0.0 for an empty input. */
  def fractionOf[A](trav: GenTraversableOnce[A])(p: A => Boolean): Double =
    if (trav.isEmpty) {
      0.0
    } else {
      val (hits, total) = trav.foldLeft((0, 0)) { (acc, elem) =>
        val (hitsSoFar, seenSoFar) = acc
        val hit = if (p(elem)) 1 else 0
        (hitsSoFar + hit, seenSoFar + 1)
      }
      hits.toDouble / total
    }

  override def name: String = "probability_of_correct_ordering"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size >= 2) {
      // All ordered pairs of non-equal candidates, generated lazily.
      val orderedPairs = candidates.iterator.flatMap { left =>
        candidates.iterator.filter(right => left != right).map(right => (left, right))
      }
      val agreement = fractionOf(orderedPairs) { pair =>
        val (left, right) = pair
        (left.fullScore > right.fullScore) == (left.earlyScore > right.earlyScore)
      }
      Some(agreement)
    } else {
      None
    }
}
/**
 * Like `TopKRecall`, but `k` is derived per request as `floor(candidates.size * percent)`.
 * Returns `None` when that `k` is not positive.
 */
case class TopNPercentRecall(percent: Double) extends EarlybirdEvaluationMetric {
  override val name: String = s"top_${percent}_pct_recall"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val k = Math.floor(candidates.size * percent).toInt
    if (k <= 0) {
      None
    } else {
      val fullTopIds = candidates.sortBy(-_.fullScore).take(k).map(_.tweetId)
      val lightTopIds = candidates.sortBy(-_.earlyScore).take(k).map(_.tweetId)
      val sharedCount = fullTopIds.intersect(lightTopIds).size.toDouble
      Some(sharedCount / k.toDouble)
    }
  }
}
/**
 * Share of served candidates that fall within the light model's top `k`:
 * (served candidates in the light top `k`) / (all served candidates).
 *
 * Returns `None` for requests with `k` or fewer candidates, or with no served candidate.
 */
case class ShownTweetRecall(k: Int) extends EarlybirdEvaluationMetric {
  override val name: String = s"shown_tweet_recall_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val lightTop = candidates.sortBy(-_.earlyScore).take(k)
      val servedInTop = lightTop.count(_.served).toDouble
      val servedTotal = candidates.count(_.served).toDouble
      if (servedTotal > 0) Some(servedInTop / servedTotal) else None
    } else {
      None
    }
}
/**
 * Like `ShownTweetRecall`, but `k` is derived per request as
 * `floor(candidates.size * percent)`. Returns `None` only when no candidate was served.
 *
 * NOTE(review): unlike `TopNPercentRecall`, a derived `k` of 0 is not filtered out and
 * produces a recall of 0.0 -- confirm this is intended.
 */
case class ShownTweetPercentRecall(percent: Double) extends EarlybirdEvaluationMetric {
  override val name: String = s"shown_tweet_recall_${percent}_pct"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val k = Math.floor(candidates.size * percent).toInt
    val lightTop = candidates.sortBy(-_.earlyScore).take(k)
    val servedInTop = lightTop.count(_.served).toDouble
    val servedTotal = candidates.count(_.served).toDouble
    if (servedTotal > 0) {
      Some(servedInTop / servedTotal)
    } else {
      None
    }
  }
}
/**
 * Sanity metric: `ShownTweetRecall` computed from the *full* scores instead of the light
 * ones. By definition the top full-scored candidates are the served ones, so a value
 * below 1 means the ranked section was smaller than `k`.
 *
 * Returns `None` for requests with `k` or fewer candidates, or with no served candidate.
 */
case class ShownTweetRecallWithFullScores(k: Int) extends EarlybirdEvaluationMetric {
  override val name: String = s"shown_tweet_recall_with_full_scores_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val fullTop = candidates.sortBy(-_.fullScore).take(k)
      val servedInTop = fullTop.count(_.served).toDouble
      val servedTotal = candidates.count(_.served).toDouble
      if (servedTotal > 0) Some(servedInTop / servedTotal) else None
    } else {
      None
    }
}
/**
 * Mean full score of the `k` candidates ranked highest by the light score.
 * Returns `None` for requests with `k` or fewer candidates.
 */
case class AverageFullScoreForTopLight(k: Int) extends EarlybirdEvaluationMetric {
  override val name: String = s"average_full_score_for_top_light_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val fullScoresOfLightTop = candidates.sortBy(-_.earlyScore).take(k).map(_.fullScore)
      Some(fullScoresOfLightTop.sum / fullScoresOfLightTop.size)
    } else {
      None
    }
}
/**
 * "Score recall": the sum of full scores over the light model's top `k` candidates,
 * divided by the sum of the `k` highest full scores overall.
 *
 * Returns `None` for requests with `k` or fewer candidates, and also when the
 * denominator is zero: the ratio would be NaN or infinite, and a single such value
 * would corrupt the average taken over all requests.
 */
case class SumScoreRecallForTopLight(k: Int) extends EarlybirdEvaluationMetric {
  override val name: String = s"sum_score_recall_for_top_light_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    if (candidates.size <= k) {
      None
    } else {
      val sumFullScoresForTopLight = candidates.sortBy(-_.earlyScore).take(k).map(_.fullScore).sum
      val sumScoresForTopFull = candidates.sortBy(-_.fullScore).take(k).map(_.fullScore).sum
      // Guard against 0/0 (NaN) and x/0 (Infinity).
      if (sumScoresForTopFull == 0.0) None
      else Some(sumFullScoresForTopLight / sumScoresForTopFull)
    }
  }
}
/**
 * Indicator metric: 1.0 when the request has `k` or fewer candidates, else 0.0.
 * Uses the same `size <= k` cut-off with which the top-`k` metrics drop requests.
 */
case class HasFewerThanKCandidates(k: Int) extends EarlybirdEvaluationMetric {
  override val name: String = s"has_fewer_than_${k}_candidates"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val tooFew = candidates.size <= k
    Some(if (tooFew) 1.0 else 0.0)
  }
}
/** Reports the number of candidates in the request; always defined. */
case object NumberOfCandidates extends EarlybirdEvaluationMetric {
  override val name: String = "number_of_candidates"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val count = candidates.size
    Some(count.toDouble)
  }
}

View File

@ -1,214 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation
import com.twitter.algebird.Aggregator
import com.twitter.algebird.AveragedValue
import com.twitter.ml.api.prediction_engine.PredictionEnginePlugin
import com.twitter.ml.api.util.FDsl
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.IRecordOneToManyAdapter
import com.twitter.scalding.Args
import com.twitter.scalding.DateRange
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedJson
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.data_processing.util.RequestImplicits.RichRequest
import com.twitter.timelines.data_processing.util.example.RecapTweetExample
import com.twitter.timelines.data_processing.util.execution.UTCDateRangeFromArgs
import com.twitter.timelines.prediction.adapters.recap.RecapSuggestionRecordAdapter
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import com.twitter.timelines.suggests.common.record.thriftscala.SuggestionRecord
import com.twitter.timelineservice.suggests.logging.recap.thriftscala.HighlightTweet
import com.twitter.timelineservice.suggests.logging.thriftscala.SuggestsRequestLog
import scala.collection.JavaConverters._
import scala.language.reflectiveCalls
import scala.util.Random
import twadoop_config.configuration.log_categories.group.timelines.TimelineserviceInjectionRequestLogScalaDataset
/**
* Evaluates an Earlybird model using 1% injection request logs.
*
* Arguments:
* --model_base_path path to Earlybird model snapshots
* --models list of model names to evaluate
* --output path to output stats
* --parallelism (default: 3) number of tasks to run in parallel
* --topks (optional) list of values of `k` (integers) for top-K metrics
* --topn_fractions (optional) list of values of `n` (doubles) for top-N-fraction metrics
* --seed (optional) seed for random number generator
*/
object EarlybirdModelEvaluationJob extends TwitterExecutionApp with UTCDateRangeFromArgs {
import FDsl._
import PredictionEnginePlugin._
// Aggregator used by computeMetrics to average per-request metric values.
private[this] val averager: Aggregator[Double, AveragedValue, Double] =
AveragedValue.aggregator
// Adapter turning a SuggestionRecord into data records (dwell-time check disabled).
private[this] val recapAdapter: IRecordOneToManyAdapter[SuggestionRecord] =
new RecapSuggestionRecordAdapter(checkDwellTime = false)
/**
 * Job entry point: scores the candidates of each logged request with every requested
 * model plus three baseline scorers, computes the configured metrics per scorer, and
 * writes the (scorer name, metric map) pairs as JSON to the `--output` path.
 */
override def job: Execution[Unit] = {
for {
args <- Execution.getArgs
dateRange <- dateRangeEx
metrics = getMetrics(args)
random = buildRandom(args)
modelBasePath = args("model_base_path")
models = args.list("models")
parallelism = args.int("parallelism", 3)
logs = logsHavingCandidates(dateRange)
// One scored pipe per model; each model is loaded from "<model_base_path>/<model>".
modelScoredCandidates = models.map { model =>
(model, scoreCandidatesUsingModel(logs, s"$modelBasePath/$model"))
}
// Baselines: random scores, and two scores already present on the logged candidates.
functionScoredCandidates = List(
("random", scoreCandidatesUsingFunction(logs, _ => Some(random.nextDouble()))),
("original_earlybird", scoreCandidatesUsingFunction(logs, extractOriginalEarlybirdScore)),
("blender", scoreCandidatesUsingFunction(logs, extractBlenderScore))
)
allCandidates = modelScoredCandidates ++ functionScoredCandidates
// Each scored pipe is forced to disk before its metrics are computed from it.
statsExecutions = allCandidates.map {
case (name, pipe) =>
for {
saved <- pipe.forceToDiskExecution
stats <- computeMetrics(saved, metrics, parallelism)
} yield (name, stats)
}
// `parallelism` bounds both the scorers run here and the metrics run per scorer.
stats <- Execution.withParallelism(statsExecutions, parallelism)
_ <- TypedPipe.from(stats).writeExecution(TypedJson(args("output")))
} yield ()
}
/**
 * Averages every metric over all requests.
 *
 * Requests for which a metric returns `None` do not contribute to that metric's average,
 * and a metric with no defined value at all is left out of the resulting map.
 *
 * @param requests         candidate records grouped per request
 * @param metricsToCompute metrics to evaluate on every request
 * @param parallelism      parallelism passed to `Execution.withParallelism`
 * @return metric name -> average value
 */
private[this] def computeMetrics(
  requests: TypedPipe[Seq[CandidateRecord]],
  metricsToCompute: Seq[EarlybirdEvaluationMetric],
  parallelism: Int
): Execution[Map[String, Double]] = {
  val perMetric = metricsToCompute.map { metric =>
    requests
      .flatMap(metric(_))
      .aggregate(averager)
      .toOptionExecution
      .map(averageOpt => averageOpt.map(average => (metric.name, average)))
  }
  Execution
    .withParallelism(perMetric, parallelism)
    .map(namedAverages => namedAverages.flatten.toMap)
}
/**
 * Builds the list of metrics to compute from the `--topks` and `--topn_fractions` args.
 *
 * `ProbabilityOfCorrectOrdering` does not depend on `k`. It used to be emitted once per
 * `topK`, so the same quadratic metric was evaluated once per `k` and the duplicates were
 * then collapsed by name in `computeMetrics`. It is now emitted once, and, as before,
 * only when at least one `topK` was requested.
 *
 * `NumberOfCandidates` is always included.
 */
private[this] def getMetrics(args: Args): Seq[EarlybirdEvaluationMetric] = {
  val topKs = args.list("topks").map(_.toInt)
  val topNFractions = args.list("topn_fractions").map(_.toDouble)

  val topKMetrics: Seq[EarlybirdEvaluationMetric] = topKs.flatMap { topK =>
    Seq[EarlybirdEvaluationMetric](
      TopKRecall(topK, filterFewerThanK = false),
      TopKRecall(topK, filterFewerThanK = true),
      ShownTweetRecall(topK),
      AverageFullScoreForTopLight(topK),
      SumScoreRecallForTopLight(topK),
      HasFewerThanKCandidates(topK),
      ShownTweetRecallWithFullScores(topK)
    )
  }

  // k-independent: include at most once.
  val orderingMetrics: Seq[EarlybirdEvaluationMetric] =
    if (topKs.nonEmpty) Seq(ProbabilityOfCorrectOrdering) else Seq.empty

  val topNPercentMetrics: Seq[EarlybirdEvaluationMetric] = topNFractions.flatMap { topNPercent =>
    Seq[EarlybirdEvaluationMetric](
      TopNPercentRecall(topNPercent),
      ShownTweetPercentRecall(topNPercent)
    )
  }

  topKMetrics ++ orderingMetrics ++ topNPercentMetrics ++ Seq(NumberOfCandidates)
}
/** Random generator seeded from `--seed` when given, otherwise default-seeded. */
private[this] def buildRandom(args: Args): Random =
  args
    .optional("seed")
    .map(_.toLong)
    .fold(new Random())(seed => new Random(seed))
/** Injection request logs in `dateRange`, keeping only requests with recap candidates. */
private[this] def logsHavingCandidates(dateRange: DateRange): TypedPipe[SuggestsRequestLog] = {
  val allLogs = DAL
    .read(TimelineserviceInjectionRequestLogScalaDataset, dateRange)
    .toTypedPipe
  allLogs.filter(log => log.recapCandidates.exists(_.nonEmpty))
}
/**
 * Uses a model defined at `earlybirdModelPath` to score candidates and
 * returns a Seq[CandidateRecord] for each request.
 *
 * A candidate is dropped when it has no item id, no combined (full) model score, or
 * when no light score could be computed for it.
 */
private[this] def scoreCandidatesUsingModel(
logs: TypedPipe[SuggestsRequestLog],
earlybirdModelPath: String
): TypedPipe[Seq[CandidateRecord]] = {
logs
// Pairs every log with a scorer (presumably provided by PredictionEnginePlugin -- TODO confirm).
.usingScorer(earlybirdModelPath)
.map {
case (scorer: PredictionEngineScorer, log: SuggestsRequestLog) =>
val suggestionRecords =
RecapTweetExample
.extractCandidateTweetExamples(log)
.map(_.asSuggestionRecord)
val servedTweetIds = log.servedHighlightTweets.flatMap(_.tweetId).toSet
// NOTE(review): a new configuration and renamer are built for every log record.
val renamer = (new EarlybirdTrainingRecapConfiguration).EarlybirdFeatureRenamer
suggestionRecords.flatMap { suggestionRecord =>
// Only the first data record produced by the adapter is used.
val dataRecordOpt = recapAdapter.adaptToDataRecords(suggestionRecord).asScala.headOption
// The renamer mutates the record in place before it is scored.
dataRecordOpt.foreach(renamer.transform)
for {
tweetId <- suggestionRecord.itemId
fullScore <- suggestionRecord.recapFeatures.flatMap(_.combinedModelScore)
earlybirdScore <- dataRecordOpt.flatMap(calculateLightScore(_, scorer))
} yield CandidateRecord(
tweetId = tweetId,
fullScore = fullScore,
earlyScore = earlybirdScore,
served = servedTweetIds.contains(tweetId)
)
}
}
}
/**
 * Uses a simple function to score candidates and returns a Seq[CandidateRecord] for each
 * request.
 *
 * A candidate is dropped when it has no tweet id, no combined (full) model score, or
 * when `earlyScoreExtractor` returns None for it.
 */
private[this] def scoreCandidatesUsingFunction(
logs: TypedPipe[SuggestsRequestLog],
earlyScoreExtractor: HighlightTweet => Option[Double]
): TypedPipe[Seq[CandidateRecord]] = {
logs
.map { log =>
// Requests without tweet candidates produce an empty Seq.
val tweetCandidates = log.recapTweetCandidates.getOrElse(Nil)
val servedTweetIds = log.servedHighlightTweets.flatMap(_.tweetId).toSet
for {
candidate <- tweetCandidates
tweetId <- candidate.tweetId
fullScore <- candidate.recapFeatures.flatMap(_.combinedModelScore)
earlyScore <- earlyScoreExtractor(candidate)
} yield CandidateRecord(
tweetId = tweetId,
fullScore = fullScore,
earlyScore = earlyScore,
served = servedTweetIds.contains(tweetId)
)
}
}
/** Earlybird score logged on the candidate's tweet features, if those are present. */
private[this] def extractOriginalEarlybirdScore(candidate: HighlightTweet): Option[Double] =
  candidate.recapFeatures
    .flatMap(_.tweetFeatures)
    .map(_.earlybirdScore)
/** Blender score logged on the candidate's tweet features, if those are present. */
private[this] def extractBlenderScore(candidate: HighlightTweet): Option[Double] =
  candidate.recapFeatures
    .flatMap(_.tweetFeatures)
    .map(_.blenderScore)
/**
 * Scores `dataRecord` with `scorer` and returns the predicted unified-engagement value,
 * or `None` when the scored record does not carry that feature.
 */
private[this] def calculateLightScore(
  dataRecord: DataRecord,
  scorer: PredictionEngineScorer
): Option[Double] = {
  val scored = scorer(dataRecord)
  if (!scored.hasFeature(RecapFeatures.PREDICTED_IS_UNIFIED_ENGAGEMENT)) {
    None
  } else {
    Some(scored.getFeatureValue(RecapFeatures.PREDICTED_IS_UNIFIED_ENGAGEMENT).toDouble)
  }
}
}

View File

@ -1,89 +0,0 @@
# DataRecord dataset declaration for the recap variant of the Earlybird training data.
# The scala_library in this file depends on the derived
# ":earlybird_recap_data_records-java" target.
create_datarecord_datasets(
base_name = "earlybird_recap_data_records",
platform = "java8",
role = "timelines",
segment_type = "partitioned",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
)
# DataRecord dataset declaration for the rectweet variant of the Earlybird training data.
# The scala_library in this file depends on the derived
# ":earlybird_rectweet_data_records-java" target.
create_datarecord_datasets(
base_name = "earlybird_rectweet_data_records",
platform = "java8",
role = "timelines",
segment_type = "partitioned",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
)
# Library with every *.scala source of this directory (the Earlybird training data
# generation jobs). Depends on the two dataset targets declared in this file.
scala_library(
name = "training_data_generation",
sources = ["*.scala"],
platform = "java8",
strict_deps = True,
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
dependencies = [
# Dataset targets declared in this BUILD file.
":earlybird_recap_data_records-java",
":earlybird_rectweet_data_records-java",
"3rdparty/jvm/com/ibm/icu:icu4j",
"3rdparty/src/jvm/com/twitter/scalding:json",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/ml/api/matcher",
"src/java/com/twitter/search/common/features",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/analytics",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/dalv2/dataset",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:dataset-analytics-java",
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/ad_hoc/recap/dataset_utils",
"timelines/data_processing/ad_hoc/recap/offline_execution",
"timelines/data_processing/util/execution",
],
)
# Hadoop binary wrapping the library above; its entry point is the generic
# com.twitter.scalding.Tool rather than a specific job class.
hadoop_binary(
name = "bin",
basename = "earlybird_training_data_generation-deploy",
main = "com.twitter.scalding.Tool",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":training_data_generation",
],
)
# Hadoop binary whose entry point is EarlybirdTrainingDataProdJob from the
# training_data_generation package.
hadoop_binary(
name = "earlybird_training_data_generation_prod",
basename = "earlybird_training_data_generation_prod-deploy",
main = "com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation.EarlybirdTrainingDataProdJob",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":training_data_generation",
],
)

View File

@ -1,65 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.DataSetPipe
import com.twitter.ml.api.Feature
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.LabelInfo
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.LabelInfoWithFeature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import java.lang.{Double => JDouble}
import scala.util.Random
/**
 * Downsamples training records and attaches an importance weight to the ones that are kept.
 * Kept records carrying at least one of the configured labels are additionally marked with
 * RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT. See [[weightAndSample]] for details.
 *
 * @param random source of randomness for the sampling decisions
 * @param labelInfos labels (and their features) that make a record a positive example
 * @param negativeInfo sampling fraction and importance for records with none of those labels
 */
class EarlybirdExampleSampler(
  random: Random,
  labelInfos: List[LabelInfoWithFeature],
  negativeInfo: LabelInfo) {

  import com.twitter.ml.api.util.FDsl._

  // Record-weight feature carrying a "type" -> "earlybird" extension.
  private[this] val ImportanceFeature: Feature[JDouble] =
    SharedFeatures.RECORD_WEIGHT_FEATURE_BUILDER
      .extensionBuilder()
      .addExtension("type", "earlybird")
      .build()

  // One random draw per call: true with probability labelInfo.downsampleFraction.
  private[this] def uniformSample(labelInfo: LabelInfo) =
    random.nextDouble() < labelInfo.downsampleFraction

  // Importance scaled up by the inverse of the sampling fraction.
  private[this] def weightedImportance(labelInfo: LabelInfo) =
    labelInfo.importance / labelInfo.downsampleFraction

  /**
   * Samples and weights every record of `data`.
   *
   * A record with at least one configured label gets one independent sampling draw per label
   * it carries. If no draw succeeds the record is dropped; otherwise it is marked with
   * IS_EARLYBIRD_UNIFIED_ENGAGEMENT and weighted by the sum of the weighted importances of the
   * labels whose draw succeeded. A record with no configured label is kept according to
   * `negativeInfo`'s sampling fraction and weighted by its weighted importance.
   *
   * @param data records to sample
   * @return the kept records, with the importance and engagement features added to the
   *         feature context
   */
  def weightAndSample(data: DataSetPipe): DataSetPipe = {
    val updatedRecords = data.records.flatMap { record =>
      val presentLabels = labelInfos.filter(info => record.hasFeature(info.feature))
      if (presentLabels.isEmpty) {
        // Negative example.
        if (uniformSample(negativeInfo)) {
          Some(record.setFeatureValue(ImportanceFeature, weightedImportance(negativeInfo)))
        } else {
          None
        }
      } else {
        // Positive example: one draw per label found on the record.
        val keptInfos = presentLabels.map(_.info).filter(uniformSample)
        if (keptInfos.isEmpty) {
          None
        } else {
          record.setFeatureValue(RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT, true)
          Some(record.setFeatureValue(ImportanceFeature, keptInfos.map(weightedImportance).sum))
        }
      }
    }

    DataSetPipe(
      updatedRecords,
      data.featureContext
        .addFeatures(ImportanceFeature, RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT)
    )
  }
}

View File

@ -1,63 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.analytics.DataSetAnalyticsPlugin
import com.twitter.ml.api.matcher.FeatureMatcher
import com.twitter.ml.api.util.FDsl
import com.twitter.ml.api.DailySuffixFeatureSource
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.DataSetPipe
import com.twitter.ml.api.FeatureStats
import com.twitter.ml.api.IMatcher
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedJson
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.timelines.data_processing.util.execution.UTCDateRangeFromArgs
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingConfiguration
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import scala.collection.JavaConverters._
/**
 * Computes feature statistics for all labels in a Recap data source, including the derived
 * IS_EARLYBIRD_UNIFIED_ENGAGEMENT label.
 *
 * Arguments:
 * --input recap data source (containing all labels)
 * --output path to output JSON file containing stats
 */
object EarlybirdStatsJob extends TwitterExecutionApp with UTCDateRangeFromArgs {

  import DataSetAnalyticsPlugin._
  import FDsl._
  import RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT

  lazy val constants: EarlybirdTrainingConfiguration = new EarlybirdTrainingRecapConfiguration

  /** Sets the unified engagement label (in place) on records carrying any configured label. */
  private[this] def addGlobalEngagementLabel(record: DataRecord) = {
    val carriesAnyLabel = constants.LabelInfos.exists(info => record.hasFeature(info.feature))
    if (carriesAnyLabel) {
      record.setFeatureValue(IS_EARLYBIRD_UNIFIED_ENGAGEMENT, true)
    }
    record
  }

  /** Matches the unified engagement label plus every configured label, by feature name. */
  private[this] def labelFeatureMatcher: IMatcher = {
    val configuredLabelNames = constants.LabelInfos.map(_.feature.getFeatureName)
    val allLabelNames = IS_EARLYBIRD_UNIFIED_ENGAGEMENT.getFeatureName :: configuredLabelNames
    FeatureMatcher.names(allLabelNames.asJava)
  }

  /** Labels the records, keeps only the label features and collects their stats. */
  private[this] def computeStats(data: DataSetPipe): TypedPipe[FeatureStats] = {
    val labelled = data.viaRecords(records => records.map(addGlobalEngagementLabel))
    val labelsOnly = labelled.project(labelFeatureMatcher)
    labelsOnly.collectFeatureStats()
  }

  override def job: Execution[Unit] =
    Execution.getArgs.flatMap { args =>
      dateRangeEx.flatMap { dateRange =>
        val data = DailySuffixFeatureSource(args("input"))(dateRange).read
        computeStats(data)
          .writeExecution(TypedJson(args("output")))
          .map(_ => ())
      }
    }
}

View File

@ -1,92 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.HourlySuffixFeatureSource
import com.twitter.ml.api.IRecord
import com.twitter.scalding.Args
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.Execution
import com.twitter.scalding.ExecutionUtil
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRectweetConfiguration
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineAdhocExecution
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineAnalyticsBatchExecution
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineExecution
import scala.util.Random
import com.twitter.scalding_internal.dalv2.dataset.DALWrite._
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation._
/**
 * Generates data for training an Earlybird-friendly model.
 * Produces a single "global" engagement label, and samples data accordingly.
 * Also renames features to their original Earlybird feature names (via the
 * configuration's EarlybirdFeatureRenamer) so they can be used as is in EB.
 *
 * Arguments:
 * --input path to raw Recap training data (all labels)
 * --output path to write sampled Earlybird-friendly training data
 * --seed (optional) for random number generator (in sampling)
 * --parallelism (default: 1) number of days to generate data for in parallel
 * [splits long date range into single days]
 * --rectweet (boolean flag) use the rectweet configuration and dataset instead of the
 * recap ones
 */
trait GenerateEarlybirdTrainingData { _: OfflineExecution =>
// Keeps only records whose EARLYBIRD_SCORE feature value is at most 100.0.
// NOTE(review): presumably every input record carries EARLYBIRD_SCORE; confirm what
// getFeatureValue returns when the feature is absent.
def isEligibleForEarlybirdScoring(record: IRecord): Boolean = {
// The rationale behind this logic is available in TQ-9678.
record.getFeatureValue(TimelinesSharedFeatures.EARLYBIRD_SCORE) <= 100.0
}
// Runs the generation once per one-day split of the date range, with up to `parallelism`
// splits in flight, and discards the per-split results.
override def executionFromParams(args: Args)(implicit dateRange: DateRange): Execution[Unit] = {
val seedOpt = args.optional("seed").map(_.toLong)
val parallelism = args.int("parallelism", 1)
val rectweet = args.boolean("rectweet")
ExecutionUtil
.runDateRangeWithParallelism(Days(1), parallelism) { splitRange =>
val data = HourlySuffixFeatureSource(args("input"))(splitRange).read
.filter(isEligibleForEarlybirdScoring _)
// A new Random is created for every split; when --seed is given, each split starts
// from that same seed.
lazy val rng = seedOpt.map(new Random(_)).getOrElse(new Random())
// Configuration and output dataset are chosen together by the --rectweet flag.
val (constants, sink) =
if (rectweet)
(new EarlybirdTrainingRectweetConfiguration, EarlybirdRectweetDataRecordsJavaDataset)
else (new EarlybirdTrainingRecapConfiguration, EarlybirdRecapDataRecordsJavaDataset)
val earlybirdSampler =
new EarlybirdExampleSampler(
random = rng,
labelInfos = constants.LabelInfos,
negativeInfo = constants.NegativeInfo
)
val outputPath = args("output")
earlybirdSampler
.weightAndSample(data)
.transform(constants.EarlybirdFeatureRenamer)
// shuffle row-wise in order to get rid of clustered replies
// also keep number of part files small
.viaRecords { record =>
record
.groupRandomly(partitions = 500)
.sortBy { _ => rng.nextDouble() }
.values
}
.writeDALExecution(
sink,
D.Daily,
D.Suffix(outputPath),
D.EBLzo()
)(splitRange)
}(dateRange).unit
}
}
/** Earlybird training data generation run through [[OfflineAdhocExecution]]. */
object EarlybirdTrainingDataAdHocJob
extends OfflineAdhocExecution
with GenerateEarlybirdTrainingData
/** Earlybird training data generation run through [[OfflineAnalyticsBatchExecution]]. */
object EarlybirdTrainingDataProdJob
extends OfflineAnalyticsBatchExecution
with GenerateEarlybirdTrainingData

View File

@ -1,124 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import java.lang.{Boolean => JBoolean}
import java.lang.{Long => JLong}
import scala.language.existentials
/**
 * A wrapper for [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]]
 * (see TypedAggregateGroup.scala) with some convenient syntactic sugar that avoids
 * the user having to specify different groups for different types of features.
 * Gets translated into multiple strongly typed TypedAggregateGroup(s)
 * by the buildTypedAggregateGroups() method defined below.
 *
 * @param inputSource Source to compute this aggregate over
 * @param aggregatePrefix Prefix to use for naming resultant aggregate features
 * @param keys Features to group by when computing the aggregates
 * (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary.
 * Grouping by a sparse binary feature is different than grouping by a discrete or string
 * feature. For example, if you have a sparse binary feature WORDS_IN_TWEET which is
 * a set of all words in a tweet, then grouping by this feature generates a
 * separate aggregate mean/count/etc for each value of the feature (each word), and
 * not just a single aggregate count for different "sets of words"
 * @param features Features to aggregate (e.g. blender_score or is_photo).
 * @param labels Labels to cross the features with to make pair features, if any.
 * @param metrics Aggregation metrics to compute (e.g. count, mean)
 * @param halfLives Half lives to use for the aggregations, to be crossed with the above.
 * use Duration.Top for "forever" aggregations over an infinite time window (no decay).
 * @param outputStore Store to output this aggregate to
 * @param preTransforms Sequence of [[OneToSomeTransform]] that is applied to
 * data records pre-aggregation (e.g. discretization, renaming)
 * @param includeAnyFeature Aggregate label counts for any feature value
 * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
 * @param includeTimestampFeature compute max aggregate on timestamp feature
 * @param aggExclusionRegex Sequence of regex strings handed unchanged to every
 * TypedAggregateGroup built from the features (presumably selecting aggregate features to
 * exclude; the original description was truncated, so confirm against TypedAggregateGroup)
 */
case class AggregateGroup(
inputSource: AggregateSource,
aggregatePrefix: String,
keys: Set[Feature[_]],
features: Set[Feature[_]],
labels: Set[_ <: Feature[JBoolean]],
metrics: Set[EasyMetric],
halfLives: Set[Duration],
outputStore: AggregateStore,
preTransforms: Seq[OneToSomeTransform] = Seq.empty,
includeAnyFeature: Boolean = true,
includeAnyLabel: Boolean = true,
includeTimestampFeature: Boolean = false,
aggExclusionRegex: Seq[String] = Seq.empty) {
// Builds the TypedAggregateGroup for one feature type: resolves each EasyMetric to its
// typed metric for `featureType` (metrics that resolve to nothing are dropped by the
// flatMap) and casts the features to Feature[T]. The cast is unchecked; callers are
// expected to pass features that all share `featureType`.
private def toStrongType[T](
metrics: Set[EasyMetric],
features: Set[Feature[_]],
featureType: FeatureType
): TypedAggregateGroup[_] = {
val underlyingMetrics: Set[AggregationMetric[T, _]] =
metrics.flatMap(_.forFeatureType[T](featureType))
val underlyingFeatures: Set[Feature[T]] = features
.map(_.asInstanceOf[Feature[T]])
TypedAggregateGroup[T](
inputSource = inputSource,
aggregatePrefix = aggregatePrefix,
keysToAggregate = keys,
featuresToAggregate = underlyingFeatures,
labels = labels,
metrics = underlyingMetrics,
halfLives = halfLives,
outputStore = outputStore,
preTransforms = preTransforms,
includeAnyFeature,
includeAnyLabel,
aggExclusionRegex
)
}
// Group that tracks the max of TypedAggregateGroup.timestampFeature per key: no labels,
// no decay (Duration.Top), no exclusion regex.
// NOTE(review): the `.get` assumes MaxMetric supports the timestamp feature's type.
private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = {
val metrics: Set[AggregationMetric[JLong, _]] =
Set(MaxMetric.forFeatureType[JLong](TypedAggregateGroup.timestampFeature.getFeatureType).get)
TypedAggregateGroup[JLong](
inputSource = inputSource,
aggregatePrefix = aggregatePrefix,
keysToAggregate = keys,
featuresToAggregate = Set(TypedAggregateGroup.timestampFeature),
labels = Set.empty,
metrics = metrics,
halfLives = Set(Duration.Top),
outputStore = outputStore,
preTransforms = preTransforms,
includeAnyFeature = false,
includeAnyLabel = true,
aggExclusionRegex = Seq.empty
)
}
// Expands this group into typed groups: one per distinct feature type among `features`
// (or a single BINARY-typed group when `features` is empty), followed by the timestamp
// group when includeTimestampFeature is set.
def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = {
val typedAggregateGroupsList = {
if (features.isEmpty) {
List(toStrongType(metrics, features, FeatureType.BINARY))
} else {
features
.groupBy(_.getFeatureType())
.toList
.map {
case (featureType, features) =>
toStrongType(metrics, features, featureType)
}
}
}
val optionalTimestampTypedAggregateGroup =
if (includeTimestampFeature) List(timestampTypedAggregateGroup) else List()
typedAggregateGroupsList ++ optionalTimestampTypedAggregateGroup
}
}

View File

@ -1,9 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.Feature
import java.lang.{Long => JLong}
/**
 * A named input source that aggregates are computed over, together with the feature
 * that carries each record's timestamp.
 */
trait AggregateSource extends Serializable {
/** Name of this source. */
def name: String
/** Feature holding the timestamp of a record from this source. */
def timestampFeature: Feature[JLong]
}

View File

@ -1,5 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/** A named store that computed aggregates are output to. */
trait AggregateStore extends Serializable {
/** Name of this store. */
def name: String
}

View File

@ -1,5 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/** Configuration listing the typed aggregate groups to compute. */
trait AggregationConfig {
/** All aggregate groups covered by this configuration. */
def aggregatesToCompute: Set[TypedAggregateGroup[_]]
}

View File

@ -1,50 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import scala.util.Try
/**
 * Case class that represents the "grouping" key for any aggregate feature.
 * Used by Summingbird to output aggregates to the key-value "store" using sumByKey()
 *
 * Example 1: the user aggregate features in aggregatesv1 all group by USER_ID,
 * which is a discrete feature. When storing these features, the key would be:
 *
 * discreteFeaturesById = Map(hash(USER_ID) -> <the actual user id>), textFeaturesById = Map()
 *
 * Ex 2: If aggregating grouped by USER_ID, AUTHOR_ID, tweet link url, the key would be:
 *
 * discreteFeaturesById = Map(hash(USER_ID) -> <actual user id>, hash(AUTHOR_ID) -> <actual author id>),
 * textFeaturesById = Map(hash(URL_FEATURE) -> <the link url>)
 *
 * I could have just used a DataRecord for the key, but I wanted to make it strongly typed
 * and only support grouping by discrete and string features, so using a case class instead.
 *
 * Re: efficiency, storing the hash of the feature in addition to just the feature value
 * is somewhat more inefficient than only storing the feature value in the key, but it
 * adds flexibility to group multiple types of aggregates in the same output store. If we
 * decide this isn't a good tradeoff to make later, we can reverse/refactor this decision.
 *
 * @param discreteFeaturesById All discrete feature ids (+ values) that are part of this key
 * @param textFeaturesById All string feature ids (+ values) that are part of this key
 */
case class AggregationKey(
discreteFeaturesById: Map[Long, Long],
textFeaturesById: Map[Long, String])
/**
 * Injection between [[AggregationKey]] and bytes, so that Summingbird knows how to store
 * the key in Manhattan. The key is serialized as the tuple of its two feature maps.
 */
object AggregationKeyInjection extends Injection[AggregationKey, Array[Byte]] {

  /* Injection from tuple representation of AggregationKey to Array[Byte] */
  val featureMapsInjection: Injection[(Map[Long, Long], Map[Long, String]), Array[Byte]] =
    Bufferable.injectionOf[(Map[Long, Long], Map[Long, String])]

  /** Serializes the key's (discrete, text) feature maps. */
  def apply(aggregationKey: AggregationKey): Array[Byte] = {
    val featureMaps = AggregationKey.unapply(aggregationKey).get
    featureMapsInjection(featureMaps)
  }

  /** Rebuilds the key from bytes; fails when the bytes do not decode to the tuple form. */
  def invert(ab: Array[Byte]): Try[AggregationKey] =
    featureMapsInjection.invert(ab).map {
      case (discreteFeatures, textFeatures) => AggregationKey(discreteFeatures, textFeatures)
    }
}

View File

@ -1,101 +0,0 @@
# Library with every *.scala source that sits directly in this directory (the common
# types of the aggregation framework).
scala_library(
name = "common_types",
sources = ["*.scala"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/twitter/algebird:bijection",
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:macros",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/dal/client/dataset",
"src/scala/com/twitter/ml/api/util:datarecord",
"src/scala/com/twitter/scalding_internal/dalv2/vkvs",
"src/scala/com/twitter/scalding_internal/multiformat/format/keyval",
"src/scala/com/twitter/storehaus_internal/manhattan/config",
"src/scala/com/twitter/storehaus_internal/offline",
"src/scala/com/twitter/storehaus_internal/util",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/scala/com/twitter/summingbird_internal/runner/store_config",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/dal/personal_data:personal_data-scala",
"src/thrift/com/twitter/ml/api:data-java",
# Sub-packages of the aggregation framework used by the common types.
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"timelines/data_processing/ml_util/transforms",
"util/util-core:util-core-util",
],
)
# Alias for the memcache storehaus dependency.
target(
name = "common_online_stores",
dependencies = [
"src/scala/com/twitter/storehaus_internal/memcache",
],
)
# Alias for the manhattan storehaus dependency.
target(
name = "common_offline_stores",
dependencies = [
"src/scala/com/twitter/storehaus_internal/manhattan",
],
)
# Alias for the "job" sub-package of the aggregation framework.
target(
name = "user_job",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/job",
],
)
# Alias for the "scalding" sub-package of the aggregation framework.
target(
name = "scalding",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/scalding",
],
)
# Alias for the "conversion" sub-package of the aggregation framework.
target(
name = "conversion",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/conversion",
],
)
# Alias for the "query" sub-package of the aggregation framework.
target(
name = "query",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/query",
],
)
# Alias for the "heron" sub-package of the aggregation framework.
target(
name = "heron",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/heron",
],
)
# Bundle of the targets above, except ":user_job", which is not listed.
# NOTE(review): no explicit name is given; presumably the build tool falls back to the
# directory's default target name. Confirm.
target(
dependencies = [
":common_offline_stores",
":common_online_stores",
":common_types",
":conversion",
":heron",
":query",
":scalding",
],
)

View File

@ -1,92 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.algebird.Monoid
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import scala.collection.mutable
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
/**
 * Monoid that aggregates DataRecord objects.
 *
 * Implementations provide `aggregates`, the set of ''TypedAggregateGroup'' case classes
 * to compute using this monoid (see TypedAggregateGroup.scala).
 */
trait DataRecordMonoid extends Monoid[DataRecord] {

  /** Aggregate groups whose features are combined by [[plus]]. */
  val aggregates: Set[TypedAggregateGroup[_]]

  /** A fresh, empty DataRecord. */
  def zero(): DataRecord = new DataRecord

  /**
   * Adds two datarecords using this monoid.
   *
   * Every aggregate group writes its combined features into a new record, which is then
   * stamped with the later of the two input timestamps.
   *
   * @param left Left datarecord to add
   * @param right Right datarecord to add
   * @return Sum of the two datarecords as a DataRecord
   */
  def plus(left: DataRecord, right: DataRecord): DataRecord = {
    val combined = zero()
    aggregates.foreach(group => group.mutatePlus(combined, left, right))

    val newestTimestamp = getTimestamp(left).max(getTimestamp(right))
    SRichDataRecord(combined).setFeatureValue(SharedFeatures.TIMESTAMP, newestTimestamp)
    combined
  }
}
/**
 * [[DataRecordMonoid]] with a batched `sumOption` that folds records into a single
 * accumulator record instead of allocating one result per pairwise `plus`.
 *
 * @param aggregates aggregate groups to compute using this monoid
 */
case class DataRecordAggregationMonoid(aggregates: Set[TypedAggregateGroup[_]])
    extends DataRecordMonoid {

  /**
   * Collapses the buffer, in place, into one record holding the sum of its elements.
   * After the call the buffer contains exactly that record.
   */
  private def sumBuffer(buffer: mutable.ArrayBuffer[DataRecord]): Unit = {
    val accumulated = zero()
    buffer.foreach { record =>
      // Timestamps are read before mutatePlus touches the accumulator.
      val newestTimestamp = getTimestamp(accumulated).max(getTimestamp(record))
      aggregates.foreach(_.mutatePlus(accumulated, accumulated, record))
      SRichDataRecord(accumulated).setFeatureValue(SharedFeatures.TIMESTAMP, newestTimestamp)
    }
    buffer.clear()
    buffer += accumulated
  }

  /**
   * Efficient batched aggregation of datarecords using
   * this monoid + a buffer, for performance.
   *
   * @param dataRecordIter An iterator of datarecords to sum
   * @return A datarecord option containing the sum, or None for empty input
   */
  override def sumOption(dataRecordIter: TraversableOnce[DataRecord]): Option[DataRecord] =
    if (dataRecordIter.isEmpty) {
      None
    } else {
      val BatchSize = 1000
      val pending = mutable.ArrayBuffer[DataRecord]()
      for (record <- dataRecordIter) {
        // Collapse the buffer once it has grown past the batch size.
        if (pending.size > BatchSize) sumBuffer(pending)
        pending += record
      }
      if (pending.size > 1) sumBuffer(pending)
      Some(pending(0))
    }
}
/**
 * This class is used when there is no need to use sumBuffer functionality, as in the case of
 * online aggregation of datarecords where using a buffer on a small number of datarecords
 * would add some performance overhead. It adds nothing to [[DataRecordMonoid]].
 *
 * @param aggregates aggregate groups to compute using this monoid
 */
case class DataRecordAggregationMonoidNoBuffer(aggregates: Set[TypedAggregateGroup[_]])
extends DataRecordMonoid {}

View File

@ -1,27 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.DataRecord
/**
 * Keyed record that is used to represent the aggregation type and its corresponding data record.
 *
 * @constructor creates a new keyed record.
 *
 * @param aggregateType the aggregate type
 * @param record the data record associated with the key
 */
case class KeyedRecord(aggregateType: AggregateType.Value, record: DataRecord)
/**
 * Keyed record map with multiple data records.
 *
 * @constructor creates a new keyed record map.
 *
 * @param aggregateType the aggregate type
 * @param recordMap a map with key of type Long and value of type DataRecord
 * where the key indicates the index and the value indicates the record
 */
case class KeyedRecordMap(
aggregateType: AggregateType.Value,
recordMap: scala.collection.Map[Long, DataRecord])

View File

@ -1,46 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.personal_data.thriftscala.PersonalDataType
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Batched
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.JavaCompactThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.genericInjection
import com.twitter.summingbird.batch.BatchID
import scala.collection.JavaConverters._
/**
 * Key/value injections for offline aggregates: keys are [[AggregationKey]]s, values are
 * (BatchID, DataRecord) pairs.
 */
object OfflineAggregateInjections {

  /** Injection built without any personal data type information. */
  val offlineDataRecordAggregateInjection: KeyValInjection[AggregationKey, (BatchID, DataRecord)] =
    KeyValInjection(
      genericInjection(AggregationKeyInjection),
      Batched(JavaCompactThrift[DataRecord])
    )

  /**
   * Collects the personal data types declared on the features of the given groups.
   *
   * @param aggregateGroups groups to inspect
   * @param featureExtractor selects the features of a group to inspect
   * @return the distinct personal data types found, or None when there are none
   */
  private[aggregation_framework] def getPdts[T](
    aggregateGroups: Iterable[T],
    featureExtractor: T => Iterable[Feature[_]]
  ): Option[Set[PersonalDataType]] = {
    val collected: Set[PersonalDataType] =
      for {
        aggregateGroup <- aggregateGroups.toSet[T]
        extractedFeature <- featureExtractor(aggregateGroup)
        javaPdts <- extractedFeature.getPersonalDataTypes.asSet().asScala
        javaPdt <- javaPdts.asScala
        // Java values without a Scala counterpart are skipped.
        pdt <- PersonalDataType.get(javaPdt.getValue)
      } yield pdt

    Some(collected).filter(_.nonEmpty)
  }

  /**
   * Builds the injection for the given groups, annotating the key side with the personal
   * data types of the groups' output keys and the value side with those of their output
   * features.
   */
  def getInjection(
    aggregateGroups: Set[TypedAggregateGroup[_]]
  ): KeyValInjection[AggregationKey, (BatchID, DataRecord)] = {
    val pdtsOfKeys = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputKeys)
    val pdtsOfValues = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputFeatures)

    KeyValInjection(
      genericInjection(AggregationKeyInjection, pdtsOfKeys),
      genericInjection(Batched(JavaCompactThrift[DataRecord]), pdtsOfValues)
    )
  }
}

View File

@ -1,21 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.client.dataset.TimePartitionedDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import java.lang.{Long => JLong}
/**
 * An [[AggregateSource]] read offline, located either by an HDFS path or by a DAL dataset.
 *
 * @param name name of this source
 * @param timestampFeature feature holding the timestamp of a record from this source
 * @param scaldingHdfsPath optional HDFS path of the source
 * @param scaldingSuffixType optional suffix type used with the HDFS path
 * @param dalDataSet optional DAL dataset of the source
 * @param withValidation see https://jira.twitter.biz/browse/TQ-10618 for context
 */
case class OfflineAggregateSource(
  override val name: String,
  override val timestampFeature: Feature[JLong],
  scaldingHdfsPath: Option[String] = None,
  scaldingSuffixType: Option[String] = None,
  dalDataSet: Option[TimePartitionedDALDataset[DataRecord]] = None,
  withValidation: Boolean = true) // context: https://jira.twitter.biz/browse/TQ-10618
    extends AggregateSource {
  /*
   * To help transition callers to use DAL.read, the HDFS path and the DAL dataset are
   * mutually exclusive: at most one of the two may be set.
   */
  assert(scaldingHdfsPath.isEmpty || dalDataSet.isEmpty)
}

View File

@ -1,128 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.DateParser
import com.twitter.scalding.RichDate
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.storehaus_internal.manhattan._
import com.twitter.storehaus_internal.util.ApplicationID
import com.twitter.storehaus_internal.util.DatasetName
import com.twitter.storehaus_internal.util.HDFSPath
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird_internal.runner.store_config._
import java.util.TimeZone
import com.twitter.summingbird.batch.MillisecondBatcher
/**
 * Configuration common to all offline aggregate stores
 *
 * @param outputHdfsPathPrefix HDFS prefix to store all output aggregate types offline
 * @param dummyAppId Dummy manhattan app id required by summingbird (unused)
 * @param dummyDatasetPrefix Dummy manhattan dataset prefix required by summingbird (unused)
 * @param startDate Start date for summingbird job to begin computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
outputHdfsPathPrefix: String,
dummyAppId: String,
dummyDatasetPrefix: String,
startDate: String)
/**
 * A trait inherited by any object that defines
 * a HDFS prefix to write output data to. E.g. timelines has its own
 * output prefix to write aggregates_v2 results, your team can create
 * its own.
 */
trait OfflineStoreCommonConfig extends Serializable {
/**
 * Builds the common store configuration for a start date.
 *
 * @param startDate Date to create config for
 * @return OfflineAggregateStoreCommonConfig object with all config details for output populated
 */
def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
/**
 * Base class of the offline aggregate stores. Implementations supply the abstract members
 * below; the Manhattan config, batcher and start time are derived from them.
 *
 * Abstract members:
 *  - name: Uniquely identifiable human-readable name for this output store
 *  - startDate: Start date for this output store from which aggregates should be computed
 *  - commonConfig: Provider of other common configuration details
 *  - batchesToKeep: Retention policy on output (number of batches to keep)
 *  - maxKvSourceFailures: not described in this file. TODO document
 *
 * NOTE(review): the vals below are computed during construction from `commonConfig`,
 * `startDate` and `name`, so implementations presumably need to supply those as
 * constructor parameters (as the case classes in this file do). Confirm.
 */
abstract class OfflineAggregateStoreBase
extends OfflineStoreOnlyConfig[ManhattanROConfig]
with AggregateStore {
override def name: String
def startDate: String
def commonConfig: OfflineStoreCommonConfig
def batchesToKeep: Int
def maxKvSourceFailures: Int
// Common configuration resolved for this store's start date.
val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig.apply(startDate)
// HDFS path is "<outputHdfsPathPrefix>/<name>"; dataset name is "<dummyDatasetPrefix>_<name>_1".
val manhattan: ManhattanROConfig = ManhattanROConfig(
/* This is a sample config, will be replaced with production config later */
HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}"),
ApplicationID(datedCommonConfig.dummyAppId),
DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1"),
com.twitter.storehaus_internal.manhattan.Adama
)
// Batches span 24 hours.
val batcherSize = 24
val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)
// Start date parsed in UTC with the default date parser.
val startTime: RichDate =
RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)
// The offline config is the same object as `manhattan`.
val offline: ManhattanROConfig = manhattan
}
/**
 * Defines an aggregates store which is composed of DataRecords
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures not described in this file (defaults to 0). TODO document
 */
case class OfflineAggregateDataRecordStore(
override val name: String,
override val startDate: String,
override val commonConfig: OfflineStoreCommonConfig,
override val batchesToKeep: Int = 7,
override val maxKvSourceFailures: Int = 0)
extends OfflineAggregateStoreBase {
// Builds the DAL-writing variant of this store with the same name, start date, common
// config and maxKvSourceFailures. batchesToKeep is not forwarded, so the result uses
// that class's default.
def toOfflineAggregateDataRecordStoreWithDAL(
dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
): OfflineAggregateDataRecordStoreWithDAL =
OfflineAggregateDataRecordStoreWithDAL(
name = name,
startDate = startDate,
commonConfig = commonConfig,
dalDataset = dalDataset,
maxKvSourceFailures = maxKvSourceFailures
)
}
/** Mixin for stores that expose the key-value DAL dataset they are associated with. */
trait withDALDataset {
/** DAL dataset keyed by AggregationKey, with (BatchID, DataRecord) values. */
def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}
/**
* Defines an aggregates store which is composed of DataRecords and writes using DAL.
* @param name Uniquely identifiable human-readable name for this output store
* @param startDate Start date for this output store from which aggregates should be computed
* @param commonConfig Provider of other common configuration details
* @param dalDataset The KeyValDALDataset for this output store
* @param batchesToKeep Unused, kept for interface compatibility. You must define a separate Oxpecker
* retention policy to maintain the desired number of versions.
* @param maxKvSourceFailures Defaults to 0. NOTE(review): presumably a failure tolerance for the
* key-value source -- confirm against the consumers of this setting.
*/
case class OfflineAggregateDataRecordStoreWithDAL(
override val name: String,
override val startDate: String,
override val commonConfig: OfflineStoreCommonConfig,
override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
override val batchesToKeep: Int = -1,
override val maxKvSourceFailures: Int = 0)
extends OfflineAggregateStoreBase
with withDALDataset

View File

@ -1,39 +0,0 @@
Overview
========
The **aggregation framework** is a set of libraries and utilities that allows teams to flexibly
compute aggregate (counting) features in both batch and in real-time. Aggregate features can capture
historical interactions between arbitrary entities (and sets thereof), conditional on provided features
and labels.
These types of engineered aggregate features have proven to be highly impactful across different teams at Twitter.
What are some features we can compute?
--------------------------------------
The framework supports computing aggregate features on provided grouping keys. The only constraint is that these keys are sparse binary features (or are sets thereof).
For example, a common use case is to calculate a user's past engagement history with various types of tweets (photo, video, retweets, etc.), specific authors, specific in-network engagers or any other entity the user has interacted with and that could provide signal. In this case, the underlying aggregation keys are `userId`, `(userId, authorId)` or `(userId, engagerId)`.
In Timelines and MagicRecs, we also compute custom aggregate engagement counts on every `tweetId`. Similarly, other aggregations are possible, perhaps on `advertiserId` or `mediaId` as long as the grouping key is sparse binary.
What implementations are supported?
-----------------------------------
Offline, we support the daily batch processing of DataRecords containing all required input features to generate
aggregate features. These are then uploaded to Manhattan for online hydration.
Online, we support the real-time aggregation of DataRecords through Storm with a backing memcache that can be queried
for the real-time aggregate features.
Additional documentation exists in the [docs folder](docs)
Where is this used?
--------------------
The Home Timeline heavy ranker uses a variety of both [batch and real time features](../../../../src/scala/com/twitter/timelines/prediction/common/aggregates/README.md) generated by this framework.
These features are also used for email and other recommendations.

View File

@ -1,68 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureType
/**
 * Convenience class to describe the stores that make up a particular type of aggregate.
 *
 * For example, as of 2018/07, user aggregates are generated by merging the individual
 * "user_aggregates", "rectweet_user_aggregates", and, "twitter_wide_user_aggregates".
 *
 * Construction fails with an IllegalArgumentException when `storeNames` is empty or when
 * the stores do not all share the same aggregation keys (see [[StoreMerger.isValidToMerge]]).
 *
 * @param storeNames Name of the stores. Must be non-empty.
 * @param aggregateType Type of aggregate, usually differentiated by the aggregation key.
 * @param shouldHash Used at TimelineRankingAggregatesUtil.extractSecondary when extracting the
 *                   secondary key value.
 * @param storeMerger Source of the aggregation-key information for the named stores.
 */
case class StoreConfig[T](
  storeNames: Set[String],
  aggregateType: AggregateType.Value,
  shouldHash: Boolean = false
)(
  implicit storeMerger: StoreMerger) {
  // An empty set trivially passes isValidToMerge and would otherwise only fail later at
  // storeNames.head with an unhelpful NoSuchElementException, so reject it up front.
  require(storeNames.nonEmpty, "storeNames must contain at least one store name.")
  require(
    storeMerger.isValidToMerge(storeNames),
    s"Stores $storeNames do not share the same aggregation keys and cannot be merged."
  )

  // All stores share the same keys (checked above), so any one of them can be used for lookups.
  private val representativeStore = storeNames.head

  val aggregationKeyIds: Set[Long] = storeMerger.getAggregateKeys(representativeStore)

  val aggregationKeyFeatures: Set[Feature[_]] =
    storeMerger.getAggregateKeyFeatures(representativeStore)

  val secondaryKeyFeatureOpt: Option[Feature[_]] = storeMerger.getSecondaryKey(representativeStore)
}
/**
 * Looks up aggregation-key information for named output stores from an AggregationConfig.
 */
trait StoreMerger {
  def aggregationConfig: AggregationConfig

  /**
   * Union of the `keysToAggregate` of every configured aggregate whose output store
   * carries the given name.
   */
  def getAggregateKeyFeatures(storeName: String): Set[Feature[_]] =
    aggregationConfig.aggregatesToCompute
      .filter(group => group.outputStore.name == storeName)
      .flatMap(group => group.keysToAggregate)

  /** Dense feature ids (as computed by TypedAggregateGroup) of the store's key features. */
  def getAggregateKeys(storeName: String): Set[Long] = {
    val keyFeatures = getAggregateKeyFeatures(storeName)
    TypedAggregateGroup.getKeyFeatureIds(keyFeatures)
  }

  /**
   * The key feature other than USER_ID, if the store has one. A sparse binary key is
   * returned in its transformed form (see TypedAggregateGroup.sparseFeature).
   *
   * Fails with IllegalArgumentException when the store has more than two key features
   * or when USER_ID is not among them.
   */
  def getSecondaryKey(storeName: String): Option[Feature[_]] = {
    val keyFeatures = getAggregateKeyFeatures(storeName)
    require(keyFeatures.size <= 2, "Only singleton or binary aggregation keys are supported.")
    require(
      keyFeatures.contains(SharedFeatures.USER_ID),
      "USER_ID must be one of the aggregation keys.")

    // At most one non-USER_ID key can exist after the two checks above.
    keyFeatures
      .find(_ != SharedFeatures.USER_ID)
      .map { secondaryKey =>
        if (secondaryKey.getFeatureType == FeatureType.SPARSE_BINARY) {
          TypedAggregateGroup.sparseFeature(secondaryKey)
        } else {
          secondaryKey
        }
      }
  }

  /**
   * Stores may only be merged if they have the same aggregation key.
   * An empty set of store names is considered valid.
   */
  def isValidToMerge(storeNames: Set[String]): Boolean =
    storeNames.headOption.forall { firstStore =>
      val expectedKeys = getAggregateKeys(firstStore)
      storeNames.forall(storeName => getAggregateKeys(storeName) == expectedKeys)
    }
}

View File

@ -1,13 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/**
 * Registry of StoreConfigs with lazily built lookup tables.
 */
trait StoreRegister {
  def allStores: Set[StoreConfig[_]]

  /** Index of the registered configs by their aggregate type. */
  lazy val storeMap: Map[AggregateType.Value, StoreConfig[_]] =
    allStores.map(config => config.aggregateType -> config).toMap

  /** Maps every individual store name to the aggregate type of the config that lists it. */
  lazy val storeNameToTypeMap: Map[String, AggregateType.Value] =
    (for {
      config <- allStores
      storeName <- config.storeNames
    } yield storeName -> config.aggregateType).toMap
}

View File

@ -1,486 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import com.twitter.util.Try
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Set => JSet}
import scala.annotation.tailrec
import scala.language.existentials
import scala.collection.JavaConverters._
import scala.util.matching.Regex
/**
* A case class containing precomputed data useful to quickly
* process operations over an aggregate.
*
* @param query The underlying feature being aggregated
* @param metric The aggregation metric
* @param outputFeatures The output features that aggregation will produce
* @param outputFeatureIds The precomputed hashes of the above outputFeatures
*/
case class PrecomputedAggregateDescriptor[T](
query: AggregateFeature[T],
metric: AggregationMetric[T, _],
outputFeatures: List[Feature[_]],
outputFeatureIds: List[JLong])
object TypedAggregateGroup {

  /**
   * Generates all combinations of value assignments for a collection of
   * sparse binary features.
   *
   * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
   * @return A set of maps, where each map represents one possible assignment of values to ids.
   *         Empty when the list is empty or when any feature has no possible values.
   */
  def sparseBinaryPermutations(
    sparseBinaryIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] =
    sparseBinaryIdValues match {
      case Nil => Set.empty
      case (headId, headValues) :: tail =>
        // Seed with one single-entry assignment per value of the first feature.
        val singletons = headValues.map(value => Map(headId -> value))
        tailRecSparseBinaryPermutations(singletons, tail)
    }

  /* Crosses the assignments built so far with each value of the next pending feature. */
  @tailrec private[this] def tailRecSparseBinaryPermutations(
    existingPermutations: Set[Map[Long, String]],
    remainingIdValues: List[(Long, Set[String])]
  ): Set[Map[Long, String]] =
    remainingIdValues match {
      case (id, values) :: rest =>
        val extended = for {
          assignment <- existingPermutations
          value <- values
        } yield assignment + (id -> value)
        tailRecSparseBinaryPermutations(extended, rest)
      case Nil => existingPermutations
    }

  val SparseFeatureSuffix = ".member"

  /**
   * Text feature standing in for one member of a sparse binary feature: same dense name
   * with [[SparseFeatureSuffix]] appended, personal data types derived from the original.
   */
  def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] = {
    val memberFeatureName = sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix
    new Feature.Text(
      memberFeatureName,
      AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
  }

  /*
   * Throws exception if obj not an instance of U.
   * NOTE(review): U is an erased type parameter, so the runtime check cannot verify the
   * specific type; in practice it only rejects null.
   */
  private[this] def validate[U](obj: Any): U = {
    require(obj.isInstanceOf[U])
    obj.asInstanceOf[U]
  }

  /* Value of the feature in the record, or None when the record yields null for it. */
  private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] = {
    val rawValue = SRichDataRecord(dataRecord).getFeatureValue(feature)
    Option(rawValue).map(value => validate[U](value))
  }

  /**
   * Get a mapping from feature ids
   * (including individual sparse elements of a sparse feature) to values
   * from the given data record, for a given feature type.
   *
   * @param dataRecord Data record to get features from
   * @param keysToAggregate key features to get id-value mappings for
   * @param featureType Feature type to get id-value maps for
   * @return one (id, value) pair per key feature of the requested type; the value is None
   *         when the record has no value for that feature
   */
  def getKeyFeatureIdValues[U](
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]],
    featureType: FeatureType
  ): Set[(Long, Option[U])] =
    keysToAggregate
      .filter(_.getFeatureType == featureType)
      .map { key =>
        val typedKey = validate[Feature[U]](key)
        val featureId: Long = getDenseFeatureId(typedKey)
        val featureOpt: Option[U] = getFeatureOpt(dataRecord, typedKey)
        (featureId, featureOpt)
      }

  // TypedAggregateGroup may transform the aggregate keys for internal use. This method generates
  // denseFeatureIds for the transformed feature.
  def getDenseFeatureId(feature: Feature[_]): Long =
    if (feature.getFeatureType == FeatureType.SPARSE_BINARY) {
      sparseFeature(feature).getDenseFeatureId
    } else {
      feature.getDenseFeatureId
    }

  /**
   * Return denseFeatureIds for the input features after applying the custom transformation that
   * TypedAggregateGroup applies to its keysToAggregate.
   *
   * @param keysToAggregate key features to get id for
   */
  def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
    keysToAggregate.map(key => getDenseFeatureId(key))

  /** True when every id in the map has a defined value (vacuously true for an empty map). */
  def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
    featureIdValueMap.values.forall(_.isDefined)

  /** Keeps only the entries whose value is defined, unwrapping the Option. */
  def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
    featureIdValueMap.collect {
      case (id, Some(value)) => id -> value
    }

  val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP

  /**
   * Builds all valid aggregation keys (for the output store) from
   * a datarecord and a spec listing the keys to aggregate. There
   * can be multiple aggregation keys generated from a single data
   * record when grouping by sparse binary features, for which multiple
   * values can be set within the data record.
   *
   * No keys are produced when any discrete or string key feature is missing from the record.
   *
   * @param dataRecord Data record to read values for key features from
   * @param keysToAggregate Key features to group by
   * @return A set of AggregationKeys encoding the values of all keys
   */
  def buildAggregationKeys(
    dataRecord: DataRecord,
    keysToAggregate: Set[Feature[_]]
  ): Set[AggregationKey] = {
    val discreteAggregationKeys =
      getKeyFeatureIdValues[Long](dataRecord, keysToAggregate, FeatureType.DISCRETE).toMap

    val textAggregationKeys =
      getKeyFeatureIdValues[String](dataRecord, keysToAggregate, FeatureType.STRING).toMap

    // A sparse binary key that is absent from the record contributes an empty value set.
    val sparseBinaryIdValues =
      getKeyFeatureIdValues[JSet[String]](dataRecord, keysToAggregate, FeatureType.SPARSE_BINARY)
        .map {
          case (id, valuesOpt) =>
            (id, valuesOpt.fold(Set.empty[String])(_.asScala.toSet))
        }
        .toList

    val allScalarKeysPresent =
      checkIfAllKeysExist(discreteAggregationKeys) && checkIfAllKeysExist(textAggregationKeys)

    if (!allScalarKeysPresent) {
      Set.empty[AggregationKey]
    } else {
      val discreteValuesById = liftOptions(discreteAggregationKeys)
      val textValuesById = liftOptions(textAggregationKeys)

      if (sparseBinaryIdValues.isEmpty) {
        Set(
          AggregationKey(
            discreteFeaturesById = discreteValuesById,
            textFeaturesById = textValuesById
          )
        )
      } else {
        // One aggregation key per combination of sparse binary member values.
        sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
          AggregationKey(
            discreteFeaturesById = discreteValuesById,
            textFeaturesById = textValuesById ++ sparseBinaryTextKeys
          )
        }
      }
    }
  }
}
/**
* Specifies one or more related aggregate(s) to compute in the summingbird job.
*
* @param inputSource Source to compute this aggregate over
* @param preTransforms Sequence of [[OneToSomeTransform]] applied in order to each data
* record before aggregation (e.g. discretization, renaming, sampling). A transform
* may drop the record by returning None, in which case the record contributes
* nothing to this aggregate group.
* @param aggregatePrefix Prefix to use for naming resultant aggregate features
* @param keysToAggregate Features to group by when computing the aggregates
* (e.g. USER_ID, AUTHOR_ID)
* @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
* @param labels Labels to cross the features with to make pair features, if any.
* use Label.All if you don't want to cross with a label.
* @param metrics Aggregation metrics to compute (e.g. count, mean)
* @param halfLives Half lives to use for the aggregations, to be crossed with the above.
* use Duration.Top for "forever" aggregations over an infinite time window (no decay).
* @param outputStore Store to output this aggregate to
* @param includeAnyFeature Aggregate label counts for any feature value
* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
*
* The overall config for the summingbird job consists of a list of "AggregateGroup"
* case class objects, which get translated into strongly typed "TypedAggregateGroup"
* case class objects. A single TypedAggregateGroup always groups input data records from
* ''inputSource'' by a single set of aggregation keys (''keysToAggregate'').
* Within these groups, we perform a comprehensive cross of:
*
* ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
*
* All the resultant aggregate features are assigned a human-readable feature name
* beginning with ''aggregatePrefix'', and are written to DataRecords that get
* aggregated and written to the store specified by ''outputStore''.
*
* Illustrative example. Suppose we define our spec as follows:
*
* TypedAggregateGroup(
* inputSource = "timelines_recap_daily",
* aggregatePrefix = "user_author_aggregate",
* keysToAggregate = Set(USER_ID, AUTHOR_ID),
* featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
* labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
* metrics = Set(CountMetric, MeanMetric),
* halfLives = Set(7.Days, 30.Days),
* outputStore = "user_author_aggregate_store"
* )
*
* This will process data records from the source named "timelines_recap_daily"
* (see AggregateSource.scala for more details on how to add your own source)
* It will produce a total of 2x2x2x2 = 16 aggregation features, named like:
*
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
*
* ... (and so on)
*
* and all the result features will be stored in DataRecords, summed up, and written
* to the output store defined by the name "user_author_aggregate_store".
* (see AggregateStore.scala for details on how to add your own store).
*
* If you do not want a full cross, split up your config into multiple TypedAggregateGroup
* objects. Splitting is strongly advised to avoid blowing up and creating invalid
* or unnecessary combinations of aggregate features (note that some combinations
* are useless or invalid e.g. computing the mean of a binary feature). Splitting
* also does not cost anything in terms of real-time performance, because all
* Aggregate objects in the master spec that share the same ''keysToAggregate'', the
* same ''inputSource'' and the same ''outputStore'' are grouped by the summingbird
* job logic and stored into a single DataRecord in the output store. Overlapping
* aggregates will also automatically be deduplicated so don't worry about overlaps.
*/
case class TypedAggregateGroup[T](
inputSource: AggregateSource,
aggregatePrefix: String,
keysToAggregate: Set[Feature[_]],
featuresToAggregate: Set[Feature[T]],
labels: Set[_ <: Feature[JBoolean]],
metrics: Set[AggregationMetric[T, _]],
halfLives: Set[Duration],
outputStore: AggregateStore,
preTransforms: Seq[OneToSomeTransform] = Seq.empty,
includeAnyFeature: Boolean = true,
includeAnyLabel: Boolean = true,
aggExclusionRegex: Seq[String] = Seq.empty) {
import TypedAggregateGroup._
// Exclusion patterns from aggExclusionRegex, compiled once at construction.
val compiledRegexes = aggExclusionRegex.map(new Regex(_))
// true if should drop, false if should keep
// A descriptor is dropped when any of its output feature names contains a match for any regex.
def filterOutAggregateFeature(
feature: PrecomputedAggregateDescriptor[_],
regexes: Seq[Regex]
): Boolean = {
if (regexes.nonEmpty)
feature.outputFeatures.exists { feature =>
regexes.exists { re => re.findFirstMatchIn(feature.getDenseFeatureName).nonEmpty }
}
else false
}
// Aggregation keys for this group's keysToAggregate; delegates to the companion object.
def buildAggregationKeys(
dataRecord: DataRecord
): Set[AggregationKey] = {
TypedAggregateGroup.buildAggregationKeys(dataRecord, keysToAggregate)
}
/**
* This val precomputes descriptors for all individual aggregates in this group
* (of type ''AggregateFeature''). Also precompute hashes of all aggregation
* "output" features generated by these operators for faster
* run-time performance (this turns out to be a primary CPU bottleneck).
* Ex: for the mean operator, "sum" and "count" are output features
*
* Descriptors matching any of the compiled exclusion regexes are removed from the result.
*/
val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
/*
* By default, in additional to all feature-label crosses, also
* compute in aggregates over each feature and label without crossing
*/
val labelOptions = labels.map(Option(_)) ++
(if (includeAnyLabel) Set(None) else Set.empty)
val featureOptions = featuresToAggregate.map(Option(_)) ++
(if (includeAnyFeature) Set(None) else Set.empty)
for {
feature <- featureOptions
label <- labelOptions
metric <- metrics
halfLife <- halfLives
} yield {
val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)
val aggregateOutputFeatures = metric.getOutputFeatures(query)
val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
PrecomputedAggregateDescriptor(
query,
metric,
aggregateOutputFeatures,
aggregateOutputFeatureIds
)
}
}.filterNot(filterOutAggregateFeature(_, compiledRegexes))
/* Precomputes a map from all generated aggregate feature ids to their half lives. */
// Only CONTINUOUS output features are included.
val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
individualAggregateDescriptors.flatMap { descriptor =>
descriptor.outputFeatures
.flatMap { feature =>
if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
Try(feature.asInstanceOf[Feature[JDouble]]).toOption
.map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
} else None
}
}.toMap
/*
* Sparse binary keys become individual string keys in the output.
* e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
*/
val allOutputKeys: Set[Feature[_]] = keysToAggregate.map { key =>
if (key.getFeatureType == FeatureType.SPARSE_BINARY) sparseFeature(key)
else key
}
// Every output feature of every retained descriptor.
val allOutputFeatures: Set[Feature[_]] = individualAggregateDescriptors.flatMap {
case PrecomputedAggregateDescriptor(
query,
metric,
outputFeatures,
outputFeatureIds
) =>
outputFeatures
}
val aggregateContext: FeatureContext = new FeatureContext(allOutputFeatures.toList.asJava)
/**
* Adds all aggregates in this group found in the two input data records
* into a result, mutating the result. Uses a while loop for an
* approximately 10% gain in speed over a for comprehension.
*
* WARNING: mutates ''result''
*
* @param result The output data record to mutate
* @param left The left data record to add
* @param right The right data record to add
*/
def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
val featureIterator = individualAggregateDescriptors.iterator
while (featureIterator.hasNext) {
val descriptor = featureIterator.next
descriptor.metric.mutatePlus(
result,
left,
right,
descriptor.query,
Some(descriptor.outputFeatureIds)
)
}
}
/**
* Apply preTransforms sequentially. If any transform results in a dropped (None)
* DataRecord, then entire tranform sequence will result in a dropped DataRecord.
* Note that preTransforms are order-dependent.
*/
private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
// Start from a copy made with the DataRecord copy constructor rather than the input itself.
val recordOpt = Option(new DataRecord(dataRecord))
preTransforms.foldLeft(recordOpt) {
case (Some(previousRecord), preTransform) =>
preTransform(previousRecord)
case _ => Option.empty[DataRecord]
}
}
/**
* Given a data record, apply transforms and fetch the incremental contributions to
* each configured aggregate from this data record, and store these in an output data record.
*
* @param dataRecord Input data record to aggregate.
* @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
* AggregationKey indicating what keys we're grouping by, and whose second entry
* is an output data record with incremental contributions to the aggregate value(s)
*/
def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] = {
sequentiallyTransform(dataRecord)
.flatMap { dataRecord =>
val aggregationKeys = buildAggregationKeys(dataRecord)
val increment = new DataRecord
// The map over the (strict) Set calls setIncrement for EVERY descriptor before exists()
// inspects the results. Do not fuse this into a single exists(...): that would
// short-circuit and skip increments after the first one that reports true.
val isNonEmptyIncrement = individualAggregateDescriptors
.map { descriptor =>
descriptor.metric.setIncrement(
output = increment,
input = dataRecord,
query = descriptor.query,
timestampFeature = inputSource.timestampFeature,
aggregateOutputs = Some(descriptor.outputFeatureIds)
)
}
.exists(identity)
if (isNonEmptyIncrement) {
// NOTE(review): getTimestamp is not defined in this class; presumably it comes from the
// AggregationMetricCommon._ import -- confirm.
SRichDataRecord(increment).setFeatureValue(
timestampFeature,
getTimestamp(dataRecord, inputSource.timestampFeature)
)
// Every aggregation key is paired with the same `increment` instance (not a copy).
Some(aggregationKeys.map(key => (key, increment)))
} else {
None
}
}
.getOrElse(Set.empty[(AggregationKey, DataRecord)])
}
// Maps every output feature to a feature of the same type whose name is `prefix` followed by
// the original dense feature name, carrying over the personal data types (null when absent).
// Features without a feature name set are mapped to themselves. Requires a non-empty prefix.
// NOTE(review): the match below covers six feature types; if FeatureType has other values,
// an output feature of such a type would raise a MatchError -- confirm this cannot occur.
def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
require(prefix.nonEmpty)
allOutputFeatures.map { feature =>
if (feature.isSetFeatureName) {
val renamedFeatureName = prefix + feature.getDenseFeatureName
val personalDataTypes =
if (feature.getPersonalDataTypes.isPresent) feature.getPersonalDataTypes.get()
else null
val renamedFeature = feature.getFeatureType match {
case FeatureType.BINARY =>
new Feature.Binary(renamedFeatureName, personalDataTypes)
case FeatureType.DISCRETE =>
new Feature.Discrete(renamedFeatureName, personalDataTypes)
case FeatureType.STRING =>
new Feature.Text(renamedFeatureName, personalDataTypes)
case FeatureType.CONTINUOUS =>
new Feature.Continuous(renamedFeatureName, personalDataTypes)
case FeatureType.SPARSE_BINARY =>
new Feature.SparseBinary(renamedFeatureName, personalDataTypes)
case FeatureType.SPARSE_CONTINUOUS =>
new Feature.SparseContinuous(renamedFeatureName, personalDataTypes)
}
feature -> renamedFeature
} else {
feature -> feature
}
}.toMap
}
}

View File

@ -1,122 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.algebird.ScMapMonoid
import com.twitter.algebird.Semigroup
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureType
import com.twitter.ml.api.util.SRichDataRecord
import java.lang.{Long => JLong}
import scala.collection.{Map => ScMap}
/**
 * Helpers for deriving numeric keys from DataRecord features and for merging keyed records.
 */
object Utils {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  /** Returns a fresh, empty DataRecord on every call (this is a def, not a shared instance). */
  def EmptyDataRecord: DataRecord = new DataRecord()

  // Map monoid whose value semigroup merges `y` into `x` with dataRecordMerger and returns `x`.
  private val keyedDataRecordMapMonoid = {
    val dataRecordMergerSg = new Semigroup[DataRecord] {
      override def plus(x: DataRecord, y: DataRecord): DataRecord = {
        dataRecordMerger.merge(x, y)
        x
      }
    }
    new ScMapMonoid[Long, DataRecord]()(dataRecordMergerSg)
  }

  /** Value of a discrete feature as a primitive Long. */
  def keyFromLong(record: DataRecord, feature: Feature[JLong]): Long =
    SRichDataRecord(record).getFeatureValue(feature).longValue

  /** Value of a string feature parsed as a Long; 0 when the string is not a valid number. */
  def keyFromString(record: DataRecord, feature: Feature[String]): Long =
    try {
      SRichDataRecord(record).getFeatureValue(feature).toLong
    } catch {
      case _: NumberFormatException => 0L
    }

  /** Hash code of a string feature's value, widened to Long. */
  def keyFromHash(record: DataRecord, feature: Feature[String]): Long =
    SRichDataRecord(record).getFeatureValue(feature).hashCode.toLong

  /**
   * Extracts the numeric secondary key from a record.
   *
   * @param record Record to read the key from
   * @param secondaryKey STRING or DISCRETE feature holding the key
   * @param shouldHash For STRING features: hash the value instead of parsing it as a number
   * @throws IllegalArgumentException for any other feature type
   */
  def extractSecondary[T](
    record: DataRecord,
    secondaryKey: Feature[T],
    shouldHash: Boolean = false
  ): Long = secondaryKey.getFeatureType match {
    case FeatureType.STRING =>
      if (shouldHash) keyFromHash(record, secondaryKey.asInstanceOf[Feature[String]])
      else keyFromString(record, secondaryKey.asInstanceOf[Feature[String]])
    case FeatureType.DISCRETE => keyFromLong(record, secondaryKey.asInstanceOf[Feature[JLong]])
    case f => throw new IllegalArgumentException(s"Feature type $f is not supported.")
  }

  /**
   * Merges the defined KeyedRecords into one; None when none is defined.
   * All defined records must share the same aggregate type.
   */
  def mergeKeyedRecordOpts(args: Option[KeyedRecord]*): Option[KeyedRecord] = {
    val keyedRecords = args.flatten
    if (keyedRecords.isEmpty) {
      None
    } else {
      val keys = keyedRecords.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecord = mergeRecords(keyedRecords.map(_.record): _*)
      Some(KeyedRecord(keys.head, mergedRecord))
    }
  }

  // Merges the remaining records into args.head via dataRecordMerger and returns args.head
  // itself; no new record is allocated unless args is empty.
  private def mergeRecords(args: DataRecord*): DataRecord =
    if (args.isEmpty) EmptyDataRecord
    else {
      args.tail.foldLeft(args.head) { (merged, record) =>
        dataRecordMerger.merge(merged, record)
        merged
      }
    }

  /**
   * Merges two optional KeyedRecordMaps; None when both are empty.
   *
   * @param maxSize Upper bound targeted for the number of keys in the merged map; when the union
   *                of keys is larger, both inputs are downsampled at the same rate.
   */
  def mergeKeyedRecordMapOpts(
    opt1: Option[KeyedRecordMap],
    opt2: Option[KeyedRecordMap],
    maxSize: Int = Int.MaxValue
  ): Option[KeyedRecordMap] = {
    if (opt1.isEmpty && opt2.isEmpty) {
      None
    } else {
      val keys = Seq(opt1, opt2).flatten.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecordMap = mergeMapOpts(opt1.map(_.recordMap), opt2.map(_.recordMap), maxSize)
      Some(KeyedRecordMap(keys.head, mergedRecordMap))
    }
  }

  // Downsamples both maps at rate maxSize / |union of keys| (when the union exceeds maxSize)
  // and then sums them with keyedDataRecordMapMonoid.
  private def mergeMapOpts(
    opt1: Option[ScMap[Long, DataRecord]],
    opt2: Option[ScMap[Long, DataRecord]],
    maxSize: Int = Int.MaxValue
  ): ScMap[Long, DataRecord] = {
    require(maxSize >= 0)
    val keySet = opt1.map(_.keySet).getOrElse(Set.empty) ++ opt2.map(_.keySet).getOrElse(Set.empty)
    val totalSize = keySet.size
    val rate = if (totalSize <= maxSize) 1.0 else maxSize.toDouble / totalSize
    val prunedOpt1 = opt1.map(downsample(_, rate))
    val prunedOpt2 = opt2.map(downsample(_, rate))
    Seq(prunedOpt1, prunedOpt2).flatten
      .foldLeft(keyedDataRecordMapMonoid.zero)(keyedDataRecordMapMonoid.plus)
  }

  /**
   * Keeps each entry of `m` with probability `samplingRate`, deciding deterministically per key.
   *
   * @param m Map to sample
   * @param samplingRate Fraction of keys to keep; >= 1.0 returns `m` unchanged, <= 0 returns an
   *                     empty map
   */
  def downsample[K, T](m: ScMap[K, T], samplingRate: Double): ScMap[K, T] = {
    if (samplingRate >= 1.0) {
      m
    } else if (samplingRate <= 0) {
      Map.empty
    } else {
      m.filter {
        case (key, _) =>
          // It is important that the same user with the same sampling rate be deterministically
          // selected or rejected. Otherwise, mergeMapOpts will choose different keys for the
          // two input maps and their union will be larger than the limit we want.
          //
          // A dedicated generator is created per key instead of re-seeding the shared
          // scala.util.Random singleton: it yields the same first value for the same seed, but
          // cannot be interleaved with other threads between seeding and drawing, and it does
          // not disturb other users of the global generator.
          val seed = (key.hashCode, samplingRate.hashCode).hashCode.toLong
          new scala.util.Random(seed).nextDouble() < samplingRate
      }
    }
  }
}

View File

@ -1,165 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.algebird.DecayedValue
import com.twitter.algebird.DecayedValueMonoid
import com.twitter.algebird.Monoid
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.FDsl._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.summingbird.batch.BatchID
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.util.Duration
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._
import scala.collection.mutable
import java.{util => ju}
object AggregatesV2Adapter {
  type AggregatesV2Tuple = (AggregationKey, (BatchID, DataRecord))

  val Epsilon: Double = 1e-6
  val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon)

  /*
   * Decays the storedValue from timestamp -> sourceVersion
   *
   * @param storedValue value read from the aggregates v2 output store
   * @param timestamp timestamp corresponding to store value
   * @param sourceVersion timestamp of version to decay all values to uniformly
   * @param halfLife Half life duration to use for applying decay
   * @return the decayed value, or storedValue unchanged when timestamp is later than sourceVersion
   *
   * By applying this function, the feature values for all users are decayed
   * to sourceVersion. This is important to ensure that a user whose aggregates
   * were updated long in the past does not have an artificially inflated count
   * compared to one whose aggregates were updated (and hence decayed) more recently.
   */
  def decayValueToSourceVersion(
    storedValue: Double,
    timestamp: Long,
    sourceVersion: Long,
    halfLife: Duration
  ): Double =
    if (timestamp <= sourceVersion) {
      val halfLifeMillis = halfLife.inMilliseconds
      // Adding a zero-valued entry stamped at sourceVersion moves the stored value forward in time.
      val stored = DecayedValue.build(storedValue, timestamp, halfLifeMillis)
      val zeroAtSourceVersion = DecayedValue.build(0, sourceVersion, halfLifeMillis)
      decayedValueMonoid.plus(stored, zeroAtSourceVersion).value
    } else {
      storedValue
    }

  /*
   * Decays all the aggregate features occurring in the ''inputRecord''
   * to a given timestamp, and mutates the ''outputRecord'' accordingly.
   * Note that inputRecord and outputRecord can be the same if you want
   * to mutate the input in place, the function does this correctly.
   *
   * Only CONTINUOUS features present in the input are considered; a decayed value is
   * written only when its absolute value exceeds trimThreshold.
   *
   * @param inputRecord Input record to get features from
   * @param aggregateFeaturesAndHalfLives Aggregate features to decay, each with its half life
   * @param decayTo Timestamp to decay to
   * @param trimThreshold Drop features below this trim threshold
   * @param outputRecord Output record to mutate
   * @return the mutated outputRecord
   */
  def mutateDecay(
    inputRecord: DataRecord,
    aggregateFeaturesAndHalfLives: List[(Feature[_], Duration)],
    decayTo: Long,
    trimThreshold: Double,
    outputRecord: DataRecord
  ): DataRecord = {
    val timestamp = inputRecord.getFeatureValue(SharedFeatures.TIMESTAMP).toLong

    aggregateFeaturesAndHalfLives.foreach {
      case (aggregateFeature: Feature[_], halfLife: Duration) =>
        val isContinuous = aggregateFeature.getFeatureType() == FeatureType.CONTINUOUS
        if (isContinuous) {
          decayContinuousFeature(
            inputRecord,
            outputRecord,
            aggregateFeature.asInstanceOf[Feature[JDouble]],
            halfLife,
            timestamp,
            decayTo,
            trimThreshold)
        }
    }

    /* Update timestamp to version (now that we've decayed all aggregates) */
    outputRecord.setFeatureValue(SharedFeatures.TIMESTAMP, decayTo)
    outputRecord
  }

  /* Decays one continuous feature from the input and writes it to the output if above threshold. */
  private def decayContinuousFeature(
    inputRecord: DataRecord,
    outputRecord: DataRecord,
    continuousFeature: Feature[JDouble],
    halfLife: Duration,
    timestamp: Long,
    decayTo: Long,
    trimThreshold: Double
  ): Unit =
    if (inputRecord.hasFeature(continuousFeature)) {
      val storedValue = inputRecord.getFeatureValue(continuousFeature).toDouble
      val decayedValue = decayValueToSourceVersion(storedValue, timestamp, decayTo, halfLife)
      val exceedsThreshold = math.abs(decayedValue) > trimThreshold
      if (exceedsThreshold) {
        outputRecord.setFeatureValue(continuousFeature, decayedValue)
      }
    }
}
/**
 * Adapts (AggregationKey, (BatchID, DataRecord)) tuples into DataRecords that carry the key
 * features, the aggregate features decayed to `sourceVersion`, and the timestamp.
 *
 * @param aggregates Aggregate groups whose output keys and output features define the context
 * @param sourceVersion Timestamp that all aggregate values are decayed to
 * @param trimThreshold Decayed values whose absolute value does not exceed this are not written
 */
class AggregatesV2Adapter(
  aggregates: Set[TypedAggregateGroup[_]],
  sourceVersion: Long,
  trimThreshold: Double)
    extends IRecordOneToManyAdapter[AggregatesV2Adapter.AggregatesV2Tuple] {
  import AggregatesV2Adapter._

  val keyFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputKeys).toList
  val aggregateFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputFeatures).toList
  val timestampFeatures: List[Feature[JLong]] = List(SharedFeatures.TIMESTAMP)
  val allFeatures: List[Feature[_]] = keyFeatures ++ aggregateFeatures ++ timestampFeatures
  val featureContext: FeatureContext = new FeatureContext(allFeatures.asJava)

  override def getFeatureContext: FeatureContext = featureContext

  // Each aggregate feature paired with the half life parsed from it.
  val aggregateFeaturesAndHalfLives: List[(Feature[_$3], Duration) forSome { type _$3 }] =
    aggregateFeatures.map { feature: Feature[_] =>
      (feature, AggregateFeature.parseHalfLife(feature))
    }

  /**
   * Converts one stored tuple into at most one DataRecord.
   *
   * @return a single-element list holding the adapted record, or an empty list when the stored
   *         record has no TIMESTAMP feature
   */
  override def adaptToDataRecords(tuple: AggregatesV2Tuple): ju.List[DataRecord] = tuple match {
    case (key: AggregationKey, (_: BatchID, record: DataRecord)) =>
      val resultRecord = new SRichDataRecord(new DataRecord, featureContext)

      // Collect first and clear afterwards, so the record is not modified while iterating.
      val nonAggregateFeatures = mutable.Set.empty[Feature[JDouble]]
      val continuousIterator = resultRecord.continuousFeaturesIterator()
      while (continuousIterator.moveNext()) {
        val candidate = continuousIterator.getFeature
        if (!aggregateFeatures.contains(candidate)) {
          nonAggregateFeatures += candidate
        }
      }
      nonAggregateFeatures.foreach(resultRecord.clearFeature)

      // Copy the key values into the result; key features of other types are ignored.
      keyFeatures.foreach { keyFeature: Feature[_] =>
        keyFeature.getFeatureType match {
          case FeatureType.DISCRETE =>
            resultRecord.setFeatureValue(
              keyFeature.asInstanceOf[Feature[JLong]],
              key.discreteFeaturesById(keyFeature.getDenseFeatureId)
            )
          case FeatureType.STRING =>
            resultRecord.setFeatureValue(
              keyFeature.asInstanceOf[Feature[String]],
              key.textFeaturesById(keyFeature.getDenseFeatureId)
            )
          case _ => ()
        }
      }

      if (!record.hasFeature(SharedFeatures.TIMESTAMP)) {
        List.empty[DataRecord].asJava
      } else {
        mutateDecay(
          record,
          aggregateFeaturesAndHalfLives,
          sourceVersion,
          trimThreshold,
          resultRecord)
        List(resultRecord.getRecord).asJava
      }
  }
}

View File

@ -1,171 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.ml.api.AdaptedFeatureSource
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.IRecordOneToManyAdapter
import com.twitter.ml.api.TypedFeatureSource
import com.twitter.scalding.DateRange
import com.twitter.scalding.RichDate
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.commons.tap.VersionedTap.TapMode
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import org.apache.hadoop.mapred.JobConf
import scala.collection.JavaConverters._
import AggregatesV2Adapter._
/** Shared defaults for the aggregates v2 adapted feature sources. */
object AggregatesV2AdaptedSource {

  /** Trim threshold applied when a feature source does not specify its own. */
  val DefaultTrimThreshold: Int = 0
}
/**
 * Adapted source whose storage format is already a DataRecord: values are
 * (de)serialized with the compact thrift codec and passed through unchanged.
 */
trait AggregatesV2AdaptedSource extends AggregatesV2AdaptedSourceBase[DataRecord] {
override def storageFormatCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
// The stored value is the data record itself, so no conversion is required.
override def toDataRecord(v: DataRecord): DataRecord = v
}
/**
 * Base feature source that reads one store of aggregates v2 output from a
 * versioned key-value source and exposes it as adapted data records.
 *
 * The version to read is chosen by [[versionToUse]]; stored values are turned
 * into data records with [[toDataRecord]] and then adapted (decayed/trimmed)
 * by an [[AggregatesV2Adapter]] built from the aggregates of this store.
 *
 * @tparam StorageFormat type of the value stored next to the batch id
 */
trait AggregatesV2AdaptedSourceBase[StorageFormat]
    extends TypedFeatureSource[AggregatesV2Tuple]
    with AdaptedFeatureSource[AggregatesV2Tuple]
    with BatchPairImplicits {

  /* Output root path of aggregates v2 job, excluding store name and version */
  def rootPath: String

  /* Name of store under root path to read */
  def storeName: String

  // max bijection failures
  def maxFailures: Int = 0

  /* Aggregate config used to generate above output */
  def aggregates: Set[TypedAggregateGroup[_]]

  /* trimThreshold Trim all aggregates below a certain threshold to save memory */
  def trimThreshold: Double

  /* Converts a stored value into the data record handed to the adapter */
  def toDataRecord(v: StorageFormat): DataRecord

  /* Requested version, if any; None means "use the most recent available version" */
  def sourceVersionOpt: Option[Long]

  /*
   * When true and a source version is given, read the newest available
   * version that is <= the source version instead of that exact version.
   */
  def enableMostRecentBeforeSourceVersion: Boolean = false

  implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
    AggregationKeyInjection

  implicit def storageFormatCodec: Injection[StorageFormat, Array[Byte]]

  // Only the aggregate groups written to this store are relevant to the adapter.
  private def filteredAggregates = aggregates.filter(_.outputStore.name == storeName)

  def storePath: String = List(rootPath, storeName).mkString("/")

  /* Source over the store with no pinned version; used here to enumerate versions */
  def mostRecentVkvs: VersionedKeyValSource[_, _] = {
    VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
      path = storePath,
      sourceVersion = None,
      maxFailures = maxFailures
    )
  }

  // Lists the versions present in the store. Every call goes back to the
  // store, so callers bind the result to a local instead of re-evaluating it.
  private def availableVersions: Seq[Long] =
    mostRecentVkvs
      .getTap(TapMode.SOURCE)
      .getStore(new JobConf(true))
      .getAllVersions()
      .asScala
      .map(_.toLong)

  private def mostRecentVersion: Long = {
    // Single listing: the emptiness check and the max see the same versions.
    val versions = availableVersions
    require(!versions.isEmpty, s"$storeName has no available versions")
    versions.max
  }

  /**
   * Version of the store that will be read.
   *
   * - no source version: the most recent available version;
   * - source version and `enableMostRecentBeforeSourceVersion`: the newest
   *   available version that is <= the source version;
   * - otherwise: the source version itself.
   *
   * @throws IllegalArgumentException if the store has no available versions,
   *         or none at or before the requested source version
   */
  def versionToUse: Long =
    if (enableMostRecentBeforeSourceVersion) {
      sourceVersionOpt
        .map { sourceVersion =>
          // List once so the error message reports exactly what was filtered.
          val versions = availableVersions
          versions.filter(_ <= sourceVersion) match {
            case Seq() =>
              throw new IllegalArgumentException(
                "No version older than version: %s, available versions: %s"
                  .format(sourceVersion, versions)
              )
            case versionList => versionList.max
          }
        }
        .getOrElse(mostRecentVersion)
    } else {
      sourceVersionOpt.getOrElse(mostRecentVersion)
    }

  override lazy val adapter: IRecordOneToManyAdapter[AggregatesV2Tuple] =
    new AggregatesV2Adapter(filteredAggregates, versionToUse, trimThreshold)

  /* Reads the store at versionToUse and converts every stored value to a data record */
  override def getData: TypedPipe[AggregatesV2Tuple] = {
    val vkvsToUse: VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)] = {
      VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
        path = storePath,
        sourceVersion = Some(versionToUse),
        maxFailures = maxFailures
      )
    }
    TypedPipe.from(vkvsToUse).map {
      case (key, (batch, value)) => (key, (batch, toDataRecord(value)))
    }
  }
}
/**
 * Adapted data record feature source from aggregates v2 manhattan output.
 * The version read is derived from the end of the implicit date range.
 * Params documented in parent trait.
 */
case class AggregatesV2FeatureSource(
  override val rootPath: String,
  override val storeName: String,
  override val aggregates: Set[TypedAggregateGroup[_]],
  // Same shared default as the other feature sources in this file.
  override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
  override val maxFailures: Int = 0
)(
  implicit val dateRange: DateRange)
    extends AggregatesV2AdaptedSource {
  // Increment end date by 1 millisec since summingbird output for date D is stored at (D+1)T00
  override val sourceVersionOpt: Some[Long] = Some(dateRange.end.timestamp + 1)
}
/**
 * Reads most recent available AggregatesV2FeatureSource.
 * There is no constraint on recency: no source version is requested, so the
 * parent trait falls back to the newest version found in the store.
 * Params documented in parent trait.
 */
case class AggregatesV2MostRecentFeatureSource(
override val rootPath: String,
override val storeName: String,
override val aggregates: Set[TypedAggregateGroup[_]],
override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
override val maxFailures: Int = 0)
extends AggregatesV2AdaptedSource {
// No pinned version: always resolve to the most recent one available.
override val sourceVersionOpt: None.type = None
}
/**
 * Reads most recent available AggregatesV2FeatureSource
 * on or before the specified beforeDate.
 * Params documented in parent trait.
 *
 * @param beforeDate upper bound for the version to read; its timestamp plus one
 *                   millisecond is used as the source version
 */
case class AggregatesV2MostRecentFeatureSourceBeforeDate(
override val rootPath: String,
override val storeName: String,
override val aggregates: Set[TypedAggregateGroup[_]],
override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
beforeDate: RichDate,
override val maxFailures: Int = 0)
extends AggregatesV2AdaptedSource {
// Makes the parent pick the newest available version <= sourceVersionOpt
// rather than requiring that exact version to exist.
override val enableMostRecentBeforeSourceVersion = true
override val sourceVersionOpt: Some[Long] = Some(beforeDate.timestamp + 1)
}

View File

@ -1,71 +0,0 @@
# Library target covering every *.scala source in this directory.
# No explicit `name` is given -- presumably the build tool falls back to its
# default target name for the directory; confirm before referencing it.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"src/thrift/com/twitter/summingbird",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"util/util-core:scala",
],
)
# Narrow library target exposing only the two merge-policy sources listed below.
# NOTE(review): CombineCountsPolicy.scala imports com.twitter.ml.api.util.SRichDataRecord,
# but unlike the directory-wide target above this dependency list has no
# "src/scala/com/twitter/ml/api/util" (or "src/scala/com/twitter/ml/api:api-base") entry --
# presumably satisfied transitively; confirm.
scala_library(
name = "for-timelines",
sources = [
"CombineCountsPolicy.scala",
"SparseBinaryMergePolicy.scala",
],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"src/thrift/com/twitter/summingbird",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"util/util-core:scala",
],
)

View File

@ -1,223 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.google.common.annotations.VisibleForTesting
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.TypedCountMetric
import java.lang.{Double => JDouble}
import scala.collection.JavaConverters._
/**
 * The output features derived from a single count feature.
 *
 * @param sum feature receiving the sum of the count values
 * @param nonzero feature receiving the number of values greater than zero
 * @param mean feature receiving the sum divided by the non-zero count
 * @param topK features receiving the largest values, index 0 holding the largest
 */
case class CombinedFeatures(
sum: Feature[JDouble],
nonzero: Feature[JDouble],
mean: Feature[JDouble],
topK: Seq[Feature[JDouble]])
/**
 * Shared logic for merge policies that condense the values of a count feature,
 * observed across several aggregate records, into a handful of continuous
 * features: sum, number of positive values, mean over the positive values and
 * the top-K individual values.
 */
trait CombineCountsBase {
  // Suffixes appended (after a ".") to the dense name of the source count feature.
  val SparseSum = "sparse_sum"
  val SparseNonzero = "sparse_nonzero"
  val SparseMean = "sparse_mean"
  val SparseTop = "sparse_top"

  /** Number of top-ranked values emitted per count feature. */
  def topK: Int

  /** When defined, only this many of the largest values take part in the statistics. */
  def hardLimit: Option[Int]

  /** Count features for which output features are created up front. */
  def precomputedCountFeatures: Seq[Feature[_]]

  /** Output features for every precomputed count feature, built on first access. */
  lazy val precomputedFeaturesMap: Map[Feature[_], CombinedFeatures] =
    precomputedCountFeatures.map { countFeature =>
      val personalDataTypes =
        AggregationMetricCommon.derivePersonalDataTypes(Some(countFeature))

      // Builds "<dense feature name>.<suffix>" as a continuous feature.
      def derived(suffix: String): Feature[JDouble] =
        new Feature.Continuous(
          countFeature.getDenseFeatureName + "." + suffix,
          personalDataTypes)

      val topKFeatures = (1 to topK).map(rank => derived(SparseTop + rank))

      (
        countFeature,
        CombinedFeatures(
          derived(SparseSum),
          derived(SparseNonzero),
          derived(SparseMean),
          topKFeatures))
    }.toMap

  /** Every feature this policy may write for the precomputed count features. */
  lazy val outputFeaturesPostMerge: Set[Feature[JDouble]] =
    precomputedFeaturesMap.values.flatMap {
      case CombinedFeatures(sum, nonzero, mean, topKFeatures) =>
        Seq(sum, nonzero, mean) ++ topKFeatures
    }.toSet

  private case class ComputedStats(sum: Double, nonzero: Double, mean: Double)

  // Sum of all values, count of strictly positive values, and sum / count
  // (0.0 when there is no positive value).
  private def preComputeStats(featureValues: Seq[Double]): ComputedStats = {
    val total = featureValues.foldLeft(0.0)(_ + _)
    val positiveCount = featureValues.count(_ > 0.0).toDouble
    val mean = if (positiveCount > 0.0) total / positiveCount else 0.0
    ComputedStats(total, positiveCount, mean)
  }

  // Largest value first.
  private def computeSortedFeatureValues(featureValues: List[Double]): List[Double] =
    featureValues.sortBy(value => -value)

  // k is 1-based; ranks beyond the available values yield 0.0.
  private def extractKth(sortedFeatureValues: Seq[Double], k: Int): Double = {
    val index = k - 1
    if (sortedFeatureValues.isDefinedAt(index)) sortedFeatureValues(index) else 0.0
  }

  // Zero values are left unset on the record.
  private def setContinuousFeatureIfNonZero(
    record: SRichDataRecord,
    feature: Feature[JDouble],
    value: Double
  ): Unit =
    if (value != 0.0) {
      record.setFeatureValue(feature, value)
    }

  /**
   * Merges, for each of ''features'' that has an entry in ''featureValuesMap'',
   * the derived statistics into ''richRecord''. Features without an entry are skipped.
   */
  def hydrateCountFeatures(
    richRecord: SRichDataRecord,
    features: Seq[Feature[_]],
    featureValuesMap: Map[Feature[_], List[Double]]
  ): Unit =
    features.foreach { feature =>
      featureValuesMap.get(feature).foreach { values =>
        mergeRecordFromCountFeature(
          countFeature = feature,
          featureValues = values,
          richInputRecord = richRecord
        )
      }
    }

  /**
   * Writes the sum, non-zero count, mean and top-K features derived from
   * ''featureValues'' onto ''richInputRecord''. Does nothing for an empty list.
   * ''countFeature'' is looked up in [[precomputedFeaturesMap]].
   */
  def mergeRecordFromCountFeature(
    richInputRecord: SRichDataRecord,
    countFeature: Feature[_],
    featureValues: List[Double]
  ): Unit =
    // Most calls from the timeline scorer arrive with an empty list. Bailing
    // out here skips the sort, the option allocations, the helper calls and
    // the walk over [1, topK]; individually cheap, but they add up.
    if (featureValues.nonEmpty) {
      val limitOpt = hardLimit
      val ranked = computeSortedFeatureValues(featureValues)
      val sortedFeatureValues =
        limitOpt.fold(ranked)(limit => ranked.take(limit)).toIndexedSeq

      val stats = preComputeStats(sortedFeatureValues)
      val outputs = precomputedFeaturesMap(countFeature)

      setContinuousFeatureIfNonZero(richInputRecord, outputs.sum, stats.sum)
      setContinuousFeatureIfNonZero(richInputRecord, outputs.nonzero, stats.nonzero)
      setContinuousFeatureIfNonZero(richInputRecord, outputs.mean, stats.mean)

      for (rank <- 1 to topK) {
        setContinuousFeatureIfNonZero(
          richInputRecord,
          outputs.topK(rank - 1),
          extractKth(sortedFeatureValues, rank)
        )
      }
    }
}
object CombineCountsPolicy {

  /**
   * Selects the count features of ''aggregateContext'': continuous features
   * whose dense name ends with the operator name of the typed count metric.
   */
  def getCountFeatures(aggregateContext: FeatureContext): Seq[Feature[_]] = {
    def isCountFeature(candidate: Feature[_]): Boolean =
      candidate.getFeatureType == FeatureType.CONTINUOUS &&
        candidate.getDenseFeatureName.endsWith(TypedCountMetric[JDouble]().operatorName)

    aggregateContext.getAllFeatures.asScala.toSeq.filter(candidate => isCountFeature(candidate))
  }

  /**
   * Reads ''countFeature'' from each record, in order, dropping records for
   * which the feature value is null.
   */
  @VisibleForTesting
  private[conversion] def getFeatureValues(
    dataRecordsWithCounts: List[DataRecord],
    countFeature: Feature[_]
  ): List[Double] =
    dataRecordsWithCounts.flatMap { dataRecord =>
      val richRecord = new SRichDataRecord(dataRecord)
      Option(richRecord.getFeatureValue(countFeature)).map(_.asInstanceOf[JDouble].toDouble)
    }
}
/**
 * A merge policy for aggregate features that are all counts (computed using
 * CountMetric), typically impressions or engagements. For every such input
 * count feature the policy writes up to (3+k) derived features into the
 * output data record:
 *
 * Sum of the feature's value across the aggregate records
 * Number of aggregate records in which the value is greater than zero
 * Mean of the value, computed as the sum divided by that non-zero count
 * topK values of the feature across the aggregate records
 *
 * Derived values equal to zero are not written.
 *
 * @param topK topK values to compute
 * @param aggregateContextToPrecompute feature context whose count features get
 *        their output features created up front
 * @param hardLimit when set, values are sorted and only the largest `hardLimit`
 *        of them are used for all of the statistics above
 */
case class CombineCountsPolicy(
  override val topK: Int,
  aggregateContextToPrecompute: FeatureContext,
  override val hardLimit: Option[Int] = None)
    extends SparseBinaryMergePolicy
    with CombineCountsBase {
  import CombineCountsPolicy._

  override val precomputedCountFeatures: Seq[Feature[_]] =
    getCountFeatures(aggregateContextToPrecompute)

  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    // Assumes aggregateContext === aggregateContextToPrecompute
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /** Same merge as [[mergeRecord]], for callers that have no feature context at hand. */
  def defaultMergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord]
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /**
   * For each of ''countFeatures'', gathers its values from ''aggregateRecords''
   * and writes the derived statistics onto ''mutableInputRecord''.
   */
  def mergeRecordFromCountFeatures(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    countFeatures: Seq[Feature[_]]
  ): Unit = {
    // One wrapper is shared by every count feature.
    val richInputRecord = new SRichDataRecord(mutableInputRecord)
    for (countFeature <- countFeatures) {
      val featureValues = getFeatureValues(aggregateRecords, countFeature)
      mergeRecordFromCountFeature(richInputRecord, countFeature, featureValues)
    }
  }

  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    outputFeaturesPostMerge.map(feature => feature: Feature[_])
}

View File

@ -1,46 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.TypedPipe
/**
 * Left sketch join of a data set pipe against a pipe of join features, for
 * join keys that are expected to be skewed.
 */
object DataSetPipeSketchJoin {
  val DefaultSketchNumReducers = 500
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  // Serialization of the string join key, required implicitly by `sketch`.
  implicit val str2Byte: String => Array[Byte] =
    implicitly[Injection[String, Array[Byte]]].toFunction

  /**
   * Computes a left sketch join on a set of skewed keys.
   *
   * Every input record is kept; when a join-features record with the same key
   * exists it is merged into the input record.
   *
   * @param inputDataSet left side of the join
   * @param skewedJoinKeys product (e.g. tuple) of the features forming the join key
   * @param joinFeaturesDataSet right side of the join
   * @param sketchNumReducers number of reducers passed to the sketch
   * @return the joined records together with the merged feature context
   */
  def apply(
    inputDataSet: DataSetPipe,
    skewedJoinKeys: Product,
    joinFeaturesDataSet: DataSetPipe,
    sketchNumReducers: Int = DefaultSketchNumReducers
  ): DataSetPipe = {
    val joinKeyList = skewedJoinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]

    // The key is the string form of the list of key feature values.
    def makeKey(record: DataRecord): String = {
      // Wrap the record once instead of once per join key.
      val richRecord = SRichDataRecord(record)
      joinKeyList
        .map(richRecord.getFeatureValue(_))
        .toString
    }

    def byKey(pipe: DataSetPipe): TypedPipe[(String, DataRecord)] =
      pipe.records.map(record => (makeKey(record), record))

    val joinedRecords = byKey(inputDataSet)
      .sketch(sketchNumReducers)
      .leftJoin(byKey(joinFeaturesDataSet))
      .values
      .map {
        case (inputRecord, joinFeaturesOpt) =>
          // The merge mutates inputRecord in place; unmatched records pass through unchanged.
          joinFeaturesOpt.foreach { joinRecord => dataRecordMerger.merge(inputRecord, joinRecord) }
          inputRecord
      }

    DataSetPipe(
      joinedRecords,
      FeatureContext.merge(inputDataSet.featureContext, joinFeaturesDataSet.featureContext)
    )
  }
}

View File

@ -1,26 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import scala.collection.JavaConverters._
/**
 * A really bad default merge policy: it merges the first aggregate record of
 * the list (i.e. the aggregate features of the first sparse key value) into
 * the input record and ignores all the others. Aggregate features are not
 * renamed, for simplicity.
 * Avoid using this merge policy if at all possible.
 */
object PickFirstRecordPolicy extends SparseBinaryMergePolicy {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    aggregateRecords match {
      case firstAggregate :: _ => dataRecordMerger.merge(mutableInputRecord, firstAggregate)
      case Nil => () // nothing to merge
    }

  // Features are merged as-is, so the output features are exactly those of the aggregate context.
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    aggregateContext.getAllFeatures.asScala.toSet
}

View File

@ -1,226 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
/**
 * Describes one engagement-rate output: which aggregate features supply the
 * engagement and impression counts, and which feature receives the score.
 *
 * @param engagementFeature aggregate feature read as the engagement count
 * @param impressionFeature aggregate feature read as the impression count
 * @param outputFeature feature the combined score is written to
 */
case class CtrDescriptor(
engagementFeature: Feature[JDouble],
impressionFeature: Feature[JDouble],
outputFeature: Feature[JDouble])
object PickTopCtrBuilderHelper {

  /**
   * Builds one [[CtrDescriptor]] per engagement aggregate of the groups whose
   * prefix is ''aggregatePrefix''.
   *
   * The impression feature is taken from an aggregate descriptor that has
   * neither a feature nor a label in its query; engagement features come from
   * descriptors with no feature and a label contained in ''engagementLabels''.
   * If several impression features qualify, the one picked follows the
   * iteration order of the underlying set.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param outputSuffix appended (after a ".") to the engagement feature name
   *        to form the output feature name
   * @throws NoSuchElementException if no impression aggregate exists for the prefix
   */
  def createCtrDescriptors(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String
  ): Set[CtrDescriptor] = {
    val aggregateFeatures = aggregatesToCompute
      .filter(_.aggregatePrefix == aggregatePrefix)

    val impressionFeature = aggregateFeatures
      .flatMap { group =>
        group.individualAggregateDescriptors
          .filter(_.query.feature == None)
          .filter(_.query.label == None)
          .flatMap(_.outputFeatures)
      }
      .headOption
      // Fail with the offending prefix rather than a bare "empty iterator" error.
      .getOrElse(
        throw new NoSuchElementException(
          s"No impression aggregate (no feature, no label) found for aggregate prefix: $aggregatePrefix"
        )
      )
      .asInstanceOf[Feature[JDouble]]

    val aggregateEngagementFeatures =
      aggregateFeatures
        .flatMap { group =>
          group.individualAggregateDescriptors
            .filter(_.query.feature == None)
            .filter { descriptor =>
              //TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
              descriptor.query.label.exists(engagementLabels.contains(_))
            }
            .flatMap(_.outputFeatures)
        }
        .map(_.asInstanceOf[Feature[JDouble]])

    aggregateEngagementFeatures
      .map { aggregateEngagementFeature =>
        CtrDescriptor(
          engagementFeature = aggregateEngagementFeature,
          impressionFeature = impressionFeature,
          outputFeature = new Feature.Continuous(
            aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
            AggregationMetricCommon.derivePersonalDataTypes(
              Some(aggregateEngagementFeature),
              Some(impressionFeature)
            )
          )
        )
      }
  }
}
object PickTopCtrPolicy {

  /**
   * Creates a [[PickTopCtrPolicy]] whose descriptors are derived from the
   * aggregate groups matching ''aggregatePrefix''.
   *
   * @param smoothing value added to the impressions in the rate's denominator
   * @param outputSuffix suffix of the generated output feature names
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    smoothing: Double = 1.0,
    outputSuffix: String = "ratio"
  ): PickTopCtrPolicy =
    PickTopCtrPolicy(
      ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix = aggregatePrefix,
        engagementLabels = engagementLabels,
        aggregatesToCompute = aggregatesToCompute,
        outputSuffix = outputSuffix
      ),
      smoothing = smoothing
    )
}
object CombinedTopNCtrsByWilsonConfidenceIntervalPolicy {

  /**
   * Creates a [[CombinedTopNCtrsByWilsonConfidenceIntervalPolicy]] whose
   * descriptors are derived from the aggregate groups matching ''aggregatePrefix''.
   *
   * @param outputSuffix suffix of the generated output feature names
   * @param z z-score used for the Wilson score interval
   * @param topN number of highest rates combined into the final score
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String = "ratioWithWCI",
    z: Double = 1.96,
    topN: Int = 1
  ): CombinedTopNCtrsByWilsonConfidenceIntervalPolicy =
    CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
      ctrDescriptors = PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix = aggregatePrefix,
        engagementLabels = engagementLabels,
        aggregatesToCompute = aggregatesToCompute,
        outputSuffix = outputSuffix
      ),
      z = z,
      topN = topN
    )
}
/**
 * A merge policy that scores the aggregate records by engagement rate
 * (derived from two features per descriptor, representing engagements and
 * impressions), and writes a single score per descriptor to the descriptor's
 * outputFeature.
 *
 * This is an abstract class. Variants are made by supplying calculateCtr
 * (rate of one aggregate record) and combineTopNCtrsToSingleScore (how the
 * rates, sorted highest first, collapse into one score).
 */
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
    extends SparseBinaryMergePolicy {

  // Reads a continuous feature, treating a missing (null) value as 0.0.
  private def getContinuousFeature(
    aggregateRecord: DataRecord,
    feature: Feature[JDouble]
  ): Double =
    Option(SRichDataRecord(aggregateRecord).getFeatureValue(feature))
      .fold(0.0)(_.asInstanceOf[JDouble].toDouble)

  /**
   * For every provided descriptor, compute the corresponding CTR feature
   * and only hydrate this result to the provided input record.
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    for (descriptor <- ctrDescriptors) {
      val ctrsDescending = aggregateRecords
        .map { aggregateRecord =>
          val impressions = getContinuousFeature(aggregateRecord, descriptor.impressionFeature)
          val engagements = getContinuousFeature(aggregateRecord, descriptor.engagementFeature)
          calculateCtr(impressions, engagements)
        }
        .sortBy(ctr => -ctr)

      // No score (None) means the output feature is left unset.
      combineTopNCtrsToSingleScore(ctrsDescending).foreach { score =>
        SRichDataRecord(mutableInputRecord).setFeatureValue(descriptor.outputFeature, score)
      }
    }

  /** Engagement rate of a single aggregate record. */
  protected def calculateCtr(impressions: Double, engagements: Double): Double

  /** Collapses the rates (highest first) into one score, or None to write nothing. */
  protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]

  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    ctrDescriptors
      .map(descriptor => descriptor.outputFeature)
      .toSet
}
/**
 * Scores each aggregate record as engagements / (smoothing + impressions) and
 * outputs the highest such rate.
 *
 * @param smoothing added to the impressions in the denominator; must be positive
 */
case class PickTopCtrPolicy(ctrDescriptors: Set[CtrDescriptor], smoothing: Double = 1.0)
    extends PickTopCtrPolicyBase(ctrDescriptors) {
  require(smoothing > 0.0)

  override def calculateCtr(impressions: Double, engagements: Double): Double = {
    val smoothedImpressions = smoothing + impressions
    engagements / smoothedImpressions
  }

  // Rates arrive sorted highest first, so the head is the top rate.
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    sortedCtrs.headOption
}
/**
 * Scores each aggregate record by the lower bound of its Wilson score interval
 * and combines the ''topN'' highest bounds into one probability.
 *
 * @param z z-score designating the confidence of the interval
 * @param topN number of highest rates combined into the final score
 */
case class CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
  ctrDescriptors: Set[CtrDescriptor],
  z: Double = 1.96,
  topN: Int = 1)
    extends PickTopCtrPolicyBase(ctrDescriptors) {

  // Powers of z that recur in the interval formula.
  private val zSquared = z * z
  private val zSquaredDiv2 = zSquared / 2.0
  private val zSquaredDiv4 = zSquared / 4.0

  /**
   * calculates the lower bound of wilson score interval. which roughly says "the actual engagement
   * rate is at least this value" with confidence designated by the z-score:
   * https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
   *
   * Returns 0.0 when there are no impressions (and no engagements).
   */
  override def calculateCtr(rawImpressions: Double, engagements: Double): Double = {
    // just in case engagements happens to be more than impressions...
    val impressions = Math.max(rawImpressions, engagements)

    if (impressions > 0.0) {
      val p = engagements / impressions
      val shiftedCenter = p + zSquaredDiv2 / impressions
      val margin =
        z * Math.sqrt((p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions)
      val normalizer = 1.0 + zSquared / impressions
      (shiftedCenter - margin) / normalizer
    } else {
      0.0
    }
  }

  /**
   * takes the topN engagement rates, and returns the joint probability as {1.0 - Π(1.0 - p)}
   *
   * e.g. let's say you have 0.6 chance of clicking on a tweet shared by the user A.
   * you also have 0.3 chance of clicking on a tweet shared by the user B.
   * seeing a tweet shared by both A and B will not lead to 0.9 chance of you clicking on it.
   * but you could say that you have 0.4*0.7 chance of NOT clicking on that tweet.
   *
   * The product is evaluated as exp of a sum of logs.
   */
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    if (sortedCtrs.isEmpty) {
      None
    } else {
      val logOfNoEngagement = sortedCtrs.take(topN).map(p => Math.log(1.0 - p)).sum
      Some(1.0 - Math.exp(logOfNoEngagement))
    }
}

View File

@ -1,199 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.Feature
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.UnsortedGrouped
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import java.util.{Set => JSet}
import scala.collection.JavaConverters._
/**
 * Joins an input data set with a data set of aggregate features when the join
 * keys may include sparse binary features. Every input record is expanded into
 * one string key per permutation of its key values, joined against the keyed
 * aggregate records, and the matching aggregate records are folded back into
 * the input record by a SparseBinaryMergePolicy.
 */
object SparseBinaryAggregateJoin {
import TypedAggregateGroup._
/**
 * Builds the string key of a record from the values of ''joinKeyList''.
 * For a sparse binary key the value is read through `sparseFeature(sparseKey)`
 * (imported from TypedAggregateGroup) instead of the key feature itself.
 */
def makeKey(record: DataRecord, joinKeyList: List[Feature[_]]): String = {
joinKeyList.map {
case sparseKey: Feature.SparseBinary =>
SRichDataRecord(record).getFeatureValue(sparseFeature(sparseKey))
case nonSparseKey: Feature[_] =>
SRichDataRecord(record).getFeatureValue(nonSparseKey)
}.toString
}
/**
* @param record Data record to get all possible sparse aggregate keys from
* @param joinKeyList List of join key features (some can be sparse and some non-sparse)
* @return A list of string keys to use for joining
*/
def makeKeyPermutations(record: DataRecord, joinKeyList: List[Feature[_]]): List[String] = {
// (feature id -> set of values) for every key feature present on the record;
// keys whose value is null are dropped here.
val allIdValues = joinKeyList.flatMap {
case sparseKey: Feature.SparseBinary => {
val id = sparseKey.getDenseFeatureId
val valuesOpt = Option(SRichDataRecord(record).getFeatureValue(sparseKey))
.map(_.asInstanceOf[JSet[String]].asScala.toSet)
valuesOpt.map { (id, _) }
}
case nonSparseKey: Feature[_] => {
val id = nonSparseKey.getDenseFeatureId
Option(SRichDataRecord(record).getFeatureValue(nonSparseKey)).map { value =>
(id, Set(value.toString))
}
}
}
// One key per permutation; a key feature missing from a permutation contributes "".
sparseBinaryPermutations(allIdValues).toList.map { idValues =>
joinKeyList.map { key => idValues.getOrElse(key.getDenseFeatureId, "") }.toString
}
}
// Aggregate records keyed by their single join key.
private[this] def mkKeyIndexedAggregates(
joinFeaturesDataSet: DataSetPipe,
joinKeyList: List[Feature[_]]
): TypedPipe[(String, DataRecord)] =
joinFeaturesDataSet.records
.map { record => (makeKey(record, joinKeyList), record) }
// Input records repeated once per key permutation.
private[this] def mkKeyIndexedInput(
inputDataSet: DataSetPipe,
joinKeyList: List[Feature[_]]
): TypedPipe[(String, DataRecord)] =
inputDataSet.records
.flatMap { record =>
for {
key <- makeKeyPermutations(record, joinKeyList)
} yield { (key, record) }
}
// Same as mkKeyIndexedInput, but carries only the record's unique-id key instead of the record.
private[this] def mkKeyIndexedInputWithUniqueId(
inputDataSet: DataSetPipe,
joinKeyList: List[Feature[_]],
uniqueIdFeatureList: List[Feature[_]]
): TypedPipe[(String, String)] =
inputDataSet.records
.flatMap { record =>
for {
key <- makeKeyPermutations(record, joinKeyList)
} yield { (key, makeKey(record, uniqueIdFeatureList)) }
}
// Inner join on the key, then regroup: input record -> all aggregate records it matched.
private[this] def mkRecordIndexedAggregates(
keyIndexedInput: TypedPipe[(String, DataRecord)],
keyIndexedAggregates: TypedPipe[(String, DataRecord)]
): UnsortedGrouped[DataRecord, List[DataRecord]] =
keyIndexedInput
.join(keyIndexedAggregates)
.map { case (_, (inputRecord, aggregateRecord)) => (inputRecord, aggregateRecord) }
.group
.toList
// Inner join on the key, then regroup: unique id -> all aggregate records it matched.
private[this] def mkRecordIndexedAggregatesWithUniqueId(
keyIndexedInput: TypedPipe[(String, String)],
keyIndexedAggregates: TypedPipe[(String, DataRecord)]
): UnsortedGrouped[String, List[DataRecord]] =
keyIndexedInput
.join(keyIndexedAggregates)
.map { case (_, (inputId, aggregateRecord)) => (inputId, aggregateRecord) }
.group
.toList
/**
 * Left-joins the input records (keyed by the record itself) with their matched
 * aggregate records and applies ''mergePolicy'' where matches exist. Records
 * without matches are passed through unchanged.
 */
def mkJoinedDataSet(
inputDataSet: DataSetPipe,
joinFeaturesDataSet: DataSetPipe,
recordIndexedAggregates: UnsortedGrouped[DataRecord, List[DataRecord]],
mergePolicy: SparseBinaryMergePolicy
): TypedPipe[DataRecord] =
inputDataSet.records
.map(record => (record, ()))
.leftJoin(recordIndexedAggregates)
.map {
case (inputRecord, (_, aggregateRecordsOpt)) =>
aggregateRecordsOpt
.map { aggregateRecords =>
mergePolicy.mergeRecord(
inputRecord,
aggregateRecords,
joinFeaturesDataSet.featureContext
)
inputRecord
}
.getOrElse(inputRecord)
}
/**
 * Variant of mkJoinedDataSet that keys the left join by the string built from
 * ''uniqueIdFeatureList'' rather than by the whole input record.
 */
def mkJoinedDataSetWithUniqueId(
inputDataSet: DataSetPipe,
joinFeaturesDataSet: DataSetPipe,
recordIndexedAggregates: UnsortedGrouped[String, List[DataRecord]],
mergePolicy: SparseBinaryMergePolicy,
uniqueIdFeatureList: List[Feature[_]]
): TypedPipe[DataRecord] =
inputDataSet.records
.map(record => (makeKey(record, uniqueIdFeatureList), record))
.leftJoin(recordIndexedAggregates)
.map {
case (_, (inputRecord, aggregateRecordsOpt)) =>
aggregateRecordsOpt
.map { aggregateRecords =>
mergePolicy.mergeRecord(
inputRecord,
aggregateRecords,
joinFeaturesDataSet.featureContext
)
inputRecord
}
.getOrElse(inputRecord)
}
/**
* If uniqueIdFeaturesOpt is non-empty and the join keys include a sparse binary
* key, the join will use this set of keys as a unique id to reduce
* memory consumption. You should need this option only for
* memory-intensive joins to avoid OOM errors.
*
* When none of the join keys is sparse binary, this falls back to
* `inputDataSet.joinWithSmaller` and the merge policy is not used.
*
* @param inputDataSet data set whose records receive the aggregate features
* @param joinKeys product (e.g. tuple) of the join key features
* @param joinFeaturesDataSet data set holding the aggregate records
* @param mergePolicy how the matched aggregate records are merged into an input record
* @param uniqueIdFeaturesOpt optional product of features identifying an input record
*/
def apply(
inputDataSet: DataSetPipe,
joinKeys: Product,
joinFeaturesDataSet: DataSetPipe,
mergePolicy: SparseBinaryMergePolicy = PickFirstRecordPolicy,
uniqueIdFeaturesOpt: Option[Product] = None
): DataSetPipe = {
val joinKeyList = joinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]
val sparseBinaryJoinKeySet =
joinKeyList.toSet.filter(_.getFeatureType() == FeatureType.SPARSE_BINARY)
val containsSparseBinaryKey = !sparseBinaryJoinKeySet.isEmpty
if (containsSparseBinaryKey) {
val uniqueIdFeatureList = uniqueIdFeaturesOpt
.map(uniqueIdFeatures =>
uniqueIdFeatures.productIterator.toList.asInstanceOf[List[Feature[_]]])
.getOrElse(List.empty[Feature[_]])
val keyIndexedAggregates = mkKeyIndexedAggregates(joinFeaturesDataSet, joinKeyList)
// Without unique-id features the whole input record is the grouping key;
// with them only the (smaller) id string is carried through the join.
val joinedDataSet = if (uniqueIdFeatureList.isEmpty) {
val keyIndexedInput = mkKeyIndexedInput(inputDataSet, joinKeyList)
val recordIndexedAggregates =
mkRecordIndexedAggregates(keyIndexedInput, keyIndexedAggregates)
mkJoinedDataSet(inputDataSet, joinFeaturesDataSet, recordIndexedAggregates, mergePolicy)
} else {
val keyIndexedInput =
mkKeyIndexedInputWithUniqueId(inputDataSet, joinKeyList, uniqueIdFeatureList)
val recordIndexedAggregates =
mkRecordIndexedAggregatesWithUniqueId(keyIndexedInput, keyIndexedAggregates)
mkJoinedDataSetWithUniqueId(
inputDataSet,
joinFeaturesDataSet,
recordIndexedAggregates,
mergePolicy,
uniqueIdFeatureList
)
}
DataSetPipe(
joinedDataSet,
mergePolicy.mergeContext(
inputDataSet.featureContext,
joinFeaturesDataSet.featureContext
)
)
} else {
inputDataSet.joinWithSmaller(joinKeys, joinFeaturesDataSet) { _.pass }
}
}
}

View File

@ -1,81 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import scala.collection.JavaConverters._
/**
 * Strategy for collapsing several aggregate records into a single training record.
 *
 * When the aggregates framework groups by a sparse binary key, it emits a
 * separate set of aggregate feature values for every value the sparse key can
 * take. Joining those aggregates back onto a training data set therefore
 * yields, for each training record, one candidate set of aggregate features
 * per value of the sparse key(s) present in that record. A merge policy
 * defines how that variable number of candidates is condensed into a fixed
 * number of features for training: for instance take the first candidate
 * (effectively random), take the top one by some attribute, or average them.
 *
 * Example: group by (ADVERTISER_ID, INTEREST_ID), where INTEREST_ID is the
 * sparse key, and compute a "CTR" aggregate per pair measuring the click
 * through rate on ads with that (ADVERTISER_ID, INTEREST_ID). Suppose the
 * aggregate records are:
 *
 *   (ADVERTISER_ID = 1, INTEREST_ID = 1, CTR = 5%)
 *   (ADVERTISER_ID = 1, INTEREST_ID = 2, CTR = 15%)
 *   (ADVERTISER_ID = 2, INTEREST_ID = 1, CTR = 1%)
 *   (ADVERTISER_ID = 2, INTEREST_ID = 2, CTR = 10%)
 *   ...
 *
 * A training record carries one ADVERTISER_ID but several INTEREST_ID values:
 *
 *   (ADVERTISER_ID = 1, INTEREST_IDS = (1,2))
 *
 * so the join offers more than one CTR for it: two here (5% and 15%), and in
 * general as many as the user has interests. The merge policy decides how
 * these CTRs are combined into features:
 *
 *  - "Pick first" takes whichever CTR happens to come first in the list
 *    (maybe 5%); probably not a good policy.
 *  - "Sort by CTR" takes the top CTR (here 15%).
 *  - "Top K sorted by CTR" would use both 5% and 15%; "Avg CTR" would use 10%.
 *
 * Each such policy is an object or case class implementing this trait.
 */
trait SparseBinaryMergePolicy {

  /**
   * Merges the aggregate records into the input record, in place.
   *
   * @param mutableInputRecord Input record to add aggregates to
   * @param aggregateRecords Aggregate feature records
   * @param aggregateContext Context for aggregate records
   */
  def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit

  /**
   * @param aggregateContext Context for aggregate records
   * @return the aggregate features present after this policy has been applied
   */
  def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]]

  /**
   * @param inputContext Context for input record
   * @param aggregateContext Context for aggregate records
   * @return Context for record returned by mergeRecord()
   */
  def mergeContext(
    inputContext: FeatureContext,
    aggregateContext: FeatureContext
  ): FeatureContext = {
    val inputFeatures = inputContext.getAllFeatures.asScala.toSet
    val outputFeatures = inputFeatures ++ aggregateFeaturesPostMerge(aggregateContext)
    new FeatureContext(outputFeatures.toSeq.asJava)
  }

  /**
   * Output features of an aggregate group once this merge policy is taken into account.
   *
   * Groups that do not aggregate on any SPARSE_BINARY key are unaffected by the
   * policy, so their output features are returned as they are.
   *
   * @param config aggregate group to inspect
   * @return the features the group contributes after merging
   */
  def allOutputFeaturesPostMergePolicy[T](config: TypedAggregateGroup[T]): Set[Feature[_]] = {
    val groupsBySparseBinaryKey =
      config.keysToAggregate.exists(_.getFeatureType == FeatureType.SPARSE_BINARY)
    if (groupsBySparseBinaryKey) {
      aggregateFeaturesPostMerge(new FeatureContext(config.allOutputFeatures.toSeq.asJava))
    } else {
      config.allOutputFeatures
    }
  }
}

View File

@ -1,109 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.ml.api._
import com.twitter.ml.api.Feature
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.typed.TypedPipe
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup.sparseFeature
import scala.collection.JavaConverters._
/**
 * Describes one aggregate data set to be joined on a sparse binary key.
 *
 * @param aggregates data set holding the aggregate records to join in
 * @param sparseKey sparse binary feature to join on
 * @param mergePolicies merge policies to apply to the aggregate records found for this key
 */
case class SparseJoinConfig(
aggregates: DataSetPipe,
sparseKey: Feature.SparseBinary,
mergePolicies: SparseBinaryMergePolicy*)
/**
 * Joins several sparse-binary-keyed aggregate data sets onto a source data
 * set in one pass, using a single key (`commonKey`) shared by the source and
 * every aggregate data set.
 */
object SparseBinaryMultipleAggregateJoin {

  /** (common key value, ((sparse key feature, sparse key value), aggregate record)) */
  type CommonMap = (String, ((Feature.SparseBinary, String), DataRecord))

  /**
   * @param source data set whose records receive the aggregate features
   * @param commonKey feature present on both source and aggregate records; its
   *                  string value is the join key
   * @param joinConfigs one entry per aggregate data set to join in
   * @param rightJoin if set, perform the join with the aggregates on the left-hand
   *                  side (`rightJoin`) instead of a `leftJoin` from the source records
   * @param isSketchJoin if set, use a sketched left join; takes precedence over `rightJoin`
   * @param numSketchJoinReducers reducer count passed to `sketch` when `isSketchJoin` is set
   * @return the source records with aggregates merged in, and the merged feature context
   */
  def apply(
    source: DataSetPipe,
    commonKey: Feature[_],
    joinConfigs: Set[SparseJoinConfig],
    rightJoin: Boolean = false,
    isSketchJoin: Boolean = false,
    numSketchJoinReducers: Int = 0
  ): DataSetPipe = {
    // Tag every aggregate record with its common key value and its
    // (sparse key feature, sparse key value) pair.
    val taggedAggregates: Set[TypedPipe[CommonMap]] = joinConfigs.map { config =>
      config.aggregates.records.map { aggregateRecord =>
        val sparseKeyValue = SRichDataRecord(aggregateRecord)
          .getFeatureValue(sparseFeature(config.sparseKey))
          .toString
        val commonKeyValue =
          SRichDataRecord(aggregateRecord).getFeatureValue(commonKey).toString
        (commonKeyValue, ((config.sparseKey, sparseKeyValue), aggregateRecord))
      }
    }

    // Union all aggregate data sets, then build, per common key value, a lookup
    // from (sparse key feature, sparse key value) to aggregate record.
    val noAggregates: TypedPipe[CommonMap] = TypedPipe.empty
    val commonKeyToAggregateMap = taggedAggregates
      .foldLeft(noAggregates)(_ ++ _)
      .group
      .toList
      .map { case (commonKeyValue, taggedRecords) => (commonKeyValue, taggedRecords.toMap) }

    val commonKeyToRecordMap = source.records.map { sourceRecord =>
      (SRichDataRecord(sourceRecord).getFeatureValue(commonKey).toString, sourceRecord)
    }

    // rightJoin is not supported by Sketched, so rightJoin will be ignored if isSketchJoin is set
    implicit val string2Byte: String => Array[Byte] =
      (value: String) => Injection[String, Array[Byte]](value)

    val intermediateRecords = (isSketchJoin, rightJoin) match {
      case (true, _) =>
        commonKeyToRecordMap.group
          .sketch(numSketchJoinReducers)
          .leftJoin(commonKeyToAggregateMap)
          .toTypedPipe
      case (false, true) =>
        commonKeyToAggregateMap
          .rightJoin(commonKeyToRecordMap)
          .mapValues(_.swap) // back to (input record, optional aggregates) order
          .toTypedPipe
      case (false, false) =>
        commonKeyToRecordMap.leftJoin(commonKeyToAggregateMap).toTypedPipe
    }

    val joinedRecords = intermediateRecords.map {
      case (_, (inputRecord, aggregateTupleMapOpt)) =>
        for {
          aggregateTupleMap <- aggregateTupleMapOpt
          config <- joinConfigs
        } {
          // The sparse key may be absent from the input record; treat that as no values.
          val sparseKeyValues =
            Option(SRichDataRecord(inputRecord).getFeatureValue(config.sparseKey))
              .fold(List.empty[String])(_.asScala.toList)
          val aggregateRecords = sparseKeyValues.flatMap { sparseKeyValue =>
            aggregateTupleMap.get((config.sparseKey, sparseKeyValue))
          }
          config.mergePolicies.foreach { policy =>
            policy.mergeRecord(inputRecord, aggregateRecords, config.aggregates.featureContext)
          }
        }
        inputRecord
    }

    // Thread the feature context through every merge policy of every join config.
    val joinedFeatureContext =
      joinConfigs.foldLeft(source.featureContext) { (contextSoFar, config) =>
        config.mergePolicies.foldLeft(contextSoFar) { (context, policy) =>
          policy.mergeContext(context, config.aggregates.featureContext)
        }
      }

    DataSetPipe(joinedRecords, joinedFeatureContext)
  }
}

View File

@ -1,5 +0,0 @@
aggregation.rst
batch.rst
index.rst
real-time.rst
troubleshooting.rst

View File

@ -1,167 +0,0 @@
.. _aggregation:
Core Concepts
=============
This page provides an overview of the aggregation framework and goes through examples on how to define aggregate features. In general, we can think of an aggregate feature as a grouped set of records, on which we incrementally update the aggregate feature values, crossed by the provided features and conditional on the provided labels.
AggregateGroup
--------------
An `AggregateGroup` defines a single unit of aggregate computation, similar to a SQL query. These are executed by the underlying jobs (internally, a `DataRecordAggregationMonoid <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala#n42>`_ is applied to `DataRecords` that contain the features to aggregate). Many of these groups can exist to define different types of aggregate features.
Let's start with the following examples of an `AggregateGroup` to discuss the meaning of each of its constructor arguments:
.. code-block:: scala
val UserAggregateStore = "user_aggregates"
val aggregatesToCompute: Set[TypedAggregateGroup[_]] = Set(
AggregateGroup(
inputSource = timelinesDailyRecapSource,
aggregatePrefix = "user_aggregate_v2",
preTransformOpt = Some(RemoveUserIdZero),
keys = Set(USER_ID),
features = Set(HAS_PHOTO),
labels = Set(IS_FAVORITED),
metrics = Set(CountMetric, SumMetric),
halfLives = Set(50.days),
outputStore = OfflineAggregateStore(
name = UserAggregateStore,
startDate = "2016-07-15 00:00",
commonConfig = timelinesDailyAggregateSink,
batchesToKeep = 5
)
)
.flatMap(_.buildTypedAggregateGroups)
)
This `AggregateGroup` computes the number of times each user has faved a tweet with a photo. The aggregate count is decayed with a 50 day halflife.
Naming and preprocessing
------------------------
`UserAggregateStore` is a string val that acts as a scope of a "root path" to which this group of aggregate features will be written. The root path is provided separately by the implementing job.
`inputSource` defines the input source of `DataRecords` that we aggregate on. These records contain the relevant features required for aggregation.
`aggregatePrefix` tells the framework what prefix to use for the aggregate features it generates. A descriptive naming scheme with versioning makes it easier to maintain features as you add or remove them over the long-term.
`preTransforms` is a `Seq[com.twitter.ml.api.ITransform] <https://cgit.twitter.biz/source/tree/src/java/com/twitter/ml/api/ITransform.java>`_ that can be applied to the data records read from the input source before they are fed into the `AggregateGroup` to apply aggregation. These transforms are optional but can be useful for certain preprocessing operations for a group's raw input features.
.. admonition:: Examples
You can downsample input data records by providing `preTransforms`. In addition, you could also join different input labels (e.g. "is_push_opened" and "is_push_favorited") and transform them into a combined label that is their union ("is_push_engaged") on which aggregate counts will be calculated.
Keys
----
`keys` is a crucial field in the config. It defines a `Set[com.twitter.ml.api.Feature]` which specifies a set of grouping keys to use for this `AggregateGroup`.
Keys can only be of 3 supported types currently: `DISCRETE`, `STRING` and `SPARSE_BINARY`. Using a discrete or a string/text feature as a key specifies the unit to group records by before applying counting/aggregation operators.
.. admonition:: Examples
.. cssclass:: shortlist
#. If the key is `USER_ID`, this tells the framework to group all records by `USER_ID`, and then apply aggregations (sum/count/etc) within each user's data to generate aggregate features for each user.
#. If the key is `(USER_ID, AUTHOR_ID)`, then the `AggregateGroup` will output features for each unique user-author pair in the input data.
#. Finally, using a sparse binary feature as key has special "flattening" or "flatMap" like semantics. For example, consider grouping by `(USER_ID, AUTHOR_INTEREST_IDS)` where `AUTHOR_INTEREST_IDS` is a sparse binary feature which represents a set of topic IDs the author may be tweeting about. This creates one record for each `(user_id, interest_id)` pair - so each record with multiple author interests is flattened before feeding it to the aggregation.
Features
--------
`features` specifies a `Set[com.twitter.ml.api.Feature]` to aggregate within each group (defined by the keys specified earlier).
We support 2 types of `features`: `BINARY` and `CONTINUOUS`.
The semantics of how the aggregation works is slightly different based on the type of “feature”, and based on the “metric” (or aggregation operation):
.. cssclass:: shortlist
#. Binary Feature, Count Metric: Suppose we have a binary feature `HAS_PHOTO` in this set, and are applying the “Count” metric (see below for more details on the metrics), with key `USER_ID`. The semantics is that this computes a feature which measures the count of records with `HAS_PHOTO` set to true for each user.
#. Binary Feature, Sum Metric - Does not apply. No feature will be computed.
#. Continuous Feature, Count Metric - The count metric treats all features as binary features ignoring their value. For example, suppose we have a continuous feature `NUM_CHARACTERS_IN_TWEET`, and key `USER_ID`. This measures the count of records that have this feature `NUM_CHARACTERS_IN_TWEET` present.
#. Continuous Feature, Sum Metric - In the above example, the feature measures the sum of (num_characters_in_tweet) over all of a user's records. Dividing this sum feature by the count feature would give the average number of characters in all tweets.
.. admonition:: Unsupported feature types
`DISCRETE` and `SPARSE` features are not supported by the Sum Metric, because there is no meaning in summing a discrete feature or a sparse feature. You can use them with the CountMetric, but they may not do what you would expect since they will be treated as binary features losing all the information within the feature. The best way to use these is as “keys” and not as “features”.
.. admonition:: Setting includeAnyFeature
If constructor argument `includeAnyFeature` is set, the framework will append a feature with scope `any_feature` to the set of all features you define. This additional feature simply measures the total count of records. So if you set your features to be equal to Set.empty, this will measure the count of records for a given `USER_ID`.
Labels
------
`labels` specifies a set of `BINARY` features that you can cross with, prior to applying aggregations on the `features`. This essentially restricts the aggregate computation to a subset of the records within a particular key.
We typically use this to represent engagement labels in an ML model, in this case, `IS_FAVORITED`.
In this example, we are grouping by `USER_ID`, the feature is `HAS_PHOTO`, the label is `IS_FAVORITED`, and we are computing `CountMetric`. The system will output a feature for each user that represents the number of favorites on tweets having photos by this `userId`.
.. admonition:: Setting includeAnyLabel
If constructor argument `includeAnyLabel` is set (as it is by default), then similar to `any_feature`, the framework automatically appends a label of type `any_label` to the set of all labels you define, which represents not applying any filter or cross.
In this example, `any_label` and `any_feature` are set by default and the system would actually output 4 features for each `user_id`:
.. cssclass:: shortlist
#. The number of `IS_FAVORITED` (favorites) on tweet impressions having `HAS_PHOTO=true`
#. The number of `IS_FAVORITED` (favorites) on all tweet impressions (`any_feature` aggregate)
#. The number of tweet impressions having `HAS_PHOTO=true` (`any_label` aggregate)
#. The total number of tweet impressions for this user id (`any_feature.any_label` aggregate)
.. admonition:: Disabling includeAnyLabel
To disable this automatically generated feature you can use `includeAnyLabel = false` in your config. This will remove some useful features (particularly for counterfactual signal), but it can greatly save on space since it does not store every possible impressed set of keys in the output store. So use this if you are short on space, but not otherwise.
Metrics
-------
`metrics` specifies the aggregate operators to apply. The most commonly used are `Count`, `Sum` and `SumSq`.
As mentioned before, `Count` can be applied to all types of features, but treats every feature as binary and ignores the value of the feature. `Sum` and `SumSq` can only be applied to Continuous features - they will ignore all other features you specify. By combining sum and sumsq and count, you can produce powerful “z-score” features or other distributional features using a post-transform.
It is also possible to add your own aggregate operators (e.g. `LastResetMetric <https://phabricator.twitter.biz/D228537>`_) to the framework with some additional work.
HalfLives
---------
`halfLives` specifies how fast aggregate features should be decayed. It is important to note that the framework works on an incremental basis: in the batch implementation, the summingbird-scalding job takes in the most recently computed aggregate features, processed on data until day `N-1`, then reads new data records for day `N` and computes updated values of the aggregate features. Similarly, the decay of real-time aggregate features takes the actual time delta between the current time and the last time the aggregate feature value was updated.
The halflife `H` specifies how fast to decay old sums/counts to simulate a sliding window of counts. The implementation is such that it will take `H` amount of time to decay an aggregate feature to half its initial value. New observed values of sums/counts are added to the aggregate feature value.
.. admonition:: Batch and real-time
In the batch use case where aggregate features are recomputed on a daily basis, we typically take halflives on the order of weeks or longer (in Timelines, 50 days). In the real-time use case, shorter halflives are appropriate (hours) since they are updated as client engagements are received by the summingbird job.
SQL Equivalent
--------------
Conceptually, you can also think of it as:
.. code-block:: sql
INSERT INTO <outputStore>.<aggregatePrefix>
SELECT AGG(<features>) /* AGG is <metrics>, which is an exponentially decaying SUM or COUNT etc. based on the halfLives */
FROM (
SELECT preTransformOpt(*) FROM <inputSource>
)
WHERE <labels> = True
GROUP BY <keys>
any_feature is AGG(*).
any_label removes the WHERE clause.

View File

@ -1,215 +0,0 @@
.. _batch:
Batch aggregate feature jobs
============================
In the previous section, we went over the core concepts of the aggregation framework and discussed how you can set up your own `AggregateGroups` to compute aggregate features.
Given these groups, this section will discuss how you can set up offline batch jobs to produce the corresponding aggregate features, updated daily. To accomplish this, we need to set up a summingbird-scalding job that is pointed to the input data records containing features and labels to be aggregated.
Input Data
----------
In order to generate aggregate features, the relevant input features need to be available offline as a daily scalding source in `DataRecord` format (typically `DailySuffixFeatureSource <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/ml/api/FeatureSource.scala>`_; `HourlySuffixFeatureSource` may also work, but we have not tested it).
.. admonition:: Note
The input data source should contain the keys, features and labels you want to use in your `AggregateGroups`.
Aggregation Config
------------------
Now that we have a daily data source with input features and labels, we need to set up the `AggregateGroup` config itself. This contains all aggregation groups that you would like to compute and we will go through the implementation step-by-step.
.. admonition:: Example: Timelines Quality config
`TimelinesAggregationConfig <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala>`_ imports the configured `AggregateGroups` from `TimelinesAggregationConfigDetails <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala>`_. The config is then referenced by the implementing summingbird-scalding job, which we will set up below.
OfflineAggregateSource
----------------------
Each `AggregateGroup` will need to define a (daily) source of input features. We use `OfflineAggregateSource` for this to tell the aggregation framework where the input data set is and the required timestamp feature that the framework uses to decay aggregate feature values:
.. code-block:: scala
val timelinesDailyRecapSource = OfflineAggregateSource(
name = "timelines_daily_recap",
timestampFeature = TIMESTAMP,
scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/data_records"),
scaldingSuffixType = Some("daily"),
withValidation = true
)
.. admonition:: Note
.. cssclass:: shortlist
#. The name is not important as long as it is unique.
#. `timestampFeature` must be a discrete feature of type `com.twitter.ml.api.Feature[Long]` and represents the “time” of a given training record in milliseconds - for example, the time at which an engagement, push open event, or abuse event took place that you are trying to train on. If you do not already have such a feature in your daily training data, you need to add one.
#. `scaldingSuffixType` can be “hourly” or “daily” depending on the type of source (`HourlySuffixFeatureSource` vs `DailySuffixFeatureSource`).
#. Set `withValidation` to true to validate the presence of _SUCCESS file. Context: https://jira.twitter.biz/browse/TQ-10618
Output HDFS store
-----------------
The output HDFS store is where the computed aggregate features are stored. This store contains all computed aggregate feature values and is incrementally updated by the aggregates job every day.
.. code-block:: scala
val outputHdfsPath = "/user/timelines/processed/aggregates_v2"
val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig {
override def apply(startDate: String) = new OfflineAggregateStoreCommonConfig(
outputHdfsPathPrefix = outputHdfsPath,
dummyAppId = "timelines_aggregates_v2_ro", // unused - can be arbitrary
dummyDatasetPrefix = "timelines_aggregates_v2_ro", // unused - can be arbitrary
startDate = startDate
)
}
Note: `dummyAppId` and `dummyDatasetPrefix` are unused so can be set to any arbitrary value. They should be removed on the framework side.
The `outputHdfsPathPrefix` is the only field that matters, and should be set to the HDFS path where you want to store the aggregate features. Make sure you have a lot of quota available at that path.
Setting Up Aggregates Job
-------------------------
Once you have defined a config file with the aggregates you would like to compute, the next step is to create the aggregates scalding job using the config (`example <https://cgit.twitter.biz/source/tree/timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation/TimelinesAggregationScaldingJob.scala>`_). This is very concise and requires only a few lines of code:
.. code-block:: scala
object TimelinesAggregationScaldingJob extends AggregatesV2ScaldingJob {
override val aggregatesToCompute = TimelinesAggregationConfig.aggregatesToCompute
}
Now that the scalding job is implemented with the aggregation config, we need to set up a capesos config similar to https://cgit.twitter.biz/source/tree/science/scalding/mesos/timelines/prod.yml:
.. code-block:: scala
# Common configuration shared by all aggregates v2 jobs
__aggregates_v2_common__: &__aggregates_v2_common__
class: HadoopSummingbirdProducer
bundle: offline_aggregation-deploy.tar.gz
mainjar: offline_aggregation-deploy.jar
pants_target: "bundle timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation:bin"
cron_collision_policy: CANCEL_NEW
use_libjar_wild_card: true
.. code-block:: scala
# Specific job computing user aggregates
user_aggregates_v2:
<<: *__aggregates_v2_common__
cron_schedule: "25 * * * *"
arguments: --batches 1 --output_stores user_aggregates --job_name timelines_user_aggregates_v2
.. admonition:: Important
Each AggregateGroup in your config should have its own associated offline job which specifies `output_stores` pointing to the output store name you defined in your config.
Running The Job
---------------
When you run the batch job for the first time, you need to add a temporary entry to your capesos yml file that looks like this:
.. code-block:: scala
user_aggregates_v2_initial_run:
<<: *__aggregates_v2_common__
cron_schedule: "25 * * * *"
arguments: --batches 1 --start-time "2017-03-03 00:00:00" --output_stores user_aggregates --job_name timelines_user_aggregates_v2
.. admonition:: Start Time
The additional `--start-time` argument should match the `startDate` in your config for that AggregateGroup, but in the format `yyyy-mm-dd hh:mm:ss`.
To invoke the initial run via capesos, we would do the following (in Timelines case):
.. code-block:: scala
CAPESOSPY_ENV=prod capesospy-v2 update --build_locally --start_cron user_aggregates_v2_initial_run science/scalding/mesos/timelines/prod.yml
Once it is running smoothly, you can deschedule the initial run job and delete the temporary entry from your production yml config.
.. code-block:: scala
aurora cron deschedule atla/timelines/prod/user_aggregates_v2_initial_run
Note: deschedule it preemptively to avoid repeatedly overwriting the same initial results
Then schedule the production job from jenkins using something like this:
.. code-block:: scala
CAPESOSPY_ENV=prod capesospy-v2 update user_aggregates_v2 science/scalding/mesos/timelines/prod.yml
All future runs (2nd onwards) will use the permanent entry in the capesos yml config that does not have the `start-time` specified.
.. admonition:: Job name has to match
It's important that the production run should share the same `--job_name` with the initial_run so that eagleeye/statebird knows how to keep track of it correctly.
Output Aggregate Features
-------------------------
This scalding job using the example config from the earlier section would output a VersionedKeyValSource to `/user/timelines/processed/aggregates_v2/user_aggregates` on HDFS.
Note that `/user/timelines/processed/aggregates_v2` is the explicitly defined root path while `user_aggregates` is the output directory of the example `AggregateGroup` defined earlier. The latter can be different for different `AggregateGroups` defined in your config.
The VersionedKeyValSource is difficult to use directly in your jobs/offline trainings, but we provide an adapted source `AggregatesV2FeatureSource` that makes it easy to join and use in your jobs:
.. code-block:: scala
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._
val pipe: DataSetPipe = AggregatesV2FeatureSource(
rootPath = "/user/timelines/processed/aggregates_v2",
storeName = "user_aggregates",
aggregates = TimelinesAggregationConfig.aggregatesToCompute,
trimThreshold = 0
)(dateRange).read
Simply replace the `rootPath`, `storeName` and `aggregates` object to whatever you defined. The `trimThreshold` tells the framework to trim all features below a certain cutoff: 0 is a safe default to use to begin with.
.. admonition:: Usage
This can now be used like any other `DataSetPipe` in offline ML jobs. You can write out the features to a `DailySuffixFeatureSource`, you can join them with your data offline for trainings, or you can write them to a Manhattan store for serving online.
Aggregate Features Example
--------------------------
Here is a sample of the aggregate features we just computed:
.. code-block:: scala
user_aggregate_v2.pair.any_label.any_feature.50.days.count: 100.0
user_aggregate_v2.pair.any_label.tweetsource.is_quote.50.days.count: 30.0
user_aggregate_v2.pair.is_favorited.any_feature.50.days.count: 10.0
user_aggregate_v2.pair.is_favorited.tweetsource.is_quote.50.days.count: 6.0
meta.user_id: 123456789
Aggregate feature names match a `prefix.pair.label.feature.half_life.metric` schema and correspond to what was defined in the aggregation config for each of these fields.
.. admonition:: Example
In this example, the above features are capturing that userId 123456789L has:
..
A 50-day decayed count of 100 training records with any label or feature (“tweet impressions”)
A 50-day decayed count of 30 records that are “quote tweets” (tweetsource.is_quote = true)
A 50-day decayed count of 10 records that are favorites on any type of tweet (is_favorited = true)
A 50-day decayed count of 6 records that are “favorites” on “quote tweets” (both of the above are true)
By combining the above, a model might infer that for this specific user, quote tweets comprise 30% of all impressions, have a favorite rate of 6/30 = 20%, compared to a favorite rate of 10/100 = 10% on the total population of tweets.
Therefore, being a quote tweet makes this specific user `123456789L` approximately twice as likely to favorite the tweet, which is useful for prediction and could result in the ML model giving higher scores to & ranking quote tweets higher in a personalized fashion for this user.
Tests for Feature Names
--------------------------
When you change or add an `AggregateGroup`, feature names might change. The Feature Store provides a testing mechanism to assert that the feature names change as you expect. See `tests for feature names <https://docbird.twitter.biz/ml_feature_store/catalog.html#tests-for-feature-names>`_.

View File

@ -1,59 +0,0 @@
# -*- coding: utf-8 -*-
#
# Sphinx/docbird build configuration for the Aggregation Framework docs.
# Only a subset of the possible configuration values is set here; the file
# was originally autogenerated.
#
from os.path import abspath, dirname, isfile, join

# Sphinx extensions to load for this documentation set.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.ifconfig",
    "sphinx.ext.graphviz",
    "twitter.docbird.ext.thriftlexer",
    "twitter.docbird.ext.toctree_default_caption",
    "sphinxcontrib.httpdomain",
]

# Template directories, relative to this file's directory.
templates_path = ["_templates"]

# File extension of the documentation sources.
source_suffix = ".rst"

# Name of the document holding the root toctree.
master_doc = "index"

# Project metadata.
project = u"Aggregation Framework"
description = u""

# Short X.Y version, and full release string (may carry alpha/beta/rc tags).
version = u"1.0"
release = u"1.0"

exclude_patterns = ["_build"]

pygments_style = "sphinx"

html_theme = "default"
html_static_path = ["_static"]
html_logo = u""

# When no logo is configured, look next to this file for a conventional one:
# "logo" or the project name, as png or svg. This runs on every build, not
# just at init. Every candidate is checked, so when several exist the one
# listed last is used.
if not html_logo:
    location = dirname(abspath(__file__))
    for logo_file in ["logo.png", "logo.svg", project + ".png", project + ".svg"]:
        if isfile(join(location, logo_file)):
            html_logo = logo_file

graphviz_output_format = "svg"

View File

@ -1,11 +0,0 @@
.. markdowninclude:: ../README.md
.. toctree::
:maxdepth: 2
:hidden:
aggregation
batch
real-time
joining
troubleshooting

View File

@ -1,72 +0,0 @@
.. _joining:
Joining aggregates features to records
======================================
After setting up either offline batch jobs or online real-time summingbird jobs to produce
aggregate features and querying them, we are left with data records containing aggregate features.
This page will go over how to join them with other data records to produce offline training data.
(To discuss: joining aggregates to records online)
Joining Aggregates on Discrete/String Keys
------------------------------------------
Joining aggregate features keyed on discrete or text features to your training data is very easy -
you can use the built in methods provided by `DataSetPipe`. For example, suppose you have aggregates
keyed by `(USER_ID, AUTHOR_ID)`:
.. code-block:: scala
val userAuthorAggregates: DataSetPipe = AggregatesV2FeatureSource(
rootPath = "/path/to/my/aggregates",
storeName = "user_author_aggregates",
aggregates = MyConfig.aggregatesToCompute,
trimThreshold = 0
)(dateRange).read
Offline, you can then join with your training data set as follows:
.. code-block:: scala
val myTrainingData: DataSetPipe = ...
val joinedData = myTrainingData.joinWithLarger((USER_ID, AUTHOR_ID), userAuthorAggregates)
You can read from `AggregatesV2MostRecentFeatureSourceBeforeDate` in order to read the most recent aggregates
before a provided date `beforeDate`. Just note that `beforeDate` must be aligned with the date boundary so if
you're passing in a `dateRange`, use `dateRange.end`.
Joining Aggregates on Sparse Binary Keys
----------------------------------------
When joining on sparse binary keys, there can be multiple aggregate records to join to each training record in
your training data set. For example, suppose you have setup an aggregate group that is keyed on `(INTEREST_ID, AUTHOR_ID)`
capturing engagement counts of users interested in a particular `INTEREST_ID` for specific authors provided by `AUTHOR_ID`.
Suppose now that you have a training data record representing a specific user action. This training data record contains
a sparse binary feature `INTEREST_IDS` representing all the "interests" of that user - e.g. music, sports, and so on. Each `interest_id`
translates to a different set of counting features found in your aggregates data. Therefore we need a way to merge all of
these different sets of counting features to produce a more compact, fixed-size set of features.
.. admonition:: Merge policies
To do this, the aggregate framework provides a trait `SparseBinaryMergePolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMergePolicy.scala>`_. Classes overriding this trait define policies
that state how to merge the individual aggregate features from each sparse binary value (in this case, each `INTEREST_ID` for a user).
Furthermore, we provide `SparseBinaryMultipleAggregateJoin` which executes these policies to merge aggregates.
A simple policy might simply average all the counts from the individual interests, or just take the max, or
a specific quantile. More advanced policies might use custom criteria to decide which interest is most relevant and choose
features from that interest to represent the user, or use some weighted combination of counts.
The framework provides two simple in-built policies (`PickTopCtrPolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/PickTopCtrPolicy.scala>`_
and `CombineCountsPolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/CombineCountsPolicy.scala>`_, which keeps the topK counts per
record) that you can get started with, though you likely want to implement your own policy based on domain knowledge to get
the best results for your specific problem domain.
.. admonition:: Offline Code Example
The scalding job `TrainingDataWithAggV2Generator <https://cgit.twitter.biz/source/tree/timelines/data_processing/ad_hoc/recap/training_data_generator/TrainingDataWithAggV2Generator.scala>`_ shows how multiple merge policies are defined and implemented to merge aggregates on sparse binary keys to the TQ's training data records.
.. admonition:: Online Code Example
In our (non-FeatureStore enabled) online code path, we merge aggregates on sparse binary keys using the `CombineCountsPolicy <https://cgit.twitter.biz/source/tree/timelinemixer/server/src/main/scala/com/twitter/timelinemixer/injection/recapbase/aggregates/UserFeaturesHydrator.scala#n201>`_.

View File

@ -1,327 +0,0 @@
.. _real_time:
Real-Time aggregate features
============================
In addition to computing batch aggregate features, the aggregation framework supports real-time aggregates as well. The framework concepts used here are identical to the batch use case, however, the underlying implementation differs and is provided by summingbird-storm jobs.
RTA Runbook
-----------
For operational details, please visit http://go/tqrealtimeaggregates.
Prerequisites
-------------
In order to start computing real-time aggregate features, the framework requires the following to be provided:
* A backing memcached store that will hold the computed aggregate features. This is conceptually equivalent to the output HDFS store in the batch compute case.
* Implementation of `StormAggregateSource <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/heron/StormAggregateSource.scala#n15>`_ that creates `DataRecords` with the necessary input features. This serves as the input to the aggregation operations.
* Definition of aggregate features by defining `AggregateGroup` in an implementation of `OnlineAggregationConfigTrait`. This is identical to the batch case.
* Job config file defining the backing memcached for feature storage and retrieval, and job-related parameters.
We will now go through the details in setting up each required component.
Memcached store
---------------
Real-time aggregates use Memcache as the backing cache to store and update aggregate features keys. Caches can be provisioned on `go/cacheboard <https://cacheboardv2--prod--cache.service.atla.twitter.biz/>`_.
.. admonition:: Test and prod caches
For development, it is sufficient to setup a test cache that your new job can query and write to. At the same time, a production cache request should also be submitted as these generally have significant lead times for provisioning.
StormAggregateSource
--------------------
To enable aggregation of your features, we need to start with defining a `StormAggregateSource` that builds a `Producer[Storm, DataRecord]`. This summingbird producer generates `DataRecords` that contain the input features and labels that the real-time aggregate job will compute aggregate features on. Conceptually, this is equivalent to the input data set in the offline batch use case.
.. admonition:: Example
If you are planning to aggregate on client engagements, you would need to subscribe to the `ClientEvent` kafka stream and then convert each event to a `DataRecord` that contains the key and the engagement on which to aggregate.
Typically, we would setup a julep filter for the relevant client events that we would like to aggregate on. This gives us a `Producer[Storm, LogEvent]` object which we then convert to `Producer[Storm, DataRecord]` with adapters that we wrote:
.. code-block:: scala
lazy val clientEventProducer: Producer[Storm, LogEvent] =
ClientEventSourceScrooge(
appId = AppId(jobConfig.appId),
topic = "julep_client_event_suggests",
resumeAtLastReadOffset = false
).source.name("timelines_events")
lazy val clientEventWithCachedFeaturesProducer: Producer[Storm, DataRecord] = clientEventProducer
.flatMap(mkDataRecords)
Note that this way of composing the storm graph gives us flexibility in how we can hydrate input features. If you would like to join more complex features to `DataRecord`, you can do so here with additional storm components which can implement cache queries.
.. admonition:: Timelines Quality use case
In Timelines Quality, we aggregate client engagements on `userId` or `tweetId` and implement
`TimelinesStormAggregateSource <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala>`_. We create
`Producer[Storm,LogEvent]` of Timelines engagements to which we apply `ClientLogEventAdapter <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/adapters/client_log_event/ClientLogEventAdapter.scala>`_ which converts the event to `DataRecord` containing `userId`, `tweetId`, `timestampFeature` of the engagement and the engagement label itself.
.. admonition:: MagicRecs use case
MagicRecs has a very similar setup for real-time aggregate features. In addition, they also implement a more complex cache query to fetch the user's history in the `StormAggregateSource` for each observed client engagement to hydrate a richer set of input `DataRecords`:
.. code-block:: scala
val userHistoryStoreService: Storm#Service[Long, History] =
Storm.service(UserHistoryReadableStore)
val clientEventDataRecordProducer: Producer[Storm, DataRecord] =
magicRecsClientEventProducer
.flatMap { ...
(userId, logEvent)
}.leftJoin(userHistoryStoreService)
.flatMap {
case (_, (logEvent, history)) =>
mkDataRecords(LogEventHistoryPair(logEvent, history))
}
.. admonition:: EmailRecs use case
EmailRecs shares the same cache as MagicRecs. They combine notification scribe data with email history data to identify the particular item a user engaged with in an email:
.. code-block:: scala
val emailHistoryStoreService: Storm#Service[Long, History] =
Storm.service(EmailHistoryReadableStore)
val emailEventDataRecordProducer: Producer[Storm, DataRecord] =
emailEventProducer
.flatMap { ...
(userId, logEvent)
}.leftJoin(emailHistoryStoreService)
.flatMap {
case (_, (scribe, history)) =>
mkDataRecords(ScribeHistoryPair(scribe, history))
}
Aggregation config
------------------
The real-time aggregation config is extended from `OnlineAggregationConfigTrait <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationConfigTrait.scala>`_ and defines the features to aggregate and the backing memcached store to which they will be written.
Setting up real-time aggregates follows the same rules as in the offline batch use case. The major difference here is that `inputSource` should point to the `StormAggregateSource` implementation that provides the `DataRecord` containing the engagements and core features on which to aggregate. In the offline case, this would have been an `OfflineAggregateSource` pointing to an offline source of daily records.
Finally, `RealTimeAggregateStore` defines the backing memcache to be used and should be provided here as the `outputStore`.
.. NOTE::
Please make sure to provide an `AggregateGroup` for both staging and production. The main difference should be the `outputStore` where features in either environment are read from and written to. You want to make sure that a staged real-time aggregates summingbird job is reading/writing only to the test memcache store and does not mutate the production store.
Job config
----------
In addition to the aggregation config that defines the features to aggregate, the final piece we need to provide is a `RealTimeAggregatesJobConfig` that specifies job values such as `appId`, `teamName` and counts for the various topology components that define the capacity of the job (`Timelines example <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala#n22>`_).
Once you have the job config, implementing the storm job itself is easy and almost as concise as in the batch use case:
.. code-block:: scala
object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase {
override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates")
override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs
override lazy val aggregatesToCompute = TimelinesOnlineAggregationConfig.AggregatesToCompute
}
.. NOTE::
There are some topology settings that are currently hard-coded. In particular, we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` to be true for added robustness. This may be made user-definable in the future.
Steps to hydrate RTAs
---------------------
1. Make the changes to RTAs and follow the steps for `Running the topology`.
2. Register the new RTAs to feature store. Sample phab: https://phabricator.twitter.biz/D718120
3. Wire the features from feature store to TLX. This is usually done with the feature switch set to False. So it's just a code change and will not start hydrating the features yet. Merge the phab. Sample phab: https://phabricator.twitter.biz/D718424
4. Now we hydrate the features to TLX gradually by doing it shard wise. For this, first create a PCM and then enable the hydration. Sample PCM: https://jira.twitter.biz/browse/PCM-147814
Running the topology
--------------------
0. For a phab that makes changes to the topology (such as adding new ML features), before landing the phab, please create a PCM (`example <https://jira.twitter.biz/browse/PCM-131614>`_) and deploy the change to the devel topology first and then prod (atla and pdxa). Once it is confirmed that the prod topology can handle the change, the phab can be landed.
1. Go to https://ci.twitter.biz/job/tq-ci/build
2. In `commands` input
.. code-block:: bash
. src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/deploy_local.sh [devel|atla|pdxa]
One can only deploy either `devel`, `atla` (prod atla), `pdxa` (prod pdxa) at a time.
For example, to deploy both pdxa and atla prod topologies, one needs to build/run the above steps twice, one with `pdxa` and the other with `atla`.
The status and performance stats of the topology are found at `go/heron-ui <http://heron-ui-new--prod--heron.service.pdxa.twitter.biz/topologies>`_. Here you can view whether the job is processing tuples and whether it is under any memory pressure or backpressure; the UI also provides general observability.
Finally, since we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` by default in the topology, we also need to monitor and alert on the number of dropped tuples. Since this is a job generating features a small fraction of dropped tuples is tolerable if that enables us to avoid backpressure that would hold up global computation in the entire graph.
Hydrating Real-Time Aggregate Features
--------------------------------------
Once the job is up and running, the aggregate features will be accessible in the backing memcached store. To access these features and hydrate to your online pipeline, we need to build a Memcache client with the right query key.
.. admonition:: Example
Some care needs to be taken to define the key injection and codec correctly for the memcached store. These types do not change and you can use the Timelines `memcache client builder <https://cgit.twitter.biz/source/tree/timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/real_time_aggregates_cache/RealTimeAggregatesMemcacheBuilder.scala>`_ as an example.
Aggregate features are written to store with a `(AggregationKey, BatchID)` key.
`AggregationKey <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/AggregationKey.scala#n31>`_ is an instance of the keys that you previously defined in `AggregateGroup`. If your aggregation key is `USER_ID`, you would need to instantiate `AggregationKey` with the `USER_ID` featureId and the userId value.
.. admonition:: Returned features
The `DataRecord` that is returned by the cache now contains all real-time aggregate features for the query `AggregationKey` (similar to the batch use case). If your online hydration flow produces data records, the real-time aggregate features can be joined with your existing records in a straightforward way.
Adding features from Feature Store to RTA
--------------------------------------------
To add features from Feature Store to RTA and create real time aggregated features based on them, one needs to follow these steps:
**Step 1**
Copy Strato column for features that one wants to explore and add a cache if needed. See details at `Customize any Columns for your Team as Needed <https://docbird.twitter.biz/ml_feature_store/productionisation-checklist.html?highlight=manhattan#customize-any-columns-for-your-team-as-needed>`_. As an `example <https://phabricator.twitter.biz/D441050>`_, we copy Strato column of recommendationsUserFeaturesProd.User.strato and add a cache for timelines team's usage.
**Step 2**
Create a new ReadableStore which uses Feature Store Client to request features from Feature Store. Implement FeaturesAdapter which extends TimelinesAdapterBase and derive new features based on raw features from Feature Store. As an `example <https://phabricator.twitter.biz/D458168>`_, we create UserFeaturesReadableStore which reads discrete feature user state, and convert it to a list of boolean user state features.
**Step 3**
Join these derived features from Feature Store to timelines storm aggregate source. Depending on the characteristics of these derived features, the join key could be tweet id, user id or others. As an `example <https://phabricator.twitter.biz/D454408>`_, because user state is per user, the join key is user id.
**Step 4**
Define `AggregateGroup` based on derived features in RTA
Adding New Aggregate Features from an Existing Dataset
------------------------------------------------------
To add a new aggregate feature group from an existing dataset for use in home models, use the following steps:
1. Identify the hypothesis being tested by the addition of the features, in accordance with `go/tpfeatureguide <http://go/tpfeatureguide>`_.
2. Modify or add a new AggregateGroup to `TimelinesOnlineAggregationConfigBase.scala <https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala>`_ to define the aggregation key, set of features, labels and metrics. An example phab to add more halflives can be found at `D204415 <https://phabricator.twitter.biz/D204415>`_.
3. If the change is expected to be very large, it may be recommended to perform capacity estimation. See :ref:`Capacity Estimation` for more details.
4. Create feature catalog items for the new RTAs. An example phab is `D706348 <https://phabricator.twitter.biz/D706438>`_. For approval from a featurestore owner ping #help-ml-features on slack.
5. Add new features to the featurestore. An example phab is `D706112 <https://phabricator.twitter.biz/D706112>`_. This change can be rolled out with feature switches or by canarying TLX, depending on the risk. An example PCM for feature switches is: `PCM-148654 <https://jira.twitter.biz/browse/PCM-148654>`_. An example PCM for canarying is: `PCM-145753 <https://jira.twitter.biz/browse/PCM-145753>`_.
6. Wait for redeploy and confirm the new features are available. One way is querying in BigQuery from a table like `twitter-bq-timelines-prod.continuous_training_recap_fav`. Another way is to inspect individual records using pcat. The command to be used is like:
.. code-block:: bash
java -cp pcat-deploy.jar:$(hadoop classpath) com.twitter.ml.tool.pcat.PredictionCatTool
-path /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data/YYYY/MM/DD/01/part-00000.lzo
-fc /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data_spec.json
-dates YYYY-MM-DDT01 -record_limit 100 | grep [feature_group]
7. Create a phab with the new features and test the performance of a model with them compared to a control model without them. Test offline using `Deepbird for training <https://docbird.twitter.biz/tq_gcp_guide/deepbird.html>`_ to train and `RCE Hypothesis Testing <https://docbird.twitter.biz/Timelines_Deepbird_v2/training.html#model-evaluation-rce-hypothesis-testing>`_ to test. Test online using a DDG. Some helpful instructions are available in `Serving Timelines Models <https://docbird.twitter.biz/timelines_deepbird_v2/serving.html>`_ and the `Experiment Cookbook <https://docs.google.com/document/d/1FTaqd_XOzdTppzePeipLhAgYA9hercN5a_SyQXbuGws/edit#>`_.
Capacity Estimation
--------------------------------
This section describes how to approximate the capacity required for a new aggregate group. It is not expected to be exact, but should give a rough estimate.
There are two main components that must be stored for each aggregate group.
Key space: Each AggregationKey struct consists of two maps, one of which is populated with tuples [Long, Long] representing <featureId, value> of discrete features. This takes up 4 x 8 bytes or 32 bytes. The cache team estimates an additional 40 bytes of overhead.
Features: An aggregate feature is represented as a <Long, Double> pair (16 bytes) and is produced for each feature x label x metric x halflife combination.
1. Use bigquery to estimate how many unique values exist for the selected key (key_count). Also collect the number of features, labels, metrics, and half-lives being used.
2. Compute the number of entries to be created, which is num_entries = feature_count * label_count * metric_count * halflife_count
3. Compute the number of bytes per entry, which is num_entry_bytes = 16*num_entries + 32 bytes (key storage) + 40 bytes (overhead)
4. Compute total space required = num_entry_bytes * key_count
Debugging New Aggregate Features
--------------------------------
To debug problems in the setup of your job, there are several steps you can take.
First, ensure that data is being received from the input stream and passed through to create data records. This can be achieved by logging results at various places in your code, and especially at the point of data record creation.
For example, suppose you want to ensure that a data record is being created with
the features you expect. With push and email features, we find that data records
are created in the adaptor, using logic like the following:
.. code-block:: scala
val record = new SRichDataRecord(new DataRecord)
...
record.setFeatureValue(feature, value)
To see what these feature values look like, we can have our adaptor class extend
Twitter's `Logging` trait, and write each created record to a log file.
.. code-block:: scala
class MyEventAdaptor extends TimelinesAdapterBase[MyObject] with Logging {
...
...
def mkDataRecord(myFeatures: MyFeatures): DataRecord = {
val record = new SRichDataRecord(new DataRecord)
...
record.setFeatureValue(feature, value)
logger.info("data record xyz: " + record.getRecord.toString)
}
This way, every time a data record is sent to the aggregator, it will also be
logged. To inspect these logs, you can push these changes to a staging instance,
ssh into that aurora instance, and grep the `log-files` directory for `xyz`. The
data record objects you find should resemble a map from feature ids to their
values.
To check that steps in the aggregation are being performed, you can also inspect the job's topology on go/heronui.
Lastly, to verify that values are being written to your cache you can check the `set` chart in your cache's viz.
To check particular feature values for a given key, you can spin up a Scala REPL like so:
.. code-block:: bash
$ ssh -fN -L*:2181:sdzookeeper-read.atla.twitter.com:2181 -D *:50001 nest.atlc.twitter.com
$ ./pants repl --jvm-repl-scala-options='-DsocksProxyHost=localhost -DsocksProxyPort=50001 -Dcom.twitter.server.resolverZkHosts=localhost:2181' timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/real_time_aggregates_cache
You will then need to create a connection to the cache, and a key with which to query it.
.. code-block:: scala
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.{DefaultStatsReceiver, StatsReceiver}
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.summingbird.batch.Batcher
import com.twitter.timelinemixer.clients.real_time_aggregates_cache.RealTimeAggregatesMemcacheBuilder
import com.twitter.timelines.clients.memcache_common.StorehausMemcacheConfig
val userFeature = -1887718638306251279L // feature id corresponding to User feature
val userId = 12L // replace with a user id logged when creating your data record
val key = (AggregationKey(Map(userFeature -> userId), Map.empty), Batcher.unit.currentBatch)
val dataset = "twemcache_magicrecs_real_time_aggregates_cache_staging" // replace with the appropriate cache name
val dest = s"/srv#/test/local/cache/twemcache_/$dataset"
val statsReceiver: StatsReceiver = DefaultStatsReceiver
val cache = new RealTimeAggregatesMemcacheBuilder(
config = StorehausMemcacheConfig(
destName = dest,
keyPrefix = "",
requestTimeout = 10.seconds,
numTries = 1,
globalTimeout = 10.seconds,
tcpConnectTimeout = 10.seconds,
connectionAcquisitionTimeout = 10.seconds,
numPendingRequests = 250,
isReadOnly = true
),
statsReceiver.scope(dataset)
).build
val result = cache.get(key)
Another option is to create a debugger which points to the staging cache and creates a cache connection and key similar to the logic above.
Run CQL query to find metrics/counters
--------------------------------------
We can also visualize the counters from our job to verify new features. Run CQL query on terminal to find the right path of metrics/counters. For example, in order to check counter mergeNumFeatures, run:
cql -z atla keys heron/summingbird_timelines_real_time_aggregates Tail-FlatMap | grep mergeNumFeatures
Then use the right path to create the viz, example: https://monitoring.twitter.biz/tiny/2552105

View File

@ -1,117 +0,0 @@
.. _troubleshooting:
TroubleShooting
==================
[Batch] Regenerating a corrupt version
--------------------------------------
Symptom
~~~~~~~~~~
The Summingbird batch job failed due to the following error:
.. code:: bash
Caused by: com.twitter.bijection.InversionFailure: ...
It typically indicates corrupt records in the aggregate store (not the other side of the DataRecord source).
The following describes the method to re-generate the required (typically the latest) version:
Solution
~~~~~~~~~~
1. Copy **the second to last version** of the problematic data to canaries folder. For example, if 11/20's job keeps failing, then copy the 11/19's data.
.. code:: bash
$ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ \
distcp -m 1000 \
/atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605744000000 \
/atla/proc2/user/timelines/canaries/processed/aggregates_v2/user_mention_aggregates/1605744000000
2. Setup canary run for the date of the problem with fallback path pointing to `1605744000000` in the prod/canaries folder.
3. Deschedule the production job and kill the current run:
For example,
.. code:: bash
$ aurora cron deschedule atla/timelines/prod/user_mention_aggregates
$ aurora job killall atla/timelines/prod/user_mention_aggregates
4. Create backup folder and move the corrupt prod store output there
.. code:: bash
$ hdfs dfs -mkdir /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup
$ hdfs dfs -mv /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/
$ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000
1 1001 10829136677614 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000
5. Copy canary output store to prod folder:
.. code:: bash
$ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ distcp -m 1000 /atla/proc2/user/timelines/canaries/processed/aggregates_v2/user_mention_aggregates/1605830400000 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000
We can see the slight difference of size:
.. code:: bash
$ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000
1 1001 10829136677614 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates_backup/1605830400000
$ hadoop fs -count /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000
1 1001 10829136677844 /atla/proc2/user/timelines/processed/aggregates_v2/user_mention_aggregates/1605830400000
6. Deploy prod job again and observe whether it can successfully process the new output for the date of interest.
7. Verify the new run succeeded and job is unblocked.
Example
~~~~~~~~
There is an example in https://phabricator.twitter.biz/D591174
[Batch] Skipping the offline job ahead
---------------------------------------
Symptom
~~~~~~~~~~
The Summingbird batch job keeps failing and the DataRecord source is no longer available (e.g. due to retention) and there is no way for the job to succeed **OR**
..
The job is stuck processing old data (more than one week old) and it will not catch up to the new data on its own if it is left alone
Solution
~~~~~~~~
We will need to skip the job ahead. Unfortunately, this involves manual effort. We also need help from the ADP team (Slack #adp).
1. Ask the ADP team to manually insert an entry into the store via the #adp Slack channel. You may refer to https://jira.twitter.biz/browse/AIPIPE-7520 and https://jira.twitter.biz/browse/AIPIPE-9300 as references. However, please don't create and assign tickets directly to an ADP team member unless they ask you to.
2. Copy the latest version of the store to the same HDFS directory but with a different destination name. The name MUST be the same as the above inserted version.
For example, if the ADP team manually inserted a version on 12/09/2020, then we can see the version by running
.. code:: bash
$ dalv2 segment list --name user_original_author_aggregates --role timelines --location-name proc2-atla --location-type hadoop-cluster
...
None 2020-12-09T00:00:00Z viewfs://hadoop-proc2-nn.atla.twitter.com/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1607472000000 Unknown None
where `1607472000000` is the timestamp of 12/09/2020.
Then you will need to duplicate the latest version of the store to a dir of `1607472000000`.
For example,
.. code:: bash
$ hadoop --config /etc/hadoop/hadoop-conf-proc2-atla/ distcp -m 1000 /atla/proc2/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1605052800000 /atla/proc2/user/timelines/processed/aggregates_v2/user_original_author_aggregates/1607472000000
3. Go to the EagleEye UI of the job and click on the "Skip Ahead" button to the desired datetime. In our example, it should be `2020-12-09 12am`
4. Wait for the job to start. Now the job should be running the 2020-12-09 partition.

View File

@ -1,74 +0,0 @@
# Main library target for this directory: compiles every Scala source matched by
# the "*.scala" glob and links it against the Heron/Storm, Summingbird, storehaus
# and aggregation-framework dependencies listed below.
# NOTE(review): no explicit `name` is given; presumably the build tool defaults it
# to the directory name - confirm.
# NOTE(review): the glob also matches the files listed explicitly in the `:configs`
# target of this file, which this target depends on - confirm that is intended.
scala_library(
sources = ["*.scala"],
platform = "java8",
# Strict dependency checking is turned off for this target only; the two
# targets in this file that list their sources explicitly set it to True.
strict_deps = False,
tags = ["bazel-compatible"],
dependencies = [
# Sibling target in this file holding the configuration classes.
":configs",
"3rdparty/jvm/storm:heron-oss-storm",
"3rdparty/src/jvm/com/twitter/scalding:args",
"3rdparty/src/jvm/com/twitter/summingbird:storm",
"src/java/com/twitter/heron/util",
"src/java/com/twitter/ml",
"src/scala/com/twitter/storehaus_internal/nighthawk_kv",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/scala/com/twitter/summingbird_internal/runner/common",
"src/scala/com/twitter/summingbird_internal/runner/storm",
"src/scala/com/twitter/timelines/prediction/features/common",
"timelines/data_processing/ml_util/aggregation_framework:user_job",
],
)
# `configs`: the store/job configuration classes and traits of this package,
# listed file by file so they can be depended on without pulling in the whole
# "*.scala" library above.
scala_library(
name = "configs",
sources = [
"NighthawkUnderlyingStoreConfig.scala",
# NOTE(review): this file is also the only source of `:base-config`, which this
# target depends on - confirm the duplication is intended.
"OnlineAggregationConfigTrait.scala",
"OnlineAggregationStoresTrait.scala",
"RealTimeAggregateStore.scala",
"RealTimeAggregatesJobConfig.scala",
"StormAggregateSource.scala",
],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
# Sibling target in this file.
":base-config",
"3rdparty/jvm/storm:heron-oss-storm",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"3rdparty/src/jvm/com/twitter/summingbird:storm",
"finagle/finagle-core/src/main",
"src/java/com/twitter/ml/api:api-base",
"src/scala/com/twitter/storehaus_internal/memcache",
"src/scala/com/twitter/storehaus_internal/memcache/config",
"src/scala/com/twitter/storehaus_internal/nighthawk_kv",
"src/scala/com/twitter/storehaus_internal/nighthawk_kv/config",
"src/scala/com/twitter/storehaus_internal/online",
"src/scala/com/twitter/storehaus_internal/store",
"src/scala/com/twitter/storehaus_internal/util",
"src/scala/com/twitter/summingbird_internal/runner/store_config",
"src/thrift/com/twitter/clientapp/gen:clientapp-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:data-scala",
"src/thrift/com/twitter/ml/api:feature_context-java",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/transforms",
"util/util-core:scala",
"util/util-core:util-core-util",
"util/util-stats/src/main/scala/com/twitter/finagle/stats",
],
)
# `base-config`: the smallest target in this file - just the
# OnlineAggregationConfigTrait source with the two dependencies it lists, so
# that the trait can be depended on without the store/Storm dependencies of
# `:configs`.
scala_library(
name = "base-config",
sources = [
"OnlineAggregationConfigTrait.scala",
],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"src/java/com/twitter/ml/api:api-base",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
],
)

View File

@ -1,31 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.ssl.OpportunisticTls
import com.twitter.storehaus_internal.nighthawk_kv.CacheClientNighthawkConfig
import com.twitter.storehaus_internal.util.TTL
import com.twitter.storehaus_internal.util.TableName
import com.twitter.summingbird_internal.runner.store_config.OnlineStoreOnlyConfig
import com.twitter.util.Duration
/**
 * Settings needed to build a [[CacheClientNighthawkConfig]] for a Nighthawk table.
 *
 * @param serversetPath serverset path handed to the Nighthawk client config (empty by default)
 * @param tableName     table name, wrapped in a [[TableName]] when the client config is built
 * @param cacheTTL      time-to-live, wrapped in a [[TTL]] when the client config is built
 */
case class NighthawkUnderlyingStoreConfig(
  serversetPath: String = "",
  tableName: String = "",
  cacheTTL: Duration = 1.day)
    extends OnlineStoreOnlyConfig[CacheClientNighthawkConfig] {

  /** Client config built with [[EmptyServiceIdentifier]], i.e. without a service identity. */
  def online: CacheClientNighthawkConfig = online(EmptyServiceIdentifier)

  /**
   * Client config for the given service identity. Opportunistic TLS is always set to
   * `Required`, whichever identifier is passed.
   */
  def online(
    serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier
  ): CacheClientNighthawkConfig = {
    val table = TableName(tableName)
    val timeToLive = TTL(cacheTTL)
    CacheClientNighthawkConfig(
      serversetPath,
      table,
      timeToLive,
      serviceIdentifier = serviceIdentifier,
      opportunisticTlsLevel = OpportunisticTls.Required
    )
  }
}

View File

@ -1,28 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.ml.api.Feature
/**
 * Declares the aggregate groups an online aggregation job works with and derives,
 * from the production groups, the features that callers read back.
 */
trait OnlineAggregationConfigTrait {
  def ProdAggregates: Set[TypedAggregateGroup[_]]
  def StagingAggregates: Set[TypedAggregateGroup[_]]
  def ProdCommonAggregates: Set[TypedAggregateGroup[_]]

  /**
   * AggregatesToCompute: the complete set of aggregates to be computed by the
   * aggregation job and to be stored in memcache.
   */
  def AggregatesToCompute: Set[TypedAggregateGroup[_]]

  /**
   * ProdFeatures: the subset of aggregates to be extracted and hydrated (or adapted)
   * by callers to the aggregates features cache. This should only contain production
   * aggregates and aggregates on product specific engagements. Computed lazily as
   * every output feature of every group in `ProdAggregates`.
   */
  lazy val ProdFeatures: Set[Feature[_]] =
    ProdAggregates.flatMap(group => group.allOutputFeatures)

  /**
   * ProdCommonFeatures: similar to ProdFeatures but containing user-level aggregate
   * features. This is provided to PredictionService just once per user. Computed
   * lazily as every output feature of every group in `ProdCommonAggregates`.
   */
  lazy val ProdCommonFeatures: Set[Feature[_]] =
    ProdCommonAggregates.flatMap(group => group.allOutputFeatures)
}

View File

@ -1,6 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
/**
 * Declares the two [[RealTimeAggregateStore]]s an online aggregation setup exposes.
 */
trait OnlineAggregationStoresTrait {
// Store for production use - presumably one built with isProd = true; confirm in implementations.
def ProductionStore: RealTimeAggregateStore
// Store for staging use - presumably one built with isProd = false; confirm in implementations.
def StagingStore: RealTimeAggregateStore
}

View File

@ -1,50 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.storehaus_internal.memcache.ConnectionConfig
import com.twitter.storehaus_internal.memcache.MemcacheConfig
import com.twitter.storehaus_internal.util.KeyPrefix
import com.twitter.storehaus_internal.util.TTL
import com.twitter.storehaus_internal.util.ZkEndPoint
import com.twitter.summingbird_internal.runner.store_config.OnlineStoreOnlyConfig
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore
import com.twitter.util.Duration
/** Helpers for building the memcache endpoint path used by [[RealTimeAggregateStore]]. */
object RealTimeAggregateStore {
  val twCacheWilyPrefix = "/srv#" // s2s is only supported for wily path

  /**
   * Builds the endpoint path `<prefix>/<env>/local/cache/<memcacheDataSet>`.
   *
   * @param memcacheDataSet   name of the memcache data set (last path segment)
   * @param isProd            selects the `prod` environment segment when true, `test` otherwise
   * @param twCacheWilyPrefix leading path prefix; defaults to the object's `twCacheWilyPrefix`
   * @return the slash-joined endpoint path
   */
  def makeEndpoint(
    memcacheDataSet: String,
    isProd: Boolean,
    twCacheWilyPrefix: String = twCacheWilyPrefix
  ): String = {
    val pathSegments = Seq(
      twCacheWilyPrefix,
      if (isProd) "prod" else "test",
      "local",
      "cache",
      memcacheDataSet
    )
    pathSegments.mkString("/")
  }
}
/**
 * Memcache-backed [[AggregateStore]] description for real-time aggregates.
 *
 * @param memcacheDataSet memcache data set name; becomes the last segment of the endpoint path
 * @param isProd          whether the endpoint path uses the `prod` (true) or `test` (false) segment
 * @param cacheTTL        time-to-live applied to the memcache config
 */
case class RealTimeAggregateStore(
  memcacheDataSet: String,
  isProd: Boolean = false,
  cacheTTL: Duration = 1.day)
    extends OnlineStoreOnlyConfig[MemcacheConfig]
    with AggregateStore {

  // `name` must be initialised before `storeKeyPrefix`, which is derived from it.
  override val name: String = ""
  val storeKeyPrefix: KeyPrefix = KeyPrefix(name)
  val memcacheZkEndPoint: String =
    RealTimeAggregateStore.makeEndpoint(memcacheDataSet, isProd)

  /** Memcache config built with [[EmptyServiceIdentifier]]. */
  def online: MemcacheConfig = online(serviceIdentifier = EmptyServiceIdentifier)

  /** Memcache config whose connection carries the given service identifier. */
  def online(serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier): MemcacheConfig = {
    // Round-trips the TTL through whole milliseconds, exactly as before.
    val ttlDuration = Duration.fromMilliseconds(cacheTTL.inMillis)
    new MemcacheConfig {
      val endpoint = ZkEndPoint(memcacheZkEndPoint)
      override val connectionConfig =
        ConnectionConfig(endpoint, serviceIdentifier = serviceIdentifier)
      override val keyPrefix = storeKeyPrefix
      override val ttl = TTL(ttlDuration)
    }
  }
}

View File

@ -1,301 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.algebird.Monoid
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.heron.util.CommonMetric
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.Args
import com.twitter.storehaus.algebra.MergeableStore
import com.twitter.storehaus.algebra.StoreAlgebra._
import com.twitter.storehaus_internal.memcache.Memcache
import com.twitter.storehaus_internal.store.CombinedStore
import com.twitter.storehaus_internal.store.ReplicatingWritableStore
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.online.MergeableStoreFactory
import com.twitter.summingbird.online.option._
import com.twitter.summingbird.option.CacheSize
import com.twitter.summingbird.option.JobId
import com.twitter.summingbird.storm.option.FlatMapStormMetrics
import com.twitter.summingbird.storm.option.SummerStormMetrics
import com.twitter.summingbird.storm.Storm
import com.twitter.summingbird.storm.StormMetric
import com.twitter.summingbird.Options
import com.twitter.summingbird._
import com.twitter.summingbird_internal.runner.common.CapTicket
import com.twitter.summingbird_internal.runner.common.JobName
import com.twitter.summingbird_internal.runner.common.TeamEmail
import com.twitter.summingbird_internal.runner.common.TeamName
import com.twitter.summingbird_internal.runner.storm.ProductionStormConfig
import com.twitter.timelines.data_processing.ml_util.aggregation_framework._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.AggregatesV2Job
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.job.DataRecordFeatureCounter
import org.apache.heron.api.{Config => HeronConfig}
import org.apache.heron.common.basics.ByteAmount
import org.apache.storm.Config
import scala.collection.JavaConverters._
// Metrics shared by every RealTimeAggregatesJobBase job. All three values are lazy,
// so nothing is constructed until a job first reads them.
object RealTimeAggregatesJobBase {
// A single Heron CommonMetric registered under CommonMetric.NAME and polled at
// CommonMetric.POLL_INTERVAL.
lazy val commonMetric: StormMetric[CommonMetric] =
StormMetric(new CommonMetric(), CommonMetric.NAME, CommonMetric.POLL_INTERVAL)
// The same metric wrapped as the FlatMap and Summer metric options; both are set
// on the "DEFAULT" named options of the job.
lazy val flatMapMetrics: FlatMapStormMetrics = FlatMapStormMetrics(Iterable(commonMetric))
lazy val summerMetrics: SummerStormMetrics = SummerStormMetrics(Iterable(commonMetric))
}
// Base trait for a real-time aggregates Storm/Heron job. Implementors supply the
// stats receiver, the aggregate groups to compute and the per-environment job
// configs; `apply` turns command-line Args into a ProductionStormConfig whose graph
// is produced by AggregatesV2Job.generateJobGraph.
trait RealTimeAggregatesJobBase extends Serializable {
import RealTimeAggregatesJobBase._
import com.twitter.summingbird_internal.bijection.BatchPairImplicits._
// Passed to the Nighthawk re-indexing stores and to StormAggregateSource.build.
def statsReceiver: StatsReceiver
// Aggregate groups the job graph is generated for; also used to build the monoid below.
def aggregatesToCompute: Set[TypedAggregateGroup[_]]
// Provides the Prod and Devel job configs; `apply` picks one based on the "production" arg.
def jobConfigs: RealTimeAggregatesJobConfigs
// DataRecord <-> bytes codec using compact Thrift encoding.
implicit lazy val dataRecordCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
// Monoid used to merge DataRecords, built from the aggregate groups to compute.
implicit lazy val monoid: Monoid[DataRecord] = DataRecordAggregationMonoid(aggregatesToCompute)
implicit lazy val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
// Accepted values for the "cluster" argument; `apply` rejects anything else via `require`.
val clusters: Set[String] = Set("atla", "pdxa")
// Returns a function that maps an AggregateStore to a Storm store factory.
// Only a RealTimeAggregateStore whose `isProd` equals the given `isProd` yields Some;
// every other store yields None.
def buildAggregateStoreToStorm(
isProd: Boolean,
serviceIdentifier: ServiceIdentifier,
jobConfig: RealTimeAggregatesJobConfig
): (AggregateStore => Option[Storm#Store[AggregationKey, DataRecord]]) = {
(store: AggregateStore) =>
store match {
case rtaStore: RealTimeAggregateStore if rtaStore.isProd == isProd => {
// Memcache store built from the store's online config for this service identifier.
lazy val primaryStore: MergeableStore[(AggregationKey, BatchID), DataRecord] =
Memcache.getMemcacheStore[(AggregationKey, BatchID), DataRecord](
rtaStore.online(serviceIdentifier))
// When either re-indexing flag is set, writes are replicated to the primary store
// plus each enabled Nighthawk store, while reads still come from the primary store
// only. Otherwise the primary store is used directly.
lazy val mergeableStore: MergeableStore[(AggregationKey, BatchID), DataRecord] =
if (jobConfig.enableUserReindexingNighthawkBtreeStore
|| jobConfig.enableUserReindexingNighthawkHashStore) {
// One-element list when the btree store is enabled, Nil otherwise.
val reindexingNighthawkBtreeWritableDataRecordStoreList =
if (jobConfig.enableUserReindexingNighthawkBtreeStore) {
lazy val cacheClientNighthawkConfig =
jobConfig.userReindexingNighthawkBtreeStoreConfig.online(serviceIdentifier)
List(
UserReindexingNighthawkWritableDataRecordStore.getBtreeStore(
nighthawkCacheConfig = cacheClientNighthawkConfig,
// Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp)
// keys that are returned on read on the pKey, and we may have duplicate authors and associated records.
targetSize = 512,
statsReceiver = statsReceiver,
// Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on
// 10% of all writes.
trimRate = 0.1
))
} else { Nil }
// One-element list when the hash store is enabled, Nil otherwise.
val reindexingNighthawkHashWritableDataRecordStoreList =
if (jobConfig.enableUserReindexingNighthawkHashStore) {
lazy val cacheClientNighthawkConfig =
jobConfig.userReindexingNighthawkHashStoreConfig.online(serviceIdentifier)
List(
UserReindexingNighthawkWritableDataRecordStore.getHashStore(
nighthawkCacheConfig = cacheClientNighthawkConfig,
// Choose a reasonably large target size as this will be equivalent to the number of unique (user, timestamp)
// keys that are returned on read on the pKey, and we may have duplicate authors and associated records.
targetSize = 512,
statsReceiver = statsReceiver,
// Assuming trims are relatively expensive, choose a trimRate that's not as aggressive. In this case we trim on
// 10% of all writes.
trimRate = 0.1
))
} else { Nil }
lazy val replicatingWritableStore = new ReplicatingWritableStore(
stores = List(primaryStore) ++ reindexingNighthawkBtreeWritableDataRecordStoreList
++ reindexingNighthawkHashWritableDataRecordStoreList
)
lazy val combinedStoreWithReindexing = new CombinedStore(
read = primaryStore,
write = replicatingWritableStore
)
combinedStoreWithReindexing.toMergeable
} else {
primaryStore
}
// Wrap the mergeable store as a Storm store using the unit batcher.
lazy val storeFactory: MergeableStoreFactory[(AggregationKey, BatchID), DataRecord] =
Storm.store(mergeableStore)(Batcher.unit)
Some(storeFactory)
}
case _ => None
}
}
// Returns a function that maps an AggregateSource to a Storm producer of DataRecords.
// Only a StormAggregateSource yields Some (by calling its `build`); any other source
// yields None.
def buildDataRecordSourceToStorm(
jobConfig: RealTimeAggregatesJobConfig
): (AggregateSource => Option[Producer[Storm, DataRecord]]) = { (source: AggregateSource) =>
{
source match {
case stormAggregateSource: StormAggregateSource =>
Some(stormAggregateSource.build(statsReceiver, jobConfig))
case _ => None
}
}
}
// Builds the topology config from command-line args.
// Args read here: "production", "cluster", "debug", "role", "service_name", "s2s",
// "keyed_by_user", "keyed_by_author".
// Fails with IllegalArgumentException (via `require`) when "cluster" is not one of
// `clusters`, or when "s2s" is set and "role" or "service_name" is empty.
def apply(args: Args): ProductionStormConfig = {
// Everything below is lazy, so each arg is only parsed when first needed.
lazy val isProd = args.boolean("production")
lazy val cluster = args.getOrElse("cluster", "")
lazy val isDebug = args.boolean("debug")
lazy val role = args.getOrElse("role", "")
lazy val service =
args.getOrElse(
"service_name",
""
) // don't use the argument service, which is a reserved heron argument
lazy val environment = if (isProd) "prod" else "devel"
lazy val s2sEnabled = args.boolean("s2s")
lazy val keyedByUserEnabled = args.boolean("keyed_by_user")
lazy val keyedByAuthorEnabled = args.boolean("keyed_by_author")
require(clusters.contains(cluster))
if (s2sEnabled) {
require(role.length() > 0)
require(service.length() > 0)
}
// A real service identity is only constructed when s2s is enabled.
lazy val serviceIdentifier = if (s2sEnabled) {
ServiceIdentifier(
role = role,
service = service,
environment = environment,
zone = cluster
)
} else EmptyServiceIdentifier
// Start from the Prod or Devel config and overlay the values derived from args.
lazy val jobConfig = {
val jobConfig = if (isProd) jobConfigs.Prod else jobConfigs.Devel
jobConfig.copy(
serviceIdentifier = serviceIdentifier,
keyedByUserEnabled = keyedByUserEnabled,
keyedByAuthorEnabled = keyedByAuthorEnabled)
}
lazy val dataRecordSourceToStorm = buildDataRecordSourceToStorm(jobConfig)
lazy val aggregateStoreToStorm =
buildAggregateStoreToStorm(isProd, serviceIdentifier, jobConfig)
// JVM flags pointing at the JAAS login config; the Kerberos debug flag is appended
// only when the "debug" arg is set.
lazy val JaasConfigFlag = "-Djava.security.auth.login.config=resources/jaas.conf"
lazy val JaasDebugFlag = "-Dsun.security.krb5.debug=true"
lazy val JaasConfigString =
if (isDebug) { "%s %s".format(JaasConfigFlag, JaasDebugFlag) }
else JaasConfigFlag
new ProductionStormConfig {
implicit val jobId: JobId = JobId(jobConfig.name)
override val jobName = JobName(jobConfig.name)
override val teamName = TeamName(jobConfig.teamName)
override val teamEmail = TeamEmail(jobConfig.teamEmail)
override val capTicket = CapTicket("n/a")
// Heron settings (per-component RAM, container RAM, per-component JVM options)
// collected into a map, plus one "extras.aggregateNames.<prefix>" entry per
// aggregate-name prefix. Merged into the topology config by transformConfig.
val configureHeronJvmSettings = {
val heronJvmOptions = new java.util.HashMap[String, AnyRef]()
jobConfig.componentToRamGigaBytesMap.foreach {
case (component, gigabytes) =>
HeronConfig.setComponentRam(
heronJvmOptions,
component,
ByteAmount.fromGigabytes(gigabytes))
}
HeronConfig.setContainerRamRequested(
heronJvmOptions,
ByteAmount.fromGigabytes(jobConfig.containerRamGigaBytes)
)
// Components to kerberize get the JAAS flags as their JVM options.
jobConfig.componentsToKerberize.foreach { component =>
HeronConfig.setComponentJvmOptions(
heronJvmOptions,
component,
JaasConfigString
)
}
// NOTE(review): a component present both here and in componentsToKerberize gets
// setComponentJvmOptions called twice - confirm how Heron combines the two calls.
jobConfig.componentToMetaSpaceSizeMap.foreach {
case (component, metaspaceSize) =>
HeronConfig.setComponentJvmOptions(
heronJvmOptions,
component,
metaspaceSize
)
}
heronJvmOptions.asScala.toMap ++ AggregatesV2Job
.aggregateNames(aggregatesToCompute).map {
case (prefix, aggNames) => (s"extras.aggregateNames.${prefix}", aggNames)
}
}
// Adds the topology-level settings below, then the Heron settings above, on top of
// the parent's config; later entries win on duplicate keys.
override def transformConfig(m: Map[String, AnyRef]): Map[String, AnyRef] = {
super.transformConfig(m) ++ List(
/**
* Disable acking by setting acker executors to 0. Tuples that come off the
* spout will be immediately acked which effectively disables retries on tuple
* failures. This should help topology throughput/availability by relaxing consistency.
*/
Config.TOPOLOGY_ACKER_EXECUTORS -> int2Integer(0),
Config.TOPOLOGY_WORKERS -> int2Integer(jobConfig.topologyWorkers),
HeronConfig.TOPOLOGY_CONTAINER_CPU_REQUESTED -> int2Integer(8),
HeronConfig.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE -> java.lang.Boolean.valueOf(true),
HeronConfig.TOPOLOGY_WORKER_CHILDOPTS -> List(
JaasConfigString,
s"-Dcom.twitter.eventbus.client.zoneName=${cluster}",
"-Dcom.twitter.eventbus.client.EnableKafkaSaslTls=true"
).mkString(" "),
"storm.job.uniqueId" -> jobId.get
) ++ configureHeronJvmSettings
}
// NOTE(review): the right-hand Map wins on key collisions, so any "DEFAULT",
// "FLATMAP" or "SUMMER" entry in jobConfig.topologyNamedOptions is replaced by the
// one defined here - confirm that is intended.
override lazy val getNamedOptions: Map[String, Options] = jobConfig.topologyNamedOptions ++
Map(
"DEFAULT" -> Options()
.set(flatMapMetrics)
.set(summerMetrics)
.set(MaxWaitingFutures(1000))
.set(FlushFrequency(30.seconds))
.set(UseAsyncCache(true))
.set(AsyncPoolSize(4))
.set(SourceParallelism(jobConfig.sourceCount))
.set(SummerBatchMultiplier(1000)),
"FLATMAP" -> Options()
.set(FlatMapParallelism(jobConfig.flatMapCount))
.set(CacheSize(0)),
"SUMMER" -> Options()
.set(SummerParallelism(jobConfig.summerCount))
/**
* Sets number of tuples a Summer awaits before aggregation. Set higher
* if you need to lower qps to memcache at the expense of introducing
* some (stable) latency.
*/
.set(CacheSize(jobConfig.cacheSize))
)
// A single counter, feature_counter/num_records, registered via DataRecordFeatureCounter.any.
val featureCounters: Seq[DataRecordFeatureCounter] =
Seq(DataRecordFeatureCounter.any(Counter(Group("feature_counter"), Name("num_records"))))
// The Summingbird graph for the job, wired with the source/store adapters built above.
override def graph: TailProducer[Storm, Any] = AggregatesV2Job.generateJobGraph[Storm](
aggregateSet = aggregatesToCompute,
aggregateSourceToSummingbird = dataRecordSourceToStorm,
aggregateStoreToSummingbird = aggregateStoreToStorm,
featureCounters = featureCounters
)
}
}
}

View File

@ -1,79 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.finagle.mtls.authentication.EmptyServiceIdentifier
import com.twitter.finagle.mtls.authentication.ServiceIdentifier
import com.twitter.ml.api.DataRecord
import com.twitter.summingbird.Options
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
/**
*
* @param appId application id for topology job
* @param topologyWorkers number of workers/containers of topology
* @param sourceCount number of parallel spouts of topology
* @param summerCount number of Summer of topology
* @param cacheSize number of tuples a Summer awaits before aggregation.
* @param flatMapCount number of parallel FlatMap of topology
* @param containerRamGigaBytes total RAM of each worker/container has
* @param name name of topology job
* @param teamName name of team who owns topology job
* @param teamEmail email of team who owns topology job
* @param componentsToKerberize component of topology job (eg. Tail-FlatMap-Source) which enables kerberization
* @param componentToMetaSpaceSizeMap MetaSpaceSize settings for components of topology job
* @param topologyNamedOptions Sets spout allocations for named topology components
* @param serviceIdentifier represents the identifier used for Service to Service Authentication
* @param onlinePreTransforms sequential data record transforms applied to Producer of DataRecord before creating AggregateGroup.
* While preTransforms defined at AggregateGroup are applied to each aggregate group, onlinePreTransforms are applied to the whole producer source.
* @param keyedByUserEnabled boolean value to enable/disable merging user-level features from Feature Store
* @param keyedByAuthorEnabled boolean value to enable/disable merging author-level features from Feature Store
* @param enableUserReindexingNighthawkBtreeStore boolean value to enable reindexing RTAs on user id with btree backed nighthawk
* @param enableUserReindexingNighthawkHashStore boolean value to enable reindexing RTAs on user id with hash backed nighthawk
* @param userReindexingNighthawkBtreeStoreConfig NH btree store config used in reindexing user RTAs
* @param userReindexingNighthawkHashStoreConfig NH hash store config used in reindexing user RTAs
*/
case class RealTimeAggregatesJobConfig(
  appId: String,
  topologyWorkers: Int,
  sourceCount: Int,
  summerCount: Int,
  cacheSize: Int,
  flatMapCount: Int,
  containerRamGigaBytes: Int,
  name: String,
  teamName: String,
  teamEmail: String,
  componentsToKerberize: Seq[String] = Seq.empty,
  componentToMetaSpaceSizeMap: Map[String, String] = Map.empty,
  componentToRamGigaBytesMap: Map[String, Int] = Map("Tail" -> 4),
  topologyNamedOptions: Map[String, Options] = Map.empty,
  serviceIdentifier: ServiceIdentifier = EmptyServiceIdentifier,
  onlinePreTransforms: Seq[OneToSomeTransform] = Seq.empty,
  keyedByUserEnabled: Boolean = false,
  keyedByAuthorEnabled: Boolean = false,
  keyedByTweetEnabled: Boolean = false,
  enableUserReindexingNighthawkBtreeStore: Boolean = false,
  enableUserReindexingNighthawkHashStore: Boolean = false,
  userReindexingNighthawkBtreeStoreConfig: NighthawkUnderlyingStoreConfig =
    NighthawkUnderlyingStoreConfig(),
  userReindexingNighthawkHashStoreConfig: NighthawkUnderlyingStoreConfig =
    NighthawkUnderlyingStoreConfig()) {

  /**
   * Runs `onlinePreTransforms` in order over a copy of the given record.
   *
   * Each transform receives the output of the previous one. As soon as one of them
   * drops the record (returns None) the result stays None for the rest of the
   * sequence, so the order of the transforms matters. With no transforms configured
   * the copy is returned unchanged.
   *
   * @param dataRecord input record; it is copied first, and the transforms are applied to the copy
   * @return the fully transformed record, or None if any transform dropped it
   */
  def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
    val initial: Option[DataRecord] = Option(new DataRecord(dataRecord))
    onlinePreTransforms.foldLeft(initial) { (current, preTransform) =>
      current.flatMap(record => preTransform(record))
    }
  }
}
/**
 * Supplies one [[RealTimeAggregatesJobConfig]] per environment. The job base trait
 * uses `Prod` when the "production" argument is set and `Devel` otherwise.
 */
trait RealTimeAggregatesJobConfigs {
// Config used when the job is started with the "production" argument.
def Prod: RealTimeAggregatesJobConfig
// Config used when the job is started without the "production" argument.
def Devel: RealTimeAggregatesJobConfig
}

View File

@ -1,27 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.summingbird._
import com.twitter.summingbird.storm.Storm
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource
import java.lang.{Long => JLong}
/**
* Use this trait to implement online summingbird producer that subscribes to
* spouts and generates a data record.
*/
trait StormAggregateSource extends AggregateSource {
// Name of this source.
def name: String
// Feature of boxed-Long type - presumably the one holding each record's timestamp; confirm
// against AggregateSource.
def timestampFeature: Feature[JLong]
/**
* Constructs the storm Producer with the implemented topology at runtime.
*
* @param statsReceiver stats receiver made available to the implementation
* @param jobConfig config of the job the producer is built for
* @return a Summingbird producer of DataRecords on the Storm platform
*/
def build(
statsReceiver: StatsReceiver,
jobConfig: RealTimeAggregatesJobConfig
): Producer[Storm, DataRecord]
}

View File

@ -1,309 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.cache.client._
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.storehaus.WritableStore
import com.twitter.storehaus_internal.nighthawk_kv.CacheClientNighthawkConfig
import com.twitter.storehaus_internal.nighthawk_kv.NighthawkStore
import com.twitter.summingbird.batch.BatchID
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.heron.UserReindexingNighthawkWritableDataRecordStore._
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import com.twitter.util.Future
import com.twitter.util.Time
import com.twitter.util.Try
import com.twitter.util.logging.Logger
import java.nio.ByteBuffer
import java.util
import scala.util.Random
/**
 * Factories and byte-level helpers shared by the user re-indexing Nighthawk stores.
 *
 * Values written by the hash-backed store are "timestamped": an 8-byte big-endian
 * timestamp followed by the serialized DataRecord. All ByteBuffer helpers here
 * operate on the readable window of a buffer (position until limit), never on its
 * capacity or on absolute index 0, so they also work for buffers that are views
 * into a larger backing array.
 */
object UserReindexingNighthawkWritableDataRecordStore {
  implicit val longInjection = Injection.long2BigEndian
  implicit val dataRecordInjection: Injection[DataRecord, Array[Byte]] =
    CompactThriftCodec[DataRecord]
  val arrayToByteBuffer = Injection.connect[Array[Byte], ByteBuffer]
  val longToByteBuffer = longInjection.andThen(arrayToByteBuffer)
  val dataRecordToByteBuffer = dataRecordInjection.andThen(arrayToByteBuffer)

  // Width of the timestamp prefix in BYTES. (java.lang.Long.SIZE is the width in
  // bits, 64, and must not be used as a byte count.)
  private val TimestampPrefixBytes: Int = java.lang.Long.BYTES

  /**
   * Creates the btree-backed re-indexing store (userId pkey, timestamp lkey).
   *
   * @param nighthawkCacheConfig client config; its table is also used as the trim table name
   * @param targetSize           size passed to trim requests
   * @param statsReceiver        receiver for the store's counters
   * @param trimRate             probability with which a put also issues a trim
   */
  def getBtreeStore(
    nighthawkCacheConfig: CacheClientNighthawkConfig,
    targetSize: Int,
    statsReceiver: StatsReceiver,
    trimRate: Double
  ): UserReindexingNighthawkBtreeWritableDataRecordStore =
    new UserReindexingNighthawkBtreeWritableDataRecordStore(
      nighthawkStore = NighthawkStore[UserId, TimestampMs, DataRecord](nighthawkCacheConfig)
        .asInstanceOf[NighthawkStore[UserId, TimestampMs, DataRecord]],
      tableName = nighthawkCacheConfig.table.toString,
      targetSize = targetSize,
      statsReceiver = statsReceiver,
      trimRate = trimRate
    )

  /**
   * Creates the hash-backed re-indexing store (userId pkey, authorId lkey).
   *
   * @param nighthawkCacheConfig client config; its table is also used as the scan table name
   * @param targetSize           number of secondary keys to keep per user when trimming
   * @param statsReceiver        receiver for the store's counters
   * @param trimRate             probability with which a put also scans and trims
   */
  def getHashStore(
    nighthawkCacheConfig: CacheClientNighthawkConfig,
    targetSize: Int,
    statsReceiver: StatsReceiver,
    trimRate: Double
  ): UserReindexingNighthawkHashWritableDataRecordStore =
    new UserReindexingNighthawkHashWritableDataRecordStore(
      nighthawkStore = NighthawkStore[UserId, AuthorId, DataRecord](nighthawkCacheConfig)
        .asInstanceOf[NighthawkStore[UserId, AuthorId, DataRecord]],
      tableName = nighthawkCacheConfig.table.toString,
      targetSize = targetSize,
      statsReceiver = statsReceiver,
      trimRate = trimRate
    )

  /**
   * Returns a new buffer holding `timestamp` (8 bytes, big-endian) followed by the
   * readable bytes of `bb`.
   *
   * The result is flipped, i.e. its position is 0 and its limit is the end of the
   * written data, so consumers that read from position to limit see exactly the
   * timestamp plus payload. The argument is read through a duplicate, so its own
   * position and limit are left untouched.
   */
  def buildTimestampedByteBuffer(timestamp: Long, bb: ByteBuffer): ByteBuffer = {
    val payload = bb.duplicate()
    val timestampedBb = ByteBuffer.allocate(TimestampPrefixBytes + getLength(payload))
    timestampedBb.putLong(timestamp)
    timestampedBb.put(payload)
    // Without the flip the readable window would be the (empty) tail of the buffer.
    timestampedBb.flip()
    timestampedBb
  }

  /**
   * Reads the timestamp prefix of a timestamped buffer without moving its position.
   * The read starts at the buffer's current position, which is where the readable
   * content of the buffer begins.
   */
  def extractTimestampFromTimestampedByteBuffer(bb: ByteBuffer): Long = {
    bb.getLong(bb.position())
  }

  /**
   * Returns a copy of the payload of a timestamped buffer, i.e. every readable byte
   * after the 8-byte timestamp prefix, in a fresh array-backed buffer. The argument's
   * position and limit are left untouched.
   *
   * @throws IllegalArgumentException if the buffer is too short to contain the prefix
   */
  def extractValueFromTimestampedByteBuffer(bb: ByteBuffer): ByteBuffer = {
    require(
      getLength(bb) >= TimestampPrefixBytes,
      s"timestamped buffer must hold at least $TimestampPrefixBytes bytes, got ${getLength(bb)}")
    val view = bb.duplicate()
    view.position(view.position() + TimestampPrefixBytes)
    val bytes = new Array[Byte](view.remaining())
    view.get(bytes)
    ByteBuffer.wrap(bytes)
  }

  /**
   * Serializes the given (authorId, DataRecord) pairs for one user into a single
   * KeyValue: the user id is the pkey, each author id an lkey, and each value is the
   * record prefixed with the current time in milliseconds.
   */
  def transformAndBuildKeyValueMapping(
    table: String,
    userId: UserId,
    authorIdsAndDataRecords: Seq[(AuthorId, DataRecord)]
  ): KeyValue = {
    // One timestamp is taken up front and shared by every value in this batch.
    val timestamp = Time.now.inMillis
    val pkey = longToByteBuffer(userId)
    val lkeysAndTimestampedValues = authorIdsAndDataRecords.map {
      case (authorId, dataRecord) =>
        val lkey = longToByteBuffer(authorId)
        // Create a byte buffer with a prepended timestamp to reduce deserialization cost
        // when parsing values. We only have to extract and deserialize the timestamp in the
        // ByteBuffer in order to sort the value, as opposed to deserializing the DataRecord
        // and having to get a timestamp feature value from the DataRecord.
        val dataRecordBb = dataRecordToByteBuffer(dataRecord)
        val timestampedValue = buildTimestampedByteBuffer(timestamp, dataRecordBb)
        (lkey, timestampedValue)
    }
    buildKeyValueMapping(table, pkey, lkeysAndTimestampedValues)
  }

  /**
   * Assembles a KeyValue from a pkey and parallel (lkey, value) pairs; the i-th lkey
   * of the key corresponds to the i-th value.
   */
  def buildKeyValueMapping(
    table: String,
    pkey: ByteBuffer,
    lkeysAndTimestampedValues: Seq[(ByteBuffer, ByteBuffer)]
  ): KeyValue = {
    val (lkeys, timestampedValues) = lkeysAndTimestampedValues.unzip
    KeyValue(
      key = Key(table = table, pkey = pkey, lkeys = lkeys),
      value = Value(timestampedValues)
    )
  }

  // Number of readable bytes (position until limit). capacity() is not used because
  // it over-counts for buffers that are views into a larger backing array.
  private def getLength(bb: ByteBuffer): Int = {
    bb.remaining()
  }
}
/**
* Implements a NH store that stores aggregate feature DataRecords using userId as the primary key.
*
* This store re-indexes user-author keyed real-time aggregate (RTA) features on userId by
* writing to a userId primary key (pkey) and timestamp secondary key (lkey). To fetch user-author
* RTAs for a given user from cache, the caller just needs to make a single RPC for the userId pkey.
* The downside of a re-indexing store is that we cannot store arbitrarily many secondary keys
* under the primary key. This specific implementation using the NH btree backend also mandates
* an ordering of secondary keys - we therefore use timestamp as the secondary key
* as opposed to say authorId.
*
* Note that a caller of the btree backed NH re-indexing store receives back a response where the
* secondary key is a timestamp. The associated value is a DataRecord containing user-author related
* aggregate features which was last updated at the timestamp. The caller therefore needs to handle
* the response and dedupe on unique, most recent user-author pairs.
*
* For a discussion on this and other implementations, please see:
* https://docs.google.com/document/d/1yVzAbQ_ikLqwSf230URxCJmSKj5yZr5dYv6TwBlQw18/edit
*/
class UserReindexingNighthawkBtreeWritableDataRecordStore(
  nighthawkStore: NighthawkStore[UserId, TimestampMs, DataRecord],
  tableName: String,
  targetSize: Int,
  statsReceiver: StatsReceiver,
  trimRate: Double = 0.1 // by default, trim on 10% of puts
) extends WritableStore[(AggregationKey, BatchID), Option[DataRecord]] {
  private val scope = getClass.getSimpleName
  private val failures = statsReceiver.counter(scope, "failures")
  private val log = Logger.getLogger(getClass)
  // Fixed seed: the sequence of trim decisions is the same on every run.
  private val random: Random = new Random(1729L)

  /**
   * Re-indexes one write under (userId, timestamp).
   *
   * Nothing is written unless the aggregation key carries a USER_ID, the value is
   * defined, and the record has a timestamp feature. With probability `trimRate` a
   * trim for the user's pkey is issued first and the put waits for it to complete.
   *
   * The returned Future is always already complete: the write is fire-and-forget,
   * and failures are only counted and logged.
   */
  override def put(kv: ((AggregationKey, BatchID), Option[DataRecord])): Future[Unit] = {
    val ((aggregationKey, _), dataRecordOpt) = kv
    // Fire-and-forget below because the store itself should just be a side effect
    // as it's just making re-indexed writes based on the writes to the primary store.
    for {
      userId <- aggregationKey.discreteFeaturesById.get(SharedFeatures.USER_ID.getFeatureId)
      dataRecord <- dataRecordOpt
      timestamp <- SRichDataRecord(dataRecord)
        .getFeatureValueOpt(TypedAggregateGroup.timestampFeature)
        .map(_.toLong) // convert to Scala Long
    } {
      val trim: Future[Unit] = if (random.nextDouble <= trimRate) {
        val trimKey = TrimKey(
          table = tableName,
          pkey = longToByteBuffer(userId),
          targetSize = targetSize,
          ascending = true
        )
        nighthawkStore.client.trim(Seq(trimKey)).unit
      } else {
        Future.Unit
      }
      // We should wait for trim to complete above
      val fireAndForget = trim.before {
        val kvTuple = ((userId, timestamp), Some(dataRecord))
        nighthawkStore.put(kvTuple)
      }
      fireAndForget.onFailure {
        case e =>
          failures.incr()
          log.error("Failure in UserReindexingNighthawkBtreeWritableDataRecordStore", e)
      }
    }
    // Ignore fire-and-forget result above and simply return
    Future.Unit
  }
}
/**
* Implements a NH store that stores aggregate feature DataRecords using userId as the primary key.
*
* This store re-indexes user-author keyed real-time aggregate (RTA) features on userId by
* writing to a userId primary key (pkey) and authorId secondary key (lkey). To fetch user-author
* RTAs for a given user from cache, the caller just needs to make a single RPC for the userId pkey.
* The downside of a re-indexing store is that we cannot store arbitrarily
* many secondary keys under the primary key. We have to limit them in some way;
* here, we do so by randomly (based on trimRate) issuing an HGETALL command (via scan) to
* retrieve the whole hash, sort by oldest timestamp, and then remove the oldest authors to keep
* only targetSize authors (aka trim), where targetSize is configurable.
*
* @note The full hash returned from scan could be as large (or even larger) than targetSize,
* which could mean many DataRecords to deserialize, especially at high write qps.
* To reduce deserialization cost post-scan, we use timestamped values with a prepended timestamp
* in the value ByteBuffer; this allows us to only deserialize the timestamp and not the full
* DataRecord when sorting. This is necessary in order to identify the oldest values to trim.
* When we do a put for a new (user, author) pair, we also write out timestamped values.
*
* For a discussion on this and other implementations, please see:
* https://docs.google.com/document/d/1yVzAbQ_ikLqwSf230URxCJmSKj5yZr5dYv6TwBlQw18/edit
*/
class UserReindexingNighthawkHashWritableDataRecordStore(
  nighthawkStore: NighthawkStore[UserId, AuthorId, DataRecord],
  tableName: String,
  targetSize: Int,
  statsReceiver: StatsReceiver,
  trimRate: Double = 0.1 // by default, trim on 10% of puts
) extends WritableStore[(AggregationKey, BatchID), Option[DataRecord]] {

  private val scope = getClass.getSimpleName
  // Incremented when a scan returns a different number of lkeys and values.
  private val scanMismatchErrors = statsReceiver.counter(scope, "scanMismatchErrors")
  // Incremented when the fire-and-forget scan/trim/put chain fails.
  private val failures = statsReceiver.counter(scope, "failures")
  private val log = Logger.getLogger(getClass)
  // Seeded with a constant, so the sequence of trim decisions is the same for every instance.
  private val random: Random = new Random(1729L)
  private val arrayToByteBuffer = Injection.connect[Array[Byte], ByteBuffer]
  private val longToByteBuffer = Injection.long2BigEndian.andThen(arrayToByteBuffer)

  /**
   * Re-indexes a single aggregate record under the (userId pkey, authorId lkey) layout.
   *
   * Nothing is written unless the aggregation key carries both a USER_ID and a
   * SOURCE_AUTHOR_ID discrete feature and the record is defined. With probability
   * `trimRate` the user's hash is first scanned and, if it holds more than `targetSize`
   * entries, the oldest entries are removed before the new value is put.
   *
   * @param kv ((aggregation key, batch id), optional record); the batch id is ignored
   * @return always `Future.Unit`; the write is fire-and-forget and its failures are only
   *         counted and logged
   */
  override def put(kv: ((AggregationKey, BatchID), Option[DataRecord])): Future[Unit] = {
    val ((aggregationKey, _), dataRecordOpt) = kv
    // Fire-and-forget below because the store itself should just be a side effect
    // as it's just making re-indexed writes based on the writes to the primary store.
    for {
      userId <- aggregationKey.discreteFeaturesById.get(SharedFeatures.USER_ID.getFeatureId)
      authorId <- aggregationKey.discreteFeaturesById.get(
        TimelinesSharedFeatures.SOURCE_AUTHOR_ID.getFeatureId)
      dataRecord <- dataRecordOpt
    } yield {
      val scanAndTrim: Future[Unit] = if (random.nextDouble <= trimRate) {
        val scanKey = ScanKey(
          table = tableName,
          pkey = longToByteBuffer(userId)
        )
        nighthawkStore.client.scan(Seq(scanKey)).flatMap { scanResults: Seq[Try[KeyValue]] =>
          scanResults.headOption
            .flatMap(_.toOption).map { keyValue: KeyValue =>
              val lkeys: Seq[ByteBuffer] = keyValue.key.lkeys
              // these are timestamped bytebuffers
              val timestampedValues: Seq[ByteBuffer] = keyValue.value.values
              // this should fail loudly if this is not true. it would indicate
              // there is a mistake in the scan.
              if (lkeys.size != timestampedValues.size) scanMismatchErrors.incr()
              assert(lkeys.size == timestampedValues.size)
              if (lkeys.size > targetSize) {
                // Number of entries above the cap. This must be positive here: the previous
                // `targetSize - lkeys.size` was always negative in this branch, and
                // `take` with a negative count selects nothing, so no key was ever trimmed.
                val numToRemove = lkeys.size - targetSize
                // sort by oldest and take top k oldest and remove - this is equivalent to a trim
                val oldestKeys: Seq[ByteBuffer] = lkeys
                  .zip(timestampedValues)
                  .map {
                    case (lkey, timestampedValue) =>
                      val timestamp = extractTimestampFromTimestampedByteBuffer(timestampedValue)
                      (timestamp, lkey)
                  }
                  .sortBy { case (timestamp, _) => timestamp }
                  .take(numToRemove)
                  .map { case (_, k) => k }
                val pkey = longToByteBuffer(userId)
                val key = Key(table = tableName, pkey = pkey, lkeys = oldestKeys)
                // NOTE: `remove` is a batch API, and we group all lkeys into a single batch (batch
                // size = single group of lkeys = 1). Instead, we could separate lkeys into smaller
                // groups and have batch size = number of groups, but this is more complex.
                // Performance implications of batching vs non-batching need to be assessed.
                nighthawkStore.client
                  .remove(Seq(key))
                  .map { responses =>
                    responses.map(resp => nighthawkStore.processValue(resp))
                  }.unit
              } else {
                Future.Unit
              }
            }.getOrElse(Future.Unit)
        }
      } else {
        Future.Unit
      }
      // We should wait for scan and trim to complete above
      val fireAndForget = scanAndTrim.before {
        val reindexedKv =
          transformAndBuildKeyValueMapping(tableName, userId, Seq((authorId, dataRecord)))
        nighthawkStore.client
          .put(Seq(reindexedKv))
          .map { responses =>
            responses.map(resp => nighthawkStore.processValue(resp))
          }.unit
      }
      fireAndForget.onFailure {
        case e =>
          failures.incr()
          log.error("Failure in UserReindexingNighthawkHashWritableDataRecordStore", e)
      }
    }
    // Ignore fire-and-forget result above and simply return
    Future.Unit
  }
}

View File

@ -1,8 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
package object heron {
  // Shared type aliases for the heron aggregation code. All three are plain Longs.
  // NOTE: please sort alphabetically
  type AuthorId = Long
  type TimestampMs = Long
  type UserId = Long
}

View File

@ -1,163 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.job
import com.twitter.algebird.Semigroup
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.DataRecordMerger
import com.twitter.summingbird.Platform
import com.twitter.summingbird.Producer
import com.twitter.summingbird.TailProducer
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateSource
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregateStore
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
object AggregatesV2Job {
  private lazy val merger = new DataRecordMerger

  /**
   * Merges all "incremental" records with the same aggregation key
   * into a single record.
   *
   * @param recordsPerKey A set of (AggregationKey, DataRecord) tuples
   *   known to share the same AggregationKey
   * @return A single merged datarecord (a fresh record; the inputs are only read)
   */
  def mergeRecords(recordsPerKey: Set[(AggregationKey, DataRecord)]): DataRecord =
    recordsPerKey.foldLeft(new DataRecord) {
      case (merged: DataRecord, (_, elem: DataRecord)) =>
        // `merge` mutates its first argument, so the accumulator is returned as-is.
        merger.merge(merged, elem)
        merged
    }

  /**
   * Given a set of aggregates to compute and a datarecord, extract key-value
   * pairs to output to the summingbird store.
   *
   * @param dataRecord input data record
   * @param aggregates set of aggregates to compute
   * @param featureCounters counters to apply to each computed aggregate record
   * @return computed aggregates
   */
  def computeAggregates(
    dataRecord: DataRecord,
    aggregates: Set[TypedAggregateGroup[_]],
    featureCounters: Seq[DataRecordFeatureCounter]
  ): Map[AggregationKey, DataRecord] = {
    // Build the merged map strictly. `mapValues` yields a lazy view that would re-run
    // `mergeRecords` on every traversal: once per feature counter below, and again
    // whenever the caller iterates the returned map.
    val computedAggregates: Map[AggregationKey, DataRecord] = aggregates
      .flatMap(_.computeAggregateKVPairs(dataRecord))
      .groupBy { case (aggregationKey: AggregationKey, _) => aggregationKey }
      .map { case (aggregationKey, records) => aggregationKey -> mergeRecords(records) }

    // Counters are pure side effects, so iterate with foreach rather than map.
    featureCounters.foreach { counter =>
      computedAggregates.values.foreach(record => DataRecordFeatureCounter(counter, record))
    }

    computedAggregates
  }

  /**
   * Util method to apply a filter on containment in an optional set.
   *
   * @param setOptional Optional set of items to check containment in.
   * @param toCheck Item to check if contained in set.
   * @return If the optional set is None, returns true; otherwise whether the
   *   set contains `toCheck`.
   */
  def setFilter[T](setOptional: Option[Set[T]], toCheck: T): Boolean =
    setOptional.forall(_.contains(toCheck))

  /**
   * Util for filtering a collection of `TypedAggregateGroup`
   *
   * @param aggregates a set of aggregates
   * @param sourceNames Optional filter on which AggregateGroups to process
   *   based on the name of the input source.
   * @param storeNames Optional filter on which AggregateGroups to process
   *   based on the name of the output store.
   * @return filtered aggregates
   */
  def filterAggregates(
    aggregates: Set[TypedAggregateGroup[_]],
    sourceNames: Option[Set[String]],
    storeNames: Option[Set[String]]
  ): Set[TypedAggregateGroup[_]] =
    aggregates
      .filter { aggregateGroup =>
        val sourceName = aggregateGroup.inputSource.name
        val storeName = aggregateGroup.outputStore.name
        val containsSource = setFilter(sourceNames, sourceName)
        val containsStore = setFilter(storeNames, storeName)
        containsSource && containsStore
      }

  /**
   * The core summingbird job code.
   *
   * For each aggregate in the set passed in, the job
   * processes all datarecords in the input producer
   * stream to generate "incremental" contributions to
   * these aggregates, and emits them grouped by
   * aggregation key so that summingbird can aggregate them.
   *
   * It is important that after applying the sourceNameFilter and storeNameFilter,
   * all the result AggregateGroups share the same startDate, otherwise the job
   * will fail or give invalid results.
   *
   * (input source, output store) pairs for which either mapping function returns
   * None are silently dropped. If no pair is left, the final `reduceLeft` is applied
   * to an empty list and throws, so at least one pair must resolve.
   *
   * @param aggregateSet A set of aggregates to compute. All aggregates
   *   in this set that pass the sourceNameFilter and storeNameFilter
   *   defined below, if any, will be computed.
   * @param aggregateSourceToSummingbird Function that maps from our logical
   *   AggregateSource abstraction to the underlying physical summingbird
   *   producer of data records to aggregate (e.g. scalding/eventbus source)
   * @param aggregateStoreToSummingbird Function that maps from our logical
   *   AggregateStore abstraction to the underlying physical summingbird
   *   store to write output aggregate records to (e.g. manhattan for scalding,
   *   or memcache for heron)
   * @param featureCounters counters to use with each input DataRecord
   * @return summingbird tail producer
   */
  def generateJobGraph[P <: Platform[P]](
    aggregateSet: Set[TypedAggregateGroup[_]],
    aggregateSourceToSummingbird: AggregateSource => Option[Producer[P, DataRecord]],
    aggregateStoreToSummingbird: AggregateStore => Option[P#Store[AggregationKey, DataRecord]],
    featureCounters: Seq[DataRecordFeatureCounter] = Seq.empty
  )(
    implicit semigroup: Semigroup[DataRecord]
  ): TailProducer[P, Any] = {
    val tailProducerList: List[TailProducer[P, Any]] = aggregateSet
      .groupBy { aggregate => (aggregate.inputSource, aggregate.outputStore) }
      .flatMap {
        case (
              (inputSource: AggregateSource, outputStore: AggregateStore),
              aggregatesInThisStore
            ) => {
          val producerOpt = aggregateSourceToSummingbird(inputSource)
          val storeOpt = aggregateStoreToSummingbird(outputStore)
          (producerOpt, storeOpt) match {
            case (Some(producer), Some(store)) =>
              Some(
                producer
                  .flatMap(computeAggregates(_, aggregatesInThisStore, featureCounters))
                  .name("FLATMAP")
                  .sumByKey(store)
                  .name("SUMMER")
              )
            case _ => None
          }
        }
      }
      .toList

    // Chain every per-(source, store) summer into a single tail producer.
    tailProducerList.reduceLeft { (left, right) => left.also(right) }
  }

  /**
   * Maps each aggregate group's prefix to a comma-separated list of the names of
   * all output features of its individual aggregate descriptors.
   *
   * @param aggregateSet aggregate groups to describe
   * @return map from aggregate prefix to the joined output feature names
   */
  def aggregateNames(aggregateSet: Set[TypedAggregateGroup[_]]) = {
    aggregateSet
      .map(typedGroup =>
        (
          typedGroup.aggregatePrefix,
          typedGroup.individualAggregateDescriptors
            .flatMap(_.outputFeatures.map(_.getFeatureName)).mkString(",")))
  }.toMap
}

View File

@ -1,19 +0,0 @@
# Library target built from every .scala file in this directory, on the java8 platform.
# No explicit `name` is given here.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
# Third-party (algebird, storehaus, scalding, summingbird) and in-repo (ml api,
# aggregation_framework common types) targets this library depends on.
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"src/java/com/twitter/ml/api:api-base",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
],
)

View File

@ -1,39 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.job
import com.twitter.ml.api.DataRecord
import com.twitter.summingbird.Counter
/**
 * Pairs a summingbird Counter with a predicate over
 * [[com.twitter.ml.api.DataRecord]] instances.
 *
 * For example, for a data record which represents a Tweet, one could define a predicate
 * which checks whether the Tweet contains a binary feature representing the presence of
 * an image. The counter can then be used to represent the count of Tweets with
 * images processed.
 *
 * @param predicate a predicate which gates the counter
 * @param counter a summingbird Counter instance
 */
case class DataRecordFeatureCounter(predicate: DataRecord => Boolean, counter: Counter)

object DataRecordFeatureCounter {

  /**
   * Bumps the counter by one when the record passes the counter's predicate;
   * otherwise does nothing.
   *
   * @param recordCounter a data record counter
   * @param record a data record
   */
  def apply(recordCounter: DataRecordFeatureCounter, record: DataRecord): Unit = {
    val shouldCount = recordCounter.predicate(record)
    if (shouldCount) recordCounter.counter.incr()
  }

  /**
   * Builds a feature counter whose predicate accepts every record.
   *
   * @param counter a summingbird Counter instance
   * @return a data record counter
   */
  def any(counter: Counter): DataRecordFeatureCounter = {
    val acceptAll: DataRecord => Boolean = (_: DataRecord) => true
    new DataRecordFeatureCounter(acceptAll, counter)
  }
}

View File

@ -1,51 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.util.Duration
import com.twitter.ml.api._
import java.lang.{Boolean => JBoolean}
/**
 * Query object passed to getAggregateValue() and setAggregateValue()
 * in AggregationMetric.
 *
 * @param aggregatePrefix Prefix for aggregate feature name
 * @param feature Simple (non-aggregate) feature being aggregated. This
 *   is optional; if None, then the label is aggregated on its own without
 *   being crossed with any feature.
 * @param label Label being paired with. This is optional; if None, then
 *   the feature is aggregated on its own without being crossed with any label.
 * @param halfLife Half life being used for aggregation
 */
case class AggregateFeature[T](
  aggregatePrefix: String,
  feature: Option[Feature[T]],
  label: Option[Feature[JBoolean]],
  halfLife: Duration) {
  val aggregateType = "pair"
  // Placeholder names are used when the label or the feature is absent.
  val labelName: String = label.fold("any_label")(_.getDenseFeatureName())
  val featureName: String = feature.fold("any_feature")(_.getDenseFeatureName())

  /*
   * Dot-joined "prefix.type.label.feature.halfLife" portion of the feature name,
   * computed once up front because string building turns out to be a
   * significant bottleneck.
   */
  val featurePrefix: String =
    Seq(aggregatePrefix, aggregateType, labelName, featureName, halfLife.toString)
      .mkString(".")
}

/* Companion object with util methods. */
object AggregateFeature {

  /*
   * Recovers the half life from an aggregate feature's dense name by re-joining
   * the third-from-last and second-from-last dot-separated components and
   * parsing the result as a Duration.
   */
  def parseHalfLife(aggregateFeature: Feature[_]): Duration = {
    val parts = aggregateFeature.getDenseFeatureName().split("\\.")
    val last = parts.length
    Duration.parse(parts(last - 3) + "." + parts(last - 2))
  }
}

View File

@ -1,184 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.util.Duration
import java.lang.{Long => JLong}
/**
 * An aggregation operator (e.g. count or mean).
 * Implement the abstract members of this trait to define your own metric.
 * The operator is parameterized on an input type T, the type of feature it
 * aggregates, and on A, where TimedValue[A] is the result type of
 * aggregation for this metric.
 */
trait AggregationMetric[T, A] extends FeatureCache[T] {

  /**
   * Combines two timed aggregate values using the given half life.
   *
   * @param left Left timed value
   * @param right Right timed value
   * @param halfLife Half life to use for adding timed values
   * @return Result timed value
   */
  def plus(left: TimedValue[A], right: TimedValue[A], halfLife: Duration): TimedValue[A]

  /**
   * Gets the increment value given a datarecord and a feature.
   *
   * For example, if the aggregation metric is count, the incremental
   * contribution is always a TimedValue (1.0, time). If the aggregation metric
   * is mean, and the feature is a continuous feature (double), the incremental
   * contribution looks like a tuple (value, 1.0, time).
   *
   * @param dataRecord Record to get the increment value from.
   * @param feature Feature to get the increment value for. If None,
   *   then the semantics is to just aggregate the label.
   * @param timestampFeature Feature to use as millisecond timestamp
   *   for decayed value aggregation.
   * @return The incremental contribution to the aggregate of ''feature''
   *   from ''dataRecord''.
   */
  def getIncrementValue(
    dataRecord: DataRecord,
    feature: Option[Feature[T]],
    timestampFeature: Feature[JLong]
  ): TimedValue[A]

  /**
   * The "zero" value for aggregation.
   * For example, the zero is 0 for the count operator.
   */
  def zero(timeOpt: Option[Long] = None): TimedValue[A]

  /**
   * Reads the value of the aggregate feature(s) stored in a datarecord, if any.
   * Different aggregate operators might store this info in the datarecord
   * differently. E.g. count just stores a count, while mean needs to
   * store both a sum and a count, and compile them into a TimedValue. We call
   * these features stored in the record "output" features.
   *
   * @param record Record to get value from
   * @param query AggregateFeature specifying details of the aggregate
   * @param aggregateOutputs An optional precomputed set of aggregation "output"
   *   feature hashes for this (query, metric) pair. This can be derived from ''query'',
   *   but we precompute and pass this in for significantly (approximately 4x)
   *   faster performance. If not passed in, the operator should reconstruct these
   *   features from scratch.
   * @return The aggregate value if found in ''record'', else the appropriate "zero"
   *   for this type of aggregation.
   */
  def getAggregateValue(
    record: DataRecord,
    query: AggregateFeature[T],
    aggregateOutputs: Option[List[JLong]] = None
  ): TimedValue[A]

  /**
   * Writes the value of the aggregate feature(s) into a datarecord. Different
   * operators will have different representations (see getAggregateValue).
   *
   * @param record Record to set value in
   * @param query AggregateFeature specifying details of the aggregate
   * @param aggregateOutputs An optional precomputed set of aggregation "output"
   *   features for this (query, metric) pair. This can be derived from ''query'',
   *   but we precompute and pass this in for significantly (approximately 4x)
   *   faster performance. If not passed in, the operator should reconstruct these
   *   features from scratch.
   * @param value Value to set for the aggregate feature identified by ''query''
   */
  def setAggregateValue(
    record: DataRecord,
    query: AggregateFeature[T],
    aggregateOutputs: Option[List[JLong]] = None,
    value: TimedValue[A]
  ): Unit

  /**
   * Get features used to store aggregate output representation
   * in partially aggregated data records.
   *
   * @param query AggregateFeature specifying details of the aggregate
   * @return A list of "output" features used by this metric to store
   *   output representation. For example, for the "count" operator, we
   *   have only one element in this list, which is the result "count" feature.
   *   For the "mean" operator, we have three elements in this list: the "count"
   *   feature, the "sum" feature and the "mean" feature.
   */
  def getOutputFeatures(query: AggregateFeature[T]): List[Feature[_]]

  /**
   * Get feature hashes used to store aggregate output representation
   * in partially aggregated data records. This is the dense feature id of
   * every feature returned by getOutputFeatures, in the same order.
   *
   * @param query AggregateFeature specifying details of the aggregate
   * @return A list of "output" feature hashes used by this metric to store
   *   output representation.
   */
  def getOutputFeatureIds(query: AggregateFeature[T]): List[JLong] = {
    val outputFeatures = getOutputFeatures(query)
    outputFeatures.map(feature => feature.getDenseFeatureId().asInstanceOf[JLong])
  }

  /**
   * Sums the given aggregate in two datarecords into a result record.
   * WARNING: this method has side-effects; it modifies ''combined''.
   *
   * @param combined Result datarecord to mutate and store addition result in
   * @param left Left datarecord to add
   * @param right Right datarecord to add
   * @param query Details of aggregate to add
   * @param aggregateOutputs An optional precomputed set of aggregation "output"
   *   feature hashes for this (query, metric) pair. This can be derived from ''query'',
   *   but we precompute and pass this in for significantly (approximately 4x)
   *   faster performance. If not passed in, the operator should reconstruct these
   *   features from scratch.
   */
  def mutatePlus(
    combined: DataRecord,
    left: DataRecord,
    right: DataRecord,
    query: AggregateFeature[T],
    aggregateOutputs: Option[List[JLong]] = None
  ): Unit = {
    val fromLeft = getAggregateValue(left, query, aggregateOutputs)
    val fromRight = getAggregateValue(right, query, aggregateOutputs)
    setAggregateValue(
      combined,
      query,
      aggregateOutputs,
      plus(fromLeft, fromRight, query.halfLife)
    )
  }

  /**
   * Helper function to get the increment value from an input DataRecord
   * and copy it to an output DataRecord, given an AggregateFeature query spec.
   * The increment is written when the query has no label, or when the input
   * record has the query's label feature.
   *
   * @param output Datarecord to output increment to (will be mutated by this method)
   * @param input Datarecord to get increment from
   * @param query Details of aggregation
   * @param timestampFeature Feature passed through to getIncrementValue as the
   *   timestamp feature
   * @param aggregateOutputs An optional precomputed set of aggregation "output"
   *   feature hashes for this (query, metric) pair. This can be derived from ''query'',
   *   but we precompute and pass this in for significantly (approximately 4x)
   *   faster performance. If not passed in, the operator should reconstruct these
   *   features from scratch.
   * @return True if an increment was set in the output record, else false
   */
  def setIncrement(
    output: DataRecord,
    input: DataRecord,
    query: AggregateFeature[T],
    timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP,
    aggregateOutputs: Option[List[JLong]] = None
  ): Boolean = {
    val labelSatisfied = query.label match {
      case None => true
      case Some(labelFeature) => SRichDataRecord(input).hasFeature(labelFeature)
    }
    if (!labelSatisfied) {
      false
    } else {
      val increment: TimedValue[A] = getIncrementValue(input, query.feature, timestampFeature)
      setAggregateValue(output, query, aggregateOutputs, increment)
      true
    }
  }
}

View File

@ -1,55 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.algebird.DecayedValue
import com.twitter.algebird.DecayedValueMonoid
import com.twitter.algebird.Monoid
import com.twitter.dal.personal_data.thriftjava.PersonalDataType
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.util.Duration
import java.lang.{Long => JLong}
import java.util.{HashSet => JHashSet}
import java.util.{Set => JSet}
object AggregationMetricCommon {
  /* Shared definitions and utils that can be reused by child classes */
  val Epsilon: Double = 1e-6
  val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon)
  val TimestampHash: JLong = SharedFeatures.TIMESTAMP.getDenseFeatureId()

  /*
   * Builds a DecayedValue from a timed value's value and millisecond timestamp,
   * together with the half life in milliseconds.
   */
  def toDecayedValue(tv: TimedValue[Double], halfLife: Duration): DecayedValue = {
    val timestampMs = tv.timestamp.inMilliseconds
    val halfLifeMs = halfLife.inMilliseconds
    DecayedValue.build(tv.value, timestampMs, halfLifeMs)
  }

  /*
   * Reads the timestamp feature from the record.
   * Returns 0 when the record yields null for that feature.
   */
  def getTimestamp(
    record: DataRecord,
    timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP
  ): Long = {
    val rawTimestamp = SRichDataRecord(record).getFeatureValue(timestampFeature)
    Option(rawTimestamp).fold(0L)(_.toLong)
  }

  /*
   * Union the PDTs of the input featureOpts.
   * Return null if empty, else the JSet[PersonalDataType]
   */
  def derivePersonalDataTypes(features: Option[Feature[_]]*): JSet[PersonalDataType] = {
    val collected = new JHashSet[PersonalDataType]()
    features.foreach { featureOpt =>
      featureOpt.foreach { feature =>
        val declaredTypes = feature.getPersonalDataTypes
        // Only features that actually declare personal data types contribute.
        if (declaredTypes.isPresent) collected.addAll(declaredTypes.get)
      }
    }
    if (collected.isEmpty) null else collected
  }
}

View File

@ -1,15 +0,0 @@
# Library target built from every .scala file in this directory, on the java8 platform.
# No explicit `name` is given here.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
# algebird, the ml api (java/scala/thrift), personal data types and util-core.
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/ml/api/util:datarecord",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"util/util-core:scala",
],
)

View File

@ -1,5 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
object ConversionUtils {

  /** Maps `true` to 1.0 and `false` to 0.0. */
  def booleanToDouble(value: Boolean): Double = value match {
    case true => 1.0
    case false => 0.0
  }
}

View File

@ -1,41 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.util.Time
import java.lang.{Long => JLong}
case class TypedCountMetric[T]() extends TypedSumLikeMetric[T] {
  import AggregationMetricCommon._
  import ConversionUtils._

  override val operatorName = "count"

  /*
   * The increment is 1.0 when no feature is given or the record has the feature,
   * and 0.0 otherwise. It is stamped with the record's timestamp feature.
   */
  override def getIncrementValue(
    record: DataRecord,
    feature: Option[Feature[T]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val featureExists: Boolean = feature.forall(f => SRichDataRecord(record).hasFeature(f))
    val increment = booleanToDouble(featureExists)
    val recordTime = Time.fromMilliseconds(getTimestamp(record, timestampFeature))
    TimedValue[Double](value = increment, timestamp = recordTime)
  }
}
/**
 * Feature-type-agnostic entry point for the count metric, so configs do not
 * have to name a specific typed metric.
 * See EasyMetric.scala for more details on why this is useful.
 */
object CountMetric extends EasyMetric {
  // Count applies to every feature type, so a metric is always returned.
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] = {
    val countMetric = TypedCountMetric[T]()
    Some(countMetric)
  }
}

View File

@ -1,34 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
/**
 * A "human-readable" metric that can be applied to features of multiple
 * different types. Wrapper around AggregationMetric used as syntactic sugar
 * for easier config.
 */
trait EasyMetric extends Serializable {
/*
 * Given a feature type, fetches the correct underlying AggregationMetric
 * to perform this operation over the given feature type, if any. If no such
 * metric is available, returns None. For example, MEAN cannot be applied
 * to FeatureType.String and would return None.
 *
 * @param featureType Type of feature to fetch metric for
 * @return Strongly typed aggregation metric to use for this feature type,
 *   or None if this metric does not support the feature type
 *
 * For example, if the EasyMetric is MEAN and the featureType is
 * FeatureType.Continuous, the underlying AggregationMetric should be a
 * scalar mean. If the EasyMetric is MEAN and the featureType is
 * FeatureType.SparseContinuous, the AggregationMetric returned could be a
 * "vector" mean that averages sparse maps. Using the single logical name
 * MEAN for both is nice syntactic sugar making for an easier to read top
 * level config, though different underlying operators are used underneath
 * for the actual implementation.
 */
def forFeatureType[T](
featureType: FeatureType,
): Option[AggregationMetric[T, _]]
}

View File

@ -1,72 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import scala.collection.mutable
trait FeatureCache[T] {

  /*
   * Builds the full output feature name "<query.featurePrefix>.<name>" from
   * scratch, e.g. for the mean operator and the "sum" output. This function is
   * slow and should only be called at pre-computation time.
   *
   * @param query Details of aggregate feature
   * @param name Name of "output" feature for which we want to construct feature name
   * @return Full name of output feature
   */
  private def uncachedFullFeatureName(query: AggregateFeature[T], name: String): String =
    Seq(query.featurePrefix, name).mkString(".")

  /*
   * A cache from (aggregate query, output feature name) -> fully qualified feature name
   * lazy since it doesn't need to be serialized to the mappers
   */
  private lazy val featureNameCache = mutable.Map[(AggregateFeature[T], String), String]()

  /*
   * A cache from (aggregate query, output feature name) -> precomputed output feature
   * lazy since it doesn't need to be serialized to the mappers
   */
  private lazy val featureCache = mutable.Map[(AggregateFeature[T], String), Feature[_]]()

  /**
   * Returns the output feature for an (aggregate query, output feature name) pair,
   * creating and caching it on the first request. The feature's name is taken from
   * featureNameCache, which is itself filled through uncachedFullFeatureName() on a
   * miss. Should only be called at pre-computation time.
   *
   * @param query Details of aggregate feature
   * @param name Name of "output" feature we want to precompute
   * @param aggregateFeatureType type of "output" feature we want to precompute
   */
  def cachedFullFeature(
    query: AggregateFeature[T],
    name: String,
    aggregateFeatureType: FeatureType
  ): Feature[_] = {
    val cacheKey = (query, name)

    // Only invoked while constructing a feature, so the name cache is untouched
    // when the feature itself is already cached.
    def fullName: String =
      featureNameCache.getOrElseUpdate(cacheKey, uncachedFullFeatureName(query, name))

    def buildFeature(): Feature[_] = {
      val pdts = AggregationMetricCommon.derivePersonalDataTypes(query.feature, query.label)
      aggregateFeatureType match {
        case FeatureType.BINARY => new Feature.Binary(fullName, pdts)
        case FeatureType.DISCRETE => new Feature.Discrete(fullName, pdts)
        case FeatureType.STRING => new Feature.Text(fullName, pdts)
        case FeatureType.CONTINUOUS => new Feature.Continuous(fullName, pdts)
        case FeatureType.SPARSE_BINARY => new Feature.SparseBinary(fullName, pdts)
        case FeatureType.SPARSE_CONTINUOUS => new Feature.SparseContinuous(fullName, pdts)
      }
    }

    featureCache.getOrElseUpdate(cacheKey, buildFeature())
  }
}

View File

@ -1,107 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import java.lang.{Long => JLong}
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.ConversionUtils._
import com.twitter.util.Duration
import com.twitter.util.Time
import scala.math.max
/**
* This metric measures how recently an action has taken place. A value of 1.0
* indicates the action happened just now. This value decays with time if the
* action has not taken place and is reset to 1 when the action happens. So lower
* value indicates a stale or older action.
*
* For example consider an action of "user liking a video". The last reset metric
* value changes as follows for a half life of 1 day.
*
* ----------------------------------------------------------------------------
* day | action | feature value | Description
* ----------------------------------------------------------------------------
* 1 | user likes the video | 1.0 | Set the value to 1
* 2 | user does not like video | 0.5 | Decay the value
* 3 | user does not like video | 0.25 | Decay the value
* 4 | user likes the video | 1.0 | Reset the value to 1
* -----------------------------------------------------------------------------
*
* @tparam T
*/
case class TypedLastResetMetric[T]() extends TimedValueAggregationMetric[T] {
  import AggregationMetricCommon._

  override val operatorName = "last_reset"

  /*
   * The increment is 1.0 when no feature is given or the record has the feature,
   * and 0.0 otherwise. It is stamped with the record's timestamp feature.
   */
  override def getIncrementValue(
    record: DataRecord,
    feature: Option[Feature[T]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val featureExists: Boolean = feature match {
      case Some(f) => SRichDataRecord(record).hasFeature(f)
      case None => true
    }
    TimedValue[Double](
      value = booleanToDouble(featureExists),
      timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature))
    )
  }

  /*
   * Decays the older value to the newer timestamp: value / 2^(elapsed / halfLife).
   * A zero half life decays to 0.0, and results not above Epsilon are clamped to 0.0.
   */
  private def getDecayedValue(
    olderTimedValue: TimedValue[Double],
    newerTimestamp: Time,
    halfLife: Duration
  ): Double = {
    if (halfLife.inMilliseconds == 0L) {
      0.0
    } else {
      val timeDelta = newerTimestamp.inMilliseconds - olderTimedValue.timestamp.inMilliseconds
      // Divide in floating point. Long / Long division truncated the exponent to whole
      // half-lives, so e.g. no decay at all was applied before one full half life elapsed.
      val elapsedHalfLives = timeDelta.toDouble / halfLife.inMillis
      val resultValue = olderTimedValue.value / math.pow(2.0, elapsedHalfLives)
      if (resultValue > AggregationMetricCommon.Epsilon) resultValue else 0.0
    }
  }

  /*
   * Keeps the newer timestamp, with the larger of the newer value and the
   * (optionally decayed) older value. On equal timestamps `right` is treated as newer.
   */
  override def plus(
    left: TimedValue[Double],
    right: TimedValue[Double],
    halfLife: Duration
  ): TimedValue[Double] = {
    val (newerTimedValue, olderTimedValue) = if (left.timestamp > right.timestamp) {
      (left, right)
    } else {
      (right, left)
    }
    val optionallyDecayedOlderValue = if (halfLife == Duration.Top) {
      // Since we don't want to decay, older value is not changed
      olderTimedValue.value
    } else {
      // Decay older value
      getDecayedValue(olderTimedValue, newerTimedValue.timestamp, halfLife)
    }
    TimedValue[Double](
      value = max(newerTimedValue.value, optionallyDecayedOlderValue),
      timestamp = newerTimedValue.timestamp
    )
  }

  // The zero is value 0.0 at timestamp 0; `timeOpt` is ignored.
  override def zero(timeOpt: Option[Long]): TimedValue[Double] = TimedValue[Double](
    value = 0.0,
    timestamp = Time.fromMilliseconds(0)
  )
}
/**
 * Feature-type-agnostic entry point for the last reset metric, so configs do
 * not have to name a specific typed metric.
 * See EasyMetric.scala for more details on why this is useful.
 */
object LastResetMetric extends EasyMetric {
  // Last-reset applies to every feature type, so a metric is always returned.
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] = {
    val lastResetMetric = TypedLastResetMetric[T]()
    Some(lastResetMetric)
  }
}

View File

@ -1,69 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureType
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon.getTimestamp
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric
import com.twitter.util.Duration
import com.twitter.util.Time
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.lang.{Number => JNumber}
/**
 * Aggregation metric ("latest") that keeps the value carrying the most recent timestamp.
 *
 * @param defaultValue increment value used when no feature is supplied or the
 *                     record yields no value for it
 * @tparam T boxed numeric type of the aggregated feature
 */
case class TypedLatestMetric[T <: JNumber](defaultValue: Double = 0.0)
    extends TimedValueAggregationMetric[T] {

  override val operatorName = "latest"

  /** Reads the feature value (or `defaultValue`) and stamps it with the record's timestamp. */
  override def getIncrementValue(
    dataRecord: DataRecord,
    feature: Option[Feature[T]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val value = feature
      .flatMap(f => SRichDataRecord(dataRecord).getFeatureValueOpt(f))
      .fold(defaultValue)(_.doubleValue())
    val timestamp = Time.fromMilliseconds(getTimestamp(dataRecord, timestampFeature))
    TimedValue[Double](value = value, timestamp = timestamp)
  }

  /**
   * Returns whichever operand has the later timestamp; `right` wins ties.
   * Decay is not supported, so `halfLife` must be Duration.Top.
   */
  override def plus(
    left: TimedValue[Double],
    right: TimedValue[Double],
    halfLife: Duration
  ): TimedValue[Double] = {
    // NOTE(review): the check compares the string form rather than using ==;
    // presumably deliberate (e.g. to accept non-singleton instances) - confirm before changing.
    assert(
      halfLife.toString == "Duration.Top",
      s"halfLife must be Duration.Top when using latest metric, but ${halfLife.toString} is used"
    )
    val leftIsNewer = left.timestamp > right.timestamp
    if (leftIsNewer) left else right
  }

  /** Neutral starting value: 0.0 at epoch millisecond 0 (neither `timeOpt` nor `defaultValue` is used). */
  override def zero(timeOpt: Option[Long]): TimedValue[Double] = {
    val epochStart = Time.fromMilliseconds(0)
    TimedValue[Double](value = 0.0, timestamp = epochStart)
  }
}
/**
 * Convenience entry point for the latest metric. It is available for CONTINUOUS
 * and DISCRETE features only; every other feature type yields None.
 */
object LatestMetric extends EasyMetric {
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] = {
    if (featureType == FeatureType.CONTINUOUS) {
      Some(TypedLatestMetric[JDouble]().asInstanceOf[AggregationMetric[T, Double]])
    } else if (featureType == FeatureType.DISCRETE) {
      Some(TypedLatestMetric[JLong]().asInstanceOf[AggregationMetric[T, Double]])
    } else {
      None
    }
  }
}

View File

@ -1,64 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon.getTimestamp
import com.twitter.util.Duration
import com.twitter.util.Time
import java.lang.{Long => JLong}
import java.lang.{Number => JNumber}
import java.lang.{Double => JDouble}
import scala.math.max
/**
 * Aggregation metric ("max") that keeps the largest value seen so far.
 *
 * @param defaultValue increment value used when no feature is supplied or the
 *                     record yields no value for it
 * @tparam T boxed numeric type of the aggregated feature
 */
case class TypedMaxMetric[T <: JNumber](defaultValue: Double = 0.0)
    extends TimedValueAggregationMetric[T] {

  override val operatorName = "max"

  /** Reads the feature value (or `defaultValue`) and stamps it with the record's timestamp. */
  override def getIncrementValue(
    dataRecord: DataRecord,
    feature: Option[Feature[T]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val value = feature match {
      case Some(f) =>
        SRichDataRecord(dataRecord)
          .getFeatureValueOpt(f)
          .map(_.doubleValue())
          .getOrElse(defaultValue)
      case None => defaultValue
    }
    val timestamp = Time.fromMilliseconds(getTimestamp(dataRecord, timestampFeature))
    TimedValue[Double](value = value, timestamp = timestamp)
  }

  /**
   * Takes the larger value and the later timestamp of the two operands.
   * Decay is not supported, so `halfLife` must be Duration.Top.
   */
  override def plus(
    left: TimedValue[Double],
    right: TimedValue[Double],
    halfLife: Duration
  ): TimedValue[Double] = {
    // NOTE(review): the check compares the string form rather than using ==;
    // presumably deliberate (e.g. to accept non-singleton instances) - confirm before changing.
    assert(
      halfLife.toString == "Duration.Top",
      s"halfLife must be Duration.Top when using max metric, but ${halfLife.toString} is used"
    )
    val largest = max(left.value, right.value)
    val latest = left.timestamp.max(right.timestamp)
    TimedValue[Double](value = largest, timestamp = latest)
  }

  /**
   * Starting value: 0.0 at epoch millisecond 0 (neither `timeOpt` nor `defaultValue` is used).
   * NOTE(review): 0.0 is only neutral for max when values are non-negative - confirm inputs.
   */
  override def zero(timeOpt: Option[Long]): TimedValue[Double] = {
    val epochStart = Time.fromMilliseconds(0)
    TimedValue[Double](value = 0.0, timestamp = epochStart)
  }
}
/**
 * Syntactic sugar for the max metric. It is available for CONTINUOUS and
 * DISCRETE features only; every other feature type yields None.
 */
object MaxMetric extends EasyMetric {
  // `override` added (and the stray trailing comma dropped) so this matches the
  // other EasyMetric objects and the compiler flags any signature drift.
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] =
    featureType match {
      case FeatureType.CONTINUOUS =>
        Some(TypedMaxMetric[JDouble]().asInstanceOf[AggregationMetric[T, Double]])
      case FeatureType.DISCRETE =>
        Some(TypedMaxMetric[JLong]().asInstanceOf[AggregationMetric[T, Double]])
      case _ => None
    }
}

View File

@ -1,66 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.util.Duration
import com.twitter.util.Time
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Map => JMap}
/*
* TypedSumLikeMetric aggregates a sum over any feature transform.
* TypedCountMetric, TypedSumMetric, TypedSumSqMetric are examples
* of metrics that inherit from this trait. To implement a new
* "sum like" metric, override the getIncrementValue() and operatorName
* members of this trait.
*
* getIncrementValue() is inherited from the
* parent trait AggregationMetric, but not overridden in this trait, so
* it needs to be overridden by any metric that extends TypedSumLikeMetric.
*
* operatorName is a string used for naming the resultant aggregate feature
* (e.g. "count" if it's a count feature, or "sum" if a sum feature).
*/
trait TypedSumLikeMetric[T] extends TimedValueAggregationMetric[T] {
  import AggregationMetricCommon._

  /** Selects which timestamp `zero` uses; see `zero`. */
  def useFixedDecay = true

  /**
   * Adds two timed values. With Duration.Top the values are summed directly;
   * otherwise both are converted to decayed values and combined through
   * `decayedValueMonoid`. The result carries the later of the two timestamps.
   */
  override def plus(
    left: TimedValue[Double],
    right: TimedValue[Double],
    halfLife: Duration
  ): TimedValue[Double] = {
    val summed: Double =
      if (halfLife != Duration.Top) {
        val decayedLeft = toDecayedValue(left, halfLife)
        val decayedRight = toDecayedValue(right, halfLife)
        decayedValueMonoid.plus(decayedLeft, decayedRight).value
      } else {
        // decayedValueMonoid would also work here, but plain addition is
        // slightly more accurate when nothing has to decay.
        left.value + right.value
      }
    TimedValue[Double](summed, left.timestamp.max(right.timestamp))
  }

  /**
   * Neutral starting value 0.0. With `useFixedDecay` it is stamped with `timeOpt`
   * (epoch millisecond 0 when absent), otherwise always with epoch millisecond 0.
   * Please see TQ-11279 for documentation for this fix to the decay logic.
   */
  override def zero(timeOpt: Option[Long]): TimedValue[Double] = {
    val zeroMillis: Long = if (useFixedDecay) timeOpt.getOrElse(0L) else 0L
    TimedValue[Double](value = 0.0, timestamp = Time.fromMilliseconds(zeroMillis))
  }
}

View File

@ -1,52 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.util.Time
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
/** Sum-like metric ("sum") whose increment is the feature's value in the record. */
case class TypedSumMetric() extends TypedSumLikeMetric[JDouble] {
  import AggregationMetricCommon._

  override val operatorName = "sum"

  /*
   * The increment is the feature's value in the given record. It is 0.0 when the
   * record holds no value for the feature, or when feature = None (sum has no
   * meaning in that case). The increment carries the record's timestamp.
   */
  override def getIncrementValue(
    record: DataRecord,
    feature: Option[Feature[JDouble]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val increment: Double = feature.fold(0.0) { f =>
      Option(SRichDataRecord(record).getFeatureValue(f)).map(_.toDouble).getOrElse(0.0)
    }
    val recordTime = Time.fromMilliseconds(getTimestamp(record, timestampFeature))
    TimedValue[Double](value = increment, timestamp = recordTime)
  }
}
/**
 * Convenience entry point for the sum metric. Only CONTINUOUS features are
 * supported; every other feature type yields None.
 * See EasyMetric.scala for more details on why this is useful.
 */
object SumMetric extends EasyMetric {
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] = {
    if (featureType == FeatureType.CONTINUOUS) {
      Some(TypedSumMetric().asInstanceOf[AggregationMetric[T, Double]])
    } else {
      None
    }
  }
}

View File

@ -1,53 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.util.Time
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
/** Sum-like metric ("sumsq") whose increment is the squared feature value in the record. */
case class TypedSumSqMetric() extends TypedSumLikeMetric[JDouble] {
  import AggregationMetricCommon._

  override val operatorName = "sumsq"

  /*
   * The increment is the square of the feature's value in the given record. The
   * value counts as 0.0 when the record holds none for the feature, or when
   * feature = None (sumsq has no meaning in that case).
   */
  override def getIncrementValue(
    record: DataRecord,
    feature: Option[Feature[JDouble]],
    timestampFeature: Feature[JLong]
  ): TimedValue[Double] = {
    val featureVal: Double = feature
      .flatMap(f => Option(SRichDataRecord(record).getFeatureValue(f)))
      .map(_.toDouble)
      .getOrElse(0.0)
    val squared = featureVal * featureVal
    TimedValue[Double](
      value = squared,
      timestamp = Time.fromMilliseconds(getTimestamp(record, timestampFeature))
    )
  }
}
/**
 * Convenience entry point for the sum of squares metric. Only CONTINUOUS
 * features are supported; every other feature type yields None.
 * See EasyMetric.scala for more details on why this is useful.
 */
object SumSqMetric extends EasyMetric {
  override def forFeatureType[T](
    featureType: FeatureType
  ): Option[AggregationMetric[T, _]] =
    Option(featureType).collect {
      case FeatureType.CONTINUOUS =>
        TypedSumSqMetric().asInstanceOf[AggregationMetric[T, Double]]
    }
}

View File

@ -1,14 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.util.Time
/**
* Case class wrapping a (value, timestamp) tuple.
* All aggregate metrics must operate over this class
* to ensure we can implement decay and half lives for them.
* This is translated to an algebird DecayedValue under the hood.
*
* @param value Value being wrapped
* @param timestamp Time after epoch at which value is being measured
* @tparam T Type of the wrapped value
*/
case class TimedValue[T](value: T, timestamp: Time)

View File

@ -1,90 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics
import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.TimedValue
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.util.Duration
import com.twitter.util.Time
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Map => JMap}
/*
* TimedValueAggregationMetric overrides the methods of AggregationMetric that
* deal with reading and writing continuous values from a data record.
*
* operatorName is a string used for naming the resultant aggregate feature
* (e.g. "count" if it's a count feature, or "sum" if a sum feature).
*/
trait TimedValueAggregationMetric[T] extends AggregationMetric[T, Double] {
  import AggregationMetricCommon._

  /** Name of the operator; passed to `cachedFullFeature` when building the output feature. */
  val operatorName: String

  /**
   * Reads the stored aggregate for `query` out of `record`.
   *
   * @param record           record holding the aggregate value and its timestamp
   * @param query            aggregate feature being read
   * @param aggregateOutputs optional precomputed output feature ids; when absent
   *                         they are derived with `getOutputFeatureIds(query)`
   * @return the stored (value, timestamp) pair when both the continuous value and
   *         the timestamp are present in the record, otherwise `zero(timestamp)`
   */
  override def getAggregateValue(
    record: DataRecord,
    query: AggregateFeature[T],
    aggregateOutputs: Option[List[JLong]] = None
  ): TimedValue[Double] = {
    /*
     * We know aggregateOutputs(0) will have the continuous feature,
     * since getOutputFeatures() below emits it as the only feature.
     * This helps us get a 4x speedup. Using any structure more complex
     * than a list was also a performance bottleneck.
     */
    val featureHash: JLong = aggregateOutputs
      .getOrElse(getOutputFeatureIds(query))
      .head

    // Both feature maps may be null on the record, and a missing key yields a
    // null boxed value, hence the Option wrapping at each step. The maps are
    // used at their declared types; the previous `case jmap: JMap[...]` patterns
    // matched on erased type arguments (unchecked) and added nothing.
    val continuousValueOption: Option[Double] = Option(record.continuousFeatures)
      .flatMap(features => Option(features.get(featureHash)))
      .map(_.toDouble)
    val timeOption: Option[Long] = Option(record.discreteFeatures)
      .flatMap(features => Option(features.get(TimestampHash)))
      .map(_.toLong)

    val storedValue: Option[TimedValue[Double]] = for {
      featureValue <- continuousValueOption
      timestamp <- timeOption
    } yield TimedValue[Double](featureValue, Time.fromMilliseconds(timestamp))

    storedValue.getOrElse(zero(timeOption))
  }

  /**
   * Writes the aggregate value for `query` into `record`.
   *
   * @param record           record to update in place
   * @param query            aggregate feature being written
   * @param aggregateOutputs optional precomputed output feature ids; when absent
   *                         they are derived with `getOutputFeatureIds(query)`
   * @param value            aggregate to store; only its value is written
   */
  override def setAggregateValue(
    record: DataRecord,
    query: AggregateFeature[T],
    aggregateOutputs: Option[List[JLong]] = None,
    value: TimedValue[Double]
  ): Unit = {
    /*
     * We know aggregateOutputs(0) will have the continuous feature,
     * since getOutputFeatures() below emits it as the only feature.
     * This helps us get a 4x speedup. Using any structure more complex
     * than a list was also a performance bottleneck.
     */
    val featureHash: JLong = aggregateOutputs
      .getOrElse(getOutputFeatureIds(query))
      .head

    /* Only set value if non-zero to save space */
    if (value.value != 0.0) {
      record.putToContinuousFeatures(featureHash, value.value)
    }

    /*
     * We do not set timestamp since that might affect correctness of
     * future aggregations due to the decay semantics.
     */
  }

  /* Only one feature stored in the aggregated datarecord: the result continuous value */
  override def getOutputFeatures(query: AggregateFeature[T]): List[Feature[_]] = {
    val feature = cachedFullFeature(query, operatorName, FeatureType.CONTINUOUS)
    List(feature)
  }
}

View File

@ -1,19 +0,0 @@
package com.twitter.timelines.data_processing.ml_util
import com.twitter.ml.api.DataRecord
package object aggregation_framework {

  /** The kinds of aggregate handled by the framework, one value per aggregate grouping. */
  object AggregateType extends Enumeration {
    type AggregateType = Value

    val User = Value
    val UserAuthor = Value
    val UserEngager = Value
    val UserMention = Value
    val UserRequestHour = Value
    val UserRequestDow = Value
    val UserOriginalAuthor = Value
    val UserList = Value
    val UserTopic = Value
    val UserInferredTopic = Value
    val UserMediaUnderstandingAnnotation = Value
  }

  /** Key made of a Long id, an aggregate type and an optional second Long id. */
  type AggregateUserEntityKey = (Long, AggregateType.Value, Option[Long])

  /**
   * All merged records of one user, indexed by aggregate type.
   *
   * @param userId          id of the user the records belong to
   * @param keyedRecords    per aggregate type, the optional single keyed record
   * @param keyedRecordMaps per aggregate type, the optional map of keyed records
   */
  case class MergedRecordsDescriptor(
    userId: Long,
    keyedRecords: Map[AggregateType.Value, Option[KeyedRecord]],
    keyedRecordMaps: Map[AggregateType.Value, Option[KeyedRecordMap]])
}

View File

@ -1,12 +0,0 @@
# Scala library target built from every *.scala file in this directory for the
# java8 platform. No explicit name is given.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"finagle/finagle-stats",
"src/java/com/twitter/ml/api:api-base",
"src/thrift/com/twitter/ml/api:data-scala",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
],
)

View File

@ -1,159 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.query
import com.twitter.dal.personal_data.thriftjava.PersonalDataType
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureBuilder
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.thriftscala.{DataRecord => ScalaDataRecord}
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._
/**
* Provides methods to build "scoped" aggregates, where base features generated by aggregates
* V2 are scoped with a specific key.
*
* The class provides methods that take a Map of T -> DataRecord, where T is a key type, and
* the DataRecord contains features produced by the aggregation_framework. The methods then
* generate a _new_ DataRecord, containing "scoped" aggregate features, where each scoped
* feature has the value of the scope key in the feature name, and the value of the feature
* is the value of the original aggregate feature in the corresponding value from the original
* Map.
*
* For efficiency reasons, the builder is initialized with the set of features that should be
* scoped and the set of keys for which scoping should be supported.
*
* To understand how scope feature names are constructed, consider the following:
*
* {{{
* val features = Set(
* new Feature.Continuous("user_injection_aggregate.pair.any_label.any_feature.5.days.count"),
* new Feature.Continuous("user_injection_aggregate.pair.any_label.any_feature.10.days.count")
* )
* val scopes = Set(SuggestType.Recap, SuggestType.WhoToFollow)
* val scopeName = "InjectionType"
* val scopedAggregateBuilder = new ScopedAggregateBuilder(features, scopes, scopeName)
*
* }}}
*
* Then, generated scoped features would be among the following:
* - user_injection_aggregate.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap
* - user_injection_aggregate.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=WhoToFollow
* - user_injection_aggregate.scoped.pair.any_label.any_feature.10.days.count/scope_name=InjectionType/scope=Recap
* - user_injection_aggregate.scoped.pair.any_label.any_feature.10.days.count/scope_name=InjectionType/scope=WhoToFollow
*
* @param featuresToScope the set of features for which one should generate scoped versions
* @param scopeKeys the set of scope keys to generate scopes with
* @param scopeName a string indicating what the scopes represent. This is also added to the scoped feature
* @tparam K the type of scope key
*/
class ScopedAggregateBuilder[K](
  featuresToScope: Set[Feature[JDouble]],
  scopeKeys: Set[K],
  scopeName: String) {

  /**
   * Builds the scoped variant of one aggregate feature: "scoped" is inserted
   * after the first dot-separated component of `baseName`, and the scope name
   * and scope value are attached as the "scope_name" and "scope" extensions.
   */
  private[this] def buildScopedAggregateFeature(
    baseName: String,
    scopeValue: String,
    personalDataTypes: java.util.Set[PersonalDataType]
  ): Feature[JDouble] = {
    val components = baseName.split("\\.").toList

    val newName = (components.head :: "scoped" :: components.tail).mkString(".")

    new FeatureBuilder.Continuous()
      .addExtensionDimensions("scope_name", "scope")
      .setBaseName(newName)
      .setPersonalDataTypes(personalDataTypes)
      .extensionBuilder()
      .addExtension("scope_name", scopeName)
      .addExtension("scope", scopeValue)
      .build()
  }

  /**
   * Index of (base aggregate feature name, key) -> key scoped count feature.
   */
  private[this] val keyScopedAggregateMap: Map[(String, K), Feature[JDouble]] = {
    featuresToScope.flatMap { feat =>
      scopeKeys.map { key =>
        (feat.getFeatureName, key) ->
          buildScopedAggregateFeature(
            feat.getFeatureName,
            key.toString,
            AggregationMetricCommon.derivePersonalDataTypes(Some(feat))
          )
      }
    }.toMap
  }

  type ContinuousFeaturesMap = Map[JLong, JDouble]

  /**
   * Create key-scoped features for raw aggregate feature ID to value maps, partitioned by key.
   * Only (feature, key) pairs known to this builder and present in the input are emitted.
   */
  private[this] def buildAggregates(featureMapsByKey: Map[K, ContinuousFeaturesMap]): DataRecord = {
    val continuousFeatures = featureMapsByKey
      .flatMap {
        case (key, featureMap) =>
          featuresToScope.flatMap { feature =>
            val newFeatureOpt = keyScopedAggregateMap.get((feature.getFeatureName, key))
            newFeatureOpt.flatMap { newFeature =>
              featureMap.get(feature.getFeatureId).map(new JLong(newFeature.getFeatureId) -> _)
            }
          }.toMap
      }

    new DataRecord().setContinuousFeatures(continuousFeatures.asJava)
  }

  /**
   * Create key-scoped features for Java [[DataRecord]] aggregate records partitioned by key.
   *
   * As an example, if the provided Map includes the key `SuggestType.Recap`, and [[scopeKeys]]
   * includes this key, then for a feature "xyz.pair.any_label.any_feature.5.days.count", the method
   * will generate the scoped feature "xyz.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap",
   * with the value being the value of the original feature from the Map.
   *
   * A record whose continuous feature map is null contributes no scoped features.
   *
   * @param aggregatesByKey a map from key to the Java [[DataRecord]] holding that key's aggregates
   * @return a Java [[DataRecord]] containing key-scoped features
   */
  def buildAggregatesJava(aggregatesByKey: Map[K, DataRecord]): DataRecord = {
    val featureMapsByKey: Map[K, ContinuousFeaturesMap] = aggregatesByKey.map {
      case (key, record) =>
        // continuousFeatures can be null (other readers of DataRecord guard it with
        // Option as well); dereferencing it directly threw a NullPointerException.
        val featureMap: ContinuousFeaturesMap = Option(record.continuousFeatures)
          .map(_.asScala.toMap)
          .getOrElse(Map.empty[JLong, JDouble])
        key -> featureMap
    }

    buildAggregates(featureMapsByKey)
  }

  /**
   * Create key-scoped features for Scala [[DataRecord]] aggregate records partitioned by key.
   *
   * As an example, if the provided Map includes the key `SuggestType.Recap`, and [[scopeKeys]]
   * includes this key, then for a feature "xyz.pair.any_label.any_feature.5.days.count", the method
   * will generate the scoped feature "xyz.scoped.pair.any_label.any_feature.5.days.count/scope_name=InjectionType/scope=Recap",
   * with the value being the value of the original feature from the Map.
   *
   * This is a convenience method for some use cases where aggregates are read from Scala
   * thrift objects. Note that this still returns a Java [[DataRecord]], since most ML API
   * use the Java version.
   *
   * @param aggregatesByKey a map from key to the Scala thrift record holding that key's aggregates
   * @return a Java [[DataRecord]] containing key-scoped features
   */
  def buildAggregatesScala(aggregatesByKey: Map[K, ScalaDataRecord]): DataRecord = {
    val featureMapsByKey: Map[K, ContinuousFeaturesMap] = aggregatesByKey.map {
      case (key, record) =>
        val featureMap = record.continuousFeatures.getOrElse(Map[Long, Double]()).toMap
        key -> featureMap.map { case (k, v) => new JLong(k) -> new JDouble(v) }
    }

    buildAggregates(featureMapsByKey)
  }

  /**
   * Returns a [[FeatureContext]] including all possible scoped features generated using this builder.
   *
   * @return a [[FeatureContext]] containing all scoped features.
   */
  def scopedFeatureContext: FeatureContext = new FeatureContext(keyScopedAggregateMap.values.asJava)
}

View File

@ -1,213 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.Stat
import com.twitter.scalding.typed.TypedPipe
import com.twitter.timelines.data_processing.ml_util.aggregation_framework._
import com.twitter.timelines.data_processing.ml_util.sampling.SamplingUtils
/*
* Base trait for jobs that merge the records of several aggregate stores into
* one MergedRecordsDescriptor per user (see joinedAggregates). Implementations
* supply the configuration, the store register/merger, the aggregate pipes and
* the counters declared below.
*/
trait AggregateFeaturesMergerBase {
import Utils._
// When defined, every aggregate pipe is sampled with SamplingUtils.userBasedSample at this rate.
def samplingRateOpt: Option[Double]
// Reducer count for grouping secondary-key records in extractSecondaryTuples.
def numReducers: Int = 2000
// Reducer count for the final per-user grouping in joinedAggregates.
def numReducersMerge: Int = 20000
def aggregationConfig: AggregationConfig
def storeRegister: StoreRegister
def storeMerger: StoreMerger
def getAggregatePipe(storeName: String): DataSetPipe
// Optional cap on the secondary-key records kept per user for an aggregate type; no cap by default.
def applyMaxSizeByTypeOpt(aggregateType: AggregateType.Value): Option[Int] = Option.empty[Int]
// Secondary keys that UserAuthor stores are filtered against (see getSecondaryKeyFilterPipeOpt).
def usersActiveSourcePipe: TypedPipe[Long]
// Incremented once per record read in extractSecondaryTuples.
def numRecords: Stat
// Incremented once per record that survives the secondary-key filter join.
def numFilteredRecords: Stat
/*
* Reads the (optionally sampled) store and emits one (userId, KeyedRecord) pair
* per record. USER_ID is cleared from the record before it is emitted.
*
* This method should only be called with a storeName that corresponds
* to a user aggregate store.
*/
def extractUserFeaturesMap(storeName: String): TypedPipe[(Long, KeyedRecord)] = {
val aggregateKey = storeRegister.storeNameToTypeMap(storeName)
samplingRateOpt
.map(rate => SamplingUtils.userBasedSample(getAggregatePipe(storeName), rate))
.getOrElse(getAggregatePipe(storeName)) // must return store with only user aggregates
.records
.map { r: DataRecord =>
val record = SRichDataRecord(r)
val userId = record.getFeatureValue(USER_ID).longValue
// NOTE(review): presumably clearing through the wrapper mutates `r` itself - confirm.
record.clearFeature(USER_ID)
(userId, KeyedRecord(aggregateKey, r))
}
}
/*
* Reads the (optionally sampled) store and emits, per user, one KeyedRecordMap
* holding that user's records indexed by secondary id.
*
* When the secondaryKey being used is a String, the shouldHash parameter should be set to true.
* TODO: refactor such that the shouldHash parameter is removed and the behavior
* is defaulted to true.
*
* This method should only be called with a storeName that contains records with the
* desired secondaryKey. We provide secondaryKeyFilterPipeOpt against which secondary
* keys can be filtered to help prune the final merged MH dataset.
*
* maxSizeOpt, when defined, limits how many records are kept per user.
*/
def extractSecondaryTuples[T](
storeName: String,
secondaryKey: Feature[T],
shouldHash: Boolean = false,
maxSizeOpt: Option[Int] = None,
secondaryKeyFilterPipeOpt: Option[TypedPipe[Long]] = None
): TypedPipe[(Long, KeyedRecordMap)] = {
val aggregateKey = storeRegister.storeNameToTypeMap(storeName)
val extractedRecordsBySecondaryKey =
samplingRateOpt
.map(rate => SamplingUtils.userBasedSample(getAggregatePipe(storeName), rate))
.getOrElse(getAggregatePipe(storeName))
.records
.map { r: DataRecord =>
val record = SRichDataRecord(r)
val userId = keyFromLong(r, USER_ID)
val secondaryId = extractSecondary(r, secondaryKey, shouldHash)
// Both key features are read above and then cleared before the record is emitted.
record.clearFeature(USER_ID)
record.clearFeature(secondaryKey)
numRecords.inc()
(userId, secondaryId -> r)
}
val grouped =
(secondaryKeyFilterPipeOpt match {
case Some(secondaryKeyFilterPipe: TypedPipe[Long]) =>
extractedRecordsBySecondaryKey
.map {
// In this step, we swap `userId` with `secondaryId` to join on the `secondaryId`
// It is important to swap them back after the join, otherwise the job will fail.
case (userId, (secondaryId, r)) =>
(secondaryId, (userId, r))
}
// Inner join: only records whose secondaryId occurs in the filter pipe survive.
.join(secondaryKeyFilterPipe.groupBy(identity))
.map {
case (secondaryId, ((userId, r), _)) =>
numFilteredRecords.inc()
(userId, secondaryId -> r)
}
case _ => extractedRecordsBySecondaryKey
}).group
.withReducers(numReducers)
// Collapse each user's (secondaryId -> record) pairs into a single KeyedRecordMap,
// keeping at most maxSize pairs per user when a limit is given.
maxSizeOpt match {
case Some(maxSize) =>
grouped
.take(maxSize)
.mapValueStream(recordsIter => Iterator(KeyedRecordMap(aggregateKey, recordsIter.toMap)))
.toTypedPipe
case None =>
grouped
.mapValueStream(recordsIter => Iterator(KeyedRecordMap(aggregateKey, recordsIter.toMap)))
.toTypedPipe
}
}
// One pipe per store name of every registered store whose aggregate type is User.
// Fails (require) when a store's names are not valid to merge.
def userPipes: Seq[TypedPipe[(Long, KeyedRecord)]] =
storeRegister.allStores.flatMap { storeConfig =>
val StoreConfig(storeNames, aggregateType, _) = storeConfig
require(storeMerger.isValidToMerge(storeNames))
if (aggregateType == AggregateType.User) {
storeNames.map(extractUserFeaturesMap)
} else None
}.toSeq
// Only UserAuthor aggregates are filtered, against usersActiveSourcePipe.
private def getSecondaryKeyFilterPipeOpt(
aggregateType: AggregateType.Value
): Option[TypedPipe[Long]] = {
if (aggregateType == AggregateType.UserAuthor) {
Some(usersActiveSourcePipe)
} else None
}
// One pipe per store name of every registered non-User store that defines a
// secondary key feature. Fails (require) when a store's names are not valid to merge.
def userSecondaryKeyPipes: Seq[TypedPipe[(Long, KeyedRecordMap)]] = {
storeRegister.allStores.flatMap { storeConfig =>
val StoreConfig(storeNames, aggregateType, shouldHash) = storeConfig
require(storeMerger.isValidToMerge(storeNames))
if (aggregateType != AggregateType.User) {
storeNames.flatMap { storeName =>
storeConfig.secondaryKeyFeatureOpt
.map { secondaryFeature =>
extractSecondaryTuples(
storeName,
secondaryFeature,
shouldHash,
applyMaxSizeByTypeOpt(aggregateType),
getSecondaryKeyFilterPipeOpt(aggregateType)
)
}
}
} else None
}.toSeq
}
// Unions all user and secondary-key pipes, groups them by user id and merges each
// user's records into one MergedRecordsDescriptor.
// NOTE(review): `reduce` throws when no pipes exist at all - confirm stores are never empty.
def joinedAggregates: TypedPipe[(Long, MergedRecordsDescriptor)] = {
(userPipes ++ userSecondaryKeyPipes)
.reduce(_ ++ _)
.group
.withReducers(numReducersMerge)
.mapGroup {
case (uid, keyedRecordsAndMaps) =>
/*
* For every user, partition their records by aggregate type.
* AggregateType.User should only contain KeyedRecord whereas
* other aggregate types (with secondary keys) contain KeyedRecordMap.
*/
val (userRecords, userSecondaryKeyRecords) = keyedRecordsAndMaps.toList
.map { record =>
record match {
case record: KeyedRecord => (record.aggregateType, record)
case record: KeyedRecordMap => (record.aggregateType, record)
}
}
.groupBy(_._1)
.mapValues(_.map(_._2))
.partition(_._1 == AggregateType.User)
// The casts below rely on the partition above: User entries hold KeyedRecord only.
val userAggregateRecordMap: Map[AggregateType.Value, Option[KeyedRecord]] =
userRecords
.asInstanceOf[Map[AggregateType.Value, List[KeyedRecord]]]
.map {
case (aggregateType, keyedRecords) =>
val mergedKeyedRecordOpt = mergeKeyedRecordOpts(keyedRecords.map(Some(_)): _*)
(aggregateType, mergedKeyedRecordOpt)
}
val userSecondaryKeyAggregateRecordOpt: Map[AggregateType.Value, Option[KeyedRecordMap]] =
userSecondaryKeyRecords
.asInstanceOf[Map[AggregateType.Value, List[KeyedRecordMap]]]
.map {
case (aggregateType, keyedRecordMaps) =>
// Fold the maps of one aggregate type together, passing the type's
// max size to the merge when one is configured.
val keyedRecordMapOpt =
keyedRecordMaps.foldLeft(Option.empty[KeyedRecordMap]) {
(mergedRecOpt, nextRec) =>
applyMaxSizeByTypeOpt(aggregateType)
.map { maxSize =>
mergeKeyedRecordMapOpts(mergedRecOpt, Some(nextRec), maxSize)
}.getOrElse {
mergeKeyedRecordMapOpts(mergedRecOpt, Some(nextRec))
}
}
(aggregateType, keyedRecordMapOpt)
}
Iterator(
MergedRecordsDescriptor(
userId = uid,
keyedRecords = userAggregateRecordMap,
keyedRecordMaps = userSecondaryKeyAggregateRecordOpt
)
)
}.toTypedPipe
}
}

View File

@ -1,200 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.algebird.ScMapMonoid
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.ml.api.util.CompactDataRecordConverter
import com.twitter.ml.api.CompactDataRecord
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.Args
import com.twitter.scalding.Days
import com.twitter.scalding.Duration
import com.twitter.scalding.RichDate
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.TypedTsv
import com.twitter.scalding_internal.job.HasDateRange
import com.twitter.scalding_internal.job.analytics_batch.AnalyticsBatchJob
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._
/**
* The job takes four inputs:
* - The path to a AggregateStore using the DataRecord format.
* - The path to a AggregateStore using the CompactDataRecord format.
* - A version that must be present in both sources.
* - A sink to write the comparison statistics.
*
* The job reads in the two stores, converts the second one to DataRecords and
* then compares each key to see if the two stores have identical DataRecords,
* modulo the loss in precision on converting the Double to Float.
*/
// Command line arguments read by the job: firstTime, dataRecordSource,
// compactDataRecordSource, version and sink.
class AggregatesStoreComparisonJob(args: Args)
extends AnalyticsBatchJob(args)
with BatchPairImplicits
with HasDateRange {
// Brings the injections, ordering, converter and stat names of the companion into scope.
import AggregatesStoreComparisonJob._
override def batchIncrement: Duration = Days(1)
override def firstTime: RichDate = RichDate(args("firstTime"))
private val dataRecordSourcePath = args("dataRecordSource")
private val compactDataRecordSourcePath = args("compactDataRecordSource")
private val version = args.long("version")
private val statsSink = args("sink")
// Comparing a store with itself would be meaningless, so the two paths must differ.
require(dataRecordSourcePath != compactDataRecordSourcePath)
// Both sources are read at the same explicit version.
private val dataRecordSource =
VersionedKeyValSource[AggregationKey, (BatchID, DataRecord)](
path = dataRecordSourcePath,
sourceVersion = Some(version)
)
private val compactDataRecordSource =
VersionedKeyValSource[AggregationKey, (BatchID, CompactDataRecord)](
path = compactDataRecordSourcePath,
sourceVersion = Some(version)
)
// Re-key both stores by (aggregation key, batch id) so they can be joined.
private val dataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe
.from(dataRecordSource)
.map { case (key, (batchId, record)) => ((key, batchId), record) }
private val compactDataRecordPipe: TypedPipe[((AggregationKey, BatchID), DataRecord)] = TypedPipe
.from(compactDataRecordSource)
.map {
case (key, (batchId, compactRecord)) =>
// Convert compact records to DataRecords so both sides have the same type.
val record = compactConverter.compactDataRecordToDataRecord(compactRecord)
((key, batchId), record)
}
// Outer join keeps keys present in only one store; the per-key stat maps are
// summed into one map and written to the sink as (stat name, count) rows.
dataRecordPipe
.outerJoin(compactDataRecordPipe)
.mapValues { case (leftOpt, rightOpt) => compareDataRecords(leftOpt, rightOpt) }
.values
.sum(mapMonoid)
.flatMap(_.toList)
.write(TypedTsv(statsSink))
}
object AggregatesStoreComparisonJob {
// Monoid used to combine the per-key stat maps.
val mapMonoid: ScMapMonoid[String, Long] = new ScMapMonoid[String, Long]()
// Implicit injections and ordering for the key and record types of the two sources.
implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
implicit private val aggregationKeyOrdering: Ordering[AggregationKey] = AggregationKeyOrdering
implicit private val dataRecordCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
implicit private val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] =
CompactThriftCodec[CompactDataRecord]
private val compactConverter = new CompactDataRecordConverter
// Names of the comparison statistics emitted by compareDataRecords and
// getContinuousFeaturesStats.
val missingRecordFromLeft = "missingRecordFromLeft"
val missingRecordFromRight = "missingRecordFromRight"
val nonContinuousFeaturesDidNotMatch = "nonContinuousFeaturesDidNotMatch"
val missingFeaturesFromLeft = "missingFeaturesFromLeft"
val missingFeaturesFromRight = "missingFeaturesFromRight"
val recordsWithUnmatchedKeys = "recordsWithUnmatchedKeys"
val featureValuesMatched = "featureValuesMatched"
val featureValuesThatDidNotMatch = "featureValuesThatDidNotMatch"
val equalRecords = "equalRecords"
val keyCount = "keyCount"
/**
 * Compares the two records stored under one key and returns the stat counters
 * for that key. `keyCount` is always 1; the other entries describe the outcome.
 *
 * @param leftOpt  record from the left store, if the key exists there
 * @param rightOpt record from the right store, if the key exists there
 * @return map from stat name to its increment for this key
 * @throws IllegalArgumentException when both sides are empty
 */
def compareDataRecords(
  leftOpt: Option[DataRecord],
  rightOpt: Option[DataRecord]
): collection.Map[String, Long] = {
  val baseStats = collection.Map((keyCount, 1L))

  // Base stats plus a single counter set to 1.
  def withCounter(name: String): collection.Map[String, Long] =
    mapMonoid.add(baseStats, (name, 1L))

  (leftOpt, rightOpt) match {
    case (None, None) => throw new IllegalArgumentException("Should never be possible")
    case (None, Some(_)) => withCounter(missingRecordFromLeft)
    case (Some(_), None) => withCounter(missingRecordFromRight)
    case (Some(left), Some(right)) if isIdenticalNonContinuousFeatureSet(left, right) =>
      getContinuousFeaturesStats(left, right)
        .foldLeft(baseStats)((acc, stat) => mapMonoid.add(acc, stat))
    case (Some(_), Some(_)) => withCounter(nonContinuousFeaturesDidNotMatch)
  }
}
/**
 * Compares the continuous features of two records.
 *
 * When the feature id sets differ, reports the record as having unmatched keys
 * together with how many ids each side lacks. Otherwise compares the values with
 * a tolerance of 1e-5 and reports either an equal record or the number of
 * matching and non-matching values.
 */
private def getContinuousFeaturesStats(
  left: DataRecord,
  right: DataRecord
): Seq[(String, Long)] = {
  // A record without continuous features yields an empty map.
  def continuousFeaturesOf(record: DataRecord): Map[JLong, JDouble] =
    Option(record.getContinuousFeatures)
      .map(_.asScala.toMap)
      .getOrElse(Map.empty[JLong, JDouble])

  val leftFeatures = continuousFeaturesOf(left)
  val rightFeatures = continuousFeaturesOf(right)

  val numMissingFeaturesLeft = (rightFeatures.keySet diff leftFeatures.keySet).size
  val numMissingFeaturesRight = (leftFeatures.keySet diff rightFeatures.keySet).size

  if (numMissingFeaturesLeft != 0 || numMissingFeaturesRight != 0) {
    Seq(
      (recordsWithUnmatchedKeys, 1L),
      (missingFeaturesFromLeft, numMissingFeaturesLeft.toLong),
      (missingFeaturesFromRight, numMissingFeaturesRight.toLong)
    )
  } else {
    val Epsilon = 1e-5
    // The approximate match is to account for the precision loss due to
    // the Double -> Float -> Double conversion.
    val numUnmatchedValues: Long = leftFeatures.count {
      case (id, lValue) => !(math.abs(lValue - rightFeatures(id)) <= Epsilon)
    }.toLong

    if (numUnmatchedValues == 0) {
      Seq(
        (equalRecords, 1L),
        (featureValuesMatched, leftFeatures.size.toLong)
      )
    } else {
      val numMatchedValues =
        math.max(leftFeatures.size, rightFeatures.size) - numUnmatchedValues
      Seq(
        (featureValuesThatDidNotMatch, numUnmatchedValues),
        (featureValuesMatched, numMatchedValues)
      )
    }
  }
}
/**
* For feature types that are not Feature.Continuous. We expect these to match exactly in the two stores.
* Mutable change
*/
private def isIdenticalNonContinuousFeatureSet(left: DataRecord, right: DataRecord): Boolean = {
val booleanMatched = safeEquals(left.binaryFeatures, right.binaryFeatures)
val discreteMatched = safeEquals(left.discreteFeatures, right.discreteFeatures)
val stringMatched = safeEquals(left.stringFeatures, right.stringFeatures)
val sparseBinaryMatched = safeEquals(left.sparseBinaryFeatures, right.sparseBinaryFeatures)
val sparseContinuousMatched =
safeEquals(left.sparseContinuousFeatures, right.sparseContinuousFeatures)
val blobMatched = safeEquals(left.blobFeatures, right.blobFeatures)
val tensorsMatched = safeEquals(left.tensors, right.tensors)
val sparseTensorsMatched = safeEquals(left.sparseTensors, right.sparseTensors)
booleanMatched && discreteMatched && stringMatched && sparseBinaryMatched &&
sparseContinuousMatched && blobMatched && tensorsMatched && sparseTensorsMatched
}
def safeEquals[T](l: T, r: T): Boolean = Option(l).equals(Option(r))
}

View File

@ -1,216 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.bijection.Codec
import com.twitter.bijection.Injection
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures.TIMESTAMP
import com.twitter.ml.api.util.CompactDataRecordConverter
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.Args
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.storehaus_internal.manhattan.ManhattanROConfig
import com.twitter.summingbird.batch.option.Reducers
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.batch.Timestamp
import com.twitter.summingbird.option._
import com.twitter.summingbird.scalding.Scalding
import com.twitter.summingbird.scalding.batch.{BatchedStore => ScaldingBatchedStore}
import com.twitter.summingbird.Options
import com.twitter.summingbird.Producer
import com.twitter.summingbird_internal.bijection.BatchPairImplicits._
import com.twitter.summingbird_internal.runner.common.JobName
import com.twitter.summingbird_internal.runner.scalding.GenericRunner
import com.twitter.summingbird_internal.runner.scalding.ScaldingConfig
import com.twitter.summingbird_internal.runner.scalding.StatebirdState
import com.twitter.summingbird_internal.dalv2.DAL
import com.twitter.summingbird_internal.runner.store_config._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding.sources._
import job.AggregatesV2Job
import org.apache.hadoop.conf.Configuration
/*
* Offline scalding version of summingbird job to compute aggregates v2.
* This is loosely based on the template created by sb-gen.
* Extend this trait in your own scalding job, and override the val
* "aggregatesToCompute" with your own desired set of aggregates.
*/
trait AggregatesV2ScaldingJob {
// The aggregate groups this job computes; abstract, so each concrete job must supply it.
val aggregatesToCompute: Set[TypedAggregateGroup[_]]
// Implicit serialization and ordering instances for the key/value types used by the stores below.
implicit val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
implicit val aggregationKeyOrdering: AggregationKeyOrdering.type = AggregationKeyOrdering
implicit val dataRecordCodec: Injection[DataRecord, Array[Byte]] = CompactThriftCodec[DataRecord]
private implicit val compactDataRecordCodec: Injection[CompactDataRecord, Array[Byte]] =
CompactThriftCodec[CompactDataRecord]
private val compactDataRecordConverter = new CompactDataRecordConverter()
// Reducer count handed to the Summingbird `Reducers` option in generate().
// NOTE(review): -1 presumably means "no explicit reducer count" -- confirm against Summingbird.
def numReducers: Int = -1
/**
* Function that maps from a logical ''AggregateSource''
* to an underlying physical source. The physical source
* for the scalding platform is a ScaldingAggregateSource.
*
* @return the scalding producer for an OfflineAggregateSource, None for any other source type
*/
def dataRecordSourceToScalding(
source: AggregateSource
): Option[Producer[Scalding, DataRecord]] = {
source match {
case offlineSource: OfflineAggregateSource =>
Some(ScaldingAggregateSource(offlineSource).source)
case _ => None
}
}
/**
* Creates and returns a versioned store using the config parameters
* with a specific number of versions to keep, and which can read from
* the most recent available version on HDFS rather than a specific
* version number. The store applies a timestamp correction based on the
* number of days of aggregate data skipped over at read time to ensure
* that skipping data plays nicely with halfLife decay.
*
* @param config specifying the Manhattan store parameters
* @param versionsToKeep number of old versions to keep
* @param lagCorrector applied by the store to each value read, together with the lag in milliseconds
* @param packer converts an in-memory value to the type written to the store
* @param unpacker converts a stored value back to its in-memory type
*/
def getMostRecentLagCorrectingVersionedStoreWithRetention[
Key: Codec: Ordering,
ValInStore: Codec,
ValInMemory
](
config: OfflineStoreOnlyConfig[ManhattanROConfig],
versionsToKeep: Int,
lagCorrector: (ValInMemory, Long) => ValInMemory,
packer: ValInMemory => ValInStore,
unpacker: ValInStore => ValInMemory
): ScaldingBatchedStore[Key, ValInMemory] = {
// The second argument list passes explicitly what the store's apply declares as implicit parameters.
MostRecentLagCorrectingVersionedStore[Key, ValInStore, ValInMemory](
config.offline.hdfsPath.toString,
packer = packer,
unpacker = unpacker,
versionsToKeep = versionsToKeep)(
Injection.connect[(Key, (BatchID, ValInStore)), (Array[Byte], Array[Byte])],
config.batcher,
implicitly[Ordering[Key]],
lagCorrector
).withInitialBatch(config.batcher.batchOf(config.startTime.value))
}
/**
* Adds `lagToCorrectMillis` to the record's TIMESTAMP feature when that feature is present,
* and returns the same `record` instance that was passed in.
* NOTE(review): relies on SRichDataRecord writing through to the wrapped record -- confirm.
*/
def mutablyCorrectDataRecordTimestamp(
record: DataRecord,
lagToCorrectMillis: Long
): DataRecord = {
val richRecord = SRichDataRecord(record)
if (richRecord.hasFeature(TIMESTAMP)) {
val timestamp = richRecord.getFeatureValue(TIMESTAMP).toLong
richRecord.setFeatureValue(TIMESTAMP, timestamp + lagToCorrectMillis)
}
record
}
/**
* Function that maps from a logical ''AggregateStore''
* to an underlying physical store. The physical store for
* scalding is a HDFS VersionedKeyValSource dataset.
*
* @return a store for the two offline store types handled below, None for any other store type
*/
def aggregateStoreToScalding(
store: AggregateStore
): Option[Scalding#Store[AggregationKey, DataRecord]] = {
store match {
// Plain offline store: lag-correcting versioned store, DataRecords stored as-is (identity pack/unpack).
case offlineStore: OfflineAggregateDataRecordStore =>
Some(
getMostRecentLagCorrectingVersionedStoreWithRetention[
AggregationKey,
DataRecord,
DataRecord](
offlineStore,
versionsToKeep = offlineStore.batchesToKeep,
lagCorrector = mutablyCorrectDataRecordTimestamp,
packer = Injection.identity[DataRecord],
unpacker = Injection.identity[DataRecord]
)
)
// DAL-backed offline store: built through DAL.versionedKeyValStore; no lagCorrector is passed here.
case offlineStore: OfflineAggregateDataRecordStoreWithDAL =>
Some(
DAL.versionedKeyValStore[AggregationKey, DataRecord](
dataset = offlineStore.dalDataset,
pathLayout = D.Suffix(offlineStore.offline.hdfsPath.toString),
batcher = offlineStore.batcher,
maybeStartTime = Some(offlineStore.startTime),
maxErrors = offlineStore.maxKvSourceFailures
))
case _ => None
}
}
/**
* Builds the ScaldingConfig for this job from command-line args.
* Reads the required arg "job_name" and the optional args "input_sources", "output_stores",
* "statebird_service_destination" and "statebird_client_id_name".
*/
def generate(args: Args): ScaldingConfig = new ScaldingConfig {
val jobName = JobName(args("job_name"))
/*
* Add registrars for chill serialization for user-defined types.
* We use the default: an empty List().
*/
override def registrars = List()
/* Use transformConfig to set Hadoop options. */
// Enables block-level LZO compression of the job output.
override def transformConfig(config: Map[String, AnyRef]): Map[String, AnyRef] =
super.transformConfig(config) ++ Map(
"mapreduce.output.fileoutputformat.compress" -> "true",
"mapreduce.output.fileoutputformat.compress.codec" -> "com.hadoop.compression.lzo.LzoCodec",
"mapreduce.output.fileoutputformat.compress.type" -> "BLOCK"
)
/*
* Use getNamedOptions to set Summingbird runtime options
* The options we set are:
* 1) Set monoid to non-commutative to disable map-side
* aggregation and force all aggregation to reducers (provides a 20% speedup)
* 2) Set the number of reducers from numReducers
*/
override def getNamedOptions: Map[String, Options] = Map(
"DEFAULT" -> Options()
.set(MonoidIsCommutative(false))
.set(Reducers(numReducers))
)
// The job runs on 24-hour batches.
implicit val batcher: Batcher = Batcher.ofHours(24)
/* State implementation that uses Statebird (go/statebird) to track the batches processed. */
def getWaitingState(hadoopConfig: Configuration, startDate: Option[Timestamp], batches: Int) =
StatebirdState(
jobName,
startDate,
batches,
args.optional("statebird_service_destination"),
args.optional("statebird_client_id_name")
)(batcher)
// Optional comma-separated filters; the values are split on "," without trimming whitespace.
val sourceNameFilter: Option[Set[String]] =
args.optional("input_sources").map(_.split(",").toSet)
val storeNameFilter: Option[Set[String]] =
args.optional("output_stores").map(_.split(",").toSet)
// Restrict aggregatesToCompute using the optional source/store name filters above.
val filteredAggregates =
AggregatesV2Job.filterAggregates(
aggregates = aggregatesToCompute,
sourceNames = sourceNameFilter,
storeNames = storeNameFilter
)
// Job graph over the filtered aggregates, wired to the scalding source/store mappings of this trait.
override val graph =
AggregatesV2Job.generateJobGraph[Scalding](
filteredAggregates,
dataRecordSourceToScalding,
aggregateStoreToScalding
)(DataRecordAggregationMonoid(filteredAggregates))
}
/** Entry point: hands the raw args and the config builder above to GenericRunner. */
def main(args: Array[String]): Unit = {
GenericRunner(args, generate(_))
}
}

View File

@ -1,17 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization
/**
 * Ordering over [[AggregationKey]] that delegates to the macro-generated ordered
 * serialization of the key's two feature maps.
 */
object AggregationKeyOrdering extends Ordering[AggregationKey] {
  implicit val featureMapsOrdering: MacroEqualityOrderedSerialization[
    (Map[Long, Long], Map[Long, String])
  ] = ordSer[(Map[Long, Long], Map[Long, String])]

  /**
   * Compares two keys by their feature maps.
   *
   * The keys are taken apart with the extractor pattern rather than `unapply(..).get`,
   * so no `Option.get` is involved.
   */
  override def compare(left: AggregationKey, right: AggregationKey): Int =
    (left, right) match {
      case (AggregationKey(leftFirst, leftSecond), AggregationKey(rightFirst, rightSecond)) =>
        featureMapsOrdering.compare((leftFirst, leftSecond), (rightFirst, rightSecond))
    }
}

View File

@ -1,72 +0,0 @@
# Scala library built from every *.scala file in this directory.
# No explicit `name` is given; the `bin` target in this file depends on it as ":scalding".
# NOTE(review): presumably the build tool defaults the target name to the directory name -- confirm.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/com/twitter/chill:bijection",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/src/jvm/com/twitter/scalding:args",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:batch-hadoop",
"3rdparty/src/jvm/com/twitter/summingbird:chill",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"3rdparty/src/jvm/com/twitter/summingbird:scalding",
"finagle/finagle-core/src/main",
"gizmoduck/snapshot/src/main/scala/com/twitter/gizmoduck/snapshot:deleted_user-scala",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/scalding_internal/util",
"src/scala/com/twitter/storehaus_internal/manhattan/config",
"src/scala/com/twitter/storehaus_internal/offline",
"src/scala/com/twitter/storehaus_internal/util",
"src/scala/com/twitter/summingbird_internal/bijection",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/scala/com/twitter/summingbird_internal/dalv2",
"src/scala/com/twitter/summingbird_internal/runner/common",
"src/scala/com/twitter/summingbird_internal/runner/scalding",
"src/scala/com/twitter/summingbird_internal/runner/store_config",
"src/scala/com/twitter/summingbird_internal/runner/store_config/versioned_store",
"src/scala/com/twitter/summingbird_internal/sources/common",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"src/thrift/com/twitter/statebird:compiled-v2-java",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/aggregation_framework:user_job",
"timelines/data_processing/ml_util/aggregation_framework/scalding/sources",
"timelines/data_processing/ml_util/sampling:sampling_utils",
],
# These three entries also appear in `dependencies` above.
# NOTE(review): presumably `exports` makes them visible to targets depending on this library -- confirm.
exports = [
"3rdparty/src/jvm/com/twitter/summingbird:scalding",
"src/scala/com/twitter/storehaus_internal/manhattan/config",
"src/scala/com/twitter/summingbird_internal/runner/store_config",
],
)
# Hadoop binary target `bin`, with basename "aggregation_framework_scalding-deploy".
# Its main class is com.twitter.scalding.Tool and its only dependency is the
# ":scalding" library target.
hadoop_binary(
name = "bin",
basename = "aggregation_framework_scalding-deploy",
main = "com.twitter.scalding.Tool",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":scalding",
],
)

View File

@ -1,97 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.gizmoduck.snapshot.DeletedUserScalaDataset
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.DateOps
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.RichDate
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.scalding_internal.job.RequiredBinaryComparators.ordSer
import com.twitter.scalding_internal.pruner.Pruner
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.scalding.serialization.macros.impl.ordered_serialization.runtime_helpers.MacroEqualityOrderedSerialization
import java.{util => ju}
/**
 * Pruner keyed on a sequence of user ids: a value is dropped when its id sequence
 * contains at least one deleted user.
 */
object DeletedUserSeqPruner extends Pruner[Seq[Long]] {
  implicit val tz: ju.TimeZone = DateOps.UTC
  implicit val userIdSequenceOrdering: MacroEqualityOrderedSerialization[Seq[Long]] =
    ordSer[Seq[Long]]

  /**
   * Removes from `input` every value whose extracted user-id sequence shares at least
   * one id with `deletedUsers`.
   *
   * @param input        values to filter
   * @param extractor    yields the user ids associated with a value
   * @param deletedUsers ids of deleted users
   * @return the values of `input` whose id sequence contains no deleted user
   */
  private[scalding] def pruneDeletedUsers[T](
    input: TypedPipe[T],
    extractor: T => Seq[Long],
    deletedUsers: TypedPipe[Long]
  ): TypedPipe[T] = {
    val valuesByUserIds: TypedPipe[(Seq[Long], T)] =
      input.map(value => (extractor(value), value))

    // Explode each id sequence into (userId, sequence) pairs, keep the pairs whose
    // userId is a deleted user, and collect the distinct offending sequences.
    // This is efficient as long as the number of deleted users is small.
    val sequencesWithDeletedUser = valuesByUserIds
      .flatMap { case (userIds, _) => for (userId <- userIds) yield (userId, userIds) }
      .leftJoin(deletedUsers.asKeys)
      .collect { case (_, (userIds, Some(_))) => userIds }
      .distinct

    // Keep only the values whose id sequence is not among the offending ones.
    valuesByUserIds
      .leftJoin(sequencesWithDeletedUser.asKeys)
      .collect { case (_, (value, None)) => value }
  }

  /**
   * Prunes `input` against the most recent deleted-user snapshot found in the seven days
   * up to `writeTime`. The `put` function is not used.
   */
  override def prune[T](
    input: TypedPipe[T],
    put: (T, Seq[Long]) => Option[T],
    get: T => Seq[Long],
    writeTime: RichDate
  ): TypedPipe[T] = {
    val snapshotWindow = DateRange(writeTime - Days(7), writeTime)
    val deletedUserIds = DAL
      .readMostRecentSnapshot(DeletedUserScalaDataset, snapshotWindow)
      .withRemoteReadPolicy(AllowCrossClusterSameDC)
      .toTypedPipe
      .map(_.userId)
    pruneDeletedUsers(input, get, deletedUserIds)
  }
}
object AggregationKeyPruner {

  /**
   * Builds a pruner that drops aggregate records whose aggregation key holds, in any of
   * the given "userIdFeatures", the id of a user who has deleted their account.
   * "userIdFeatures" is a catch-all for every feature that identifies a Twitter user in
   * the input data record: authorId, retweeterId, engagerId, and so on.
   *
   * @param userIdFeatures features of the aggregation key that carry a user id
   */
  def mkDeletedUsersPruner(
    userIdFeatures: Seq[Feature[_]]
  ): Pruner[(AggregationKey, DataRecord)] = {
    val userIdFeatureIds = userIdFeatures.map(TypedAggregateGroup.getDenseFeatureId)

    // Collects the user ids present in the key: the discrete value for a feature id is
    // preferred, falling back to the text value parsed as a Long.
    def getter(tupled: (AggregationKey, DataRecord)): Seq[Long] = {
      val aggregationKey = tupled._1
      userIdFeatureIds.flatMap { featureId =>
        val discreteValue = aggregationKey.discreteFeaturesById.get(featureId)
        discreteValue.orElse(aggregationKey.textFeaturesById.get(featureId).map(_.toLong))
      }
    }

    // The put function is never used by pruneDeletedUsers; it exists only because the
    // xmap api requires one, so it always returns None.
    def putter: ((AggregationKey, DataRecord), Seq[Long]) => Option[(AggregationKey, DataRecord)] =
      (_, _) => None

    DeletedUserSeqPruner.xmap(putter, getter)
  }
}

View File

@ -1,100 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding
import com.twitter.bijection.Injection
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.{Hdfs => HdfsMode}
import com.twitter.summingbird.batch.store.HDFSMetadata
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird.batch.OrderedFromOrderingExt
import com.twitter.summingbird.batch.PrunedSpace
import com.twitter.summingbird.scalding._
import com.twitter.summingbird.scalding.store.VersionedBatchStore
import org.slf4j.LoggerFactory
object MostRecentLagCorrectingVersionedStore {

  /**
   * Builds a [[MostRecentLagCorrectingVersionedBatchStore]] whose stored value is the pair
   * (BatchID, packed value).
   *
   * On write, each value is packed with `packer` and stored next to `batchID.next`; on
   * read, the stored BatchID is discarded and the value is restored with `unpacker`.
   *
   * @param rootPath       root path of the versioned store
   * @param packer         converts an in-memory value to the stored type
   * @param unpacker       converts a stored value back to the in-memory type
   * @param versionsToKeep number of versions to retain
   * @param prunedSpace    pruning applied by the store; never prunes by default
   * @param lagCorrector   applied to every value read, together with the lag in milliseconds
   */
  def apply[Key, ValInStore, ValInMemory](
    rootPath: String,
    packer: ValInMemory => ValInStore,
    unpacker: ValInStore => ValInMemory,
    versionsToKeep: Int = VersionedKeyValSource.defaultVersionsToKeep,
    prunedSpace: PrunedSpace[(Key, ValInMemory)] = PrunedSpace.neverPruned
  )(
    implicit injection: Injection[(Key, (BatchID, ValInStore)), (Array[Byte], Array[Byte])],
    batcher: Batcher,
    ord: Ordering[Key],
    lagCorrector: (ValInMemory, Long) => ValInMemory
  ): MostRecentLagCorrectingVersionedBatchStore[Key, ValInMemory, Key, (BatchID, ValInStore)] = {
    new MostRecentLagCorrectingVersionedBatchStore[Key, ValInMemory, Key, (BatchID, ValInStore)](
      rootPath,
      versionsToKeep,
      batcher
    )(lagCorrector)({ case (batchID, (k, v)) => (k, (batchID.next, packer(v))) })({
      case (k, (_, v)) => (k, unpacker(v))
    }) {
      // Only the final batch of the supplied list is selected. `lastOption` keeps this
      // total: an empty list yields an empty selection instead of throwing
      // NoSuchElementException from `last`.
      override def select(b: List[BatchID]): List[BatchID] = b.lastOption.toList
      override def pruning: PrunedSpace[(Key, ValInMemory)] = prunedSpace
    }
  }
}
/**
* @param lagCorrector lagCorrector allows one to take data from one batch and pretend as if it
* came from a different batch.
* @param pack Converts the in-memory tuples to the type used by the underlying key-val store.
* @param unpack Converts the key-val tuples from the store in the form used by the calling object.
*/
class MostRecentLagCorrectingVersionedBatchStore[KeyInMemory, ValInMemory, KeyInStore, ValInStore](
  rootPath: String,
  versionsToKeep: Int,
  override val batcher: Batcher
)(
  lagCorrector: (ValInMemory, Long) => ValInMemory
)(
  pack: (BatchID, (KeyInMemory, ValInMemory)) => (KeyInStore, ValInStore)
)(
  unpack: ((KeyInStore, ValInStore)) => (KeyInMemory, ValInMemory)
)(
  implicit @transient injection: Injection[(KeyInStore, ValInStore), (Array[Byte], Array[Byte])],
  override val ordering: Ordering[KeyInMemory])
    extends VersionedBatchStore[KeyInMemory, ValInMemory, KeyInStore, ValInStore](
      rootPath,
      versionsToKeep,
      batcher)(pack)(unpack)(injection, ordering) {
  // Brings the `<` / `>` comparisons on BatchID used below into scope.
  import OrderedFromOrderingExt._

  @transient private val logger =
    LoggerFactory.getLogger(classOf[MostRecentLagCorrectingVersionedBatchStore[_, _, _, _]])

  /**
   * Finds the newest version on HDFS whose batch is strictly before `exclusiveUB` and
   * presents it as if it were the batch immediately preceding `exclusiveUB`.
   *
   * Every value read is passed through `lagCorrector` together with the difference
   * between the pretended version and the version actually read.
   *
   * @param exclusiveUB exclusive upper bound on the batch to read
   * @param mode        HDFS mode supplying the Hadoop configuration
   * @return the pretended batch id with the lag-corrected pipe, or None when no stored
   *         version precedes `exclusiveUB`
   */
  override protected def lastBatch(
    exclusiveUB: BatchID,
    mode: HdfsMode
  ): Option[(BatchID, FlowProducer[TypedPipe[(KeyInMemory, ValInMemory)]])] = {
    val batchToPretendAs = exclusiveUB.prev
    val versionToPretendAs = batchIDToVersion(batchToPretendAs)
    logger.info(
      s"Most recent lag correcting versioned batched store at $rootPath entering lastBatch method versionToPretendAs = $versionToPretendAs")
    val meta = new HDFSMetadata(mode.conf, rootPath)
    meta.versions
      .map { ver => (versionToBatchID(ver), readVersion(ver)) }
      .filter { case (batchID, _) => batchID < exclusiveUB }
      // Keep the entry with the greatest batch id.
      .reduceOption { (a, b) => if (a._1 > b._1) a else b }
      .map {
        // Plain tuple pattern: the element types are already known statically, so no
        // (unchecked) type patterns on erased generics are needed.
        case (lastBatchID, flowProducer) =>
          val lastVersion = batchIDToVersion(lastBatchID)
          // Reuses the two versions computed above instead of converting the batch ids again.
          val lagToCorrectMillis: Long = versionToPretendAs - lastVersion
          logger.info(
            s"Most recent available version is $lastVersion, so lagToCorrectMillis is $lagToCorrectMillis")
          val lagCorrectedFlowProducer = flowProducer.map {
            pipe: TypedPipe[(KeyInMemory, ValInMemory)] =>
              pipe.map { case (k, v) => (k, lagCorrector(v, lagToCorrectMillis)) }
          }
          (batchToPretendAs, lagCorrectedFlowProducer)
      }
  }
}

View File

@ -1,26 +0,0 @@
# Scala library built from every *.scala file in this directory.
# NOTE(review): no explicit `name` is given; presumably it defaults to the directory name -- confirm.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:batch-hadoop",
"3rdparty/src/jvm/com/twitter/summingbird:chill",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"3rdparty/src/jvm/com/twitter/summingbird:scalding",
"src/java/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/internal",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/dalv2/remote_access",
"src/scala/com/twitter/summingbird_internal/sources/common",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
],
)

View File

@ -1,77 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.scalding.sources
import com.twitter.ml.api.DailySuffixFeatureSource
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.FixedPathFeatureSource
import com.twitter.ml.api.HourlySuffixFeatureSource
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding._
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.dalv2.remote_access.AllowCrossClusterSameDC
import com.twitter.statebird.v2.thriftscala.Environment
import com.twitter.summingbird._
import com.twitter.summingbird.scalding.Scalding.pipeFactoryExact
import com.twitter.summingbird.scalding._
import com.twitter.summingbird_internal.sources.SourceFactory
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.OfflineAggregateSource
import java.lang.{Long => JLong}
/*
* Summingbird offline HDFS source that reads from data records on HDFS.
*
* @param offlineSource Underlying offline source that contains
* all the config info to build this platform-specific (scalding) source.
*/
case class ScaldingAggregateSource(offlineSource: OfflineAggregateSource)
    extends SourceFactory[Scalding, DataRecord] {

  // Path to read from; empty string when the offline source does not define one.
  val hdfsPath: String = offlineSource.scaldingHdfsPath.getOrElse("")
  // One of "daily", "hourly", "fixed_path" or "dal"; "daily" when not configured.
  val suffixType: String = offlineSource.scaldingSuffixType.getOrElse("daily")
  val withValidation: Boolean = offlineSource.withValidation
  def name: String = offlineSource.name
  def description: String =
    "Summingbird offline source that reads from data records at: " + hdfsPath

  // Event time of a record is the value of the source's configured timestamp feature.
  implicit val timeExtractor: TimeExtractor[DataRecord] = TimeExtractor((record: DataRecord) =>
    SRichDataRecord(record).getFeatureValue[JLong, JLong](offlineSource.timestampFeature))

  /**
   * Returns the physical source to read for `dateRange`, chosen by `suffixType`.
   *
   * @throws IllegalArgumentException if `suffixType` is "dal" but no DAL dataset is
   *                                  configured, or if `suffixType` is not one of the
   *                                  supported values
   */
  def getSourceForDateRange(dateRange: DateRange) = {
    suffixType match {
      case "daily" => DailySuffixFeatureSource(hdfsPath)(dateRange).source
      case "hourly" => HourlySuffixFeatureSource(hdfsPath)(dateRange).source
      case "fixed_path" => FixedPathFeatureSource(hdfsPath).source
      case "dal" =>
        offlineSource.dalDataSet match {
          case Some(dataset) =>
            DAL
              .read(dataset, dateRange)
              .withRemoteReadPolicy(AllowCrossClusterSameDC)
              .withEnvironment(Environment.Prod)
              .toTypedSource
          case _ =>
            throw new IllegalArgumentException(
              "cannot provide an empty dataset when defining DAL as the suffix type"
            )
        }
      // Without this case a misconfigured suffix type surfaced as an opaque scala.MatchError.
      case unsupported =>
        throw new IllegalArgumentException(
          s"unsupported suffix type '$unsupported'; expected one of: daily, hourly, fixed_path, dal"
        )
    }
  }

  /**
   * This method is similar to [[Scalding.sourceFromMappable]] except that this uses [[pipeFactoryExact]]
   * instead of [[pipeFactory]]. [[pipeFactoryExact]] also invokes [[FileSource.validateTaps]] on the source.
   * The validation ensures the presence of _SUCCESS file before processing. For more details, please refer to
   * https://jira.twitter.biz/browse/TQ-10618
   */
  def sourceFromMappableWithValidation[T: TimeExtractor: Manifest](
    factory: (DateRange) => Mappable[T]
  ): Producer[Scalding, T] = {
    Producer.source[Scalding, T](pipeFactoryExact(factory))
  }

  /** The Summingbird producer for this source, built with validation when `withValidation` is set. */
  def source: Producer[Scalding, DataRecord] = {
    if (withValidation)
      sourceFromMappableWithValidation(getSourceForDateRange)
    else
      Scalding.sourceFromMappable(getSourceForDateRange)
  }
}