[docx] split commit for file 5600

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
Ari Archer 2024-01-23 19:19:19 +02:00
parent f0aa618947
commit be139a2dd1
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 17776 deletions

View File

@ -1,114 +0,0 @@
package com.twitter.timelineranker.util
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.servo.util.Gate
import com.twitter.timelineranker.contentfeatures.ContentFeaturesProvider
import com.twitter.timelineranker.core.HydratedTweets
import com.twitter.timelineranker.model.RecapQuery
import com.twitter.timelineranker.recap.model.ContentFeatures
import com.twitter.timelines.clients.tweetypie.TweetyPieClient
import com.twitter.timelines.model.TweetId
import com.twitter.timelines.model.tweet.HydratedTweet
import com.twitter.timelines.util.FailOpenHandler
import com.twitter.tweetypie.thriftscala.MediaEntity
import com.twitter.tweetypie.thriftscala.TweetInclude
import com.twitter.tweetypie.thriftscala.{Tweet => TTweet}
import com.twitter.util.Future
/**
 * Constants shared by [[TweetypieContentFeaturesProvider]] instances: the sets of TweetyPie
 * fields requested during hydration and the empty hydration results.
 */
object TweetypieContentFeaturesProvider {

  // Include selecting the additional-metadata field of media entities.
  private[this] val MediaAdditionalMetadataInclude: TweetInclude =
    TweetInclude.MediaEntityFieldId(MediaEntity.AdditionalMetadataField.id)

  // Include selecting the Escherbird entity annotations field of a tweet (semantic core).
  private[this] val EscherbirdAnnotationsInclude: TweetInclude =
    TweetInclude.TweetFieldId(TTweet.EscherbirdEntityAnnotationsField.id)

  /** Core, media and self-thread fields plus the media additional-metadata field. */
  val DefaultTweetyPieFieldsToHydrate: Set[TweetInclude] = {
    val clientFields =
      TweetyPieClient.CoreTweetFields ++
        TweetyPieClient.MediaFields ++
        TweetyPieClient.SelfThreadFields
    clientFields ++ Set[TweetInclude](MediaAdditionalMetadataInclude)
  }

  /** The default fields extended with the tweet fields coming from semantic core. */
  val TweetyPieFieldsToHydrate: Set[TweetInclude] =
    DefaultTweetyPieFieldsToHydrate ++ Set[TweetInclude](EscherbirdAnnotationsInclude)

  /** Hydration result holding no outer and no inner tweets. */
  val EmptyHydratedTweets: HydratedTweets = {
    val noOuterTweets = Seq.empty[HydratedTweet]
    val noInnerTweets = Seq.empty[HydratedTweet]
    HydratedTweets(noOuterTweets, noInnerTweets)
  }

  /** Already-satisfied future of [[EmptyHydratedTweets]]. */
  val EmptyHydratedTweetsFuture: Future[HydratedTweets] = Future.value(EmptyHydratedTweets)
}
/**
 * [[ContentFeaturesProvider]] that hydrates the requested tweets through the supplied
 * [[TweetHydrator]] and derives [[ContentFeatures]] for every hydrated outer tweet: self-thread
 * metadata, text features, media features, annotation features and, when enabled,
 * conversation control.
 *
 * @param tweetHydrator hydrates tweet IDs with the selected TweetyPie fields.
 * @param enableContentFeaturesGate its value is passed to both the text-feature and the
 *                                  annotation-feature extractors, and selects whether the
 *                                  semantic core fields are hydrated.
 * @param enableTokensInContentFeaturesGate its value is passed to the text-feature extractor.
 * @param enableTweetTextInContentFeaturesGate its value is passed to the text-feature extractor.
 * @param enableConversationControlContentFeaturesGate selects whether the conversation control
 *                                                     field is hydrated and copied into the
 *                                                     content features.
 * @param enableTweetMediaHydrationGate its value is passed to the media-feature extractor.
 * @param statsReceiver parent receiver; this class scopes it under
 *                      "TweetypieContentFeaturesProvider".
 */
class TweetypieContentFeaturesProvider(
  tweetHydrator: TweetHydrator,
  enableContentFeaturesGate: Gate[RecapQuery],
  enableTokensInContentFeaturesGate: Gate[RecapQuery],
  enableTweetTextInContentFeaturesGate: Gate[RecapQuery],
  enableConversationControlContentFeaturesGate: Gate[RecapQuery],
  enableTweetMediaHydrationGate: Gate[RecapQuery],
  statsReceiver: StatsReceiver)
    extends ContentFeaturesProvider {

  val scopedStatsReceiver: StatsReceiver = statsReceiver.scope("TweetypieContentFeaturesProvider")

  /**
   * Hydrates `tweetIds` on behalf of `query.userId` and builds content features for each
   * hydrated outer tweet.
   *
   * @param query    request whose gates decide which fields and features are produced.
   * @param tweetIds IDs of the tweets to hydrate.
   * @return map from tweet ID to its content features; the fallback passed to the fail-open
   *         handler yields an empty hydration result, which produces an empty map.
   */
  override def apply(
    query: RecapQuery,
    tweetIds: Seq[TweetId]
  ): Future[Map[TweetId, ContentFeatures]] = {
    import TweetypieContentFeaturesProvider._

    val tweetypieHydrationHandler = new FailOpenHandler(scopedStatsReceiver)

    // Every gate depends only on the query, so each one is evaluated up front, once per request.
    val hydratePenguinTextFeatures = enableContentFeaturesGate(query)
    val hydrateSemanticCoreFeatures = enableContentFeaturesGate(query)
    val hydrateTokens = enableTokensInContentFeaturesGate(query)
    val hydrateTweetText = enableTweetTextInContentFeaturesGate(query)
    val hydrateConversationControl = enableConversationControlContentFeaturesGate(query)
    // Evaluated once per request rather than once per tweet, so that all tweets of a
    // response are treated consistently and the gate is not re-run inside the loop.
    val hydrateTweetMedia = enableTweetMediaHydrationGate(query)

    val userId = query.userId

    val hydratedTweetsFuture = tweetypieHydrationHandler {
      // tweetyPie fields to hydrate given hydrateSemanticCoreFeatures
      val fieldsToHydrateWithSemanticCore =
        if (hydrateSemanticCoreFeatures) TweetyPieFieldsToHydrate
        else DefaultTweetyPieFieldsToHydrate

      // tweetyPie fields to hydrate given hydrateSemanticCoreFeatures & hydrateConversationControl
      val fieldsToHydrateWithConversationControl =
        if (hydrateConversationControl) {
          fieldsToHydrateWithSemanticCore ++ TweetyPieClient.ConversationControlField
        } else {
          fieldsToHydrateWithSemanticCore
        }

      tweetHydrator.hydrate(Some(userId), tweetIds, fieldsToHydrateWithConversationControl)
    } { _: Throwable => EmptyHydratedTweetsFuture }

    hydratedTweetsFuture.map[Map[TweetId, ContentFeatures]] { hydratedTweets =>
      hydratedTweets.outerTweets.map { hydratedTweet =>
        val contentFeaturesFromTweet = ContentFeatures.Empty.copy(
          selfThreadMetadata = hydratedTweet.tweet.selfThreadMetadata
        )

        val contentFeaturesWithText = TweetTextFeaturesExtractor.addTextFeaturesFromTweet(
          contentFeaturesFromTweet,
          hydratedTweet.tweet,
          hydratePenguinTextFeatures,
          hydrateTokens,
          hydrateTweetText
        )

        val contentFeaturesWithMedia = TweetMediaFeaturesExtractor.addMediaFeaturesFromTweet(
          contentFeaturesWithText,
          hydratedTweet.tweet,
          hydrateTweetMedia
        )

        val contentFeaturesWithAnnotations = TweetAnnotationFeaturesExtractor
          .addAnnotationFeaturesFromTweet(
            contentFeaturesWithMedia,
            hydratedTweet.tweet,
            hydrateSemanticCoreFeatures
          )

        // add conversationControl to content features if hydrateConversationControl is true
        val contentFeatures =
          if (hydrateConversationControl) {
            contentFeaturesWithAnnotations.copy(
              conversationControl = hydratedTweet.tweet.conversationControl
            )
          } else {
            contentFeaturesWithAnnotations
          }

        hydratedTweet.tweetId -> contentFeatures
      }.toMap
    }
  }
}

View File

@ -1,17 +0,0 @@
# Scala library built from every *.scala file in this directory, compiled with the
# "fatal_warnings" compiler option set.
scala_library(
sources = ["*.scala"],
compiler_option_sets = ["fatal_warnings"],
tags = ["bazel-compatible"],
dependencies = [
"servo/repo/src/main/scala",
"src/thrift/com/twitter/wtf/candidate:wtf-candidate-scala",
"timelineranker/server/src/main/scala/com/twitter/timelineranker/core",
"timelines:visibility",
"timelines/src/main/scala/com/twitter/timelines/clients/socialgraph",
"timelines/src/main/scala/com/twitter/timelines/util",
"timelines/src/main/scala/com/twitter/timelines/util/stats",
"util/util-core:util-core-util",
"util/util-logging/src/main/scala",
"util/util-stats/src/main/scala",
],
)

View File

@ -1,25 +0,0 @@
package com.twitter.timelineranker.visibility
import com.twitter.timelineranker.core.FollowGraphData
import com.twitter.timelineranker.core.FollowGraphDataFuture
import com.twitter.timelines.model.UserId
import com.twitter.util.Future
/**
* Source of follow graph data for a user.
*/
trait FollowGraphDataProvider {
/**
* Gets follow graph data for the given user.
*
* @param userId user whose follow graph details are to be obtained.
* @param maxFollowingCount Maximum number of followed user IDs to fetch.
* If the given user follows more than these many users,
* then the most recent maxFollowingCount users are returned.
*/
def get(userId: UserId, maxFollowingCount: Int): Future[FollowGraphData]
/**
* Same request as `get`, but the result is returned as a [[FollowGraphDataFuture]]
* instead of a `Future[FollowGraphData]`.
*
* @param userId user whose follow graph details are to be obtained.
* @param maxFollowingCount Maximum number of followed user IDs to fetch.
*/
def getAsync(userId: UserId, maxFollowingCount: Int): FollowGraphDataFuture
/**
* Gets only the IDs of the users followed by the given user.
*
* @param userId user whose followings are to be obtained.
* @param maxFollowingCount Maximum number of followed user IDs to fetch.
*/
def getFollowing(userId: UserId, maxFollowingCount: Int): Future[Seq[UserId]]
/**
* Gets the mutually-following user IDs for the given user and candidate followings.
* NOTE(review): presumably the result is the subset of followingIds that also follow
* userId; confirm against the implementations.
*
* @param userId user whose mutual follows are to be obtained.
* @param followingIds user IDs to check.
*/
def getMutuallyFollowingUserIds(userId: UserId, followingIds: Seq[UserId]): Future[Set[UserId]]
}

View File

@ -1,134 +0,0 @@
package com.twitter.timelineranker.visibility
import com.twitter.finagle.stats.Stat
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.servo.repository.KeyValueRepository
import com.twitter.servo.util.Gate
import com.twitter.timelineranker.core.FollowGraphData
import com.twitter.timelineranker.core.FollowGraphDataFuture
import com.twitter.timelines.clients.socialgraph.SocialGraphClient
import com.twitter.timelines.model.UserId
import com.twitter.timelines.util.FailOpenHandler
import com.twitter.util.Future
import com.twitter.util.Stopwatch
import com.twitter.wtf.candidate.thriftscala.CandidateSeq
/** Constants shared by [[RealGraphFollowGraphDataProvider]] instances. */
object RealGraphFollowGraphDataProvider {

  /** A RealGraph response that carries no candidates. */
  val EmptyRealGraphResponse: CandidateSeq =
    CandidateSeq(Seq.empty)
}
/**
 * Wraps an underlying FollowGraphDataProvider (which in practice will usually be a
 * [[SgsFollowGraphDataProvider]]) and supplements the list of followings provided by the
 * underlying provider with additional followings fetched from RealGraph if it looks like the
 * underlying provider did not get the full list of the user's followings.
 *
 * First checks whether the size of the underlying following list is >= the max requested following
 * count, which implies that there were additional followings beyond the max requested count. If so,
 * fetches the full set of followings from RealGraph (go/realgraph), which will be at most 2000.
 *
 * Because the RealGraph dataset is not realtime and thus can potentially include stale followings,
 * the provider confirms that the followings fetched from RealGraph are valid using SGS's
 * getFollowOverlap method, and then merges the valid RealGraph followings with the underlying
 * followings.
 *
 * Note that this supplementing is expected to be very rare as most users do not have more than
 * the max followings we fetch from SGS. Also note that this class is mainly intended for use
 * in the home timeline materialization path, with the goal of preventing a case where users
 * who follow a very large number of accounts may not see Tweets from their earlier follows if we
 * used SGS-based follow fetching alone.
 *
 * @param underlying provider whose results are supplemented.
 * @param realGraphClient repository queried with the user ID to obtain RealGraph candidates.
 * @param socialGraphClient used to verify RealGraph followings via getFollowOverlap.
 * @param supplementFollowsWithRealGraphGate decides per user whether supplementing is attempted.
 * @param statsReceiver parent receiver; stats are scoped under "realGraphFollowGraphDataProvider".
 */
class RealGraphFollowGraphDataProvider(
  underlying: FollowGraphDataProvider,
  realGraphClient: KeyValueRepository[Seq[UserId], UserId, CandidateSeq],
  socialGraphClient: SocialGraphClient,
  supplementFollowsWithRealGraphGate: Gate[UserId],
  statsReceiver: StatsReceiver)
    extends FollowGraphDataProvider {

  import RealGraphFollowGraphDataProvider._

  private[this] val scopedStatsReceiver = statsReceiver.scope("realGraphFollowGraphDataProvider")
  private[this] val requestCounter = scopedStatsReceiver.counter("requests")
  private[this] val atMaxCounter = scopedStatsReceiver.counter("followsAtMax")
  private[this] val totalLatencyStat = scopedStatsReceiver.stat("totalLatencyWhenSupplementing")
  private[this] val supplementLatencyStat = scopedStatsReceiver.stat("supplementFollowsLatency")
  private[this] val realGraphResponseSizeStat = scopedStatsReceiver.stat("realGraphFollows")
  private[this] val realGraphEmptyCounter = scopedStatsReceiver.counter("realGraphEmpty")
  private[this] val nonOverlappingSizeStat = scopedStatsReceiver.stat("nonOverlappingFollows")

  private[this] val failOpenHandler = new FailOpenHandler(scopedStatsReceiver)

  /** Same as [[getAsync]], converted to a `Future[FollowGraphData]`. */
  override def get(userId: UserId, maxFollowingCount: Int): Future[FollowGraphData] = {
    getAsync(userId, maxFollowingCount).get()
  }

  /**
   * Fetches the underlying follow graph data and, when the gate is on for `userId`, replaces
   * its followed-user-IDs future with one supplemented from RealGraph.
   */
  override def getAsync(userId: UserId, maxFollowingCount: Int): FollowGraphDataFuture = {
    val startTime = Stopwatch.timeMillis()
    val underlyingResult = underlying.getAsync(userId, maxFollowingCount)
    if (supplementFollowsWithRealGraphGate(userId)) {
      val supplementedFollows = underlyingResult.followedUserIdsFuture.flatMap { sgsFollows =>
        supplementFollowsWithRealGraph(userId, maxFollowingCount, sgsFollows, startTime)
      }
      underlyingResult.copy(followedUserIdsFuture = supplementedFollows)
    } else {
      underlyingResult
    }
  }

  /**
   * Fetches the underlying followings and, when the gate is on for `userId`, supplements them
   * from RealGraph.
   */
  override def getFollowing(userId: UserId, maxFollowingCount: Int): Future[Seq[UserId]] = {
    val startTime = Stopwatch.timeMillis()
    // The underlying request is issued exactly once and reused on both branches; issuing it
    // again inside the gated branch would double the load on the underlying provider.
    val underlyingFollows = underlying.getFollowing(userId, maxFollowingCount)
    if (supplementFollowsWithRealGraphGate(userId)) {
      underlyingFollows.flatMap { sgsFollows =>
        supplementFollowsWithRealGraph(userId, maxFollowingCount, sgsFollows, startTime)
      }
    } else {
      underlyingFollows
    }
  }

  /**
   * Returns `sgsFollows` unchanged unless it has at least `maxFollowingCount` entries; in that
   * case merges in the RealGraph followings confirmed by SGS.
   *
   * @param userId            user whose followings are being supplemented.
   * @param maxFollowingCount the limit that was requested from the underlying provider.
   * @param sgsFollows        followings returned by the underlying provider.
   * @param startTime         `Stopwatch.timeMillis()` value captured when the request started;
   *                          used for the total latency stat.
   * @return the distinct union of `sgsFollows` and the verified RealGraph followings, or
   *         `sgsFollows` when there is nothing to add or the fail-open fallback is used.
   */
  private[this] def supplementFollowsWithRealGraph(
    userId: UserId,
    maxFollowingCount: Int,
    sgsFollows: Seq[Long],
    startTime: Long
  ): Future[Seq[UserId]] = {
    requestCounter.incr()
    if (sgsFollows.size >= maxFollowingCount) {
      atMaxCounter.incr()
      val supplementedFollowsFuture = realGraphClient(Seq(userId))
        .map(_.getOrElse(userId, EmptyRealGraphResponse))
        .map(_.candidates.map(_.userId))
        .flatMap {
          case realGraphFollows if realGraphFollows.nonEmpty =>
            realGraphResponseSizeStat.add(realGraphFollows.size)
            // Filter out "stale" follows from realgraph by checking them against SGS
            val verifiedRealGraphFollows =
              socialGraphClient.getFollowOverlap(userId, realGraphFollows)
            verifiedRealGraphFollows.map { follows =>
              val combinedFollows = (sgsFollows ++ follows).distinct
              val additionalFollows = combinedFollows.size - sgsFollows.size
              if (additionalFollows > 0) nonOverlappingSizeStat.add(additionalFollows)
              combinedFollows
            }
          case _ =>
            realGraphEmptyCounter.incr()
            Future.value(sgsFollows)
        }
        .onSuccess { _ => totalLatencyStat.add(Stopwatch.timeMillis() - startTime) }

      Stat.timeFuture(supplementLatencyStat) {
        // On failure the fallback keeps the followings obtained from the underlying provider.
        failOpenHandler(supplementedFollowsFuture) { _ => Future.value(sgsFollows) }
      }
    } else {
      Future.value(sgsFollows)
    }
  }

  /** Delegates to the underlying provider without any supplementing. */
  override def getMutuallyFollowingUserIds(
    userId: UserId,
    followingIds: Seq[UserId]
  ): Future[Set[UserId]] = {
    underlying.getMutuallyFollowingUserIds(userId, followingIds)
  }
}

View File

@ -1,266 +0,0 @@
package com.twitter.timelineranker.visibility
import com.twitter.finagle.stats.Stat
import com.twitter.finagle.stats.StatsReceiver
import com.twitter.timelineranker.core.FollowGraphData
import com.twitter.timelineranker.core.FollowGraphDataFuture
import com.twitter.timelines.clients.socialgraph.ScopedSocialGraphClientFactory
import com.twitter.timelines.model._
import com.twitter.timelines.util.FailOpenHandler
import com.twitter.timelines.util.stats._
import com.twitter.timelines.visibility._
import com.twitter.util.Future
/**
 * Pre-built empty values (and already-satisfied futures of them) used by
 * [[SgsFollowGraphDataProvider]] for fields that are not fetched or that fail open.
 */
object SgsFollowGraphDataProvider {

  // Empty set of user IDs.
  val EmptyUserIdsSet: Set[UserId] = Set[UserId]()
  val EmptyUserIdsSetFuture: Future[Set[UserId]] =
    Future.value(EmptyUserIdsSet)

  // Empty sequence of user IDs.
  val EmptyUserIdsSeq: Seq[UserId] = Seq[UserId]()
  val EmptyUserIdsSeqFuture: Future[Seq[UserId]] =
    Future.value(EmptyUserIdsSeq)

  // Empty map of visibility profiles.
  val EmptyVisibilityProfiles: Map[UserId, VisibilityProfile] =
    Map[UserId, VisibilityProfile]()
  val EmptyVisibilityProfilesFuture: Future[Map[UserId, VisibilityProfile]] =
    Future.value(EmptyVisibilityProfiles)
}
/**
 * The pieces of follow graph data that can be selected for fetching.
 */
object SgsFollowGraphDataFields extends Enumeration {
  // Declaration order determines each value's id, so it is kept as is.
  val FollowedUserIds: Value = Value
  val MutuallyFollowingUserIds: Value = Value
  val MutedUserIds: Value = Value
  val RetweetsMutedUserIds: Value = Value

  /** The empty selection. */
  val None: ValueSet = ValueSet.empty

  /**
   * Validates a field selection.
   *
   * @param fields the selection to check.
   * @throws IllegalArgumentException if MutuallyFollowingUserIds is selected without
   *                                  FollowedUserIds.
   */
  def throwIfInvalid(fields: SgsFollowGraphDataFields.ValueSet): Unit = {
    val wantsMutualFollows = fields.contains(MutuallyFollowingUserIds)
    val hasFollowedUsers = fields.contains(FollowedUserIds)
    if (wantsMutualFollows && !hasFollowedUsers) {
      throw new IllegalArgumentException(
        "MutuallyFollowingUserIds field requires FollowedUserIds field to be defined."
      )
    }
  }
}
/**
 * Provides information on the follow graph of a given user.
 *
 * Only the pieces of data selected by `fieldsToFetch` are fetched; the other members of the
 * returned data are the pre-built empty futures from the companion object. Construction
 * fails with an IllegalArgumentException when `fieldsToFetch` is rejected by
 * `SgsFollowGraphDataFields.throwIfInvalid`.
 *
 * @param socialGraphClientFactory factory scoped with `scope` to obtain the social graph client.
 * @param visibilityProfileHydratorFactory factory for the hydrator of muted / retweets-muted
 *                                         status.
 * @param fieldsToFetch selection of data to fetch.
 * @param scope request scope used for the client, the hydrator and the stats.
 * @param statsReceiver parent stats receiver.
 */
class SgsFollowGraphDataProvider(
  socialGraphClientFactory: ScopedSocialGraphClientFactory,
  visibilityProfileHydratorFactory: VisibilityProfileHydratorFactory,
  fieldsToFetch: SgsFollowGraphDataFields.ValueSet,
  scope: RequestScope,
  statsReceiver: StatsReceiver)
    extends FollowGraphDataProvider
    with RequestStats {

  SgsFollowGraphDataFields.throwIfInvalid(fieldsToFetch)

  private[this] val stats = scope.stats("followGraphDataProvider", statsReceiver)
  private[this] val scopedStatsReceiver = stats.scopedStatsReceiver

  private[this] val followingScope = scopedStatsReceiver.scope("following")
  private[this] val followingLatencyStat = followingScope.stat(LatencyMs)
  private[this] val followingSizeStat = followingScope.stat(Size)
  private[this] val followingTruncatedCounter = followingScope.counter("numTruncated")

  private[this] val mutuallyFollowingScope = scopedStatsReceiver.scope("mutuallyFollowing")
  private[this] val mutuallyFollowingLatencyStat = mutuallyFollowingScope.stat(LatencyMs)
  private[this] val mutuallyFollowingSizeStat = mutuallyFollowingScope.stat(Size)

  private[this] val visibilityScope = scopedStatsReceiver.scope("visibility")
  private[this] val visibilityLatencyStat = visibilityScope.stat(LatencyMs)
  private[this] val mutedStat = visibilityScope.stat("muted")
  private[this] val retweetsMutedStat = visibilityScope.stat("retweetsMuted")

  private[this] val socialGraphClient = socialGraphClientFactory.scope(scope)
  private[this] val visibilityProfileHydrator =
    createVisibilityProfileHydrator(visibilityProfileHydratorFactory, scope, fieldsToFetch)

  private[this] val failOpenScope = scopedStatsReceiver.scope("failOpen")
  private[this] val mutuallyFollowingHandler =
    new FailOpenHandler(failOpenScope, "mutuallyFollowing")

  // Visibility profiles are needed as soon as either muted field is requested.
  private[this] val obtainVisibilityProfiles = fieldsToFetch.contains(
    SgsFollowGraphDataFields.MutedUserIds
  ) || fieldsToFetch.contains(SgsFollowGraphDataFields.RetweetsMutedUserIds)

  /**
   * Gets follow graph data for the given user.
   *
   * @param userId user whose follow graph details are to be obtained.
   * @param maxFollowingCount Maximum number of followed user IDs to fetch.
   *          If the given user follows more than these many users,
   *          then the most recent maxFollowingCount users are returned.
   */
  def get(
    userId: UserId,
    maxFollowingCount: Int
  ): Future[FollowGraphData] = {
    getAsync(
      userId,
      maxFollowingCount
    ).get()
  }

  /**
   * Builds a [[FollowGraphDataFuture]] whose member futures are populated only for the
   * fields selected in `fieldsToFetch`; mutual follows and visibility profiles are derived
   * from the followed-user-IDs future.
   */
  def getAsync(
    userId: UserId,
    maxFollowingCount: Int
  ): FollowGraphDataFuture = {
    stats.statRequest()

    val followedUserIdsFuture =
      if (fieldsToFetch.contains(SgsFollowGraphDataFields.FollowedUserIds)) {
        getFollowing(userId, maxFollowingCount)
      } else {
        SgsFollowGraphDataProvider.EmptyUserIdsSeqFuture
      }

    val mutuallyFollowingUserIdsFuture =
      if (fieldsToFetch.contains(SgsFollowGraphDataFields.MutuallyFollowingUserIds)) {
        followedUserIdsFuture.flatMap { followedUserIds =>
          getMutuallyFollowingUserIds(userId, followedUserIds)
        }
      } else {
        SgsFollowGraphDataProvider.EmptyUserIdsSetFuture
      }

    val visibilityProfilesFuture = if (obtainVisibilityProfiles) {
      followedUserIdsFuture.flatMap { followedUserIds =>
        getVisibilityProfiles(userId, followedUserIds)
      }
    } else {
      SgsFollowGraphDataProvider.EmptyVisibilityProfilesFuture
    }

    val mutedUserIdsFuture = if (fieldsToFetch.contains(SgsFollowGraphDataFields.MutedUserIds)) {
      getMutedUsers(visibilityProfilesFuture).map { mutedUserIds =>
        mutedStat.add(mutedUserIds.size)
        mutedUserIds
      }
    } else {
      SgsFollowGraphDataProvider.EmptyUserIdsSetFuture
    }

    val retweetsMutedUserIdsFuture =
      if (fieldsToFetch.contains(SgsFollowGraphDataFields.RetweetsMutedUserIds)) {
        getRetweetsMutedUsers(visibilityProfilesFuture).map { retweetsMutedUserIds =>
          retweetsMutedStat.add(retweetsMutedUserIds.size)
          retweetsMutedUserIds
        }
      } else {
        SgsFollowGraphDataProvider.EmptyUserIdsSetFuture
      }

    FollowGraphDataFuture(
      userId,
      followedUserIdsFuture,
      mutuallyFollowingUserIdsFuture,
      mutedUserIdsFuture,
      retweetsMutedUserIdsFuture
    )
  }

  /** Hydrates visibility profiles of `followingIds` for `userId`, recording the latency. */
  private[this] def getVisibilityProfiles(
    userId: UserId,
    followingIds: Seq[UserId]
  ): Future[Map[UserId, VisibilityProfile]] = {
    Stat.timeFuture(visibilityLatencyStat) {
      visibilityProfileHydrator(Some(userId), Future.value(followingIds))
    }
  }

  /**
   * Fetches at most `maxFollowingCount` followed user IDs, recording latency, the size
   * returned by the client, and whether the list had to be truncated.
   */
  def getFollowing(userId: UserId, maxFollowingCount: Int): Future[Seq[UserId]] = {
    Stat.timeFuture(followingLatencyStat) {
      // We fetch 1 more than the limit so that we can decide if we ended up
      // truncating the followings.
      val followingIdsFuture = socialGraphClient.getFollowing(userId, Some(maxFollowingCount + 1))
      followingIdsFuture.map { followingIds =>
        followingSizeStat.add(followingIds.length)
        if (followingIds.length > maxFollowingCount) {
          followingTruncatedCounter.incr()
          followingIds.take(maxFollowingCount)
        } else {
          followingIds
        }
      }
    }
  }

  /**
   * Fetches the follow overlap between `followingIds` and `userId` from the social graph
   * client, recording latency and result size. The fallback passed to the fail-open handler
   * is an empty set.
   */
  def getMutuallyFollowingUserIds(
    userId: UserId,
    followingIds: Seq[UserId]
  ): Future[Set[UserId]] = {
    Stat.timeFuture(mutuallyFollowingLatencyStat) {
      mutuallyFollowingHandler {
        // onSuccess attaches the stat recording to the future that is actually returned,
        // instead of creating a mapped future only to throw it away.
        socialGraphClient
          .getFollowOverlap(followingIds, userId)
          .onSuccess { mutuallyFollowingIds =>
            mutuallyFollowingSizeStat.add(mutuallyFollowingIds.size)
          }
      } { _: Throwable => SgsFollowGraphDataProvider.EmptyUserIdsSetFuture }
    }
  }

  private[this] def getRetweetsMutedUsers(
    visibilityProfilesFuture: Future[Map[UserId, VisibilityProfile]]
  ): Future[Set[UserId]] = {
    // If the hydrator is not able to fetch retweets-muted status, we default to true.
    getUsersMatchingVisibilityPredicate(
      visibilityProfilesFuture,
      (visibilityProfile: VisibilityProfile) => visibilityProfile.areRetweetsMuted.getOrElse(true)
    )
  }

  private[this] def getMutedUsers(
    visibilityProfilesFuture: Future[Map[UserId, VisibilityProfile]]
  ): Future[Set[UserId]] = {
    // If the hydrator is not able to fetch muted status, we default to true.
    getUsersMatchingVisibilityPredicate(
      visibilityProfilesFuture,
      (visibilityProfile: VisibilityProfile) => visibilityProfile.isMuted.getOrElse(true)
    )
  }

  /** Returns the IDs of the users whose visibility profile satisfies `predicate`. */
  private[this] def getUsersMatchingVisibilityPredicate(
    visibilityProfilesFuture: Future[Map[UserId, VisibilityProfile]],
    predicate: (VisibilityProfile => Boolean)
  ): Future[Set[UserId]] = {
    visibilityProfilesFuture.map { visibilityProfiles =>
      visibilityProfiles.collect {
        case (userId, visibilityProfile) if predicate(visibilityProfile) => userId
      }.toSet
    }
  }

  /** Creates a hydrator that requests exactly the muted fields present in `fieldsToFetch`. */
  private[this] def createVisibilityProfileHydrator(
    factory: VisibilityProfileHydratorFactory,
    scope: RequestScope,
    fieldsToFetch: SgsFollowGraphDataFields.ValueSet
  ): VisibilityProfileHydrator = {
    val hydrationProfileRequest = HydrationProfileRequest(
      getMuted = fieldsToFetch.contains(SgsFollowGraphDataFields.MutedUserIds),
      getRetweetsMuted = fieldsToFetch.contains(SgsFollowGraphDataFields.RetweetsMutedUserIds)
    )
    factory(hydrationProfileRequest, scope)
  }
}
/**
 * [[ScopedFactory]] that creates a new [[SgsFollowGraphDataProvider]] for a request scope,
 * passing along the collaborators and field selection it was constructed with.
 */
class ScopedSgsFollowGraphDataProviderFactory(
  socialGraphClientFactory: ScopedSocialGraphClientFactory,
  visibilityProfileHydratorFactory: VisibilityProfileHydratorFactory,
  fieldsToFetch: SgsFollowGraphDataFields.ValueSet,
  statsReceiver: StatsReceiver)
    extends ScopedFactory[SgsFollowGraphDataProvider] {

  /** Builds a provider bound to the given request scope. */
  override def scope(scope: RequestScope): SgsFollowGraphDataProvider = {
    val scopedProvider = new SgsFollowGraphDataProvider(
      socialGraphClientFactory,
      visibilityProfileHydratorFactory,
      fieldsToFetch,
      scope,
      statsReceiver
    )
    scopedProvider
  }
}

View File

@ -1,8 +0,0 @@
# Aggregate target "earlybird_ranking": depends on the common, model_evaluation and
# training_data_generation targets of this package.
target(
name = "earlybird_ranking",
dependencies = [
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/ad_hoc/earlybird_ranking/model_evaluation",
"timelines/data_processing/ad_hoc/earlybird_ranking/training_data_generation",
],
)

View File

@ -1,24 +0,0 @@
# Scala library "common": built from every *.scala file in this directory for the
# "java8" platform.
scala_library(
name = "common",
sources = ["*.scala"],
platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
dependencies = [
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/ml/api/transform",
"src/java/com/twitter/search/modeling/tweet_ranking",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/itl",
"src/scala/com/twitter/timelines/prediction/features/real_graph",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/scala/com/twitter/timelines/prediction/features/request_context",
"src/scala/com/twitter/timelines/prediction/features/time_features",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:transform-java",
],
)

View File

@ -1,271 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.ITransform
import com.twitter.ml.api.transform.CascadeTransform
import com.twitter.ml.api.transform.TransformFactory
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.search.common.features.SearchResultFeature
import com.twitter.search.common.features.ExternalTweetFeature
import com.twitter.search.common.features.TweetFeature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import com.twitter.timelines.prediction.features.request_context.RequestContextFeatures
import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import com.twitter.timelines.prediction.features.real_graph.RealGraphDataRecordFeatures
import scala.collection.JavaConverters._
import java.lang.{Boolean => JBoolean}
/**
* Describes one label used in training.
*
* @param name label name.
* @param downsampleFraction sampling rate configured for examples of this label
* (presumably the fraction of examples kept; confirm against the consumer).
* @param importance weight assigned to this label.
*/
case class LabelInfo(name: String, downsampleFraction: Double, importance: Double)
/**
* Pairs a [[LabelInfo]] with the binary feature that carries the label.
*/
case class LabelInfoWithFeature(info: LabelInfo, feature: Feature[JBoolean])
trait EarlybirdTrainingConfiguration {
protected def labels: Map[String, Feature.Binary]
// Importance of each label, keyed by label name. The key set is expected to equal the key
// set of `labels`.
protected def weights: Map[String, Double] =
  Map[String, Double](
    ("detail_expanded", 0.3),
    ("favorited", 1.0),
    ("open_linked", 0.1),
    ("photo_expanded", 0.03),
    ("profile_clicked", 1.0),
    ("replied", 9.0),
    ("retweeted", 1.0),
    ("video_playback50", 0.01)
  )
// Positive examples are precious, so they are not downsampled at all (rate 1.0).
// Label importances are currently set to match the full model's weights.
protected def PositiveSamplingRate: Double = 1.0

// Negative examples are sampled at 8% of the positive sampling rate.
private def NegativeSamplingRate: Double = 0.08 * PositiveSamplingRate
// One LabelInfoWithFeature per configured label, built on first access. Asserts that
// `labels` and `weights` define exactly the same label names.
final lazy val LabelInfos: List[LabelInfoWithFeature] = {
  assert(labels.keySet == weights.keySet)
  val labelInfos =
    for (labelName <- labels.keySet) yield makeLabelInfoWithFeature(labelName)
  labelInfos.toList
}
// Builds the LabelInfoWithFeature for `labelName`: the label is sampled at
// PositiveSamplingRate, its importance comes from `weights` and its feature from `labels`.
// Both lookups use Map.apply, so an unknown label name makes the call throw.
def makeLabelInfoWithFeature(labelName: String): LabelInfoWithFeature = {
  val labelInfo = LabelInfo(
    name = labelName,
    downsampleFraction = PositiveSamplingRate,
    importance = weights(labelName)
  )
  val labelFeature = labels(labelName)
  LabelInfoWithFeature(labelInfo, labelFeature)
}
// Label info for negative examples: named "negative", sampled at NegativeSamplingRate,
// with importance 1.0.
final lazy val NegativeInfo: LabelInfo =
  LabelInfo(
    name = "negative",
    downsampleFraction = NegativeSamplingRate,
    importance = 1.0
  )
// Maps timelines data-record features to the corresponding search result (TweetFeature)
// features.
// example of features available in schema based namespace:
protected def featureToSearchResultFeatureMap: Map[Feature[_], SearchResultFeature] = Map(
  RecapFeatures.TEXT_SCORE -> TweetFeature.TEXT_SCORE,
  RecapFeatures.REPLY_COUNT -> TweetFeature.REPLY_COUNT,
  RecapFeatures.RETWEET_COUNT -> TweetFeature.RETWEET_COUNT,
  RecapFeatures.FAV_COUNT -> TweetFeature.FAVORITE_COUNT,
  RecapFeatures.HAS_CARD -> TweetFeature.HAS_CARD_FLAG,
  RecapFeatures.HAS_CONSUMER_VIDEO -> TweetFeature.HAS_CONSUMER_VIDEO_FLAG,
  RecapFeatures.HAS_PRO_VIDEO -> TweetFeature.HAS_PRO_VIDEO_FLAG,
  // no corresponding HAS_NATIVE_VIDEO feature in TweetFeature
  RecapFeatures.HAS_VINE -> TweetFeature.HAS_VINE_FLAG,
  RecapFeatures.HAS_PERISCOPE -> TweetFeature.HAS_PERISCOPE_FLAG,
  RecapFeatures.HAS_NATIVE_IMAGE -> TweetFeature.HAS_NATIVE_IMAGE_FLAG,
  RecapFeatures.HAS_IMAGE -> TweetFeature.HAS_IMAGE_URL_FLAG,
  RecapFeatures.HAS_NEWS -> TweetFeature.HAS_NEWS_URL_FLAG,
  RecapFeatures.HAS_VIDEO -> TweetFeature.HAS_VIDEO_URL_FLAG,
  RecapFeatures.HAS_TREND -> TweetFeature.HAS_TREND_FLAG,
  RecapFeatures.HAS_MULTIPLE_HASHTAGS_OR_TRENDS -> TweetFeature.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
  RecapFeatures.IS_OFFENSIVE -> TweetFeature.IS_OFFENSIVE_FLAG,
  RecapFeatures.IS_REPLY -> TweetFeature.IS_REPLY_FLAG,
  RecapFeatures.IS_RETWEET -> TweetFeature.IS_RETWEET_FLAG,
  RecapFeatures.IS_AUTHOR_BOT -> TweetFeature.IS_USER_BOT_FLAG,
  RecapFeatures.FROM_VERIFIED_ACCOUNT -> TweetFeature.FROM_VERIFIED_ACCOUNT_FLAG,
  RecapFeatures.USER_REP -> TweetFeature.USER_REPUTATION,
  RecapFeatures.EMBEDS_IMPRESSION_COUNT -> TweetFeature.EMBEDS_IMPRESSION_COUNT,
  RecapFeatures.EMBEDS_URL_COUNT -> TweetFeature.EMBEDS_URL_COUNT,
  // RecapFeatures.VIDEO_VIEW_COUNT deprecated
  RecapFeatures.FAV_COUNT_V2 -> TweetFeature.FAVORITE_COUNT_V2,
  RecapFeatures.RETWEET_COUNT_V2 -> TweetFeature.RETWEET_COUNT_V2,
  RecapFeatures.REPLY_COUNT_V2 -> TweetFeature.REPLY_COUNT_V2,
  RecapFeatures.IS_SENSITIVE -> TweetFeature.IS_SENSITIVE_CONTENT,
  RecapFeatures.HAS_MULTIPLE_MEDIA -> TweetFeature.HAS_MULTIPLE_MEDIA_FLAG,
  RecapFeatures.IS_AUTHOR_PROFILE_EGG -> TweetFeature.PROFILE_IS_EGG_FLAG,
  RecapFeatures.IS_AUTHOR_NEW -> TweetFeature.IS_USER_NEW_FLAG,
  RecapFeatures.NUM_MENTIONS -> TweetFeature.NUM_MENTIONS,
  RecapFeatures.NUM_HASHTAGS -> TweetFeature.NUM_HASHTAGS,
  RecapFeatures.HAS_VISIBLE_LINK -> TweetFeature.HAS_VISIBLE_LINK_FLAG,
  RecapFeatures.HAS_LINK -> TweetFeature.HAS_LINK_FLAG,
  //note: DISCRETE features are not supported by the modelInterpreter tool.
  // for the following features, we will create separate CONTINUOUS features instead of renaming
  //RecapFeatures.LINK_LANGUAGE
  //RecapFeatures.LANGUAGE
  TimelinesSharedFeatures.HAS_QUOTE -> TweetFeature.HAS_QUOTE_FLAG,
  TimelinesSharedFeatures.QUOTE_COUNT -> TweetFeature.QUOTE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_FAV_COUNT -> TweetFeature.WEIGHTED_FAVORITE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_QUOTE_COUNT -> TweetFeature.WEIGHTED_QUOTE_COUNT,
  TimelinesSharedFeatures.WEIGHTED_REPLY_COUNT -> TweetFeature.WEIGHTED_REPLY_COUNT,
  TimelinesSharedFeatures.WEIGHTED_RETWEET_COUNT -> TweetFeature.WEIGHTED_RETWEET_COUNT,
  TimelinesSharedFeatures.DECAYED_FAVORITE_COUNT -> TweetFeature.DECAYED_FAVORITE_COUNT,
  TimelinesSharedFeatures.DECAYED_RETWEET_COUNT -> TweetFeature.DECAYED_RETWEET_COUNT,
  // The decayed reply count maps to its own search feature; it previously pointed at
  // DECAYED_RETWEET_COUNT, duplicating the entry above.
  // NOTE(review): assumes TweetFeature.DECAYED_REPLY_COUNT exists alongside the other
  // DECAYED_* constants; confirm against TweetFeature.
  TimelinesSharedFeatures.DECAYED_REPLY_COUNT -> TweetFeature.DECAYED_REPLY_COUNT,
  TimelinesSharedFeatures.DECAYED_QUOTE_COUNT -> TweetFeature.DECAYED_QUOTE_COUNT,
  TimelinesSharedFeatures.FAKE_FAVORITE_COUNT -> TweetFeature.FAKE_FAVORITE_COUNT,
  TimelinesSharedFeatures.FAKE_RETWEET_COUNT -> TweetFeature.FAKE_RETWEET_COUNT,
  TimelinesSharedFeatures.FAKE_REPLY_COUNT -> TweetFeature.FAKE_REPLY_COUNT,
  TimelinesSharedFeatures.FAKE_QUOTE_COUNT -> TweetFeature.FAKE_QUOTE_COUNT,
  TimelinesSharedFeatures.EMBEDS_IMPRESSION_COUNT_V2 -> TweetFeature.EMBEDS_IMPRESSION_COUNT_V2,
  TimelinesSharedFeatures.EMBEDS_URL_COUNT_V2 -> TweetFeature.EMBEDS_URL_COUNT_V2,
  TimelinesSharedFeatures.LABEL_ABUSIVE_FLAG -> TweetFeature.LABEL_ABUSIVE_FLAG,
  TimelinesSharedFeatures.LABEL_ABUSIVE_HI_RCL_FLAG -> TweetFeature.LABEL_ABUSIVE_HI_RCL_FLAG,
  TimelinesSharedFeatures.LABEL_DUP_CONTENT_FLAG -> TweetFeature.LABEL_DUP_CONTENT_FLAG,
  TimelinesSharedFeatures.LABEL_NSFW_HI_PRC_FLAG -> TweetFeature.LABEL_NSFW_HI_PRC_FLAG,
  TimelinesSharedFeatures.LABEL_NSFW_HI_RCL_FLAG -> TweetFeature.LABEL_NSFW_HI_RCL_FLAG,
  TimelinesSharedFeatures.LABEL_SPAM_FLAG -> TweetFeature.LABEL_SPAM_FLAG,
  TimelinesSharedFeatures.LABEL_SPAM_HI_RCL_FLAG -> TweetFeature.LABEL_SPAM_HI_RCL_FLAG
)
// Creates a transform that adds derived features to each record: the author-specific score
// copied from the RealGraph weight, language-difference flags, a self-tweet flag, the tweet
// age, and CONTINUOUS copies of the DISCRETE link-language and language features.
protected def derivedFeaturesAdder: ITransform =
  new ITransform {
    // Build a feature from a search result feature and narrow it to the expected kind.
    private def binaryFeature(source: SearchResultFeature): Feature.Binary =
      featureInstanceFromSearchResultFeature(source).asInstanceOf[Feature.Binary]

    private def continuousFeature(source: SearchResultFeature): Feature.Continuous =
      featureInstanceFromSearchResultFeature(source).asInstanceOf[Feature.Continuous]

    private val hasEnglishTweetDiffUiLangFeature =
      binaryFeature(ExternalTweetFeature.HAS_ENGLISH_TWEET_DIFF_UI_LANG)
    private val hasEnglishUiDiffTweetLangFeature =
      binaryFeature(ExternalTweetFeature.HAS_ENGLISH_UI_DIFF_TWEET_LANG)
    private val hasDiffLangFeature =
      binaryFeature(ExternalTweetFeature.HAS_DIFF_LANG)
    private val isSelfTweetFeature =
      binaryFeature(ExternalTweetFeature.IS_SELF_TWEET)
    private val tweetAgeInSecsFeature =
      continuousFeature(ExternalTweetFeature.TWEET_AGE_IN_SECS)
    private val authorSpecificScoreFeature =
      continuousFeature(ExternalTweetFeature.AUTHOR_SPECIFIC_SCORE)

    // DISCRETE features are exposed as separate CONTINUOUS features instead of being renamed.
    private val linkLanguageFeature = new Feature.Continuous(TweetFeature.LINK_LANGUAGE.getName)
    private val languageFeature = new Feature.Continuous(TweetFeature.LANGUAGE.getName)

    // Registers every derived feature in the feature context.
    override def transformContext(featureContext: FeatureContext): FeatureContext =
      featureContext.addFeatures(
        authorSpecificScoreFeature,
        // used when training against the full score (cf. EarlybirdModelEvaluationJob.scala)
        // TimelinesSharedFeatures.PREDICTED_SCORE_LOG,
        hasEnglishTweetDiffUiLangFeature,
        hasEnglishUiDiffTweetLangFeature,
        hasDiffLangFeature,
        isSelfTweetFeature,
        tweetAgeInSecsFeature,
        linkLanguageFeature,
        languageFeature
      )

    // Writes the derived feature values into the record; each value is set only when the
    // features it is computed from are present.
    override def transform(record: DataRecord): Unit = {
      val richRecord = SRichDataRecord(record)

      richRecord.getFeatureValueOpt(RealGraphDataRecordFeatures.WEIGHT).foreach { weight =>
        richRecord.setFeatureValue(authorSpecificScoreFeature, weight)
      }

      // use this when training against the log of the full score
      // srecord.getFeatureValueOpt(TimelinesSharedFeatures.PREDICTED_SCORE).map { score =>
      //   if (score > 0.0) {
      //     srecord.setFeatureValue(TimelinesSharedFeatures.PREDICTED_SCORE_LOG, Math.log(score))
      //   }
      // }

      val hasUiAndTweetLanguage =
        richRecord.hasFeature(RequestContextFeatures.LANGUAGE_CODE) &&
          richRecord.hasFeature(RecapFeatures.LANGUAGE)
      if (hasUiAndTweetLanguage) {
        val uiLangIsEnglish =
          richRecord.getFeatureValue(RequestContextFeatures.LANGUAGE_CODE).toString == "en"
        val tweetIsEnglish = richRecord.getFeatureValue(RecapFeatures.LANGUAGE) == 5
        richRecord.setFeatureValue(
          hasEnglishTweetDiffUiLangFeature,
          tweetIsEnglish && !uiLangIsEnglish
        )
        richRecord.setFeatureValue(
          hasEnglishUiDiffTweetLangFeature,
          uiLangIsEnglish && !tweetIsEnglish
        )
      }

      richRecord.getFeatureValueOpt(RecapFeatures.MATCH_UI_LANG).foreach { matchUiLang =>
        richRecord.setFeatureValue(hasDiffLangFeature, !matchUiLang)
      }

      richRecord.getFeatureValueOpt(SharedFeatures.AUTHOR_ID).foreach { authorId =>
        richRecord.getFeatureValueOpt(SharedFeatures.USER_ID).foreach { userId =>
          richRecord.setFeatureValue(isSelfTweetFeature, authorId == userId)
        }
      }

      richRecord.getFeatureValueOpt(TimeDataRecordFeatures.TIME_SINCE_TWEET_CREATION).foreach {
        timeSinceTweetCreation =>
          richRecord.setFeatureValue(tweetAgeInSecsFeature, timeSinceTweetCreation / 1000.0)
      }

      richRecord.getFeatureValueOpt(RecapFeatures.LINK_LANGUAGE).foreach { linkLanguage =>
        richRecord.setFeatureValue(linkLanguageFeature, linkLanguage.toDouble)
      }

      richRecord.getFeatureValueOpt(RecapFeatures.LANGUAGE).foreach { language =>
        richRecord.setFeatureValue(languageFeature, language.toDouble)
      }
    }
  }
/**
 * Builds the ML-API feature that mirrors an Earlybird search result feature.
 *
 * Only non-discrete features whose thrift type is BOOLEAN_VALUE, DOUBLE_VALUE or
 * INT32_VALUE are accepted; anything else fails the `require` check. Boolean features
 * become a `Feature.Binary`, the numeric ones a `Feature.Continuous`. The created feature
 * carries the search feature's name.
 *
 * @param tweetFeature the search result feature to mirror
 * @return a binary or continuous feature named after `tweetFeature`
 */
protected def featureInstanceFromSearchResultFeature(
  tweetFeature: SearchResultFeature
): Feature[_] = {
  import com.twitter.search.common.features.thrift.ThriftSearchFeatureType

  val featureType = tweetFeature.getType
  val featureName = tweetFeature.getName

  val isBoolean = featureType == ThriftSearchFeatureType.BOOLEAN_VALUE
  val isNumeric =
    featureType == ThriftSearchFeatureType.DOUBLE_VALUE ||
      featureType == ThriftSearchFeatureType.INT32_VALUE

  require(!tweetFeature.isDiscrete && (isBoolean || isNumeric))

  if (isBoolean) new Feature.Binary(featureName)
  else new Feature.Continuous(featureName)
}
/**
 * Two-stage transform: first `derivedFeaturesAdder`, then a rename transform that maps
 * every key of `featureToSearchResultFeatureMap` to the feature produced by
 * `featureInstanceFromSearchResultFeature` for the associated search result feature.
 */
lazy val EarlybirdFeatureRenamer: ITransform = {
  val renames: Map[Feature[_], Feature[_]] =
    featureToSearchResultFeatureMap.map {
      case (sourceFeature, searchFeature) =>
        sourceFeature -> featureInstanceFromSearchResultFeature(searchFeature)
    }.toMap

  // Stages are evaluated in the order they run: derived features first, renames second.
  val derivedStage = derivedFeaturesAdder
  val renameSpec = TransformFactory.produceFeatureRenameTransformSpec(renames.asJava)
  val renameStage = TransformFactory.produceTransform(renameSpec)

  new CascadeTransform(List(derivedStage, renameStage).asJava)
}
}

View File

@ -1,17 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.Feature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
/**
 * Earlybird training configuration for Recap data.
 *
 * Associates each engagement label name with the binary `RecapFeatures` feature that
 * marks it on a record.
 */
class EarlybirdTrainingRecapConfiguration extends EarlybirdTrainingConfiguration {

  override val labels: Map[String, Feature.Binary] = Map(
    ("detail_expanded", RecapFeatures.IS_CLICKED),
    ("favorited", RecapFeatures.IS_FAVORITED),
    ("open_linked", RecapFeatures.IS_OPEN_LINKED),
    ("photo_expanded", RecapFeatures.IS_PHOTO_EXPANDED),
    ("profile_clicked", RecapFeatures.IS_PROFILE_CLICKED),
    ("replied", RecapFeatures.IS_REPLIED),
    ("retweeted", RecapFeatures.IS_RETWEETED),
    ("video_playback50", RecapFeatures.IS_VIDEO_PLAYBACK_50)
  )
}

View File

@ -1,100 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.ITransform
import com.twitter.ml.api.transform.CascadeTransform
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.search.common.features.SearchResultFeature
import com.twitter.search.common.features.TweetFeature
import com.twitter.timelines.prediction.features.itl.ITLFeatures._
import scala.collection.JavaConverters._
/**
 * Earlybird training configuration for rectweet data, expressed with `ITLFeatures`.
 *
 * Compared with the base configuration it supplies its own label features, a positive
 * sampling rate of 0.5, additional feature-to-search-feature mappings, and one extra
 * derived feature (the link language).
 */
class EarlybirdTrainingRectweetConfiguration extends EarlybirdTrainingConfiguration {

  /** Engagement label name -> binary ITL label feature. */
  override val labels: Map[String, Feature.Binary] = Map(
    ("detail_expanded", IS_CLICKED),
    ("favorited", IS_FAVORITED),
    ("open_linked", IS_OPEN_LINKED),
    ("photo_expanded", IS_PHOTO_EXPANDED),
    ("profile_clicked", IS_PROFILE_CLICKED),
    ("replied", IS_REPLIED),
    ("retweeted", IS_RETWEETED),
    ("video_playback50", IS_VIDEO_PLAYBACK_50)
  )

  override val PositiveSamplingRate: Double = 0.5

  /** The inherited mappings extended with the ITL features listed below. */
  override def featureToSearchResultFeatureMap: Map[Feature[_], SearchResultFeature] =
    super.featureToSearchResultFeatureMap ++ Map(
      TEXT_SCORE -> TweetFeature.TEXT_SCORE,
      REPLY_COUNT -> TweetFeature.REPLY_COUNT,
      RETWEET_COUNT -> TweetFeature.RETWEET_COUNT,
      FAV_COUNT -> TweetFeature.FAVORITE_COUNT,
      HAS_CARD -> TweetFeature.HAS_CARD_FLAG,
      HAS_CONSUMER_VIDEO -> TweetFeature.HAS_CONSUMER_VIDEO_FLAG,
      HAS_PRO_VIDEO -> TweetFeature.HAS_PRO_VIDEO_FLAG,
      HAS_VINE -> TweetFeature.HAS_VINE_FLAG,
      HAS_PERISCOPE -> TweetFeature.HAS_PERISCOPE_FLAG,
      HAS_NATIVE_IMAGE -> TweetFeature.HAS_NATIVE_IMAGE_FLAG,
      HAS_IMAGE -> TweetFeature.HAS_IMAGE_URL_FLAG,
      HAS_NEWS -> TweetFeature.HAS_NEWS_URL_FLAG,
      HAS_VIDEO -> TweetFeature.HAS_VIDEO_URL_FLAG,
      // some features that exist for recap are not available in rectweet
      // HAS_TREND
      // HAS_MULTIPLE_HASHTAGS_OR_TRENDS
      // IS_OFFENSIVE
      // IS_REPLY
      // IS_RETWEET
      IS_AUTHOR_BOT -> TweetFeature.IS_USER_BOT_FLAG,
      IS_AUTHOR_SPAM -> TweetFeature.IS_USER_SPAM_FLAG,
      IS_AUTHOR_NSFW -> TweetFeature.IS_USER_NSFW_FLAG,
      // FROM_VERIFIED_ACCOUNT
      USER_REP -> TweetFeature.USER_REPUTATION,
      // EMBEDS_IMPRESSION_COUNT
      // EMBEDS_URL_COUNT
      // VIDEO_VIEW_COUNT
      FAV_COUNT_V2 -> TweetFeature.FAVORITE_COUNT_V2,
      RETWEET_COUNT_V2 -> TweetFeature.RETWEET_COUNT_V2,
      REPLY_COUNT_V2 -> TweetFeature.REPLY_COUNT_V2,
      IS_SENSITIVE -> TweetFeature.IS_SENSITIVE_CONTENT,
      HAS_MULTIPLE_MEDIA -> TweetFeature.HAS_MULTIPLE_MEDIA_FLAG,
      IS_AUTHOR_PROFILE_EGG -> TweetFeature.PROFILE_IS_EGG_FLAG,
      IS_AUTHOR_NEW -> TweetFeature.IS_USER_NEW_FLAG,
      NUM_MENTIONS -> TweetFeature.NUM_MENTIONS,
      NUM_HASHTAGS -> TweetFeature.NUM_HASHTAGS,
      HAS_VISIBLE_LINK -> TweetFeature.HAS_VISIBLE_LINK_FLAG,
      HAS_LINK -> TweetFeature.HAS_LINK_FLAG
    )

  /**
   * The inherited derived-feature transform followed by a transform that copies the ITL
   * `LINK_LANGUAGE` value (as a double) into a continuous feature named after
   * `TweetFeature.LINK_LANGUAGE`.
   */
  override def derivedFeaturesAdder: CascadeTransform = {
    // only LINK_LANGUAGE is available in rectweet; there is no LANGUAGE feature
    val linkLanguageTransform = new ITransform {
      private val linkLanguageFeature =
        new Feature.Continuous(TweetFeature.LINK_LANGUAGE.getName)

      override def transformContext(featureContext: FeatureContext): FeatureContext =
        featureContext.addFeatures(linkLanguageFeature)

      override def transform(record: DataRecord): Unit = {
        val richRecord = SRichDataRecord(record)
        // Records without a link language are left untouched.
        richRecord.getFeatureValueOpt(LINK_LANGUAGE).foreach { linkLanguage =>
          richRecord.setFeatureValue(linkLanguageFeature, linkLanguage.toDouble)
        }
      }
    }

    new CascadeTransform(List(super.derivedFeaturesAdder, linkLanguageTransform).asJava)
  }
}

View File

@ -1,36 +0,0 @@
# Library target built from every *.scala source in this directory.
scala_library(
name = "model_evaluation",
sources = ["*.scala"],
platform = "java8",
# NOTE(review): strict dependency checking is switched off for this target only.
strict_deps = False,
dependencies = [
"3rdparty/src/jvm/com/twitter/scalding:json",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/prediction_engine",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/timelines/prediction/adapters/recap",
"src/scala/com/twitter/timelines/prediction/features/recap",
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/util:rich-request",
"timelines/data_processing/util/example",
"timelines/data_processing/util/execution",
"twadoop_config/configuration/log_categories/group/timelines:timelineservice_injection_request_log-scala",
],
)
# Hadoop binary wrapping the library above; its entry point is EarlybirdModelEvaluationJob.
hadoop_binary(
name = "bin",
basename = "earlybird_model_evaluation-deploy",
main = "com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation.EarlybirdModelEvaluationJob",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":model_evaluation",
],
)

View File

@ -1,203 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation
import scala.collection.GenTraversableOnce
/**
 * One candidate tweet of a request together with the scores being compared.
 *
 * @param tweetId    id of the candidate tweet
 * @param fullScore  score assigned by the "full" model
 * @param earlyScore score assigned by the "light" (Earlybird) scorer under evaluation
 * @param served     whether the tweet is among the tweets served for the request
 */
case class CandidateRecord(
  tweetId: Long,
  fullScore: Double,
  earlyScore: Double,
  served: Boolean)
/**
 * A metric that compares scores generated by a "full" prediction
 * model to a "light" (Earlybird) model. The metric is calculated for candidates
 * from a single request.
 *
 * The trait is sealed, so all metrics are defined in this file.
 */
sealed trait EarlybirdEvaluationMetric {
/** Name identifying this metric. */
def name: String
/**
 * Computes the metric for one request.
 *
 * @param candidates the scored candidates of a single request
 * @return the metric value, or `None` when the metric yields no value for this request
 */
def apply(candidates: Seq[CandidateRecord]): Option[Double]
}
/**
 * Recall of the light model's top `k` candidates within the full model's top `k`.
 *
 * Both rankings are by descending score; the result is the number of tweet ids present in
 * both top-`k` lists divided by `k`.
 *
 * Requests with `k` or fewer candidates are either dropped (`filterFewerThanK = true`,
 * result `None`) or counted as a trivial recall of 1.0 (`filterFewerThanK = false`).
 *
 * @param k                size of the top lists being compared
 * @param filterFewerThanK whether requests with at most `k` candidates yield no value
 */
case class TopKRecall(k: Int, filterFewerThanK: Boolean) extends EarlybirdEvaluationMetric {

  override val name: String = {
    val suffix = if (filterFewerThanK) "_filtered" else ""
    s"top_${k}_recall$suffix"
  }

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val topFullIds = candidates.sortBy(-_.fullScore).take(k).map(_.tweetId)
      val topLightIds = candidates.sortBy(-_.earlyScore).take(k).map(_.tweetId)
      val sharedCount = topFullIds.intersect(topLightIds).size.toDouble
      Some(sharedCount / k.toDouble)
    } else if (filterFewerThanK) {
      None
    } else {
      Some(1.0)
    }
}
/**
 * Fraction of ordered candidate pairs that the full and Earlybird models rank the same way.
 *
 * Every pair of non-equal candidates is visited in both directions. A pair counts as
 * agreeing when "left scores strictly higher than right" has the same truth value under
 * both models. As a consequence a pair that is tied under exactly one model agrees in one
 * direction only, while a pair ordered identically by both models agrees in both. A
 * constant model therefore scores 0.5, roughly what a random model gets.
 *
 * Requests with fewer than two candidates yield `None`.
 */
case object ProbabilityOfCorrectOrdering extends EarlybirdEvaluationMetric {

  /**
   * Share of the elements of `trav` that satisfy `p`; 0.0 when `trav` is empty.
   */
  def fractionOf[A](trav: GenTraversableOnce[A])(p: A => Boolean): Double =
    if (trav.isEmpty) {
      0.0
    } else {
      // Single pass that counts matches and elements together.
      val (matched, seen) = trav.foldLeft((0, 0)) { (counts, elem) =>
        val (matchedSoFar, seenSoFar) = counts
        (if (p(elem)) matchedSoFar + 1 else matchedSoFar, seenSoFar + 1)
      }
      matched.toDouble / seen
    }

  override def name: String = "probability_of_correct_ordering"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size < 2) {
      None
    } else {
      // Lazily enumerates (left, right) for all candidates that are not equal to each other.
      val orderedPairs = candidates.iterator.flatMap { left =>
        candidates.iterator
          .withFilter(right => left != right)
          .map(right => (left, right))
      }
      val agreement = fractionOf(orderedPairs) {
        case (left, right) =>
          val fullPrefersLeft = left.fullScore > right.fullScore
          val lightPrefersLeft = left.earlyScore > right.earlyScore
          fullPrefersLeft == lightPrefersLeft
      }
      Some(agreement)
    }
}
/**
 * Like `TopKRecall`, but the list size is derived from the request:
 * `k = floor(candidates.size * percent)`.
 *
 * `percent` is multiplied with the candidate count as is, i.e. it is used as a fraction.
 * Requests for which `k` comes out as 0 (or less) yield `None`.
 */
case class TopNPercentRecall(percent: Double) extends EarlybirdEvaluationMetric {

  override val name: String = s"top_${percent}_pct_recall"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val k = Math.floor(candidates.size * percent).toInt
    if (k <= 0) {
      None
    } else {
      val topFullIds = candidates.sortBy(-_.fullScore).take(k).map(_.tweetId)
      val topLightIds = candidates.sortBy(-_.earlyScore).take(k).map(_.tweetId)
      val sharedCount = topFullIds.intersect(topLightIds).size.toDouble
      Some(sharedCount / k.toDouble)
    }
  }
}
/**
 * Share of the served candidates that appear among the `k` candidates with the highest
 * light scores.
 *
 * Yields `None` when the request has `k` or fewer candidates, or when no candidate was
 * served.
 */
case class ShownTweetRecall(k: Int) extends EarlybirdEvaluationMetric {

  override val name: String = s"shown_tweet_recall_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val servedInTopLight =
        candidates.sortBy(-_.earlyScore).take(k).count(_.served).toDouble
      val servedOverall = candidates.count(_.served).toDouble
      if (servedOverall > 0) Some(servedInTopLight / servedOverall) else None
    } else {
      None
    }
}
/**
 * Like `ShownTweetRecall`, but with `k = floor(candidates.size * percent)`.
 *
 * `percent` is multiplied with the candidate count as is, i.e. it is used as a fraction.
 * There is no minimum on `k`: when it is 0 and something was served, the value is 0.0.
 * Yields `None` only when no candidate was served.
 */
case class ShownTweetPercentRecall(percent: Double) extends EarlybirdEvaluationMetric {

  override val name: String = s"shown_tweet_recall_${percent}_pct"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val k = Math.floor(candidates.size * percent).toInt
    val servedInTopLight =
      candidates.sortBy(-_.earlyScore).take(k).count(_.served).toDouble
    val servedOverall = candidates.count(_.served).toDouble
    if (servedOverall > 0) Some(servedInTopLight / servedOverall) else None
  }
}
/**
 * Like `ShownTweetRecall`, but the top `k` candidates are chosen by *full* score.
 *
 * This is a sanity metric: by definition the top full-scored candidates will be served,
 * so a value below 1 is due to the ranked section being smaller than `k`.
 *
 * Yields `None` when the request has `k` or fewer candidates, or when no candidate was
 * served.
 */
case class ShownTweetRecallWithFullScores(k: Int) extends EarlybirdEvaluationMetric {

  override val name: String = s"shown_tweet_recall_with_full_scores_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val servedInTopFull =
        candidates.sortBy(-_.fullScore).take(k).count(_.served).toDouble
      val servedOverall = candidates.count(_.served).toDouble
      if (servedOverall > 0) Some(servedInTopFull / servedOverall) else None
    } else {
      None
    }
}
/**
 * Mean full score of the `k` candidates with the highest light scores.
 *
 * Yields `None` when the request has `k` or fewer candidates.
 */
case class AverageFullScoreForTopLight(k: Int) extends EarlybirdEvaluationMetric {

  override val name: String = s"average_full_score_for_top_light_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] =
    if (candidates.size > k) {
      val topLight = candidates.sortBy(-_.earlyScore).take(k)
      val fullScoreTotal = topLight.map(_.fullScore).sum
      Some(fullScoreTotal / topLight.size)
    } else {
      None
    }
}
/**
 * Picks the set of `k` top candidates using the light scores, and calculates
 * sum of full scores for those. Divides that by sum of `k` top full scores,
 * overall, to get a "score recall".
 *
 * Yields `None` when the request has `k` or fewer candidates, and also when the sum of
 * the top `k` full scores is zero: the ratio is undefined in that case, and emitting
 * NaN or Infinity would corrupt the average taken over all requests.
 */
case class SumScoreRecallForTopLight(k: Int) extends EarlybirdEvaluationMetric {

  override val name: String = s"sum_score_recall_for_top_light_$k"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    if (candidates.size <= k) {
      None
    } else {
      val sumFullScoresForTopLight =
        candidates.sortBy(-_.earlyScore).take(k).map(_.fullScore).sum
      val sumScoresForTopFull =
        candidates.sortBy(-_.fullScore).take(k).map(_.fullScore).sum
      // Guard the division: x / 0.0 is NaN or +/-Infinity for doubles, never an exception.
      if (sumScoresForTopFull == 0.0) None
      else Some(sumFullScoresForTopLight / sumScoresForTopFull)
    }
  }
}
/**
 * Indicator metric: 1.0 when the request has `k` or fewer candidates, 0.0 otherwise.
 * Always yields a value.
 */
case class HasFewerThanKCandidates(k: Int) extends EarlybirdEvaluationMetric {

  override val name: String = s"has_fewer_than_${k}_candidates"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val indicator = if (candidates.size <= k) 1.0 else 0.0
    Some(indicator)
  }
}
/**
 * Reports the number of candidates of the request. Always yields a value.
 */
case object NumberOfCandidates extends EarlybirdEvaluationMetric {

  override val name: String = "number_of_candidates"

  override def apply(candidates: Seq[CandidateRecord]): Option[Double] = {
    val count = candidates.size
    Some(count.toDouble)
  }
}

View File

@ -1,214 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.model_evaluation
import com.twitter.algebird.Aggregator
import com.twitter.algebird.AveragedValue
import com.twitter.ml.api.prediction_engine.PredictionEnginePlugin
import com.twitter.ml.api.util.FDsl
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.IRecordOneToManyAdapter
import com.twitter.scalding.Args
import com.twitter.scalding.DateRange
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedJson
import com.twitter.scalding.TypedPipe
import com.twitter.scalding_internal.dalv2.DAL
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.data_processing.util.RequestImplicits.RichRequest
import com.twitter.timelines.data_processing.util.example.RecapTweetExample
import com.twitter.timelines.data_processing.util.execution.UTCDateRangeFromArgs
import com.twitter.timelines.prediction.adapters.recap.RecapSuggestionRecordAdapter
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import com.twitter.timelines.suggests.common.record.thriftscala.SuggestionRecord
import com.twitter.timelineservice.suggests.logging.recap.thriftscala.HighlightTweet
import com.twitter.timelineservice.suggests.logging.thriftscala.SuggestsRequestLog
import scala.collection.JavaConverters._
import scala.language.reflectiveCalls
import scala.util.Random
import twadoop_config.configuration.log_categories.group.timelines.TimelineserviceInjectionRequestLogScalaDataset
/**
 * Evaluates an Earlybird model using 1% injection request logs.
 *
 * Besides the models given on the command line, three reference scorers are always
 * evaluated: "random", "original_earlybird" and "blender".
 *
 * Arguments:
 * --model_base_path path to Earlybird model snapshots
 * --models list of model names to evaluate
 * --output path to output stats
 * --parallelism (default: 3) number of tasks to run in parallel
 * --topks (optional) list of values of `k` (integers) for top-K metrics
 * --topn_fractions (optional) list of values of `n` (doubles) for top-N-fraction metrics
 * --seed (optional) seed for random number generator
 */
object EarlybirdModelEvaluationJob extends TwitterExecutionApp with UTCDateRangeFromArgs {

  import FDsl._
  import PredictionEnginePlugin._

  // Averages the per-request metric values into a single number per metric.
  private[this] val averager: Aggregator[Double, AveragedValue, Double] =
    AveragedValue.aggregator

  private[this] val recapAdapter: IRecordOneToManyAdapter[SuggestionRecord] =
    new RecapSuggestionRecordAdapter(checkDwellTime = false)

  /**
   * Scores the candidates of every logged request with each model and reference scorer,
   * computes all metrics per scorer and writes `(scorer name, metric name -> value)`
   * entries as JSON to `--output`.
   */
  override def job: Execution[Unit] = {
    for {
      args <- Execution.getArgs
      dateRange <- dateRangeEx
      metrics = getMetrics(args)
      random = buildRandom(args)
      modelBasePath = args("model_base_path")
      models = args.list("models")
      parallelism = args.int("parallelism", 3)
      logs = logsHavingCandidates(dateRange)
      modelScoredCandidates = models.map { model =>
        (model, scoreCandidatesUsingModel(logs, s"$modelBasePath/$model"))
      }
      functionScoredCandidates = List(
        ("random", scoreCandidatesUsingFunction(logs, _ => Some(random.nextDouble()))),
        ("original_earlybird", scoreCandidatesUsingFunction(logs, extractOriginalEarlybirdScore)),
        ("blender", scoreCandidatesUsingFunction(logs, extractBlenderScore))
      )
      allCandidates = modelScoredCandidates ++ functionScoredCandidates
      statsExecutions = allCandidates.map {
        case (name, pipe) =>
          for {
            // Materialize the scored candidates once; every metric then reads the saved pipe.
            saved <- pipe.forceToDiskExecution
            stats <- computeMetrics(saved, metrics, parallelism)
          } yield (name, stats)
      }
      stats <- Execution.withParallelism(statsExecutions, parallelism)
      _ <- TypedPipe.from(stats).writeExecution(TypedJson(args("output")))
    } yield ()
  }

  /**
   * Averages each metric over all requests.
   *
   * Requests for which a metric returns `None` do not contribute to that metric's average,
   * and a metric without any value is absent from the resulting map.
   */
  private[this] def computeMetrics(
    requests: TypedPipe[Seq[CandidateRecord]],
    metricsToCompute: Seq[EarlybirdEvaluationMetric],
    parallelism: Int
  ): Execution[Map[String, Double]] = {
    val metricExecutions = metricsToCompute.map { metric =>
      val metricEx = requests.flatMap(metric(_)).aggregate(averager).toOptionExecution
      metricEx.map { value => value.map((metric.name, _)) }
    }
    Execution.withParallelism(metricExecutions, parallelism).map(_.flatten.toMap)
  }

  /**
   * Builds the list of metrics to compute: one group per `--topks` value, one group per
   * `--topn_fractions` value, plus the metrics that take no parameter.
   */
  private[this] def getMetrics(args: Args): Seq[EarlybirdEvaluationMetric] = {
    val topKs = args.list("topks").map(_.toInt)
    val topNFractions = args.list("topn_fractions").map(_.toDouble)
    val topKMetrics = topKs.flatMap { topK =>
      Seq(
        TopKRecall(topK, filterFewerThanK = false),
        TopKRecall(topK, filterFewerThanK = true),
        ShownTweetRecall(topK),
        AverageFullScoreForTopLight(topK),
        SumScoreRecallForTopLight(topK),
        HasFewerThanKCandidates(topK),
        ShownTweetRecallWithFullScores(topK)
      )
    }
    val topNPercentMetrics = topNFractions.flatMap { topNPercent =>
      Seq(
        TopNPercentRecall(topNPercent),
        ShownTweetPercentRecall(topNPercent)
      )
    }
    // These metrics do not depend on `k`, so they are listed exactly once. Keeping
    // ProbabilityOfCorrectOrdering in the per-k group would recompute the quadratic
    // metric once per `--topks` value and skip it entirely when `--topks` is not given.
    val requestLevelMetrics = Seq(ProbabilityOfCorrectOrdering, NumberOfCandidates)
    topKMetrics ++ topNPercentMetrics ++ requestLevelMetrics
  }

  /** Random generator seeded with `--seed` when given, otherwise with a default seed. */
  private[this] def buildRandom(args: Args): Random = {
    val seedOpt = args.optional("seed").map(_.toLong)
    seedOpt.map(new Random(_)).getOrElse(new Random())
  }

  /** Injection request logs of the date range whose `recapCandidates` is present and non-empty. */
  private[this] def logsHavingCandidates(dateRange: DateRange): TypedPipe[SuggestsRequestLog] =
    DAL
      .read(TimelineserviceInjectionRequestLogScalaDataset, dateRange)
      .toTypedPipe
      .filter(_.recapCandidates.exists(_.nonEmpty))

  /**
   * Uses a model defined at `earlybirdModelPath` to score candidates and
   * returns a Seq[CandidateRecord] for each request.
   *
   * A candidate is dropped when it has no item id, no combined model score, no data record
   * from the adapter, or no predicted score from the model.
   */
  private[this] def scoreCandidatesUsingModel(
    logs: TypedPipe[SuggestsRequestLog],
    earlybirdModelPath: String
  ): TypedPipe[Seq[CandidateRecord]] = {
    logs
      .usingScorer(earlybirdModelPath)
      .map {
        case (scorer: PredictionEngineScorer, log: SuggestsRequestLog) =>
          val suggestionRecords =
            RecapTweetExample
              .extractCandidateTweetExamples(log)
              .map(_.asSuggestionRecord)
          val servedTweetIds = log.servedHighlightTweets.flatMap(_.tweetId).toSet
          val renamer = (new EarlybirdTrainingRecapConfiguration).EarlybirdFeatureRenamer
          suggestionRecords.flatMap { suggestionRecord =>
            // Only the first data record produced by the adapter is scored.
            val dataRecordOpt = recapAdapter.adaptToDataRecords(suggestionRecord).asScala.headOption
            // The renamer transforms the record in place before it is scored.
            dataRecordOpt.foreach(renamer.transform)
            for {
              tweetId <- suggestionRecord.itemId
              fullScore <- suggestionRecord.recapFeatures.flatMap(_.combinedModelScore)
              earlybirdScore <- dataRecordOpt.flatMap(calculateLightScore(_, scorer))
            } yield CandidateRecord(
              tweetId = tweetId,
              fullScore = fullScore,
              earlyScore = earlybirdScore,
              served = servedTweetIds.contains(tweetId)
            )
          }
      }
  }

  /**
   * Uses a simple function to score candidates and returns a Seq[CandidateRecord] for each
   * request.
   *
   * A candidate is dropped when it has no tweet id, no combined model score, or when
   * `earlyScoreExtractor` returns `None` for it.
   */
  private[this] def scoreCandidatesUsingFunction(
    logs: TypedPipe[SuggestsRequestLog],
    earlyScoreExtractor: HighlightTweet => Option[Double]
  ): TypedPipe[Seq[CandidateRecord]] = {
    logs
      .map { log =>
        val tweetCandidates = log.recapTweetCandidates.getOrElse(Nil)
        val servedTweetIds = log.servedHighlightTweets.flatMap(_.tweetId).toSet
        for {
          candidate <- tweetCandidates
          tweetId <- candidate.tweetId
          fullScore <- candidate.recapFeatures.flatMap(_.combinedModelScore)
          earlyScore <- earlyScoreExtractor(candidate)
        } yield CandidateRecord(
          tweetId = tweetId,
          fullScore = fullScore,
          earlyScore = earlyScore,
          served = servedTweetIds.contains(tweetId)
        )
      }
  }

  /** The `earlybirdScore` logged in the candidate's tweet features, when those are present. */
  private[this] def extractOriginalEarlybirdScore(candidate: HighlightTweet): Option[Double] =
    for {
      recapFeatures <- candidate.recapFeatures
      tweetFeatures <- recapFeatures.tweetFeatures
    } yield tweetFeatures.earlybirdScore

  /** The `blenderScore` logged in the candidate's tweet features, when those are present. */
  private[this] def extractBlenderScore(candidate: HighlightTweet): Option[Double] =
    for {
      recapFeatures <- candidate.recapFeatures
      tweetFeatures <- recapFeatures.tweetFeatures
    } yield tweetFeatures.blenderScore

  /**
   * Scores `dataRecord` with `scorer` and returns the value of
   * `PREDICTED_IS_UNIFIED_ENGAGEMENT`, or `None` when the scored record lacks that feature.
   */
  private[this] def calculateLightScore(
    dataRecord: DataRecord,
    scorer: PredictionEngineScorer
  ): Option[Double] = {
    val scoredRecord = scorer(dataRecord)
    if (scoredRecord.hasFeature(RecapFeatures.PREDICTED_IS_UNIFIED_ENGAGEMENT)) {
      Some(scoredRecord.getFeatureValue(RecapFeatures.PREDICTED_IS_UNIFIED_ENGAGEMENT).toDouble)
    } else {
      None
    }
  }
}

View File

@ -1,89 +0,0 @@
# Data-record dataset definitions for the Recap flavour of the training data.
create_datarecord_datasets(
base_name = "earlybird_recap_data_records",
platform = "java8",
role = "timelines",
segment_type = "partitioned",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
)
# Data-record dataset definitions for the rectweet flavour of the training data.
create_datarecord_datasets(
base_name = "earlybird_rectweet_data_records",
platform = "java8",
role = "timelines",
segment_type = "partitioned",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
)
# Library target built from every *.scala source in this directory. It depends on the
# "-java" targets of the two datasets declared above.
scala_library(
name = "training_data_generation",
sources = ["*.scala"],
platform = "java8",
strict_deps = True,
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
],
dependencies = [
":earlybird_recap_data_records-java",
":earlybird_rectweet_data_records-java",
"3rdparty/jvm/com/ibm/icu:icu4j",
"3rdparty/src/jvm/com/twitter/scalding:json",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/java/com/twitter/ml/api/matcher",
"src/java/com/twitter/search/common/features",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/analytics",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/scalding_internal/dalv2",
"src/scala/com/twitter/scalding_internal/dalv2/dataset",
"src/scala/com/twitter/scalding_internal/job",
"src/scala/com/twitter/scalding_internal/job/analytics_batch",
"src/scala/com/twitter/timelines/prediction/features/common",
"src/scala/com/twitter/timelines/prediction/features/recap",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:dataset-analytics-java",
"timelines/data_processing/ad_hoc/earlybird_ranking/common",
"timelines/data_processing/ad_hoc/recap/dataset_utils",
"timelines/data_processing/ad_hoc/recap/offline_execution",
"timelines/data_processing/util/execution",
],
)
# Hadoop binary whose entry point is the generic com.twitter.scalding.Tool launcher
# rather than a specific job class.
hadoop_binary(
name = "bin",
basename = "earlybird_training_data_generation-deploy",
main = "com.twitter.scalding.Tool",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":training_data_generation",
],
)
# Hadoop binary whose entry point is EarlybirdTrainingDataProdJob.
hadoop_binary(
name = "earlybird_training_data_generation_prod",
basename = "earlybird_training_data_generation_prod-deploy",
main = "com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation.EarlybirdTrainingDataProdJob",
platform = "java8",
runtime_platform = "java8",
tags = [
"bazel-compatible",
"bazel-compatible:migrated",
"bazel-only",
],
dependencies = [
":training_data_generation",
],
)

View File

@ -1,65 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.DataSetPipe
import com.twitter.ml.api.Feature
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.LabelInfo
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.LabelInfoWithFeature
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import java.lang.{Double => JDouble}
import scala.util.Random
/**
 * Downsamples records and attaches an importance weight plus, for records carrying any of
 * the configured labels, the `IS_EARLYBIRD_UNIFIED_ENGAGEMENT` label.
 * See [[weightAndSample]] for details on operation.
 *
 * @param random       source of randomness for the sampling decisions
 * @param labelInfos   the label features to look for, each with its sampling info
 * @param negativeInfo sampling info applied to records that carry none of the labels
 */
class EarlybirdExampleSampler(
  random: Random,
  labelInfos: List[LabelInfoWithFeature],
  negativeInfo: LabelInfo) {

  import com.twitter.ml.api.util.FDsl._

  // Record-weight feature carrying the extension ("type" -> "earlybird").
  private[this] val ImportanceFeature: Feature[JDouble] =
    SharedFeatures.RECORD_WEIGHT_FEATURE_BUILDER
      .extensionBuilder()
      .addExtension("type", "earlybird")
      .build()

  // True with probability `downsampleFraction`.
  private[this] def uniformSample(labelInfo: LabelInfo) =
    random.nextDouble() < labelInfo.downsampleFraction

  // Importance scaled up by the inverse of the sampling fraction.
  private[this] def weightedImportance(labelInfo: LabelInfo) =
    labelInfo.importance / labelInfo.downsampleFraction

  /**
   * Samples and weights every record of `data`.
   *
   * - A record carrying at least one configured label gets an independent sampling draw
   *   per label it carries. If no draw succeeds the record is dropped; otherwise it is
   *   marked with `IS_EARLYBIRD_UNIFIED_ENGAGEMENT` and its importance is the sum of the
   *   weighted importances of the labels whose draw succeeded.
   * - A record carrying none of the labels is kept according to a single draw for
   *   `negativeInfo` and receives that info's weighted importance.
   *
   * The returned pipe's feature context additionally contains the importance feature and
   * `IS_EARLYBIRD_UNIFIED_ENGAGEMENT`.
   */
  def weightAndSample(data: DataSetPipe): DataSetPipe = {
    val sampledRecords = data.records.flatMap { record =>
      val labelsPresent = labelInfos.filter(info => record.hasFeature(info.feature))
      if (labelsPresent.isEmpty) {
        if (uniformSample(negativeInfo)) {
          Some(record.setFeatureValue(ImportanceFeature, weightedImportance(negativeInfo)))
        } else {
          None
        }
      } else {
        val keptInfos = labelsPresent.map(_.info).filter(uniformSample)
        if (keptInfos.isEmpty) {
          None
        } else {
          record.setFeatureValue(RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT, true)
          val importance = keptInfos.map(weightedImportance).sum
          Some(record.setFeatureValue(ImportanceFeature, importance))
        }
      }
    }

    val extendedContext = data.featureContext
      .addFeatures(ImportanceFeature, RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT)

    DataSetPipe(sampledRecords, extendedContext)
  }
}

View File

@ -1,63 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.analytics.DataSetAnalyticsPlugin
import com.twitter.ml.api.matcher.FeatureMatcher
import com.twitter.ml.api.util.FDsl
import com.twitter.ml.api.DailySuffixFeatureSource
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.DataSetPipe
import com.twitter.ml.api.FeatureStats
import com.twitter.ml.api.IMatcher
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.Execution
import com.twitter.scalding.TypedJson
import com.twitter.scalding_internal.job.TwitterExecutionApp
import com.twitter.timelines.data_processing.util.execution.UTCDateRangeFromArgs
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingConfiguration
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.prediction.features.recap.RecapFeatures
import scala.collection.JavaConverters._
/**
 * Computes feature statistics for all labels in a Recap data source, including the
 * derived `IS_EARLYBIRD_UNIFIED_ENGAGEMENT` label.
 *
 * Arguments:
 * --input recap data source (containing all labels)
 * --output path to output JSON file containing stats
 */
object EarlybirdStatsJob extends TwitterExecutionApp with UTCDateRangeFromArgs {

  import DataSetAnalyticsPlugin._
  import FDsl._
  import RecapFeatures.IS_EARLYBIRD_UNIFIED_ENGAGEMENT

  lazy val constants: EarlybirdTrainingConfiguration = new EarlybirdTrainingRecapConfiguration

  // Marks the record with the unified engagement label when it carries any configured
  // label; the record itself is returned either way.
  private[this] def addGlobalEngagementLabel(record: DataRecord) = {
    val carriesAnyLabel =
      constants.LabelInfos.exists(info => record.hasFeature(info.feature))
    if (carriesAnyLabel) {
      record.setFeatureValue(IS_EARLYBIRD_UNIFIED_ENGAGEMENT, true)
    }
    record
  }

  // Matches, by name, the unified engagement label and every configured label feature.
  private[this] def labelFeatureMatcher: IMatcher = {
    val labelFeatures = IS_EARLYBIRD_UNIFIED_ENGAGEMENT :: constants.LabelInfos.map(_.feature)
    val labelNames = labelFeatures.map(_.getFeatureName)
    FeatureMatcher.names(labelNames.asJava)
  }

  // Adds the unified label, keeps only label features, then collects their stats.
  private[this] def computeStats(data: DataSetPipe): TypedPipe[FeatureStats] = {
    val labelled = data.viaRecords(records => records.map(addGlobalEngagementLabel))
    labelled
      .project(labelFeatureMatcher)
      .collectFeatureStats()
  }

  /** Reads the daily-suffixed input for the date range and writes the stats as JSON. */
  override def job: Execution[Unit] = {
    for {
      args <- Execution.getArgs
      dateRange <- dateRangeEx
      data = DailySuffixFeatureSource(args("input"))(dateRange).read
      _ <- computeStats(data).writeExecution(TypedJson(args("output")))
    } yield ()
  }
}

View File

@ -1,92 +0,0 @@
package com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation
import com.twitter.ml.api.HourlySuffixFeatureSource
import com.twitter.ml.api.IRecord
import com.twitter.scalding.Args
import com.twitter.scalding.DateRange
import com.twitter.scalding.Days
import com.twitter.scalding.Execution
import com.twitter.scalding.ExecutionUtil
import com.twitter.scalding_internal.dalv2.DALWrite.D
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRecapConfiguration
import com.twitter.timelines.data_processing.ad_hoc.earlybird_ranking.common.EarlybirdTrainingRectweetConfiguration
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineAdhocExecution
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineAnalyticsBatchExecution
import com.twitter.timelines.data_processing.ad_hoc.recap.offline_execution.OfflineExecution
import scala.util.Random
import com.twitter.scalding_internal.dalv2.dataset.DALWrite._
import com.twitter.timelines.prediction.features.common.TimelinesSharedFeatures
import timelines.data_processing.ad_hoc.earlybird_ranking.training_data_generation._
/**
 * Generates data for training an Earlybird-friendly model.
 * Produces a single "global" engagement label and samples data accordingly, then applies
 * the configuration's `EarlybirdFeatureRenamer` so the features carry the Earlybird
 * feature names and can be used as is in EB.
 *
 * Arguments:
 * --input path to raw Recap training data (all labels)
 * --output path to write sampled Earlybird-friendly training data
 * --seed (optional) for random number generator (in sampling)
 * --parallelism (default: 1) number of days to generate data for in parallel
 * [splits long date range into single days]
 * --rectweet (flag) use the rectweet configuration and dataset instead of the recap ones
 */
trait GenerateEarlybirdTrainingData { _: OfflineExecution =>

  /** A record is eligible when its `EARLYBIRD_SCORE` is at most 100.0. */
  def isEligibleForEarlybirdScoring(record: IRecord): Boolean = {
    // The rationale behind this logic is available in TQ-9678.
    val maxEligibleScore = 100.0
    record.getFeatureValue(TimelinesSharedFeatures.EARLYBIRD_SCORE) <= maxEligibleScore
  }

  /**
   * Splits the date range into single days and, for each day, filters, samples, renames,
   * shuffles and writes the records to the recap or rectweet dataset under `--output`.
   */
  override def executionFromParams(args: Args)(implicit dateRange: DateRange): Execution[Unit] = {
    val seedOpt = args.optional("seed").map(_.toLong)
    val parallelism = args.int("parallelism", 1)
    val rectweet = args.boolean("rectweet")

    ExecutionUtil
      .runDateRangeWithParallelism(Days(1), parallelism) { splitRange =>
        val eligibleData = HourlySuffixFeatureSource(args("input"))(splitRange).read
          .filter(isEligibleForEarlybirdScoring _)

        lazy val rng = seedOpt.fold(new Random())(new Random(_))

        val (constants, sink) =
          if (rectweet)
            (new EarlybirdTrainingRectweetConfiguration, EarlybirdRectweetDataRecordsJavaDataset)
          else (new EarlybirdTrainingRecapConfiguration, EarlybirdRecapDataRecordsJavaDataset)

        val earlybirdSampler =
          new EarlybirdExampleSampler(
            random = rng,
            labelInfos = constants.LabelInfos,
            negativeInfo = constants.NegativeInfo
          )

        val outputPath = args("output")

        val sampled = earlybirdSampler.weightAndSample(eligibleData)
        val renamed = sampled.transform(constants.EarlybirdFeatureRenamer)
        // shuffle row-wise in order to get rid of clustered replies
        // also keep number of part files small
        val shuffled = renamed.viaRecords { records =>
          records
            .groupRandomly(partitions = 500)
            .sortBy { _ => rng.nextDouble() }
            .values
        }

        shuffled.writeDALExecution(
          sink,
          D.Daily,
          D.Suffix(outputPath),
          D.EBLzo()
        )(splitRange)
      }(dateRange).unit
  }
}
/** Job object that mixes [[GenerateEarlybirdTrainingData]] into an OfflineAdhocExecution. */
object EarlybirdTrainingDataAdHocJob
extends OfflineAdhocExecution
with GenerateEarlybirdTrainingData
/** Job object that mixes [[GenerateEarlybirdTrainingData]] into an OfflineAnalyticsBatchExecution. */
object EarlybirdTrainingDataProdJob
extends OfflineAnalyticsBatchExecution
with GenerateEarlybirdTrainingData

View File

@ -1,124 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.EasyMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.MaxMetric
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import java.lang.{Boolean => JBoolean}
import java.lang.{Long => JLong}
import scala.language.existentials
/**
 * User-facing description of a group of aggregates. It is sugar over
 * [[com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup]]
 * (see TypedAggregateGroup.scala): callers list features of any type in one place and
 * buildTypedAggregateGroups() splits them into one strongly typed group per feature type.
 *
 * @param inputSource Source to compute this aggregate over
 * @param aggregatePrefix Prefix to use for naming resultant aggregate features
 * @param keys Features to group by when computing the aggregates
 *   (e.g. USER_ID, AUTHOR_ID). These must be either discrete, string or sparse binary.
 *   Grouping by a sparse binary feature is different than grouping by a discrete or string
 *   feature. For example, if you have a sparse binary feature WORDS_IN_TWEET which is
 *   a set of all words in a tweet, then grouping by this feature generates a
 *   separate aggregate mean/count/etc for each value of the feature (each word), and
 *   not just a single aggregate count for different "sets of words"
 * @param features Features to aggregate (e.g. blender_score or is_photo).
 * @param labels Labels to cross the features with to make pair features, if any.
 * @param metrics Aggregation metrics to compute (e.g. count, mean)
 * @param halfLives Half lives to use for the aggregations, to be crossed with the above.
 *   use Duration.Top for "forever" aggregations over an infinite time window (no decay).
 * @param outputStore Store to output this aggregate to
 * @param preTransforms Sequence of [[OneToSomeTransform]] applied to
 *   data records pre-aggregation (e.g. discretization, renaming)
 * @param includeAnyFeature Aggregate label counts for any feature value
 * @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
 * @param includeTimestampFeature compute max aggregate on timestamp feature
 * @param aggExclusionRegex Sequence of regex strings passed through unchanged to every
 *   per-feature-type TypedAggregateGroup (presumably selecting aggregate features to
 *   exclude -- confirm in TypedAggregateGroup). The timestamp group always receives an
 *   empty sequence instead.
 */
case class AggregateGroup(
  inputSource: AggregateSource,
  aggregatePrefix: String,
  keys: Set[Feature[_]],
  features: Set[Feature[_]],
  labels: Set[_ <: Feature[JBoolean]],
  metrics: Set[EasyMetric],
  halfLives: Set[Duration],
  outputStore: AggregateStore,
  preTransforms: Seq[OneToSomeTransform] = Seq.empty,
  includeAnyFeature: Boolean = true,
  includeAnyLabel: Boolean = true,
  includeTimestampFeature: Boolean = false,
  aggExclusionRegex: Seq[String] = Seq.empty) {

  /**
   * Builds the typed group for one feature type: resolves every EasyMetric to its
   * concrete metric(s) for `featureType` and views the given features as Feature[T].
   */
  private def toStrongType[T](
    metrics: Set[EasyMetric],
    features: Set[Feature[_]],
    featureType: FeatureType
  ): TypedAggregateGroup[_] = {
    val typedMetrics: Set[AggregationMetric[T, _]] =
      metrics.flatMap { easyMetric => easyMetric.forFeatureType[T](featureType) }
    // Unchecked cast: callers only pass features that share `featureType`.
    val typedFeatures: Set[Feature[T]] =
      features.map { feature => feature.asInstanceOf[Feature[T]] }

    TypedAggregateGroup[T](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = typedFeatures,
      labels = labels,
      metrics = typedMetrics,
      halfLives = halfLives,
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature,
      includeAnyLabel,
      aggExclusionRegex
    )
  }

  /**
   * Group that tracks the max of the timestamp feature: no labels, a single
   * Duration.Top half life, and no exclusion regexes.
   */
  private def timestampTypedAggregateGroup: TypedAggregateGroup[_] = {
    val timestampType = TypedAggregateGroup.timestampFeature.getFeatureType
    val maxOnly: Set[AggregationMetric[JLong, _]] =
      Set(MaxMetric.forFeatureType[JLong](timestampType).get)

    TypedAggregateGroup[JLong](
      inputSource = inputSource,
      aggregatePrefix = aggregatePrefix,
      keysToAggregate = keys,
      featuresToAggregate = Set(TypedAggregateGroup.timestampFeature),
      labels = Set.empty,
      metrics = maxOnly,
      halfLives = Set(Duration.Top),
      outputStore = outputStore,
      preTransforms = preTransforms,
      includeAnyFeature = false,
      includeAnyLabel = true,
      aggExclusionRegex = Seq.empty
    )
  }

  /**
   * Expands this group into strongly typed groups: one per distinct feature type among
   * `features` (or a single BINARY-typed group when `features` is empty), followed by
   * the timestamp group when `includeTimestampFeature` is set.
   */
  def buildTypedAggregateGroups(): List[TypedAggregateGroup[_]] = {
    val perTypeGroups = {
      if (features.nonEmpty) {
        features
          .groupBy(_.getFeatureType())
          .toList
          .map {
            case (sharedType, sameTypeFeatures) =>
              toStrongType(metrics, sameTypeFeatures, sharedType)
          }
      } else {
        List(toStrongType(metrics, features, FeatureType.BINARY))
      }
    }
    val timestampGroups =
      if (includeTimestampFeature) List(timestampTypedAggregateGroup) else List()
    perTypeGroups ++ timestampGroups
  }
}

View File

@ -1,9 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.Feature
import java.lang.{Long => JLong}
/** Serializable description of an input source that aggregates are computed over. */
trait AggregateSource extends Serializable {
// Name of this source.
def name: String
// Feature (java.lang.Long valued) that carries the timestamp for records of this source.
def timestampFeature: Feature[JLong]
}

View File

@ -1,5 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/** Serializable description of a store that aggregates are written to, identified by name. */
trait AggregateStore extends Serializable {
// Name of this store.
def name: String
}

View File

@ -1,5 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/** Configuration listing the typed aggregate groups to compute. */
trait AggregationConfig {
// All typed aggregate groups covered by this configuration.
def aggregatesToCompute: Set[TypedAggregateGroup[_]]
}

View File

@ -1,50 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.bijection.Bufferable
import com.twitter.bijection.Injection
import scala.util.Try
/**
 * Case class that represents the "grouping" key for any aggregate feature.
 * Used by Summingbird to output aggregates to the key-value "store" using sumByKey()
 *
 * @param discreteFeaturesById All discrete featureids (+ values) that are part of this key
 * @param textFeaturesById All string featureids (+ values) that are part of this key
 *
 * Example 1: the user aggregate features in aggregatesv1 all group by USER_ID,
 * which is a discrete feature. When storing these features, the key would be:
 *
 * discreteFeaturesById = Map(hash(USER_ID) -> <the actual user id>), textFeaturesById = Map()
 *
 * Ex 2: If aggregating grouped by USER_ID, AUTHOR_ID, tweet link url, the key would be:
 *
 * discreteFeaturesById = Map(hash(USER_ID) -> <actual user id>, hash(AUTHOR_ID) -> <actual author id>),
 * textFeaturesById = Map(hash(URL_FEATURE) -> <the link url>)
 *
 * I could have just used a DataRecord for the key, but I wanted to make it strongly typed
 * and only support grouping by discrete and string features, so using a case class instead.
 *
 * Re: efficiency, storing the hash of the feature in addition to just the feature value
 * is somewhat more inefficient than only storing the feature value in the key, but it
 * adds flexibility to group multiple types of aggregates in the same output store. If we
 * decide this isn't a good tradeoff to make later, we can reverse/refactor this decision.
 */
case class AggregationKey(
discreteFeaturesById: Map[Long, Long],
textFeaturesById: Map[Long, String])
/**
 * Custom [[Injection]] between [[AggregationKey]] and Array[Byte], so that Summingbird
 * knows how to store the key in Manhattan. The key is serialized as the tuple of its
 * two feature maps.
 */
object AggregationKeyInjection extends Injection[AggregationKey, Array[Byte]] {

  /* Injection from tuple representation of AggregationKey to Array[Byte] */
  val featureMapsInjection: Injection[(Map[Long, Long], Map[Long, String]), Array[Byte]] =
    Bufferable.injectionOf[(Map[Long, Long], Map[Long, String])]

  /** Serializes the key's (discrete, text) feature maps. */
  def apply(aggregationKey: AggregationKey): Array[Byte] = {
    val featureMaps = AggregationKey.unapply(aggregationKey).get
    featureMapsInjection(featureMaps)
  }

  /** Decodes the feature-map tuple and rebuilds the key; decoding failures stay in the Try. */
  def invert(ab: Array[Byte]): Try[AggregationKey] = {
    val decodedMaps = featureMapsInjection.invert(ab)
    decodedMaps.map(AggregationKey.tupled)
  }
}

View File

@ -1,101 +0,0 @@
# Scala library built from every *.scala file in this directory.
scala_library(
name = "common_types",
sources = ["*.scala"],
platform = "java8",
strict_deps = True,
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/twitter/algebird:bijection",
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:macros",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/dal/client/dataset",
"src/scala/com/twitter/ml/api/util:datarecord",
"src/scala/com/twitter/scalding_internal/dalv2/vkvs",
"src/scala/com/twitter/scalding_internal/multiformat/format/keyval",
"src/scala/com/twitter/storehaus_internal/manhattan/config",
"src/scala/com/twitter/storehaus_internal/offline",
"src/scala/com/twitter/storehaus_internal/util",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/scala/com/twitter/summingbird_internal/runner/store_config",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/dal/personal_data:personal_data-scala",
"src/thrift/com/twitter/ml/api:data-java",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"timelines/data_processing/ml_util/transforms",
"util/util-core:util-core-util",
],
)
# Alias for the memcache store dependency.
target(
name = "common_online_stores",
dependencies = [
"src/scala/com/twitter/storehaus_internal/memcache",
],
)
# Alias for the manhattan store dependency.
target(
name = "common_offline_stores",
dependencies = [
"src/scala/com/twitter/storehaus_internal/manhattan",
],
)
# The following targets are aliases for the sub-packages of this directory.
target(
name = "user_job",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/job",
],
)
target(
name = "scalding",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/scalding",
],
)
target(
name = "conversion",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/conversion",
],
)
target(
name = "query",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/query",
],
)
target(
name = "heron",
dependencies = [
"timelines/data_processing/ml_util/aggregation_framework/heron",
],
)
# Umbrella target bundling the targets above, except ":user_job", which is not listed.
# NOTE(review): no `name` is given here; presumably the build tool falls back to a
# default name (e.g. the directory name) -- confirm.
target(
dependencies = [
":common_offline_stores",
":common_online_stores",
":common_types",
":conversion",
":heron",
":query",
":scalding",
],
)

View File

@ -1,92 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.algebird.Monoid
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import scala.collection.mutable
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
/**
 * Base [[Monoid]] for summing DataRecord objects.
 *
 * Implementations supply `aggregates`, the set of ''TypedAggregateGroup'' case classes
 * (see TypedAggregateGroup.scala) whose mutatePlus defines how two records combine.
 */
trait DataRecordMonoid extends Monoid[DataRecord] {
  val aggregates: Set[TypedAggregateGroup[_]]

  /** The identity element: a fresh, empty DataRecord. */
  def zero(): DataRecord = new DataRecord

  /**
   * Adds two datarecords using this monoid.
   *
   * @param left Left datarecord to add
   * @param right Right datarecord to add
   * @return A new DataRecord holding the sum; its TIMESTAMP is the larger of the two
   *         inputs' timestamps
   */
  def plus(left: DataRecord, right: DataRecord): DataRecord = {
    val sum = zero()
    for (aggregate <- aggregates) aggregate.mutatePlus(sum, left, right)

    val newestTimestamp = getTimestamp(left).max(getTimestamp(right))
    SRichDataRecord(sum).setFeatureValue(SharedFeatures.TIMESTAMP, newestTimestamp)
    sum
  }
}
/**
 * [[DataRecordMonoid]] that overrides sumOption with a buffered, batched summation.
 *
 * @param aggregates typed aggregate groups that define how records are combined
 */
case class DataRecordAggregationMonoid(aggregates: Set[TypedAggregateGroup[_]])
    extends DataRecordMonoid {

  /**
   * Collapses `buffer` in place: folds every buffered record into one running sum,
   * then replaces the buffer's contents with that single record.
   */
  private def sumBuffer(buffer: mutable.ArrayBuffer[DataRecord]): Unit = {
    val runningSum = zero()
    for (record <- buffer) {
      // Read both timestamps before mutatePlus touches the running sum.
      val sumTimestamp = getTimestamp(runningSum)
      val recordTimestamp = getTimestamp(record)
      for (aggregate <- aggregates) aggregate.mutatePlus(runningSum, runningSum, record)
      SRichDataRecord(runningSum).setFeatureValue(
        SharedFeatures.TIMESTAMP,
        sumTimestamp.max(recordTimestamp)
      )
    }
    buffer.clear()
    buffer += runningSum
  }

  /**
   * Efficient batched aggregation of datarecords using
   * this monoid + a buffer, for performance.
   *
   * @param dataRecordIter An iterator of datarecords to sum
   * @return A datarecord option containing the sum, or None for empty input
   */
  override def sumOption(dataRecordIter: TraversableOnce[DataRecord]): Option[DataRecord] =
    if (dataRecordIter.nonEmpty) {
      val BatchSize = 1000
      val pending = mutable.ArrayBuffer[DataRecord]()
      dataRecordIter.foreach { record =>
        // Collapse once the buffer holds more than BatchSize records.
        if (pending.size > BatchSize) sumBuffer(pending)
        pending += record
      }
      // A single remaining record is already the answer and is returned as is.
      if (pending.size > 1) sumBuffer(pending)
      Some(pending(0))
    } else {
      None
    }
}
/*
 * This class is used when there is no need to use sumBuffer functionality, as in the case of
 * online aggregation of datarecords where using a buffer on a small number of datarecords
 * would add some performance overhead.
 *
 * It adds nothing to DataRecordMonoid: the body is empty, so no sumOption override is
 * defined here.
 */
case class DataRecordAggregationMonoidNoBuffer(aggregates: Set[TypedAggregateGroup[_]])
extends DataRecordMonoid {}

View File

@ -1,27 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.DataRecord
/**
 * Keyed record that is used to represent the aggregation type and its corresponding data record.
 *
 * @constructor creates a new keyed record.
 *
 * @param aggregateType the aggregate type
 * @param record the data record associated with the key
 **/
case class KeyedRecord(aggregateType: AggregateType.Value, record: DataRecord)
/**
 * Keyed record map holding multiple data records for one aggregate type.
 *
 * @constructor creates a new keyed record map.
 *
 * @param aggregateType the aggregate type
 * @param recordMap a map with key of type Long and value of type DataRecord
 * where the key indicates the index and the value indicates the record
 *
 **/
case class KeyedRecordMap(
aggregateType: AggregateType.Value,
recordMap: scala.collection.Map[Long, DataRecord])

View File

@ -1,46 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.personal_data.thriftscala.PersonalDataType
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.Batched
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.JavaCompactThrift
import com.twitter.scalding_internal.multiformat.format.keyval.KeyValInjection.genericInjection
import com.twitter.summingbird.batch.BatchID
import scala.collection.JavaConverters._
/**
 * Key/value injections for offline aggregates keyed by [[AggregationKey]] with
 * (BatchID, DataRecord) values.
 */
object OfflineAggregateInjections {

  /** Injection built without any personal data type information. */
  val offlineDataRecordAggregateInjection: KeyValInjection[AggregationKey, (BatchID, DataRecord)] =
    KeyValInjection(
      genericInjection(AggregationKeyInjection),
      Batched(JavaCompactThrift[DataRecord])
    )

  /**
   * Collects the personal data types declared on the features of the given groups.
   *
   * @param aggregateGroups  groups to inspect
   * @param featureExtractor selects which features of a group to inspect
   * @return Some(non-empty set of personal data types), or None when nothing was found.
   *         Java values for which PersonalDataType.get yields no result are skipped.
   */
  private[aggregation_framework] def getPdts[T](
    aggregateGroups: Iterable[T],
    featureExtractor: T => Iterable[Feature[_]]
  ): Option[Set[PersonalDataType]] = {
    val collected: Set[PersonalDataType] = for {
      aggregateGroup <- aggregateGroups.toSet[T]
      extractedFeature <- featureExtractor(aggregateGroup)
      javaPdtSet <- extractedFeature.getPersonalDataTypes.asSet().asScala
      javaPdt <- javaPdtSet.asScala
      scalaPdt <- PersonalDataType.get(javaPdt.getValue)
    } yield scalaPdt

    if (collected.isEmpty) None else Some(collected)
  }

  /**
   * Builds the injection for the given groups, passing along the personal data types of
   * their output keys (key side) and output features (value side).
   */
  def getInjection(
    aggregateGroups: Set[TypedAggregateGroup[_]]
  ): KeyValInjection[AggregationKey, (BatchID, DataRecord)] = {
    val pdtsOfKeys = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputKeys)
    val pdtsOfValues = getPdts[TypedAggregateGroup[_]](aggregateGroups, _.allOutputFeatures)
    val keyInjection = genericInjection(AggregationKeyInjection, pdtsOfKeys)
    val valueInjection = genericInjection(Batched(JavaCompactThrift[DataRecord]), pdtsOfValues)
    KeyValInjection(keyInjection, valueInjection)
  }
}

View File

@ -1,21 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.client.dataset.TimePartitionedDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import java.lang.{Long => JLong}
/**
 * [[AggregateSource]] for offline jobs, read either from an HDFS path or from a DAL
 * dataset.
 *
 * @param name name of this source
 * @param timestampFeature feature carrying the record timestamp
 * @param scaldingHdfsPath optional HDFS path of the input
 * @param scaldingSuffixType optional suffix type for the HDFS input
 * @param dalDataSet optional DAL dataset of the input
 * @param withValidation see the linked ticket for context
 */
case class OfflineAggregateSource(
  override val name: String,
  override val timestampFeature: Feature[JLong],
  scaldingHdfsPath: Option[String] = None,
  scaldingSuffixType: Option[String] = None,
  dalDataSet: Option[TimePartitionedDALDataset[DataRecord]] = None,
  withValidation: Boolean = true) // context: https://jira.twitter.biz/browse/TQ-10618
    extends AggregateSource {
  /*
   * To help transition callers to use DAL.read, the HDFS path and the dalDataset
   * may not both be set at the same time. Leaving both unset passes this check.
   */
  assert(scaldingHdfsPath.isEmpty || dalDataSet.isEmpty)
}

View File

@ -1,128 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.dal.client.dataset.KeyValDALDataset
import com.twitter.ml.api.DataRecord
import com.twitter.scalding.DateParser
import com.twitter.scalding.RichDate
import com.twitter.scalding_internal.multiformat.format.keyval.KeyVal
import com.twitter.storehaus_internal.manhattan._
import com.twitter.storehaus_internal.util.ApplicationID
import com.twitter.storehaus_internal.util.DatasetName
import com.twitter.storehaus_internal.util.HDFSPath
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird.batch.Batcher
import com.twitter.summingbird_internal.runner.store_config._
import java.util.TimeZone
import com.twitter.summingbird.batch.MillisecondBatcher
/**
 * Configuration common to all offline aggregate stores
 *
 * @param outputHdfsPathPrefix HDFS prefix to store all output aggregate types offline
 * @param dummyAppId Dummy manhattan app id required by summingbird (unused)
 * @param dummyDatasetPrefix Dummy manhattan dataset prefix required by summingbird (unused)
 * @param startDate Start date for summingbird job to begin computing aggregates
 */
case class OfflineAggregateStoreCommonConfig(
outputHdfsPathPrefix: String,
dummyAppId: String,
dummyDatasetPrefix: String,
startDate: String)
/**
 * A trait inherited by any object that defines
 * an HDFS prefix to write output data to. E.g. timelines has its own
 * output prefix to write aggregates_v2 results, your team can create
 * its own.
 */
trait OfflineStoreCommonConfig extends Serializable {
/**
 * @param startDate Date to create config for
 * @return OfflineAggregateStoreCommonConfig object with all config details for output populated
 */
def apply(startDate: String): OfflineAggregateStoreCommonConfig
}
/**
 * Base class for offline aggregate stores. The settings below are abstract members
 * (not constructor parameters) that concrete stores must provide:
 *
 *  - name: Uniquely identifiable human-readable name for this output store
 *  - startDate: Start date for this output store from which aggregates should be computed
 *  - commonConfig: Provider of other common configuration details
 *  - batchesToKeep: Retention policy on output (number of batches to keep)
 *  - maxKvSourceFailures: not read inside this class; presumably a tolerance for
 *    key-value source failures -- TODO confirm with the code that consumes it
 */
abstract class OfflineAggregateStoreBase
extends OfflineStoreOnlyConfig[ManhattanROConfig]
with AggregateStore {
override def name: String
def startDate: String
def commonConfig: OfflineStoreCommonConfig
def batchesToKeep: Int
def maxKvSourceFailures: Int
// Common configuration resolved for this store's start date.
// NOTE(review): this val and the ones below are computed eagerly during construction and
// read the abstract members above, so implementations must make those available by then
// (the stores in this file supply them as constructor `override val`s).
val datedCommonConfig: OfflineAggregateStoreCommonConfig = commonConfig.apply(startDate)
// Output location is "<outputHdfsPathPrefix>/<name>"; dataset name is
// "<dummyDatasetPrefix>_<name>_1".
val manhattan: ManhattanROConfig = ManhattanROConfig(
/* This is a sample config, will be replaced with production config later */
HDFSPath(s"${datedCommonConfig.outputHdfsPathPrefix}/${name}"),
ApplicationID(datedCommonConfig.dummyAppId),
DatasetName(s"${datedCommonConfig.dummyDatasetPrefix}_${name}_1"),
com.twitter.storehaus_internal.manhattan.Adama
)
// Batch length in hours.
val batcherSize = 24
val batcher: MillisecondBatcher = Batcher.ofHours(batcherSize)
// Start date from the dated common config, parsed in UTC with the default date parser.
val startTime: RichDate =
RichDate(datedCommonConfig.startDate)(TimeZone.getTimeZone("UTC"), DateParser.default)
// The offline config is the same ManhattanROConfig instance as `manhattan`.
val offline: ManhattanROConfig = manhattan
}
/**
 * Defines an aggregates store which is composed of DataRecords
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param batchesToKeep Retention policy on output (number of batches to keep)
 * @param maxKvSourceFailures defaults to 0; not read in this class -- presumably a
 * tolerance for key-value source failures, TODO confirm with the consuming code
 */
case class OfflineAggregateDataRecordStore(
override val name: String,
override val startDate: String,
override val commonConfig: OfflineStoreCommonConfig,
override val batchesToKeep: Int = 7,
override val maxKvSourceFailures: Int = 0)
extends OfflineAggregateStoreBase {
/**
 * Creates the DAL-backed variant of this store with the same name, start date, common
 * config and maxKvSourceFailures. batchesToKeep is not carried over, so the DAL
 * variant keeps its own default.
 *
 * @param dalDataset dataset the resulting store is associated with
 */
def toOfflineAggregateDataRecordStoreWithDAL(
dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
): OfflineAggregateDataRecordStoreWithDAL =
OfflineAggregateDataRecordStoreWithDAL(
name = name,
startDate = startDate,
commonConfig = commonConfig,
dalDataset = dalDataset,
maxKvSourceFailures = maxKvSourceFailures
)
}
/** Mixin for stores that expose the key-value DAL dataset they are associated with. */
trait withDALDataset {
// Dataset of AggregationKey -> (BatchID, DataRecord) entries.
def dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]]
}
/**
 * Defines an aggregates store which is composed of DataRecords and writes using DAL.
 * @param name Uniquely identifiable human-readable name for this output store
 * @param startDate Start date for this output store from which aggregates should be computed
 * @param commonConfig Provider of other common configuration details
 * @param dalDataset The KeyValDALDataset for this output store
 * @param batchesToKeep Unused, kept for interface compatibility. You must define a separate Oxpecker
 * retention policy to maintain the desired number of versions.
 * @param maxKvSourceFailures defaults to 0; not read in this class -- presumably a
 * tolerance for key-value source failures, TODO confirm with the consuming code
 */
case class OfflineAggregateDataRecordStoreWithDAL(
override val name: String,
override val startDate: String,
override val commonConfig: OfflineStoreCommonConfig,
override val dalDataset: KeyValDALDataset[KeyVal[AggregationKey, (BatchID, DataRecord)]],
override val batchesToKeep: Int = -1,
override val maxKvSourceFailures: Int = 0)
extends OfflineAggregateStoreBase
with withDALDataset

View File

@ -1,39 +0,0 @@
Overview
========
The **aggregation framework** is a set of libraries and utilities that allows teams to flexibly
compute aggregate (counting) features in both batch and in real-time. Aggregate features can capture
historical interactions between arbitrary entities (and sets thereof), conditional on provided features
and labels.
These types of engineered aggregate features have proven to be highly impactful across different teams at Twitter.
What are some features we can compute?
--------------------------------------
The framework supports computing aggregate features on provided grouping keys. The only constraint is that these keys are sparse binary features (or are sets thereof).
For example, a common use case is to calculate a user's past engagement history with various types of tweets (photo, video, retweets, etc.), specific authors, specific in-network engagers or any other entity the user has interacted with and that could provide signal. In this case, the underlying aggregation keys are `userId`, `(userId, authorId)` or `(userId, engagerId)`.
In Timelines and MagicRecs, we also compute custom aggregate engagement counts on every `tweetId`. Similarly, other aggregations are possible, perhaps on `advertiserId` or `mediaId` as long as the grouping key is sparse binary.
What implementations are supported?
-----------------------------------
Offline, we support the daily batch processing of DataRecords containing all required input features to generate
aggregate features. These are then uploaded to Manhattan for online hydration.
Online, we support the real-time aggregation of DataRecords through Storm with a backing memcache that can be queried
for the real-time aggregate features.
Additional documentation exists in the [docs folder](docs)
Where is this used?
--------------------
The Home Timeline heavy ranker uses a variety of both [batch and real time features](../../../../src/scala/com/twitter/timelines/prediction/common/aggregates/README.md) generated by this framework.
These features are also used for email and other recommendations.

View File

@ -1,68 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureType
/**
 * Convenience class to describe the stores that make up a particular type of aggregate.
 *
 * For example, as of 2018/07, user aggregates are generated by merging the individual
 * "user_aggregates", "rectweet_user_aggregates", and "twitter_wide_user_aggregates".
 *
 * Construction throws IllegalArgumentException when `storeNames` is empty or when
 * `storeMerger.isValidToMerge(storeNames)` is false. The key-related members are all
 * derived from one representative store (`storeNames.head`).
 *
 * @param storeNames Names of the stores; must be non-empty.
 * @param aggregateType Type of aggregate, usually differentiated by the aggregation key.
 * @param shouldHash Used at TimelineRankingAggregatesUtil.extractSecondary when extracting the
 *                   secondary key value.
 * @param storeMerger Implicit merger used to validate the stores and to look up their keys.
 */
case class StoreConfig[T](
  storeNames: Set[String],
  aggregateType: AggregateType.Value,
  shouldHash: Boolean = false
)(
  implicit storeMerger: StoreMerger) {
  // An empty set can pass the merge check (a forall over no elements holds), after which
  // `storeNames.head` would fail with an uninformative NoSuchElementException. Fail early
  // with a clear message instead.
  require(storeNames.nonEmpty, "StoreConfig requires at least one store name.")
  require(storeMerger.isValidToMerge(storeNames))

  // The stores were validated as mergeable, so any one of them can stand in for the rest.
  private val representativeStore = storeNames.head

  /** Ids of the aggregation key features of the representative store. */
  val aggregationKeyIds: Set[Long] = storeMerger.getAggregateKeys(representativeStore)

  /** Aggregation key features of the representative store. */
  val aggregationKeyFeatures: Set[Feature[_]] =
    storeMerger.getAggregateKeyFeatures(representativeStore)

  /** Secondary key feature of the representative store, if it has one. */
  val secondaryKeyFeatureOpt: Option[Feature[_]] = storeMerger.getSecondaryKey(representativeStore)
}
/**
 * Looks up aggregation key information per output store from an [[AggregationConfig]]
 * and decides whether a set of stores can be merged.
 */
trait StoreMerger {
  def aggregationConfig: AggregationConfig

  /** Union of the key features of every aggregate group written to `storeName`. */
  def getAggregateKeyFeatures(storeName: String): Set[Feature[_]] = {
    val groupsForStore =
      aggregationConfig.aggregatesToCompute.filter(_.outputStore.name == storeName)
    groupsForStore.flatMap(_.keysToAggregate)
  }

  /** Feature ids (as computed by TypedAggregateGroup.getKeyFeatureIds) of the store's keys. */
  def getAggregateKeys(storeName: String): Set[Long] = {
    val keyFeatures = getAggregateKeyFeatures(storeName)
    TypedAggregateGroup.getKeyFeatureIds(keyFeatures)
  }

  /**
   * Returns the store's key other than USER_ID, if any. A sparse binary key is returned
   * as its TypedAggregateGroup.sparseFeature counterpart.
   *
   * Fails the `require` checks when the store has more than two keys or when USER_ID is
   * not one of them.
   */
  def getSecondaryKey(storeName: String): Option[Feature[_]] = {
    val keyFeatures = getAggregateKeyFeatures(storeName)
    require(keyFeatures.size <= 2, "Only singleton or binary aggregation keys are supported.")
    require(
      keyFeatures.contains(SharedFeatures.USER_ID),
      "USER_ID must be one of the aggregation keys.")
    // With at most two keys and USER_ID among them, at most one other key remains.
    keyFeatures.find(_ != SharedFeatures.USER_ID).map { secondaryKey =>
      if (secondaryKey.getFeatureType == FeatureType.SPARSE_BINARY) {
        TypedAggregateGroup.sparseFeature(secondaryKey)
      } else {
        secondaryKey
      }
    }
  }

  /**
   * Stores may only be merged if they have the same aggregation key.
   * An empty set of store names is considered valid.
   */
  def isValidToMerge(storeNames: Set[String]): Boolean =
    storeNames.headOption.map(getAggregateKeys) match {
      case Some(expectedKeys) => storeNames.forall(getAggregateKeys(_) == expectedKeys)
      case None => true
    }
}

View File

@ -1,13 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
/**
 * Registry of [[StoreConfig]]s with lazily built lookup tables.
 */
trait StoreRegister {
  def allStores: Set[StoreConfig[_]]

  /** Store config by aggregate type; configs sharing a type collapse to a single entry. */
  lazy val storeMap: Map[AggregateType.Value, StoreConfig[_]] =
    (for (store <- allStores) yield (store.aggregateType, store)).toMap

  /** Aggregate type by store name, over the store names of every registered config. */
  lazy val storeNameToTypeMap: Map[String, AggregateType.Value] =
    (for {
      store <- allStores
      storeName <- store.storeNames
    } yield (storeName, store.aggregateType)).toMap
}

View File

@ -1,486 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetric
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon._
import com.twitter.timelines.data_processing.ml_util.transforms.OneToSomeTransform
import com.twitter.util.Duration
import com.twitter.util.Try
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import java.util.{Set => JSet}
import scala.annotation.tailrec
import scala.language.existentials
import scala.collection.JavaConverters._
import scala.util.matching.Regex
/**
 * A case class containing precomputed data useful to quickly
 * process operations over an aggregate.
 *
 * @param query The underlying feature being aggregated
 * @param metric The aggregation metric
 * @param outputFeatures The output features that aggregation will produce
 * @param outputFeatureIds The precomputed hashes of the above outputFeatures
 */
case class PrecomputedAggregateDescriptor[T](
query: AggregateFeature[T],
metric: AggregationMetric[T, _],
outputFeatures: List[Feature[_]],
outputFeatureIds: List[JLong])
object TypedAggregateGroup {
/**
 * Generates every combination of value assignments for a collection of sparse binary
 * features, i.e. the cartesian product of the per-feature value sets.
 *
 * @param sparseBinaryIdValues list of sparse binary feature ids and possible values they can take
 * @return A set of maps, where each map represents one possible assignment of values to ids.
 *         Empty when the input list is empty or when any feature has no values.
 */
def sparseBinaryPermutations(
  sparseBinaryIdValues: List[(Long, Set[String])]
): Set[Map[Long, String]] = sparseBinaryIdValues match {
  case Nil => Set.empty
  case (firstId, firstValues) :: remaining =>
    // Seed with one single-entry map per value of the first feature.
    val seedPermutations = firstValues.map(value => Map(firstId -> value))
    tailRecSparseBinaryPermutations(
      existingPermutations = seedPermutations,
      remainingIdValues = remaining
    )
}

/**
 * Tail-recursive worker: extends every partial assignment with each value of the next
 * feature until no features remain.
 */
@tailrec private[this] def tailRecSparseBinaryPermutations(
  existingPermutations: Set[Map[Long, String]],
  remainingIdValues: List[(Long, Set[String])]
): Set[Map[Long, String]] =
  if (remainingIdValues.isEmpty) {
    existingPermutations
  } else {
    val (nextId, nextValues) = remainingIdValues.head
    val extendedPermutations = for {
      partialAssignment <- existingPermutations
      value <- nextValues
    } yield partialAssignment + (nextId -> value)
    tailRecSparseBinaryPermutations(extendedPermutations, remainingIdValues.tail)
  }
// Suffix appended to a feature's dense name to name its derived text feature.
val SparseFeatureSuffix = ".member"
/**
 * Builds a Text feature named "<dense feature name>.member" for the given feature, with
 * personal data types derived from it via AggregationMetricCommon.derivePersonalDataTypes.
 *
 * @param sparseBinaryFeature feature to derive the text feature from
 */
def sparseFeature(sparseBinaryFeature: Feature[_]): Feature[String] =
new Feature.Text(
sparseBinaryFeature.getDenseFeatureName + SparseFeatureSuffix,
AggregationMetricCommon.derivePersonalDataTypes(Some(sparseBinaryFeature)))
/*
 * Casts obj to U after requiring obj.isInstanceOf[U].
 *
 * NOTE(review): U is a plain type parameter with no ClassTag, so it is erased at runtime
 * and the isInstanceOf check cannot actually test for U; in practice it should only
 * reject null. A value of the wrong type would then surface later as a
 * ClassCastException at the use site rather than here -- confirm before relying on
 * this method as validation.
 */
private[this] def validate[U](obj: Any): U = {
require(obj.isInstanceOf[U])
obj.asInstanceOf[U]
}
/*
 * Reads the feature's value from the data record, returning None when the lookup
 * yields null. A present value is passed through validate[U].
 */
private[this] def getFeatureOpt[U](dataRecord: DataRecord, feature: Feature[U]): Option[U] =
Option(SRichDataRecord(dataRecord).getFeatureValue(feature)).map(validate[U](_))
/**
 * Get a mapping from feature ids
 * (including individual sparse elements of a sparse feature) to values
 * from the given data record, for a given feature type.
 *
 * @param dataRecord Data record to get features from
 * @param keysToAggregate key features to get id-value mappings for
 * @param featureType Feature type to get id-value maps for
 * @return one (feature id, optional value) pair per key feature of the requested type;
 *         the value is None when the record has no value for that feature
 */
def getKeyFeatureIdValues[U](
  dataRecord: DataRecord,
  keysToAggregate: Set[Feature[_]],
  featureType: FeatureType
): Set[(Long, Option[U])] = {
  // Keep only the keys of the requested type and view them as Feature[U].
  val keysOfRequestedType: Set[Feature[U]] =
    keysToAggregate
      .filter(key => key.getFeatureType == featureType)
      .map(key => validate[Feature[U]](key))

  for (key <- keysOfRequestedType)
    yield (getDenseFeatureId(key), getFeatureOpt(dataRecord, key))
}
// TypedAggregateGroup may transform the aggregate keys for internal use: a sparse binary key
// is replaced by its ".member" text feature (see sparseFeature). This returns the dense
// feature id of the feature as it looks after that transformation.
def getDenseFeatureId(feature: Feature[_]): Long =
  feature.getFeatureType match {
    case FeatureType.SPARSE_BINARY => sparseFeature(feature).getDenseFeatureId
    case _ => feature.getDenseFeatureId
  }
/**
 * Dense feature ids of the given key features, taken after the custom transformation that
 * TypedAggregateGroup applies to its keysToAggregate.
 *
 * @param keysToAggregate key features to get ids for
 */
def getKeyFeatureIds(keysToAggregate: Set[Feature[_]]): Set[Long] =
  for (key <- keysToAggregate) yield getDenseFeatureId(key)
/** True when no feature id maps to None (vacuously true for an empty map). */
def checkIfAllKeysExist[U](featureIdValueMap: Map[Long, Option[U]]): Boolean =
  !featureIdValueMap.values.exists(_.isEmpty)
/** Keeps only the entries that have a value, unwrapping each value from its Option. */
def liftOptions[U](featureIdValueMap: Map[Long, Option[U]]): Map[Long, U] =
  featureIdValueMap.collect { case (id, Some(value)) => id -> value }
/* Alias for the shared timestamp feature (SharedFeatures.TIMESTAMP). */
val timestampFeature: Feature[JLong] = SharedFeatures.TIMESTAMP
/**
 * Derives every aggregation key (for the output store) that a data record contributes to.
 * Discrete and text key features each contribute their single value; a sparse binary key
 * feature contributes one key per value set in the record, so one record can yield
 * several aggregation keys.
 *
 * @param dataRecord Data record to read values for key features from
 * @param keysToAggregate Key features to group by
 * @return A set of AggregationKeys encoding the values of all keys. Empty when any discrete
 *         or text key feature has no value in the record, or when a sparse binary key
 *         feature has no values.
 */
def buildAggregationKeys(
  dataRecord: DataRecord,
  keysToAggregate: Set[Feature[_]]
): Set[AggregationKey] = {
  val discreteById =
    getKeyFeatureIdValues[Long](dataRecord, keysToAggregate, FeatureType.DISCRETE).toMap
  val textById =
    getKeyFeatureIdValues[String](dataRecord, keysToAggregate, FeatureType.STRING).toMap
  // A sparse binary key that is missing from the record is treated as having no values.
  val sparseBinaryIdValues =
    getKeyFeatureIdValues[JSet[String]](dataRecord, keysToAggregate, FeatureType.SPARSE_BINARY)
      .map { case (id, valuesOpt) => (id, valuesOpt.fold(Set.empty[String])(_.asScala.toSet)) }
      .toList

  val scalarKeysComplete = checkIfAllKeysExist(discreteById) && checkIfAllKeysExist(textById)
  if (!scalarKeysComplete) {
    Set.empty[AggregationKey]
  } else if (sparseBinaryIdValues.isEmpty) {
    Set(
      AggregationKey(
        discreteFeaturesById = liftOptions(discreteById),
        textFeaturesById = liftOptions(textById)
      )
    )
  } else {
    // One key per combination of sparse binary values, each stored as an extra text key.
    sparseBinaryPermutations(sparseBinaryIdValues).map { sparseBinaryTextKeys =>
      AggregationKey(
        discreteFeaturesById = liftOptions(discreteById),
        textFeaturesById = liftOptions(textById) ++ sparseBinaryTextKeys
      )
    }
  }
}
}
/**
* Specifies one or more related aggregate(s) to compute in the summingbird job.
*
* @param inputSource Source to compute this aggregate over
* @param preTransforms Sequence of [[OneToSomeTransform]] applied in order to each data
* record pre-aggregation (e.g. discretization, renaming, sampling). A transform may
* return None, in which case the record is dropped before aggregation
* @param aggregatePrefix Prefix to use for naming resultant aggregate features
* @param keysToAggregate Features to group by when computing the aggregates
* (e.g. USER_ID, AUTHOR_ID)
* @param featuresToAggregate Features to aggregate (e.g. blender_score or is_photo)
* @param labels Labels to cross the features with to make pair features, if any.
* use Label.All if you don't want to cross with a label.
* @param metrics Aggregation metrics to compute (e.g. count, mean)
* @param halfLives Half lives to use for the aggregations, to be crossed with the above.
* use Duration.Top for "forever" aggregations over an infinite time window (no decay).
* @param outputStore Store to output this aggregate to
* @param includeAnyFeature Aggregate label counts for any feature value
* @param includeAnyLabel Aggregate feature counts for any label value (e.g. all impressions)
*
* The overall config for the summingbird job consists of a list of "AggregateGroup"
* case class objects, which get translated into strongly typed "TypedAggregateGroup"
* case class objects. A single TypedAggregateGroup always groups input data records from
* ''inputSource'' by a single set of aggregation keys (''keysToAggregate'').
* Within these groups, we perform a comprehensive cross of:
*
* ''featuresToAggregate'' x ''labels'' x ''metrics'' x ''halfLives''
*
* All the resultant aggregate features are assigned a human-readable feature name
* beginning with ''aggregatePrefix'', and are written to DataRecords that get
* aggregated and written to the store specified by ''outputStore''.
*
* Illustrative example. Suppose we define our spec as follows:
*
* TypedAggregateGroup(
* inputSource = "timelines_recap_daily",
* aggregatePrefix = "user_author_aggregate",
* keysToAggregate = Set(USER_ID, AUTHOR_ID),
* featuresToAggregate = Set(RecapFeatures.TEXT_SCORE, RecapFeatures.BLENDER_SCORE),
* labels = Set(RecapFeatures.IS_FAVORITED, RecapFeatures.IS_REPLIED),
* metrics = Set(CountMetric, MeanMetric),
* halfLives = Set(7.Days, 30.Days),
* outputStore = "user_author_aggregate_store"
* )
*
* This will process data records from the source named "timelines_recap_daily"
* (see AggregateSource.scala for more details on how to add your own source)
* It will produce a total of 2x2x2x2 = 16 aggregation features, named like:
*
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.7days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.count.30days
* user_author_aggregate.pair.recap.engagement.is_favorited.recap.searchfeature.blender_score.mean.7days
*
* ... (and so on)
*
* and all the result features will be stored in DataRecords, summed up, and written
* to the output store defined by the name "user_author_aggregate_store".
* (see AggregateStore.scala for details on how to add your own store).
*
* If you do not want a full cross, split up your config into multiple TypedAggregateGroup
* objects. Splitting is strongly advised to avoid blowing up and creating invalid
* or unnecessary combinations of aggregate features (note that some combinations
* are useless or invalid e.g. computing the mean of a binary feature). Splitting
* also does not cost anything in terms of real-time performance, because all
* Aggregate objects in the master spec that share the same ''keysToAggregate'', the
* same ''inputSource'' and the same ''outputStore'' are grouped by the summingbird
* job logic and stored into a single DataRecord in the output store. Overlapping
* aggregates will also automatically be deduplicated so don't worry about overlaps.
*/
case class TypedAggregateGroup[T](
inputSource: AggregateSource,
aggregatePrefix: String,
keysToAggregate: Set[Feature[_]],
featuresToAggregate: Set[Feature[T]],
labels: Set[_ <: Feature[JBoolean]],
metrics: Set[AggregationMetric[T, _]],
halfLives: Set[Duration],
outputStore: AggregateStore,
preTransforms: Seq[OneToSomeTransform] = Seq.empty,
includeAnyFeature: Boolean = true,
includeAnyLabel: Boolean = true,
aggExclusionRegex: Seq[String] = Seq.empty) {
import TypedAggregateGroup._
/* Exclusion patterns from aggExclusionRegex, compiled once per group. */
val compiledRegexes = aggExclusionRegex.map(pattern => new Regex(pattern))

/**
 * Decides whether an aggregate descriptor should be excluded.
 *
 * @param feature descriptor whose output feature names are checked
 * @param regexes exclusion patterns; a descriptor is excluded when any of its output
 *                feature names contains a match for any pattern
 * @return true if the descriptor should be dropped, false if it should be kept
 */
def filterOutAggregateFeature(
  feature: PrecomputedAggregateDescriptor[_],
  regexes: Seq[Regex]
): Boolean =
  regexes.nonEmpty && feature.outputFeatures.exists { outputFeature =>
    val outputName = outputFeature.getDenseFeatureName
    regexes.exists(_.findFirstMatchIn(outputName).isDefined)
  }
/** Aggregation keys of `dataRecord` for this group's ''keysToAggregate''. */
def buildAggregationKeys(
  dataRecord: DataRecord
): Set[AggregationKey] =
  TypedAggregateGroup.buildAggregationKeys(
    dataRecord = dataRecord,
    keysToAggregate = keysToAggregate
  )
/**
* This val precomputes descriptors for all individual aggregates in this group
* (of type ''AggregateFeature''): one per combination of feature option, label option,
* metric and half life, minus the descriptors rejected by ''filterOutAggregateFeature''.
* The "output" features and output feature ids generated by each metric are also
* computed once here for faster run-time performance (this turns out to be a primary
* CPU bottleneck).
* Ex: for the mean operator, "sum" and "count" are output features
*/
val individualAggregateDescriptors: Set[PrecomputedAggregateDescriptor[T]] = {
/*
* By default, in addition to all feature-label crosses, also
* compute aggregates over each feature and label without crossing.
* The uncrossed case is represented by the extra None option, which is only
* added when includeAnyLabel / includeAnyFeature is set.
*/
val labelOptions = labels.map(Option(_)) ++
(if (includeAnyLabel) Set(None) else Set.empty)
val featureOptions = featuresToAggregate.map(Option(_)) ++
(if (includeAnyFeature) Set(None) else Set.empty)
for {
feature <- featureOptions
label <- labelOptions
metric <- metrics
halfLife <- halfLives
} yield {
val query = AggregateFeature[T](aggregatePrefix, feature, label, halfLife)
val aggregateOutputFeatures = metric.getOutputFeatures(query)
val aggregateOutputFeatureIds = metric.getOutputFeatureIds(query)
PrecomputedAggregateDescriptor(
query,
metric,
aggregateOutputFeatures,
aggregateOutputFeatureIds
)
}
// Drop every descriptor with an output feature name matching an exclusion regex.
}.filterNot(filterOutAggregateFeature(_, compiledRegexes))
/*
 * Precomputes a map from the feature id of every CONTINUOUS output feature generated by
 * this group to the half life of the aggregate query that produced it.
 * NOTE(review): Feature[JDouble] is erased at runtime, so the Try around asInstanceOf
 * presumably never produces None; the getFeatureType check is what actually selects the
 * features. Confirm before relying on the Try as a safety net.
 */
val continuousFeatureIdsToHalfLives: Map[Long, Duration] =
individualAggregateDescriptors.flatMap { descriptor =>
descriptor.outputFeatures
.flatMap { feature =>
if (feature.getFeatureType() == FeatureType.CONTINUOUS) {
Try(feature.asInstanceOf[Feature[JDouble]]).toOption
.map(feature => (feature.getFeatureId(), descriptor.query.halfLife))
} else None
}
}.toMap
/*
 * Key features as they appear in the output. A sparse binary key is replaced by its
 * individual string key; every other key is kept unchanged.
 * e.g. group by "words.in.tweet", output key: "words.in.tweet.member"
 */
val allOutputKeys: Set[Feature[_]] = keysToAggregate.map {
  case sparseKey if sparseKey.getFeatureType == FeatureType.SPARSE_BINARY =>
    sparseFeature(sparseKey)
  case otherKey => otherKey
}
/* Every output feature emitted by any aggregate descriptor of this group. */
val allOutputFeatures: Set[Feature[_]] =
  individualAggregateDescriptors.flatMap(descriptor => descriptor.outputFeatures)

/* Feature context made of exactly the output features above. */
val aggregateContext: FeatureContext = {
  val outputFeatureList = allOutputFeatures.toList
  new FeatureContext(outputFeatureList.asJava)
}
/**
 * Sums the aggregates of this group found in ''left'' and ''right'' into ''result'',
 * delegating to each descriptor's metric. The explicit while loop is deliberate: it was
 * measured as approximately 10% faster than a for comprehension.
 *
 * WARNING: mutates ''result''
 *
 * @param result The output data record to mutate
 * @param left The left data record to add
 * @param right The right data record to add
 */
def mutatePlus(result: DataRecord, left: DataRecord, right: DataRecord): Unit = {
  val descriptors = individualAggregateDescriptors.iterator
  while (descriptors.hasNext) {
    val current = descriptors.next()
    // Hand over the precomputed output feature ids so the metric need not derive them again.
    current.metric.mutatePlus(result, left, right, current.query, Some(current.outputFeatureIds))
  }
}
/**
 * Runs ''preTransforms'' one after another on a copy of the data record, feeding each
 * transform the output of the previous one. As soon as a transform drops the record
 * (returns None), the remaining transforms are skipped and the result is None.
 * Note that preTransforms are order-dependent.
 */
private[this] def sequentiallyTransform(dataRecord: DataRecord): Option[DataRecord] = {
  // Work on a copy so the transforms never see the caller's instance.
  val initial = Option(new DataRecord(dataRecord))
  preTransforms.foldLeft(initial) { (current, transform) =>
    current.flatMap(record => transform(record))
  }
}
/**
 * Applies the pre-transforms to a data record, collects its incremental contribution to
 * every configured aggregate into a single output data record, and pairs that record
 * with each aggregation key the input belongs to.
 *
 * @param dataRecord Input data record to aggregate.
 * @return A set of tuples (AggregationKey, DataRecord) whose first entry is an
 *         AggregationKey indicating what keys we're grouping by, and whose second entry
 *         is an output data record with incremental contributions to the aggregate
 *         value(s). The same output record instance is shared by all keys. Empty when the
 *         record is dropped by a pre-transform or no metric contributed anything.
 */
def computeAggregateKVPairs(dataRecord: DataRecord): Set[(AggregationKey, DataRecord)] =
  sequentiallyTransform(dataRecord) match {
    case None =>
      Set.empty[(AggregationKey, DataRecord)]
    case Some(transformedRecord) =>
      val aggregationKeys = buildAggregationKeys(transformedRecord)
      val increment = new DataRecord
      // Deliberately not short-circuited: every descriptor must get the chance to write its
      // contribution into `increment`, even after an earlier one already reported a write.
      val isNonEmptyIncrement = individualAggregateDescriptors.foldLeft(false) {
        (anyContribution, descriptor) =>
          val contributed = descriptor.metric.setIncrement(
            output = increment,
            input = transformedRecord,
            query = descriptor.query,
            timestampFeature = inputSource.timestampFeature,
            aggregateOutputs = Some(descriptor.outputFeatureIds)
          )
          anyContribution || contributed
      }
      if (!isNonEmptyIncrement) {
        Set.empty[(AggregationKey, DataRecord)]
      } else {
        // Stamp the increment with the input record's timestamp, under the shared feature.
        SRichDataRecord(increment).setFeatureValue(
          timestampFeature,
          getTimestamp(transformedRecord, inputSource.timestampFeature)
        )
        aggregationKeys.map(key => (key, increment))
      }
  }
/**
 * Maps every output feature of this group to a new feature of the same type whose name is
 * `prefix` followed by the original dense feature name, carrying over the personal data
 * types. Features without a feature name set are mapped to themselves.
 *
 * @param prefix non-empty prefix for the renamed features
 */
def outputFeaturesToRenamedOutputFeatures(prefix: String): Map[Feature[_], Feature[_]] = {
  require(prefix.nonEmpty)

  def withPrefix(original: Feature[_]): Feature[_] = {
    val prefixedName = prefix + original.getDenseFeatureName
    // null is handed to the Feature constructors when no personal data types are present.
    val personalDataTypes =
      if (original.getPersonalDataTypes.isPresent) original.getPersonalDataTypes.get()
      else null
    original.getFeatureType match {
      case FeatureType.BINARY => new Feature.Binary(prefixedName, personalDataTypes)
      case FeatureType.DISCRETE => new Feature.Discrete(prefixedName, personalDataTypes)
      case FeatureType.STRING => new Feature.Text(prefixedName, personalDataTypes)
      case FeatureType.CONTINUOUS => new Feature.Continuous(prefixedName, personalDataTypes)
      case FeatureType.SPARSE_BINARY =>
        new Feature.SparseBinary(prefixedName, personalDataTypes)
      case FeatureType.SPARSE_CONTINUOUS =>
        new Feature.SparseContinuous(prefixedName, personalDataTypes)
    }
  }

  allOutputFeatures.map { feature =>
    val target = if (feature.isSetFeatureName) withPrefix(feature) else feature
    feature -> target
  }.toMap
}
}

View File

@ -1,122 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework
import com.twitter.algebird.ScMapMonoid
import com.twitter.algebird.Semigroup
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.Feature
import com.twitter.ml.api.FeatureType
import com.twitter.ml.api.util.SRichDataRecord
import java.lang.{Long => JLong}
import scala.collection.{Map => ScMap}
/**
 * Helpers for deriving secondary keys from data records and for merging keyed aggregate
 * records and keyed record maps.
 */
object Utils {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  /* A brand-new empty DataRecord on every access (this is a def, not a shared instance). */
  def EmptyDataRecord: DataRecord = new DataRecord()

  /*
   * Monoid over (key -> DataRecord) maps. The record semigroup it is built on combines two
   * records by merging the right one into the left one, which is mutated and returned.
   */
  private val keyedDataRecordMapMonoid = {
    val dataRecordMergerSg = new Semigroup[DataRecord] {
      override def plus(x: DataRecord, y: DataRecord): DataRecord = {
        dataRecordMerger.merge(x, y)
        x
      }
    }
    new ScMapMonoid[Long, DataRecord]()(dataRecordMergerSg)
  }

  /** Reads a discrete feature's value as a primitive long key. */
  def keyFromLong(record: DataRecord, feature: Feature[JLong]): Long =
    SRichDataRecord(record).getFeatureValue(feature).longValue

  /** Parses a text feature's value as a long key; a non-numeric value yields 0. */
  def keyFromString(record: DataRecord, feature: Feature[String]): Long =
    try {
      SRichDataRecord(record).getFeatureValue(feature).toLong
    } catch {
      case _: NumberFormatException => 0L
    }

  /** Uses the hash code of a text feature's value as the key. */
  def keyFromHash(record: DataRecord, feature: Feature[String]): Long =
    SRichDataRecord(record).getFeatureValue(feature).hashCode.toLong

  /**
   * Extracts a long secondary key from `record`.
   *
   * @param record data record to read the key from
   * @param secondaryKey key feature; only STRING and DISCRETE features are supported
   * @param shouldHash for STRING features, hash the value instead of parsing it as a number
   * @throws IllegalArgumentException for any other feature type
   */
  def extractSecondary[T](
    record: DataRecord,
    secondaryKey: Feature[T],
    shouldHash: Boolean = false
  ): Long = secondaryKey.getFeatureType match {
    case FeatureType.STRING =>
      if (shouldHash) keyFromHash(record, secondaryKey.asInstanceOf[Feature[String]])
      else keyFromString(record, secondaryKey.asInstanceOf[Feature[String]])
    case FeatureType.DISCRETE => keyFromLong(record, secondaryKey.asInstanceOf[Feature[JLong]])
    case f => throw new IllegalArgumentException(s"Feature type $f is not supported.")
  }

  /**
   * Merges all defined keyed records into one.
   *
   * @return None when no argument is defined; otherwise a KeyedRecord carrying the shared
   *         aggregate type and the merged record
   * @throws IllegalArgumentException if the defined records do not share one aggregate type
   */
  def mergeKeyedRecordOpts(args: Option[KeyedRecord]*): Option[KeyedRecord] = {
    val keyedRecords = args.flatten
    if (keyedRecords.isEmpty) {
      None
    } else {
      val keys = keyedRecords.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecord = mergeRecords(keyedRecords.map(_.record): _*)
      Some(KeyedRecord(keys.head, mergedRecord))
    }
  }

  /*
   * Merges every record into the first one, in place, and returns that first record.
   * Using the first record as the accumulator avoids allocating a new DataRecord; an empty
   * record is only created when there is nothing to merge.
   */
  private def mergeRecords(args: DataRecord*): DataRecord =
    if (args.isEmpty) EmptyDataRecord
    else {
      args.tail.foldLeft(args.head) { (merged, record) =>
        dataRecordMerger.merge(merged, record)
        merged
      }
    }

  /**
   * Merges two optional keyed record maps.
   *
   * @param maxSize target upper bound on the number of keys in the merged map; enforced by
   *                sampling, see mergeMapOpts
   * @return None when both inputs are empty; otherwise a KeyedRecordMap carrying the shared
   *         aggregate type and the merged map
   * @throws IllegalArgumentException if the inputs do not share one aggregate type
   */
  def mergeKeyedRecordMapOpts(
    opt1: Option[KeyedRecordMap],
    opt2: Option[KeyedRecordMap],
    maxSize: Int = Int.MaxValue
  ): Option[KeyedRecordMap] = {
    if (opt1.isEmpty && opt2.isEmpty) {
      None
    } else {
      val keys = Seq(opt1, opt2).flatten.map(_.aggregateType)
      require(keys.toSet.size == 1, "All merged records must have the same aggregate key.")
      val mergedRecordMap = mergeMapOpts(opt1.map(_.recordMap), opt2.map(_.recordMap), maxSize)
      Some(KeyedRecordMap(keys.head, mergedRecordMap))
    }
  }

  /*
   * Merges two optional record maps. When the union of their keys exceeds maxSize, both maps
   * are first downsampled with the same rate (maxSize / union size). Because sampling is
   * deterministic per key, both maps keep the same keys.
   */
  private def mergeMapOpts(
    opt1: Option[ScMap[Long, DataRecord]],
    opt2: Option[ScMap[Long, DataRecord]],
    maxSize: Int = Int.MaxValue
  ): ScMap[Long, DataRecord] = {
    require(maxSize >= 0)
    val keySet = opt1.map(_.keySet).getOrElse(Set.empty) ++ opt2.map(_.keySet).getOrElse(Set.empty)
    val totalSize = keySet.size
    val rate = if (totalSize <= maxSize) 1.0 else maxSize.toDouble / totalSize
    val prunedOpt1 = opt1.map(downsample(_, rate))
    val prunedOpt2 = opt2.map(downsample(_, rate))
    Seq(prunedOpt1, prunedOpt2).flatten
      .foldLeft(keyedDataRecordMapMonoid.zero)(keyedDataRecordMapMonoid.plus)
  }

  /**
   * Keeps each entry of `m` with probability `samplingRate`, decided deterministically from
   * the key and the rate.
   *
   * @param m map to sample
   * @param samplingRate probability of keeping an entry; a rate of 1.0 or more returns `m`
   *                     itself, a rate of 0 or less returns an empty map
   */
  def downsample[K, T](m: ScMap[K, T], samplingRate: Double): ScMap[K, T] = {
    if (samplingRate >= 1.0) {
      m
    } else if (samplingRate <= 0) {
      Map.empty
    } else {
      val rateHash = samplingRate.hashCode
      m.filter {
        case (key, _) =>
          // It is important that the same key with the same sampling rate be deterministically
          // selected or rejected. Otherwise, mergeMapOpts will choose different keys for the
          // two input maps and their union will be larger than the limit we want.
          //
          // A dedicated generator is created per key instead of reseeding the process-wide
          // scala.util.Random object: reseeding the shared object disturbs every other user
          // of it, and a concurrent draw between the reseed and nextDouble would break the
          // determinism required above. A generator constructed with a seed produces the
          // same first draw as an existing one after setSeed with that seed.
          val seed = (key.hashCode, rateHash).hashCode
          new scala.util.Random(seed).nextDouble() < samplingRate
      }
    }
  }
}

View File

@ -1,165 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.algebird.DecayedValue
import com.twitter.algebird.DecayedValueMonoid
import com.twitter.algebird.Monoid
import com.twitter.ml.api._
import com.twitter.ml.api.constant.SharedFeatures
import com.twitter.ml.api.util.FDsl._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.summingbird.batch.BatchID
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregateFeature
import com.twitter.util.Duration
import java.lang.{Double => JDouble}
import java.lang.{Long => JLong}
import scala.collection.JavaConverters._
import scala.collection.mutable
import java.{util => ju}
object AggregatesV2Adapter {
type AggregatesV2Tuple = (AggregationKey, (BatchID, DataRecord))
val Epsilon: Double = 1e-6
val decayedValueMonoid: Monoid[DecayedValue] = DecayedValueMonoid(Epsilon)
/*
 * Decays the storedValue from timestamp -> sourceVersion
 *
 * @param storedValue value read from the aggregates v2 output store
 * @param timestamp timestamp corresponding to the stored value
 * @param sourceVersion timestamp of the version to decay all values to uniformly
 * @param halfLife Half life duration to use for applying decay
 * @return the decayed value, or storedValue unchanged when timestamp is later than
 *         sourceVersion
 *
 * By applying this function, the feature values for all users are decayed
 * to sourceVersion. This is important to ensure that a user whose aggregates
 * were updated long in the past does not have an artificially inflated count
 * compared to one whose aggregates were updated (and hence decayed) more recently.
 */
def decayValueToSourceVersion(
  storedValue: Double,
  timestamp: Long,
  sourceVersion: Long,
  halfLife: Duration
): Double = {
  val storedIsNotNewer = timestamp <= sourceVersion
  if (storedIsNotNewer) {
    val halfLifeMillis = halfLife.inMilliseconds
    val stored = DecayedValue.build(storedValue, timestamp, halfLifeMillis)
    // Adding a zero-valued entry at sourceVersion moves the sum to that time.
    val anchor = DecayedValue.build(0, sourceVersion, halfLifeMillis)
    decayedValueMonoid.plus(stored, anchor).value
  } else {
    storedValue
  }
}
/*
 * Decays all the continuous aggregate features occurring in the ''inputRecord''
 * to a given timestamp, and writes the decayed values into the ''outputRecord''.
 * inputRecord and outputRecord may be the same instance to mutate the input in place.
 * NOTE(review): a decayed value at or below trimThreshold is simply not written, so when
 * mutating in place the original stored value presumably stays in the record. Confirm that
 * this is acceptable for in-place callers.
 *
 * @param inputRecord Input record to get features from
 * @param aggregateFeaturesAndHalfLives Aggregate features to decay, each with its half life;
 *        non-continuous features are skipped
 * @param decayTo Timestamp to decay to
 * @param trimThreshold Only decayed values whose absolute value exceeds this are written
 * @param outputRecord Output record to mutate
 * @return the mutated outputRecord, with its timestamp set to decayTo
 */
def mutateDecay(
  inputRecord: DataRecord,
  aggregateFeaturesAndHalfLives: List[(Feature[_], Duration)],
  decayTo: Long,
  trimThreshold: Double,
  outputRecord: DataRecord
): DataRecord = {
  val storedAt = inputRecord.getFeatureValue(SharedFeatures.TIMESTAMP).toLong
  aggregateFeaturesAndHalfLives.foreach {
    case (feature, halfLife) if feature.getFeatureType() == FeatureType.CONTINUOUS =>
      val continuous = feature.asInstanceOf[Feature[JDouble]]
      if (inputRecord.hasFeature(continuous)) {
        val stored = inputRecord.getFeatureValue(continuous).toDouble
        val decayed = decayValueToSourceVersion(stored, storedAt, decayTo, halfLife)
        if (math.abs(decayed) > trimThreshold) {
          outputRecord.setFeatureValue(continuous, decayed)
        }
      }
    case _ => ()
  }
  /* Update timestamp to version (now that we've decayed all aggregates) */
  outputRecord.setFeatureValue(SharedFeatures.TIMESTAMP, decayTo)
  outputRecord
}
}
class AggregatesV2Adapter(
aggregates: Set[TypedAggregateGroup[_]],
sourceVersion: Long,
trimThreshold: Double)
extends IRecordOneToManyAdapter[AggregatesV2Adapter.AggregatesV2Tuple] {
import AggregatesV2Adapter._
// Key features, in their output form, of all configured aggregate groups.
val keyFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputKeys).toList
// Output (aggregate) features of all configured aggregate groups.
val aggregateFeatures: List[Feature[_]] = aggregates.flatMap(_.allOutputFeatures).toList
val timestampFeatures: List[Feature[JLong]] = List(SharedFeatures.TIMESTAMP)
// Features declared in this adapter's feature context: keys, aggregates and the timestamp.
val allFeatures: List[Feature[_]] = keyFeatures ++ aggregateFeatures ++ timestampFeatures
val featureContext: FeatureContext = new FeatureContext(allFeatures.asJava)
override def getFeatureContext: FeatureContext = featureContext
/* Each aggregate feature paired with the half life that AggregateFeature.parseHalfLife derives from it. */
val aggregateFeaturesAndHalfLives: List[(Feature[_$3], Duration) forSome { type _$3 }] =
  for (aggregateFeature <- aggregateFeatures)
    yield (aggregateFeature, AggregateFeature.parseHalfLife(aggregateFeature))
/**
* Converts one stored (AggregationKey, (BatchID, DataRecord)) tuple into at most one
* output data record holding the key feature values and the decayed aggregate values.
*
* @return a single-element list, or an empty list when the stored record has no timestamp
*/
override def adaptToDataRecords(tuple: AggregatesV2Tuple): ju.List[DataRecord] = tuple match {
case (key: AggregationKey, (batchId: BatchID, record: DataRecord)) => {
val resultRecord = new SRichDataRecord(new DataRecord, featureContext)
// Clear any continuous feature of the result record that is not an aggregate feature.
// NOTE(review): resultRecord wraps a newly created DataRecord, so this loop presumably
// finds nothing to clear - confirm whether it is still needed.
val itr = resultRecord.continuousFeaturesIterator()
val featuresToClear = mutable.Set[Feature[JDouble]]()
while (itr.moveNext()) {
val nextFeature = itr.getFeature
if (!aggregateFeatures.contains(nextFeature)) {
featuresToClear += nextFeature
}
}
featuresToClear.foreach(resultRecord.clearFeature)
// Copy the key values out of the AggregationKey; only DISCRETE and STRING key features
// are handled. NOTE(review): the lookups by dense feature id presumably throw if the
// key does not contain that id - confirm all groups of a store share the same keys.
keyFeatures.foreach { keyFeature: Feature[_] =>
if (keyFeature.getFeatureType == FeatureType.DISCRETE) {
resultRecord.setFeatureValue(
keyFeature.asInstanceOf[Feature[JLong]],
key.discreteFeaturesById(keyFeature.getDenseFeatureId)
)
} else if (keyFeature.getFeatureType == FeatureType.STRING) {
resultRecord.setFeatureValue(
keyFeature.asInstanceOf[Feature[String]],
key.textFeaturesById(keyFeature.getDenseFeatureId)
)
}
}
// Without a timestamp the stored values cannot be decayed, so the record is skipped.
if (record.hasFeature(SharedFeatures.TIMESTAMP)) {
mutateDecay(
record,
aggregateFeaturesAndHalfLives,
sourceVersion,
trimThreshold,
resultRecord)
List(resultRecord.getRecord).asJava
} else {
List.empty[DataRecord].asJava
}
}
}
}

View File

@ -1,171 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.bijection.thrift.CompactThriftCodec
import com.twitter.ml.api.AdaptedFeatureSource
import com.twitter.ml.api.DataRecord
import com.twitter.ml.api.IRecordOneToManyAdapter
import com.twitter.ml.api.TypedFeatureSource
import com.twitter.scalding.DateRange
import com.twitter.scalding.RichDate
import com.twitter.scalding.TypedPipe
import com.twitter.scalding.commons.source.VersionedKeyValSource
import com.twitter.scalding.commons.tap.VersionedTap.TapMode
import com.twitter.summingbird.batch.BatchID
import com.twitter.summingbird_internal.bijection.BatchPairImplicits
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKeyInjection
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import org.apache.hadoop.mapred.JobConf
import scala.collection.JavaConverters._
import AggregatesV2Adapter._
object AggregatesV2AdaptedSource {
// Default used for the trimThreshold parameter of the "most recent" feature sources below.
val DefaultTrimThreshold = 0
}
/* Adapted source whose storage format is DataRecord itself, serialized with CompactThriftCodec. */
trait AggregatesV2AdaptedSource extends AggregatesV2AdaptedSourceBase[DataRecord] {
override def storageFormatCodec: Injection[DataRecord, Array[Byte]] =
CompactThriftCodec[DataRecord]
// The stored value already is a DataRecord, so no conversion is needed.
override def toDataRecord(v: DataRecord): DataRecord = v
}
trait AggregatesV2AdaptedSourceBase[StorageFormat]
extends TypedFeatureSource[AggregatesV2Tuple]
with AdaptedFeatureSource[AggregatesV2Tuple]
with BatchPairImplicits {
/* Output root path of aggregates v2 job, excluding store name and version */
def rootPath: String
/* Name of store under root path to read */
def storeName: String
/* Max bijection failures; passed as maxFailures to the VersionedKeyValSource being read */
def maxFailures: Int = 0
/* Aggregate config used to generate above output */
def aggregates: Set[TypedAggregateGroup[_]]
/* trimThreshold Trim all aggregates below a certain threshold to save memory */
def trimThreshold: Double
/* Converts a value in the store's storage format to a DataRecord */
def toDataRecord(v: StorageFormat): DataRecord
/* Requested store version, if any; when None the most recent available version is used */
def sourceVersionOpt: Option[Long]
/*
 * When true and sourceVersionOpt is defined, versionToUse picks the most recent available
 * version that is not after sourceVersionOpt instead of using sourceVersionOpt itself
 */
def enableMostRecentBeforeSourceVersion: Boolean = false
implicit private val aggregationKeyInjection: Injection[AggregationKey, Array[Byte]] =
AggregationKeyInjection
/* Codec between the storage format and bytes */
implicit def storageFormatCodec: Injection[StorageFormat, Array[Byte]]
/* Only the aggregate groups that are written to the store being read */
private def filteredAggregates = aggregates.filter(_.outputStore.name == storeName)
/* Path of the store: rootPath and storeName joined by "/" */
def storePath: String = List(rootPath, storeName).mkString("/")
/* Source over the store path with no version pinned (sourceVersion = None). */
def mostRecentVkvs: VersionedKeyValSource[_, _] =
  VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
    path = storePath,
    sourceVersion = None,
    maxFailures = maxFailures
  )
/* All versions currently present in the store; the store is queried again on every call. */
private def availableVersions: Seq[Long] = {
  val versionedStore = mostRecentVkvs.getTap(TapMode.SOURCE).getStore(new JobConf(true))
  versionedStore.getAllVersions().asScala.map(version => version.toLong)
}
/*
 * Highest version available in the store.
 * availableVersions queries the store on every call, so it is read exactly once here.
 * That saves a second listing and guarantees the emptiness check and the max are computed
 * from the same snapshot.
 * Throws IllegalArgumentException (via require) when the store has no versions.
 */
private def mostRecentVersion: Long = {
  val versions = availableVersions
  require(versions.nonEmpty, s"$storeName has no available versions")
  versions.max
}
/*
 * Store version to read:
 *  - no sourceVersionOpt: the most recent available version;
 *  - sourceVersionOpt with enableMostRecentBeforeSourceVersion: the highest available version
 *    that is less than or equal to it (IllegalArgumentException if there is none);
 *  - otherwise: sourceVersionOpt itself.
 */
def versionToUse: Long =
  sourceVersionOpt match {
    case Some(sourceVersion) if enableMostRecentBeforeSourceVersion =>
      val eligibleVersions = availableVersions.filter(_ <= sourceVersion)
      if (eligibleVersions.isEmpty) {
        throw new IllegalArgumentException(
          "No version older than version: %s, available versions: %s"
            .format(sourceVersion, availableVersions)
        )
      } else {
        eligibleVersions.max
      }
    case Some(sourceVersion) => sourceVersion
    case None => mostRecentVersion
  }
// Lazy: versionToUse (which may need to list the store) is only resolved on first use.
override lazy val adapter: IRecordOneToManyAdapter[AggregatesV2Tuple] =
new AggregatesV2Adapter(filteredAggregates, versionToUse, trimThreshold)
/* Reads the store at versionToUse and converts every stored value to a DataRecord. */
override def getData: TypedPipe[AggregatesV2Tuple] = {
  val versionedSource = VersionedKeyValSource[AggregationKey, (BatchID, StorageFormat)](
    path = storePath,
    sourceVersion = Some(versionToUse),
    maxFailures = maxFailures
  )
  TypedPipe.from(versionedSource).map {
    case (aggregationKey, (batchId, stored)) =>
      (aggregationKey, (batchId, toDataRecord(stored)))
  }
}
}
/*
 * Adapted data record feature source from aggregates v2 manhattan output.
 * Reads the store version derived from the end of the implicit date range.
 * Params documented in parent trait.
 */
case class AggregatesV2FeatureSource(
override val rootPath: String,
override val storeName: String,
override val aggregates: Set[TypedAggregateGroup[_]],
override val trimThreshold: Double = 0,
override val maxFailures: Int = 0,
)(
implicit val dateRange: DateRange)
extends AggregatesV2AdaptedSource {
// Increment end date by 1 millisec since summingbird output for date D is stored at (D+1)T00
override val sourceVersionOpt: Some[Long] = Some(dateRange.end.timestamp + 1)
}
/*
 * Reads most recent available AggregatesV2FeatureSource.
 * There is no constraint on recency.
 * Params documented in parent trait.
 */
case class AggregatesV2MostRecentFeatureSource(
override val rootPath: String,
override val storeName: String,
override val aggregates: Set[TypedAggregateGroup[_]],
override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
override val maxFailures: Int = 0)
extends AggregatesV2AdaptedSource {
// No version is requested, so versionToUse falls back to the most recent available one.
override val sourceVersionOpt: None.type = None
}
/*
 * Reads most recent available AggregatesV2FeatureSource
 * on or before the specified beforeDate.
 * Params documented in parent trait.
 */
case class AggregatesV2MostRecentFeatureSourceBeforeDate(
override val rootPath: String,
override val storeName: String,
override val aggregates: Set[TypedAggregateGroup[_]],
override val trimThreshold: Double = AggregatesV2AdaptedSource.DefaultTrimThreshold,
beforeDate: RichDate,
override val maxFailures: Int = 0)
extends AggregatesV2AdaptedSource {
// Makes versionToUse pick the highest available version that is <= sourceVersionOpt.
override val enableMostRecentBeforeSourceVersion = true
// Upper bound for the version: one millisecond past beforeDate.
override val sourceVersionOpt: Some[Long] = Some(beforeDate.timestamp + 1)
}

View File

@ -1,71 +0,0 @@
# Library target without an explicit name (presumably addressed by the directory name -
# confirm against the build tool's defaults). It compiles every Scala source in this directory.
scala_library(
sources = ["*.scala"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/ml/api:api-base",
"src/scala/com/twitter/ml/api/util",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"src/thrift/com/twitter/summingbird",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"util/util-core:scala",
],
)
# Narrower target that compiles only the two merge-policy sources listed below.
# NOTE(review): unlike the unnamed target in this file, this one does not list
# "src/scala/com/twitter/ml/api:api-base" or "src/scala/com/twitter/ml/api/util". If the
# listed sources use classes from those targets directly (e.g. SRichDataRecord), confirm
# they are still reachable, e.g. transitively, under the build's dependency checking.
scala_library(
name = "for-timelines",
sources = [
"CombineCountsPolicy.scala",
"SparseBinaryMergePolicy.scala",
],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/twitter/algebird:core",
"3rdparty/jvm/com/twitter/algebird:util",
"3rdparty/jvm/com/twitter/bijection:core",
"3rdparty/jvm/com/twitter/bijection:json",
"3rdparty/jvm/com/twitter/bijection:netty",
"3rdparty/jvm/com/twitter/bijection:scrooge",
"3rdparty/jvm/com/twitter/bijection:thrift",
"3rdparty/jvm/com/twitter/bijection:util",
"3rdparty/jvm/com/twitter/storehaus:algebra",
"3rdparty/jvm/com/twitter/storehaus:core",
"3rdparty/src/jvm/com/twitter/scalding:commons",
"3rdparty/src/jvm/com/twitter/scalding:core",
"3rdparty/src/jvm/com/twitter/scalding:date",
"3rdparty/src/jvm/com/twitter/summingbird:batch",
"3rdparty/src/jvm/com/twitter/summingbird:core",
"src/java/com/twitter/ml/api:api-base",
"src/java/com/twitter/ml/api/constant",
"src/scala/com/twitter/summingbird_internal/bijection:bijection-implicits",
"src/thrift/com/twitter/dal/personal_data:personal_data-java",
"src/thrift/com/twitter/ml/api:data-java",
"src/thrift/com/twitter/ml/api:interpretable-model-java",
"src/thrift/com/twitter/summingbird",
"timelines/data_processing/ml_util/aggregation_framework:common_types",
"timelines/data_processing/ml_util/aggregation_framework/metrics",
"util/util-core:scala",
],
)

View File

@ -1,223 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.google.common.annotations.VisibleForTesting
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api._
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.TypedCountMetric
import java.lang.{Double => JDouble}
import scala.collection.JavaConverters._
/**
 * The continuous output features derived from one count feature.
 *
 * @param sum feature receiving the sum of the count values
 * @param nonzero feature receiving the number of strictly positive count values
 * @param mean feature receiving the sum divided by the number of strictly positive values
 * @param topK features receiving the largest count values, in descending order
 */
case class CombinedFeatures(
sum: Feature[JDouble],
nonzero: Feature[JDouble],
mean: Feature[JDouble],
topK: Seq[Feature[JDouble]])
/**
 * Shared machinery for collapsing the list of values observed for one count
 * feature into a fixed set of summary features: the sum, the number of
 * strictly positive values, the mean (sum divided by the number of strictly
 * positive values) and the `topK` largest values.
 *
 * Implementors supply `topK`, `hardLimit` and `precomputedCountFeatures`.
 */
trait CombineCountsBase {
  // Suffixes appended (after a ".") to the dense name of the count feature.
  val SparseSum = "sparse_sum"
  val SparseNonzero = "sparse_nonzero"
  val SparseMean = "sparse_mean"
  val SparseTop = "sparse_top"

  /** Number of "top value" features emitted per count feature. */
  def topK: Int

  /** When defined, only this many of the largest values feed every statistic. */
  def hardLimit: Option[Int]

  /** Count features for which output features are built up front. */
  def precomputedCountFeatures: Seq[Feature[_]]

  /**
   * Output features for every precomputed count feature, built on first access.
   * Each derived feature is named "<dense feature name>.<suffix>"; the top-K
   * features use the suffix `SparseTop` followed by the 1-based rank.
   */
  lazy val precomputedFeaturesMap: Map[Feature[_], CombinedFeatures] =
    precomputedCountFeatures.map { countFeature =>
      val personalDataTypes =
        AggregationMetricCommon.derivePersonalDataTypes(Some(countFeature))

      // Builds the continuous output feature "<dense feature name>.<suffix>".
      def derivedFeature(suffix: String): Feature[JDouble] =
        new Feature.Continuous(
          countFeature.getDenseFeatureName + "." + suffix,
          personalDataTypes)

      val sumFeature = derivedFeature(SparseSum)
      val nonzeroFeature = derivedFeature(SparseNonzero)
      val meanFeature = derivedFeature(SparseMean)
      val topFeatures = (1 to topK).map(rank => derivedFeature(SparseTop + rank))

      (countFeature, CombinedFeatures(sumFeature, nonzeroFeature, meanFeature, topFeatures))
    }.toMap

  /** Every derived feature that a merge may write, across all count features. */
  lazy val outputFeaturesPostMerge: Set[Feature[JDouble]] =
    precomputedFeaturesMap.values.flatMap { combined =>
      combined.sum +: combined.nonzero +: combined.mean +: combined.topK
    }.toSet

  private case class SummaryStats(sum: Double, nonzero: Double, mean: Double)

  /**
   * Computes the sum, the number of strictly positive values, and the sum
   * divided by that number (0.0 when there is no positive value).
   */
  private def summarize(values: Seq[Double]): SummaryStats = {
    val total = values.foldLeft(0.0)(_ + _)
    val positiveCount = values.count(_ > 0.0).toDouble
    val mean = if (positiveCount > 0.0) total / positiveCount else 0.0
    SummaryStats(total, positiveCount, mean)
  }

  /** Writes `value` into `feature` of `record`, unless the value is exactly zero. */
  private def setIfNonZero(
    record: SRichDataRecord,
    feature: Feature[JDouble],
    value: Double
  ): Unit =
    if (value != 0.0) {
      record.setFeatureValue(feature, value)
    }

  /**
   * Merges the derived features of every feature in `features` that has an
   * entry in `featureValuesMap`; features without an entry are skipped.
   */
  def hydrateCountFeatures(
    richRecord: SRichDataRecord,
    features: Seq[Feature[_]],
    featureValuesMap: Map[Feature[_], List[Double]]
  ): Unit =
    features.foreach { feature =>
      featureValuesMap.get(feature).foreach { values =>
        mergeRecordFromCountFeature(richRecord, feature, values)
      }
    }

  /**
   * Writes the derived features of `countFeature` into `richInputRecord`.
   *
   * The values are sorted in descending order and, when `hardLimit` is
   * defined, truncated to that many entries before any statistic is computed.
   * Derived values equal to 0.0 are not written. `countFeature` must be a key
   * of `precomputedFeaturesMap`.
   */
  def mergeRecordFromCountFeature(
    richInputRecord: SRichDataRecord,
    countFeature: Feature[_],
    featureValues: List[Double]
  ): Unit = {
    // Most calls coming from the timeline scorer carry an empty list. Skipping
    // them up front avoids a sort, a handful of allocations and the walk over
    // the top-K range; individually cheap, but it adds up.
    if (featureValues.nonEmpty) {
      val descending = featureValues.sortBy(value => -value)
      val retained =
        hardLimit.fold(descending)(limit => descending.take(limit)).toIndexedSeq
      val stats = summarize(retained)
      val outputs = precomputedFeaturesMap(countFeature)

      setIfNonZero(richInputRecord, outputs.sum, stats.sum)
      setIfNonZero(richInputRecord, outputs.nonzero, stats.nonzero)
      setIfNonZero(richInputRecord, outputs.mean, stats.mean)

      // Ranks beyond the number of retained values fall back to 0.0 and are
      // therefore never written.
      for (index <- 0 until topK) {
        setIfNonZero(
          richInputRecord,
          outputs.topK(index),
          retained.lift(index).getOrElse(0.0))
      }
    }
  }
}
object CombineCountsPolicy {

  /**
   * Selects the count features of a context: continuous features whose dense
   * name ends with the operator name of `TypedCountMetric`.
   */
  def getCountFeatures(aggregateContext: FeatureContext): Seq[Feature[_]] = {
    def isCountFeature(feature: Feature[_]): Boolean =
      feature.getFeatureType == FeatureType.CONTINUOUS &&
        feature.getDenseFeatureName.endsWith(TypedCountMetric[JDouble]().operatorName)

    aggregateContext.getAllFeatures.asScala.toSeq.filter(isCountFeature)
  }

  /**
   * Collects the value of `countFeature` from every record that has it set,
   * preserving record order; records where the value is null are skipped.
   */
  @VisibleForTesting
  private[conversion] def getFeatureValues(
    dataRecordsWithCounts: List[DataRecord],
    countFeature: Feature[_]
  ): List[Double] = {
    val richRecords = dataRecordsWithCounts.map(new SRichDataRecord(_))
    for {
      richRecord <- richRecords
      rawValue <- Option(richRecord.getFeatureValue(countFeature))
    } yield rawValue.asInstanceOf[JDouble].toDouble
  }
}
/**
 * A merge policy for aggregates whose features are all counts (computed
 * using CountMetric), typically impressions or engagements. For every count
 * feature found in `aggregateContextToPrecompute` it can write the following
 * (3 + topK) derived features into the output data record:
 *
 *  - the sum of the feature's values across the aggregate records
 *  - the number of aggregate records whose value is strictly positive
 *  - the mean, computed as that sum divided by that number of records
 *  - the topK largest values across the aggregate records
 *
 * Derived values equal to 0.0 are not written.
 *
 * @param topK number of top values to emit
 * @param aggregateContextToPrecompute context whose count features get their
 *                                     output features precomputed
 * @param hardLimit when set, the values are sorted in descending order and only
 *                  the first `hardLimit` of them feed the statistics above
 */
case class CombineCountsPolicy(
  override val topK: Int,
  aggregateContextToPrecompute: FeatureContext,
  override val hardLimit: Option[Int] = None)
    extends SparseBinaryMergePolicy
    with CombineCountsBase {
  import CombineCountsPolicy._

  override val precomputedCountFeatures: Seq[Feature[_]] =
    getCountFeatures(aggregateContextToPrecompute)

  /**
   * Merges using the precomputed count features; `aggregateContext` itself is
   * not consulted and is assumed to equal `aggregateContextToPrecompute`.
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /** Same merge as `mergeRecord`, for callers that have no feature context at hand. */
  def defaultMergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord]
  ): Unit =
    mergeRecordFromCountFeatures(mutableInputRecord, aggregateRecords, precomputedCountFeatures)

  /**
   * For each feature in `countFeatures`, gathers its values from
   * `aggregateRecords` and writes the derived features into
   * `mutableInputRecord`.
   */
  def mergeRecordFromCountFeatures(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    countFeatures: Seq[Feature[_]]
  ): Unit = {
    val richInputRecord = new SRichDataRecord(mutableInputRecord)
    for (countFeature <- countFeatures) {
      mergeRecordFromCountFeature(
        richInputRecord,
        countFeature,
        getFeatureValues(aggregateRecords, countFeature))
    }
  }

  /** The derived output features; `aggregateContext` is not consulted. */
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] =
    // Set is invariant, so each element is widened explicitly.
    outputFeaturesPostMerge.map(feature => feature.asInstanceOf[Feature[_]])
}

View File

@ -1,46 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.ml.api._
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.TypedPipe
/**
 * Left sketch join of two data sets on a composite key built from a tuple of
 * features, merging the matching join record into each input record.
 */
object DataSetPipeSketchJoin {
  val DefaultSketchNumReducers = 500
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger
  // String-to-bytes conversion kept in implicit scope.
  // NOTE(review): presumably the serialization `.sketch` requires - confirm.
  implicit val str2Byte: String => Array[Byte] =
    implicitly[Injection[String, Array[Byte]]].toFunction

  /**
   * Computes a left sketch join on a set of skewed keys.
   *
   * @param inputDataSet left side; every one of its records is kept
   * @param skewedJoinKeys tuple (Product) of join key features; its elements
   *                       are cast to `Feature[_]`
   * @param joinFeaturesDataSet right side whose matching record is merged into
   *                            the input record
   * @param sketchNumReducers number of reducers passed to `.sketch`
   * @return the joined records together with the merged feature context
   */
  def apply(
    inputDataSet: DataSetPipe,
    skewedJoinKeys: Product,
    joinFeaturesDataSet: DataSetPipe,
    sketchNumReducers: Int = DefaultSketchNumReducers
  ): DataSetPipe = {
    val joinKeyList = skewedJoinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]

    // The join key is the string form of the list of key feature values. The
    // rich wrapper is created once per record rather than once per key.
    def makeKey(record: DataRecord): String = {
      val richRecord = SRichDataRecord(record)
      joinKeyList
        .map(richRecord.getFeatureValue(_))
        .toString
    }

    def byKey(pipe: DataSetPipe): TypedPipe[(String, DataRecord)] =
      pipe.records.map(record => (makeKey(record), record))

    val joinedRecords = byKey(inputDataSet)
      .sketch(sketchNumReducers)
      .leftJoin(byKey(joinFeaturesDataSet))
      .values
      .map {
        case (inputRecord, joinFeaturesOpt) =>
          // The merge mutates inputRecord; unmatched records pass through as is.
          joinFeaturesOpt.foreach { joinRecord => dataRecordMerger.merge(inputRecord, joinRecord) }
          inputRecord
      }

    DataSetPipe(
      joinedRecords,
      FeatureContext.merge(inputDataSet.featureContext, joinFeaturesDataSet.featureContext)
    )
  }
}

View File

@ -1,26 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import scala.collection.JavaConverters._
/**
 * A really bad default merge policy: it merges every aggregate feature of
 * the first aggregate record in the list (i.e. the first sparse key value)
 * into the input record. Aggregate features are not renamed, for simplicity.
 * Avoid using this merge policy if at all possible.
 */
object PickFirstRecordPolicy extends SparseBinaryMergePolicy {
  val dataRecordMerger: DataRecordMerger = new DataRecordMerger

  /** Merges the head of `aggregateRecords`, if any, into `mutableInputRecord`. */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    for (firstRecord <- aggregateRecords.headOption) {
      dataRecordMerger.merge(mutableInputRecord, firstRecord)
    }

  /** Nothing is renamed, so the output features are exactly those of the context. */
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] = {
    val allFeatures = aggregateContext.getAllFeatures.asScala
    allFeatures.toSet
  }
}

View File

@ -1,226 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.metrics.AggregationMetricCommon
import java.lang.{Boolean => JBoolean}
import java.lang.{Double => JDouble}
/**
 * Describes one engagement-rate output feature.
 *
 * @param engagementFeature aggregate feature read as the engagement count
 * @param impressionFeature aggregate feature read as the impression count
 * @param outputFeature feature that receives the computed rate
 */
case class CtrDescriptor(
engagementFeature: Feature[JDouble],
impressionFeature: Feature[JDouble],
outputFeature: Feature[JDouble])
object PickTopCtrBuilderHelper {

  /**
   * Builds one [[CtrDescriptor]] per engagement aggregate feature of the
   * aggregate groups carrying `aggregatePrefix`.
   *
   * The impression feature is an output feature of a descriptor that has
   * neither a feature nor a label in its query. The engagement features are
   * the output features of descriptors that have no feature and whose label is
   * one of `engagementLabels`. Each output feature is named
   * "<engagement feature name>.<outputSuffix>".
   *
   * NOTE(review): when several impression candidates exist, the one picked is
   * whichever the set yields first - confirm only one is expected.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param outputSuffix suffix of the generated output feature names
   * @throws NoSuchElementException if no impression feature exists for the prefix
   */
  def createCtrDescriptors(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String
  ): Set[CtrDescriptor] = {
    val aggregateFeatures = aggregatesToCompute
      .filter(_.aggregatePrefix == aggregatePrefix)

    val impressionFeature = aggregateFeatures
      .flatMap { group =>
        group.individualAggregateDescriptors
          .filter(_.query.feature == None)
          .filter(_.query.label == None)
          .flatMap(_.outputFeatures)
      }
      .headOption
      // Same exception type as the bare `.head` this replaces, but the message
      // now names the misconfigured prefix.
      .getOrElse(
        throw new NoSuchElementException(
          s"No impression feature (aggregate without feature and label) found " +
            s"for aggregate prefix '$aggregatePrefix'"
        )
      )
      .asInstanceOf[Feature[JDouble]]

    val aggregateEngagementFeatures =
      aggregateFeatures
        .flatMap { group =>
          group.individualAggregateDescriptors
            .filter(_.query.feature == None)
            .filter { descriptor =>
              //TODO: we should remove the need to pass around engagementLabels and just use all the labels available.
              descriptor.query.label.exists(engagementLabels.contains(_))
            }
            .flatMap(_.outputFeatures)
        }
        .map(_.asInstanceOf[Feature[JDouble]])

    aggregateEngagementFeatures
      .map { aggregateEngagementFeature =>
        CtrDescriptor(
          engagementFeature = aggregateEngagementFeature,
          impressionFeature = impressionFeature,
          outputFeature = new Feature.Continuous(
            aggregateEngagementFeature.getDenseFeatureName + "." + outputSuffix,
            AggregationMetricCommon.derivePersonalDataTypes(
              Some(aggregateEngagementFeature),
              Some(impressionFeature)
            )
          )
        )
      }
  }
}
object PickTopCtrPolicy {

  /**
   * Creates a [[PickTopCtrPolicy]] whose descriptors are derived from the
   * aggregate groups carrying `aggregatePrefix`.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param smoothing constant added to the impressions in the rate's denominator
   * @param outputSuffix suffix of the generated output feature names
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    smoothing: Double = 1.0,
    outputSuffix: String = "ratio"
  ): PickTopCtrPolicy =
    PickTopCtrPolicy(
      PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix,
        engagementLabels,
        aggregatesToCompute,
        outputSuffix
      ),
      smoothing
    )
}
object CombinedTopNCtrsByWilsonConfidenceIntervalPolicy {

  /**
   * Creates a [[CombinedTopNCtrsByWilsonConfidenceIntervalPolicy]] whose
   * descriptors are derived from the aggregate groups carrying `aggregatePrefix`.
   *
   * @param aggregatePrefix prefix selecting the aggregate groups to use
   * @param engagementLabels labels whose aggregates count as engagements
   * @param aggregatesToCompute all configured aggregate groups
   * @param outputSuffix suffix of the generated output feature names
   * @param z z-score of the Wilson score interval
   * @param topN number of highest rates combined into the output score
   */
  def build(
    aggregatePrefix: String,
    engagementLabels: Set[Feature[JBoolean]],
    aggregatesToCompute: Set[TypedAggregateGroup[_]],
    outputSuffix: String = "ratioWithWCI",
    z: Double = 1.96,
    topN: Int = 1
  ): CombinedTopNCtrsByWilsonConfidenceIntervalPolicy =
    CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
      PickTopCtrBuilderHelper.createCtrDescriptors(
        aggregatePrefix,
        engagementLabels,
        aggregatesToCompute,
        outputSuffix
      ),
      z,
      topN
    )
}
/**
 * Base of the merge policies that score the aggregate records of each sparse
 * key value by an engagement rate (derived from an engagement feature and an
 * impression feature) and write a single score per descriptor to that
 * descriptor's output feature.
 *
 * Variants are created by implementing `calculateCtr` (how one record is
 * scored) and `combineTopNCtrsToSingleScore` (how the sorted rates are reduced
 * to the value that is written).
 */
abstract class PickTopCtrPolicyBase(ctrDescriptors: Set[CtrDescriptor])
    extends SparseBinaryMergePolicy {

  /** Reads a continuous feature from a record, treating a missing value as 0.0. */
  private def getContinuousFeature(
    aggregateRecord: DataRecord,
    feature: Feature[JDouble]
  ): Double =
    Option(SRichDataRecord(aggregateRecord).getFeatureValue(feature))
      .fold(0.0)(_.asInstanceOf[JDouble].toDouble)

  /**
   * For every descriptor, computes the rate of each aggregate record, sorts
   * the rates in descending order and, when they combine to a score, writes
   * only that score to the input record.
   */
  override def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit =
    for (descriptor <- ctrDescriptors) {
      val ctrs = aggregateRecords.map { aggregateRecord =>
        val impressions = getContinuousFeature(aggregateRecord, descriptor.impressionFeature)
        val engagements = getContinuousFeature(aggregateRecord, descriptor.engagementFeature)
        calculateCtr(impressions, engagements)
      }
      val sortedCtrs = ctrs.sortBy(-_)

      combineTopNCtrsToSingleScore(sortedCtrs).foreach { score =>
        SRichDataRecord(mutableInputRecord).setFeatureValue(descriptor.outputFeature, score)
      }
    }

  /** Scores a single aggregate record from its impression and engagement counts. */
  protected def calculateCtr(impressions: Double, engagements: Double): Double

  /** Reduces the rates (sorted in descending order) to the score to write, if any. */
  protected def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double]

  /** The output features of all descriptors; `aggregateContext` is not consulted. */
  override def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]] = {
    val outputFeatures = ctrDescriptors.map(_.outputFeature)
    outputFeatures.toSet
  }
}
/**
 * Picks the highest smoothed engagement rate among the aggregate records.
 *
 * @param ctrDescriptors descriptors of the rates to compute
 * @param smoothing constant added to the impressions in the denominator;
 *                  must be strictly positive
 */
case class PickTopCtrPolicy(ctrDescriptors: Set[CtrDescriptor], smoothing: Double = 1.0)
    extends PickTopCtrPolicyBase(ctrDescriptors) {
  require(smoothing > 0.0)

  /** Engagements divided by the smoothed impressions. */
  override def calculateCtr(impressions: Double, engagements: Double): Double = {
    val smoothedImpressions = smoothing + impressions
    engagements / smoothedImpressions
  }

  /** The rates arrive sorted in descending order, so the first one is the top rate. */
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    sortedCtrs.headOption
}
/**
 * Scores each aggregate record by the lower bound of its Wilson score
 * interval and combines the `topN` highest scores into one probability.
 *
 * @param ctrDescriptors descriptors of the rates to compute
 * @param z z-score designating the confidence of the interval
 * @param topN number of highest scores that are combined
 */
case class CombinedTopNCtrsByWilsonConfidenceIntervalPolicy(
  ctrDescriptors: Set[CtrDescriptor],
  z: Double = 1.96,
  topN: Int = 1)
    extends PickTopCtrPolicyBase(ctrDescriptors) {
  private val zSquared = z * z
  private val zSquaredDiv2 = zSquared / 2.0
  private val zSquaredDiv4 = zSquared / 4.0

  /**
   * Lower bound of the Wilson score interval, which roughly says "the actual
   * engagement rate is at least this value" with the confidence designated by
   * the z-score:
   * https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval#Wilson_score_interval
   *
   * Returns 0.0 when there are no impressions.
   */
  override def calculateCtr(rawImpressions: Double, engagements: Double): Double = {
    // Guard against engagements exceeding impressions, which would put the
    // observed rate above 1.
    val impressions = Math.max(rawImpressions, engagements)
    if (impressions > 0.0) {
      val p = engagements / impressions
      val center = p + zSquaredDiv2 / impressions
      val variance = (p * (1.0 - p) + zSquaredDiv4 / impressions) / impressions
      val margin = z * Math.sqrt(variance)
      val normalizer = 1.0 + zSquared / impressions
      (center - margin) / normalizer
    } else 0.0
  }

  /**
   * Takes the topN engagement rates and returns the joint probability
   * {1.0 - Π(1.0 - p)}, or None when there is no rate at all.
   *
   * E.g. say you have a 0.6 chance of clicking on a tweet shared by user A and
   * a 0.3 chance of clicking on a tweet shared by user B. Seeing a tweet shared
   * by both does not give a 0.9 chance of clicking; rather, the chance of NOT
   * clicking is 0.4 * 0.7. The product is accumulated as a sum of logarithms.
   */
  override def combineTopNCtrsToSingleScore(sortedCtrs: Seq[Double]): Option[Double] =
    if (sortedCtrs.isEmpty) None
    else {
      val logProbNoEngagement = sortedCtrs.take(topN).map(p => Math.log(1.0 - p)).sum
      Some(1.0 - Math.exp(logProbNoEngagement))
    }
}

View File

@ -1,199 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.Feature
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.typed.TypedPipe
import com.twitter.scalding.typed.UnsortedGrouped
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import java.util.{Set => JSet}
import scala.collection.JavaConverters._
/**
 * Joins aggregate features onto an input data set when at least one join key
 * is a sparse binary feature, condensing the multiple matching aggregate
 * records per input record with a SparseBinaryMergePolicy.
 */
object SparseBinaryAggregateJoin {
import TypedAggregateGroup._
/**
 * Builds the join key of a record: the string form of the list of its key
 * values, in joinKeyList order. For a sparse binary key the value is read from
 * sparseFeature(sparseKey) instead of the sparse key itself.
 * NOTE(review): presumably sparseFeature is the single-valued feature that
 * aggregate records carry in place of the sparse key - confirm.
 */
def makeKey(record: DataRecord, joinKeyList: List[Feature[_]]): String = {
joinKeyList.map {
case sparseKey: Feature.SparseBinary =>
SRichDataRecord(record).getFeatureValue(sparseFeature(sparseKey))
case nonSparseKey: Feature[_] =>
SRichDataRecord(record).getFeatureValue(nonSparseKey)
}.toString
}
/**
 * @param record Data record to get all possible sparse aggregate keys from
 * @param joinKeyList List of join key features (some can be sparse and some non-sparse)
 * @return A list of string keys to use for joining
 */
def makeKeyPermutations(record: DataRecord, joinKeyList: List[Feature[_]]): List[String] = {
// Pairs each key's dense feature id with the set of values it takes in the
// record; keys whose value is null contribute nothing.
val allIdValues = joinKeyList.flatMap {
case sparseKey: Feature.SparseBinary => {
val id = sparseKey.getDenseFeatureId
val valuesOpt = Option(SRichDataRecord(record).getFeatureValue(sparseKey))
.map(_.asInstanceOf[JSet[String]].asScala.toSet)
valuesOpt.map { (id, _) }
}
case nonSparseKey: Feature[_] => {
val id = nonSparseKey.getDenseFeatureId
Option(SRichDataRecord(record).getFeatureValue(nonSparseKey)).map { value =>
(id, Set(value.toString))
}
}
}
// Each permutation maps feature id to one value; it is rendered in
// joinKeyList order, with "" standing in for ids it does not contain.
sparseBinaryPermutations(allIdValues).toList.map { idValues =>
joinKeyList.map { key => idValues.getOrElse(key.getDenseFeatureId, "") }.toString
}
}
// Keys every aggregate record by makeKey.
private[this] def mkKeyIndexedAggregates(
joinFeaturesDataSet: DataSetPipe,
joinKeyList: List[Feature[_]]
): TypedPipe[(String, DataRecord)] =
joinFeaturesDataSet.records
.map { record => (makeKey(record, joinKeyList), record) }
// Emits every input record once per key permutation it can join on.
private[this] def mkKeyIndexedInput(
inputDataSet: DataSetPipe,
joinKeyList: List[Feature[_]]
): TypedPipe[(String, DataRecord)] =
inputDataSet.records
.flatMap { record =>
for {
key <- makeKeyPermutations(record, joinKeyList)
} yield { (key, record) }
}
// Same as mkKeyIndexedInput, but carries only the record's unique-id key
// (makeKey over uniqueIdFeatureList) instead of the whole record.
private[this] def mkKeyIndexedInputWithUniqueId(
inputDataSet: DataSetPipe,
joinKeyList: List[Feature[_]],
uniqueIdFeatureList: List[Feature[_]]
): TypedPipe[(String, String)] =
inputDataSet.records
.flatMap { record =>
for {
key <- makeKeyPermutations(record, joinKeyList)
} yield { (key, makeKey(record, uniqueIdFeatureList)) }
}
// Inner-joins inputs with aggregates on the join key, then groups the matching
// aggregate records into one list per input record.
private[this] def mkRecordIndexedAggregates(
keyIndexedInput: TypedPipe[(String, DataRecord)],
keyIndexedAggregates: TypedPipe[(String, DataRecord)]
): UnsortedGrouped[DataRecord, List[DataRecord]] =
keyIndexedInput
.join(keyIndexedAggregates)
.map { case (_, (inputRecord, aggregateRecord)) => (inputRecord, aggregateRecord) }
.group
.toList
// Inner-joins input ids with aggregates on the join key, then groups the
// matching aggregate records into one list per unique input id.
private[this] def mkRecordIndexedAggregatesWithUniqueId(
keyIndexedInput: TypedPipe[(String, String)],
keyIndexedAggregates: TypedPipe[(String, DataRecord)]
): UnsortedGrouped[String, List[DataRecord]] =
keyIndexedInput
.join(keyIndexedAggregates)
.map { case (_, (inputId, aggregateRecord)) => (inputId, aggregateRecord) }
.group
.toList
/**
 * Left-joins every input record (used as the join key itself) with its list
 * of aggregate records and lets mergePolicy merge them into the record.
 * Records without aggregates are passed through untouched.
 */
def mkJoinedDataSet(
inputDataSet: DataSetPipe,
joinFeaturesDataSet: DataSetPipe,
recordIndexedAggregates: UnsortedGrouped[DataRecord, List[DataRecord]],
mergePolicy: SparseBinaryMergePolicy
): TypedPipe[DataRecord] =
inputDataSet.records
.map(record => (record, ()))
.leftJoin(recordIndexedAggregates)
.map {
case (inputRecord, (_, aggregateRecordsOpt)) =>
aggregateRecordsOpt
.map { aggregateRecords =>
mergePolicy.mergeRecord(
inputRecord,
aggregateRecords,
joinFeaturesDataSet.featureContext
)
inputRecord
}
.getOrElse(inputRecord)
}
/**
 * Variant of mkJoinedDataSet that joins on the string key built from
 * uniqueIdFeatureList rather than on the record itself.
 * NOTE(review): input records whose unique-id key is equal receive the same
 * list of aggregate records - confirm the id features really are unique.
 */
def mkJoinedDataSetWithUniqueId(
inputDataSet: DataSetPipe,
joinFeaturesDataSet: DataSetPipe,
recordIndexedAggregates: UnsortedGrouped[String, List[DataRecord]],
mergePolicy: SparseBinaryMergePolicy,
uniqueIdFeatureList: List[Feature[_]]
): TypedPipe[DataRecord] =
inputDataSet.records
.map(record => (makeKey(record, uniqueIdFeatureList), record))
.leftJoin(recordIndexedAggregates)
.map {
case (_, (inputRecord, aggregateRecordsOpt)) =>
aggregateRecordsOpt
.map { aggregateRecords =>
mergePolicy.mergeRecord(
inputRecord,
aggregateRecords,
joinFeaturesDataSet.featureContext
)
inputRecord
}
.getOrElse(inputRecord)
}
/**
 * If uniqueIdFeatures is non-empty and the join keys include a sparse binary
 * key, the join will use this set of keys as a unique id to reduce
 * memory consumption. You should need this option only for
 * memory-intensive joins to avoid OOM errors.
 *
 * When no join key is sparse binary, this falls back to
 * inputDataSet.joinWithSmaller and neither mergePolicy nor
 * uniqueIdFeaturesOpt is used.
 *
 * @param inputDataSet data set to add aggregate features to
 * @param joinKeys tuple (Product) of join key features
 * @param joinFeaturesDataSet aggregate data set to join in
 * @param mergePolicy how to condense multiple aggregate records per input record
 * @param uniqueIdFeaturesOpt optional tuple of features identifying an input record
 */
def apply(
inputDataSet: DataSetPipe,
joinKeys: Product,
joinFeaturesDataSet: DataSetPipe,
mergePolicy: SparseBinaryMergePolicy = PickFirstRecordPolicy,
uniqueIdFeaturesOpt: Option[Product] = None
): DataSetPipe = {
val joinKeyList = joinKeys.productIterator.toList.asInstanceOf[List[Feature[_]]]
val sparseBinaryJoinKeySet =
joinKeyList.toSet.filter(_.getFeatureType() == FeatureType.SPARSE_BINARY)
val containsSparseBinaryKey = !sparseBinaryJoinKeySet.isEmpty
if (containsSparseBinaryKey) {
val uniqueIdFeatureList = uniqueIdFeaturesOpt
.map(uniqueIdFeatures =>
uniqueIdFeatures.productIterator.toList.asInstanceOf[List[Feature[_]]])
.getOrElse(List.empty[Feature[_]])
val keyIndexedAggregates = mkKeyIndexedAggregates(joinFeaturesDataSet, joinKeyList)
// Without unique-id features whole records act as grouping/join keys;
// with them only the compact id string is shuffled alongside the aggregates.
val joinedDataSet = if (uniqueIdFeatureList.isEmpty) {
val keyIndexedInput = mkKeyIndexedInput(inputDataSet, joinKeyList)
val recordIndexedAggregates =
mkRecordIndexedAggregates(keyIndexedInput, keyIndexedAggregates)
mkJoinedDataSet(inputDataSet, joinFeaturesDataSet, recordIndexedAggregates, mergePolicy)
} else {
val keyIndexedInput =
mkKeyIndexedInputWithUniqueId(inputDataSet, joinKeyList, uniqueIdFeatureList)
val recordIndexedAggregates =
mkRecordIndexedAggregatesWithUniqueId(keyIndexedInput, keyIndexedAggregates)
mkJoinedDataSetWithUniqueId(
inputDataSet,
joinFeaturesDataSet,
recordIndexedAggregates,
mergePolicy,
uniqueIdFeatureList
)
}
DataSetPipe(
joinedDataSet,
mergePolicy.mergeContext(
inputDataSet.featureContext,
joinFeaturesDataSet.featureContext
)
)
} else {
inputDataSet.joinWithSmaller(joinKeys, joinFeaturesDataSet) { _.pass }
}
}
}

View File

@ -1,81 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.ml.api._
import com.twitter.ml.api.FeatureContext
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup
import scala.collection.JavaConverters._
/**
* When using the aggregates framework to group by sparse binary keys,
* we generate different aggregate feature values for each possible
* value of the sparse key. Hence, when joining back the aggregate
* features with a training data set, each individual training record
* has multiple aggregate features to choose from, for each value taken
* by the sparse key(s) in the training record. The merge policy trait
* below specifies how to condense/combine this variable number of
* aggregate features into a constant number of features for training.
* Some simple policies might be: pick the first feature set (randomly),
* pick the top sorted by some attribute, or take some average.
*
* Example: suppose we group by (ADVERTISER_ID, INTEREST_ID) where INTEREST_ID
* is the sparse key, and compute a "CTR" aggregate feature for each such
* pair measuring the click through rate on ads with (ADVERTISER_ID, INTEREST_ID).
* Say we have the following aggregate records:
*
* (ADVERTISER_ID = 1, INTEREST_ID = 1, CTR = 5%)
* (ADVERTISER_ID = 1, INTEREST_ID = 2, CTR = 15%)
* (ADVERTISER_ID = 2, INTEREST_ID = 1, CTR = 1%)
* (ADVERTISER_ID = 2, INTEREST_ID = 2, CTR = 10%)
* ...
* At training time, each training record has one value for ADVERTISER_ID, but it
* has multiple values for INTEREST_ID e.g.
*
* (ADVERTISER_ID = 1, INTEREST_IDS = (1,2))
*
* There are multiple potential CTRs we can get when joining in the aggregate features:
* in this case 2 values (5% and 15%) but in general it could be many depending on how
* many interests the user has. When joining back the CTR features, the merge policy says how to
* combine all these CTRs to engineer features.
*
* "Pick first" would say - pick some random CTR (whatever is first in the list, maybe 5%)
* for training (probably not a good policy). "Sort by CTR" could be a policy
* that just picks the top CTR and uses it as a feature (here 15%). Similarly, you could
* imagine "Top K sorted by CTR" (use both 5 and 15%) or "Avg CTR" (10%) or other policies,
* all of which are defined as objects/case classes that override this trait.
*/
trait SparseBinaryMergePolicy {

  /**
   * Merges the aggregate records into the input record, in place.
   *
   * @param mutableInputRecord Input record to add aggregates to
   * @param aggregateRecords Aggregate feature records
   * @param aggregateContext Context for aggregate records
   */
  def mergeRecord(
    mutableInputRecord: DataRecord,
    aggregateRecords: List[DataRecord],
    aggregateContext: FeatureContext
  ): Unit

  /**
   * @param aggregateContext Context for aggregate records
   * @return The features this policy contributes to the merged context
   */
  def aggregateFeaturesPostMerge(aggregateContext: FeatureContext): Set[Feature[_]]

  /**
   * @param inputContext Context for input record
   * @param aggregateContext Context for aggregate records
   * @return Context for record returned by mergeRecord(): the input features
   *         together with the features of aggregateFeaturesPostMerge()
   */
  def mergeContext(
    inputContext: FeatureContext,
    aggregateContext: FeatureContext
  ): FeatureContext = {
    val inputFeatures: Set[Feature[_]] = inputContext.getAllFeatures.asScala.toSet
    val mergedFeatures = inputFeatures ++ aggregateFeaturesPostMerge(aggregateContext)
    new FeatureContext(mergedFeatures.toSeq.asJava)
  }

  /**
   * The output features of an aggregate group once this policy is applied.
   * A group with no sparse binary key keeps its own output features; otherwise
   * they are passed through aggregateFeaturesPostMerge().
   */
  def allOutputFeaturesPostMergePolicy[T](config: TypedAggregateGroup[T]): Set[Feature[_]] = {
    val groupsBySparseBinaryKey = config.keysToAggregate
      .exists(_.getFeatureType == FeatureType.SPARSE_BINARY)
    if (groupsBySparseBinaryKey)
      aggregateFeaturesPostMerge(new FeatureContext(config.allOutputFeatures.toSeq.asJava))
    else config.allOutputFeatures
  }
}

View File

@ -1,109 +0,0 @@
package com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion
import com.twitter.bijection.Injection
import com.twitter.ml.api._
import com.twitter.ml.api.Feature
import com.twitter.ml.api.util.SRichDataRecord
import com.twitter.scalding.typed.TypedPipe
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.TypedAggregateGroup.sparseFeature
import scala.collection.JavaConverters._
/**
 * One aggregate data set to join in SparseBinaryMultipleAggregateJoin.
 *
 * @param aggregates aggregate records together with their feature context
 * @param sparseKey sparse binary feature whose values in an input record
 *                  select the aggregate records to merge
 * @param mergePolicies policies applied, in order, to merge the selected
 *                      aggregate records into the input record
 */
case class SparseJoinConfig(
aggregates: DataSetPipe,
sparseKey: Feature.SparseBinary,
mergePolicies: SparseBinaryMergePolicy*)
/**
 * Joins several sparse-key aggregate data sets onto a source data set in a
 * single pass: all aggregates are first grouped by a common key, joined once
 * with the source, and then merged per join config.
 */
object SparseBinaryMultipleAggregateJoin {
// (common key value, ((sparse key feature, sparse key value), aggregate record))
type CommonMap = (String, ((Feature.SparseBinary, String), DataRecord))
/**
 * @param source data set to add aggregate features to
 * @param commonKey feature present in the source and in every aggregate data set
 * @param joinConfigs aggregate data sets with their sparse key and merge policies
 * @param rightJoin use a right join with the aggregates on the left;
 *                  ignored when isSketchJoin is set
 * @param isSketchJoin use a sketch join of the source against the aggregates
 * @param numSketchJoinReducers number of reducers passed to `.sketch`.
 *                              NOTE(review): the default of 0 is passed through
 *                              as is when isSketchJoin is true - callers
 *                              presumably must override it; confirm.
 * @return the source records with merged aggregates, plus the merged context
 */
def apply(
source: DataSetPipe,
commonKey: Feature[_],
joinConfigs: Set[SparseJoinConfig],
rightJoin: Boolean = false,
isSketchJoin: Boolean = false,
numSketchJoinReducers: Int = 0
): DataSetPipe = {
val emptyPipe: TypedPipe[CommonMap] = TypedPipe.empty
// One pipe per join config, keyed by the common key value.
// NOTE(review): other code in this package wraps getFeatureValue in Option(...),
// which suggests it can return null; .toString would then throw here - confirm
// that the sparse key and common key are always set on aggregate records.
val aggregateMaps: Set[TypedPipe[CommonMap]] = joinConfigs.map { joinConfig =>
joinConfig.aggregates.records.map { record =>
val sparseKeyValue =
SRichDataRecord(record).getFeatureValue(sparseFeature(joinConfig.sparseKey)).toString
val commonKeyValue = SRichDataRecord(record).getFeatureValue(commonKey).toString
(commonKeyValue, ((joinConfig.sparseKey, sparseKeyValue), record))
}
}
// Union of all aggregates, collected per common key into a lookup map from
// (sparse key feature, sparse key value) to aggregate record. On duplicate
// lookup keys toMap keeps the later entry.
val commonKeyToAggregateMap = aggregateMaps
.foldLeft(emptyPipe) {
case (union: TypedPipe[CommonMap], next: TypedPipe[CommonMap]) =>
union ++ next
}
.group
.toList
.map {
case (commonKeyValue, aggregateTuples) =>
(commonKeyValue, aggregateTuples.toMap)
}
// Source records keyed by their common key value (same null caveat as above).
val commonKeyToRecordMap = source.records
.map { record =>
val commonKeyValue = SRichDataRecord(record).getFeatureValue(commonKey).toString
(commonKeyValue, record)
}
// rightJoin is not supported by Sketched, so rightJoin will be ignored if isSketchJoin is set
implicit val string2Byte = (value: String) => Injection[String, Array[Byte]](value)
val intermediateRecords = if (isSketchJoin) {
commonKeyToRecordMap.group
.sketch(numSketchJoinReducers)
.leftJoin(commonKeyToAggregateMap)
.toTypedPipe
} else if (rightJoin) {
// The swap restores the (input record, aggregates option) value order
// produced by the other two branches.
commonKeyToAggregateMap
.rightJoin(commonKeyToRecordMap)
.mapValues(_.swap)
.toTypedPipe
} else {
commonKeyToRecordMap.leftJoin(commonKeyToAggregateMap).toTypedPipe
}
val joinedRecords = intermediateRecords
.map {
case (commonKeyValue, (inputRecord, aggregateTupleMapOpt)) =>
aggregateTupleMapOpt.foreach { aggregateTupleMap =>
joinConfigs.foreach { joinConfig =>
// Values the input record takes for this config's sparse key
// (empty when the feature is not set).
val sparseKeyValues = Option(
SRichDataRecord(inputRecord)
.getFeatureValue(joinConfig.sparseKey)
).map(_.asScala.toList)
.getOrElse(List.empty[String])
// Aggregate records found for those values; values without one are skipped.
val aggregateRecords = sparseKeyValues.flatMap { sparseKeyValue =>
aggregateTupleMap.get((joinConfig.sparseKey, sparseKeyValue))
}
// Every policy is invoked, even when aggregateRecords is empty.
joinConfig.mergePolicies.foreach { mergePolicy =>
mergePolicy.mergeRecord(
inputRecord,
aggregateRecords,
joinConfig.aggregates.featureContext
)
}
}
}
inputRecord
}
// Threads the source context through mergeContext of every policy of every config.
val joinedFeatureContext = joinConfigs
.foldLeft(source.featureContext) {
case (left, joinConfig) =>
joinConfig.mergePolicies.foldLeft(left) {
case (soFar, mergePolicy) =>
mergePolicy.mergeContext(soFar, joinConfig.aggregates.featureContext)
}
}
DataSetPipe(joinedRecords, joinedFeatureContext)
}
}

View File

@ -1,5 +0,0 @@
aggregation.rst
batch.rst
index.rst
real-time.rst
troubleshooting.rst

View File

@ -1,167 +0,0 @@
.. _aggregation:
Core Concepts
=============
This page provides an overview of the aggregation framework and goes through examples on how to define aggregate features. In general, we can think of an aggregate feature as a grouped set of records, on which we incrementally update the aggregate feature values, crossed by the provided features and conditional on the provided labels.
AggregateGroup
--------------
An `AggregateGroup` defines a single unit of aggregate computation, similar to a SQL query. These are executed by the underlying jobs (internally, a `DataRecordAggregationMonoid <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/DataRecordAggregationMonoid.scala#n42>`_ is applied to `DataRecords` that contain the features to aggregate). Many of these groups can exist to define different types of aggregate features.
Let's start with the following examples of an `AggregateGroup` to discuss the meaning of each of its constructor arguments:
.. code-block:: scala
val UserAggregateStore = "user_aggregates"
val aggregatesToCompute: Set[TypedAggregateGroup[_]] = Set(
AggregateGroup(
inputSource = timelinesDailyRecapSource,
aggregatePrefix = "user_aggregate_v2",
preTransformOpt = Some(RemoveUserIdZero),
keys = Set(USER_ID),
features = Set(HAS_PHOTO),
labels = Set(IS_FAVORITED),
metrics = Set(CountMetric, SumMetric),
halfLives = Set(50.days),
outputStore = OfflineAggregateStore(
name = UserAggregateStore,
startDate = "2016-07-15 00:00",
commonConfig = timelinesDailyAggregateSink,
batchesToKeep = 5
)
)
).flatMap(_.buildTypedAggregateGroups)
This `AggregateGroup` computes the number of times each user has faved a tweet with a photo. The aggregate count is decayed with a 50 day halflife.
Naming and preprocessing
------------------------
`UserAggregateStore` is a string val that acts as a scope of a "root path" to which this group of aggregate features will be written. The root path is provided separately by the implementing job.
`inputSource` defines the input source of `DataRecords` that we aggregate on. These records contain the relevant features required for aggregation.
`aggregatePrefix` tells the framework what prefix to use for the aggregate features it generates. A descriptive naming scheme with versioning makes it easier to maintain features as you add or remove them over the long-term.
`preTransforms` is a `Seq[com.twitter.ml.api.ITransform] <https://cgit.twitter.biz/source/tree/src/java/com/twitter/ml/api/ITransform.java>`_ that can be applied to the data records read from the input source before they are fed into the `AggregateGroup` to apply aggregation. These transforms are optional but can be useful for certain preprocessing operations for a group's raw input features.
.. admonition:: Examples
You can downsample input data records by providing `preTransforms`. In addition, you could join different input labels (e.g. "is_push_opened" and "is_push_favorited") and transform them into a combined label that is their union ("is_push_engaged") on which aggregate counts will be calculated.
Keys
----
`keys` is a crucial field in the config. It defines a `Set[com.twitter.ml.api.Feature]` which specifies a set of grouping keys to use for this `AggregateGroup`.
Keys can only be of 3 supported types currently: `DISCRETE`, `STRING` and `SPARSE_BINARY`. Using a discrete or a string/text feature as a key specifies the unit to group records by before applying counting/aggregation operators.
.. admonition:: Examples
.. cssclass:: shortlist
#. If the key is `USER_ID`, this tells the framework to group all records by `USER_ID`, and then apply aggregations (sum/count/etc.) within each user's data to generate aggregate features for each user.
#. If the key is `(USER_ID, AUTHOR_ID)`, then the `AggregateGroup` will output features for each unique user-author pair in the input data.
#. Finally, using a sparse binary feature as key has special "flattening" or "flatMap" like semantics. For example, consider grouping by `(USER_ID, AUTHOR_INTEREST_IDS)` where `AUTHOR_INTEREST_IDS` is a sparse binary feature which represents a set of topic IDs the author may be tweeting about. This creates one record for each `(user_id, interest_id)` pair - so each record with multiple author interests is flattened before feeding it to the aggregation.
Features
--------
`features` specifies a `Set[com.twitter.ml.api.Feature]` to aggregate within each group (defined by the keys specified earlier).
We support 2 types of `features`: `BINARY` and `CONTINUOUS`.
The semantics of how the aggregation works is slightly different based on the type of “feature”, and based on the “metric” (or aggregation operation):
.. cssclass:: shortlist
#. Binary Feature, Count Metric: Suppose we have a binary feature `HAS_PHOTO` in this set, and are applying the “Count” metric (see below for more details on the metrics), with key `USER_ID`. The semantics is that this computes a feature which measures the count of records with `HAS_PHOTO` set to true for each user.
#. Binary Feature, Sum Metric - Does not apply. No feature will be computed.
#. Continuous Feature, Count Metric - The count metric treats all features as binary features ignoring their value. For example, suppose we have a continuous feature `NUM_CHARACTERS_IN_TWEET`, and key `USER_ID`. This measures the count of records that have this feature `NUM_CHARACTERS_IN_TWEET` present.
#. Continuous Feature, Sum Metric - In the above example, the feature measures the sum of `NUM_CHARACTERS_IN_TWEET` values over all of a user's records. Dividing this sum feature by the count feature would give the average number of characters in all tweets.
.. admonition:: Unsupported feature types
`DISCRETE` and `SPARSE` features are not supported by the Sum Metric, because there is no meaning in summing a discrete feature or a sparse feature. You can use them with the CountMetric, but they may not do what you would expect since they will be treated as binary features losing all the information within the feature. The best way to use these is as “keys” and not as “features”.
.. admonition:: Setting includeAnyFeature
If constructor argument `includeAnyFeature` is set, the framework will append a feature with scope `any_feature` to the set of all features you define. This additional feature simply measures the total count of records. So if you set your features to be equal to Set.empty, this will measure the count of records for a given `USER_ID`.
Labels
------
`labels` specifies a set of `BINARY` features that you can cross with, prior to applying aggregations on the `features`. This essentially restricts the aggregate computation to a subset of the records within a particular key.
We typically use this to represent engagement labels in an ML model, in this case, `IS_FAVORITED`.
In this example, we are grouping by `USER_ID`, the feature is `HAS_PHOTO`, the label is `IS_FAVORITED`, and we are computing `CountMetric`. The system will output a feature for each user that represents the number of favorites on tweets having photos by this `userId`.
.. admonition:: Setting includeAnyLabel
If constructor argument `includeAnyLabel` is set (as it is by default), then similar to `any_feature`, the framework automatically appends a label of type `any_label` to the set of all labels you define, which represents not applying any filter or cross.
In this example, `any_label` and `any_feature` are set by default and the system would actually output 4 features for each `user_id`:
.. cssclass:: shortlist
#. The number of `IS_FAVORITED` (favorites) on tweet impressions having `HAS_PHOTO=true`
#. The number of `IS_FAVORITED` (favorites) on all tweet impressions (`any_feature` aggregate)
#. The number of tweet impressions having `HAS_PHOTO=true` (`any_label` aggregate)
#. The total number of tweet impressions for this user id (`any_feature.any_label` aggregate)
.. admonition:: Disabling includeAnyLabel
To disable this automatically generated feature you can use `includeAnyLabel = false` in your config. This will remove some useful features (particularly for counterfactual signal), but it can greatly save on space since it does not store every possible impressed set of keys in the output store. So use this if you are short on space, but not otherwise.
Metrics
-------
`metrics` specifies the aggregate operators to apply. The most commonly used are `Count`, `Sum` and `SumSq`.
As mentioned before, `Count` can be applied to all types of features, but treats every feature as binary and ignores the value of the feature. `Sum` and `SumSq` can only be applied to Continuous features - they will ignore all other features you specify. By combining sum and sumsq and count, you can produce powerful “z-score” features or other distributional features using a post-transform.
It is also possible to add your own aggregate operators (e.g. `LastResetMetric <https://phabricator.twitter.biz/D228537>`_) to the framework with some additional work.
HalfLives
---------
`halfLives` specifies how fast aggregate features should be decayed. It is important to note that the framework works on an incremental basis: in the batch implementation, the summingbird-scalding job takes in the most recently computed aggregate features, processed on data until day `N-1`, then reads new data records for day `N` and computes updated values of the aggregate features. Similarly, the decay of real-time aggregate features takes the actual time delta between the current time and the last time the aggregate feature value was updated.
The halflife `H` specifies how fast to decay old sums/counts to simulate a sliding window of counts. The implementation is such that it will take `H` amount of time to decay an aggregate feature to half its initial value. New observed values of sums/counts are added to the aggregate feature value.
.. admonition:: Batch and real-time
In the batch use case where aggregate features are recomputed on a daily basis, we typically take halflives on the order of weeks or longer (in Timelines, 50 days). In the real-time use case, shorter halflives are appropriate (hours) since they are updated as client engagements are received by the summingbird job.
SQL Equivalent
--------------
Conceptually, you can also think of it as:
.. code-block:: sql
INSERT INTO <outputStore>.<aggregatePrefix>
SELECT AGG(<features>) /* AGG is <metrics>, which is an exponentially decaying SUM or COUNT etc. based on the halfLives */
FROM (
SELECT preTransformOpt(*) FROM <inputSource>
)
WHERE <labels> = True
GROUP BY <keys>
any_feature is AGG(*).
any_label removes the WHERE clause.

View File

@ -1,215 +0,0 @@
.. _batch:
Batch aggregate feature jobs
============================
In the previous section, we went over the core concepts of the aggregation framework and discussed how you can set up your own `AggregateGroups` to compute aggregate features.
Given these groups, this section will discuss how you can set up offline batch jobs to produce the corresponding aggregate features, updated daily. To accomplish this, we need to set up a summingbird-scalding job that is pointed to the input data records containing features and labels to be aggregated.
Input Data
----------
In order to generate aggregate features, the relevant input features need to be available offline as a daily scalding source in `DataRecord` format (typically `DailySuffixFeatureSource <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/ml/api/FeatureSource.scala>`_; `HourlySuffixFeatureSource` may also work, but we have not tested it).
.. admonition:: Note
The input data source should contain the keys, features and labels you want to use in your `AggregateGroups`.
Aggregation Config
------------------
Now that we have a daily data source with input features and labels, we need to set up the `AggregateGroup` config itself. This contains all aggregation groups that you would like to compute, and we will go through the implementation step by step.
.. admonition:: Example: Timelines Quality config
`TimelinesAggregationConfig <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfig.scala>`_ imports the configured `AggregateGroups` from `TimelinesAggregationConfigDetails <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/TimelinesAggregationConfigDetails.scala>`_. The config is then referenced by the implementing summingbird-scalding job which we will set up below.
OfflineAggregateSource
----------------------
Each `AggregateGroup` will need to define a (daily) source of input features. We use `OfflineAggregateSource` for this to tell the aggregation framework where the input data set is and the required timestamp feature that the framework uses to decay aggregate feature values:
.. code-block:: scala
val timelinesDailyRecapSource = OfflineAggregateSource(
name = "timelines_daily_recap",
timestampFeature = TIMESTAMP,
scaldingHdfsPath = Some("/user/timelines/processed/suggests/recap/data_records"),
scaldingSuffixType = Some("daily"),
withValidation = true
)
.. admonition:: Note
.. cssclass:: shortlist
#. The name is not important as long as it is unique.
#. `timestampFeature` must be a discrete feature of type `com.twitter.ml.api.Feature[Long]` and represents the “time” of a given training record in milliseconds - for example, the time at which an engagement, push open event, or abuse event took place that you are trying to train on. If you do not already have such a feature in your daily training data, you need to add one.
#. `scaldingSuffixType` can be “hourly” or “daily” depending on the type of source (`HourlySuffixFeatureSource` vs `DailySuffixFeatureSource`).
#. Set `withValidation` to true to validate the presence of the `_SUCCESS` file. Context: https://jira.twitter.biz/browse/TQ-10618
Output HDFS store
-----------------
The output HDFS store is where the computed aggregate features are stored. This store contains all computed aggregate feature values and is incrementally updated by the aggregates job every day.
.. code-block:: scala
val outputHdfsPath = "/user/timelines/processed/aggregates_v2"
val timelinesOfflineAggregateSink = new OfflineStoreCommonConfig {
override def apply(startDate: String) = new OfflineAggregateStoreCommonConfig(
outputHdfsPathPrefix = outputHdfsPath,
dummyAppId = "timelines_aggregates_v2_ro", // unused - can be arbitrary
dummyDatasetPrefix = "timelines_aggregates_v2_ro", // unused - can be arbitrary
startDate = startDate
)
}
Note: `dummyAppId` and `dummyDatasetPrefix` are unused so can be set to any arbitrary value. They should be removed on the framework side.
The `outputHdfsPathPrefix` is the only field that matters, and should be set to the HDFS path where you want to store the aggregate features. Make sure you have a lot of quota available at that path.
Setting Up Aggregates Job
-------------------------
Once you have defined a config file with the aggregates you would like to compute, the next step is to create the aggregates scalding job using the config (`example <https://cgit.twitter.biz/source/tree/timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation/TimelinesAggregationScaldingJob.scala>`_). This is very concise and requires only a few lines of code:
.. code-block:: scala
object TimelinesAggregationScaldingJob extends AggregatesV2ScaldingJob {
override val aggregatesToCompute = TimelinesAggregationConfig.aggregatesToCompute
}
Now that the scalding job is implemented with the aggregation config, we need to set up a capesos config similar to https://cgit.twitter.biz/source/tree/science/scalding/mesos/timelines/prod.yml:
.. code-block:: yaml
# Common configuration shared by all aggregates v2 jobs
__aggregates_v2_common__: &__aggregates_v2_common__
class: HadoopSummingbirdProducer
bundle: offline_aggregation-deploy.tar.gz
mainjar: offline_aggregation-deploy.jar
pants_target: "bundle timelines/data_processing/ad_hoc/aggregate_interactions/v2/offline_aggregation:bin"
cron_collision_policy: CANCEL_NEW
use_libjar_wild_card: true
.. code-block:: yaml
# Specific job computing user aggregates
user_aggregates_v2:
<<: *__aggregates_v2_common__
cron_schedule: "25 * * * *"
arguments: --batches 1 --output_stores user_aggregates --job_name timelines_user_aggregates_v2
.. admonition:: Important
Each AggregateGroup in your config should have its own associated offline job which specifies `output_stores` pointing to the output store name you defined in your config.
Running The Job
---------------
When you run the batch job for the first time, you need to add a temporary entry to your capesos yml file that looks like this:
.. code-block:: yaml
user_aggregates_v2_initial_run:
<<: *__aggregates_v2_common__
cron_schedule: "25 * * * *"
arguments: --batches 1 --start-time "2017-03-03 00:00:00" --output_stores user_aggregates --job_name timelines_user_aggregates_v2
.. admonition:: Start Time
The additional `--start-time` argument should match the `startDate` in your config for that AggregateGroup, but in the format `yyyy-mm-dd hh:mm:ss`.
To invoke the initial run via capesos, we would do the following (in the Timelines case):
.. code-block:: bash
CAPESOSPY_ENV=prod capesospy-v2 update --build_locally --start_cron user_aggregates_v2_initial_run science/scalding/mesos/timelines/prod.yml
Once it is running smoothly, you can deschedule the initial run job and delete the temporary entry from your production yml config.
.. code-block:: bash
aurora cron deschedule atla/timelines/prod/user_aggregates_v2_initial_run
Note: deschedule it preemptively to avoid repeatedly overwriting the same initial results
Then schedule the production job from jenkins using something like this:
.. code-block:: bash
CAPESOSPY_ENV=prod capesospy-v2 update user_aggregates_v2 science/scalding/mesos/timelines/prod.yml
All future runs (2nd onwards) will use the permanent entry in the capesos yml config that does not have the `start-time` specified.
.. admonition:: Job name has to match
It's important that the production run should share the same `--job_name` with the initial_run so that eagleeye/statebird knows how to keep track of it correctly.
Output Aggregate Features
-------------------------
This scalding job using the example config from the earlier section would output a VersionedKeyValSource to `/user/timelines/processed/aggregates_v2/user_aggregates` on HDFS.
Note that `/user/timelines/processed/aggregates_v2` is the explicitly defined root path while `user_aggregates` is the output directory of the example `AggregateGroup` defined earlier. The latter can be different for different `AggregateGroups` defined in your config.
The VersionedKeyValSource is difficult to use directly in your jobs/offline trainings, but we provide an adapted source `AggregatesV2FeatureSource` that makes it easy to join and use in your jobs:
.. code-block:: scala
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.conversion._
val pipe: DataSetPipe = AggregatesV2FeatureSource(
rootPath = "/user/timelines/processed/aggregates_v2",
storeName = "user_aggregates",
aggregates = TimelinesAggregationConfig.aggregatesToCompute,
trimThreshold = 0
)(dateRange).read
Simply replace the `rootPath`, `storeName` and `aggregates` values with whatever you defined. The `trimThreshold` tells the framework to trim all features below a certain cutoff: 0 is a safe default to use to begin with.
.. admonition:: Usage
This can now be used like any other `DataSetPipe` in offline ML jobs. You can write out the features to a `DailySuffixFeatureSource`, you can join them with your data offline for trainings, or you can write them to a Manhattan store for serving online.
Aggregate Features Example
--------------------------
Here is a sample of the aggregate features we just computed:
.. code-block:: text
user_aggregate_v2.pair.any_label.any_feature.50.days.count: 100.0
user_aggregate_v2.pair.any_label.tweetsource.is_quote.50.days.count: 30.0
user_aggregate_v2.pair.is_favorited.any_feature.50.days.count: 10.0
user_aggregate_v2.pair.is_favorited.tweetsource.is_quote.50.days.count: 6.0
meta.user_id: 123456789
Aggregate feature names match a `prefix.pair.label.feature.half_life.metric` schema and correspond to what was defined in the aggregation config for each of these fields.
.. admonition:: Example
In this example, the above features are capturing that userId 123456789L has:
..
A 50-day decayed count of 100 training records with any label or feature (“tweet impressions”)
A 50-day decayed count of 30 records that are “quote tweets” (tweetsource.is_quote = true)
A 50-day decayed count of 10 records that are favorites on any type of tweet (is_favorited = true)
A 50-day decayed count of 6 records that are “favorites” on “quote tweets” (both of the above are true)
By combining the above, a model might infer that for this specific user, quote tweets comprise 30% of all impressions, have a favorite rate of 6/30 = 20%, compared to a favorite rate of 10/100 = 10% on the total population of tweets.
Therefore, being a quote tweet makes this specific user `123456789L` approximately twice as likely to favorite the tweet, which is useful for prediction and could result in the ML model giving higher scores to & ranking quote tweets higher in a personalized fashion for this user.
Tests for Feature Names
--------------------------
When you change or add an `AggregateGroup`, feature names might change. The Feature Store provides a testing mechanism to assert that the feature names change as you expect. See `tests for feature names <https://docbird.twitter.biz/ml_feature_store/catalog.html#tests-for-feature-names>`_.

View File

@ -1,59 +0,0 @@
# -*- coding: utf-8 -*-
#
# docbird documentation build configuration file
# Note that not all possible configuration values are present in this
# autogenerated file.
#
from os.path import abspath, dirname, isfile, join

# Sphinx extensions loaded when building this documentation project.
extensions = [
    "sphinx.ext.autodoc",
    "sphinx.ext.intersphinx",
    "sphinx.ext.ifconfig",
    "sphinx.ext.graphviz",
    "twitter.docbird.ext.thriftlexer",
    "twitter.docbird.ext.toctree_default_caption",
    "sphinxcontrib.httpdomain",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# The suffix of source filenames.
source_suffix = ".rst"

# The master toctree document.
master_doc = "index"

# General information about the project.
project = u"""Aggregation Framework"""
description = u""""""

# The short X.Y version.
version = u"""1.0"""
# The full version, including alpha/beta/rc tags.
release = u"""1.0"""

exclude_patterns = ["_build"]
pygments_style = "sphinx"
html_theme = "default"
html_static_path = ["_static"]
html_logo = u""""""

# Automagically add project logo, if it exists
# (checks on any build, not just init)
# Scan for some common defaults (png or svg format,
# called "logo" or project name, in docs folder)
if not html_logo:
    # Candidates are looked up next to this configuration file.
    location = dirname(abspath(__file__))
    for logo_file in ["logo.png", "logo.svg", ("%s.png" % project), ("%s.svg" % project)]:
        # Every candidate is checked (no early exit), so when several exist
        # the one listed last wins; html_logo stays empty when none exist.
        html_logo = logo_file if isfile(join(location, logo_file)) else html_logo

graphviz_output_format = "svg"

View File

@ -1,11 +0,0 @@
.. markdowninclude:: ../README.md
.. toctree::
:maxdepth: 2
:hidden:
aggregation
batch
real-time
joining
troubleshooting

View File

@ -1,72 +0,0 @@
.. _joining:
Joining aggregate features to records
======================================
After setting up either offline batch jobs or online real-time summingbird jobs to produce
aggregate features and querying them, we are left with data records containing aggregate features.
This page will go over how to join them with other data records to produce offline training data.
(To discuss: joining aggregates to records online)
Joining Aggregates on Discrete/String Keys
------------------------------------------
Joining aggregate features keyed on discrete or text features to your training data is very easy -
you can use the built in methods provided by `DataSetPipe`. For example, suppose you have aggregates
keyed by `(USER_ID, AUTHOR_ID)`:
.. code-block:: scala
val userAuthorAggregates: DataSetPipe = AggregatesV2FeatureSource(
rootPath = "/path/to/my/aggregates",
storeName = "user_author_aggregates",
aggregates = MyConfig.aggregatesToCompute,
trimThreshold = 0
)(dateRange).read
Offline, you can then join with your training data set as follows:
.. code-block:: scala
val myTrainingData: DataSetPipe = ...
val joinedData = myTrainingData.joinWithLarger((USER_ID, AUTHOR_ID), userAuthorAggregates)
You can read from `AggregatesV2MostRecentFeatureSourceBeforeDate` in order to read the most recent aggregates
before a provided date `beforeDate`. Just note that `beforeDate` must be aligned with the date boundary (so if
you're passing in a `dateRange`, use `dateRange.end`).
Joining Aggregates on Sparse Binary Keys
----------------------------------------
When joining on sparse binary keys, there can be multiple aggregate records to join to each training record in
your training data set. For example, suppose you have set up an aggregate group that is keyed on `(INTEREST_ID, AUTHOR_ID)`
capturing engagement counts of users interested in a particular `INTEREST_ID` for specific authors provided by `AUTHOR_ID`.
Suppose now that you have a training data record representing a specific user action. This training data record contains
a sparse binary feature `INTEREST_IDS` representing all the "interests" of that user - e.g. music, sports, and so on. Each `interest_id`
translates to a different set of counting features found in your aggregates data. Therefore we need a way to merge all of
these different sets of counting features to produce a more compact, fixed-size set of features.
.. admonition:: Merge policies
To do this, the aggregate framework provides a trait `SparseBinaryMergePolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/SparseBinaryMergePolicy.scala>`_. Classes overriding this trait define policies
that state how to merge the individual aggregate features from each sparse binary value (in this case, each `INTEREST_ID` for a user).
Furthermore, we provide `SparseBinaryMultipleAggregateJoin` which executes these policies to merge aggregates.
A simple policy might simply average all the counts from the individual interests, or just take the max, or
a specific quantile. More advanced policies might use custom criteria to decide which interest is most relevant and choose
features from that interest to represent the user, or use some weighted combination of counts.
The framework provides two simple in-built policies (`PickTopCtrPolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/PickTopCtrPolicy.scala>`_
and `CombineCountsPolicy <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/conversion/CombineCountsPolicy.scala>`_, which keeps the topK counts per
record) that you can get started with, though you likely want to implement your own policy based on domain knowledge to get
the best results for your specific problem domain.
.. admonition:: Offline Code Example
The scalding job `TrainingDataWithAggV2Generator <https://cgit.twitter.biz/source/tree/timelines/data_processing/ad_hoc/recap/training_data_generator/TrainingDataWithAggV2Generator.scala>`_ shows how multiple merge policies are defined and implemented to merge aggregates on sparse binary keys to the TQ's training data records.
.. admonition:: Online Code Example
In our (non-FeatureStore enabled) online code path, we merge aggregates on sparse binary keys using the `CombineCountsPolicy <https://cgit.twitter.biz/source/tree/timelinemixer/server/src/main/scala/com/twitter/timelinemixer/injection/recapbase/aggregates/UserFeaturesHydrator.scala#n201>`_.

View File

@ -1,327 +0,0 @@
.. _real_time:
Real-Time aggregate features
============================
In addition to computing batch aggregate features, the aggregation framework supports real-time aggregates as well. The framework concepts used here are identical to the batch use case; however, the underlying implementation differs and is provided by summingbird-storm jobs.
RTA Runbook
-----------
For operational details, please visit http://go/tqrealtimeaggregates.
Prerequisites
-------------
In order to start computing real-time aggregate features, the framework requires the following to be provided:
* A backing memcached store that will hold the computed aggregate features. This is conceptually equivalent to the output HDFS store in the batch compute case.
* Implementation of `StormAggregateSource <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/heron/StormAggregateSource.scala#n15>`_ that creates `DataRecords` with the necessary input features. This serves as the input to the aggregation operations.
* Definition of aggregate features by defining `AggregateGroup` in an implementation of `OnlineAggregationConfigTrait`. This is identical to the batch case.
* Job config file defining the backing memcached for feature storage and retrieval, and job-related parameters.
We will now go through the details in setting up each required component.
Memcached store
---------------
Real-time aggregates use Memcache as the backing cache to store and update aggregate features keys. Caches can be provisioned on `go/cacheboard <https://cacheboardv2--prod--cache.service.atla.twitter.biz/>`_.
.. admonition:: Test and prod caches
For development, it is sufficient to set up a test cache that your new job can query and write to. At the same time, a production cache request should also be submitted as these generally have significant lead times for provisioning.
StormAggregateSource
--------------------
To enable aggregation of your features, we need to start with defining a `StormAggregateSource` that builds a `Producer[Storm, DataRecord]`. This summingbird producer generates `DataRecords` that contain the input features and labels that the real-time aggregate job will compute aggregate features on. Conceptually, this is equivalent to the input data set in the offline batch use case.
.. admonition:: Example
If you are planning to aggregate on client engagements, you would need to subscribe to the `ClientEvent` kafka stream and then convert each event to a `DataRecord` that contains the key and the engagement on which to aggregate.
Typically, we would set up a julep filter for the relevant client events that we would like to aggregate on. This gives us a `Producer[Storm, LogEvent]` object which we then convert to `Producer[Storm, DataRecord]` with adapters that we wrote:
.. code-block:: scala
lazy val clientEventProducer: Producer[Storm, LogEvent] =
ClientEventSourceScrooge(
appId = AppId(jobConfig.appId),
topic = "julep_client_event_suggests",
resumeAtLastReadOffset = false
).source.name("timelines_events")
lazy val clientEventWithCachedFeaturesProducer: Producer[Storm, DataRecord] = clientEventProducer
.flatMap(mkDataRecords)
Note that this way of composing the storm graph gives us flexibility in how we can hydrate input features. If you would like to join more complex features to `DataRecord`, you can do so here with additional storm components which can implement cache queries.
.. admonition:: Timelines Quality use case
In Timelines Quality, we aggregate client engagements on `userId` or `tweetId` and implement
`TimelinesStormAggregateSource <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesStormAggregateSource.scala>`_. We create
`Producer[Storm,LogEvent]` of Timelines engagements to which we apply `ClientLogEventAdapter <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/adapters/client_log_event/ClientLogEventAdapter.scala>`_ which converts the event to `DataRecord` containing `userId`, `tweetId`, `timestampFeature` of the engagement and the engagement label itself.
.. admonition:: MagicRecs use case
MagicRecs has a very similar setup for real-time aggregate features. In addition, they also implement a more complex cache query to fetch the user's history in the `StormAggregateSource` for each observed client engagement to hydrate a richer set of input `DataRecords`:
.. code-block:: scala
val userHistoryStoreService: Storm#Service[Long, History] =
Storm.service(UserHistoryReadableStore)
val clientEventDataRecordProducer: Producer[Storm, DataRecord] =
magicRecsClientEventProducer
.flatMap { ...
(userId, logEvent)
}.leftJoin(userHistoryStoreService)
.flatMap {
case (_, (logEvent, history)) =>
mkDataRecords(LogEventHistoryPair(logEvent, history))
}
.. admonition:: EmailRecs use case
EmailRecs shares the same cache as MagicRecs. They combine notification scribe data with email history data to identify the particular item a user engaged with in an email:
.. code-block:: scala
val emailHistoryStoreService: Storm#Service[Long, History] =
Storm.service(EmailHistoryReadableStore)
val emailEventDataRecordProducer: Producer[Storm, DataRecord] =
emailEventProducer
.flatMap { ...
(userId, logEvent)
}.leftJoin(emailHistoryStoreService)
.flatMap {
case (_, (scribe, history)) =>
mkDataRecords(ScribeHistoryPair(scribe, history))
}
Aggregation config
------------------
The real-time aggregation config is extended from `OnlineAggregationConfigTrait <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/heron/OnlineAggregationConfigTrait.scala>`_ and defines the features to aggregate and the backing memcached store to which they will be written.
Setting up real-time aggregates follows the same rules as in the offline batch use case. The major difference here is that `inputSource` should point to the `StormAggregateSource` implementation that provides the `DataRecord` containing the engagements and core features on which to aggregate. In the offline case, this would have been an `OfflineAggregateSource` pointing to an offline source of daily records.
Finally, `RealTimeAggregateStore` defines the backing memcache to be used and should be provided here as the `outputStore`.
.. NOTE::
Please make sure to provide an `AggregateGroup` for both staging and production. The main difference should be the `outputStore` where features in either environment are read from and written to. You want to make sure that a staged real-time aggregates summingbird job is reading/writing only to the test memcache store and does not mutate the production store.
Job config
----------
In addition to the aggregation config that defines the features to aggregate, the final piece we need to provide is a `RealTimeAggregatesJobConfig` that specifies job values such as `appId`, `teamName` and counts for the various topology components that define the capacity of the job (`Timelines example <https://cgit.twitter.biz/source/tree/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesRealTimeAggregatesJob.scala#n22>`_).
Once you have the job config, implementing the storm job itself is easy and almost as concise as in the batch use case:
.. code-block:: scala
object TimelinesRealTimeAggregatesJob extends RealTimeAggregatesJobBase {
override lazy val statsReceiver = DefaultStatsReceiver.scope("timelines_real_time_aggregates")
override lazy val jobConfigs = TimelinesRealTimeAggregatesJobConfigs
override lazy val aggregatesToCompute = TimelinesOnlineAggregationConfig.AggregatesToCompute
}
.. NOTE::
There are some topology settings that are currently hard-coded. In particular, we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` to be true for added robustness. This may be made user-definable in the future.
Steps to hydrate RTAs
---------------------
1. Make the changes to RTAs and follow the steps for `Running the topology`.
2. Register the new RTAs to feature store. Sample phab: https://phabricator.twitter.biz/D718120
3. Wire the features from feature store to TLX. This is usually done with the feature switch set to False, so it's just a code change and will not yet start hydrating the features. Merge the phab. Sample phab: https://phabricator.twitter.biz/D718424
4. Now we hydrate the features to TLX gradually by doing it shard wise. For this, first create a PCM and then enable the hydration. Sample PCM: https://jira.twitter.biz/browse/PCM-147814
Running the topology
--------------------
0. For a phab that changes the topology (such as adding new ML features), before landing the phab, please create a PCM (`example <https://jira.twitter.biz/browse/PCM-131614>`_) and deploy the change to the devel topology first and then to prod (atla and pdxa). Once it is confirmed that the prod topology can handle the change, the phab can be landed.
1. Go to https://ci.twitter.biz/job/tq-ci/build
2. In `commands` input
.. code-block:: bash
. src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/deploy_local.sh [devel|atla|pdxa]
One can only deploy either `devel`, `atla` (prod atla), `pdxa` (prod pdxa) at a time.
For example, to deploy both pdxa and atla prod topologies, one needs to build/run the above steps twice, one with `pdxa` and the other with `atla`.
The status and performance stats of the topology are found at `go/heron-ui <http://heron-ui-new--prod--heron.service.pdxa.twitter.biz/topologies>`_. Here you can view whether the job is processing tuples and whether it is under any memory pressure or backpressure; the UI also provides general observability.
Finally, since we enable `Config.TOPOLOGY_DROPTUPLES_UPON_BACKPRESSURE` by default in the topology, we also need to monitor and alert on the number of dropped tuples. Since this is a job generating features, a small fraction of dropped tuples is tolerable if that enables us to avoid backpressure that would hold up global computation in the entire graph.
Hydrating Real-Time Aggregate Features
--------------------------------------
Once the job is up and running, the aggregate features will be accessible in the backing memcached store. To access these features and hydrate to your online pipeline, we need to build a Memcache client with the right query key.
.. admonition:: Example
Some care needs to be taken to define the key injection and codec correctly for the memcached store. These types do not change and you can use the Timelines `memcache client builder <https://cgit.twitter.biz/source/tree/timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/real_time_aggregates_cache/RealTimeAggregatesMemcacheBuilder.scala>`_ as an example.
Aggregate features are written to store with a `(AggregationKey, BatchID)` key.
`AggregationKey <https://cgit.twitter.biz/source/tree/timelines/data_processing/ml_util/aggregation_framework/AggregationKey.scala#n31>`_ is an instance of the keys that you previously defined in `AggregateGroup`. If your aggregation key is `USER_ID`, you would need to instantiate `AggregationKey` with the `USER_ID` featureId and the userId value.
.. admonition:: Returned features
The `DataRecord` that is returned by the cache now contains all real-time aggregate features for the query `AggregationKey` (similar to the batch use case). If your online hydration flow produces data records, the real-time aggregate features can be joined with your existing records in a straightforward way.
Adding features from Feature Store to RTA
--------------------------------------------
To add features from Feature Store to RTA and create real time aggregated features based on them, one needs to follow these steps:
**Step 1**
Copy Strato column for features that one wants to explore and add a cache if needed. See details at `Customize any Columns for your Team as Needed <https://docbird.twitter.biz/ml_feature_store/productionisation-checklist.html?highlight=manhattan#customize-any-columns-for-your-team-as-needed>`_. As an `example <https://phabricator.twitter.biz/D441050>`_, we copy Strato column of recommendationsUserFeaturesProd.User.strato and add a cache for timelines team's usage.
**Step 2**
Create a new ReadableStore which uses Feature Store Client to request features from Feature Store. Implement FeaturesAdapter which extends TimelinesAdapterBase and derive new features based on raw features from Feature Store. As an `example <https://phabricator.twitter.biz/D458168>`_, we create UserFeaturesReadableStore which reads discrete feature user state, and convert it to a list of boolean user state features.
**Step 3**
Join these derived features from Feature Store to timelines storm aggregate source. Depending on the characteristics of these derived features, the join key could be tweet id, user id or others. As an `example <https://phabricator.twitter.biz/D454408>`_, because user state is per user, the join key is user id.
**Step 4**
Define `AggregateGroup` based on derived features in RTA
Adding New Aggregate Features from an Existing Dataset
------------------------------------------------------
To add a new aggregate feature group from an existing dataset for use in home models, use the following steps:
1. Identify the hypothesis being tested by the addition of the features, in accordance with `go/tpfeatureguide <http://go/tpfeatureguide>`_.
2. Modify or add a new AggregateGroup to `TimelinesOnlineAggregationConfigBase.scala <https://sourcegraph.twitter.biz/git.twitter.biz/source/-/blob/src/scala/com/twitter/timelines/prediction/common/aggregates/real_time/TimelinesOnlineAggregationConfigBase.scala>`_ to define the aggregation key, set of features, labels and metrics. An example phab to add more halflives can be found at `D204415 <https://phabricator.twitter.biz/D204415>`_.
3. If the change is expected to be very large, it may be recommended to perform capacity estimation. See :ref:`Capacity Estimation` for more details.
4. Create feature catalog items for the new RTAs. An example phab is `D706348 <https://phabricator.twitter.biz/D706438>`_. For approval from a featurestore owner ping #help-ml-features on slack.
5. Add new features to the featurestore. An example phab is `D706112 <https://phabricator.twitter.biz/D706112>`_. This change can be rolled out with feature switches or by canarying TLX, depending on the risk. An example PCM for feature switches is: `PCM-148654 <https://jira.twitter.biz/browse/PCM-148654>`_. An example PCM for canarying is: `PCM-145753 <https://jira.twitter.biz/browse/PCM-145753>`_.
6. Wait for redeploy and confirm the new features are available. One way is querying in BigQuery from a table like `twitter-bq-timelines-prod.continuous_training_recap_fav`. Another way is to inspect individual records using pcat. The command to be used is like:
.. code-block:: bash
java -cp pcat-deploy.jar:$(hadoop classpath) com.twitter.ml.tool.pcat.PredictionCatTool
-path /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data/YYYY/MM/DD/01/part-00000.lzo
-fc /atla/proc2/user/timelines/processed/suggests/recap/continuous_training_data_records/fav/data_spec.json
-dates YYYY-MM-DDT01 -record_limit 100 | grep [feature_group]
7. Create a phab with the new features and test the performance of a model with them compared to a control model without them. Test offline using `Deepbird for training <https://docbird.twitter.biz/tq_gcp_guide/deepbird.html>`_ to train and `RCE Hypothesis Testing <https://docbird.twitter.biz/Timelines_Deepbird_v2/training.html#model-evaluation-rce-hypothesis-testing>`_ to test. Test online using a DDG. Some helpful instructions are available in `Serving Timelines Models <https://docbird.twitter.biz/timelines_deepbird_v2/serving.html>`_ and the `Experiment Cookbook <https://docs.google.com/document/d/1FTaqd_XOzdTppzePeipLhAgYA9hercN5a_SyQXbuGws/edit#>`_.
Capacity Estimation
--------------------------------
This section describes how to approximate the capacity required for a new aggregate group. It is not expected to be exact, but should give a rough estimate.
There are two main components that must be stored for each aggregate group.
Key space: Each AggregationKey struct consists of two maps, one of which is populated with tuples [Long, Long] representing <featureId, value> of discrete features. This takes up 4 x 8 bytes or 32 bytes. The cache team estimates an additional 40 bytes of overhead.
Features: An aggregate feature is represented as a <Long, Double> pair (16 bytes) and is produced for each feature x label x metric x halflife combination.
1. Use bigquery to estimate how many unique values exist for the selected key (key_count). Also collect the number of features, labels, metrics, and half-lives being used.
2. Compute the number of entries to be created, which is num_entries = feature_count * label_count * metric_count * halflife_count
3. Compute the number of bytes per entry, which is num_entry_bytes = 16*num_entries + 32 bytes (key storage) + 40 bytes (overhead)
4. Compute total space required = num_entry_bytes * key_count
Debugging New Aggregate Features
--------------------------------
To debug problems in the setup of your job, there are several steps you can take.
First, ensure that data is being received from the input stream and passed through to create data records. This can be achieved by logging results at various places in your code, and especially at the point of data record creation.
For example, suppose you want to ensure that a data record is being created with
the features you expect. With push and email features, we find that data records
are created in the adaptor, using logic like the following:
.. code-block:: scala
val record = new SRichDataRecord(new DataRecord)
...
record.setFeatureValue(feature, value)
To see what these feature values look like, we can have our adaptor class extend
Twitter's `Logging` trait, and write each created record to a log file.
.. code-block:: scala
class MyEventAdaptor extends TimelinesAdapterBase[MyObject] with Logging {
...
...
def mkDataRecord(myFeatures: MyFeatures): DataRecord = {
val record = new SRichDataRecord(new DataRecord)
...
record.setFeatureValue(feature, value)
logger.info("data record xyz: " + record.getRecord.toString)
}
This way, every time a data record is sent to the aggregator, it will also be
logged. To inspect these logs, you can push these changes to a staging instance,
ssh into that aurora instance, and grep the `log-files` directory for `xyz`. The
data record objects you find should resemble a map from feature ids to their
values.
To check that steps in the aggregation are being performed, you can also inspect the job's topology on go/heronui.
Lastly, to verify that values are being written to your cache you can check the `set` chart in your cache's viz.
To check particular feature values for a given key, you can spin up a Scala REPL like so:
.. code-block:: bash
$ ssh -fN -L*:2181:sdzookeeper-read.atla.twitter.com:2181 -D *:50001 nest.atlc.twitter.com
$ ./pants repl --jvm-repl-scala-options='-DsocksProxyHost=localhost -DsocksProxyPort=50001 -Dcom.twitter.server.resolverZkHosts=localhost:2181' timelinemixer/common/src/main/scala/com/twitter/timelinemixer/clients/real_time_aggregates_cache
You will then need to create a connection to the cache, and a key with which to query it.
.. code-block:: scala
import com.twitter.conversions.DurationOps._
import com.twitter.finagle.stats.{DefaultStatsReceiver, StatsReceiver}
import com.twitter.timelines.data_processing.ml_util.aggregation_framework.AggregationKey
import com.twitter.summingbird.batch.Batcher
import com.twitter.timelinemixer.clients.real_time_aggregates_cache.RealTimeAggregatesMemcacheBuilder
import com.twitter.timelines.clients.memcache_common.StorehausMemcacheConfig
val userFeature = -1887718638306251279L // feature id corresponding to User feature
val userId = 12L // replace with a user id logged when creating your data record
val key = (AggregationKey(Map(userFeature -> userId), Map.empty), Batcher.unit.currentBatch)
val dataset = "twemcache_magicrecs_real_time_aggregates_cache_staging" // replace with the appropriate cache name
val dest = s"/srv#/test/local/cache/twemcache_/$dataset"
val statsReceiver: StatsReceiver = DefaultStatsReceiver
val cache = new RealTimeAggregatesMemcacheBuilder(
config = StorehausMemcacheConfig(
destName = dest,
keyPrefix = "",
requestTimeout = 10.seconds,
numTries = 1,
globalTimeout = 10.seconds,
tcpConnectTimeout = 10.seconds,
connectionAcquisitionTimeout = 10.seconds,
numPendingRequests = 250,
isReadOnly = true
),
statsReceiver.scope(dataset)
).build
val result = cache.get(key)
Another option is to create a debugger which points to the staging cache and creates a cache connection and key similar to the logic above.
Run CQL query to find metrics/counters
--------------------------------------
We can also visualize the counters from our job to verify new features. Run CQL query on terminal to find the right path of metrics/counters. For example, in order to check counter mergeNumFeatures, run:
cql -z atla keys heron/summingbird_timelines_real_time_aggregates Tail-FlatMap | grep mergeNumFeatures
Then use the right path to create the viz, example: https://monitoring.twitter.biz/tiny/2552105

Some files were not shown because too many files have changed in this diff Show More