mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-30 06:56:08 +02:00
252 lines
11 KiB
Scala
252 lines
11 KiB
Scala
|
package com.twitter.home_mixer.functional_component.feature_hydrator
|
||
|
|
||
|
import com.twitter.conversions.DurationOps._
|
||
|
import com.twitter.home_mixer.model.HomeFeatures.EarlybirdFeature
|
||
|
import com.twitter.home_mixer.model.HomeFeatures.NonPollingTimesFeature
|
||
|
import com.twitter.home_mixer.model.HomeFeatures.SourceTweetIdFeature
|
||
|
import com.twitter.ml.api.DataRecord
|
||
|
import com.twitter.ml.api.RichDataRecord
|
||
|
import com.twitter.ml.api.util.FDsl._
|
||
|
import com.twitter.product_mixer.component_library.model.candidate.TweetCandidate
|
||
|
import com.twitter.product_mixer.core.feature.Feature
|
||
|
import com.twitter.product_mixer.core.feature.FeatureWithDefaultOnFailure
|
||
|
import com.twitter.product_mixer.core.feature.datarecord.DataRecordInAFeature
|
||
|
import com.twitter.product_mixer.core.feature.featuremap.FeatureMap
|
||
|
import com.twitter.product_mixer.core.feature.featuremap.FeatureMapBuilder
|
||
|
import com.twitter.product_mixer.core.functional_component.feature_hydrator.CandidateFeatureHydrator
|
||
|
import com.twitter.product_mixer.core.model.common.identifier.FeatureHydratorIdentifier
|
||
|
import com.twitter.product_mixer.core.pipeline.PipelineQuery
|
||
|
import com.twitter.search.common.features.{thriftscala => sc}
|
||
|
import com.twitter.snowflake.id.SnowflakeId
|
||
|
import com.twitter.stitch.Stitch
|
||
|
import com.twitter.timelines.prediction.features.time_features.AccountAgeInterval
|
||
|
import com.twitter.timelines.prediction.features.time_features.TimeDataRecordFeatures._
|
||
|
import com.twitter.timelines.prediction.features.time_features.TimeFeatures
|
||
|
import com.twitter.util.Duration
|
||
|
import scala.collection.Searching._
|
||
|
|
||
|
/**
 * Candidate-level feature holding the [[DataRecord]] of time features for a
 * [[TweetCandidate]].
 *
 * Mixes in [[FeatureWithDefaultOnFailure]] and supplies a newly constructed
 * [[DataRecord]] as the default (presumably substituted by the framework when
 * hydration fails -- confirm against product-mixer).
 */
object TimeFeaturesDataRecordFeature
    extends DataRecordInAFeature[TweetCandidate]
    with FeatureWithDefaultOnFailure[TweetCandidate, DataRecord] {

  // Declared as a `def`, so every access constructs a new record instance.
  override def defaultValue: DataRecord = {
    new DataRecord()
  }
}
|
||
|
|
||
|
object TimeFeaturesHydrator extends CandidateFeatureHydrator[PipelineQuery, TweetCandidate] {
|
||
|
|
||
|
// Identifier of this hydrator, constructed with the name "TimeFeatures".
override val identifier: FeatureHydratorIdentifier = FeatureHydratorIdentifier("TimeFeatures")
|
||
|
|
||
|
// The single feature this hydrator declares: the time features data record added in apply().
override val features: Set[Feature[_, _]] = Set(TimeFeaturesDataRecordFeature)
|
||
|
|
||
|
/**
 * Hydrates [[TimeFeaturesDataRecordFeature]] for a single candidate.
 *
 * A new [[RichDataRecord]] is created and handed to `setTimeFeatures`; its
 * underlying record is then stored in the returned feature map. The record is
 * added even when `setTimeFeatures` writes nothing into it.
 *
 * @param query            the pipeline query for the current request
 * @param candidate        the tweet candidate being hydrated
 * @param existingFeatures features already available for this candidate
 * @return a [[Stitch]] wrapping a feature map with the time features record
 */
override def apply(
  query: PipelineQuery,
  candidate: TweetCandidate,
  existingFeatures: FeatureMap
): Stitch[FeatureMap] = {
  Stitch.value {
    val timeFeaturesRecord = new RichDataRecord()
    setTimeFeatures(
      richDataRecord = timeFeaturesRecord,
      candidate = candidate,
      existingFeatures = existingFeatures,
      query = query
    )

    val featureMapBuilder = FeatureMapBuilder()
    featureMapBuilder
      .add(TimeFeaturesDataRecordFeature, timeFeaturesRecord.getRecord)
      .build()
  }
}
|
||
|
|
||
|
/**
 * Computes the time features for the candidate and, when they are available,
 * writes them into `richDataRecord`. Nothing is written when `getTimeFeatures`
 * yields `None`.
 *
 * @param richDataRecord   record that receives the feature values
 * @param candidate        the tweet candidate the features describe
 * @param existingFeatures features already available for this candidate
 * @param query            the pipeline query for the current request
 */
private def setTimeFeatures(
  richDataRecord: RichDataRecord,
  candidate: TweetCandidate,
  existingFeatures: FeatureMap,
  query: PipelineQuery,
): Unit =
  getTimeFeatures(query, candidate, existingFeatures)
    .foreach(setFeatures(_, richDataRecord))
|
||
|
|
||
|
/**
 * Derives the [[TimeFeatures]] of a candidate tweet relative to the request time.
 *
 * Yields `None` when the query carries no viewer id, when the candidate id is not
 * a Snowflake id (so no timestamp can be read from it), or when either the tweet
 * age or the source tweet age is not strictly positive.
 *
 * @param query            supplies the request time, the optional viewer id and the
 *                         optional query-level features (read for [[NonPollingTimesFeature]])
 * @param candidate        tweet whose id is used to obtain its creation time
 * @param existingFeatures candidate features already hydrated; read for
 *                         [[SourceTweetIdFeature]] and [[EarlybirdFeature]]
 * @return the populated time features, or `None` as described above
 */
private[feature_hydrator] def getTimeFeatures(
  query: PipelineQuery,
  candidate: TweetCandidate,
  existingFeatures: FeatureMap,
): Option[TimeFeatures] = {
  val requestTimestampMs = query.queryTime.inMilliseconds

  for {
    viewerId <- query.getOptionalUserId
    tweetCreationTimeMs <- timeFromTweetOrUserId(candidate.id)
    timeSinceTweetCreation = requestTimestampMs - tweetCreationTimeMs
    // Remains empty when the viewer id is not a Snowflake id.
    accountAgeDurationOpt = timeFromTweetOrUserId(viewerId)
      .map(accountCreatedAtMs => Duration.fromMilliseconds(requestTimestampMs - accountCreatedAtMs))
    // Age of the source tweet when its id is present and is a Snowflake id;
    // otherwise the candidate tweet's own age is used.
    timeSinceSourceTweetCreation = existingFeatures
      .getOrElse(SourceTweetIdFeature, None)
      .flatMap(sourceTweetId => timeFromTweetOrUserId(sourceTweetId))
      .fold(timeSinceTweetCreation)(sourceCreatedAtMs => requestTimestampMs - sourceCreatedAtMs)
    if (timeSinceTweetCreation > 0 && timeSinceSourceTweetCreation > 0)
  } yield {
    val baseTimeFeatures = TimeFeatures(
      timeSinceTweetCreation = timeSinceTweetCreation,
      timeSinceSourceTweetCreation = timeSinceSourceTweetCreation,
      timeSinceViewerAccountCreationSecs = accountAgeDurationOpt.map(_.inSeconds),
      // Both new-user flags are false when the account age is unknown.
      isDay30NewUser = accountAgeDurationOpt.exists(_ < 30.days),
      // "12 months" is measured as 365 days.
      isMonth12NewUser = accountAgeDurationOpt.exists(_ < 365.days),
      accountAgeInterval = accountAgeDurationOpt.flatMap(AccountAgeInterval.fromDuration),
      isTweetRecycled = false // only set in RecyclableTweetCandidateFilter, but it's not used
    )

    // Layer on last-engagement features when Earlybird features are present.
    val earlybirdFeaturesOpt = existingFeatures.getOrElse(EarlybirdFeature, None)
    val withLastEngagement =
      addLastEngagementTimeFeatures(
        earlybirdFeaturesOpt,
        baseTimeFeatures,
        timeSinceSourceTweetCreation
      ).getOrElse(baseTimeFeatures)

    // Layer on non-polling features when the query provides such timestamps.
    val nonPollingTimestampsMs =
      query.features.map(_.getOrElse(NonPollingTimesFeature, Seq.empty))

    addNonPollingTimeFeatures(
      withLastEngagement,
      requestTimestampMs,
      tweetCreationTimeMs,
      nonPollingTimestampsMs
    ).getOrElse(withLastEngagement)
  }
}
|
||
|
|
||
|
/**
 * Reads the timestamp carried by a tweet or user id.
 *
 * @param tweetOrUserId the id to inspect
 * @return the id's time in milliseconds when `SnowflakeId.isSnowflakeId` accepts
 *         the id, otherwise `None`
 */
private def timeFromTweetOrUserId(tweetOrUserId: Long): Option[Long] =
  Some(tweetOrUserId)
    .filter(id => SnowflakeId.isSnowflakeId(id))
    .map(id => SnowflakeId(id).time.inMilliseconds)
|
||
|
|
||
|
/**
 * Copies `timeFeatures` with the last-engagement fields filled in from the given
 * tweet features.
 *
 * For each of favorite, retweet, reply and quote, two values are set: the
 * "last engagement since creation" hours taken from `tweetFeaturesOpt`
 * (converted to `Double`), and the hours since that engagement, computed by
 * `getTimeSinceLastEngagementHrs` against `timeSinceSourceTweetCreation`.
 *
 * @param tweetFeaturesOpt             tweet features to read from, if any
 * @param timeFeatures                 the features to extend
 * @param timeSinceSourceTweetCreation age of the source tweet, in the unit expected by
 *                                     `getTimeSinceLastEngagementHrs`
 * @return the extended features, or `None` when `tweetFeaturesOpt` is empty
 */
private def addLastEngagementTimeFeatures(
  tweetFeaturesOpt: Option[sc.ThriftTweetFeatures],
  timeFeatures: TimeFeatures,
  timeSinceSourceTweetCreation: Long
): Option[TimeFeatures] = {
  // Every "time since last X" value is measured against the source tweet's age.
  def hoursSinceLast(lastEngagementSinceCreationHrs: Option[Double]): Option[Double] =
    getTimeSinceLastEngagementHrs(lastEngagementSinceCreationHrs, timeSinceSourceTweetCreation)

  for (tweetFeatures <- tweetFeaturesOpt) yield {
    val favHrs = tweetFeatures.lastFavSinceCreationHrs.map(_.toDouble)
    val retweetHrs = tweetFeatures.lastRetweetSinceCreationHrs.map(_.toDouble)
    val replyHrs = tweetFeatures.lastReplySinceCreationHrs.map(_.toDouble)
    val quoteHrs = tweetFeatures.lastQuoteSinceCreationHrs.map(_.toDouble)

    timeFeatures.copy(
      lastFavSinceCreationHrs = favHrs,
      lastRetweetSinceCreationHrs = retweetHrs,
      lastReplySinceCreationHrs = replyHrs,
      lastQuoteSinceCreationHrs = quoteHrs,
      timeSinceLastFavoriteHrs = hoursSinceLast(favHrs),
      timeSinceLastRetweetHrs = hoursSinceLast(retweetHrs),
      timeSinceLastReplyHrs = hoursSinceLast(replyHrs),
      timeSinceLastQuoteHrs = hoursSinceLast(quoteHrs)
    )
  }
}
|
||
|
|
||
|
/**
 * Copies `timeFeatures` with the features derived from the viewer's non-polling
 * request timestamps filled in.
 *
 * The timestamps are expected in reverse chronological order (latest first): the
 * head is used as the most recent non-polling request and the last element as
 * the earliest one.
 *
 * @param timeFeatures           the features to extend
 * @param requestTimestampMs     time of the current request, in milliseconds
 * @param creationTimeMs         creation time of the tweet, in milliseconds
 * @param nonPollingTimestampsMs the viewer's non-polling request times, if available
 * @return the extended features, or `None` when the timestamps are absent or empty
 */
private def addNonPollingTimeFeatures(
  timeFeatures: TimeFeatures,
  requestTimestampMs: Long,
  creationTimeMs: Long,
  nonPollingTimestampsMs: Option[Seq[Long]]
): Option[TimeFeatures] = {
  for {
    timestampsMs <- nonPollingTimestampsMs
    // Both generators below fail on an empty sequence, so `timestampsMs` is
    // guaranteed to be non-empty inside the `yield`.
    lastNonPollingTimestampMs <- timestampsMs.headOption
    earliestNonPollingTimestampMs <- timestampsMs.lastOption
  } yield {
    val timeSinceLastNonPollingRequest = requestTimestampMs - lastNonPollingTimestampMs

    // The divisor is clamped to 1.0 so a zero or negative tweet age cannot cause
    // a division by zero or flip the sign of the ratio.
    val tweetAgeRatio =
      timeSinceLastNonPollingRequest / math.max(1.0, timeFeatures.timeSinceTweetCreation)

    /*
     * Non-polling timestamps are stored in reverse chronological order (the latest
     * timestamps occur first), therefore the search must use the reversed ordering.
     * The insertion point of the tweet's creation time is the number of timestamps
     * that precede it in that descending sequence.
     */
    val nonPollingRequestsSinceTweetCreation =
      timestampsMs.search(creationTimeMs)(Ordering[Long].reverse).insertionPoint

    /*
     * Calculate the average time between non-polling requests; include
     * request time in this calculation as latest timestamp.
     * The size is at least 1 here, so it can be used as the divisor directly.
     */
    val timeBetweenNonPollingRequestsAvg =
      (requestTimestampMs - earliestNonPollingTimestampMs) / timestampsMs.size.toDouble

    timeFeatures.copy(
      timeBetweenNonPollingRequestsAvg = Some(timeBetweenNonPollingRequestsAvg),
      timeSinceLastNonPollingRequest = Some(timeSinceLastNonPollingRequest),
      nonPollingRequestsSinceTweetCreation = Some(nonPollingRequestsSinceTweetCreation),
      tweetAgeRatio = Some(tweetAgeRatio)
    )
  }
}
|
||
|
|
||
|
/**
 * Computes how many hours ago the last engagement happened.
 *
 * @param lastEngagementTimeSinceCreationHrsOpt hours between tweet creation and the
 *                                              last engagement, if known
 * @param timeSinceTweetCreation                age of the tweet in milliseconds
 * @return the tweet age in whole hours minus the engagement offset, or `None` when
 *         the offset is absent
 */
private[this] def getTimeSinceLastEngagementHrs(
  lastEngagementTimeSinceCreationHrsOpt: Option[Double],
  timeSinceTweetCreation: Long
): Option[Double] = {
  val millisPerHour = 60 * 60 * 1000
  // Integer division: any fractional hour of the tweet age is dropped.
  val tweetAgeHrs = (timeSinceTweetCreation / millisPerHour).toInt

  for (lastEngagementHrs <- lastEngagementTimeSinceCreationHrsOpt)
    yield tweetAgeHrs - lastEngagementHrs
}
|
||
|
|
||
|
/**
 * Writes the values of `features` into the record underlying `richDataRecord`.
 *
 * Plain values are always set; `Option`-typed values are set only when present.
 *
 * @param features       the computed time features
 * @param richDataRecord wrapper around the record that receives the values
 */
private def setFeatures(features: TimeFeatures, richDataRecord: RichDataRecord): Unit = {
  val viewerAccountAgeSecs = features.timeSinceViewerAccountCreationSecs

  val populated = richDataRecord.getRecord
    .setFeatureValue(IS_TWEET_RECYCLED, features.isTweetRecycled)
    .setFeatureValue(TIME_SINCE_TWEET_CREATION, features.timeSinceTweetCreation)
    // Viewer account age; the Snowflake flag is true exactly when that age is present.
    .setFeatureValueFromOption(TIME_SINCE_VIEWER_ACCOUNT_CREATION_SECS, viewerAccountAgeSecs)
    .setFeatureValue(USER_ID_IS_SNOWFLAKE_ID, viewerAccountAgeSecs.nonEmpty)
    .setFeatureValueFromOption(ACCOUNT_AGE_INTERVAL, features.accountAgeInterval.map(_.id.toLong))
    .setFeatureValue(IS_30_DAY_NEW_USER, features.isDay30NewUser)
    .setFeatureValue(IS_12_MONTH_NEW_USER, features.isMonth12NewUser)
    // Hours from tweet creation to the last engagement of each kind.
    .setFeatureValueFromOption(LAST_FAVORITE_SINCE_CREATION_HRS, features.lastFavSinceCreationHrs)
    .setFeatureValueFromOption(LAST_RETWEET_SINCE_CREATION_HRS, features.lastRetweetSinceCreationHrs)
    .setFeatureValueFromOption(LAST_REPLY_SINCE_CREATION_HRS, features.lastReplySinceCreationHrs)
    .setFeatureValueFromOption(LAST_QUOTE_SINCE_CREATION_HRS, features.lastQuoteSinceCreationHrs)
    // Hours elapsed since the last engagement of each kind.
    .setFeatureValueFromOption(TIME_SINCE_LAST_FAVORITE_HRS, features.timeSinceLastFavoriteHrs)
    .setFeatureValueFromOption(TIME_SINCE_LAST_RETWEET_HRS, features.timeSinceLastRetweetHrs)
    .setFeatureValueFromOption(TIME_SINCE_LAST_REPLY_HRS, features.timeSinceLastReplyHrs)
    .setFeatureValueFromOption(TIME_SINCE_LAST_QUOTE_HRS, features.timeSinceLastQuoteHrs)

  /*
   * set features whose values are optional as some users do not have non-polling timestamps
   */
  for (avg <- features.timeBetweenNonPollingRequestsAvg)
    populated.setFeatureValue(TIME_BETWEEN_NON_POLLING_REQUESTS_AVG, avg)

  for (sinceLast <- features.timeSinceLastNonPollingRequest)
    populated.setFeatureValue(TIME_SINCE_LAST_NON_POLLING_REQUEST, sinceLast)

  for (requestCount <- features.nonPollingRequestsSinceTweetCreation)
    populated.setFeatureValue(NON_POLLING_REQUESTS_SINCE_TWEET_CREATION, requestCount)

  for (ratio <- features.tweetAgeRatio)
    populated.setFeatureValue(TWEET_AGE_RATIO, ratio)
}
|
||
|
}
|