the-algorithm/pushservice/src/main/scala/com/twitter/frigate/pushservice/ml/HealthFeatureGetter.scala

221 lines
12 KiB
Scala

package com.twitter.frigate.pushservice.ml
import com.twitter.abuse.detection.scoring.thriftscala.{Model => TweetHealthModel}
import com.twitter.abuse.detection.scoring.thriftscala.TweetScoringRequest
import com.twitter.abuse.detection.scoring.thriftscala.TweetScoringResponse
import com.twitter.frigate.common.base.FeatureMap
import com.twitter.frigate.common.base.TweetAuthor
import com.twitter.frigate.common.base.TweetAuthorDetails
import com.twitter.frigate.common.base.TweetCandidate
import com.twitter.frigate.common.rec_types.RecTypes
import com.twitter.frigate.pushservice.model.PushTypes.PushCandidate
import com.twitter.frigate.pushservice.params.PushConstants
import com.twitter.frigate.pushservice.params.PushFeatureSwitchParams
import com.twitter.frigate.pushservice.predicate.HealthPredicates.userHealthSignalValueToDouble
import com.twitter.frigate.pushservice.util.CandidateHydrationUtil
import com.twitter.frigate.pushservice.util.CandidateUtil
import com.twitter.frigate.pushservice.util.MediaAnnotationsUtil
import com.twitter.frigate.thriftscala.UserMediaRepresentation
import com.twitter.hss.api.thriftscala.SignalValue
import com.twitter.hss.api.thriftscala.UserHealthSignal
import com.twitter.hss.api.thriftscala.UserHealthSignal.AgathaCalibratedNsfwDouble
import com.twitter.hss.api.thriftscala.UserHealthSignal.NsfwTextUserScoreDouble
import com.twitter.hss.api.thriftscala.UserHealthSignalResponse
import com.twitter.storehaus.ReadableStore
import com.twitter.util.Future
import com.twitter.util.Time
object HealthFeatureGetter {
def getFeatures(
pushCandidate: PushCandidate,
producerMediaRepresentationStore: ReadableStore[Long, UserMediaRepresentation],
userHealthScoreStore: ReadableStore[Long, UserHealthSignalResponse],
tweetHealthScoreStoreOpt: Option[ReadableStore[TweetScoringRequest, TweetScoringResponse]] =
None
): Future[FeatureMap] = {
pushCandidate match {
case cand: PushCandidate with TweetCandidate with TweetAuthor with TweetAuthorDetails =>
val pMediaNsfwRequest =
TweetScoringRequest(cand.tweetId, TweetHealthModel.ExperimentalHealthModelScore4)
val pTweetTextNsfwRequest =
TweetScoringRequest(cand.tweetId, TweetHealthModel.ExperimentalHealthModelScore1)
cand.authorId match {
case Some(authorId) =>
Future
.join(
userHealthScoreStore.get(authorId),
producerMediaRepresentationStore.get(authorId),
tweetHealthScoreStoreOpt.map(_.get(pMediaNsfwRequest)).getOrElse(Future.None),
tweetHealthScoreStoreOpt.map(_.get(pTweetTextNsfwRequest)).getOrElse(Future.None),
cand.tweetAuthor
).map {
case (
healthSignalsResponseOpt,
producerMuOpt,
pMediaNsfwOpt,
pTweetTextNsfwOpt,
tweetAuthorOpt) =>
val healthSignalScoreMap = healthSignalsResponseOpt
.map(_.signalValues).getOrElse(Map.empty[UserHealthSignal, SignalValue])
val agathaNSFWScore = userHealthSignalValueToDouble(
healthSignalScoreMap
.getOrElse(AgathaCalibratedNsfwDouble, SignalValue.DoubleValue(0.5)))
val userTextNSFWScore = userHealthSignalValueToDouble(
healthSignalScoreMap
.getOrElse(NsfwTextUserScoreDouble, SignalValue.DoubleValue(0.15)))
val pMediaNsfwScore = pMediaNsfwOpt.map(_.score).getOrElse(0.0)
val pTweetTextNsfwScore = pTweetTextNsfwOpt.map(_.score).getOrElse(0.0)
val mediaRepresentationMap =
producerMuOpt.map(_.mediaRepresentation).getOrElse(Map.empty[String, Double])
val sumScore: Double = mediaRepresentationMap.values.sum
val nudityRate =
if (sumScore > 0)
mediaRepresentationMap.getOrElse(
MediaAnnotationsUtil.nudityCategoryId,
0.0) / sumScore
else 0.0
val beautyRate =
if (sumScore > 0)
mediaRepresentationMap.getOrElse(
MediaAnnotationsUtil.beautyCategoryId,
0.0) / sumScore
else 0.0
val singlePersonRate =
if (sumScore > 0)
mediaRepresentationMap.getOrElse(
MediaAnnotationsUtil.singlePersonCategoryId,
0.0) / sumScore
else 0.0
val dislikeCt = cand.numericFeatures.getOrElse(
"tweet.magic_recs_tweet_real_time_aggregates_v2.pair.v2.magicrecs.realtime.is_ntab_disliked.any_feature.Duration.Top.count",
0.0)
val sentCt = cand.numericFeatures.getOrElse(
"tweet.magic_recs_tweet_real_time_aggregates_v2.pair.v2.magicrecs.realtime.is_sent.any_feature.Duration.Top.count",
0.0)
val dislikeRate = if (sentCt > 0) dislikeCt / sentCt else 0.0
val authorDislikeCt = cand.numericFeatures.getOrElse(
"tweet_author_aggregate.pair.label.ntab.isDisliked.any_feature.28.days.count",
0.0)
val authorReportCt = cand.numericFeatures.getOrElse(
"tweet_author_aggregate.pair.label.reportTweetDone.any_feature.28.days.count",
0.0)
val authorSentCt = cand.numericFeatures
.getOrElse(
"tweet_author_aggregate.pair.any_label.any_feature.28.days.count",
0.0)
val authorDislikeRate =
if (authorSentCt > 0) authorDislikeCt / authorSentCt else 0.0
val authorReportRate =
if (authorSentCt > 0) authorReportCt / authorSentCt else 0.0
val (isNsfwAccount, authorAccountAge) = tweetAuthorOpt match {
case Some(tweetAuthor) =>
(
CandidateHydrationUtil.isNsfwAccount(
tweetAuthor,
cand.target.params(PushFeatureSwitchParams.NsfwTokensParam)),
(Time.now - Time.fromMilliseconds(tweetAuthor.createdAtMsec)).inHours
)
case _ => (false, 0)
}
val tweetSemanticCoreIds = cand.sparseBinaryFeatures
.getOrElse(PushConstants.TweetSemanticCoreIdFeature, Set.empty[String])
val continuousFeatures = Map[String, Double](
"agathaNsfwScore" -> agathaNSFWScore,
"textNsfwScore" -> userTextNSFWScore,
"pMediaNsfwScore" -> pMediaNsfwScore,
"pTweetTextNsfwScore" -> pTweetTextNsfwScore,
"nudityRate" -> nudityRate,
"beautyRate" -> beautyRate,
"singlePersonRate" -> singlePersonRate,
"numSources" -> CandidateUtil.getTagsCRCount(cand),
"favCount" -> cand.numericFeatures
.getOrElse("tweet.core.tweet_counts.favorite_count", 0.0),
"activeFollowers" -> cand.numericFeatures
.getOrElse("RecTweetAuthor.User.ActiveFollowers", 0.0),
"favorsRcvd28Days" -> cand.numericFeatures
.getOrElse("RecTweetAuthor.User.FavorsRcvd28Days", 0.0),
"tweets28Days" -> cand.numericFeatures
.getOrElse("RecTweetAuthor.User.Tweets28Days", 0.0),
"dislikeCount" -> dislikeCt,
"dislikeRate" -> dislikeRate,
"sentCount" -> sentCt,
"authorDislikeCount" -> authorDislikeCt,
"authorDislikeRate" -> authorDislikeRate,
"authorReportCount" -> authorReportCt,
"authorReportRate" -> authorReportRate,
"authorSentCount" -> authorSentCt,
"authorAgeInHour" -> authorAccountAge.toDouble
)
val booleanFeatures = Map[String, Boolean](
"isSimclusterBased" -> RecTypes.simclusterBasedTweets
.contains(cand.commonRecType),
"isTopicTweet" -> RecTypes.isTopicTweetType(cand.commonRecType),
"isHashSpace" -> RecTypes.tagspaceTypes.contains(cand.commonRecType),
"isFRS" -> RecTypes.frsTypes.contains(cand.commonRecType),
"isModelingBased" -> RecTypes.mrModelingBasedTypes.contains(cand.commonRecType),
"isGeoPop" -> RecTypes.GeoPopTweetTypes.contains(cand.commonRecType),
"hasPhoto" -> cand.booleanFeatures
.getOrElse("RecTweet.TweetyPieResult.HasPhoto", false),
"hasVideo" -> cand.booleanFeatures
.getOrElse("RecTweet.TweetyPieResult.HasVideo", false),
"hasUrl" -> cand.booleanFeatures
.getOrElse("RecTweet.TweetyPieResult.HasUrl", false),
"isMrTwistly" -> CandidateUtil.isMrTwistlyCandidate(cand),
"abuseStrikeTop2Percent" -> tweetSemanticCoreIds.contains(
PushConstants.AbuseStrike_Top2Percent_Id),
"abuseStrikeTop1Percent" -> tweetSemanticCoreIds.contains(
PushConstants.AbuseStrike_Top1Percent_Id),
"abuseStrikeTop05Percent" -> tweetSemanticCoreIds.contains(
PushConstants.AbuseStrike_Top05Percent_Id),
"abuseStrikeTop025Percent" -> tweetSemanticCoreIds.contains(
PushConstants.AbuseStrike_Top025Percent_Id),
"allSpamReportsPerFavTop1Percent" -> tweetSemanticCoreIds.contains(
PushConstants.AllSpamReportsPerFav_Top1Percent_Id),
"reportsPerFavTop1Percent" -> tweetSemanticCoreIds.contains(
PushConstants.ReportsPerFav_Top1Percent_Id),
"reportsPerFavTop2Percent" -> tweetSemanticCoreIds.contains(
PushConstants.ReportsPerFav_Top2Percent_Id),
"isNudity" -> tweetSemanticCoreIds.contains(
PushConstants.MediaUnderstanding_Nudity_Id),
"beautyStyleFashion" -> tweetSemanticCoreIds.contains(
PushConstants.MediaUnderstanding_Beauty_Id),
"singlePerson" -> tweetSemanticCoreIds.contains(
PushConstants.MediaUnderstanding_SinglePerson_Id),
"pornList" -> tweetSemanticCoreIds.contains(PushConstants.PornList_Id),
"pornographyAndNsfwContent" -> tweetSemanticCoreIds.contains(
PushConstants.PornographyAndNsfwContent_Id),
"sexLife" -> tweetSemanticCoreIds.contains(PushConstants.SexLife_Id),
"sexLifeOrSexualOrientation" -> tweetSemanticCoreIds.contains(
PushConstants.SexLifeOrSexualOrientation_Id),
"profanity" -> tweetSemanticCoreIds.contains(PushConstants.ProfanityFilter_Id),
"isVerified" -> cand.booleanFeatures
.getOrElse("RecTweetAuthor.User.IsVerified", false),
"hasNsfwToken" -> isNsfwAccount
)
val stringFeatures = Map[String, String](
"tweetLanguage" -> cand.categoricalFeatures
.getOrElse("tweet.core.tweet_text.language", "")
)
FeatureMap(
booleanFeatures = booleanFeatures,
numericFeatures = continuousFeatures,
categoricalFeatures = stringFeatures)
}
case _ => Future.value(FeatureMap())
}
case _ => Future.value(FeatureMap())
}
}
}