the-algorithm/src/java/com/twitter/search/earlybird/search/relevance/scoring/FeatureBasedScoringFunction.java
2023-04-01 00:44:23 +01:00

1361 lines
59 KiB
Java

package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
import org.apache.lucene.search.Explanation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common_internal.bloomfilter.BloomFilter;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.features.ExternalTweetFeature;
import com.twitter.search.common.features.FeatureHandler;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaEntry;
import com.twitter.search.common.features.thrift.ThriftSearchFeatureType;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.query.QueryCommonFieldHitsVisitor;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.relevance.features.AgeDecay;
import com.twitter.search.common.relevance.features.RelevanceSignalConstants;
import com.twitter.search.common.relevance.text.VisibleTokenRatioNormalizer;
import com.twitter.search.common.results.thriftjava.FieldHitList;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.util.LongIntConverter;
import com.twitter.search.common.util.lang.ThriftLanguageUtil;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.search.relevance.LinearScoringData.SkipReason;
import com.twitter.search.earlybird.search.relevance.LinearScoringParams;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
import com.twitter.search.earlybird.thrift.ThriftSocialFilterType;
/**
* Base class for scoring functions that rely on the extracted features stored in LinearScoringData.
*
* Extensions of this class must implement 2 methods:
*
* - computeScore
* - generateExplanationForScoring
*
* They are called for scoring and generating the debug information of the document that is
* currently being evaluated. The field 'data' holds the features of the document.
*/
public abstract class FeatureBasedScoringFunction extends ScoringFunction {
private static final Logger LOG = LoggerFactory.getLogger(FeatureBasedScoringFunction.class);
// A multiplier that's applied to all scores to avoid scores too low.
public static final float SCORE_ADJUSTER = 100.0f;
private static final VisibleTokenRatioNormalizer VISIBLE_TOKEN_RATIO_NORMALIZER =
VisibleTokenRatioNormalizer.createInstance();
// Allow default values only for numeric types.
private static final Set<ThriftSearchFeatureType> ALLOWED_TYPES_FOR_DEFAULT_FEATURE_VALUES =
EnumSet.of(ThriftSearchFeatureType.INT32_VALUE,
ThriftSearchFeatureType.LONG_VALUE,
ThriftSearchFeatureType.DOUBLE_VALUE);
private static final Set<Integer> NUMERIC_FEATURES_FOR_WHICH_DEFAULTS_SHOULD_NOT_BE_SET =
ImmutableSet.of(EarlybirdFieldConstant.TWEET_SIGNATURE.getFieldId(),
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT.getFieldId(),
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT.getFieldId());
// Name of the scoring function. Used for generating explanations.
private final String functionName;
private final BloomFilter trustedFilter;
private final BloomFilter followFilter;
// Current timestamp in seconds. Overridable by unit test or by timestamp set in search query.
private int now;
private final AntiGamingFilter antiGamingFilter;
@Nullable
private final AgeDecay ageDecay;
protected final LinearScoringParams params; // Parameters and query-dependent values.
// In order for the API calls to retrieve the correct `LinearScoringData`
// for the passed `docId`, we need to maintain a map of `docId` -> `LinearScoringData`
// NOTE: THIS CAN ONLY BE REFERENCED AT HIT COLLECTION TIME, SINCE DOC IDS ARE NOT UNIQUE
// ACROSS SEGMENTS. IT'S NOT USABLE DURING BATCH SCORING.
private final Map<Integer, LinearScoringData> docIdToScoringData;
private final ThriftSearchResultType searchResultType;
private final UserTable userTable;
@VisibleForTesting
void setNow(int fakeNow) {
now = fakeNow;
}
public FeatureBasedScoringFunction(
String functionName,
ImmutableSchemaInterface schema,
ThriftSearchQuery searchQuery,
AntiGamingFilter antiGamingFilter,
ThriftSearchResultType searchResultType,
UserTable userTable) throws IOException {
super(schema);
this.functionName = functionName;
this.searchResultType = searchResultType;
this.userTable = userTable;
Preconditions.checkNotNull(searchQuery.getRelevanceOptions());
ThriftRankingParams rankingParams = searchQuery.getRelevanceOptions().getRankingParams();
Preconditions.checkNotNull(rankingParams);
params = new LinearScoringParams(searchQuery, rankingParams);
docIdToScoringData = new HashMap<>();
long timestamp = searchQuery.isSetTimestampMsecs() && searchQuery.getTimestampMsecs() > 0
? searchQuery.getTimestampMsecs() : System.currentTimeMillis();
now = Ints.checkedCast(TimeUnit.MILLISECONDS.toSeconds(timestamp));
this.antiGamingFilter = antiGamingFilter;
this.ageDecay = params.useAgeDecay
? new AgeDecay(params.ageDecayBase, params.ageDecayHalflife, params.ageDecaySlope)
: null;
if (searchQuery.isSetTrustedFilter()) {
trustedFilter = new BloomFilter(searchQuery.getTrustedFilter());
} else {
trustedFilter = null;
}
if (searchQuery.isSetDirectFollowFilter()) {
followFilter = new BloomFilter(searchQuery.getDirectFollowFilter());
} else {
followFilter = null;
}
}
@VisibleForTesting
final LinearScoringParams getScoringParams() {
return params;
}
/**
* Returns the LinearScoringData instance associated with the current doc ID. If it doesn't exist,
* an empty LinearScoringData is created.
*/
@Override
public LinearScoringData getScoringDataForCurrentDocument() {
LinearScoringData data = docIdToScoringData.get(getCurrentDocID());
if (data == null) {
data = new LinearScoringData();
docIdToScoringData.put(getCurrentDocID(), data);
}
return data;
}
@Override
public void setDebugMode(int debugMode) {
super.setDebugMode(debugMode);
}
/**
* Normal the lucene score, which was unbounded, to a range of [1.0, maxLuceneScoreBoost].
* The normalized value increases almost linearly in the lucene score range 2.0 ~ 7.0, where
* most queries fall in. For rare long tail queries, like some hashtags, they have high idf and
* thus high lucene score, the normalized value won't have much difference between tweets.
* The normalization function is:
* ls = luceneScore
* norm = min(max, 1 + (max - 1.0) / 2.4 * ln(1 + ls)
*/
static float normalizeLuceneScore(float luceneScore, float maxBoost) {
return (float) Math.min(maxBoost, 1.0 + (maxBoost - 1.0) / 2.4 * Math.log1p(luceneScore));
}
@Override
protected float score(float luceneQueryScore) throws IOException {
return scoreInternal(luceneQueryScore, null);
}
protected LinearScoringData updateLinearScoringData(float luceneQueryScore) throws IOException {
// Reset the data for each tweet!!!
LinearScoringData data = new LinearScoringData();
docIdToScoringData.put(getCurrentDocID(), data);
// Set proper version for engagement counters for this request.
data.skipReason = SkipReason.NOT_SKIPPED;
data.luceneScore = luceneQueryScore;
data.userRep = (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION);
if (antiGamingFilter != null && !antiGamingFilter.accept(getCurrentDocID())) {
data.skipReason = SkipReason.ANTIGAMING;
return data;
}
data.textScore = (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TEXT_SCORE);
data.tokenAt140DividedByNumTokensBucket = VISIBLE_TOKEN_RATIO_NORMALIZER.denormalize(
(byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO));
data.fromUserId = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
data.isFollow = followFilter != null
&& followFilter.contains(Longs.toByteArray(data.fromUserId));
data.isTrusted = trustedFilter != null
&& trustedFilter.contains(Longs.toByteArray(data.fromUserId));
data.isFromVerifiedAccount = documentFeatures.isFlagSet(
EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG);
data.isFromBlueVerifiedAccount = documentFeatures.isFlagSet(
EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG);
data.isSelfTweet = data.fromUserId == params.searcherId;
// v1 engagement counters, note that the first three values are post-log2 version
// of the original unnormalized values.
data.retweetCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.RETWEET_COUNT);
data.replyCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.REPLY_COUNT);
data.favCountPostLog2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.FAVORITE_COUNT);
data.embedsImpressionCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT);
data.embedsUrlCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EMBEDS_URL_COUNT);
data.videoViewCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.VIDEO_VIEW_COUNT);
// v2 engagement counters
data.retweetCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.RETWEET_COUNT_V2);
data.replyCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.REPLY_COUNT_V2);
data.favCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.FAVORITE_COUNT_V2);
// other v2 engagement counters
data.embedsImpressionCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2);
data.embedsUrlCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2);
data.videoViewCountV2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2);
// pure v2 engagement counters without v1 counterpart
data.quotedCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.QUOTE_COUNT);
data.weightedRetweetCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT);
data.weightedReplyCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT);
data.weightedFavCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT);
data.weightedQuoteCount = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT);
Double querySpecificScoreAdjustment = params.querySpecificScoreAdjustments == null ? null
: params.querySpecificScoreAdjustments.get(tweetIDMapper.getTweetID(getCurrentDocID()));
data.querySpecificScore =
querySpecificScoreAdjustment == null ? 0.0 : querySpecificScoreAdjustment;
data.authorSpecificScore = params.authorSpecificScoreAdjustments == null
? 0.0
: params.authorSpecificScoreAdjustments.getOrDefault(data.fromUserId, 0.0);
// respect social filter type
if (params.socialFilterType != null && !data.isSelfTweet) {
if ((params.socialFilterType == ThriftSocialFilterType.ALL
&& !data.isFollow && !data.isTrusted)
|| (params.socialFilterType == ThriftSocialFilterType.TRUSTED && !data.isTrusted)
|| (params.socialFilterType == ThriftSocialFilterType.FOLLOWS && !data.isFollow)) {
// we can skip this hit as we only want social results in this mode.
data.skipReason = SkipReason.SOCIAL_FILTER;
return data;
}
}
// 1. first apply all the filters to only non-follow tweets and non-verified accounts,
// but be tender to sentinel values
// unless you specifically asked to apply filters regardless
if (params.applyFiltersAlways
|| (!data.isSelfTweet && !data.isFollow && !data.isFromVerifiedAccount
&& !data.isFromBlueVerifiedAccount)) {
if (data.userRep < params.reputationMinVal
// don't filter unset userreps, we give them the benefit of doubt and let it
// continue to scoring. userrep is unset when either user just signed up or
// during ingestion time we had trouble getting userrep from reputation service.
&& data.userRep != RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
data.skipReason = SkipReason.LOW_REPUTATION;
return data;
} else if (data.textScore < params.textScoreMinVal
// don't filter unset text scores, use goodwill value
&& data.textScore != RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL) {
data.skipReason = SkipReason.LOW_TEXT_SCORE;
return data;
} else if (data.retweetCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
&& data.retweetCountPostLog2 < params.retweetMinVal) {
data.skipReason = SkipReason.LOW_RETWEET_COUNT;
return data;
} else if (data.favCountPostLog2 != LinearScoringData.UNSET_SIGNAL_VALUE
&& data.favCountPostLog2 < params.favMinVal) {
data.skipReason = SkipReason.LOW_FAV_COUNT;
return data;
}
}
// if sentinel value is set, assume goodwill score and let scoring continue.
if (data.textScore == RelevanceSignalConstants.UNSET_TEXT_SCORE_SENTINEL) {
data.textScore = RelevanceSignalConstants.GOODWILL_TEXT_SCORE;
}
if (data.userRep == RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
data.userRep = RelevanceSignalConstants.GOODWILL_REPUTATION;
}
data.tweetAgeInSeconds = now - timeMapper.getTime(getCurrentDocID());
if (data.tweetAgeInSeconds < 0) {
data.tweetAgeInSeconds = 0; // Age cannot be negative
}
// The PARUS_SCORE feature should be read as is.
data.parusScore = documentFeatures.getFeatureValue(EarlybirdFieldConstant.PARUS_SCORE);
data.isNullcast = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG);
data.hasUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_LINK_FLAG);
data.hasImageUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
data.hasVideoUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
data.hasNewsUrl = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
data.isReply = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_REPLY_FLAG);
data.isRetweet = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_RETWEET_FLAG);
data.isOffensive = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
data.hasTrend = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_TREND_FLAG);
data.hasMultipleHashtagsOrTrends =
documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG);
data.isUserSpam = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_SPAM_FLAG);
data.isUserNSFW = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NSFW_FLAG)
|| userTable.isSet(data.fromUserId, UserTable.NSFW_BIT);
data.isUserAntiSocial =
userTable.isSet(data.fromUserId, UserTable.ANTISOCIAL_BIT);
data.isUserBot = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_BOT_FLAG);
data.hasCard = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CARD_FLAG);
data.cardType = SearchCardType.UNKNOWN.getByteValue();
if (data.hasCard) {
data.cardType =
(byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD);
}
data.hasVisibleLink = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG);
data.hasConsumerVideo =
documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG);
data.hasProVideo = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG);
data.hasVine = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VINE_FLAG);
data.hasPeriscope = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_PERISCOPE_FLAG);
data.hasNativeImage = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
data.hasQuote = documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_QUOTE_FLAG);
data.isComposerSourceCamera =
documentFeatures.isFlagSet(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG);
// Only read the shared status if the isRetweet or isReply bit is true (minor optimization).
if (data.isRetweet || (params.getInReplyToStatusId && data.isReply)) {
data.sharedStatusId =
documentFeatures.getFeatureValue(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF);
}
// Only read the reference tweet author ID if the isRetweet or isReply bit
// is true (minor optimization).
if (data.isRetweet || data.isReply) {
// the REFERENCE_AUTHOR_ID_CSF stores the source tweet author id for all retweets
long referenceAuthorId =
documentFeatures.getFeatureValue(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF);
if (referenceAuthorId > 0) {
data.referenceAuthorId = referenceAuthorId;
} else {
// we also store the reference author id for retweets, directed at tweets, and self threaded
// tweets separately on Realtime/Protected Earlybirds. This data will be moved to the
// REFERENCE_AUTHOR_ID_CSF and these fields will be deprecated in SEARCH-34958.
referenceAuthorId = LongIntConverter.convertTwoIntToOneLong(
(int) documentFeatures.getFeatureValue(
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT),
(int) documentFeatures.getFeatureValue(
EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT));
if (referenceAuthorId > 0) {
data.referenceAuthorId = referenceAuthorId;
}
}
}
// Convert language to a thrift language and then back to an int in order to
// ensure a value compatible with our current ThriftLanguage definition.
ThriftLanguage tweetLang = ThriftLanguageUtil.safeFindByValue(
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE));
data.tweetLangId = tweetLang.getValue();
// Set the language-related features here so that they can be later used in promotion/demotion
// and also be transferred to ThriftSearchResultMetadata
data.userLangMult = computeUserLangMultiplier(data, params);
data.hasDifferentLang = params.uiLangId != ThriftLanguage.UNKNOWN.getValue()
&& params.uiLangId != data.tweetLangId;
data.hasEnglishTweetAndDifferentUILang = data.hasDifferentLang
&& data.tweetLangId == ThriftLanguage.ENGLISH.getValue();
data.hasEnglishUIAndDifferentTweetLang = data.hasDifferentLang
&& params.uiLangId == ThriftLanguage.ENGLISH.getValue();
// Exposed all these features for the clients.
data.isSensitiveContent =
documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT);
data.hasMultipleMediaFlag =
documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG);
data.profileIsEggFlag = documentFeatures.isFlagSet(EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG);
data.isUserNewFlag = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_USER_NEW_FLAG);
data.numMentions = (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS);
data.numHashtags = (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.NUM_HASHTAGS);
data.linkLanguage =
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LINK_LANGUAGE);
data.prevUserTweetEngagement =
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT);
// health model scores by HML
data.toxicityScore = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.TOXICITY_SCORE);
data.pBlockScore = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.PBLOCK_SCORE);
data.pSpammyTweetScore = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE);
data.pReportedTweetScore = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE);
data.spammyTweetContentScore = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE
);
data.experimentalHealthModelScore1 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1);
data.experimentalHealthModelScore2 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2);
data.experimentalHealthModelScore3 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3);
data.experimentalHealthModelScore4 = documentFeatures.getUnnormalizedFeatureValue(
EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4);
return data;
}
protected float scoreInternal(
float luceneQueryScore, ExplanationWrapper explanation) throws IOException {
LinearScoringData data = updateLinearScoringData(luceneQueryScore);
if (data.skipReason != null && data.skipReason != SkipReason.NOT_SKIPPED) {
return finalizeScore(data, explanation, SKIP_HIT);
}
double score = computeScore(data, explanation != null);
return postScoreComputation(data, score, true, explanation);
}
protected float postScoreComputation(
LinearScoringData data,
double score,
boolean boostScoreWithHitAttribution,
ExplanationWrapper explanation) throws IOException {
double modifiedScore = score;
data.scoreBeforeBoost = modifiedScore;
if (params.applyBoosts) {
modifiedScore =
applyBoosts(data, modifiedScore, boostScoreWithHitAttribution, explanation != null);
}
// Final adjustment to avoid too-low scores.
modifiedScore *= SCORE_ADJUSTER;
data.scoreAfterBoost = modifiedScore;
// 3. final score filter
data.scoreFinal = modifiedScore;
if ((params.applyFiltersAlways || (!data.isSelfTweet && !data.isFollow))
&& modifiedScore < params.minScore) {
data.skipReason = SkipReason.LOW_FINAL_SCORE;
modifiedScore = SKIP_HIT;
}
// clear field hits
this.fieldHitAttribution = null;
return finalizeScore(data, explanation, modifiedScore);
}
/**
* Applying promotion/demotion to the scores generated by feature-based scoring functions
*
* @param data Original LinearScoringData (to be modified with boosts here)
* @param score Score generated by the feature-based scoring function
* @param withHitAttribution Determines if hit attribution data should be included.
* @param forExplanation Indicates if the score will be computed for generating the explanation.
* @return Score after applying promotion/demotion
*/
private double applyBoosts(
LinearScoringData data,
double score,
boolean withHitAttribution,
boolean forExplanation) {
double boostedScore = score;
if (params.useLuceneScoreAsBoost) {
data.normalizedLuceneScore = normalizeLuceneScore(
(float) data.luceneScore, (float) params.maxLuceneScoreBoost);
boostedScore *= data.normalizedLuceneScore;
}
if (data.isOffensive) {
boostedScore *= params.offensiveDamping;
}
if (data.isUserSpam && params.spamUserDamping != LinearScoringData.NO_BOOST_VALUE) {
data.spamUserDampApplied = true;
boostedScore *= params.spamUserDamping;
}
if (data.isUserNSFW && params.nsfwUserDamping != LinearScoringData.NO_BOOST_VALUE) {
data.nsfwUserDampApplied = true;
boostedScore *= params.nsfwUserDamping;
}
if (data.isUserBot && params.botUserDamping != LinearScoringData.NO_BOOST_VALUE) {
data.botUserDampApplied = true;
boostedScore *= params.botUserDamping;
}
// cards
if (data.hasCard && params.hasCardBoosts[data.cardType] != LinearScoringData.NO_BOOST_VALUE) {
boostedScore *= params.hasCardBoosts[data.cardType];
data.hasCardBoostApplied = true;
}
// trends
if (data.hasMultipleHashtagsOrTrends) {
boostedScore *= params.multipleHashtagsOrTrendsDamping;
} else if (data.hasTrend) {
data.tweetHasTrendsBoostApplied = true;
boostedScore *= params.tweetHasTrendBoost;
}
// Media/News url boosts.
if (data.hasImageUrl || data.hasVideoUrl) {
data.hasMedialUrlBoostApplied = true;
boostedScore *= params.tweetHasMediaUrlBoost;
}
if (data.hasNewsUrl) {
data.hasNewsUrlBoostApplied = true;
boostedScore *= params.tweetHasNewsUrlBoost;
}
if (data.isFromVerifiedAccount) {
data.tweetFromVerifiedAccountBoostApplied = true;
boostedScore *= params.tweetFromVerifiedAccountBoost;
}
if (data.isFromBlueVerifiedAccount) {
data.tweetFromBlueVerifiedAccountBoostApplied = true;
boostedScore *= params.tweetFromBlueVerifiedAccountBoost;
}
if (data.isFollow) {
// direct follow, so boost both replies and non-replies.
data.directFollowBoostApplied = true;
boostedScore *= params.directFollowBoost;
} else if (data.isTrusted) {
// trusted circle
if (!data.isReply) {
// non-at-reply, in trusted network
data.trustedCircleBoostApplied = true;
boostedScore *= params.trustedCircleBoost;
}
} else if (data.isReply) {
// at-reply out of my network
data.outOfNetworkReplyPenaltyApplied = true;
boostedScore -= params.outOfNetworkReplyPenalty;
}
if (data.isSelfTweet) {
data.selfTweetBoostApplied = true;
data.selfTweetMult = params.selfTweetBoost;
boostedScore *= params.selfTweetBoost;
}
// Language Demotion
// User language based demotion
// The data.userLangMult is set in scoreInternal(), and this setting step is always before
// the applying boosts step
if (params.useUserLanguageInfo) {
boostedScore *= data.userLangMult;
}
// UI language based demotion
if (params.uiLangId != ThriftLanguage.UNKNOWN.getValue()
&& params.uiLangId != data.tweetLangId) {
if (data.tweetLangId == ThriftLanguage.ENGLISH.getValue()) {
data.uiLangMult = params.langEnglishTweetDemote;
} else if (params.uiLangId == ThriftLanguage.ENGLISH.getValue()) {
data.uiLangMult = params.langEnglishUIDemote;
} else {
data.uiLangMult = params.langDefaultDemote;
}
} else {
data.uiLangMult = LinearScoringData.NO_BOOST_VALUE;
}
boostedScore *= data.uiLangMult;
if (params.useAgeDecay) {
// shallow sigmoid with an inflection point at ageDecayHalflife
data.ageDecayMult = ageDecay.getAgeDecayMultiplier(data.tweetAgeInSeconds);
boostedScore *= data.ageDecayMult;
}
// Hit Attribute Demotion
// Scoring is currently based on tokenized user name, text, and url in the tweet
// If hit attribute collection is enabled, we demote score based on these fields
if (hitAttributeHelper != null && params.enableHitDemotion) {
Map<Integer, List<String>> hitMap;
if (forExplanation && fieldHitAttribution != null) {
// if this scoring call is for generating an explanation,
// we'll use the fieldHitAttribution found in the search result's metadata because
// collectors are not called during the debug workflow
hitMap = Maps.transformValues(fieldHitAttribution.getHitMap(), FieldHitList::getHitFields);
} else if (withHitAttribution) {
hitMap = hitAttributeHelper.getHitAttribution(getCurrentDocID());
} else {
hitMap = Maps.newHashMap();
}
Set<String> uniqueFieldHits = ImmutableSet.copyOf(Iterables.concat(hitMap.values()));
data.hitFields.addAll(uniqueFieldHits);
// there should always be fields that are hit
// if there aren't, we assume this is a call from 'explain' in debug mode
// do not override hit attribute data if in debug mode
if (!uniqueFieldHits.isEmpty()) {
// demotions based strictly on field hits
if (uniqueFieldHits.size() == 1) {
if (uniqueFieldHits.contains(
EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName())) {
// if url was the only field that was hit, demote
data.hasUrlOnlyHitDemotionApplied = true;
boostedScore *= params.urlOnlyHitDemotion;
} else if (uniqueFieldHits.contains(
EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName())) {
// if name was the only field that was hit, demote
data.hasNameOnlyHitDemotionApplied = true;
boostedScore *= params.nameOnlyHitDemotion;
}
} else if (!uniqueFieldHits.contains(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())
&& !uniqueFieldHits.contains(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName())
&& !uniqueFieldHits.contains(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName())
&& !uniqueFieldHits.contains(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName())) {
// if text or special text was never hit, demote
data.hasNoTextHitDemotionApplied = true;
boostedScore *= params.noTextHitDemotion;
} else if (uniqueFieldHits.size() == 2) {
// demotions based on field hit combinations
// want to demote if we only hit two of the fields (one being text)
// but with separate terms
Set<String> fieldIntersections = QueryCommonFieldHitsVisitor.findIntersection(
hitAttributeHelper.getNodeToRankMap(),
hitMap,
query);
if (fieldIntersections.isEmpty()) {
if (uniqueFieldHits.contains(
EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName())) {
// if name is hit but has no hits in common with text, demote
// want to demote cases where we hit part of the person's name
// and tweet text separately
data.hasSeparateTextAndNameHitDemotionApplied = true;
boostedScore *= params.separateTextAndNameHitDemotion;
} else if (uniqueFieldHits.contains(
EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName())) {
// if url is hit but has no hits in common with text, demote
// want to demote cases where we hit a potential domain keyword
// and tweet text separately
data.hasSeparateTextAndUrlHitDemotionApplied = true;
boostedScore *= params.separateTextAndUrlHitDemotion;
}
}
}
}
}
return boostedScore;
}
/**
* Compute the user language based demotion multiplier
*/
private static double computeUserLangMultiplier(
LinearScoringData data, LinearScoringParams params) {
if (data.tweetLangId == params.uiLangId
&& data.tweetLangId != ThriftLanguage.UNKNOWN.getValue()) {
// Effectively the uiLang is considered a language that user knows with 1.0 confidence.
return LinearScoringData.NO_BOOST_VALUE;
}
if (params.userLangs[data.tweetLangId] > 0.0) {
return params.userLangs[data.tweetLangId];
}
return params.unknownLanguageBoost;
}
/**
* Computes the score of the document that is currently being evaluated.
*
* The extracted features from the document are available in the field 'data'.
*
* @param data The LinearScoringData instance that will store the document features.
* @param forExplanation Indicates if the score will be computed for generating the explanation.
*/
protected abstract double computeScore(
LinearScoringData data, boolean forExplanation) throws IOException;
private float finalizeScore(
LinearScoringData scoringData,
ExplanationWrapper explanation,
double score) throws IOException {
scoringData.scoreReturned = score;
if (explanation != null) {
explanation.explanation = generateExplanation(scoringData);
}
return (float) score;
}
@Override
protected void initializeNextSegment(EarlybirdIndexSegmentAtomicReader reader)
throws IOException {
if (antiGamingFilter != null) {
antiGamingFilter.startSegment(reader);
}
}
/*
* Generate the scoring explanation for debug.
*/
private Explanation generateExplanation(LinearScoringData scoringData) throws IOException {
final List<Explanation> details = Lists.newArrayList();
details.add(Explanation.match(0.0f, "[PROPERTIES] "
+ scoringData.getPropertyExplanation()));
// 1. Filters
boolean isHit = scoringData.skipReason == SkipReason.NOT_SKIPPED;
if (scoringData.skipReason == SkipReason.ANTIGAMING) {
details.add(Explanation.noMatch("SKIPPED for antigaming"));
}
if (scoringData.skipReason == SkipReason.LOW_REPUTATION) {
details.add(Explanation.noMatch(
String.format("SKIPPED for low reputation: %.3f < %.3f",
scoringData.userRep, params.reputationMinVal)));
}
if (scoringData.skipReason == SkipReason.LOW_TEXT_SCORE) {
details.add(Explanation.noMatch(
String.format("SKIPPED for low text score: %.3f < %.3f",
scoringData.textScore, params.textScoreMinVal)));
}
if (scoringData.skipReason == SkipReason.LOW_RETWEET_COUNT) {
details.add(Explanation.noMatch(
String.format("SKIPPED for low retweet count: %.3f < %.3f",
scoringData.retweetCountPostLog2, params.retweetMinVal)));
}
if (scoringData.skipReason == SkipReason.LOW_FAV_COUNT) {
details.add(Explanation.noMatch(
String.format("SKIPPED for low fav count: %.3f < %.3f",
scoringData.favCountPostLog2, params.favMinVal)));
}
if (scoringData.skipReason == SkipReason.SOCIAL_FILTER) {
details.add(Explanation.noMatch("SKIPPED for not in the right social circle"));
}
// 2. Explanation depending on the scoring type
generateExplanationForScoring(scoringData, isHit, details);
// 3. Explanation depending on boosts
if (params.applyBoosts) {
generateExplanationForBoosts(scoringData, isHit, details);
}
// 4. Final score filter.
if (scoringData.skipReason == SkipReason.LOW_FINAL_SCORE) {
details.add(Explanation.noMatch("SKIPPED for low final score: " + scoringData.scoreFinal));
isHit = false;
}
String hostAndSegment = String.format("%s host = %s segment = %s",
functionName, DatabaseConfig.getLocalHostname(), DatabaseConfig.getDatabase());
if (isHit) {
return Explanation.match((float) scoringData.scoreFinal, hostAndSegment, details);
} else {
return Explanation.noMatch(hostAndSegment, details);
}
}
/**
* Generates the explanation for the document that is currently being evaluated.
*
* Implementations of this method must use the 'details' parameter to collect its output.
*
* @param scoringData Scoring components for the document
* @param isHit Indicates whether the document is not skipped
* @param details Details of the explanation. Used to collect the output.
*/
protected abstract void generateExplanationForScoring(
LinearScoringData scoringData, boolean isHit, List<Explanation> details) throws IOException;
/**
* Generates the boosts part of the explanation for the document that is currently
* being evaluated.
*/
private void generateExplanationForBoosts(
LinearScoringData scoringData,
boolean isHit,
List<Explanation> details) {
List<Explanation> boostDetails = Lists.newArrayList();
boostDetails.add(Explanation.match((float) scoringData.scoreBeforeBoost, "Score before boost"));
// Lucene score boost
if (params.useLuceneScoreAsBoost) {
boostDetails.add(Explanation.match(
(float) scoringData.normalizedLuceneScore,
String.format("[x] Lucene score boost, luceneScore=%.3f",
scoringData.luceneScore)));
}
// card boost
if (scoringData.hasCardBoostApplied) {
boostDetails.add(Explanation.match((float) params.hasCardBoosts[scoringData.cardType],
"[x] card boost for type " + SearchCardType.cardTypeFromByteValue(scoringData.cardType)));
}
// Offensive
if (scoringData.isOffensive) {
boostDetails.add(Explanation.match((float) params.offensiveDamping, "[x] Offensive damping"));
} else {
boostDetails.add(Explanation.match(LinearScoringData.NO_BOOST_VALUE,
String.format("Not Offensive, damping=%.3f", params.offensiveDamping)));
}
// Spam
if (scoringData.spamUserDampApplied) {
boostDetails.add(Explanation.match((float) params.spamUserDamping, "[x] Spam"));
}
// NSFW
if (scoringData.nsfwUserDampApplied) {
boostDetails.add(Explanation.match((float) params.nsfwUserDamping, "[X] NSFW"));
}
// Bot
if (scoringData.botUserDampApplied) {
boostDetails.add(Explanation.match((float) params.botUserDamping, "[X] Bot"));
}
// Multiple hashtags or trends
if (scoringData.hasMultipleHashtagsOrTrends) {
boostDetails.add(Explanation.match((float) params.multipleHashtagsOrTrendsDamping,
"[x] Multiple hashtags or trends boost"));
} else {
boostDetails.add(Explanation.match(LinearScoringData.NO_BOOST_VALUE,
String.format("No multiple hashtags or trends, damping=%.3f",
params.multipleHashtagsOrTrendsDamping)));
}
if (scoringData.tweetHasTrendsBoostApplied) {
boostDetails.add(Explanation.match(
(float) params.tweetHasTrendBoost, "[x] Tweet has trend boost"));
}
if (scoringData.hasMedialUrlBoostApplied) {
boostDetails.add(Explanation.match(
(float) params.tweetHasMediaUrlBoost, "[x] Media url boost"));
}
if (scoringData.hasNewsUrlBoostApplied) {
boostDetails.add(Explanation.match(
(float) params.tweetHasNewsUrlBoost, "[x] News url boost"));
}
boostDetails.add(Explanation.match(0.0f, "[FIELDS HIT] " + scoringData.hitFields));
if (scoringData.hasNoTextHitDemotionApplied) {
boostDetails.add(Explanation.match(
(float) params.noTextHitDemotion, "[x] No text hit demotion"));
}
if (scoringData.hasUrlOnlyHitDemotionApplied) {
boostDetails.add(Explanation.match(
(float) params.urlOnlyHitDemotion, "[x] URL only hit demotion"));
}
if (scoringData.hasNameOnlyHitDemotionApplied) {
boostDetails.add(Explanation.match(
(float) params.nameOnlyHitDemotion, "[x] Name only hit demotion"));
}
if (scoringData.hasSeparateTextAndNameHitDemotionApplied) {
boostDetails.add(Explanation.match((float) params.separateTextAndNameHitDemotion,
"[x] Separate text/name demotion"));
}
if (scoringData.hasSeparateTextAndUrlHitDemotionApplied) {
boostDetails.add(Explanation.match((float) params.separateTextAndUrlHitDemotion,
"[x] Separate text/url demotion"));
}
if (scoringData.tweetFromVerifiedAccountBoostApplied) {
boostDetails.add(Explanation.match((float) params.tweetFromVerifiedAccountBoost,
"[x] Verified account boost"));
}
if (scoringData.tweetFromBlueVerifiedAccountBoostApplied) {
boostDetails.add(Explanation.match((float) params.tweetFromBlueVerifiedAccountBoost,
"[x] Blue-verified account boost"));
}
if (scoringData.selfTweetBoostApplied) {
boostDetails.add(Explanation.match((float) params.selfTweetBoost,
"[x] Self tweet boost"));
}
if (scoringData.skipReason == LinearScoringData.SkipReason.SOCIAL_FILTER) {
boostDetails.add(Explanation.noMatch("SKIPPED for social filter"));
} else {
if (scoringData.directFollowBoostApplied) {
boostDetails.add(Explanation.match((float) params.directFollowBoost,
"[x] Direct follow boost"));
}
if (scoringData.trustedCircleBoostApplied) {
boostDetails.add(Explanation.match((float) params.trustedCircleBoost,
"[x] Trusted circle boost"));
}
if (scoringData.outOfNetworkReplyPenaltyApplied) {
boostDetails.add(Explanation.match((float) params.outOfNetworkReplyPenalty,
"[-] Out of network reply penalty"));
}
}
// Language demotions
String langDetails = String.format(
"tweetLang=[%s] uiLang=[%s]",
ThriftLanguageUtil.getLocaleOf(
ThriftLanguage.findByValue(scoringData.tweetLangId)).getLanguage(),
ThriftLanguageUtil.getLocaleOf(ThriftLanguage.findByValue(params.uiLangId)).getLanguage());
if (scoringData.uiLangMult == 1.0) {
boostDetails.add(Explanation.match(
LinearScoringData.NO_BOOST_VALUE, "No UI Language demotion: " + langDetails));
} else {
boostDetails.add(Explanation.match(
(float) scoringData.uiLangMult, "[x] UI LangMult: " + langDetails));
}
StringBuilder userLangDetails = new StringBuilder();
userLangDetails.append("userLang=[");
for (int i = 0; i < params.userLangs.length; i++) {
if (params.userLangs[i] > 0.0) {
String lang = ThriftLanguageUtil.getLocaleOf(ThriftLanguage.findByValue(i)).getLanguage();
userLangDetails.append(String.format("%s:%.3f,", lang, params.userLangs[i]));
}
}
userLangDetails.append("]");
if (!params.useUserLanguageInfo) {
boostDetails.add(Explanation.noMatch(
"No User Language Demotion: " + userLangDetails.toString()));
} else {
boostDetails.add(Explanation.match(
(float) scoringData.userLangMult,
"[x] User LangMult: " + userLangDetails.toString()));
}
// Age decay
String ageDecayDetails = String.format(
"age=%d seconds, slope=%.3f, base=%.1f, half-life=%.0f",
scoringData.tweetAgeInSeconds, params.ageDecaySlope,
params.ageDecayBase, params.ageDecayHalflife);
if (params.useAgeDecay) {
boostDetails.add(Explanation.match(
(float) scoringData.ageDecayMult, "[x] AgeDecay: " + ageDecayDetails));
} else {
boostDetails.add(Explanation.match(1.0f, "Age decay disabled: " + ageDecayDetails));
}
// Score adjuster
boostDetails.add(Explanation.match(SCORE_ADJUSTER, "[x] score adjuster"));
Explanation boostCombo = isHit
? Explanation.match((float) scoringData.scoreAfterBoost,
"(MATCH) After Boosts and Demotions:", boostDetails)
: Explanation.noMatch("After Boosts and Demotions:", boostDetails);
details.add(boostCombo);
}
@Override
protected Explanation doExplain(float luceneQueryScore) throws IOException {
// Run the scorer again and get the explanation.
ExplanationWrapper explanation = new ExplanationWrapper();
scoreInternal(luceneQueryScore, explanation);
return explanation.explanation;
}
@Override
public void populateResultMetadataBasedOnScoringData(
ThriftSearchResultMetadataOptions options,
ThriftSearchResultMetadata metadata,
LinearScoringData data) throws IOException {
metadata.setResultType(searchResultType);
metadata.setScore(data.scoreReturned);
metadata.setFromUserId(data.fromUserId);
if (data.isTrusted) {
metadata.setIsTrusted(true);
}
if (data.isFollow) {
metadata.setIsFollow(true);
}
if (data.skipReason != SkipReason.NOT_SKIPPED) {
metadata.setSkipped(true);
}
if ((data.isRetweet || (params.getInReplyToStatusId && data.isReply))
&& data.sharedStatusId != LinearScoringData.UNSET_SIGNAL_VALUE) {
metadata.setSharedStatusId(data.sharedStatusId);
}
if (data.hasCard) {
metadata.setCardType(data.cardType);
}
// Optional features. Note: other optional metadata is populated by
// AbstractRelevanceCollector, not the scoring function.
if (options.isGetLuceneScore()) {
metadata.setLuceneScore(data.luceneScore);
}
if (options.isGetReferencedTweetAuthorId()
&& data.referenceAuthorId != LinearScoringData.UNSET_SIGNAL_VALUE) {
metadata.setReferencedTweetAuthorId(data.referenceAuthorId);
}
if (options.isGetMediaBits()) {
metadata.setHasConsumerVideo(data.hasConsumerVideo);
metadata.setHasProVideo(data.hasProVideo);
metadata.setHasVine(data.hasVine);
metadata.setHasPeriscope(data.hasPeriscope);
boolean hasNativeVideo =
data.hasConsumerVideo || data.hasProVideo || data.hasVine || data.hasPeriscope;
metadata.setHasNativeVideo(hasNativeVideo);
metadata.setHasNativeImage(data.hasNativeImage);
}
metadata
.setIsOffensive(data.isOffensive)
.setIsReply(data.isReply)
.setIsRetweet(data.isRetweet)
.setHasLink(data.hasUrl)
.setHasTrend(data.hasTrend)
.setHasMultipleHashtagsOrTrends(data.hasMultipleHashtagsOrTrends)
.setRetweetCount((int) data.retweetCountPostLog2)
.setFavCount((int) data.favCountPostLog2)
.setReplyCount((int) data.replyCountPostLog2)
.setEmbedsImpressionCount((int) data.embedsImpressionCount)
.setEmbedsUrlCount((int) data.embedsUrlCount)
.setVideoViewCount((int) data.videoViewCount)
.setResultType(searchResultType)
.setFromVerifiedAccount(data.isFromVerifiedAccount)
.setIsUserSpam(data.isUserSpam)
.setIsUserNSFW(data.isUserNSFW)
.setIsUserBot(data.isUserBot)
.setHasImage(data.hasImageUrl)
.setHasVideo(data.hasVideoUrl)
.setHasNews(data.hasNewsUrl)
.setHasCard(data.hasCard)
.setHasVisibleLink(data.hasVisibleLink)
.setParusScore(data.parusScore)
.setTextScore(data.textScore)
.setUserRep(data.userRep)
.setTokenAt140DividedByNumTokensBucket(data.tokenAt140DividedByNumTokensBucket);
if (!metadata.isSetExtraMetadata()) {
metadata.setExtraMetadata(new ThriftSearchResultExtraMetadata());
}
ThriftSearchResultExtraMetadata extraMetadata = metadata.getExtraMetadata();
// Promotion/Demotion features
extraMetadata.setUserLangScore(data.userLangMult)
.setHasDifferentLang(data.hasDifferentLang)
.setHasEnglishTweetAndDifferentUILang(data.hasEnglishTweetAndDifferentUILang)
.setHasEnglishUIAndDifferentTweetLang(data.hasEnglishUIAndDifferentTweetLang)
.setHasQuote(data.hasQuote)
.setQuotedCount((int) data.quotedCount)
.setWeightedRetweetCount((int) data.weightedRetweetCount)
.setWeightedReplyCount((int) data.weightedReplyCount)
.setWeightedFavCount((int) data.weightedFavCount)
.setWeightedQuoteCount((int) data.weightedQuoteCount)
.setQuerySpecificScore(data.querySpecificScore)
.setAuthorSpecificScore(data.authorSpecificScore)
.setRetweetCountV2((int) data.retweetCountV2)
.setFavCountV2((int) data.favCountV2)
.setReplyCountV2((int) data.replyCountV2)
.setIsComposerSourceCamera(data.isComposerSourceCamera)
.setFromBlueVerifiedAccount(data.isFromBlueVerifiedAccount);
// Health model scores features
extraMetadata
.setToxicityScore(data.toxicityScore)
.setPBlockScore(data.pBlockScore)
.setPSpammyTweetScore(data.pSpammyTweetScore)
.setPReportedTweetScore(data.pReportedTweetScore)
.setSpammyTweetContentScore(data.spammyTweetContentScore)
.setExperimentalHealthModelScore1(data.experimentalHealthModelScore1)
.setExperimentalHealthModelScore2(data.experimentalHealthModelScore2)
.setExperimentalHealthModelScore3(data.experimentalHealthModelScore3)
.setExperimentalHealthModelScore4(data.experimentalHealthModelScore4);
// Return all extra features for clients to consume.
if (options.isGetAllFeatures()) {
extraMetadata.setIsSensitiveContent(data.isSensitiveContent)
.setHasMultipleMediaFlag(data.hasMultipleMediaFlag)
.setProfileIsEggFlag(data.profileIsEggFlag)
.setIsUserNewFlag(data.isUserNewFlag)
.setNumMentions(data.numMentions)
.setNumHashtags(data.numHashtags)
.setLinkLanguage(data.linkLanguage)
.setPrevUserTweetEngagement(data.prevUserTweetEngagement);
}
// Set features in new Feature Access API format, in the future this will be the only part
// needed in this method, we don't need to set any other metadata fields any more.
if (options.isReturnSearchResultFeatures()) {
// If the features are unset, and they were requested, then we can retrieve them. If they are
// already set, then we don't need to re-read the document features, and the reader
// is probably positioned over the wrong document so it will return incorrect results.
if (!extraMetadata.isSetFeatures()) {
// We ignore all features with default values when returning them in the response,
// because it saves a lot of network bandwidth.
ThriftSearchResultFeatures features = createFeaturesForDocument(data, true).getFeatures();
extraMetadata.setFeatures(features);
}
// The raw score may have changed since we created the features, so we should update it.
extraMetadata.getFeatures().getDoubleValues()
.put(ExternalTweetFeature.RAW_EARLYBIRD_SCORE.getId(), data.scoreFinal);
}
metadata
.setIsSelfTweet(data.isSelfTweet)
.setIsUserAntiSocial(data.isUserAntiSocial);
}
/**
* Create earlybird basic features and dervied features for current document.
* @return a FeatureHandler object where you can keep adding extra feature values, or you can
* call .getFeatures() on it to get a Thrift object to return.
*/
protected FeatureHandler createFeaturesForDocument(
LinearScoringData data, boolean ignoreDefaultValues) throws IOException {
ThriftSearchResultFeatures features = documentFeatures.getSearchResultFeatures(getSchema());
if (!ignoreDefaultValues) {
setDefaultFeatureValues(features);
}
// add derived features
return new FeatureHandler(features, ignoreDefaultValues)
.addDouble(ExternalTweetFeature.LUCENE_SCORE, data.luceneScore)
.addInt(ExternalTweetFeature.TWEET_AGE_IN_SECS, data.tweetAgeInSeconds)
.addBoolean(ExternalTweetFeature.IS_SELF_TWEET, data.isSelfTweet)
.addBoolean(ExternalTweetFeature.IS_FOLLOW_RETWEET, data.isFollow && data.isRetweet)
.addBoolean(ExternalTweetFeature.IS_TRUSTED_RETWEET, data.isTrusted && data.isRetweet)
.addBoolean(ExternalTweetFeature.AUTHOR_IS_FOLLOW, data.isFollow)
.addBoolean(ExternalTweetFeature.AUTHOR_IS_TRUSTED, data.isTrusted)
.addBoolean(ExternalTweetFeature.AUTHOR_IS_ANTISOCIAL, data.isUserAntiSocial)
.addBoolean(ExternalTweetFeature.HAS_DIFF_LANG, data.hasDifferentLang)
.addBoolean(ExternalTweetFeature.HAS_ENGLISH_TWEET_DIFF_UI_LANG,
data.hasEnglishTweetAndDifferentUILang)
.addBoolean(ExternalTweetFeature.HAS_ENGLISH_UI_DIFF_TWEET_LANG,
data.hasEnglishUIAndDifferentTweetLang)
.addDouble(ExternalTweetFeature.SEARCHER_LANG_SCORE, data.userLangMult)
.addDouble(ExternalTweetFeature.QUERY_SPECIFIC_SCORE, data.querySpecificScore)
.addDouble(ExternalTweetFeature.AUTHOR_SPECIFIC_SCORE, data.authorSpecificScore);
}
/**
* Adds default values for most numeric features that do not have a value set yet in the given
* ThriftSearchResultFeatures instance.
*
* This method is needed because some models do not work properly with missing features. Instead,
* they expect all features to be present even if they are unset (their values are 0).
*/
protected void setDefaultFeatureValues(ThriftSearchResultFeatures features) {
for (Map.Entry<Integer, ThriftSearchFeatureSchemaEntry> entry
: getSchema().getSearchFeatureSchema().getEntries().entrySet()) {
int featureId = entry.getKey();
ThriftSearchFeatureSchemaEntry schemaEntry = entry.getValue();
if (shouldSetDefaultValueForFeature(schemaEntry.getFeatureType(), featureId)) {
switch (schemaEntry.getFeatureType()) {
case INT32_VALUE:
features.getIntValues().putIfAbsent(featureId, 0);
break;
case LONG_VALUE:
features.getLongValues().putIfAbsent(featureId, 0L);
break;
case DOUBLE_VALUE:
features.getDoubleValues().putIfAbsent(featureId, 0.0);
break;
default:
throw new IllegalArgumentException(
"Should set default values only for integer, long or double features. Instead, "
+ "found feature " + featureId + " of type " + schemaEntry.getFeatureType());
}
}
}
}
protected void overrideFeatureValues(ThriftSearchResultFeatures features,
ThriftSearchResultFeatures overrideFeatures) {
LOG.info("Features before override {}", features);
if (overrideFeatures.isSetIntValues()) {
overrideFeatures.getIntValues().forEach(features::putToIntValues);
}
if (overrideFeatures.isSetLongValues()) {
overrideFeatures.getLongValues().forEach(features::putToLongValues);
}
if (overrideFeatures.isSetDoubleValues()) {
overrideFeatures.getDoubleValues().forEach(features::putToDoubleValues);
}
if (overrideFeatures.isSetBoolValues()) {
overrideFeatures.getBoolValues().forEach(features::putToBoolValues);
}
if (overrideFeatures.isSetStringValues()) {
overrideFeatures.getStringValues().forEach(features::putToStringValues);
}
if (overrideFeatures.isSetBytesValues()) {
overrideFeatures.getBytesValues().forEach(features::putToBytesValues);
}
if (overrideFeatures.isSetFeatureStoreDiscreteValues()) {
overrideFeatures.getFeatureStoreDiscreteValues().forEach(
features::putToFeatureStoreDiscreteValues);
}
if (overrideFeatures.isSetSparseBinaryValues()) {
overrideFeatures.getSparseBinaryValues().forEach(features::putToSparseBinaryValues);
}
if (overrideFeatures.isSetSparseContinuousValues()) {
overrideFeatures.getSparseContinuousValues().forEach(features::putToSparseContinuousValues);
}
if (overrideFeatures.isSetGeneralTensorValues()) {
overrideFeatures.getGeneralTensorValues().forEach(features::putToGeneralTensorValues);
}
if (overrideFeatures.isSetStringTensorValues()) {
overrideFeatures.getStringTensorValues().forEach(features::putToStringTensorValues);
}
LOG.info("Features after override {}", features);
}
/**
* Check if a feature is eligible to have its default value automatically set when absent.
* We have a similar logic for building data record.
*/
private static boolean shouldSetDefaultValueForFeature(
ThriftSearchFeatureType type, int featureId) {
return ALLOWED_TYPES_FOR_DEFAULT_FEATURE_VALUES.contains(type)
&& !NUMERIC_FEATURES_FOR_WHICH_DEFAULTS_SHOULD_NOT_BE_SET.contains(featureId)
&& (ExternalTweetFeature.EARLYBIRD_INDEXED_FEATURE_IDS.contains(featureId)
|| ExternalTweetFeature.EARLYBIRD_DERIVED_FEATURE_IDS.contains(featureId));
}
@Override
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
if (relevanceStats == null) {
return;
}
LinearScoringData data = getScoringDataForCurrentDocument();
if (data.tweetAgeInSeconds > relevanceStats.getOldestScoredTweetAgeInSeconds()) {
relevanceStats.setOldestScoredTweetAgeInSeconds(data.tweetAgeInSeconds);
}
relevanceStats.setNumScored(relevanceStats.getNumScored() + 1);
if (data.scoreReturned == SKIP_HIT) {
relevanceStats.setNumSkipped(relevanceStats.getNumSkipped() + 1);
switch(data.skipReason) {
case ANTIGAMING:
relevanceStats.setNumSkippedForAntiGaming(
relevanceStats.getNumSkippedForAntiGaming() + 1);
break;
case LOW_REPUTATION:
relevanceStats.setNumSkippedForLowReputation(
relevanceStats.getNumSkippedForLowReputation() + 1);
break;
case LOW_TEXT_SCORE:
relevanceStats.setNumSkippedForLowTextScore(
relevanceStats.getNumSkippedForLowTextScore() + 1);
break;
case SOCIAL_FILTER:
relevanceStats.setNumSkippedForSocialFilter(
relevanceStats.getNumSkippedForSocialFilter() + 1);
break;
case LOW_FINAL_SCORE:
relevanceStats.setNumSkippedForLowFinalScore(
relevanceStats.getNumSkippedForLowFinalScore() + 1);
break;
case LOW_RETWEET_COUNT:
break;
default:
LOG.warn("Unknown SkipReason: " + data.skipReason);
}
}
if (data.isFollow) {
relevanceStats.setNumFromDirectFollows(relevanceStats.getNumFromDirectFollows() + 1);
}
if (data.isTrusted) {
relevanceStats.setNumFromTrustedCircle(relevanceStats.getNumFromTrustedCircle() + 1);
}
if (data.isReply) {
relevanceStats.setNumReplies(relevanceStats.getNumReplies() + 1);
if (data.isTrusted) {
relevanceStats.setNumRepliesTrusted(relevanceStats.getNumRepliesTrusted() + 1);
} else if (!data.isFollow) {
relevanceStats.setNumRepliesOutOfNetwork(relevanceStats.getNumRepliesOutOfNetwork() + 1);
}
}
if (data.isSelfTweet) {
relevanceStats.setNumSelfTweets(relevanceStats.getNumSelfTweets() + 1);
}
if (data.hasImageUrl || data.hasVideoUrl) {
relevanceStats.setNumWithMedia(relevanceStats.getNumWithMedia() + 1);
}
if (data.hasNewsUrl) {
relevanceStats.setNumWithNews(relevanceStats.getNumWithNews() + 1);
}
if (data.isUserSpam) {
relevanceStats.setNumSpamUser(relevanceStats.getNumSpamUser() + 1);
}
if (data.isUserNSFW) {
relevanceStats.setNumOffensive(relevanceStats.getNumOffensive() + 1);
}
if (data.isUserBot) {
relevanceStats.setNumBot(relevanceStats.getNumBot() + 1);
}
}
@VisibleForTesting
static final class ExplanationWrapper {
private Explanation explanation;
public Explanation getExplanation() {
return explanation;
}
@Override
public String toString() {
return explanation.toString();
}
}
}