the-algorithm/src/java/com/twitter/search/common/relevance/features/TweetFeatureType.java

292 lines
12 KiB
Java

package com.twitter.search.common.relevance.features;
import java.util.Map;
import java.util.Set;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.ImmutableSet;
import com.twitter.search.common.encoding.features.IntNormalizer;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import static com.twitter.search.common.relevance.features.IntNormalizers.BOOLEAN_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.LEGACY_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.PARUS_SCORE_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.SMART_INTEGER_NORMALIZER;
import static com.twitter.search.common.relevance.features.IntNormalizers.TIMESTAMP_SEC_TO_HR_NORMALIZER;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
/**
* An enum to represent all dynamic/realtime feature types we can update in the Signal Ingester.
* It provides information for their normalization and their corresponding earlybird feature fields
* and provides utils both producer (Signal Ingester) and consumer (Earlybird) side.
*
*/
public enum TweetFeatureType {
RETWEET (true, 0, LEGACY_NORMALIZER,
EarlybirdFieldConstant.RETWEET_COUNT),
REPLY (true, 1, LEGACY_NORMALIZER,
EarlybirdFieldConstant.REPLY_COUNT),
FAVORITE (true, 4, LEGACY_NORMALIZER,
EarlybirdFieldConstant.FAVORITE_COUNT),
PARUS_SCORE (false, 3, PARUS_SCORE_NORMALIZER,
EarlybirdFieldConstant.PARUS_SCORE),
EMBEDS_IMP_COUNT (true, 10, LEGACY_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT),
EMBEDS_URL_COUNT (true, 11, LEGACY_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_URL_COUNT),
VIDEO_VIEW (false, 12, LEGACY_NORMALIZER,
EarlybirdFieldConstant.VIDEO_VIEW_COUNT),
// v2 engagement counters, they will eventually replace v1 counters above
RETWEET_V2 (true, 13, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.RETWEET_COUNT_V2),
REPLY_V2 (true, 14, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.REPLY_COUNT_V2),
FAVORITE_V2 (true, 15, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAVORITE_COUNT_V2),
EMBEDS_IMP_COUNT_V2 (true, 16, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2),
EMBEDS_URL_COUNT_V2 (true, 17, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2),
VIDEO_VIEW_V2 (false, 18, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2),
// other new items
QUOTE (true, 19, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.QUOTE_COUNT),
// weighted engagement counters
WEIGHTED_RETWEET (true, 20, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT),
WEIGHTED_REPLY (true, 21, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT),
WEIGHTED_FAVORITE (true, 22, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT),
WEIGHTED_QUOTE (true, 23, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT),
// tweet-level safety labels
LABEL_ABUSIVE (false, 24, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG),
LABEL_ABUSIVE_HI_RCL (false, 25, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG),
LABEL_DUP_CONTENT (false, 26, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG),
LABEL_NSFW_HI_PRC (false, 27, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG),
LABEL_NSFW_HI_RCL (false, 28, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG),
LABEL_SPAM (false, 29, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_SPAM_FLAG),
LABEL_SPAM_HI_RCL (false, 30, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG),
PERISCOPE_EXISTS (false, 32, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_EXISTS),
PERISCOPE_HAS_BEEN_FEATURED (false, 33, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED),
PERISCOPE_IS_CURRENTLY_FEATURED (false, 34, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED),
PERISCOPE_IS_FROM_QUALITY_SOURCE(false, 35, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE),
PERISCOPE_IS_LIVE (false, 36, BOOLEAN_NORMALIZER,
EarlybirdFieldConstant.PERISCOPE_IS_LIVE),
// decayed engagement counters
DECAYED_RETWEET (true, 37, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_RETWEET_COUNT),
DECAYED_REPLY (true, 38, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_REPLY_COUNT),
DECAYED_FAVORITE (true, 39, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT),
DECAYED_QUOTE (true, 40, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.DECAYED_QUOTE_COUNT),
// timestamp of last engagement types
LAST_RETWEET_SINCE_CREATION_HR (false, 41, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS),
LAST_REPLY_SINCE_CREATION_HR (false, 42, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS),
LAST_FAVORITE_SINCE_CREATION_HR (false, 43, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS),
LAST_QUOTE_SINCE_CREATION_HR (false, 44, TIMESTAMP_SEC_TO_HR_NORMALIZER,
EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS),
// fake engagement counters
FAKE_RETWEET (true, 45, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_RETWEET_COUNT),
FAKE_REPLY (true, 46, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_REPLY_COUNT),
FAKE_FAVORITE (true, 47, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_FAVORITE_COUNT),
FAKE_QUOTE (true, 48, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.FAKE_QUOTE_COUNT),
// blink engagement counters
BLINK_RETWEET (true, 49, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_RETWEET_COUNT),
BLINK_REPLY (true, 50, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_REPLY_COUNT),
BLINK_FAVORITE (true, 51, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_FAVORITE_COUNT),
BLINK_QUOTE (true, 52, SMART_INTEGER_NORMALIZER,
EarlybirdFieldConstant.BLINK_QUOTE_COUNT),
/* semicolon in a single line to avoid polluting git blame */;
private static final Map<TweetFeatureType, TweetFeatureType> V2_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, RETWEET_V2)
.put(REPLY, REPLY_V2)
.put(FAVORITE, FAVORITE_V2)
.put(EMBEDS_IMP_COUNT, EMBEDS_IMP_COUNT_V2)
.put(EMBEDS_URL_COUNT, EMBEDS_URL_COUNT_V2)
.put(VIDEO_VIEW, VIDEO_VIEW_V2)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> WEIGHTED_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, WEIGHTED_RETWEET)
.put(REPLY, WEIGHTED_REPLY)
.put(FAVORITE, WEIGHTED_FAVORITE)
.put(QUOTE, WEIGHTED_QUOTE)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> DECAYED_COUNTER_MAP =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(RETWEET, DECAYED_RETWEET)
.put(REPLY, DECAYED_REPLY)
.put(FAVORITE, DECAYED_FAVORITE)
.put(QUOTE, DECAYED_QUOTE)
.build();
private static final Map<TweetFeatureType, TweetFeatureType> DECAYED_COUNTER_TO_ELAPSED_TIME =
ImmutableMap.<TweetFeatureType, TweetFeatureType>builder()
.put(DECAYED_RETWEET, LAST_RETWEET_SINCE_CREATION_HR)
.put(DECAYED_REPLY, LAST_REPLY_SINCE_CREATION_HR)
.put(DECAYED_FAVORITE, LAST_FAVORITE_SINCE_CREATION_HR)
.put(DECAYED_QUOTE, LAST_QUOTE_SINCE_CREATION_HR)
.build();
private static final Set<TweetFeatureType> DECAYED_FEATURES =
ImmutableSet.of(DECAYED_RETWEET, DECAYED_REPLY, DECAYED_FAVORITE, DECAYED_QUOTE);
private static final Set<TweetFeatureType> FAKE_ENGAGEMENT_FEATURES =
ImmutableSet.of(FAKE_RETWEET, FAKE_REPLY, FAKE_FAVORITE, FAKE_QUOTE);
private static final Set<TweetFeatureType> BLINK_ENGAGEMENT_FEATURES =
ImmutableSet.of(BLINK_RETWEET, BLINK_REPLY, BLINK_FAVORITE, BLINK_QUOTE);
@Nullable
public TweetFeatureType getV2Type() {
return V2_COUNTER_MAP.get(this);
}
@Nullable
public static TweetFeatureType getWeightedType(TweetFeatureType type) {
return WEIGHTED_COUNTER_MAP.get(type);
}
@Nullable
public static TweetFeatureType getDecayedType(TweetFeatureType type) {
return DECAYED_COUNTER_MAP.get(type);
}
// Whether this feature is incremental or direct value.
private final boolean incremental;
// This normalizer is used to (1) normalize the output value in DLIndexEventOutputBolt,
// (2) check value change.
private final IntNormalizer normalizer;
// value for composing cache key. It has to be unique and in increasing order.
private final int typeInt;
private final EarlybirdFieldConstants.EarlybirdFieldConstant earlybirdField;
private final IncrementChecker incrementChecker;
/**
* Constructing an enum for a type. The earlybirdField can be null if it's not prepared, they
* can be here as placeholders but they can't be outputted.
* The normalizer is null for the timestamp features that do not require normalization
*/
TweetFeatureType(boolean incremental,
int typeInt,
IntNormalizer normalizer,
@Nullable EarlybirdFieldConstant earlybirdField) {
this.incremental = incremental;
this.typeInt = typeInt;
this.normalizer = normalizer;
this.earlybirdField = earlybirdField;
this.incrementChecker = new IncrementChecker(this);
}
public boolean isIncremental() {
return incremental;
}
public IntNormalizer getNormalizer() {
return normalizer;
}
public int getTypeInt() {
return typeInt;
}
public int normalize(double value) {
return normalizer.normalize(value);
}
public IncrementChecker getIncrementChecker() {
return incrementChecker;
}
public EarlybirdFieldConstant getEarlybirdField() {
return Preconditions.checkNotNull(earlybirdField);
}
public boolean hasEarlybirdField() {
return earlybirdField != null;
}
public boolean isDecayed() {
return DECAYED_FEATURES.contains(this);
}
@Nullable
public TweetFeatureType getElapsedTimeFeatureType() {
return DECAYED_COUNTER_TO_ELAPSED_TIME.get(this);
}
public boolean isFakeEngagement() {
return FAKE_ENGAGEMENT_FEATURES.contains(this);
}
public boolean isBlinkEngagement() {
return BLINK_ENGAGEMENT_FEATURES.contains(this);
}
/**
* Check if an increment is eligible for emitting
*/
public static class IncrementChecker {
private final IntNormalizer normalizer;
public IncrementChecker(IntNormalizer normalizer) {
this.normalizer = normalizer;
}
IncrementChecker(TweetFeatureType type) {
this(type.getNormalizer());
}
/**
* Check if a value change is eligible for output
*/
public boolean eligibleForEmit(int oldValue, int newValue) {
return normalizer.normalize(oldValue) != normalizer.normalize(newValue);
}
}
}