[docx] split commit for file 4000

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
Ari Archer 2024-01-23 19:14:40 +02:00
parent 0e39f836ae
commit 47a8228a09
No known key found for this signature in database
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 31141 deletions

View File

@ -1,148 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import com.google.common.base.Preconditions;
import com.twitter.search.common.encoding.features.IntegerEncodedFeatures;
import com.twitter.search.common.indexing.thriftjava.PackedFeatures;
import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
import com.twitter.search.common.schema.SchemaUtil;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
/**
 * A class for encoding earlybird features in integers.
 *
 * <p>Each accessor resolves an {@link EarlybirdFieldConstant} to its
 * {@link FeatureConfiguration} through the schema and then delegates to the
 * bit-level operations inherited from {@link IntegerEncodedFeatures}.
 */
public abstract class EarlybirdEncodedFeatures extends IntegerEncodedFeatures {
  private final ImmutableSchemaInterface schema;
  // NOTE(review): stored but never read in this class; subclasses receive it via the
  // constructor parameter instead. Kept as-is in case external tooling relies on it.
  private final EarlybirdFieldConstant baseField;

  public EarlybirdEncodedFeatures(ImmutableSchemaInterface schema,
                                  EarlybirdFieldConstant baseField) {
    this.schema = schema;
    this.baseField = baseField;
  }

  /** Resolves a field constant to its feature configuration through the schema. */
  private FeatureConfiguration featureConfig(EarlybirdFieldConstant field) {
    return schema.getFeatureConfigurationById(field.getFieldId());
  }

  /**
   * Write this object into packedFeatures of the given VersionedTweetFeatures,
   * creating the PackedFeatures container first if it is not set yet.
   */
  public void writeFeaturesToVersionedTweetFeatures(
      VersionedTweetFeatures versionedTweetFeatures) {
    if (!versionedTweetFeatures.isSetPackedFeatures()) {
      versionedTweetFeatures.setPackedFeatures(new PackedFeatures());
    }
    copyToPackedFeatures(versionedTweetFeatures.getPackedFeatures());
  }

  /**
   * Write this object into extendedPackedFeatures of the given VersionedTweetFeatures,
   * creating the PackedFeatures container first if it is not set yet.
   */
  public void writeExtendedFeaturesToVersionedTweetFeatures(
      VersionedTweetFeatures versionedTweetFeatures) {
    if (!versionedTweetFeatures.isSetExtendedPackedFeatures()) {
      versionedTweetFeatures.setExtendedPackedFeatures(new PackedFeatures());
    }
    copyToPackedFeatures(versionedTweetFeatures.getExtendedPackedFeatures());
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("Tweet features: \n");
    for (FeatureConfiguration featureConfig
        : EarlybirdSchemaCreateTool.FEATURE_CONFIGURATION_MAP.values()) {
      sb.append(featureConfig.getName()).append(": ")
          .append(getFeatureValue(featureConfig)).append("\n");
    }
    return sb.toString();
  }

  public boolean isFlagSet(EarlybirdFieldConstant field) {
    return isFlagSet(featureConfig(field));
  }

  public int getFeatureValue(EarlybirdFieldConstant field) {
    return getFeatureValue(featureConfig(field));
  }

  public EarlybirdEncodedFeatures setFlag(EarlybirdFieldConstant field) {
    setFlag(featureConfig(field));
    return this;
  }

  public EarlybirdEncodedFeatures clearFlag(EarlybirdFieldConstant field) {
    clearFlag(featureConfig(field));
    return this;
  }

  public EarlybirdEncodedFeatures setFlagValue(EarlybirdFieldConstant field,
                                               boolean value) {
    setFlagValue(featureConfig(field), value);
    return this;
  }

  public EarlybirdEncodedFeatures setFeatureValue(EarlybirdFieldConstant field,
                                                  int value) {
    setFeatureValue(featureConfig(field), value);
    return this;
  }

  public EarlybirdEncodedFeatures setFeatureValueIfGreater(EarlybirdFieldConstant field,
                                                           int value) {
    setFeatureValueIfGreater(featureConfig(field), value);
    return this;
  }

  public boolean incrementIfNotMaximum(EarlybirdFieldConstant field) {
    return incrementIfNotMaximum(featureConfig(field));
  }

  /** Backs the encoded features with a plain int array sized from the schema. */
  private static final class ArrayEncodedTweetFeatures extends EarlybirdEncodedFeatures {
    private final int[] encodedInts;

    private ArrayEncodedTweetFeatures(ImmutableSchemaInterface schema,
                                      EarlybirdFieldConstant baseField) {
      super(schema, baseField);
      final int numIntegers = SchemaUtil.getCSFFieldFixedLength(schema, baseField.getFieldId());
      Preconditions.checkState(numIntegers > 0);
      this.encodedInts = new int[numIntegers];
    }

    @Override
    public int getNumInts() {
      return encodedInts.length;
    }

    @Override
    public int getInt(int pos) {
      return encodedInts[pos];
    }

    @Override
    public void setInt(int pos, int value) {
      encodedInts[pos] = value;
    }
  }

  /**
   * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field.
   * @param schema the schema for all fields
   * @param baseField base field's constant value
   */
  public static EarlybirdEncodedFeatures newEncodedTweetFeatures(
      ImmutableSchemaInterface schema, EarlybirdFieldConstant baseField) {
    return new ArrayEncodedTweetFeatures(schema, baseField);
  }

  /**
   * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field name.
   * @param schema the schema for all fields
   * @param baseFieldName base field's name
   */
  public static EarlybirdEncodedFeatures newEncodedTweetFeatures(
      ImmutableSchemaInterface schema, String baseFieldName) {
    EarlybirdFieldConstant baseField = EarlybirdFieldConstants.getFieldConstant(baseFieldName);
    Preconditions.checkNotNull(baseField);
    return newEncodedTweetFeatures(schema, baseField);
  }
}

View File

@ -1,36 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import com.twitter.search.common.encoding.docvalues.CSFTypeUtil;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
/**
 * Static helpers for serializing {@link EarlybirdEncodedFeatures} to and from byte arrays.
 */
public final class EarlybirdEncodedFeaturesUtil {
  private EarlybirdEncodedFeaturesUtil() {
  }

  /**
   * Returns a byte array that can be stored in a ThriftDocument as bytesField.
   */
  public static byte[] toBytesForThriftDocument(EarlybirdEncodedFeatures features) {
    final int intCount = features.getNumInts();
    final byte[] serialized = new byte[intCount * Integer.BYTES];
    for (int i = 0; i < intCount; i++) {
      CSFTypeUtil.convertToBytes(serialized, i, features.getInt(i));
    }
    return serialized;
  }

  /**
   * Converts data in a given byte array (starting at the provided offset) into
   * EarlybirdEncodedFeatures.
   */
  public static EarlybirdEncodedFeatures fromBytes(
      ImmutableSchemaInterface schema, EarlybirdFieldConstants.EarlybirdFieldConstant baseField,
      byte[] data, int offset) {
    final EarlybirdEncodedFeatures features =
        EarlybirdEncodedFeatures.newEncodedTweetFeatures(schema, baseField);
    // getNumInts() is fixed by the schema, so it is hoisted out of the loop condition.
    final int intCount = features.getNumInts();
    for (int i = 0; i < intCount; i++) {
      features.setInt(i, CSFTypeUtil.convertFromBytes(data, offset, i));
    }
    return features;
  }
}

View File

@ -1,96 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.SchemaBuilder;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer;
/**
 * Builder used to assemble a ThriftSchema with Earlybird-specific field options.
 */
public class EarlybirdSchemaBuilder extends SchemaBuilder {
  private final EarlybirdCluster cluster;

  public EarlybirdSchemaBuilder(FieldNameToIdMapping idMapping,
                                EarlybirdCluster cluster,
                                TokenStreamSerializer.Version tokenStreamSerializerVersion) {
    super(idMapping, tokenStreamSerializerVersion);
    this.cluster = cluster;
  }

  /** Looks up the field settings already registered for the given field name. */
  private ThriftFieldSettings settingsForField(String fieldName) {
    return schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
  }

  /**
   * Configure the specified field to be Out-of-order.
   * In the realtime cluster, this causes Earlybird to use the skip list posting format.
   */
  public final EarlybirdSchemaBuilder withOutOfOrderEnabledForField(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings fieldSettings = settingsForField(fieldName);
    Preconditions.checkState(fieldSettings.isSetIndexedFieldSettings(),
        "Out of order field must be indexed");
    fieldSettings.getIndexedFieldSettings().setSupportOutOfOrderAppends(true);
    return this;
  }

  /**
   * This turns on tweet specific normalizations. This turns on the following two token processors:
   * {@link com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter}
   * {@link com.twitter.search.common.util.text.filter.NormalizedTokenFilter}
   * <p/>
   * HashtagMentionPunctuationSplitter would break a mention or hashtag like @ab_cd or #ab_cd into
   * tokens {ab, cd}.
   * NormalizedTokenFilter strips out the # @ $ from the tokens.
   */
  public final EarlybirdSchemaBuilder withTweetSpecificNormalization(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings fieldSettings = settingsForField(fieldName);
    Preconditions.checkState(fieldSettings.isSetIndexedFieldSettings(),
        "Tweet text field must be indexed.");
    fieldSettings.getIndexedFieldSettings().setDeprecated_performTweetSpecificNormalizations(true);
    return this;
  }

  /**
   * Add a twitter photo facet field.
   */
  public final EarlybirdSchemaBuilder withPhotoUrlFacetField(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings photoFieldSettings = getNoPositionNoFreqSettings();
    ThriftTokenStreamSerializer attributeSerializer =
        new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
    attributeSerializer.setAttributeSerializerClassNames(
        ImmutableList.<String>of(
            CharTermAttributeSerializer.class.getName(),
            TermPayloadAttributeSerializer.class.getName()));
    photoFieldSettings
        .getIndexedFieldSettings()
        .setTokenStreamSerializer(attributeSerializer)
        .setTokenized(true);
    putIntoFieldConfigs(idMapping.getFieldID(fieldName),
        new ThriftFieldConfiguration(fieldName).setSettings(photoFieldSettings));
    return this;
  }

  /**
   * Returns whether the given field should be included or dropped.
   */
  @Override
  protected boolean shouldIncludeField(String fieldName) {
    // NOTE(review): getFieldConstant returns null for unknown names elsewhere in this
    // package (callers there null-check it); an unknown fieldName here would NPE.
    // Verify callers only pass known field names before relying on this.
    return EarlybirdFieldConstants.getFieldConstant(fieldName).isValidFieldInCluster(cluster);
  }
}

View File

@ -1,702 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.AnalyzerFactory;
import com.twitter.search.common.schema.DynamicSchema;
import com.twitter.search.common.schema.ImmutableSchema;
import com.twitter.search.common.schema.SchemaBuilder;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
import com.twitter.search.common.schema.thriftjava.ThriftSchema;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_FEATURE_UNUSED_BITS_0_24_8;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_16;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_17;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_18;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_19;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_20;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_QUOTE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_TREND_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_NULLCAST_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_REPLY_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_RETWEET_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_TRENDING_NOW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_BOT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NEW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NSFW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_SPAM_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LANGUAGE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_STOCKS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PARUS_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PBLOCK_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_EXISTS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_LIVE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TOXICITY_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.USER_REPUTATION;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT;
/**
* Field configurations for Earlybird.
*/
public final class EarlybirdSchemaCreateTool {
// How many times a schema is built
private static final SearchCounter SCHEMA_BUILD_COUNT =
SearchCounter.export("schema_build_count");
// Number of integers for the column of ENCODED_TWEET_FEATURES_FIELD.
@VisibleForTesting
public static final int NUMBER_OF_INTEGERS_FOR_FEATURES = 5;
// Number of integers for the column of EXTENDED_ENCODED_TWEET_FEATURES_FIELD.
// extra 80 bytes
// In realtime cluster, assuming 19 segments total, and 8388608 docs per segment
// this would amount to about 12.75GB of memory needed
//
@VisibleForTesting
public static final int NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES = 20;
@VisibleForTesting
public static final Map<String, FeatureConfiguration> FEATURE_CONFIGURATION_MAP
= Maps.newLinkedHashMap();
public static final String BASE_FIELD_NAME =
EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName();
/**
 * Returns the portion of {@code fullName} before the CSF view-name separator.
 * Fails a precondition check if the separator is absent or at position 0.
 */
private static String getBaseFieldName(String fullName) {
  final int sepIndex = fullName.indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR);
  Preconditions.checkArgument(sepIndex > 0);
  return fullName.substring(0, sepIndex);
}
/** Convenience overload: extracts the base field name from a field constant's full name. */
private static String getBaseFieldName(EarlybirdFieldConstant fieldConstant) {
return getBaseFieldName(fieldConstant.getFieldName());
}
/**
 * Returns the feature-name suffix of the constant's full field name, i.e. everything
 * after the CSF view-name separator. Fails a precondition check if the separator is
 * absent or at position 0.
 */
private static String getFeatureNameInField(EarlybirdFieldConstant fieldConstant) {
  final String fullName = fieldConstant.getFieldName();
  final int sepIndex = fullName.indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR);
  Preconditions.checkArgument(sepIndex > 0);
  return fullName.substring(sepIndex + 1);
}
// defining all features
static {
// Add individual tweet encoded features as views on top of
// EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD
// int intIndex, int bitStartPos, int bitLength
newEarlybirdFeatureConfiguration(IS_RETWEET_FLAG, ThriftCSFType.BOOLEAN, 0, 0, 1);
newEarlybirdFeatureConfiguration(IS_OFFENSIVE_FLAG, ThriftCSFType.BOOLEAN, 0, 1, 1);
newEarlybirdFeatureConfiguration(HAS_LINK_FLAG, ThriftCSFType.BOOLEAN, 0, 2, 1);
newEarlybirdFeatureConfiguration(HAS_TREND_FLAG, ThriftCSFType.BOOLEAN, 0, 3, 1);
newEarlybirdFeatureConfiguration(IS_REPLY_FLAG, ThriftCSFType.BOOLEAN, 0, 4, 1);
newEarlybirdFeatureConfiguration(IS_SENSITIVE_CONTENT, ThriftCSFType.BOOLEAN, 0, 5, 1);
newEarlybirdFeatureConfiguration(HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
ThriftCSFType.BOOLEAN, 0, 6, 1);
newEarlybirdFeatureConfiguration(FROM_VERIFIED_ACCOUNT_FLAG, ThriftCSFType.BOOLEAN, 0, 7, 1);
newEarlybirdFeatureConfiguration(TEXT_SCORE, ThriftCSFType.INT, 0, 8, 8);
newEarlybirdFeatureConfiguration(LANGUAGE, ThriftCSFType.INT, 0, 16, 8);
newEarlybirdFeatureConfiguration(LINK_LANGUAGE, ThriftCSFType.INT, 0, 24, 8);
newEarlybirdFeatureConfiguration(HAS_IMAGE_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 0, 1);
newEarlybirdFeatureConfiguration(HAS_VIDEO_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 1, 1);
newEarlybirdFeatureConfiguration(HAS_NEWS_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 2, 1);
newEarlybirdFeatureConfiguration(HAS_EXPANDO_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 3, 1);
newEarlybirdFeatureConfiguration(HAS_MULTIPLE_MEDIA_FLAG, ThriftCSFType.BOOLEAN, 1, 4, 1);
newEarlybirdFeatureConfiguration(PROFILE_IS_EGG_FLAG, ThriftCSFType.BOOLEAN, 1, 5, 1);
newEarlybirdFeatureConfiguration(NUM_MENTIONS, ThriftCSFType.INT, 1, 6, 2); // 0, 1, 2, 3+
newEarlybirdFeatureConfiguration(NUM_HASHTAGS, ThriftCSFType.INT, 1, 8, 2); // 0, 1, 2, 3+
newEarlybirdFeatureConfiguration(HAS_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 10, 1);
newEarlybirdFeatureConfiguration(HAS_VISIBLE_LINK_FLAG, ThriftCSFType.BOOLEAN, 1, 11, 1);
newEarlybirdFeatureConfiguration(USER_REPUTATION, ThriftCSFType.INT, 1, 12, 8);
newEarlybirdFeatureConfiguration(IS_USER_SPAM_FLAG, ThriftCSFType.BOOLEAN, 1, 20, 1);
newEarlybirdFeatureConfiguration(IS_USER_NSFW_FLAG, ThriftCSFType.BOOLEAN, 1, 21, 1);
newEarlybirdFeatureConfiguration(IS_USER_BOT_FLAG, ThriftCSFType.BOOLEAN, 1, 22, 1);
newEarlybirdFeatureConfiguration(IS_USER_NEW_FLAG, ThriftCSFType.BOOLEAN, 1, 23, 1);
newEarlybirdFeatureConfiguration(PREV_USER_TWEET_ENGAGEMENT, ThriftCSFType.INT, 1, 24, 6);
newEarlybirdFeatureConfiguration(COMPOSER_SOURCE_IS_CAMERA_FLAG,
ThriftCSFType.BOOLEAN, 1, 30, 1);
newEarlybirdFeatureConfiguration(IS_NULLCAST_FLAG, ThriftCSFType.BOOLEAN, 1, 31, 1);
newEarlybirdFeatureConfiguration(RETWEET_COUNT, ThriftCSFType.DOUBLE, 2, 0, 8,
ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(FAVORITE_COUNT, ThriftCSFType.DOUBLE, 2, 8, 8,
ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(REPLY_COUNT, ThriftCSFType.DOUBLE, 2, 16, 8,
ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(PARUS_SCORE, ThriftCSFType.DOUBLE, 2, 24, 8);
newEarlybirdFeatureConfiguration(HAS_CONSUMER_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 0, 1);
newEarlybirdFeatureConfiguration(HAS_PRO_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 1, 1);
newEarlybirdFeatureConfiguration(HAS_VINE_FLAG, ThriftCSFType.BOOLEAN, 3, 2, 1);
newEarlybirdFeatureConfiguration(HAS_PERISCOPE_FLAG, ThriftCSFType.BOOLEAN, 3, 3, 1);
newEarlybirdFeatureConfiguration(HAS_NATIVE_IMAGE_FLAG, ThriftCSFType.BOOLEAN, 3, 4, 1);
// NOTE: There are 3 bits left in the first byte of INT 3, if possible, please reserve them
// for future media types (SEARCH-9131)
// newEarlybirdFeatureConfiguration(FUTURE_MEDIA_BITS, ThriftCSFType.INT, 3, 5, 3);
newEarlybirdFeatureConfiguration(VISIBLE_TOKEN_RATIO, ThriftCSFType.INT, 3, 8, 4);
newEarlybirdFeatureConfiguration(HAS_QUOTE_FLAG, ThriftCSFType.BOOLEAN, 3, 12, 1);
newEarlybirdFeatureConfiguration(FROM_BLUE_VERIFIED_ACCOUNT_FLAG,
ThriftCSFType.BOOLEAN, 3, 13, 1);
// Unused bits from bit 14 to bit 31 (18 bits)
// newEarlybirdFeatureConfiguration(UNUSED_BITS, ThriftCSFType.INT, 3, 14, 18);
newEarlybirdFeatureConfiguration(TWEET_SIGNATURE, ThriftCSFType.INT, 4, 0, 32);
newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT,
ThriftCSFType.DOUBLE, 0, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT,
ThriftCSFType.DOUBLE, 0, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT,
ThriftCSFType.DOUBLE, 0, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
// Unused bits from bit 24 to bit 31 (8 bits).
// This used to be a feature that was decommissioned (SEARCHQUAL-10321)
newEarlybirdFeatureConfiguration(EXTENDED_FEATURE_UNUSED_BITS_0_24_8,
ThriftCSFType.INT, 0, 24, 8);
newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT,
ThriftCSFType.INT, 1, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE);
newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT,
ThriftCSFType.INT, 2, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE);
newEarlybirdFeatureConfiguration(RETWEET_COUNT_V2,
ThriftCSFType.DOUBLE, 3, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(FAVORITE_COUNT_V2,
ThriftCSFType.DOUBLE, 3, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(REPLY_COUNT_V2,
ThriftCSFType.DOUBLE, 3, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT_V2,
ThriftCSFType.DOUBLE, 3, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT_V2,
ThriftCSFType.DOUBLE, 4, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT_V2,
ThriftCSFType.DOUBLE, 4, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(QUOTE_COUNT,
ThriftCSFType.DOUBLE, 4, 16, 8);
newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_FLAG, ThriftCSFType.BOOLEAN, 4, 24, 1);
newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 25, 1);
newEarlybirdFeatureConfiguration(LABEL_DUP_CONTENT_FLAG, ThriftCSFType.BOOLEAN, 4, 26, 1);
newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_PRC_FLAG, ThriftCSFType.BOOLEAN, 4, 27, 1);
newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 28, 1);
newEarlybirdFeatureConfiguration(LABEL_SPAM_FLAG, ThriftCSFType.BOOLEAN, 4, 29, 1);
newEarlybirdFeatureConfiguration(LABEL_SPAM_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 30, 1);
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1,
ThriftCSFType.INT, 4, 31, 1);
newEarlybirdFeatureConfiguration(WEIGHTED_RETWEET_COUNT,
ThriftCSFType.DOUBLE, 5, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(WEIGHTED_REPLY_COUNT,
ThriftCSFType.DOUBLE, 5, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(WEIGHTED_FAVORITE_COUNT,
ThriftCSFType.DOUBLE, 5, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(WEIGHTED_QUOTE_COUNT,
ThriftCSFType.DOUBLE, 5, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(PERISCOPE_EXISTS,
ThriftCSFType.BOOLEAN, 6, 0, 1);
newEarlybirdFeatureConfiguration(PERISCOPE_HAS_BEEN_FEATURED,
ThriftCSFType.BOOLEAN, 6, 1, 1);
newEarlybirdFeatureConfiguration(PERISCOPE_IS_CURRENTLY_FEATURED,
ThriftCSFType.BOOLEAN, 6, 2, 1);
newEarlybirdFeatureConfiguration(PERISCOPE_IS_FROM_QUALITY_SOURCE,
ThriftCSFType.BOOLEAN, 6, 3, 1);
newEarlybirdFeatureConfiguration(PERISCOPE_IS_LIVE,
ThriftCSFType.BOOLEAN, 6, 4, 1);
newEarlybirdFeatureConfiguration(IS_TRENDING_NOW_FLAG,
ThriftCSFType.BOOLEAN, 6, 5, 1);
// remaining bits for integer 6
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26,
ThriftCSFType.INT, 6, 6, 26);
// The decaying counters can become smaller
newEarlybirdFeatureConfiguration(DECAYED_RETWEET_COUNT,
ThriftCSFType.DOUBLE, 7, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(DECAYED_REPLY_COUNT,
ThriftCSFType.DOUBLE, 7, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(DECAYED_FAVORITE_COUNT,
ThriftCSFType.DOUBLE, 7, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(DECAYED_QUOTE_COUNT,
ThriftCSFType.DOUBLE, 7, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);
// The fake engagement counters.
newEarlybirdFeatureConfiguration(FAKE_RETWEET_COUNT,
ThriftCSFType.DOUBLE, 8, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(FAKE_REPLY_COUNT,
ThriftCSFType.DOUBLE, 8, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(FAKE_FAVORITE_COUNT,
ThriftCSFType.DOUBLE, 8, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(FAKE_QUOTE_COUNT,
ThriftCSFType.DOUBLE, 8, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(LAST_RETWEET_SINCE_CREATION_HRS,
ThriftCSFType.INT, 9, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(LAST_REPLY_SINCE_CREATION_HRS,
ThriftCSFType.INT, 9, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(LAST_FAVORITE_SINCE_CREATION_HRS,
ThriftCSFType.INT, 9, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(LAST_QUOTE_SINCE_CREATION_HRS,
ThriftCSFType.INT, 9, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
newEarlybirdFeatureConfiguration(NUM_HASHTAGS_V2,
ThriftCSFType.INT, 10, 0, 4);
newEarlybirdFeatureConfiguration(NUM_MENTIONS_V2,
ThriftCSFType.INT, 10, 4, 4);
newEarlybirdFeatureConfiguration(NUM_STOCKS,
ThriftCSFType.INT, 10, 8, 4);
// Remaining bits for integer 10
// Production Toxicity and PBlock score from HML (go/toxicity, go/pblock)
newEarlybirdFeatureConfiguration(TOXICITY_SCORE,
ThriftCSFType.DOUBLE, 10, 12, 10);
newEarlybirdFeatureConfiguration(PBLOCK_SCORE,
ThriftCSFType.DOUBLE, 10, 22, 10);
// The blink engagement counters
newEarlybirdFeatureConfiguration(BLINK_RETWEET_COUNT,
ThriftCSFType.DOUBLE, 11, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(BLINK_REPLY_COUNT,
ThriftCSFType.DOUBLE, 11, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(BLINK_FAVORITE_COUNT,
ThriftCSFType.DOUBLE, 11, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
newEarlybirdFeatureConfiguration(BLINK_QUOTE_COUNT,
ThriftCSFType.DOUBLE, 11, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);
// Experimental health model scores from HML
newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_1,
ThriftCSFType.DOUBLE, 12, 0, 10);
newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_2,
ThriftCSFType.DOUBLE, 12, 10, 10);
newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_3,
ThriftCSFType.DOUBLE, 12, 20, 10);
// remaining bits for integer 12
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2,
ThriftCSFType.INT, 12, 30, 2);
// Experimental health model scores from HML (cont.)
newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_4,
ThriftCSFType.DOUBLE, 13, 0, 10);
// Production pSpammyTweet score from HML (go/pspammytweet)
newEarlybirdFeatureConfiguration(P_SPAMMY_TWEET_SCORE,
ThriftCSFType.DOUBLE, 13, 10, 10);
// Production pReportedTweet score from HML (go/preportedtweet)
newEarlybirdFeatureConfiguration(P_REPORTED_TWEET_SCORE,
ThriftCSFType.DOUBLE, 13, 20, 10);
// remaining bits for integer 13
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2,
ThriftCSFType.INT, 13, 30, 2);
// Experimental health model scores from HML (cont.)
// Prod Spammy Tweet Content model score from Platform Manipulation (go/spammy-tweet-content)
newEarlybirdFeatureConfiguration(SPAMMY_TWEET_CONTENT_SCORE,
ThriftCSFType.DOUBLE, 14, 0, 10);
// remaining bits for integer 14
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22,
ThriftCSFType.INT, 14, 10, 22);
// Note that the integer index below is 0-based, but the index j in UNUSED_BITS_{j} below
// is 1-based.
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_16,
ThriftCSFType.INT, 15, 0, 32);
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_17,
ThriftCSFType.INT, 16, 0, 32);
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_18,
ThriftCSFType.INT, 17, 0, 32);
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_19,
ThriftCSFType.INT, 18, 0, 32);
newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_20,
ThriftCSFType.INT, 19, 0, 32);
}
// Utility class: all members are static, so instantiation is disallowed.
private EarlybirdSchemaCreateTool() { }
/**
 * Builds the dynamic Earlybird schema for the given cluster.
 *
 * @param cluster the Earlybird cluster the schema is built for
 * @return a {@link DynamicSchema} wrapping the immutable schema for this cluster
 * @throws Schema.SchemaValidationException if the generated thrift schema fails validation
 */
public static DynamicSchema buildSchema(EarlybirdCluster cluster)
    throws Schema.SchemaValidationException {
  SCHEMA_BUILD_COUNT.increment();
  ImmutableSchema immutableSchema = new ImmutableSchema(
      buildThriftSchema(cluster),
      new AnalyzerFactory(),
      cluster.getNameForStats());
  return new DynamicSchema(immutableSchema);
}
/**
 * Builds the Earlybird schema for the given cluster, wrapping any schema validation
 * failure in an unchecked exception. Intended for static-schema callers that never
 * process schema updates and therefore cannot recover from a validation error anyway.
 */
@VisibleForTesting
public static DynamicSchema buildSchemaWithRuntimeException(EarlybirdCluster cluster) {
  try {
    return buildSchema(cluster);
  } catch (Schema.SchemaValidationException validationFailure) {
    throw new RuntimeException(validationFailure);
  }
}
/**
 * Creates a {@link FeatureConfiguration} for one encoded feature and registers it in
 * {@code FEATURE_CONFIGURATION_MAP}, keyed by the field's full name.
 *
 * @param fieldConstant the feature field being configured
 * @param type          thrift CSF type; must be BOOLEAN if and only if the field is a flag
 * @param intIndex      index of the 32-bit integer that holds this feature
 * @param bitStartPos   first bit of the feature inside that integer
 * @param bitLength     number of bits the feature occupies
 * @param constraints   optional update constraints (e.g. increment-only)
 */
private static FeatureConfiguration newEarlybirdFeatureConfiguration(
    EarlybirdFieldConstant fieldConstant,
    ThriftCSFType type,
    int intIndex, int bitStartPos, int bitLength,
    ThriftFeatureUpdateConstraint... constraints) {
  // Flag fields and the BOOLEAN thrift type must go together.
  boolean isFlag = fieldConstant.isFlagFeatureField();
  if (!isFlag && type == ThriftCSFType.BOOLEAN) {
    throw new IllegalArgumentException(
        "Non-flag feature field configured with boolean Thrift type: " + fieldConstant);
  }
  if (isFlag && type != ThriftCSFType.BOOLEAN) {
    throw new IllegalArgumentException(
        "Flag feature field configured with non-boolean Thrift type: " + fieldConstant);
  }
  FeatureConfiguration.Builder featureBuilder = FeatureConfiguration.builder()
      .withName(getFeatureNameInField(fieldConstant))
      .withType(type)
      .withBitRange(intIndex, bitStartPos, bitLength);
  // remove the following line once we configure features purely by the schema
  featureBuilder.withBaseField(getBaseFieldName(fieldConstant));
  if (!fieldConstant.isUnusedField()) {
    featureBuilder.withOutputType(type);
  }
  if (fieldConstant.getFeatureNormalizationType() != null) {
    featureBuilder.withFeatureNormalizationType(fieldConstant.getFeatureNormalizationType());
  }
  for (ThriftFeatureUpdateConstraint updateConstraint : constraints) {
    featureBuilder.withFeatureUpdateConstraint(updateConstraint);
  }
  FeatureConfiguration configuration = featureBuilder.build();
  FEATURE_CONFIGURATION_MAP.put(fieldConstant.getFieldName(), configuration);
  return configuration;
}
/**
 * Build the ThriftSchema for Earlybird. Note that the schema returned can be used by
 * all Earlybird clusters. However, some clusters may not use all the field configurations.
 *
 * @param cluster the cluster the schema is being built for; affects CSF RAM-loading
 *                defaults and which feature configurations are registered
 * @return the fully configured thrift schema
 */
@VisibleForTesting
public static ThriftSchema buildThriftSchema(EarlybirdCluster cluster) {
EarlybirdSchemaBuilder builder = new EarlybirdSchemaBuilder(
new EarlybirdFieldConstants(), cluster, TokenStreamSerializer.Version.VERSION_2);
// The schema version is tied to the current index flush version.
builder.withSchemaVersion(
FlushVersion.CURRENT_FLUSH_VERSION.getVersionNumber(),
FlushVersion.CURRENT_FLUSH_VERSION.getMinorVersion(),
FlushVersion.CURRENT_FLUSH_VERSION.getDescription(),
FlushVersion.CURRENT_FLUSH_VERSION.isOfficial());
// ID field, used for partitioning
builder.withPartitionFieldId(0)
.withSortableLongTermField(EarlybirdFieldConstant.ID_FIELD.getFieldName())
// Text Fields that are searched by default, each with its default search weight
.withTextField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), true)
.withSearchFieldByDefault(
EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), 0.1f)
.withPretokenizedTextField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), true)
.withSearchFieldByDefault(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), 1.0f);
builder.withTweetSpecificNormalization(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())
.withTextField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), true)
.withSearchFieldByDefault(
EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), 0.2f)
// Text fields not searched by default
.withTextField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), false)
.withTextField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), false)
// cards are not searched by default, and have weight 0.
.withPretokenizedTextField(EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), false)
.withPretokenizedTextField(
EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), false)
.withTextField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), false)
// Out-of-order append fields
.withLongTermField(EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
.withLongTermField(EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName())
// No Position fields, sorted alphabetically
.withPretokenizedNoPositionField(EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName())
.withIntTermField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName())
.withTextField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), false)
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName())
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName())
.withTextField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(), false)
.withTextField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(), false)
// Term-to-text lookups let these ID fields be resolved back to their term text.
.withTermTextLookup(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
.withTermTextLookup(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
.withPretokenizedNoPositionField(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName())
.withIndexedNotTokenizedField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD)
.withIndexedNotTokenizedField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
.withIndexedNotTokenizedField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName())
.withIntTermField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName())
.withPretokenizedNoPositionField(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName())
.withIndexedNotTokenizedField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName())
.withIntTermField(NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
.withIntTermField(NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
.withIntTermField(NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
.withIntTermField(EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName())
.withLongTermField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName())
.withLongTermField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName())
// Named entity fields
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD.getFieldName(), true)
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD.getFieldName(), true)
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), true)
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), true)
// camelCase-tokenized user handles and tokenized user names, not searchable by default
.withPretokenizedTextField(
EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(), false)
.withPretokenizedTextField(
EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), false)
// Spaces fields
.withIndexedNotTokenizedField(
EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName())
.withTextField(EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(), false)
.withPretokenizedTextField(EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(), false)
.withTextField(EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), true)
.withPretokenizedTextField(
EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), false)
.withPretokenizedTextField(
EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(), false)
.withPretokenizedTextField(
EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(), false)
.withPretokenizedTextField(
EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(), false);
// Facet and out-of-order update settings.
builder
.withPhotoUrlFacetField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName())
.withOutOfOrderEnabledForField(
EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
.withOutOfOrderEnabledForField(
EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
.withOutOfOrderEnabledForField(
EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName());
// ColumnStrideFields.
// The full archive cluster keeps CSFs on disk by default; all other clusters load them
// into RAM (some fields override this, as noted inline below).
boolean loadCSFIntoRAMDefault = cluster != EarlybirdCluster.FULL_ARCHIVE;
builder
.withColumnStrideField(EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME,
ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_FEATURES,
true, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
.withColumnStrideField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
ThriftCSFType.BYTE, 1, false, loadCSFIntoRAMDefault)
// CSF Used by archive mappers
.withColumnStrideField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(),
ThriftCSFType.INT, 1, false, /* the full archive loads this field into RAM */ true)
.withColumnStrideField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(),
ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
.withColumnStrideField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(),
ThriftCSFType.INT, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
.withColumnStrideField(
EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
/* Semicolon on separate line to preserve git blame. */;
builder.withColumnStrideField(
EarlybirdFieldConstants.EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME,
ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES,
true, loadCSFIntoRAMDefault);
// Register every previously built feature configuration that is valid in this cluster,
// attached to its base encoded-features field.
for (Map.Entry<String, FeatureConfiguration> entry : FEATURE_CONFIGURATION_MAP.entrySet()) {
String fullName = entry.getKey();
String baseName = getBaseFieldName(fullName);
EarlybirdFieldConstant fieldConstant = EarlybirdFieldConstants.getFieldConstant(fullName);
if (fieldConstant.isValidFieldInCluster(cluster)) {
builder.withFeatureConfiguration(baseName, fullName, entry.getValue());
}
}
// Add facet settings for facet fields
// boolean args are respectively whether to use skiplist, whether offensive, whether to use CSF
builder
.withFacetConfigs(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(),
EarlybirdFieldConstant.MENTIONS_FACET, true, false, false)
.withFacetConfigs(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(),
EarlybirdFieldConstant.HASHTAGS_FACET, true, false, false)
.withFacetConfigs(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(),
EarlybirdFieldConstant.STOCKS_FACET, true, false, false)
.withFacetConfigs(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(),
EarlybirdFieldConstant.IMAGES_FACET, true, true, false)
.withFacetConfigs(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(),
EarlybirdFieldConstant.VIDEOS_FACET, true, true, false)
.withFacetConfigs(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(),
EarlybirdFieldConstant.NEWS_FACET, true, false, false)
.withFacetConfigs(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(),
EarlybirdFieldConstant.LANGUAGES_FACET, false, false, false)
.withFacetConfigs(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
EarlybirdFieldConstant.SOURCES_FACET, false, false, false)
.withFacetConfigs(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(),
EarlybirdFieldConstant.TWIMG_FACET, true, true, false)
.withFacetConfigs(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
EarlybirdFieldConstant.FROM_USER_ID_FACET, false, false, true /* facet on CSF */)
.withFacetConfigs(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
EarlybirdFieldConstant.RETWEETS_FACET, false, false, true /* facet on CSF */)
.withFacetConfigs(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(),
EarlybirdFieldConstant.LINKS_FACET, true, false, false)
.withFacetConfigs(
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(),
true, false, false)
.withFacetConfigs(
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(),
true, false, false)
.withFacetConfigs(
EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
true, false, false)
.withFacetConfigs(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(),
EarlybirdFieldConstant.SPACES_FACET, true, false, false);
return builder.build();
}
}

View File

@ -1,897 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.collections.Pair;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntity;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntityContext;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntityInputSourceType;
import com.twitter.cuad.ner.thriftjava.WholeEntityType;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource;
import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.ThriftDocumentBuilder;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer;
import com.twitter.search.common.util.analysis.TwitterPhotoTokenStream;
import com.twitter.search.common.util.spatial.GeoUtil;
import com.twitter.search.common.util.text.TokenizerHelper;
import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
import com.twitter.search.common.util.text.regex.Regex;
import com.twitter.search.common.util.url.LinkVisibilityUtils;
import com.twitter.search.common.util.url.URLUtils;
import geo.google.datamodel.GeoAddressAccuracy;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
/**
* Builder class for building a {@link ThriftDocument}.
*/
public final class EarlybirdThriftDocumentBuilder extends ThriftDocumentBuilder {
private static final Logger LOG = LoggerFactory.getLogger(EarlybirdThriftDocumentBuilder.class);
// Counts token-stream serialization failures that are not Penguin-version dependent.
private static final SearchCounter SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT =
SearchCounter.export("tokenstream_serialization_failure_non_penguin_dependent");
// Prefixes prepended to indexed terms for the corresponding entity types.
private static final String HASHTAG_SYMBOL = "#";
private static final String CASHTAG_SYMBOL = "$";
private static final String MENTION_SYMBOL = "@";
// NOTE(review): presumably counts documents whose language arrived as a BCP-47
// language tag — confirm against the incrementing call site.
private static final SearchCounter BCP47_LANGUAGE_TAG_COUNTER =
SearchCounter.export("bcp47_language_tag");
/**
* Used to check if a card is a video card.
*
* @see #withSearchCard
*/
private static final String AMPLIFY_CARD_NAME = "amplify";
private static final String PLAYER_CARD_NAME = "player";
// Extra term indexed for native retweets, to ensure that the "-rt" query excludes them.
public static final String RETWEET_TERM = "rt";
public static final String QUESTION_MARK = "?";
// Named-entity sources derived from a shared URL (its title and description).
private static final Set<NamedEntityInputSourceType> NAMED_ENTITY_URL_SOURCE_TYPES =
ImmutableSet.of(
NamedEntityInputSourceType.URL_TITLE, NamedEntityInputSourceType.URL_DESCRIPTION);
// Serializer for token streams carrying IntTermAttributes.
private final TokenStreamSerializer intTermAttributeSerializer =
new TokenStreamSerializer(ImmutableList.of(
new IntTermAttributeSerializer()));
// Serializer for photo-URL token streams: char terms plus a term payload
// (the photo's status ID; see withTwimgURLs).
private final TokenStreamSerializer photoUrlSerializer =
new TokenStreamSerializer(ImmutableList
.<TokenStreamSerializer.AttributeSerializer>of(
new CharTermAttributeSerializer(), new TermPayloadAttributeSerializer()));
// Schema used to look up feature configurations (e.g. max feature values).
private final Schema schema;
// Builder state. NOTE(review): these flags appear to control whether default lat/lon CSF
// and encoded-features fields are added when the document is built — confirm in build().
private boolean isSetLatLonCSF = false;
private boolean addLatLonCSF = true;
private boolean addEncodedTweetFeatures = true;
// Packed feature vectors; the extended set may be absent.
@Nonnull
private final EarlybirdEncodedFeatures encodedTweetFeatures;
@Nullable
private final EarlybirdEncodedFeatures extendedEncodedTweetFeatures;
/**
* Creates a document builder.
*
* @param encodedTweetFeatures required packed feature vector for the tweet; must not be null
* @param extendedEncodedTweetFeatures optional extended packed feature vector; may be null
* @param idMapping maps field names to field IDs
* @param schema schema used to look up feature configurations
*/
public EarlybirdThriftDocumentBuilder(
@Nonnull EarlybirdEncodedFeatures encodedTweetFeatures,
@Nullable EarlybirdEncodedFeatures extendedEncodedTweetFeatures,
FieldNameToIdMapping idMapping,
Schema schema) {
super(idMapping);
this.schema = schema;
this.encodedTweetFeatures = Preconditions.checkNotNull(encodedTweetFeatures);
this.extendedEncodedTweetFeatures = extendedEncodedTweetFeatures;
}
/**
* Returns the internal {@link EarlybirdEncodedFeatures} backing this builder.
*/
public EarlybirdEncodedFeatures getEncodedTweetFeatures() {
return encodedTweetFeatures;
}
/**
 * Adds a facet skip-list entry for the given field by indexing the synthetic term
 * {@code __has_<fieldName>} into the INTERNAL field.
 */
public EarlybirdThriftDocumentBuilder addFacetSkipList(String fieldName) {
  String skipListTerm = EarlybirdFieldConstant.getFacetSkipFieldName(fieldName);
  withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), skipListTerm);
  return this;
}
/**
 * Indexes a filter term (formatted by {@code EarlybirdThriftDocumentUtil.formatFilter})
 * into the INTERNAL field.
 */
public EarlybirdThriftDocumentBuilder addFilterInternalFieldTerm(String filterName) {
  String filterTerm = EarlybirdThriftDocumentUtil.formatFilter(filterName);
  withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), filterTerm);
  return this;
}
/**
 * Sets the tweet ID in both the indexed ID field and its column-stride (CSF) counterpart.
 */
public EarlybirdThriftDocumentBuilder withID(long id) {
  String idField = EarlybirdFieldConstant.ID_FIELD.getFieldName();
  String idCsfField = EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName();
  withLongField(idField, id);
  withLongField(idCsfField, id);
  return this;
}
/**
 * Sets the creation timestamp in both the indexed created-at field and its
 * column-stride (CSF) counterpart.
 */
public EarlybirdThriftDocumentBuilder withCreatedAt(int createdAt) {
  String createdAtField = EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName();
  String createdAtCsfField = EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName();
  withIntField(createdAtField, createdAt);
  withIntField(createdAtCsfField, createdAt);
  return this;
}
/**
 * Adds the tweet text field, optionally with a pre-serialized token stream.
 *
 * @param textTokenStream serialized token stream for the text, or null if none
 */
public EarlybirdThriftDocumentBuilder withTweetText(
    String text, byte[] textTokenStream) throws IOException {
  String textFieldName = EarlybirdFieldConstant.TEXT_FIELD.getFieldName();
  withTokenStreamField(textFieldName, text, textTokenStream);
  return this;
}
/** Adds the tweet text field without a pre-serialized token stream. */
public EarlybirdThriftDocumentBuilder withTweetText(String text) throws IOException {
  return withTweetText(text, null);
}
/**
 * Adds cashtag terms (e.g. $TWTR) to the stocks field, each prefixed with "$",
 * plus a facet skip-list entry when any are present.
 */
public EarlybirdThriftDocumentBuilder withStocksFields(List<String> cashTags) {
  if (isNotEmpty(cashTags)) {
    String stocksField = EarlybirdFieldConstant.STOCKS_FIELD.getFieldName();
    addFacetSkipList(stocksField);
    for (String symbol : cashTags) {
      withStringField(stocksField, CASHTAG_SYMBOL + symbol);
    }
  }
  return this;
}
/**
 * Adds hashtag terms (each prefixed with "#") and records the hashtag count feature,
 * clamped to the configured maximum of NUM_HASHTAGS.
 */
public EarlybirdThriftDocumentBuilder withHashtagsField(List<String> hashtags) {
  if (isNotEmpty(hashtags)) {
    int maxHashtags = schema.getFeatureConfigurationById(
        EarlybirdFieldConstant.NUM_HASHTAGS.getFieldId()).getMaxValue();
    encodedTweetFeatures.setFeatureValue(
        EarlybirdFieldConstant.NUM_HASHTAGS, Math.min(hashtags.size(), maxHashtags));
    addFacetSkipList(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName());
    for (String hashtag : hashtags) {
      withStringField(
          EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(), HASHTAG_SYMBOL + hashtag);
    }
  }
  return this;
}
/**
 * Adds mention terms (each prefixed with "@") and records the mention count feature,
 * clamped to the configured maximum of NUM_MENTIONS.
 *
 * <p>Bug fix: the clamp previously read the max value from the NUM_HASHTAGS feature
 * configuration — an apparent copy-paste from {@link #withHashtagsField} — even though
 * the feature being set is NUM_MENTIONS. It now uses the NUM_MENTIONS configuration.
 */
public EarlybirdThriftDocumentBuilder withMentionsField(List<String> mentions) {
  if (isNotEmpty(mentions)) {
    // Clamp to the bit-width limit of the feature actually being written.
    int numMentions = Math.min(
        mentions.size(),
        schema.getFeatureConfigurationById(
            EarlybirdFieldConstant.NUM_MENTIONS.getFieldId()).getMaxValue());
    encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS, numMentions);
    addFacetSkipList(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName());
    for (String mention : mentions) {
      withStringField(
          EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(), MENTION_SYMBOL + mention);
    }
  }
  return this;
}
/**
 * Indexes Twitter photo (twimg) URLs. Unlike regular URLs these go through the
 * TwitterPhotoTokenStream, which carries the photo's status ID as the term payload;
 * when any are present the image-related feature flags are set.
 */
public EarlybirdThriftDocumentBuilder withTwimgURLs(
    List<TwitterPhotoUrl> urls) throws IOException {
  if (isNotEmpty(urls)) {
    String twimgField = EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName();
    for (TwitterPhotoUrl photoUrl : urls) {
      TokenStream photoTokenStream = new TwitterPhotoTokenStream(
          photoUrl.getPhotoStatusId(), photoUrl.getMediaUrl());
      byte[] serializedStream = photoUrlSerializer.serialize(photoTokenStream);
      withTokenStreamField(
          twimgField, Long.toString(photoUrl.getPhotoStatusId()), serializedStream);
      addFacetSkipList(twimgField);
    }
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
  }
  return this;
}
/**
 * Add a list of URLs. This also add facet skip list terms for news / images / videos if needed.
 */
public EarlybirdThriftDocumentBuilder withURLs(List<ThriftExpandedUrl> urls) {
  if (isNotEmpty(urls)) {
    // Collect every distinct normalized URL form so each link term is indexed exactly once.
    Set<String> dedupedLinks = Sets.newHashSet();
    for (ThriftExpandedUrl expandedUrl : urls) {
      if (expandedUrl.isSetOriginalUrl()) {
        String normalizedOriginalUrl = URLUtils.normalizePath(expandedUrl.getOriginalUrl());
        dedupedLinks.add(normalizedOriginalUrl);
      }
      if (expandedUrl.isSetExpandedUrl()) {
        dedupedLinks.add(URLUtils.normalizePath(expandedUrl.getExpandedUrl()));
      }
      // Only the canonical last-hop URL contributes media-type specific fields and flags.
      if (expandedUrl.isSetCanonicalLastHopUrl()) {
        String url = URLUtils.normalizePath(expandedUrl.getCanonicalLastHopUrl());
        dedupedLinks.add(url);
        String facetUrl = URLUtils.normalizeFacetURL(url);
        if (expandedUrl.isSetMediaType()) {
          switch (expandedUrl.getMediaType()) {
            case NEWS:
              // News links index the full normalized URL, not the facet-normalized form.
              withStringField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(), url);
              addFacetSkipList(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName());
              encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
              break;
            case VIDEO:
              withStringField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(), facetUrl);
              addFacetSkipList(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
              encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
              break;
            case IMAGE:
              withStringField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(), facetUrl);
              addFacetSkipList(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName());
              encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
              break;
            case NATIVE_IMAGE:
              // Nothing done here. Native images are handled separately.
              // They are in PhotoUrls instead of expandedUrls.
              break;
            case UNKNOWN:
              break;
            default:
              // Fail fast on a media type added to the thrift enum but not handled here.
              throw new RuntimeException("Unknown Media Type: " + expandedUrl.getMediaType());
          }
        }
        if (expandedUrl.isSetLinkCategory()) {
          withIntField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName(),
              expandedUrl.getLinkCategory().getValue());
        }
      }
    }
    if (!dedupedLinks.isEmpty()) {
      encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG);
      addFacetSkipList(EarlybirdFieldConstant.LINKS_FIELD.getFieldName());
      for (String linkUrl : dedupedLinks) {
        withStringField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(), linkUrl);
      }
    }
    // Visible-link flag is computed over the raw URL list, independent of dedup.
    encodedTweetFeatures.setFlagValue(
        EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG,
        LinkVisibilityUtils.hasVisibleLink(urls));
  }
  return this;
}
/**
 * Add a list of places. The place are U64 encoded place IDs.
 */
public EarlybirdThriftDocumentBuilder withPlacesField(List<String> places) {
  if (!isNotEmpty(places)) {
    return this;
  }
  String placeFieldName = EarlybirdFieldConstant.PLACE_FIELD.getFieldName();
  for (String placeId : places) {
    withStringField(placeFieldName, placeId);
  }
  return this;
}
/**
 * Add tweet text signature field.
 *
 * The signature is stored as an encoded feature value (used e.g. for near-duplicate
 * detection downstream — NOTE(review): exact consumer not visible from this file).
 */
public EarlybirdThriftDocumentBuilder withTweetSignature(int signature) {
  encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE, signature);
  return this;
}
/**
 * Add geo hash field and internal filter field.
 */
public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon, int accuracy) {
  // Silently skip coordinates that fail validation; the builder stays usable.
  if (!GeoUtil.validateGeoCoordinates(lat, lon)) {
    return this;
  }
  withGeoField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), lat, lon, accuracy);
  withLatLonCSF(lat, lon);
  return this;
}
/**
 * Add geo hash field and internal filter field with unknown location accuracy.
 */
public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon) {
  withGeoHash(lat, lon, GeoAddressAccuracy.UNKNOWN_LOCATION.getCode());
  return this;
}
/**
 * Add geo location source to the internal field with ThriftGeoLocationSource object.
 */
public EarlybirdThriftDocumentBuilder withGeoLocationSource(
    ThriftGeoLocationSource geoLocationSource) {
  // A null source is a no-op; otherwise delegate to the string overload.
  if (geoLocationSource == null) {
    return this;
  }
  return withGeoLocationSource(EarlybirdFieldConstants.formatGeoType(geoLocationSource));
}
/**
 * Add geo location source to the internal field.
 *
 * The source string is indexed as a term in the shared INTERNAL_FIELD.
 */
public EarlybirdThriftDocumentBuilder withGeoLocationSource(String geoLocationSource) {
  withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), geoLocationSource);
  return this;
}
/**
 * Add encoded lat and lon to LatLonCSF field.
 *
 * Also marks isSetLatLonCSF so prepareToBuild() will not overwrite this value
 * with the ILLEGAL_LATLON default.
 */
public EarlybirdThriftDocumentBuilder withLatLonCSF(double lat, double lon) {
  isSetLatLonCSF = true;
  // Lat/lon are packed into a single long; precision is reduced to float.
  long encodedLatLon = GeoUtil.encodeLatLonIntoInt64((float) lat, (float) lon);
  withLongField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(), encodedLatLon);
  return this;
}
/**
 * Add from verified account flag to internal field.
 *
 * Sets the encoded-feature flag and indexes the filter term backing filter:verified.
 */
public EarlybirdThriftDocumentBuilder withFromVerifiedAccountFlag() {
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG);
  addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM);
  return this;
}
/**
 * Add from blue-verified account flag to internal field.
 *
 * Sets the encoded-feature flag and indexes the corresponding filter term.
 */
public EarlybirdThriftDocumentBuilder withFromBlueVerifiedAccountFlag() {
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG);
  addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM);
  return this;
}
/**
 * Add offensive flag to internal field.
 *
 * Sets the encoded-feature flag and indexes IS_OFFENSIVE as an internal-field term
 * (note: a plain term, not a __filter_ term).
 */
public EarlybirdThriftDocumentBuilder withOffensiveFlag() {
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
  withStringField(
      EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
      EarlybirdFieldConstant.IS_OFFENSIVE);
  return this;
}
/**
 * Add user reputation value to encoded feature.
 *
 * @param score reputation score, stored as-is in the USER_REPUTATION feature.
 */
public EarlybirdThriftDocumentBuilder withUserReputation(byte score) {
  encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.USER_REPUTATION, score);
  return this;
}
/**
 * This method creates the fields related to document language.
 * For most languages, their isoLanguageCode and bcp47LanguageTag are the same.
 * For some languages with variants, these two fields are different.
 * E.g. for simplified Chinese, their isoLanguageCode is zh, but their bcp47LanguageTag is zh-cn.
 * <p>
 * This method adds fields for both the isoLanguageCode and bcp47LanguageTag.
 */
public EarlybirdThriftDocumentBuilder withLanguageCodes(
    String isoLanguageCode, String bcp47LanguageTag) {
  if (isoLanguageCode != null) {
    withISOLanguage(isoLanguageCode);
  }
  // Only index the BCP-47 tag when it adds information beyond the ISO code.
  boolean tagDiffersFromIso =
      bcp47LanguageTag != null && !bcp47LanguageTag.equals(isoLanguageCode);
  if (tagDiffersFromIso) {
    BCP47_LANGUAGE_TAG_COUNTER.increment();
    withISOLanguage(bcp47LanguageTag);
  }
  return this;
}
/**
 * Adds a String field into the ISO_LANGUAGE_FIELD.
 *
 * The value is lowercased before indexing so lookups are case-insensitive.
 */
public EarlybirdThriftDocumentBuilder withISOLanguage(String languageString) {
  // Use Locale.ROOT so lowercasing is independent of the JVM default locale:
  // under a Turkish default locale, toLowerCase() maps 'I' to dotless 'ı',
  // which would corrupt language tags like "IN" at index time.
  withStringField(
      EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(),
      languageString.toLowerCase(java.util.Locale.ROOT));
  return this;
}
/**
 * Add from user ID fields.
 *
 * Writes the author ID both as a searchable field and as a CSF.
 */
public EarlybirdThriftDocumentBuilder withFromUserID(long fromUserId) {
  withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), fromUserId);
  withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), fromUserId);
  return this;
}
/**
 * Add from user information fields.
 *
 * Convenience overload with no tokenized screen name.
 */
public EarlybirdThriftDocumentBuilder withFromUser(
    long fromUserId, String fromUser) {
  withFromUser(fromUserId, fromUser, null);
  return this;
}
/**
 * Add from user information fields.
 *
 * Convenience overload: screen name only, no user ID and no tokenized form.
 */
public EarlybirdThriftDocumentBuilder withFromUser(String fromUser) {
  withFromUser(fromUser, null);
  return this;
}
/**
 * Add from user information fields.
 *
 * Indexes the raw screen name, and the tokenized form when one is provided
 * (falling back to the raw name otherwise).
 */
public EarlybirdThriftDocumentBuilder withFromUser(
    String fromUser, String tokenizedFromUser) {
  String tokenizedValue = isNotBlank(tokenizedFromUser) ? tokenizedFromUser : fromUser;
  withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), fromUser);
  withStringField(
      EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), tokenizedValue);
  return this;
}
/**
 * Add from user information fields.
 *
 * Full variant: writes user ID fields plus screen-name fields.
 */
public EarlybirdThriftDocumentBuilder withFromUser(
    long fromUserId, String fromUser, String tokenizedFromUser) {
  withFromUserID(fromUserId);
  withFromUser(fromUser, tokenizedFromUser);
  return this;
}
/**
 * Add to user field.
 *
 * @param toUser screen name of the user this tweet is addressed to.
 */
public EarlybirdThriftDocumentBuilder withToUser(
    String toUser) {
  withStringField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), toUser);
  return this;
}
/**
 * Add escherbird annotation fields.
 */
public EarlybirdThriftDocumentBuilder withAnnotationEntities(List<String> entities) {
  if (!isNotEmpty(entities)) {
    return this;
  }
  String entityFieldName = EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName();
  for (String entityId : entities) {
    withStringField(entityFieldName, entityId);
  }
  return this;
}
/**
 * Add replies to internal field and set is reply flag.
 *
 * Backs the filter:replies operator.
 */
public EarlybirdThriftDocumentBuilder withReplyFlag() {
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_REPLY_FLAG);
  addFilterInternalFieldTerm(EarlybirdFieldConstant.REPLIES_FILTER_TERM);
  return this;
}
/**
 * Set the flag indicating the tweet was composed with the camera flow.
 */
public EarlybirdThriftDocumentBuilder withCameraComposerSourceFlag() {
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG);
  return this;
}
/**
 * Add in reply to user id.
 * <p>
 * Notice {@link #withReplyFlag} is not automatically called since retweet a tweet that is
 * a reply to some other tweet is not considered a reply.
 * The caller should call {@link #withReplyFlag} separately if this tweet is really a reply tweet.
 */
public EarlybirdThriftDocumentBuilder withInReplyToUserID(long inReplyToUserID) {
  withLongField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName(), inReplyToUserID);
  return this;
}
/**
 * Add reference tweet author id.
 *
 * Note: {@link #withNativeRetweet} also writes this CSF (with the retweeting user's ID).
 */
public EarlybirdThriftDocumentBuilder withReferenceAuthorID(long referenceAuthorID) {
  withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), referenceAuthorID);
  return this;
}
/**
 * Add all native retweet related fields/label
 *
 * Writes the shared (source) status ID and the retweeting user's ID to both
 * searchable fields and CSFs, sets the retweet flag, indexes the filter term
 * backing filter:nativeretweets, and adds the retweet marker term to the text field.
 */
@VisibleForTesting
public EarlybirdThriftDocumentBuilder withNativeRetweet(final long retweetUserID,
                                                        final long sharedStatusID) {
  withLongField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), sharedStatusID);
  withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName(),
      sharedStatusID);
  withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName(),
      retweetUserID);
  // For retweets the reference author is the retweeting user.
  withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), retweetUserID);
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_RETWEET_FLAG);
  // Add native retweet label to the internal field.
  addFilterInternalFieldTerm(EarlybirdFieldConstant.NATIVE_RETWEETS_FILTER_TERM);
  withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), RETWEET_TERM);
  return this;
}
/**
 * Add quoted tweet id and user id.
 *
 * Both IDs are written as searchable fields and as CSFs; also sets the quote flag
 * and indexes the filter term backing filter:quote.
 */
@VisibleForTesting
public EarlybirdThriftDocumentBuilder withQuote(
    final long quotedStatusId, final long quotedUserId) {
  withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), quotedStatusId);
  withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), quotedUserId);
  withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(), quotedStatusId);
  withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(), quotedUserId);
  encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_QUOTE_FLAG);
  // Add quote label to the internal field.
  addFilterInternalFieldTerm(EarlybirdFieldConstant.QUOTE_FILTER_TERM);
  return this;
}
/**
 * Add resolved links text field.
 *
 * @param linksText text extracted from the tweet's resolved links.
 */
public EarlybirdThriftDocumentBuilder withResolvedLinksText(String linksText) {
  withStringField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), linksText);
  return this;
}
/**
 * Add source field.
 *
 * @param source client application the tweet was posted from.
 */
public EarlybirdThriftDocumentBuilder withSource(String source) {
  withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(), source);
  return this;
}
/**
 * Add normalized source field.
 *
 * @param normalizedSource canonicalized form of the client source string.
 */
public EarlybirdThriftDocumentBuilder withNormalizedSource(String normalizedSource) {
  withStringField(
      EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(), normalizedSource);
  return this;
}
/**
 * Add positive smiley to internal field.
 *
 * Backs the :) query operator.
 */
public EarlybirdThriftDocumentBuilder withPositiveSmiley() {
  withStringField(
      EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
      EarlybirdFieldConstant.HAS_POSITIVE_SMILEY);
  return this;
}
/**
 * Add negative smiley to internal field.
 *
 * Backs the :( query operator.
 */
public EarlybirdThriftDocumentBuilder withNegativeSmiley() {
  withStringField(
      EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
      EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY);
  return this;
}
/**
 * Add question mark label to a text field.
 *
 * Indexes the question-mark marker term so queries can match tweets containing "?".
 */
public EarlybirdThriftDocumentBuilder withQuestionMark() {
  withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), QUESTION_MARK);
  return this;
}
/**
 * Add card related fields.
 *
 * Each card attribute is optional; blank values are skipped independently.
 *
 * @param name card name; also determines the CARD_TYPE CSF value.
 * @param domain card domain.
 * @param title card title text, with its pre-serialized token stream.
 * @param serializedTitleStream serialized token stream for the title.
 * @param description card description text, with its pre-serialized token stream.
 * @param serializedDescriptionStream serialized token stream for the description.
 * @param lang language of the card content.
 */
public EarlybirdThriftDocumentBuilder withSearchCard(
    String name,
    String domain,
    String title, byte[] serializedTitleStream,
    String description, byte[] serializedDescriptionStream,
    String lang) {
  if (isNotBlank(title)) {
    withTokenStreamField(
        EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(),
        title, serializedTitleStream);
  }
  if (isNotBlank(description)) {
    withTokenStreamField(
        EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(),
        description, serializedDescriptionStream);
  }
  if (isNotBlank(lang)) {
    withStringField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), lang);
  }
  if (isNotBlank(domain)) {
    withStringField(
        EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(), domain);
  }
  if (isNotBlank(name)) {
    withStringField(
        EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), name);
    withIntField(
        EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
        SearchCardType.cardTypeFromStringName(name).getByteValue());
  }
  // Constant-first equalsIgnoreCase keeps this null-safe when name is absent.
  if (AMPLIFY_CARD_NAME.equalsIgnoreCase(name)
      || PLAYER_CARD_NAME.equalsIgnoreCase(name)) {
    // Add into "internal" field so that this tweet is returned by filter:videos.
    addFacetSkipList(
        EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
  }
  return this;
}
/**
 * Add a normalized minimum-engagement token stream field.
 *
 * @throws IOException if serializing the engagement token stream fails.
 */
public EarlybirdThriftDocumentBuilder withNormalizedMinEngagementField(
    String fieldName, int normalizedNumEngagements) throws IOException {
  EarlybirdThriftDocumentUtil.addNormalizedMinEngagementField(doc, fieldName,
      normalizedNumEngagements);
  return this;
}
/**
 * Add named entity with given canonical name and type to document.
 *
 * Each context routes the entity either to the "from_url" fields (URL-derived
 * sources) or the "from_text" fields, deduplicating terms across contexts.
 */
public EarlybirdThriftDocumentBuilder withNamedEntity(NamedEntity namedEntity) {
  if (namedEntity.getContexts() == null) {
    // In this unlikely case, we don't have any context for named entity type or source,
    // so we can't properly index it in any of our fields. We'll just skip it in this case.
    return this;
  }
  // Keep track of the fields we've applied in the builder already, to ensure we only index
  // each term (field/value pair) once
  Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied = new HashSet<>();
  for (NamedEntityContext context : namedEntity.getContexts()) {
    if (context.isSetInput_source()
        && NAMED_ENTITY_URL_SOURCE_TYPES.contains(context.getInput_source().getSource_type())) {
      // If the source is one of the URL* types, add the named entity to the "from_url" fields,
      // ensuring we add it only once
      addNamedEntityFields(
          fieldsApplied,
          EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD,
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD,
          namedEntity.getCanonical_name(),
          context);
    } else {
      addNamedEntityFields(
          fieldsApplied,
          EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD,
          EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD,
          namedEntity.getCanonical_name(),
          context);
    }
  }
  return this;
}
/**
 * Add space id fields.
 */
public EarlybirdThriftDocumentBuilder withSpaceIdFields(Set<String> spaceIds) {
  if (spaceIds.isEmpty()) {
    return this;
  }
  String spaceIdFieldName = EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName();
  addFacetSkipList(spaceIdFieldName);
  for (String spaceId : spaceIds) {
    withStringField(spaceIdFieldName, spaceId);
  }
  return this;
}
/**
 * Add directed at user.
 *
 * Writes the directed-at user ID both as a searchable field and as a CSF.
 */
@VisibleForTesting
public EarlybirdThriftDocumentBuilder withDirectedAtUser(final long directedAtUserId) {
  withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(),
      directedAtUserId);
  withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), directedAtUserId);
  return this;
}
/**
 * Add a white space tokenized screen name field.
 *
 * Example:
 * screenName - "super_hero"
 * tokenized version - "super hero"
 */
public EarlybirdThriftDocumentBuilder withWhiteSpaceTokenizedScreenNameField(
    String fieldName,
    String normalizedScreenName) {
  // Split on punctuation used in hashtags/usernames, then rejoin with spaces.
  String[] nameTokens =
      normalizedScreenName.split(Regex.HASHTAG_USERNAME_PUNCTUATION_REGEX);
  withStringField(fieldName, StringUtils.join(nameTokens, " "));
  return this;
}
/**
 * Add a camel case tokenized screen name field.
 *
 * Serializes the provided token stream and indexes it alongside the
 * camel-case-normalized text (falling back to normalizedScreenName when the
 * normalized text is empty). Serialization failures are counted and swallowed,
 * leaving the field unset.
 */
public EarlybirdThriftDocumentBuilder withCamelCaseTokenizedScreenNameField(
    String fieldName,
    String screenName,
    String normalizedScreenName,
    TokenStream screenNameTokenStream) {
  // this normalized text is consistent to how the tokenized stream is created from
  // TokenizerHelper.getNormalizedCamelcaseTokenStream - ie. just lowercasing.
  String camelCaseTokenizedScreenNameText =
      TokenizerHelper.getNormalizedCamelcaseTokenStreamText(screenName);
  try {
    // Reset the token stream in case it has been read before.
    screenNameTokenStream.reset();
    byte[] camelCaseTokenizedScreenName =
        TweetTokenStreamSerializer.getTweetTokenStreamSerializer()
            .serialize(screenNameTokenStream);
    withTokenStreamField(
        fieldName,
        camelCaseTokenizedScreenNameText.isEmpty()
            ? normalizedScreenName : camelCaseTokenizedScreenNameText,
        camelCaseTokenizedScreenName);
  } catch (IOException e) {
    // Best-effort: log and count the failure rather than failing document construction.
    LOG.error("TwitterTokenStream serialization error! Could not serialize: " + screenName);
    SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT.increment();
  }
  return this;
}
// Indexes a named entity into the name-only field and, when the context carries a
// type, into the name-with-type field (which is also added to the facet skip list).
private void addNamedEntityFields(
    Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied,
    EarlybirdFieldConstant nameOnlyField,
    EarlybirdFieldConstant nameWithTypeField,
    String name,
    NamedEntityContext context) {
  withOneTimeStringField(fieldsApplied, nameOnlyField, name, false);
  if (context.isSetEntity_type()) {
    withOneTimeStringField(fieldsApplied, nameWithTypeField,
        formatNamedEntityString(name, context.getEntity_type()), true);
  }
}
// Indexes (field, value) at most once per document, tracking already-applied pairs
// in fieldsApplied. Optionally registers the field in the facet skip list first.
private void withOneTimeStringField(
    Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied, EarlybirdFieldConstant field,
    String value, boolean addToFacets) {
  Pair<EarlybirdFieldConstant, String> fieldValuePair = new Pair<>(field, value);
  // Set.add returns false when the pair was already present, so each term is
  // indexed exactly once; this replaces the former contains()+add() double lookup.
  if (fieldsApplied.add(fieldValuePair)) {
    if (addToFacets) {
      addFacetSkipList(field.getFieldName());
    }
    withStringField(field.getFieldName(), value);
  }
}
// Builds the "name:type" term for the with-type named entity fields.
private String formatNamedEntityString(String name, WholeEntityType type) {
  // Locale.ROOT keeps lowercasing deterministic regardless of the JVM default locale
  // (e.g. the Turkish dotless-i), so indexed and queried terms always agree.
  return String.format("%s:%s", name, type).toLowerCase(java.util.Locale.ROOT);
}
/**
 * Set whether set LAT_LON_CSF_FIELD or not before build
 * if LAT_LON_CSF_FIELD is not set deliberately.
 *
 * @see #prepareToBuild()
 */
public EarlybirdThriftDocumentBuilder setAddLatLonCSF(boolean isSet) {
  addLatLonCSF = isSet;
  return this;
}
/**
 * Set if add encoded tweet feature field in the end.
 *
 * @see #prepareToBuild()
 */
public EarlybirdThriftDocumentBuilder setAddEncodedTweetFeatures(boolean isSet) {
  addEncodedTweetFeatures = isSet;
  return this;
}
// Final fixups applied exactly once, just before the thrift document is built:
// default lat/lon CSF, then the serialized encoded-feature fields.
@Override
protected void prepareToBuild() {
  if (!isSetLatLonCSF && addLatLonCSF) {
    // In lucene archives, this CSF is needed regardless of whether geoLocation is set.
    withLatLonCSF(GeoUtil.ILLEGAL_LATLON, GeoUtil.ILLEGAL_LATLON);
  }
  if (addEncodedTweetFeatures) {
    // Add encoded_tweet_features before building the document.
    withBytesField(
        EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName(),
        EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(encodedTweetFeatures));
  }
  if (extendedEncodedTweetFeatures != null) {
    // Add extended_encoded_tweet_features before building the document.
    withBytesField(
        EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName(),
        EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(extendedEncodedTweetFeatures));
  }
}
// True when the string is neither null nor empty (no whitespace trimming).
private static boolean isNotBlank(String value) {
  if (value == null) {
    return false;
  }
  return !value.isEmpty();
}
// True when the list is neither null nor empty.
private static boolean isNotEmpty(List<?> value) {
  if (value == null) {
    return false;
  }
  return !value.isEmpty();
}
}

View File

@ -1,377 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import com.google.common.collect.ImmutableList;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.ThriftDocumentUtil;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftField;
import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TwitterNormalizedMinEngagementTokenStream;
/**
 * Utility APIs for ThriftDocument used in Earlybird.
 *
 * All methods are static; field lookups go through a shared EarlybirdFieldConstants
 * ID mapping.
 */
public final class EarlybirdThriftDocumentUtil {
  private static final EarlybirdFieldConstants ID_MAPPING = new EarlybirdFieldConstants();
  // Internal-field filter terms are stored as "__filter_<operand>".
  private static final String FILTER_FORMAT_STRING = "__filter_%s";
  /**
   * Used to check whether a thrift document has filter nullcast internal field set.
   * @see #isNullcastFilterSet(ThriftDocument)
   */
  private static final String NULLCAST_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.NULLCAST_FILTER_TERM);
  private static final String SELF_THREAD_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM);
  private static final String DIRECTED_AT_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM);
  private EarlybirdThriftDocumentUtil() {
    // Cannot instantiate.
  }
  /**
   * Formats a regular, simple filter term. The 'filter' argument should correspond to a constant
   * from the Operator class, matching the operand (filter:links -> "links").
   */
  public static final String formatFilter(String filter) {
    return String.format(FILTER_FORMAT_STRING, filter);
  }
  /**
   * Get status id.
   */
  public static long getID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.ID_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get Card name.
   */
  public static String getCardName(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get Card language.
   */
  public static String getCardLang(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.CARD_LANG.getFieldName(), ID_MAPPING);
  }
  /**
   * Get Card language CSF.
   *
   * card language CSF is represented internally as an integer ID for a ThriftLanguage.
   */
  public static int getCardLangCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getIntValue(
        document, EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(), ID_MAPPING);
  }
  /**
   * Get quoted tweet id.
   */
  public static long getQuotedTweetID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get quoted tweet user id.
   */
  public static long getQuotedUserID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get directed at user id.
   */
  public static long getDirectedAtUserId(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get directed at user id CSF.
   */
  public static long getDirectedAtUserIdCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), ID_MAPPING);
  }
  /**
   * Get reference author id CSF.
   */
  public static long getReferenceAuthorIdCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), ID_MAPPING);
  }
  /**
   * Get links.
   */
  public static List<String> getLinks(ThriftDocument document) {
    return getStringValues(document, EarlybirdFieldConstant.LINKS_FIELD);
  }
  /**
   * Get created at timestamp in sec.
   */
  public static int getCreatedAtSec(ThriftDocument document) {
    return ThriftDocumentUtil.getIntValue(
        document, EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get created at timestamp in ms.
   */
  public static long getCreatedAtMs(ThriftDocument document) {
    // Widen before multiplying to avoid int overflow.
    long createdAtSec = (long) getCreatedAtSec(document);
    return createdAtSec * 1000L;
  }
  /**
   * Get from user id.
   */
  public static long getFromUserID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get from user.
   */
  public static String getFromUser(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get tokenized from user display name.
   */
  public static String getFromUserDisplayName(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get tokenized from user.
   */
  public static String getTokenizedFromUser(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get resolved links text.
   */
  public static String getResolvedLinksText(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * Get iso language code.
   */
  public static List<String> getISOLanguage(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValues(
        document, EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), ID_MAPPING);
  }
  /**
   * First remove the old timestamp if they exist.
   * Then add the created at and created at csf fields to the given thrift document.
   */
  public static void replaceCreatedAtAndCreatedAtCSF(ThriftDocument document, int value) {
    removeField(document, EarlybirdFieldConstant.CREATED_AT_FIELD);
    removeField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD);
    addIntField(document, EarlybirdFieldConstant.CREATED_AT_FIELD, value);
    addIntField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD, value);
  }
  /**
   * Add the given int value as the given field into the given document.
   */
  public static ThriftDocument addIntField(
      ThriftDocument document, EarlybirdFieldConstant fieldConstant, int value) {
    ThriftFieldData fieldData = new ThriftFieldData().setIntValue(value);
    ThriftField field =
        new ThriftField().setFieldConfigId(fieldConstant.getFieldId()).setFieldData(fieldData);
    document.addToFields(field);
    return document;
  }
  // Maps a per-feature field constant to the encoded-features field that stores it,
  // based on the field-name prefix. Throws for non-feature fields.
  private static EarlybirdFieldConstant getFeatureField(EarlybirdFieldConstant field) {
    if (field.getFieldName().startsWith(
        EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName())) {
      return EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD;
    } else if (field.getFieldName().startsWith(
        EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName())) {
      return EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD;
    } else {
      throw new IllegalArgumentException("Not a feature field: " + field);
    }
  }
  /**
   * Get the feature value of a field.
   */
  public static int getFeatureValue(
      ImmutableSchemaInterface schema,
      ThriftDocument document,
      EarlybirdFieldConstant field) {
    EarlybirdFieldConstant featureField = getFeatureField(field);
    byte[] encodedFeaturesBytes =
        ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING);
    if (encodedFeaturesBytes == null) {
      // Treat the feature value as 0 if there is no encoded feature field.
      return 0;
    } else {
      EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes(
          schema, featureField, encodedFeaturesBytes, 0);
      return encodedFeatures.getFeatureValue(field);
    }
  }
  /**
   * Check whether the feature flag is set.
   */
  public static boolean isFeatureBitSet(
      ImmutableSchemaInterface schema,
      ThriftDocument document,
      EarlybirdFieldConstant field) {
    EarlybirdFieldConstant featureField = getFeatureField(field);
    byte[] encodedFeaturesBytes =
        ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING);
    if (encodedFeaturesBytes == null) {
      // Treat the bit as not set if there is no encoded feature field.
      return false;
    } else {
      EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes(
          schema, featureField, encodedFeaturesBytes, 0);
      return encodedFeatures.isFlagSet(field);
    }
  }
  /**
   * Check whether nullcast flag is set in the encoded features field.
   */
  public static boolean isNullcastBitSet(ImmutableSchemaInterface schema, ThriftDocument document) {
    return isFeatureBitSet(schema, document, EarlybirdFieldConstant.IS_NULLCAST_FLAG);
  }
  /**
   * Remove all fields with the given field constant in a document.
   */
  public static void removeField(ThriftDocument document, EarlybirdFieldConstant fieldConstant) {
    List<ThriftField> fields = document.getFields();
    if (fields != null) {
      // Iterator.remove is the only safe way to delete while iterating.
      Iterator<ThriftField> fieldsIterator = fields.iterator();
      while (fieldsIterator.hasNext()) {
        if (fieldsIterator.next().getFieldConfigId() == fieldConstant.getFieldId()) {
          fieldsIterator.remove();
        }
      }
    }
  }
  /**
   * Remove a string field with given fieldConstant and value.
   *
   * Removes at most one matching field; the immediate return after remove keeps
   * the for-each loop safe from ConcurrentModificationException.
   * NOTE(review): getStringValue() could be null for non-string fields with the
   * same config id, which would NPE here — confirm field ids imply string data.
   */
  public static void removeStringField(
      ThriftDocument document, EarlybirdFieldConstant fieldConstant, String value) {
    List<ThriftField> fields = document.getFields();
    if (fields != null) {
      for (ThriftField field : fields) {
        if (field.getFieldConfigId() == fieldConstant.getFieldId()
            && field.getFieldData().getStringValue().equals(value)) {
          fields.remove(field);
          return;
        }
      }
    }
  }
  /**
   * Adds a new TokenStream field for each engagement counter if normalizedNumEngagements >= 1.
   */
  public static void addNormalizedMinEngagementField(
      ThriftDocument doc,
      String fieldName,
      int normalizedNumEngagements) throws IOException {
    if (normalizedNumEngagements < 1) {
      return;
    }
    TokenStreamSerializer serializer =
        new TokenStreamSerializer(ImmutableList.of(new IntTermAttributeSerializer()));
    TwitterNormalizedMinEngagementTokenStream stream = new
        TwitterNormalizedMinEngagementTokenStream(normalizedNumEngagements);
    byte[] serializedStream = serializer.serialize(stream);
    ThriftFieldData fieldData = new ThriftFieldData().setTokenStreamValue(serializedStream);
    ThriftField field = new ThriftField().setFieldConfigId(ID_MAPPING.getFieldID(fieldName))
        .setFieldData(fieldData);
    doc.addToFields(field);
  }
  /**
   * Get all string values stored for the given field.
   */
  public static List<String> getStringValues(
      ThriftDocument document, EarlybirdFieldConstant field) {
    return ThriftDocumentUtil.getStringValues(document, field.getFieldName(), ID_MAPPING);
  }
  // True when the document carries the __filter_nullcast internal term.
  public static boolean isNullcastFilterSet(ThriftDocument document) {
    return isFilterSet(document, NULLCAST_FILTER_TERM);
  }
  // True when the document carries the __filter_self_thread internal term.
  public static boolean isSelfThreadFilterSet(ThriftDocument document) {
    return isFilterSet(document, SELF_THREAD_FILTER_TERM);
  }
  public static String getSelfThreadFilterTerm() {
    return SELF_THREAD_FILTER_TERM;
  }
  public static String getDirectedAtFilterTerm() {
    return DIRECTED_AT_FILTER_TERM;
  }
  // True when the document carries the __filter_directed_at_user internal term.
  public static boolean isDirectedAtFilterSet(ThriftDocument document) {
    return isFilterSet(document, DIRECTED_AT_FILTER_TERM);
  }
  /**
   * Check whether given filter is set in the internal field.
   */
  private static boolean isFilterSet(ThriftDocument document, String filter) {
    List<String> terms = ThriftDocumentUtil.getStringValues(
        document, EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), ID_MAPPING);
    for (String term : terms) {
      if (filter.equals(term)) {
        return true;
      }
    }
    return false;
  }
}

View File

@ -1,336 +0,0 @@
package com.twitter.search.common.schema.earlybird;
import javax.annotation.Nullable;
import com.twitter.search.common.config.Config;
/**
 * Version history of the flushed (serialized) Earlybird segment format.
 *
 * Each enum constant is one format version; its ordinal() is the version number used in
 * flushed file extensions (see {@link #getVersionFileExtension()}), so constants must never
 * be reordered or removed. The boolean flag marks versions that are official (eligible for
 * upload to HDFS), and the trailing int is the schema minor version.
 */
public enum FlushVersion {
/* =======================================================
 * Versions
 * ======================================================= */
VERSION_0("Initial version of partition flushing."),
VERSION_1("Added timestamps and corresponding mapper to SegmentData."),
VERSION_2("Add column stride fields."),
VERSION_3("Change facet field configuration."),
VERSION_4("Add per term offensive counters to parallel posting arrays."),
VERSION_5("Add native photo facet."),
VERSION_6("Add UserFeature column stride field"),
VERSION_7("Index segment optimizations; new facet data structures."),
VERSION_8("Store statuses in memory in Earlybird."),
VERSION_9("Index from_user_ids into a searchable field."),
VERSION_10("Change from_user_id dictionary from fst to mphf"),
VERSION_11("Write image and video facet in separate lucene field."),
VERSION_12("Add retweeted status ID to the sparse CSF."),
VERSION_13("Add isOffensive field for profanity filter."),
VERSION_14("Fix features column stride field corruption."),
VERSION_15("Upgrade Lucene version, which has a different FST serialization format."),
VERSION_16("Remove maxDoc in favor of lastDocID"),
VERSION_17("Added partition and timeslice identifiers to SegmentData."),
VERSION_18("Per-term payloads"),
VERSION_19("Multiple per-doc payload fields"),
VERSION_20("Unify and fix hash codes"),
VERSION_21("Super awesome new flexible realtime posting list format."),
VERSION_22("Added new geo implementation."),
VERSION_23("Upgrade to Lucene 4.0.0 Final"),
VERSION_24("Added tweet topic ids."),
VERSION_25("Turn on skip list for mention facet."),
VERSION_26("Added new EncodedTweetFeaturesColumnStrideField."),
VERSION_27("Topic ids facet field."),
VERSION_28("From-user discover stories skiplist field."),
VERSION_29("Move tokenized screen name to the new username field"),
VERSION_30("Enable HF term pairs index."),
VERSION_31("Remove reverse doc ids."),
VERSION_32("Switch shared status id CSF to non-sparse long CSF index."),
VERSION_33("New skip lists for optimized high df posting lists."),
VERSION_34("Store tweet signature in EarlybirdEncodedFeatures."),
VERSION_35("Don't store shared status id csf in archive indexes."),
VERSION_36("Don't store norms."),
VERSION_37("64 bit user ids."),
VERSION_38("Index links in archive."),
VERSION_39("Fix pic.twitter.com image link handling not setting the internal field correctly."),
VERSION_40("Fix all archive tweets being marked as replies."),
VERSION_41("Avoid flushing event_ids field; event clusters are applied as updates."),
VERSION_42("No position fields refactoring; made a few fields to not use position."),
VERSION_43("Index private geo coordinates"),
// From here on, versions explicitly marked official (second argument true) were uploadable.
VERSION_44("Materialize last doc id in HighDFCompressedPostinglists", true),
VERSION_45("Removing from_user_id facets support", true),
VERSION_46("Guard against badly out of order tweets in the search archive.", true),
VERSION_47("Added card title and description fields.", true),
VERSION_48("Added card type CSF.", true),
VERSION_49("Lucene 4.4 upgrade", true),
VERSION_50("Put mem-archive back on non-lucene optimized indexes", true),
VERSION_51("Force index rebuild to fix blank text field. See SEARCH-2505.", true),
VERSION_52("Refactoring of docValues/CSF.", true),
VERSION_53("Remove SegmentData.Configuration", true),
VERSION_54("Fix bad indices caused by SEARCH-2723.", true),
VERSION_55("Fixed non-deterministic facetIds across restarts. SEARCH-2815.", true),
VERSION_56("Flush FacetIDMap.", true),
VERSION_57("Remove LatLonMapper and use standard DocValues instead.", true),
VERSION_58("Longterm Attribute Optimization.", true),
VERSION_59("Renamed archive segment names. Current segment is no longer mutable.", true),
// Flush version 60 and 59 have the same format.
// Flush version is increased to trigger a rebuild, because we noticed incomplete segments.
// More details can be found on SEARCH-3664
VERSION_60("Flush version change to trigger segment rebuild.", true),
VERSION_61("Adding back from_user_id", true),
VERSION_62("Add retweet facet.", true),
VERSION_63("Switch to new index API in com.twitter.search.core.earlybird.", true),
VERSION_64("Sort merge archive day and part-* data. SEARCH-4692.", true),
VERSION_65("Fix ID_FIELD and CREATED_AT_FIELD sort order. SEARCH-4004 SEARCH-912 ", true),
VERSION_66("Rebuild data for 1/5/2015. Data on HDFS fixed as part of SEARCH-5347.", true),
VERSION_67("Upgrade to Lucene 4.10.3.", true),
VERSION_68("Switching to Penguin v4", true),
VERSION_69("Fix 16% archive segments: SEARCH-6073", true),
VERSION_70("Switching to Penguin v4 for full archive cluster. SEARCH-5302", true),
VERSION_71("Switching to Penguin v4 for ssd archive cluster.", true),
VERSION_72("Added Escherbird annotations for full archive.", true),
// From here on, the third argument is the schema minor version for that flush version.
VERSION_73("Lucene 5.2.1 upgrade.", true, 0),
VERSION_74("Hanndle geo scurbbed data and archive geo index accuracy", true, 0),
VERSION_75("Delete from_user_id_stories from indices", true, 0),
VERSION_76("Allow multiple index extensions.", true, 0),
VERSION_77("Removed EarlybirdCodec", true, 0),
// minor version 2: added embedded tweet features
// minor version 3: change embedded tweet features to INC_ONLY
VERSION_78("Added 80 bytes of extended features", true, 3),
// minor version 1: SEARCH-8564 - Reference Tweet Author ID, using
// EXTENDED_TEST_FEATURE_UNUSED_BITS_2 and EXTENDED_TEST_FEATURE_UNUSED_BITS_3
VERSION_79("Renamed UNUSED_BIT to HAS_VISIBLE_LINK", true, 1),
// minor version 2: SEARCH-8564 / http://go/rb/770373
// Made REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT and
// REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT immutable field
VERSION_80("Facet for links: SEARCH-8331", true, 2),
// minor version 1: added video view count
VERSION_81("Adding LowDF posting list with packed ints", true, 1),
VERSION_82("Enabling HighDF posting list with packed ints", true, 0),
// minor version 1: SEARCH-9379 - Added bitset for nullcast tweets
// minor version 2: SEARCH-8765 - Added visible token ratio
VERSION_83("Add bits in encoded features for media type flags. SEARCH-9131", true, 2),
VERSION_84("Enable archive rebuild for __has_links field. SEARCH-9635", true, 0),
// minor version 1: SEARCHQUAL-8130, add engagement v2
VERSION_85("New archive build gen for missing geo data. SEARCH-9894", true, 1),
VERSION_86("Added new fields to the index", true, 0),
// During this rebuild both the statuses and the engagement counts were regenerated.
// minor version 1: added quote_count
VERSION_87("Periodic archive full rebuild. SEARCH-9423", true, 1),
// minor version 1: make new tokenized user name/handle fields textSearchable
// (see go/rb/847134/)
// minor version 2: added has_quote
VERSION_88("Fixing missing day in the full archive index. SEARCH-11233", true, 2),
VERSION_89("Index and store conversation ids.", true, 0),
VERSION_90("Fixing inconsistent days in the full archive index. SEARCH-11744", true, 0),
VERSION_91("Making in_reply_to_user_id field use MPH. SEARCH-10836", true, 0),
VERSION_92("Allow searches by any field. SEARCH-11251", true, 0),
// During this rebuild we regenerated engagement counts and merged the annotations in the
// aggregate job.
VERSION_93("Periodic archive full rebuild. SEARCH-11076", true, 0),
// minor version 1: add ThriftCSFViewSettings.outputCSFType
VERSION_94("Indexing a bunch of geo fields. SEARCH-10283", true, 1),
VERSION_95("Removing topic ID fields. SEARCH-8616", true, 0),
// minor version 1: add ThriftCSFViewSettings.normalizationType
VERSION_96("Enabling conversation ID for all clusters. SEARCH-11989", true, 1),
// minor version 1: set several feature configuration to be correct double type
// minor version 2: set some more feature configuration to be correct double type
// minor version 3: add safety labels SEARCHQUAL-9561
// minor version 4: add weighted engagement counts SEARCHQUAL-9574
// minor version 5: add Dopamine non personalized score SEARCHQUAL-9743
VERSION_97("Changing CSF type to BOOLEAN for some has_* flags.", true, 5),
VERSION_98("Periodic archive full rebuild. PCM-56871.", true, 1),
VERSION_99("Removing named_entities field. SEARCH-13708", true, 0),
// minor version 1: add periscope features (SEARCHQUAL-10008)
// minor version 2: add raw_earlybird_score to TweetExternalFeatures (SEARCHQUAL-10347)
VERSION_100("Upgrade Penguin Version from V4 to V6. SEARCH-12991", true, 2),
// minor version 1: adjust for normalizer type for some engagement counters (SEARCHQUAL-9537)
// minor version 2: add decaying engagement counts and last engaged timestamps (SEARCHQUAL-10532)
VERSION_101("Add emoji to the index. SEARCH-12991", true, 2),
VERSION_102("Periodic full archive rebuild. PCM-67851", true, 0),
VERSION_103("Add liked_by_user_id field. SEARCH-15341", true, 0),
// minor version 1: remove last engaged timestamp with 3-hour increment (SEARCHQUAL-10903)
// minor version 2: add fake engagement counts (SEARCHQUAL-10795)
// minor version 3: add last engaged timestamp with 1-hour increment (SEARCHQUAL-10942)
VERSION_104("Reverting to the 20170109_pc100_par30 build gen. SEARCH-15731", true, 3),
VERSION_105("Add 3 new fields to archive index for engagement features. SEARCH-16102", true, 0),
// This is the last rebuild based on /tables/statuses. Starting 9/14 this build-gen is powered
// by TweetSource. During this rebuild both statuses and engagement counts were rebuilt.
VERSION_106("Periodic archive full rebuild. PCM-74652", true, 0),
VERSION_107("Removing card fields from full archive index.", true, 0),
VERSION_108("Removing the tms_id field from all schemas.", true, 0),
VERSION_109("Removing LAT_LON_FIELD from all schemas.", true, 0),
VERSION_110("Adding the card fields back to the full archive index.", true, 1),
// minor version 1: Add composer source csf field (SEARCH-22494)
VERSION_111("Adding composer_source to index. SEARCH-20377.", true, 1),
VERSION_112("Partial rebuild to fix SEARCH-22529.", true, 0),
VERSION_113("Full archive build gen 20180312_pc100_par30.", true, 0),
VERSION_114("Fix for SEARCH-23761.", true, 0),
VERSION_115("Add fields for quoted tweets. SEARCH-23919", true, 0),
// minor version 1: Add 4 bit hashtag count, mention count and stock count (SEARCH-24336)
VERSION_116("Bump flush version for scrubbing pipeline. SEARCH-24225", true, 1),
VERSION_117("Add retweeted_by_user_id and replied_to_by_user_id fields. SEARCH-24463", true, 0),
// minor version 1: Removed dopamine_non_personalized_score (SEARCHQUAL-10321)
VERSION_118("Adding the reply and retweet source tweet IDs: SEARCH-23702, SEARCH-24502", true, 1),
// minor version 1: add blink engagement counts (SEARCHQUAL-15176)
VERSION_119("Remove public inferred location: SEARCH-24235", true, 1),
VERSION_120("Flush extensions before fields when flushing segments.", true, 0),
VERSION_121("Flush the startingDocIdForSearch field. SEARCH-25464.", true, 0),
VERSION_122("Do not flush the startingDocIdForSearch field.", true, 0),
VERSION_123("Renaming the largestDocID flushed property to firstAddedDocID.", true, 0),
VERSION_124("Use the skip list posting list for all fields.", true, 0),
VERSION_125("Use hashmap for tweet ID lookup.", true, 0),
VERSION_126("Use the skip list posting list for all fields.", true, 0),
VERSION_127("Flushing the min and max doc IDs in each segment.", true, 0),
VERSION_128("Add card_lang to index. SEARCH-26539", true, 0),
VERSION_129("Move the tweet ID mapper to the segment data.", true, 0),
VERSION_130("Move the time mapper to the segment data.", true, 0),
VERSION_131("Change the facets classes to work with any doc IDs.", true, 0),
VERSION_132("Make the CSF classes work with any doc IDs.", true, 0),
VERSION_133("Removing smallestDocID property.", true, 0),
VERSION_134("Optimize DeletedDocs before flushing.", true, 0),
VERSION_135("Add payloads to skiplists.", true, 0),
VERSION_136("Add name to int pools.", true, 0),
VERSION_137("Add unsorted stream offset.", true, 0),
VERSION_138("Switch to the OutOfOrderRealtimeTweetIDMapper.", true, 0),
VERSION_139("Remove realtime posting lists.", true, 0),
VERSION_140("Add named_entity field. SEARCH-27547", true, 0),
VERSION_141("Flush the out of order updates count.", true, 0),
VERSION_142("Add named_entity facet support. SEARCH-28054", true, 0),
VERSION_143("Index updates before optimizing segment.", true, 0),
VERSION_144("Refactor TermsArray.", true, 0),
VERSION_145("Remove SmallestDocID.", true, 0),
VERSION_146("Add entity_id facet support. SEARCH-28071", true, 0),
VERSION_147("Enable updating facets", true, 0),
VERSION_148("Rename the counter for feature updates to partial updates", true, 0),
VERSION_149("Stop flushing offsets for sorted updates DL streams.", true, 0),
VERSION_150("Update the name of the property for the updates DL stream offset.", true, 0),
VERSION_151("Upgrade Lucene version to 5.5.5.", true, 0),
VERSION_152("Upgrade Lucene version to 6.0.0.", true, 0),
VERSION_153("Upgrade Lucene version to 6.6.6.", true, 0),
VERSION_154("Store the timeslice ID on EarlybirdIndexSegmentData.", true, 0),
VERSION_155("Do not flush index extensions.", true, 0),
VERSION_156("Deprecate ThriftIndexedFieldSettings.defaultFieldBoost.", true, 0),
VERSION_157("Load CREATED_AT_CSF_FIELD into RAM in archive.", true, 0),
VERSION_158("Added directed at user ID field and CSF.", true, 0),
VERSION_159("Changing deleted docs serialization format.", true, 0),
VERSION_160("Add fields for health model scores. SEARCH-31907, HML-2099", true, 0),
VERSION_161("Switch to the 'search' Kafka cluster.", true, 0),
VERSION_162("Update Lucene version to 7.0.0.", true, 0),
VERSION_163("Update Lucene version to 7.7.2.", true, 0),
// minor version 1: add IS_TRENDING_NOW_FLAG
VERSION_164("Collect per-term stats in the realtime segments.", true, 1),
VERSION_165("Update Lucene version to 8.5.2.", true, 0),
VERSION_166("Serialize maxPosition field for InvertedRealtimeIndex", true, 0),
VERSION_167("Add field for pSpammyTweetScore. HML-2557", true, 0),
VERSION_168("Add field for pReportedTweetScore. HML-2644", true, 0),
VERSION_169("Add field for spammyTweetContentScore. PFM-70", true, 0),
VERSION_170("Add reference author id CSF. SEARCH-34715", true, 0),
VERSION_171("Add space_id field. SEARCH-36156", true, 0),
VERSION_172("Add facet support for space_id. SEARCH-36388", true, 0),
VERSION_173("Add space admin and title fields. SEARCH-36986", true, 0),
VERSION_174("Switching to Penguin v7 for realtime-exp0 cluster. SEARCH-36068", true, 0),
VERSION_175("Adding exclusive conversation author id CSF", true, 0),
VERSION_176("Adding card URI CSF", true, 0),
// minor version 1: add FROM_BLUE_VERIFIED_ACCOUNT_FLAG
// minor version 2: Adding new cluster REALTIME_CG. SEARCH-45692
VERSION_177("Adding URL Description and Title fields. SEARCH-41641", true, 2),
/**
 * This semi colon is on a separate line to avoid polluting git blame history.
 * Put a comma after the new enum field you're adding.
 */;
// The current version: always the last constant declared above.
public static final FlushVersion CURRENT_FLUSH_VERSION =
FlushVersion.values()[FlushVersion.values().length - 1];
// Separator between a flushed file's base name and its version number, e.g. "foo_v_177".
public static final String DELIMITER = "_v_";
/* =======================================================
 * Helper methods
 * ======================================================= */
// Human-readable description of what changed in this version.
private final String description;
// Whether segments flushed at this version may be uploaded to HDFS.
private final boolean isOfficial;
// Schema minor version associated with this flush version.
private final int minorVersion;
/**
 * A flush version is not official unless explicitly stated to be official.
 * An unofficial flush version is never uploaded to HDFS.
 */
private FlushVersion(String description) {
this(description, false, 0);
}
private FlushVersion(String description, boolean isOfficial) {
this(description, isOfficial, 0);
}
private FlushVersion(String description, boolean isOfficial, int minorVersion) {
this.description = description;
this.isOfficial = isOfficial;
this.minorVersion = minorVersion;
}
/**
 * Returns file extension with version number.
 * Note: the version number is the enum ordinal, which is why constants must not be reordered.
 */
public String getVersionFileExtension() {
if (this == VERSION_0) {
return "";
} else {
return DELIMITER + ordinal();
}
}
/**
 * Returns file extension given flush version number.
 * If the flush version is unknown (e.g. higher than current flush version or lower than 0), null
 * is returned.
 */
@Nullable
public static String getVersionFileExtension(int flushVersion) {
if (flushVersion > CURRENT_FLUSH_VERSION.ordinal() || flushVersion < 0) {
return null;
} else {
return FlushVersion.values()[flushVersion].getVersionFileExtension();
}
}
/**
 * Returns a string describing the current schema version.
 * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getVersionDescription()}
 */
@Deprecated
public String getDescription() {
return description;
}
/**
 * Returns the schema's major version.
 * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getMajorVersionNumber()}.
 */
@Deprecated
public int getVersionNumber() {
return this.ordinal();
}
public boolean onOrAfter(FlushVersion other) {
return compareTo(other) >= 0;
}
/**
 * Returns whether the schema version is official. Only official segments are uploaded to HDFS.
 * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#isVersionOfficial()}.
 */
@Deprecated
public boolean isOfficial() {
// We want the loading/flushing tests to pass locally even if the version is not meant
// to be an official version.
return isOfficial || Config.environmentIsTest();
}
/**
 * As of now, this is hardcoded to 0. We will start using this soon.
 * @deprecated Please consult schema for minor version. This should only be used to build schema.
 */
@Deprecated
public int getMinorVersion() {
return minorVersion;
}
}

View File

@ -1,71 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import org.apache.lucene.search.DocIdSetIterator;
/**
 * Iterates over the documents of {@code baseIter} with the documents of {@code notIter}
 * removed, i.e. the set difference "base AND NOT not". Both underlying iterators are
 * assumed to produce ascending doc IDs, per the DocIdSetIterator contract.
 */
public class AndNotDocIdSetIterator extends DocIdSetIterator {
// Next doc ID from notIter that must be excluded; NO_MORE_DOCS once notIter is exhausted.
private int nextDelDoc;
private final DocIdSetIterator baseIter;
private final DocIdSetIterator notIter;
// Doc ID this iterator is currently positioned on (-1 before the first nextDoc/advance call).
private int currID;
/** Creates a new AndNotDocIdSetIterator instance. */
public AndNotDocIdSetIterator(DocIdSetIterator baseIter, DocIdSetIterator notIter)
throws IOException {
// Pre-position notIter so nextDelDoc always caches the next excluded doc ID.
nextDelDoc = notIter.nextDoc();
this.baseIter = baseIter;
this.notIter = notIter;
currID = -1;
}
@Override
public int advance(int target) throws IOException {
currID = baseIter.advance(target);
if (currID == DocIdSetIterator.NO_MORE_DOCS) {
return currID;
}
if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
if (currID < nextDelDoc) {
// Still before the next excluded doc: current doc is accepted.
return currID;
} else if (currID == nextDelDoc) {
// Landed exactly on an excluded doc: step forward to the next accepted one.
return nextDoc();
} else {
// We moved past the cached excluded doc; catch notIter up to the current position.
nextDelDoc = notIter.advance(currID);
if (currID == nextDelDoc) {
return nextDoc();
}
}
}
return currID;
}
@Override
public int docID() {
return currID;
}
@Override
public int nextDoc() throws IOException {
currID = baseIter.nextDoc();
if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
// Skip over any base docs that are present in notIter.
while (currID != DocIdSetIterator.NO_MORE_DOCS) {
if (currID < nextDelDoc) {
return currID;
} else {
if (currID == nextDelDoc) {
currID = baseIter.nextDoc();
}
nextDelDoc = notIter.advance(currID);
}
}
}
return currID;
}
@Override
public long cost() {
// Cost of the exclusion iterator is ignored; the base iterator dominates.
return baseIter.cost();
}
}

View File

@ -1,33 +0,0 @@
# Build target for the search common library in this directory.
# NOTE(review): third-party deps pin Lucene/Thrift/Hadoop jars; keep them in sync
# with the versions used by the rest of src/java/com/twitter/search.
java_library(
sources = ["*.java"],
platform = "java8",
tags = ["bazel-compatible"],
dependencies = [
"3rdparty/jvm/com/google/guava",
"3rdparty/jvm/com/google/inject:guice",
"3rdparty/jvm/com/twitter/elephantbird:core",
"3rdparty/jvm/geo/google:geoGoogle",
"3rdparty/jvm/log4j",
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
"3rdparty/jvm/org/apache/lucene:lucene-core",
"3rdparty/jvm/org/apache/lucene:lucene-facet",
"3rdparty/jvm/org/apache/lucene:lucene-queries",
"3rdparty/jvm/org/apache/lucene:lucene-spatial-extras",
"3rdparty/jvm/org/apache/thrift:libthrift",
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
"3rdparty/jvm/org/slf4j:slf4j-api",
"src/java/com/twitter/common/base",
"src/java/com/twitter/common/collections",
"src/java/com/twitter/common/util:system-mocks",
"src/java/com/twitter/search/common/metrics",
"src/java/com/twitter/search/common/query",
"src/java/com/twitter/search/common/schema",
"src/java/com/twitter/search/common/schema/base",
"src/java/com/twitter/search/common/util/spatial",
"src/java/com/twitter/search/queryparser",
"src/thrift/com/twitter/search/common:facets-java",
"src/thrift/com/twitter/search/common:query-java",
],
)

Binary file not shown.

View File

@ -1,75 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import java.util.List;
import javax.annotation.Nullable;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;
import com.twitter.common.util.Clock;
import com.twitter.search.common.query.thriftjava.CollectorParams;
/**
* A {@link com.twitter.search.common.search.TwitterEarlyTerminationCollector}
* that delegates actual hit collection to a sub collector.
*/
public final class DelegatingEarlyTerminationCollector
extends TwitterEarlyTerminationCollector {
// The collector that actually accumulates hits.
private final Collector subCollector;
// Leaf collector obtained from subCollector for the segment currently being searched.
private LeafCollector subLeafCollector;
/** Creates a new DelegatingEarlyTerminationCollector instance. */
public DelegatingEarlyTerminationCollector(Collector subCollector,
CollectorParams collectorParams,
TerminationTracker terminationTracker,
@Nullable QueryCostProvider queryCostProvider,
int numDocsBetweenTimeoutChecks,
Clock clock) {
super(
collectorParams,
terminationTracker,
queryCostProvider,
numDocsBetweenTimeoutChecks,
clock);
this.subCollector = subCollector;
}
@Override
public void setScorer(Scorable scorer) throws IOException {
// Let the early-termination superclass see the scorer first, then the delegate.
super.setScorer(scorer);
subLeafCollector.setScorer(scorer);
}
@Override
protected void doCollect() throws IOException {
// curDocId is maintained by the superclass (TwitterEarlyTerminationCollector).
subLeafCollector.collect(curDocId);
}
@Override
protected void doFinishSegment(int lastSearchedDocID) throws IOException {
// Only TwitterCollector subtypes have a finishSegment hook; plain Lucene
// collectors are not notified.
if (subCollector instanceof TwitterCollector) {
((TwitterCollector) subCollector).finishSegment(lastSearchedDocID);
}
}
@Override
public void setNextReader(LeafReaderContext context) throws IOException {
// A fresh leaf collector must be obtained from the delegate for every segment.
super.setNextReader(context);
subLeafCollector = subCollector.getLeafCollector(context);
}
@Override
public ScoreMode scoreMode() {
return subCollector.scoreMode();
}
@Override
public List<String> getDebugInfo() {
// This collector contributes no debug info of its own.
return null;
}
}

View File

@ -1,12 +0,0 @@
package com.twitter.search.common.search;
/**
* Provide an accessor for a doc ID. This is useful for classes that iterate through doc IDs
* and maintain a "last seen" doc ID.
*/
public interface DocIdTracker {
/**
 * Retrieve current doc ID, i.e. the most recently seen doc ID of this tracker.
 */
int getCurrentDocId();
}

View File

@ -1,51 +0,0 @@
package com.twitter.search.common.search;
import javax.annotation.Nonnull;
import com.google.common.base.Preconditions;
import com.twitter.search.common.metrics.SearchCounter;
/**
* This is not an enum to allow different clusters to define their own EarlyTerminationStates.
*/
public final class EarlyTerminationState {
  private static final String STATS_PREFIX = "early_termination_";

  /** State while the query is still collecting hits (no early termination). */
  public static final EarlyTerminationState COLLECTING =
      new EarlyTerminationState("no_early_termination", false);
  public static final EarlyTerminationState TERMINATED_TIME_OUT_EXCEEDED =
      new EarlyTerminationState("terminated_timeout_exceeded", true);
  public static final EarlyTerminationState TERMINATED_MAX_QUERY_COST_EXCEEDED =
      new EarlyTerminationState("terminated_max_query_cost_exceeded", true);
  public static final EarlyTerminationState TERMINATED_MAX_HITS_EXCEEDED =
      new EarlyTerminationState("terminated_max_hits_exceeded", true);
  public static final EarlyTerminationState TERMINATED_NUM_RESULTS_EXCEEDED =
      new EarlyTerminationState("terminated_num_results_exceeded", true);

  // This string can be returned as a part of a search response, to tell the searcher
  // why the search got early terminated.
  private final String terminationReason;
  private final boolean terminated;
  // Exported counter tracking how often this state is reached.
  private final SearchCounter stateCounter;

  public EarlyTerminationState(@Nonnull String terminationReason, boolean terminated) {
    this.terminationReason = Preconditions.checkNotNull(terminationReason);
    this.terminated = terminated;
    this.stateCounter = SearchCounter.export(STATS_PREFIX + terminationReason + "_count");
  }

  /** Whether this state represents an early-terminated search. */
  public boolean isTerminated() {
    return terminated;
  }

  /** Human-readable reason reported back to the searcher. */
  public String getTerminationReason() {
    return terminationReason;
  }

  /** Bumps the exported counter for this state. */
  public void incrementCount() {
    stateCounter.increment();
  }
}

View File

@ -1,65 +0,0 @@
package com.twitter.search.common.search;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.lucene.search.Query;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.CellIterator;
import org.apache.lucene.util.BytesRef;
import com.twitter.search.common.util.spatial.GeohashChunkImpl;
import com.twitter.search.queryparser.util.GeoCode;
import geo.google.datamodel.GeoAddressAccuracy;
public final class GeoQuadTreeQueryBuilderUtil {
// Utility class: not instantiable.
private GeoQuadTreeQueryBuilderUtil() {
}
/**
 * Build a geo quad tree query based around the geo code based on the geo field.
 * The query is a disjunction over the geohash terms that cover the requested
 * point/accuracy and (if set) the requested radius.
 * @param geocode the geo location for the quad tree query
 * @param field the field where the geohash tokens are indexed
 * @return the corresponding for the geo quad tree query
 */
public static Query buildGeoQuadTreeQuery(GeoCode geocode, String field) {
// LinkedHashSet keeps insertion order and de-duplicates geohash terms.
Set<BytesRef> geoHashSet = new LinkedHashSet<>();
// if accuracy is specified. Add a term query based on accuracy.
if (geocode.accuracy != GeoAddressAccuracy.UNKNOWN_LOCATION.getCode()) {
BytesRef termRef = new BytesRef(GeohashChunkImpl.buildGeoStringWithAccuracy(geocode.latitude,
geocode.longitude,
geocode.accuracy));
geoHashSet.add(termRef);
}
// If distance is specified. Add term queries based on distance
if (geocode.distanceKm != GeoCode.DOUBLE_DISTANCE_NOT_SET) {
// Build query based on distance
int treeLevel = -1;
// First find block containing query point with diagonal greater than 2 * radius.
Cell centerNode = GeohashChunkImpl.getGeoNodeByRadius(geocode.latitude, geocode.longitude,
geocode.distanceKm);
// Add center node querying term
if (centerNode != null) {
geoHashSet.add(centerNode.getTokenBytesNoLeaf(new BytesRef()));
treeLevel = centerNode.getLevel();
}
// This improves edge case recall, by adding cells also intersecting the query area.
CellIterator nodes = GeohashChunkImpl.getNodesIntersectingCircle(geocode.latitude,
geocode.longitude,
geocode.distanceKm,
treeLevel);
// If there are other nodes intersecting query circle, also add them in.
if (nodes != null) {
while (nodes.hasNext()) {
geoHashSet.add(nodes.next().getTokenBytesNoLeaf(new BytesRef()));
}
}
}
// Match any document containing at least one of the collected geohash terms.
return new com.twitter.search.common.query.MultiTermDisjunctionQuery(field, geoHashSet);
}
}

View File

@ -1,76 +0,0 @@
package com.twitter.search.common.search;
import java.util.Arrays;
import org.apache.lucene.search.DocIdSetIterator;
/**
* DocIdSetIterator implementation from a sorted list of non-negative integers. If the given list of
* doc IDs is not sorted or contains negative doc IDs, the results are undefined.
*/
public class IntArrayDocIdSetIterator extends DocIdSetIterator {
private final int[] docIds;
// Current doc ID (-1 before iteration starts, NO_MORE_DOCS when exhausted).
private int docId;
// Index into docIds of the current doc (-1 before iteration starts).
private int cursor;
public IntArrayDocIdSetIterator(int[] ids) {
docIds = ids;
reset();
}
/** Used for testing. */
public void reset() {
docId = -1;
cursor = -1;
}
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() {
// Advancing to the current doc ID hits the "target == docId" branch below,
// which simply moves the cursor one step forward.
return advance(docId);
}
@Override
public int advance(int target) {
if (docId == NO_MORE_DOCS) {
return docId;
}
// Per the DocIdSetIterator contract, advancing backwards is a no-op.
if (target < docId) {
return docId;
}
if (cursor == docIds.length - 1) {
docId = NO_MORE_DOCS;
return docId;
}
if (target == docId) {
docId = docIds[++cursor];
return docId;
}
// Limit the binary-search window: since doc IDs are distinct and sorted, the
// target can be at most (target - docId) positions past the cursor.
int toIndex = Math.min(cursor + (target - docId) + 1, docIds.length);
int targetIndex = Arrays.binarySearch(docIds, cursor + 1, toIndex, target);
if (targetIndex < 0) {
// Not found: binarySearch returns (-insertionPoint - 1); use the insertion
// point, i.e. the first element greater than target.
targetIndex = -targetIndex - 1;
}
if (targetIndex == docIds.length) {
docId = NO_MORE_DOCS;
} else {
cursor = targetIndex;
docId = docIds[cursor];
}
return docId;
}
@Override
public long cost() {
return docIds == null ? 0 : docIds.length;
}
}

View File

@ -1,82 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import com.google.common.base.Preconditions;
import org.apache.lucene.search.DocIdSetIterator;
/**
* Disjunction over 2 DocIdSetIterators. This should be faster than a disjunction over N since there
* would be no need to adjust the heap.
*/
public class PairDocIdSetIterator extends DocIdSetIterator {
private final DocIdSetIterator d1;
private final DocIdSetIterator d2;
// Current doc ID of the disjunction (-1 before the first call to nextDoc).
private int doc = -1;
/** Creates a new PairDocIdSetIterator instance. */
public PairDocIdSetIterator(DocIdSetIterator d1, DocIdSetIterator d2) throws IOException {
Preconditions.checkNotNull(d1);
Preconditions.checkNotNull(d2);
this.d1 = d1;
this.d2 = d2;
// position the iterators
this.d1.nextDoc();
this.d2.nextDoc();
}
@Override
public int docID() {
return doc;
}
@Override
public int nextDoc() throws IOException {
// Invariant: both sub-iterators are already positioned at their next candidate.
// Emit the smaller of the two and advance only the iterator(s) that produced it.
int doc1 = d1.docID();
int doc2 = d2.docID();
DocIdSetIterator iter = null;
if (doc1 < doc2) {
doc = doc1;
iter = d1;
} else if (doc1 > doc2) {
doc = doc2;
iter = d2;
} else {
// Both are on the same doc: emit it once and advance both below.
doc = doc1;
}
if (doc != NO_MORE_DOCS) {
if (iter != null) {
iter.nextDoc();
} else {
d1.nextDoc();
d2.nextDoc();
}
}
return doc;
}
@Override
public int advance(int target) throws IOException {
// Move each sub-iterator up to the target, then let nextDoc() pick the smaller.
if (d1.docID() < target) {
d1.advance(target);
}
if (d2.docID() < target) {
d2.advance(target);
}
return (doc != NO_MORE_DOCS) ? nextDoc() : doc;
}
@Override
public long cost() {
// very coarse estimate
return d1.cost() + d2.cost();
}
}

View File

@ -1,9 +0,0 @@
package com.twitter.search.common.search;
/**
* Any class that can track and return query cost.
*/
public interface QueryCostProvider {
/** Returns the total query cost accumulated so far. */
double getTotalCost();
}

View File

@ -1,202 +0,0 @@
package com.twitter.search.common.search;
import java.util.HashSet;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
/**
* Used for tracking termination criteria for earlybird queries.
*
* Currently this tracks the query time out and query cost, if they are set on the
* {@link com.twitter.search.common.query.thriftjava.CollectorTerminationParams}.
*/
public class TerminationTracker {
/** Query start time provided by client. */
private final long clientStartTimeMillis;
/** Timeout end times, calculated from {@link #clientStartTimeMillis}. */
private final long timeoutEndTimeMillis;
/** Query start time recorded at earlybird server. */
private final long localStartTimeMillis;
/** Maximum allowed query cost; tracking query cost. */
private final double maxQueryCost;
// Sometimes, we want to early terminate before timeoutEndTimeMillis, to reserve time for
// work that needs to be done after early termination (E.g. merging results).
private final int postTerminationOverheadMillis;
// We don't check for early termination often enough. Sometimes requests time out in between
// early termination checks. This buffer time is also subtracted from the deadline.
// To illustrate how this is used, let's use a simple example:
// If we spent 750ms searching 5 segments, a rough estimate is that we need 150ms to search
// one segment. If the timeout is set to 800ms, we should not start searching the next segment.
// In this case, one can set preTerminationSafeBufferTimeMillis to 150ms, so that when the early
// termination check computes the deadline, this buffer is also subtracted. See SEARCH-29723.
private int preTerminationSafeBufferTimeMillis = 0;
// Current termination state; starts out as COLLECTING (not terminated).
private EarlyTerminationState earlyTerminationState = EarlyTerminationState.COLLECTING;
// This flag determines whether the last searched doc ID trackers should be consulted when a
// timeout occurs.
private final boolean useLastSearchedDocIdOnTimeout;
private final Set<DocIdTracker> lastSearchedDocIdTrackers = new HashSet<>();
/**
* Creates a new termination tracker that will not specify a timeout or max query cost.
* Can be used for queries that explicitly do not want to use a timeout. Meant to be used for
* tests, and background queries running for the query cache.
*/
public TerminationTracker(Clock clock) {
this.clientStartTimeMillis = clock.nowMillis();
this.localStartTimeMillis = clientStartTimeMillis;
this.timeoutEndTimeMillis = Long.MAX_VALUE;
this.maxQueryCost = Double.MAX_VALUE;
this.postTerminationOverheadMillis = 0;
this.useLastSearchedDocIdOnTimeout = false;
}
/**
* Convenient method overloading for
* {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}.
*/
public TerminationTracker(
CollectorTerminationParams terminationParams, Clock clock,
int postTerminationOverheadMillis) {
this(terminationParams, clock.nowMillis(), clock, postTerminationOverheadMillis);
}
/**
* Convenient method overloading for
* {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}.
*/
public TerminationTracker(
CollectorTerminationParams terminationParams, int postTerminationOverheadMillis) {
this(
terminationParams,
System.currentTimeMillis(),
Clock.SYSTEM_CLOCK,
postTerminationOverheadMillis);
}
/**
* Creates a new TerminationTracker instance.
*
* @param terminationParams CollectorParams.CollectorTerminationParams carrying parameters
* about early termination.
* @param clientStartTimeMillis The query start time (in millis) specified by client. This is used
* to calculate timeout end time, like {@link #timeoutEndTimeMillis}.
* @param clock used to sample {@link #localStartTimeMillis}.
* @param postTerminationOverheadMillis How much time should be reserved. E.g. if request time
* out is 800ms, and this is set to 200ms, early termination
* will kick in at 600ms mark.
*/
public TerminationTracker(
CollectorTerminationParams terminationParams,
long clientStartTimeMillis,
Clock clock,
int postTerminationOverheadMillis) {
Preconditions.checkNotNull(terminationParams);
Preconditions.checkArgument(postTerminationOverheadMillis >= 0);
this.clientStartTimeMillis = clientStartTimeMillis;
this.localStartTimeMillis = clock.nowMillis();
if (terminationParams.isSetTimeoutMs()
&& terminationParams.getTimeoutMs() > 0) {
Preconditions.checkState(terminationParams.getTimeoutMs() >= postTerminationOverheadMillis);
this.timeoutEndTimeMillis = this.clientStartTimeMillis + terminationParams.getTimeoutMs();
} else {
// Effectively no timeout.
this.timeoutEndTimeMillis = Long.MAX_VALUE;
}
// Tracking query cost
if (terminationParams.isSetMaxQueryCost()
&& terminationParams.getMaxQueryCost() > 0) {
maxQueryCost = terminationParams.getMaxQueryCost();
} else {
maxQueryCost = Double.MAX_VALUE;
}
this.useLastSearchedDocIdOnTimeout = terminationParams.isEnforceQueryTimeout();
this.postTerminationOverheadMillis = postTerminationOverheadMillis;
}
/**
* Returns the reserve time to perform post termination work. Return the deadline timestamp
* with postTerminationWorkEstimate subtracted.
*/
public long getTimeoutEndTimeWithReservation() {
// Return huge value if time out is disabled.
if (timeoutEndTimeMillis == Long.MAX_VALUE) {
return timeoutEndTimeMillis;
} else {
return timeoutEndTimeMillis
- postTerminationOverheadMillis
- preTerminationSafeBufferTimeMillis;
}
}
public void setPreTerminationSafeBufferTimeMillis(int preTerminationSafeBufferTimeMillis) {
Preconditions.checkArgument(preTerminationSafeBufferTimeMillis >= 0);
this.preTerminationSafeBufferTimeMillis = preTerminationSafeBufferTimeMillis;
}
public long getLocalStartTimeMillis() {
return localStartTimeMillis;
}
public long getClientStartTimeMillis() {
return clientStartTimeMillis;
}
public double getMaxQueryCost() {
return maxQueryCost;
}
public boolean isEarlyTerminated() {
return earlyTerminationState.isTerminated();
}
public EarlyTerminationState getEarlyTerminationState() {
return earlyTerminationState;
}
public void setEarlyTerminationState(EarlyTerminationState earlyTerminationState) {
this.earlyTerminationState = earlyTerminationState;
}
/**
* Return the minimum searched doc ID amongst all registered trackers, or -1 if there aren't any
* trackers. Doc IDs are stored in ascending order, and trackers update their doc IDs as they
* search, so the minimum doc ID reflects the most recent fully searched doc ID.
*/
int getLastSearchedDocId() {
return lastSearchedDocIdTrackers.stream()
.mapToInt(DocIdTracker::getCurrentDocId).min().orElse(-1);
}
void resetDocIdTrackers() {
lastSearchedDocIdTrackers.clear();
}
/**
* Add a DocIdTracker, to keep track of the last fully-searched doc ID when early termination
* occurs.
*/
public void addDocIdTracker(DocIdTracker docIdTracker) {
lastSearchedDocIdTrackers.add(docIdTracker);
}
public boolean useLastSearchedDocIdOnTimeout() {
return useLastSearchedDocIdOnTimeout;
}
}

View File

@ -1,31 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import org.apache.lucene.search.Collector;
/**
 * Lucene Collectors throw CollectionTerminatedException to perform early termination.
 * We don't believe that throwing Exceptions to control execution flow is ideal, so we are adding
 * this class to be a base of all Twitter Collectors.
 *
 * {@link com.twitter.search.common.search.TwitterIndexSearcher} uses the {@link #isTerminated()}
 * method to perform early termination, instead of relying on CollectionTerminatedException.
 */
public abstract class TwitterCollector implements Collector {
  /**
   * Subclasses should return true if they want to perform early termination.
   * This method is called every hit and should not be expensive.
   *
   * @return true if collection should stop, false to keep collecting.
   * @throws IOException if the termination check needs to touch the index and fails.
   */
  public abstract boolean isTerminated() throws IOException;
  /**
   * Lucene API only has a method that's called before searching a segment setNextReader().
   * This hook is called after finishing searching a segment.
   *
   * @param lastSearchedDocID is the last docid searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc need not be a hit,
   * and should not be collected here.
   * @throws IOException if per-segment finalization fails.
   */
  public abstract void finishSegment(int lastSearchedDocID) throws IOException;
}

View File

@ -1,328 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import java.util.List;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
/**
 * A TwitterCollector containing the most common early termination logic based on
 * timeout, cost, and max hits. This class does not do any actual hit collection---this class
 * is abstract and cannot be instantiated.
 *
 * If a Collector and all its subclasses need early termination, it should extend this class.
 *
 * However, if one just wants to add EarlyTermination to any single collector, he can just
 * use {@link DelegatingEarlyTerminationCollector}
 * as a wrapper.
 */
public abstract class TwitterEarlyTerminationCollector
    extends TwitterCollector implements LeafCollector {
  private static final Logger LOG = LoggerFactory.getLogger(TwitterEarlyTerminationCollector.class);
  // Counts segments whose computed average search time came out negative (clock skew guard).
  private static final SearchCounter NEGATIVE_TIME_PER_SEGMENT =
      SearchCounter.export("TwitterEarlyTerminationCollector_negative_time_per_segment");
  // Counts queries whose timeout was enforced via the last-searched-doc-ID mechanism.
  private static final SearchRateCounter QUERY_TIMEOUT_ENFORCED =
      SearchRateCounter.export("TwitterEarlyTerminationCollector_query_timeout_enforced");
  // Doc ID currently being collected within the current segment; -1 between segments.
  protected int curDocId = -1;
  protected Scorable scorer = null;
  // Reader of the segment currently being searched; null between segments.
  private LeafReader curReader = null;
  private final long maxHitsToProcess;
  private long numHitsProcessed = 0;
  // Doc ID at which the last expensive termination check ran; used to rate-limit checks.
  private int lastEarlyTerminationCheckDocId = -1;
  private final Clock clock;
  @Nullable
  private final QueryCostProvider queryCostProvider;
  private final TerminationTracker terminationTracker;
  // This determines how often the expensive early termination check is performed.
  // If set to be negative, expensive early termination check only performed at segment boundaries.
  // If set to a positive number X, this check is performed every X docs processed.
  private int numDocsBetweenTimeoutChecks;
  // Number of segments searched so far.
  // This is used to predicatively early terminate.
  // Expensive early termination checks may not happen often enough. Sometimes the request
  // times out in between the termination checks.
  // After finishing searching a segment, we estimate how much time is needed to search one
  // segment on average. If searching the next segment would cause a timeout, we early terminate.
  private int numSearchedSegments = 0;
  /**
   * Creates a new TwitterEarlyTerminationCollector instance.
   *
   * @param collectorParams the parameters needed to guide early termination.
   * @param terminationTracker If null is passed in, a new TerminationTrack is created. Otherwise,
   *                           the one passed in is used.
   * @param numDocsBetweenTimeoutChecks TerminationTracker based check are performed upon a hit
   *                                    every numDocsBetweenTimeoutChecks docs. If a non-positive number is passed
   *                                    in, TerminationTracker based checks are disabled.
   *                                    If collectorParams specifies a value as well, that value is used.
   */
  public TwitterEarlyTerminationCollector(
      CollectorParams collectorParams,
      TerminationTracker terminationTracker,
      @Nullable QueryCostProvider queryCostProvider,
      int numDocsBetweenTimeoutChecks,
      Clock clock) {
    CollectorTerminationParams terminationParams = collectorParams.getTerminationParams();
    if (terminationParams == null) {
      // No params supplied: fall back to effectively-unlimited termination settings.
      terminationParams = new CollectorTerminationParams()
          .setMaxHitsToProcess(Integer.MAX_VALUE)
          .setMaxQueryCost(Double.MAX_VALUE)
          .setTimeoutMs(Integer.MAX_VALUE);
    }
    if (!terminationParams.isSetMaxHitsToProcess() || terminationParams.getMaxHitsToProcess() < 0) {
      maxHitsToProcess = Integer.MAX_VALUE;
    } else {
      maxHitsToProcess = terminationParams.getMaxHitsToProcess();
    }
    // The request-level setting, when present, overrides the constructor argument.
    if (terminationParams.isSetNumDocsBetweenTimeoutChecks()) {
      this.numDocsBetweenTimeoutChecks = terminationParams.getNumDocsBetweenTimeoutChecks();
    } else {
      this.numDocsBetweenTimeoutChecks = numDocsBetweenTimeoutChecks;
    }
    this.terminationTracker = Preconditions.checkNotNull(terminationTracker);
    this.queryCostProvider = queryCostProvider;
    this.clock = clock;
  }
  /** This collector acts as its own LeafCollector; delegate to {@link #setNextReader}. */
  public final LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
    this.setNextReader(context);
    return this;
  }
  /**
   * Sub-classes may override this to add more collection logic.
   */
  protected abstract void doCollect() throws IOException;
  /**
   * Sub-classes may override this to add more segment completion logic.
   * @param lastSearchedDocID is the last docid searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit!
   */
  protected abstract void doFinishSegment(int lastSearchedDocID) throws IOException;
  /**
   * sub classes can override this to perform more early termination checks.
   */
  public EarlyTerminationState innerShouldCollectMore() throws IOException {
    return EarlyTerminationState.COLLECTING;
  }
  /**
   * After early termination, this method can be used to retrieve early termination reason.
   */
  @Nonnull
  public final EarlyTerminationState getEarlyTerminationState() {
    return terminationTracker.getEarlyTerminationState();
  }
  /** Records the new state on the tracker and returns it, for fluent use in return statements. */
  protected final EarlyTerminationState setEarlyTerminationState(
      EarlyTerminationState newEarlyTerminationState) {
    terminationTracker.setEarlyTerminationState(newEarlyTerminationState);
    return newEarlyTerminationState;
  }
  @Override
  public final boolean isTerminated() throws IOException {
    EarlyTerminationState earlyTerminationState = getEarlyTerminationState();
    if (earlyTerminationState.isTerminated()) {
      return true;
    }
    if (getNumHitsProcessed() >= getMaxHitsToProcess()) {
      // Give subclasses a chance to react, and optionally veto the max-hits termination.
      collectedEnoughResults();
      if (shouldTerminate()) {
        return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED)
            .isTerminated();
      } else {
        return false;
      }
    }
    return innerShouldCollectMore().isTerminated();
  }
  /**
   * Note: subclasses overriding this method are expected to call "super.setNextReader"
   * in their setNextReader().
   * @deprecated Remove this methods in favor of {@link #getLeafCollector(LeafReaderContext)}
   */
  @Deprecated
  public void setNextReader(LeafReaderContext context) throws IOException {
    // When timeout is enforced at the leaves, the scorer handles timeout checking itself.
    if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
      expensiveEarlyTerminationCheck();
    }
    // Reset curDocId for next segment
    curDocId = -1;
    lastEarlyTerminationCheckDocId = -1;
    curReader = context.reader();
  }
  /**
   * Sub-classes overriding this method are expected to call super.setScorer()
   */
  @Override
  public void setScorer(Scorable scorer) throws IOException {
    this.scorer = scorer;
  }
  @Override
  public final void collect(int doc) throws IOException {
    curDocId = doc;
    doCollect();
    numHitsProcessed++;
    // Only run the expensive check every numDocsBetweenTimeoutChecks docs.
    if (numDocsBetweenTimeoutChecks > 0
        && (curDocId - lastEarlyTerminationCheckDocId) >= numDocsBetweenTimeoutChecks) {
      lastEarlyTerminationCheckDocId = curDocId;
      if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
        expensiveEarlyTerminationCheck();
      }
    }
  }
  /**
   * Accounting for a segment searched.
   * @param lastSearchedDocID is the last docid searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit!
   */
  protected final void trackCompleteSegment(int lastSearchedDocID) throws IOException {
    doFinishSegment(lastSearchedDocID);
  }
  @Override
  public final void finishSegment(int lastSearchedDocID) throws IOException {
    // finished searching a segment. Computer average time needed to search a segment.
    Preconditions.checkState(curReader != null, "Did subclass call super.setNextReader()?");
    numSearchedSegments++;
    long totalTime = clock.nowMillis() - terminationTracker.getLocalStartTimeMillis();
    if (totalTime >= Integer.MAX_VALUE) {
      String msg = String.format(
          "%s: A query runs for %d that is longer than Integer.MAX_VALUE ms. lastSearchedDocID: %d",
          getClass().getSimpleName(), totalTime, lastSearchedDocID
      );
      LOG.error(msg);
      throw new IllegalStateException(msg);
    }
    int timePerSegment = ((int) totalTime) / numSearchedSegments;
    if (timePerSegment < 0) {
      // Guard against clock anomalies producing a negative average; clamp and count.
      NEGATIVE_TIME_PER_SEGMENT.increment();
      timePerSegment = 0;
    }
    // If we're enforcing timeout via the last searched doc ID, we don't need to add this buffer,
    // since we'll detect the timeout right away.
    if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
      terminationTracker.setPreTerminationSafeBufferTimeMillis(timePerSegment);
    }
    // Check whether we timed out and are checking for timeout at the leaves. If so, we should use
    // the captured lastSearchedDocId from the tracker instead, which is the most up-to-date amongst
    // the query nodes.
    if (terminationTracker.useLastSearchedDocIdOnTimeout()
        && EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED.equals(
            terminationTracker.getEarlyTerminationState())) {
      QUERY_TIMEOUT_ENFORCED.increment();
      trackCompleteSegment(terminationTracker.getLastSearchedDocId());
    } else {
      trackCompleteSegment(lastSearchedDocID);
    }
    // We finished a segment, so clear out the DocIdTrackers. The next segment will register its
    // own trackers, and we don't need to keep the trackers from the current segment.
    terminationTracker.resetDocIdTrackers();
    curDocId = -1;
    curReader = null;
    scorer = null;
  }
  /**
   * More expensive Early Termination checks, which are not called every hit.
   * This sets EarlyTerminationState if it decides that early termination should kick in.
   * See: SEARCH-29723.
   */
  private void expensiveEarlyTerminationCheck() {
    if (queryCostProvider != null) {
      double totalQueryCost = queryCostProvider.getTotalCost();
      double maxQueryCost = terminationTracker.getMaxQueryCost();
      if (totalQueryCost >= maxQueryCost) {
        setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_QUERY_COST_EXCEEDED);
      }
    }
    final long nowMillis = clock.nowMillis();
    if (nowMillis >= terminationTracker.getTimeoutEndTimeWithReservation()) {
      setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
    }
  }
  public long getMaxHitsToProcess() {
    return maxHitsToProcess;
  }
  public final void setNumHitsProcessed(long numHitsProcessed) {
    this.numHitsProcessed = numHitsProcessed;
  }
  protected final long getNumHitsProcessed() {
    return numHitsProcessed;
  }
  protected final int getNumSearchedSegments() {
    return numSearchedSegments;
  }
  protected final Clock getClock() {
    return clock;
  }
  @VisibleForTesting
  protected final TerminationTracker getTerminationTracker() {
    return this.terminationTracker;
  }
  /** Hook invoked when maxHitsToProcess is reached, before deciding whether to terminate. */
  protected void collectedEnoughResults() throws IOException {
  }
  /** Whether reaching maxHitsToProcess should actually terminate; subclasses may veto. */
  protected boolean shouldTerminate() {
    return true;
  }
  /**
   * Debug info collected during execution.
   */
  public abstract List<String> getDebugInfo();
}

View File

@ -1,189 +0,0 @@
package com.twitter.search.common.search;
import java.io.IOException;
import java.util.List;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;
/**
 * An IndexSearch that works with TwitterEarlyTerminationCollector.
 * If a stock Lucene collector is passed into search(), this IndexSearch.search() behaves the
 * same as Lucene's stock IndexSearcher. However, if a TwitterEarlyTerminationCollector is passed
 * in, this IndexSearcher performs early termination without relying on
 * CollectionTerminatedException.
 */
public class TwitterIndexSearcher extends IndexSearcher {
  public TwitterIndexSearcher(IndexReader r) {
    super(r);
  }
  /**
   * search() main loop.
   * This behaves exactly like IndexSearcher.search() if a stock Lucene collector passed in.
   * However, if a TwitterCollector is passed in, this class performs Twitter style early
   * termination without relying on
   * {@link org.apache.lucene.search.CollectionTerminatedException}.
   */
  @Override
  protected void search(List<LeafReaderContext> leaves, Weight weight, Collector coll)
      throws IOException {
    // If an TwitterCollector is passed in, we can do a few extra things in here, such
    // as early termination. Otherwise we can just fall back to IndexSearcher.search().
    if (coll instanceof TwitterCollector) {
      TwitterCollector collector = (TwitterCollector) coll;
      for (LeafReaderContext ctx : leaves) { // search each subreader
        if (collector.isTerminated()) {
          return;
        }
        // Notify the collector that we're starting this segment, and check for early
        // termination criteria again. setNextReader() performs 'expensive' early
        // termination checks in some implementations such as TwitterEarlyTerminationCollector.
        LeafCollector leafCollector = collector.getLeafCollector(ctx);
        if (collector.isTerminated()) {
          return;
        }
        // Initialize the scorer - it should not be null. Note that constructing the scorer
        // may actually do real work, such as advancing to the first hit.
        Scorer scorer = weight.scorer(ctx);
        if (scorer == null) {
          // Nothing matches in this segment; still report segment completion.
          collector.finishSegment(DocIdSetIterator.NO_MORE_DOCS);
          continue;
        }
        leafCollector.setScorer(scorer);
        // Start searching.
        DocIdSetIterator docIdSetIterator = scorer.iterator();
        int docID = docIdSetIterator.nextDoc();
        if (docID != DocIdSetIterator.NO_MORE_DOCS) {
          // Collect results. Note: check isTerminated() before calling nextDoc().
          do {
            leafCollector.collect(docID);
          } while (!collector.isTerminated()
              && (docID = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS);
        }
        // Always finish the segment, providing the last docID advanced to.
        collector.finishSegment(docID);
      }
    } else {
      // The collector given is not a TwitterCollector, just use stock lucene search().
      super.search(leaves, weight, coll);
    }
  }
  /** Returns {@link NumericDocValues} for this field, or
   * null if no {@link NumericDocValues} were indexed for
   * this field. The returned instance should only be
   * used by a single thread. */
  public NumericDocValues getNumericDocValues(String field) throws IOException {
    return MultiDocValues.getNumericValues(getIndexReader(), field);
  }
  @Override
  public CollectionStatistics collectionStatistics(String field) throws IOException {
    return collectionStatistics(field, getIndexReader());
  }
  @Override
  public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) {
    return termStats(term, docFreq, totalTermFreq);
  }
  /**
   * Lucene relies on the fact that maxDocID is typically equal to the number of documents in the
   * index, which is false when we have sparse doc IDs or when we start from 8 million docs and
   * decrement, so in this class we pass in numDocs instead of the maximum assigned document ID.
   * Note that the comment on {@link CollectionStatistics#maxDoc()} says that it returns the number
   * of documents in the segment, not the maximum ID, and that it is only used this way. This is
   * necessary for all lucene scoring methods, e.g.
   * {@link org.apache.lucene.search.similarities.TFIDFSimilarity#idfExplain}. This method body is
   * largely copied from {@link IndexSearcher#collectionStatistics(String)}.
   */
  public static CollectionStatistics collectionStatistics(String field, IndexReader indexReader)
      throws IOException {
    Preconditions.checkNotNull(field);
    int docsWithField = 0;
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    // Aggregate per-leaf term statistics across all segments.
    for (LeafReaderContext leaf : indexReader.leaves()) {
      Terms terms = leaf.reader().terms(field);
      if (terms == null) {
        continue;
      }
      docsWithField += terms.getDocCount();
      sumTotalTermFreq += terms.getSumTotalTermFreq();
      sumDocFreq += terms.getSumDocFreq();
    }
    if (docsWithField == 0) {
      // The CollectionStatistics API in Lucene is designed poorly. On one hand, starting with
      // Lucene 8.0.0, searchers are expected to always produce valid CollectionStatistics instances
      // and all int fields in these instances are expected to be strictly greater than 0. On the
      // other hand, Lucene itself produces null CollectionStatistics instances in a few places.
      // Also, there's no good placeholder value to indicate that a field is empty, which is a very
      // reasonable thing to happen (for example, the first few tweets in a new segment might not
      // have any links, so then the resolved_links_text would be empty). So to get around this
      // issue, we do here what Lucene does: we return a CollectionStatistics instance with all
      // fields set to 1.
      return new CollectionStatistics(field, 1, 1, 1, 1);
    }
    // The writer could have added more docs to the index since this searcher started processing
    // this request, or could be in the middle of adding a doc, which could mean that only some of
    // the docsWithField, sumTotalTermFreq and sumDocFreq stats have been updated. I don't think
    // this is a big deal, as these stats are only used for computing a hit's score, and minor
    // inaccuracies should have very little effect on a hit's final score. But CollectionStatistic's
    // constructor has some strict asserts for the relationship between these stats. So we need to
    // make sure we cap the values of these stats appropriately.
    //
    // Adjust numDocs based on docsWithField (instead of doing the opposite), because:
    // 1. If new documents were added to this segment after the reader was created, it seems
    //    reasonable to take the more recent information into account.
    // 2. The termStats() method below will return the most recent docFreq (not the value that
    //    docFreq was set to when this reader was created). If this value is higher than numDocs,
    //    then Lucene might end up producing negative scores, which must never happen.
    int numDocs = Math.max(indexReader.numDocs(), docsWithField);
    sumDocFreq = Math.max(sumDocFreq, docsWithField);
    sumTotalTermFreq = Math.max(sumTotalTermFreq, sumDocFreq);
    return new CollectionStatistics(field, numDocs, docsWithField, sumTotalTermFreq, sumDocFreq);
  }
  /**
   * This method body is largely copied from {@link IndexSearcher#termStatistics(Term, int, long)}.
   * The only difference is that we make sure all parameters we pass to the TermStatistics instance
   * we create are set to at least 1 (because Lucene 8.0.0 expects them to be).
   */
  public static TermStatistics termStats(Term term, int docFreq, long totalTermFreq) {
    // Lucene expects the doc frequency and total term frequency to be at least 1. This assumption
    // doesn't always make sense (the segment can be empty -- see comment above), but to make Lucene
    // happy, make sure to always set these parameters to at least 1.
    int adjustedDocFreq = Math.max(docFreq, 1);
    return new TermStatistics(
        term.bytes(),
        adjustedDocFreq,
        Math.max(totalTermFreq, adjustedDocFreq));
  }
}

View File

@ -1,20 +0,0 @@
# Build target for the query-termination support classes in this package
# (termination trackers, query timeouts, and timeout-aware queries/scorers).
java_library(
    name = "termination",
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/lucene:lucene-facet",
        "3rdparty/jvm/org/apache/lucene:lucene-queries",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/query",
        "src/java/com/twitter/search/common/search",
        "src/thrift/com/twitter/search:earlybird-java",
    ],
)

View File

@ -1,24 +0,0 @@
package com.twitter.search.common.search.termination;
import com.twitter.search.common.search.DocIdTracker;
/**
 * QueryTimeout provides a method for early termination of queries.
 */
public interface QueryTimeout {
  /**
   * Returns true if query processing should terminate, otherwise false.
   * Called from tight search loops, so implementations should be cheap.
   */
  boolean shouldExit();
  /**
   * Register a DocIdTracker for the scope of the query, to determine the last fully-searched
   * doc ID after early termination.
   */
  void registerDocIdTracker(DocIdTracker docIdTracker);
  /**
   * Return client ID of query.
   */
  String getClientId();
}

View File

@ -1,34 +0,0 @@
package com.twitter.search.common.search.termination;
import com.twitter.common.util.Clock;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
public class QueryTimeoutFactory {
  /**
   * Creates a QueryTimeout instance for a given EarlybirdRequest and TerminationTracker, if the
   * required conditions for leaf-level timeout checking are met. Returns null otherwise.
   *
   * The conditions are:
   * 1) CollectorTerminationParams.isEnforceQueryTimeout()
   * 2) CollectorTerminationParams.isSetTimeoutMs()
   */
  public QueryTimeout createQueryTimeout(
      EarlybirdRequest request,
      TerminationTracker tracker,
      Clock clock) {
    // Guard clauses: without a tracker, a request, or fully populated collector params
    // there is nothing to build.
    if (tracker == null || request == null || !request.isSetSearchQuery()) {
      return null;
    }
    if (!request.getSearchQuery().isSetCollectorParams()
        || !request.getSearchQuery().getCollectorParams().isSetTerminationParams()) {
      return null;
    }
    // Leaf-level timeout checking requires both the enforcement flag and an explicit timeout.
    boolean enforceTimeout = request.getSearchQuery().getCollectorParams().getTerminationParams()
        .isEnforceQueryTimeout();
    boolean timeoutIsSet = request.getSearchQuery().getCollectorParams().getTerminationParams()
        .isSetTimeoutMs();
    if (enforceTimeout && timeoutIsSet) {
      return new QueryTimeoutImpl(request.getClientId(), tracker, clock);
    }
    return null;
  }
}

View File

@ -1,65 +0,0 @@
package com.twitter.search.common.search.termination;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.search.DocIdTracker;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.search.TerminationTracker;
/**
 * QueryTimeoutImpl provides a method for early termination of queries based on time.
 * The deadline comes from a {@link TerminationTracker}; the current time from an injected
 * {@link Clock} so behavior is testable.
 */
public class QueryTimeoutImpl implements QueryTimeout {
  private final String clientId;
  private final TerminationTracker tracker;
  private final Clock clock;
  private final SearchRateCounter shouldTerminateCounter;
  public QueryTimeoutImpl(String clientId, TerminationTracker tracker, Clock clock) {
    this.clientId = Preconditions.checkNotNull(clientId);
    this.tracker = Preconditions.checkNotNull(tracker);
    this.clock = Preconditions.checkNotNull(clock);
    // Per-client counter of how many times this timeout signaled termination.
    shouldTerminateCounter =
        SearchRateCounter.export("query_timeout_should_terminate_" + clientId);
  }
  /**
   * Returns true when the clock's time has met or exceeded the tracker's timeout end.
   */
  public boolean shouldExit() {
    boolean deadlinePassed =
        clock.nowMillis() >= tracker.getTimeoutEndTimeWithReservation();
    if (!deadlinePassed) {
      return false;
    }
    // Record the termination reason and bump the per-client counter before signaling exit.
    tracker.setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
    shouldTerminateCounter.increment();
    return true;
  }
  @Override
  public void registerDocIdTracker(DocIdTracker docIdTracker) {
    tracker.addDocIdTracker(docIdTracker);
  }
  @Override
  public String getClientId() {
    return clientId;
  }
  @Override
  public int hashCode() {
    // Mix the two identity components (same formula as always: 13 * clientId + tracker).
    return clientId.hashCode() * 13 + tracker.hashCode();
  }
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof QueryTimeoutImpl)) {
      return false;
    }
    QueryTimeoutImpl other = (QueryTimeoutImpl) obj;
    return clientId.equals(other.clientId) && tracker.equals(other.tracker);
  }
}

View File

@ -1,66 +0,0 @@
package com.twitter.search.common.search.termination;
import java.io.IOException;
import java.util.Arrays;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
 * Query implementation that can timeout and return non-exhaustive results.
 * Wraps an inner Query and decorates its Weight with timeout-aware scoring.
 */
public class TerminationQuery extends Query {
  private final Query inner;
  private final QueryTimeout timeout;
  public TerminationQuery(Query inner, QueryTimeout timeout) {
    this.inner = Preconditions.checkNotNull(inner);
    this.timeout = Preconditions.checkNotNull(timeout);
  }
  @Override
  public Weight createWeight(
      IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    // Build the inner weight first, then wrap it so scorers honor the timeout.
    Weight wrapped = inner.createWeight(searcher, scoreMode, boost);
    return new TerminationQueryWeight(this, wrapped, timeout);
  }
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    Query rewrittenInner = inner.rewrite(reader);
    // Only allocate a new wrapper when the inner query actually changed.
    return rewrittenInner == inner ? this : new TerminationQuery(rewrittenInner, timeout);
  }
  public QueryTimeout getTimeout() {
    return timeout;
  }
  @Override
  public int hashCode() {
    return Arrays.hashCode(new Object[] {inner, timeout});
  }
  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof TerminationQuery)) {
      return false;
    }
    TerminationQuery other = (TerminationQuery) obj;
    // inner and timeout are never null (checked in the constructor), so direct
    // equals() calls are equivalent to the element-wise array comparison used before.
    return inner.equals(other.inner) && timeout.equals(other.timeout);
  }
  @Override
  public String toString(String field) {
    return inner.toString(field);
  }
}

View File

@ -1,91 +0,0 @@
package com.twitter.search.common.search.termination;
import java.io.IOException;
import com.google.common.base.Preconditions;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.FilteredScorer;
import com.twitter.search.common.search.DocIdTracker;
/**
 * Scorer implementation that adds termination support for an underlying query.
 * Meant to be used in conjunction with {@link TerminationQuery}.
 */
public class TerminationQueryScorer extends FilteredScorer implements DocIdTracker {
  private final QueryTimeout timeout;
  // Last doc ID actually traversed in the underlying posting list; -1 before iteration starts.
  private int lastSearchedDocId = -1;
  TerminationQueryScorer(Weight weight, Scorer inner, QueryTimeout timeout) {
    super(weight, inner);
    this.timeout = Preconditions.checkNotNull(timeout);
    // Register with the timeout so the tracker can recover the last searched doc ID on timeout.
    this.timeout.registerDocIdTracker(this);
    SearchRateCounter.export(
        timeout.getClientId() + "_num_termination_query_scorers_created").increment();
  }
  @Override
  public DocIdSetIterator iterator() {
    final DocIdSetIterator superDISI = super.iterator();
    return new DocIdSetIterator() {
      // lastSearchedDocId is the ID of the last document that was traversed in the posting list.
      // docId is the current doc ID in this iterator. In most cases, lastSearchedDocId and docId
      // will be equal. They will be different only if the query needed to be terminated based on
      // the timeout. In that case, docId will be set to NO_MORE_DOCS, but lastSearchedDocId will
      // still be set to the last document that was actually traversed.
      private int docId = -1;
      @Override
      public int docID() {
        return docId;
      }
      @Override
      public int nextDoc() throws IOException {
        // Once exhausted (or timed out), stay exhausted.
        if (docId == NO_MORE_DOCS) {
          return NO_MORE_DOCS;
        }
        if (timeout.shouldExit()) {
          // Timed out: report exhaustion, but leave lastSearchedDocId at the last real doc.
          docId = NO_MORE_DOCS;
        } else {
          docId = superDISI.nextDoc();
          lastSearchedDocId = docId;
        }
        return docId;
      }
      @Override
      public int advance(int target) throws IOException {
        if (docId == NO_MORE_DOCS) {
          return NO_MORE_DOCS;
        }
        if (target == NO_MORE_DOCS) {
          // Explicit advance to the end counts as real traversal, so track it.
          docId = NO_MORE_DOCS;
          lastSearchedDocId = docId;
        } else if (timeout.shouldExit()) {
          // Timed out: as in nextDoc(), do not update lastSearchedDocId.
          docId = NO_MORE_DOCS;
        } else {
          docId = superDISI.advance(target);
          lastSearchedDocId = docId;
        }
        return docId;
      }
      @Override
      public long cost() {
        return superDISI.cost();
      }
    };
  }
  @Override
  public int getCurrentDocId() {
    return lastSearchedDocId;
  }
}

View File

@ -1,53 +0,0 @@
package com.twitter.search.common.search.termination;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;
/**
 * Weight implementation that adds termination support for an underlying query.
 * Meant to be used in conjunction with {@link TerminationQuery}.
 */
public class TerminationQueryWeight extends Weight {
  private final Weight wrappedWeight;
  private final QueryTimeout queryTimeout;

  TerminationQueryWeight(TerminationQuery query, Weight inner, QueryTimeout timeout) {
    super(query);
    this.wrappedWeight = inner;
    this.queryTimeout = Preconditions.checkNotNull(timeout);
  }

  @Override
  public Explanation explain(LeafReaderContext context, int doc)
      throws IOException {
    // Explanations come straight from the wrapped weight; termination never alters them.
    return wrappedWeight.explain(context, doc);
  }

  @Override
  public Scorer scorer(LeafReaderContext context) throws IOException {
    Scorer innerScorer = wrappedWeight.scorer(context);
    if (innerScorer == null) {
      // No matching documents in this segment: nothing to wrap.
      return null;
    }
    return new TerminationQueryScorer(this, innerScorer, queryTimeout);
  }

  @Override
  public void extractTerms(Set<Term> terms) {
    wrappedWeight.extractTerms(terms);
  }

  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return wrappedWeight.isCacheable(ctx);
  }
}

View File

@ -1,32 +0,0 @@
# Build target for this package's Java sources; the dependency list below shows the search
# modules (schema, relevance, encoding, metrics, thrift-generated types) it builds against.
java_library(
    sources = ["**/*.java"],
    # Pinned to the Java 8 toolchain.
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/com/twitter/elephantbird:core",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/collections",
        "src/java/com/twitter/search/common/encoding/features",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/relevance:ranking",
        "src/java/com/twitter/search/common/relevance:text",
        "src/java/com/twitter/search/common/relevance/features",
        "src/java/com/twitter/search/common/runtime",
        "src/java/com/twitter/search/common/schema/base",
        "src/java/com/twitter/search/common/schema/earlybird",
        "src/thrift/com/twitter/search:earlybird-java",
        "src/thrift/com/twitter/search/adaptive:adaptive-results-java",
        "src/thrift/com/twitter/search/common:constants-java",
        "src/thrift/com/twitter/search/common:indexing-java",
        "src/thrift/com/twitter/search/common:query-java",
        "src/thrift/com/twitter/search/common:ranking-java",
        "util/util-core:scala",
    ],
)

View File

@ -1,269 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;
import com.google.common.base.Preconditions;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.collections.Pair;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.EarlybirdResponseCode;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;
/**
 * Utility methods to merge EarlybirdResponses.
 */
public final class EarlybirdResponseMergeUtil {
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdResponseMergeUtil.class);
  private static final String INVALID_RESPONSE_STATS_PREFIX = "invalid_response_stats_";
  // Stats for invalid earlybird response: one counter per non-SUCCESS response code, keyed by
  // the code. Populated once in the static initializer below.
  private static final ImmutableMap<EarlybirdResponseCode, SearchCounter> ERROR_EXCEPTIONS;
  // Counters for the various ways a response can be invalid or incomplete (see isValidResponse).
  public static final SearchCounter NULL_RESPONSE_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "null_response");
  public static final SearchCounter SEARCH_RESULTS_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_not_set");
  public static final SearchCounter SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_with_results_not_set");
  public static final SearchCounter MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "max_searched_status_id_not_set");
  public static final SearchCounter MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "min_searched_status_id_not_set");

  static {
    // Pre-export one counter per error code so increments below always find an entry.
    ImmutableMap.Builder<EarlybirdResponseCode, SearchCounter> builder = ImmutableMap.builder();
    for (EarlybirdResponseCode responseCode : EarlybirdResponseCode.values()) {
      if (responseCode != EarlybirdResponseCode.SUCCESS) {
        builder.put(responseCode, SearchCounter.export(
            INVALID_RESPONSE_STATS_PREFIX + responseCode.name().toLowerCase()));
      }
    }
    ERROR_EXCEPTIONS = builder.build();
  }

  // Utility class: static methods only, no instances.
  private EarlybirdResponseMergeUtil() {
  }

  /**
   * Tags the results in the given EarlybirdResponse with the given ThriftTweetSource and adds them
   * to the given list of results.
   *
   * @param results The list of results to which the new results will be added.
   * @param earlybirdResponse The EarlybirdResponse whose results will be added to {@code results}.
   * @param tweetSource The ThriftTweetSource that will be used to mark all results in
   *                    {@code earlybirdResponse}.
   * @return {@code false} if {@code earlybirdResponse} is {@code null} or doesn't have any results;
   *         {@code true}, otherwise.
   */
  public static boolean addResultsToList(List<ThriftSearchResult> results,
                                         EarlybirdResponse earlybirdResponse,
                                         ThriftTweetSource tweetSource) {
    return EarlybirdResponseUtil.hasResults(earlybirdResponse)
        && addResultsToList(results,
                            earlybirdResponse.getSearchResults().getResults(),
                            tweetSource);
  }

  /**
   * Tags the results in the given list with the given ThriftTweetSource and adds them to the given
   * list of results.
   *
   * @param results The list of results to which the new results will be added. Must not be null.
   * @param resultsToAdd The list of results to add. Mutated in place: every result is tagged
   *                     with {@code tweetSource}.
   * @param tweetSource The ThriftTweetSource that will be used to mark all results in
   *                    {@code resultsToAdd}.
   * @return {@code false} if {@code resultsToAdd} is {@code null} or doesn't have any results;
   *         {@code true}, otherwise.
   */
  public static boolean addResultsToList(List<ThriftSearchResult> results,
                                         List<ThriftSearchResult> resultsToAdd,
                                         ThriftTweetSource tweetSource) {
    Preconditions.checkNotNull(results);
    if ((resultsToAdd == null) || resultsToAdd.isEmpty()) {
      return false;
    }
    markWithTweetSource(resultsToAdd, tweetSource);
    results.addAll(resultsToAdd);
    return true;
  }

  /**
   * Distinct the input ThriftSearchResult by its status id. If there are duplicates, the first
   * instance of the duplicates is returned in the distinct result. If the distinct result is the
   * same as the input result, the initial input result is returned; otherwise, the distinct result
   * is returned.
   *
   * @param results the input result
   * @param dupsStats stats counter tracking (first source, duplicate source) pairs
   * @return the input result if there is no duplicate; otherwise, return the distinct result
   */
  public static List<ThriftSearchResult> distinctByStatusId(
      List<ThriftSearchResult> results,
      LoadingCache<Pair<ThriftTweetSource, ThriftTweetSource>, SearchCounter> dupsStats) {
    // Maps each seen status id to the source of its first occurrence, so duplicates can be
    // attributed to a (first source, duplicate source) pair for stats.
    Map<Long, ThriftTweetSource> seenStatusIdToSourceMap = new HashMap<>();
    List<ThriftSearchResult> distinctResults = Lists.newArrayListWithCapacity(results.size());
    for (ThriftSearchResult result : results) {
      if (seenStatusIdToSourceMap.containsKey(result.getId())) {
        ThriftTweetSource source1 = seenStatusIdToSourceMap.get(result.getId());
        ThriftTweetSource source2 = result.getTweetSource();
        if (source1 != null && source2 != null) {
          try {
            dupsStats.get(Pair.of(source1, source2)).increment();
          } catch (ExecutionException e) {
            // Best effort: a stats failure must not break deduplication.
            LOG.warn("Could not increment stat for duplicate results from clusters " + source1
                + " and " + source2, e);
          }
        }
      } else {
        distinctResults.add(result);
        seenStatusIdToSourceMap.put(result.getId(), result.getTweetSource());
      }
    }
    // Avoid allocating a new list to callers when nothing was removed.
    return results.size() == distinctResults.size() ? results : distinctResults;
  }

  /**
   * Tags the given results with the given ThriftTweetSource (in place; null-safe).
   *
   * @param results The results to be tagged.
   * @param tweetSource The ThriftTweetSource to be used to tag the given results.
   */
  public static void markWithTweetSource(List<ThriftSearchResult> results,
                                         ThriftTweetSource tweetSource) {
    if (results != null) {
      for (ThriftSearchResult result : results) {
        result.setTweetSource(tweetSource);
      }
    }
  }

  /**
   * Check if an Earlybird response is valid.
   *
   * Increments the matching invalid/incomplete-response counter for each anomaly found.
   * NOTE(review): a successful response with searchResults unset, or with min/max searched
   * status ids unset, is only counted -- it is still reported as valid.
   */
  public static boolean isValidResponse(final EarlybirdResponse response) {
    if (response == null) {
      NULL_RESPONSE_COUNTER.increment();
      return false;
    }
    if (!EarlybirdResponseUtil.isSuccessfulResponse(response)) {
      return false;
    }
    if (!response.isSetSearchResults()) {
      SEARCH_RESULTS_NOT_SET_COUNTER.increment();
      return true;
    }
    if (!response.getSearchResults().isSetResults()) {
      SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER.increment();
    }
    // In earlybird, when earlybird terminated, e.g., time out, complex queries - we don't set the
    // min/max searched status id.
    boolean isEarlyTerminated = response.isSetEarlyTerminationInfo()
        && response.getEarlyTerminationInfo().isEarlyTerminated();
    if (!isEarlyTerminated && !response.getSearchResults().isSetMinSearchedStatusID()) {
      MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment();
    }
    if (!isEarlyTerminated && !response.getSearchResults().isSetMaxSearchedStatusID()) {
      MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment();
    }
    return true;
  }

  /**
   * For an invalid (null or non-SUCCESS) Earlybird response, return a failed response with a
   * debug message. TIER_SKIPPED responses are counted but passed through unchanged; a handful
   * of client/timeout/quota codes are preserved, everything else maps to PERSISTENT_ERROR.
   * Must not be called with a SUCCESS response.
   */
  public static EarlybirdResponse transformInvalidResponse(final EarlybirdResponse response,
                                                           final String debugMsg) {
    if (response == null) {
      return failedEarlybirdResponse(EarlybirdResponseCode.PERSISTENT_ERROR,
          debugMsg + ", msg: null response from downstream");
    }
    Preconditions.checkState(response.getResponseCode() != EarlybirdResponseCode.SUCCESS);
    EarlybirdResponseCode newResponseCode;
    EarlybirdResponseCode responseCode = response.getResponseCode();
    switch (responseCode) {
      case TIER_SKIPPED:
        // A skipped tier is not an error: count it and return the response as-is.
        ERROR_EXCEPTIONS.get(responseCode).increment();
        return response;
      case REQUEST_BLOCKED_ERROR:
      case CLIENT_ERROR:
      case SERVER_TIMEOUT_ERROR:
      case QUOTA_EXCEEDED_ERROR:
      case CLIENT_CANCEL_ERROR:
      case TOO_MANY_PARTITIONS_FAILED_ERROR:
        // These codes carry meaning for the caller, so keep them.
        ERROR_EXCEPTIONS.get(responseCode).increment();
        newResponseCode = responseCode;
        break;
      default:
        ERROR_EXCEPTIONS.get(responseCode).increment();
        newResponseCode = EarlybirdResponseCode.PERSISTENT_ERROR;
    }
    String newDebugMsg = debugMsg + ", downstream response code: " + responseCode
        + (response.isSetDebugString() ? ", downstream msg: " + response.getDebugString() : "");
    return failedEarlybirdResponse(newResponseCode, newDebugMsg);
  }

  /**
   * Create a new failed EarlybirdResponse with the given code and debug msg.
   */
  public static EarlybirdResponse failedEarlybirdResponse(final EarlybirdResponseCode responseCode,
                                                          final String debugMsg) {
    EarlybirdResponse failedResponse = new EarlybirdResponse();
    failedResponse.setResponseCode(responseCode);
    failedResponse.setDebugString(debugMsg);
    return failedResponse;
  }

  /**
   * Returns the number of results to keep as part of merge-collection. Recency mode should ignore
   * relevance options. In particular, the flag returnAllResults inside relevance options.
   */
  public static int computeNumResultsToKeep(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    // Non-recency queries that explicitly ask for all results keep everything.
    if (searchQuery.getRankingMode() != ThriftSearchRankingMode.RECENCY
        && searchQuery.isSetRelevanceOptions()
        && searchQuery.getRelevanceOptions().isReturnAllResults()) {
      return Integer.MAX_VALUE;
    }
    // Otherwise fall through: root-level override, then collector params, then the query default.
    if (request.isSetNumResultsToReturnAtRoot()) {
      return request.getNumResultsToReturnAtRoot();
    }
    if (searchQuery.isSetCollectorParams()) {
      return searchQuery.getCollectorParams().getNumResultsToReturn();
    }
    return searchQuery.getNumResults();
  }
}

View File

@ -1,204 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
import com.google.common.base.Preconditions;
import com.twitter.search.adaptive.adaptive_results.thriftjava.TweetSource;
import com.twitter.search.common.logging.ObjectKey;
import com.twitter.search.common.runtime.DebugManager;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.EarlybirdResponseCode;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResults;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;
/** Utility methods that work on EarlybirdResponses. */
public final class EarlybirdResponseUtil {
  // Utility class: static methods only, no instances.
  private EarlybirdResponseUtil() {
  }

  /**
   * Returns the results in the given EarlybirdResponse.
   *
   * @param response The EarlybirdResponse.
   * @return The results in the given EarlybirdResponse, or {@code null} if the response is
   *         {@code null} or the results are not set.
   */
  public static ThriftSearchResults getResults(EarlybirdResponse response) {
    if ((response == null) || !response.isSetSearchResults()) {
      return null;
    }
    return response.getSearchResults();
  }

  /**
   * Determines if the given EarlybirdResponse has results.
   *
   * @param response The EarlybirdResponse.
   * @return {@code true} if the given EarlybirdResponse has a non-empty result list;
   *         {@code false} otherwise.
   */
  public static boolean hasResults(EarlybirdResponse response) {
    ThriftSearchResults results = getResults(response);
    return (results != null) && results.isSetResults() && !results.getResults().isEmpty();
  }

  /**
   * Returns the number of results in the given EarlybirdResponse.
   *
   * @param response The EarlybirdResponse.
   * @return The number of results in the given EarlybirdResponse (0 when there are none).
   */
  public static int getNumResults(EarlybirdResponse response) {
    return hasResults(response) ? response.getSearchResults().getResultsSize() : 0;
  }

  /**
   * Determines if the response is early-terminated.
   *
   * @param response The EarlybirdResponse; must not be null.
   * @return {@code true} if the response is early-terminated; {@code false} otherwise.
   */
  public static boolean isEarlyTerminated(EarlybirdResponse response) {
    Preconditions.checkNotNull(response);
    return response.isSetEarlyTerminationInfo()
        && response.getEarlyTerminationInfo().isEarlyTerminated();
  }

  /**
   * Returns if the response should be considered failed for purposes of stats and logging.
   * REQUEST_BLOCKED_ERROR and TIER_SKIPPED are deliberately not counted as failures.
   */
  public static boolean responseConsideredFailed(EarlybirdResponseCode code) {
    return code != EarlybirdResponseCode.SUCCESS
        && code != EarlybirdResponseCode.REQUEST_BLOCKED_ERROR
        && code != EarlybirdResponseCode.TIER_SKIPPED;
  }

  /**
   * Extract results from an Earlybird response; empty list when the response has none.
   */
  public static List<ThriftSearchResult> extractResultsFromEarlybirdResponse(
      EarlybirdResponse response) {
    return hasResults(response)
        ? response.getSearchResults().getResults() : Collections.emptyList();
  }

  /**
   * Log the Earlybird response as a candidate source; returns the response unchanged
   * for call chaining.
   */
  public static EarlybirdResponse debugLogAsCandidateSource(
      EarlybirdResponse response, TweetSource tweetSource) {
    List<ThriftSearchResult> results = extractResultsFromEarlybirdResponse(response);
    debugLogAsCandidateSourceHelper(results, tweetSource);
    return response;
  }

  /**
   * Log a list of ThriftSearchResult as a candidate source; returns the list unchanged
   * for call chaining.
   */
  public static List<ThriftSearchResult> debugLogAsCandidateSource(
      List<ThriftSearchResult> results, TweetSource tweetSource) {
    debugLogAsCandidateSourceHelper(results, tweetSource);
    return results;
  }

  // Emits a per-object debug message listing the result ids, keyed by the tweet source name.
  private static void debugLogAsCandidateSourceHelper(
      List<ThriftSearchResult> results, TweetSource tweetSource) {
    // debug message for Earlybird relevance candidate source
    List<String> strIds = results
        .stream()
        .map(ThriftSearchResult::getId)
        .map(Object::toString)
        .collect(Collectors.toList());
    ObjectKey debugMsgKey = ObjectKey.createTweetCandidateSourceKey(
        tweetSource.name());
    DebugManager.perObjectBasic(
        debugMsgKey,
        String.format("[%s][%s] results: %s", debugMsgKey.getType(), debugMsgKey.getId(), strIds));
  }

  /**
   * Extract the real time response from an existing response: a deep copy whose result list
   * is filtered down to results sourced from the REALTIME_CLUSTER. The input is not modified.
   */
  public static EarlybirdResponse extractRealtimeResponse(EarlybirdResponse response) {
    EarlybirdResponse realtimeResponse = response.deepCopy();
    if (EarlybirdResponseUtil.hasResults(response)) {
      // Clear the copied list, then re-add only the realtime-cluster results from the original.
      List<ThriftSearchResult> realtimeResults = realtimeResponse.getSearchResults().getResults();
      realtimeResults.clear();
      for (ThriftSearchResult result : response.getSearchResults().getResults()) {
        if (result.getTweetSource() == ThriftTweetSource.REALTIME_CLUSTER) {
          realtimeResults.add(result);
        }
      }
    }
    return realtimeResponse;
  }

  /**
   * Returns an EarlybirdResponse that should be returned by roots when a tier was skipped.
   *
   * @param minId The minSearchedStatusID to be set on the response.
   * @param maxId The maxSearchedStatusID to be set on the response.
   * @param debugMsg The debug message to be set on the response.
   * @return A response that should be returned by roots when a tier was skipped.
   */
  public static EarlybirdResponse tierSkippedRootResponse(long minId, long maxId, String debugMsg) {
    return new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0)
        .setSearchResults(new ThriftSearchResults()
            .setResults(new ArrayList<>())
            .setMinSearchedStatusID(minId)
            .setMaxSearchedStatusID(maxId))
        .setDebugString(debugMsg);
  }

  /**
   * Determines if the given response is a success response.
   *
   * A response is considered successful if it's not null and has either a SUCCESS, TIER_SKIPPED or
   * REQUEST_BLOCKED_ERROR response code.
   *
   * @param response The response to check.
   * @return Whether the given response is successful or not.
   */
  public static boolean isSuccessfulResponse(EarlybirdResponse response) {
    return response != null
        && (response.getResponseCode() == EarlybirdResponseCode.SUCCESS
            || response.getResponseCode() == EarlybirdResponseCode.TIER_SKIPPED
            || response.getResponseCode() == EarlybirdResponseCode.REQUEST_BLOCKED_ERROR);
  }

  /**
   * Finds all unexpected nullcast statuses within the given result. A nullcast status is
   * unexpected iff:
   * 1. the tweet is a nullcast tweet.
   * 2. the tweet is NOT explicitly requested with {@link ThriftSearchQuery#searchStatusIds}
   */
  public static Set<Long> findUnexpectedNullcastStatusIds(
      ThriftSearchResults thriftSearchResults, EarlybirdRequest request) {
    Set<Long> statusIds = new HashSet<>();
    for (ThriftSearchResult result : thriftSearchResults.getResults()) {
      if (resultIsNullcast(result) && !isSearchStatusId(request, result.getId())) {
        statusIds.add(result.getId());
      }
    }
    return statusIds;
  }

  // True iff the request explicitly asked for this status id via searchStatusIds.
  private static boolean isSearchStatusId(EarlybirdRequest request, long id) {
    return request.getSearchQuery().isSetSearchStatusIds()
        && request.getSearchQuery().getSearchStatusIds().contains(id);
  }

  // True iff the result's metadata marks it as a nullcast tweet.
  private static boolean resultIsNullcast(ThriftSearchResult result) {
    return result.isSetMetadata() && result.getMetadata().isIsNullcast();
  }
}

View File

@ -1,495 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.logging.DebugMessageBuilder;
import com.twitter.search.common.ranking.thriftjava.ThriftFacetFinalSortOrder;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftFacetCount;
import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;
import com.twitter.search.earlybird.thrift.ThriftFacetRankingMode;
import com.twitter.search.earlybird.thrift.ThriftFacetRequest;
import com.twitter.search.earlybird.thrift.ThriftFacetResults;
import com.twitter.search.earlybird.thrift.ThriftTermResults;
/**
* A utility class to provide some functions for facets results processing.
*/
public final class FacetsResultsUtils {
  private static final Logger LOG = LoggerFactory.getLogger(FacetsResultsUtils.class);

  // Utility class: static methods only, no instances.
  private FacetsResultsUtils() {
  }
  /**
   * Mutable accumulator for one facet field while merging facet results from multiple
   * responses (see prepareFieldInfoMap / fillFacetFieldInfo).
   */
  public static class FacetFieldInfo {
    // The per-field facet request this accumulator corresponds to.
    public ThriftFacetFieldRequest fieldRequest;
    // Running sum of ThriftFacetFieldResults.totalCount across merged responses.
    public int totalCounts;
    // Facet label -> merged count; lazily initialized by fillFacetFieldInfo.
    public Map<String, ThriftFacetCount> topFacets;
    // Language histogram entries collected from every merged response.
    public List<Map.Entry<ThriftLanguage, Double>> languageHistogramEntries = Lists.newLinkedList();
  }
  // Only return top languages in the language histogram which sum up to at least this much
  // of the total mass (i.e. roughly the first 80 percentiles).
  public static final double MIN_PERCENTAGE_SUM_REQUIRED = 0.8;
  // Once the mass above is covered and at least 3 languages are included, languages whose
  // ratio is below this threshold are dropped (see fillTopLanguages).
  public static final double MIN_PERCENTAGE = 0.01;
/**
* Prepare facet fields with empty entries and check if we need termStats for filtering.
* Returns true if termStats filtering is needed (thus the termStats servie call).
* @param facetRequest The related facet request.
* @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet
* fiels results info.
* @return {@code true} if termstats request is needed afterwards.
*/
public static boolean prepareFieldInfoMap(
ThriftFacetRequest facetRequest,
final Map<String, FacetsResultsUtils.FacetFieldInfo> facetFieldInfoMap) {
boolean termStatsFilteringMode = false;
for (ThriftFacetFieldRequest fieldRequest : facetRequest.getFacetFields()) {
FacetsResultsUtils.FacetFieldInfo info = new FacetsResultsUtils.FacetFieldInfo();
info.fieldRequest = fieldRequest;
facetFieldInfoMap.put(fieldRequest.getFieldName(), info);
if (fieldRequest.getRankingMode() == ThriftFacetRankingMode.FILTER_WITH_TERM_STATISTICS) {
termStatsFilteringMode = true;
}
}
return termStatsFilteringMode;
}
  /**
   * Extract information from one ThriftFacetResults into facetFieldInfoMap.
   * @param facetResults Related facets results.
   * @param facetFieldInfoMap The facets field info map to fill, a map from facet type to the facet
   * field results info. Entries are expected to already exist (see {@link #prepareFieldInfoMap}).
   * @param userIDWhitelist Used when merging per-facet metadata to decide dontFilterUser
   * (see {@link #mergeFacetMetadata}); not modified here.
   */
  public static void fillFacetFieldInfo(
      final ThriftFacetResults facetResults,
      final Map<String, FacetsResultsUtils.FacetFieldInfo> facetFieldInfoMap,
      final Set<Long> userIDWhitelist) {
    for (String facetField : facetResults.getFacetFields().keySet()) {
      FacetsResultsUtils.FacetFieldInfo info = facetFieldInfoMap.get(facetField);
      if (info.topFacets == null) {
        // Lazy init: only fields that actually appear in a response get a map.
        info.topFacets = new HashMap<>();
      }
      ThriftFacetFieldResults results = facetResults.getFacetFields().get(facetField);
      if (results.isSetLanguageHistogram()) {
        info.languageHistogramEntries.addAll(results.getLanguageHistogram().entrySet());
      }
      for (ThriftFacetCount newCount : results.getTopFacets()) {
        ThriftFacetCount resultCount = info.topFacets.get(newCount.facetLabel);
        if (resultCount == null) {
          // First occurrence of this label: store a defensive copy so later merges don't
          // mutate the incoming response.
          info.topFacets.put(newCount.facetLabel, new ThriftFacetCount(newCount));
        } else {
          // Already seen: accumulate every count flavor in place.
          resultCount.setFacetCount(resultCount.facetCount + newCount.facetCount);
          resultCount.setSimpleCount(resultCount.simpleCount + newCount.simpleCount);
          resultCount.setWeightedCount(resultCount.weightedCount + newCount.weightedCount);
          resultCount.setPenaltyCount(resultCount.penaltyCount + newCount.penaltyCount);
          // this could pass the old metadata object back or a new merged one.
          resultCount.setMetadata(
              mergeFacetMetadata(resultCount.getMetadata(), newCount.getMetadata(),
                  userIDWhitelist));
        }
      }
      info.totalCounts += results.totalCount;
    }
  }
  /**
   * Merge a metadata into an existing one. May mutate and return {@code baseMetadata}, or
   * return {@code metadataUpdate} itself (when the update refers to an older tweet) -- callers
   * must use the returned object.
   * @param baseMetadata the metadata to merge into; may be null.
   * @param metadataUpdate the new metadata to merge; may be null (base is returned unchanged).
   * @param userIDWhitelist user id whitelist used to set dontFilterUser; may be null.
   * @return The updated metadata.
   */
  public static ThriftFacetCountMetadata mergeFacetMetadata(
      final ThriftFacetCountMetadata baseMetadata,
      final ThriftFacetCountMetadata metadataUpdate,
      final Set<Long> userIDWhitelist) {
    ThriftFacetCountMetadata mergedMetadata = baseMetadata;
    if (metadataUpdate != null) {
      String mergedExplanation = null;
      if (mergedMetadata != null) {
        // Keep the maximum tweep cred seen on either side.
        if (mergedMetadata.maxTweepCred < metadataUpdate.maxTweepCred) {
          mergedMetadata.setMaxTweepCred(metadataUpdate.maxTweepCred);
        }
        // Concatenate explanations (computed first; applied after the winning object is chosen).
        if (mergedMetadata.isSetExplanation()) {
          mergedExplanation = mergedMetadata.getExplanation();
          if (metadataUpdate.isSetExplanation()) {
            mergedExplanation += "\n" + metadataUpdate.getExplanation();
          }
        } else if (metadataUpdate.isSetExplanation()) {
          mergedExplanation = metadataUpdate.getExplanation();
        }
        if (mergedMetadata.getStatusId() == -1) {
          // -1 means the base has no valid status: prefer the update wholesale.
          if (LOG.isDebugEnabled()) {
            LOG.debug("status id in facet count metadata is -1: " + mergedMetadata);
          }
          mergedMetadata = metadataUpdate;
        } else if (metadataUpdate.getStatusId() != -1
                   && metadataUpdate.getStatusId() < mergedMetadata.getStatusId()) {
          // keep the oldest tweet, ie. the lowest status ID
          mergedMetadata = metadataUpdate;
        } else if (metadataUpdate.getStatusId() == mergedMetadata.getStatusId()) {
          if (mergedMetadata.getTwitterUserId() == -1) {
            // in this case we didn't find the user in a previous partition yet
            // only update the user if the status id matches
            mergedMetadata.setTwitterUserId(metadataUpdate.getTwitterUserId());
            mergedMetadata.setDontFilterUser(metadataUpdate.isDontFilterUser());
          }
          if (!mergedMetadata.isSetStatusLanguage()) {
            mergedMetadata.setStatusLanguage(metadataUpdate.getStatusLanguage());
          }
        }
        if (!mergedMetadata.isSetNativePhotoUrl() && metadataUpdate.isSetNativePhotoUrl()) {
          mergedMetadata.setNativePhotoUrl(metadataUpdate.getNativePhotoUrl());
        }
      } else {
        mergedMetadata = metadataUpdate;
      }
      // this will not set an explanation if neither oldMetadata nor metadataUpdate
      // had an explanation
      if (mergedExplanation != null) {
        mergedMetadata.setExplanation(mergedExplanation);
      }
      if (userIDWhitelist != null) {
        // result must not be null now because of the if above
        if (mergedMetadata.getTwitterUserId() != -1 && !mergedMetadata.isDontFilterUser()) {
          mergedMetadata.setDontFilterUser(
              userIDWhitelist.contains(mergedMetadata.getTwitterUserId()));
        }
      }
    }
    return mergedMetadata;
  }
  /**
   * Appends all twimg results to the image results. Optionally resorts the image results if
   * a comparator is passed in.
   * Also computes the sums of totalCount, totalScore, totalPenalty.
   * Mutates {@code facetResults} in place: the TWIMG_FACET entry is removed and folded into
   * the IMAGES_FACET entry.
   */
  public static void mergeTwimgResults(ThriftFacetResults facetResults,
                                       Comparator<ThriftFacetCount> optionalSortComparator) {
    if (facetResults == null || !facetResults.isSetFacetFields()) {
      return;
    }
    ThriftFacetFieldResults imageResults =
        facetResults.getFacetFields().get(EarlybirdFieldConstant.IMAGES_FACET);
    // remove() both fetches the twimg results and drops them from the facet map.
    ThriftFacetFieldResults twimgResults =
        facetResults.getFacetFields().remove(EarlybirdFieldConstant.TWIMG_FACET);
    if (imageResults == null) {
      // No image bucket yet: the twimg results (if any) simply become the image bucket.
      if (twimgResults != null) {
        facetResults.getFacetFields().put(EarlybirdFieldConstant.IMAGES_FACET, twimgResults);
      }
      return;
    }
    if (twimgResults != null) {
      // Fold totals and individual facet counts from twimg into the image bucket.
      imageResults.setTotalCount(imageResults.getTotalCount() + twimgResults.getTotalCount());
      imageResults.setTotalPenalty(imageResults.getTotalPenalty() + twimgResults.getTotalPenalty());
      imageResults.setTotalScore(imageResults.getTotalScore() + twimgResults.getTotalScore());
      for (ThriftFacetCount count : twimgResults.getTopFacets()) {
        imageResults.addToTopFacets(count);
      }
      if (optionalSortComparator != null) {
        Collections.sort(imageResults.topFacets, optionalSortComparator);
      }
    }
  }
/**
* Dedup twimg facets.
*
* Twimg facet uses the status ID as the facet label, instead of the twimg URL, a.k.a.
* native photo URL. It is possible to have the same twimg URL appearing in two different
* facet label (RT style retweet? copy & paste the twimg URL?). Therefore, to dedup twimg
* facet correctly, we need to look at ThriftFacetCount.metadata.nativePhotoUrl
*
* @param dedupSet A set holding the native URLs from the twimg facetFieldResults. By having
* the caller passing in the set, it allows the caller to dedup the facet
* across different ThriftFacetFieldResults.
* @param facetFieldResults The twimg facet field results to be debupped
* @param debugMessageBuilder
*/
public static void dedupTwimgFacet(Set<String> dedupSet,
ThriftFacetFieldResults facetFieldResults,
DebugMessageBuilder debugMessageBuilder) {
if (facetFieldResults == null || facetFieldResults.getTopFacets() == null) {
return;
}
Iterator<ThriftFacetCount> iterator = facetFieldResults.getTopFacetsIterator();
while (iterator.hasNext()) {
ThriftFacetCount count = iterator.next();
if (count.isSetMetadata() && count.getMetadata().isSetNativePhotoUrl()) {
String nativeUrl = count.getMetadata().getNativePhotoUrl();
if (dedupSet.contains(nativeUrl)) {
iterator.remove();
debugMessageBuilder.detailed("dedupTwimgFacet removed %s", nativeUrl);
} else {
dedupSet.add(nativeUrl);
}
}
}
}
  /** Immutable (language, weighted count) pair used when sorting the language histogram. */
  private static final class LanguageCount {
    private final ThriftLanguage lang;
    private final double count;

    private LanguageCount(ThriftLanguage lang, double count) {
      this.lang = lang;
      this.count = count;
    }
  }
  /**
   * Calculate the top languages from the entries accumulated in {@code info} and store the
   * resulting (language -> ratio) histogram in {@code results}. Languages are added in
   * descending ratio order until MIN_PERCENTAGE_SUM_REQUIRED of the mass is covered, at least
   * 3 languages are included, and the remaining ratios fall below MIN_PERCENTAGE.
   */
  public static void fillTopLanguages(FacetsResultsUtils.FacetFieldInfo info,
                                      final ThriftFacetFieldResults results) {
    double sumForLanguage = 0.0;
    // Accumulate per-language mass, indexed by the thrift enum value.
    double[] sums = new double[ThriftLanguage.values().length];
    for (Map.Entry<ThriftLanguage, Double> entry : info.languageHistogramEntries) {
      sumForLanguage += entry.getValue();
      if (entry.getKey() == null) {
        // EB might be setting null key for unknown language. SEARCH-1294
        continue;
      }
      sums[entry.getKey().getValue()] += entry.getValue();
    }
    if (sumForLanguage == 0.0) {
      // No language mass at all: leave the histogram unset.
      return;
    }
    List<LanguageCount> langCounts = new ArrayList<>(ThriftLanguage.values().length);
    for (int i = 0; i < sums.length; i++) {
      if (sums[i] > 0.0) {
        // ThriftLanguage.findByValue() might return null, which should fall back to UNKNOWN.
        ThriftLanguage lang = ThriftLanguage.findByValue(i);
        lang = lang == null ? ThriftLanguage.UNKNOWN : lang;
        langCounts.add(new LanguageCount(lang, sums[i]));
      }
    }
    // Sort descending by mass so the largest languages are considered first.
    Collections.sort(langCounts, (left, right) -> Double.compare(right.count, left.count));
    double percentageSum = 0.0;
    Map<ThriftLanguage, Double> languageHistogramMap =
        new HashMap<>(langCounts.size());
    int numAdded = 0;
    for (LanguageCount langCount : langCounts) {
      if (langCount.count == 0.0) {
        break;
      }
      double percentage = langCount.count / sumForLanguage;
      // Stop once enough mass is covered AND this language is negligible AND we already
      // have at least 3 entries.
      if (percentageSum > MIN_PERCENTAGE_SUM_REQUIRED
          && percentage < MIN_PERCENTAGE && numAdded >= 3) {
        break;
      }
      languageHistogramMap.put(langCount.lang, percentage);
      percentageSum += percentage;
      numAdded++;
    }
    results.setLanguageHistogram(languageHistogramMap);
  }
/**
* Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/".
* We need to do this because of blobstore and it's suppose to be a temporary measure. This
* code should be removed once we verified that all native photo URL being sent to Search
* are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains
* "p.twimg.com/"
*
* Please see SEARCH-783 and EVENTS-539 for more details.
*
* @param response response containing the facet results
*/
public static void fixNativePhotoUrl(EarlybirdResponse response) {
if (response == null
|| !response.isSetFacetResults()
|| !response.getFacetResults().isSetFacetFields()) {
return;
}
for (Map.Entry<String, ThriftFacetFieldResults> facetMapEntry
: response.getFacetResults().getFacetFields().entrySet()) {
final String facetResultField = facetMapEntry.getKey();
if (EarlybirdFieldConstant.TWIMG_FACET.equals(facetResultField)
|| EarlybirdFieldConstant.IMAGES_FACET.equals(facetResultField)) {
ThriftFacetFieldResults facetFieldResults = facetMapEntry.getValue();
for (ThriftFacetCount facetCount : facetFieldResults.getTopFacets()) {
replacePhotoUrl(facetCount.getMetadata());
}
}
}
}
/**
* Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/".
* We need to do this because of blobstore and it's suppose to be a temporary measure. This
* code should be removed once we verified that all native photo URL being sent to Search
* are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains
* "p.twimg.com/"
*
* Please see SEARCH-783 and EVENTS-539 for more details.
*
* @param termResultsCollection collection of ThriftTermResults containing the native photo URL
*/
public static void fixNativePhotoUrl(Collection<ThriftTermResults> termResultsCollection) {
if (termResultsCollection == null) {
return;
}
for (ThriftTermResults termResults : termResultsCollection) {
if (!termResults.isSetMetadata()) {
continue;
}
replacePhotoUrl(termResults.getMetadata());
}
}
/**
* Helper function for fixNativePhotoUrl()
*/
private static void replacePhotoUrl(ThriftFacetCountMetadata metadata) {
if (metadata != null
&& metadata.isSetNativePhotoUrl()) {
String nativePhotoUrl = metadata.getNativePhotoUrl();
nativePhotoUrl = nativePhotoUrl.replace("://p.twimg.com/", "://pbs.twimg.com/media/");
metadata.setNativePhotoUrl(nativePhotoUrl);
}
}
  /**
   * Deepcopy of an EarlybirdResponse without explanation
   *
   * Returns null for a null input. If the response has no facet fields, a plain deep copy is
   * returned; otherwise the copy's facet-count explanations are stripped.
   */
  public static EarlybirdResponse deepCopyWithoutExplanation(EarlybirdResponse facetsResponse) {
    if (facetsResponse == null) {
      return null;
    } else if (!facetsResponse.isSetFacetResults()
        || facetsResponse.getFacetResults().getFacetFieldsSize() == 0) {
      return facetsResponse.deepCopy();
    }
    EarlybirdResponse copy = facetsResponse.deepCopy();
    for (Map.Entry<String, ThriftFacetFieldResults> entry
        : copy.getFacetResults().getFacetFields().entrySet()) {
      if (entry.getValue().getTopFacetsSize() > 0) {
        for (ThriftFacetCount fc : entry.getValue().getTopFacets()) {
          // NOTE(review): assumes metadata is set on every facet count; a facet count
          // without metadata would NPE here — confirm upstream always populates it.
          fc.getMetadata().unsetExplanation();
        }
      }
    }
    return copy;
  }
/**
* Returns a comparator used to compare facet counts by calling
* getFacetCountComparator(ThriftFacetFinalSortOrder). The sort order is determined by
* the facetRankingOptions on the facet request.
*/
public static Comparator<ThriftFacetCount> getFacetCountComparator(
ThriftFacetRequest facetRequest) {
ThriftFacetFinalSortOrder sortOrder = ThriftFacetFinalSortOrder.SCORE;
if (facetRequest.isSetFacetRankingOptions()
&& facetRequest.getFacetRankingOptions().isSetFinalSortOrder()) {
sortOrder = facetRequest.getFacetRankingOptions().getFinalSortOrder();
}
return getFacetCountComparator(sortOrder);
}
/**
* Returns a comparator using the specified order.
*/
public static Comparator<ThriftFacetCount> getFacetCountComparator(
ThriftFacetFinalSortOrder sortOrder) {
switch (sortOrder) {
case SIMPLE_COUNT: return SIMPLE_COUNT_COMPARATOR;
case SCORE: return SCORE_COMPARATOR;
case CREATED_AT: return CREATED_AT_COMPARATOR;
case WEIGHTED_COUNT: return WEIGHTED_COUNT_COMPARATOR;
default: return SCORE_COMPARATOR;
}
}
  // Ascending by simpleCount; ties broken by facet label so the order is a total order.
  private static final Comparator<ThriftFacetCount> SIMPLE_COUNT_COMPARATOR =
      (count1, count2) -> {
        if (count1.simpleCount > count2.simpleCount) {
          return 1;
        } else if (count1.simpleCount < count2.simpleCount) {
          return -1;
        }
        return count1.facetLabel.compareTo(count2.facetLabel);
      };

  // Ascending by weightedCount; falls back to SIMPLE_COUNT_COMPARATOR on ties.
  private static final Comparator<ThriftFacetCount> WEIGHTED_COUNT_COMPARATOR =
      (count1, count2) -> {
        if (count1.weightedCount > count2.weightedCount) {
          return 1;
        } else if (count1.weightedCount < count2.weightedCount) {
          return -1;
        }
        return SIMPLE_COUNT_COMPARATOR.compare(count1, count2);
      };

  // Ascending by score; falls back to SIMPLE_COUNT_COMPARATOR on ties.
  private static final Comparator<ThriftFacetCount> SCORE_COMPARATOR =
      (count1, count2) -> {
        if (count1.score > count2.score) {
          return 1;
        } else if (count1.score < count2.score) {
          return -1;
        }
        return SIMPLE_COUNT_COMPARATOR.compare(count1, count2);
      };

  // Ascending by created_at when both sides carry it; otherwise falls back to SCORE_COMPARATOR.
  private static final Comparator<ThriftFacetCount> CREATED_AT_COMPARATOR =
      (count1, count2) -> {
        if (count1.isSetMetadata() && count1.getMetadata().isSetCreated_at()
            && count2.isSetMetadata() && count2.getMetadata().isSetCreated_at()) {
          // more recent items have higher created_at values
          if (count1.getMetadata().getCreated_at() > count2.getMetadata().getCreated_at()) {
            return 1;
          } else if (count1.getMetadata().getCreated_at() < count2.getMetadata().getCreated_at()) {
            return -1;
          }
        }
        return SCORE_COMPARATOR.compare(count1, count2);
      };
}

View File

@ -1,45 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.List;
import java.util.Set;
import com.google.common.collect.Lists;
import com.google.common.collect.Sets;
import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
public final class ResponseMergerUtils {
  // Utility class, disallow instantiation.
  private ResponseMergerUtils() {
  }

  /**
   * Merges early termination infos from several earlybird responses.
   *
   * The merged info is terminated iff any response was early-terminated; in that case it
   * carries the union of all individual and previously-merged termination reasons.
   *
   * @param responses earlybird responses to merge the early termination infos from
   * @return merged early termination info
   */
  public static EarlyTerminationInfo mergeEarlyTerminationInfo(List<EarlybirdResponse> responses) {
    Set<String> reasons = Sets.newHashSet();
    boolean anyTerminated = false;
    for (EarlybirdResponse response : responses) {
      if (!response.isSetEarlyTerminationInfo()) {
        continue;
      }
      EarlyTerminationInfo info = response.getEarlyTerminationInfo();
      if (!info.isEarlyTerminated()) {
        continue;
      }
      anyTerminated = true;
      if (info.isSetEarlyTerminationReason()) {
        reasons.add(info.getEarlyTerminationReason());
      }
      if (info.isSetMergedEarlyTerminationReasons()) {
        reasons.addAll(info.getMergedEarlyTerminationReasons());
      }
    }
    EarlyTerminationInfo merged = new EarlyTerminationInfo(false);
    if (anyTerminated) {
      merged.setEarlyTerminated(true);
      merged.setMergedEarlyTerminationReasons(Lists.newArrayList(reasons));
    }
    return merged;
  }
}

View File

@ -1,36 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.Map;
import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;
/**
 * Utility class used to help merging results.
 */
public final class ResultsUtil {
  private ResultsUtil() { }

  /**
   * Aggregate a list of responses in the following way.
   * 1. For each response, mapGetter can turn the response into a map.
   * 2. Dump all entries from the above map into a "total" map, which accumulates entries from
   *    all the responses.
   *
   * @param responses the responses to aggregate
   * @param mapGetter extracts a count map from each response; may return null for a response,
   *                  in which case that response is skipped
   * @return a map accumulating, per key, the sum of the counts across all responses
   */
  public static <T, V> Map<T, Integer> aggregateCountMap(
      Iterable<V> responses,
      Function<V, Map<T, Integer>> mapGetter) {
    Map<T, Integer> total = Maps.newHashMap();
    for (V response : responses) {
      Map<T, Integer> map = mapGetter.apply(response);
      if (map != null) {
        for (Map.Entry<T, Integer> entry : map.entrySet()) {
          // merge() folds the new count into any existing one in a single lookup,
          // replacing the original containsKey()/get()/put() triple.
          total.merge(entry.getKey(), entry.getValue(), Integer::sum);
        }
      }
    }
    return total;
  }
}

View File

@ -1,47 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.concurrent.TimeUnit;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
/**
 * A utility class to provide some functions for TermStatistics request processing
 */
public final class TermStatisticsUtil {
  private static final org.slf4j.Logger LOG =
      org.slf4j.LoggerFactory.getLogger(TermStatisticsUtil.class);

  private TermStatisticsUtil() {
  }

  /**
   * Determine the binsize base on settings in ThriftHistogramSettings.granularity
   */
  public static int determineBinSize(ThriftHistogramSettings histogramSettings) {
    final int DEFAULT_BINSIZE = (int) TimeUnit.HOURS.toSeconds(1);
    switch (histogramSettings.getGranularity()) {
      case DAYS:
        return (int) TimeUnit.DAYS.toSeconds(1);
      case HOURS:
        return (int) TimeUnit.HOURS.toSeconds(1);
      case MINUTES:
        return (int) TimeUnit.MINUTES.toSeconds(1);
      case CUSTOM:
        // CUSTOM granularity reads the explicit bin size, falling back to the default
        // when none was supplied.
        return histogramSettings.isSetBinSizeInSeconds()
            ? histogramSettings.getBinSizeInSeconds()
            : DEFAULT_BINSIZE;
      default:
        LOG.warn("Unknown ThriftHistogramGranularityType {} using default binsize: {}",
            histogramSettings.getGranularity(), DEFAULT_BINSIZE);
        return DEFAULT_BINSIZE;
    }
  }
}

View File

@ -1,29 +0,0 @@
package com.twitter.search.common.util.earlybird;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
/**
 * Utility class from constructing ThriftSearchQuery.
 */
public final class ThriftSearchQueryUtil {
  private ThriftSearchQueryUtil() { }

  /**
   * Convenience methods for constructing a ThriftSearchQuery.
   */
  public static ThriftSearchQuery newSearchQuery(String serializedQuery, int numResults) {
    CollectorParams collectorParams = new CollectorParams().setNumResultsToReturn(numResults);
    ThriftSearchQuery searchQuery = new ThriftSearchQuery();
    searchQuery.setSerializedQuery(serializedQuery);
    searchQuery.setCollectorParams(collectorParams);
    return searchQuery;
  }

  /** Determines if the given request was initiated by a logged in user. */
  public static boolean requestInitiatedByLoggedInUser(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    if (searchQuery == null || !searchQuery.isSetSearcherId()) {
      return false;
    }
    // A positive searcher id is what distinguishes a logged-in user.
    return searchQuery.getSearcherId() > 0;
  }
}

View File

@ -1,209 +0,0 @@
package com.twitter.search.common.util.earlybird;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.relevance.ranking.ActionChain;
import com.twitter.search.common.relevance.ranking.filters.ExactDuplicateFilter;
import com.twitter.search.common.relevance.text.VisibleTokenRatioNormalizer;
import com.twitter.search.common.runtime.ActionChainDebugManager;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;
import com.twitter.search.earlybird.thrift.ThriftFacetResults;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResults;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;
/**
 * ThriftSearchResultUtil contains some simple static methods for constructing
 * ThriftSearchResult objects.
 */
public final class ThriftSearchResultUtil {
  private ThriftSearchResultUtil() { }

  // NOTE(review): NORMALIZER is not referenced anywhere in this class — confirm it is
  // still needed before removing.
  private static final VisibleTokenRatioNormalizer NORMALIZER =
      VisibleTokenRatioNormalizer.createInstance();

  // Extracts the per-language histogram from a ThriftSearchResults.
  public static final Function<ThriftSearchResults, Map<ThriftLanguage, Integer>> LANG_MAP_GETTER =
      searchResults -> searchResults.getLanguageHistogram();

  // Extracts the hit-counts map from a ThriftSearchResults.
  public static final Function<ThriftSearchResults, Map<Long, Integer>> HIT_COUNTS_MAP_GETTER =
      searchResults -> searchResults.getHitCounts();

  // Some useful Predicates

  // True when the result's metadata marks the tweet as offensive.
  public static final Predicate<ThriftSearchResult> IS_OFFENSIVE_TWEET =
      result -> {
        if (result != null && result.isSetMetadata()) {
          ThriftSearchResultMetadata metadata = result.getMetadata();
          return metadata.isIsOffensive();
        } else {
          return false;
        }
      };

  // True when the result is typed as a POPULAR (top) tweet.
  public static final Predicate<ThriftSearchResult> IS_TOP_TWEET =
      result -> result != null
          && result.isSetMetadata()
          && result.getMetadata().isSetResultType()
          && result.getMetadata().getResultType() == ThriftSearchResultType.POPULAR;

  // True when the result came from the full-archive cluster.
  public static final Predicate<ThriftSearchResult> FROM_FULL_ARCHIVE =
      result -> result != null
          && result.isSetTweetSource()
          && result.getTweetSource() == ThriftTweetSource.FULL_ARCHIVE_CLUSTER;

  public static final Predicate<ThriftSearchResult> IS_FULL_ARCHIVE_TOP_TWEET =
      Predicates.and(FROM_FULL_ARCHIVE, IS_TOP_TWEET);

  // True when the tweet is NSFW by any signal (user flag, offensive flag, sensitive content).
  // NOTE(review): getExtraMetadata() is dereferenced without an isSet check and may be null
  // on some results — confirm upstream always populates it.
  public static final Predicate<ThriftSearchResult> IS_NSFW_BY_ANY_MEANS_TWEET =
      result -> {
        if (result != null && result.isSetMetadata()) {
          ThriftSearchResultMetadata metadata = result.getMetadata();
          return metadata.isIsUserNSFW()
              || metadata.isIsOffensive()
              || metadata.getExtraMetadata().isIsSensitiveContent();
        } else {
          return false;
        }
      };

  /**
   * Returns the number of underlying ThriftSearchResult results.
   */
  public static int numResults(ThriftSearchResults results) {
    if (results == null || !results.isSetResults()) {
      return 0;
    } else {
      return results.getResultsSize();
    }
  }

  /**
   * Returns the list of tweet IDs in ThriftSearchResults.
   * Returns null if there's no results.
   */
  @Nullable
  public static List<Long> getTweetIds(ThriftSearchResults results) {
    if (numResults(results) > 0) {
      return getTweetIds(results.getResults());
    } else {
      return null;
    }
  }

  /**
   * Returns the list of tweet IDs in a list of ThriftSearchResult.
   * Returns null if there's no results.
   */
  public static List<Long> getTweetIds(@Nullable List<ThriftSearchResult> results) {
    if (results != null && results.size() > 0) {
      return Lists.newArrayList(Iterables.transform(
          results,
          searchResult -> searchResult.getId()
      ));
    }
    return null;
  }

  /**
   * Given ThriftSearchResults, build a map from tweet ID to the tweets metadata.
   *
   * NOTE(review): the schema parameter is unused in this method — confirm callers before
   * removing it from the signature.
   */
  public static Map<Long, ThriftSearchResultMetadata> getTweetMetadataMap(
      Schema schema, ThriftSearchResults results) {
    Map<Long, ThriftSearchResultMetadata> resultMap = Maps.newHashMap();
    if (results == null || results.getResultsSize() == 0) {
      return resultMap;
    }
    for (ThriftSearchResult searchResult : results.getResults()) {
      resultMap.put(searchResult.getId(), searchResult.getMetadata());
    }
    return resultMap;
  }

  /**
   * Return the total number of facet results in ThriftFacetResults, by summing up the number
   * of facet results in each field.
   */
  public static int numFacetResults(ThriftFacetResults results) {
    if (results == null || !results.isSetFacetFields()) {
      return 0;
    } else {
      int numResults = 0;
      for (ThriftFacetFieldResults field : results.getFacetFields().values()) {
        if (field.isSetTopFacets()) {
          // Direct field access to the thrift-generated topFacets list.
          numResults += field.topFacets.size();
        }
      }
      return numResults;
    }
  }

  /**
   * Updates the search statistics on base, by adding the corresponding stats from delta.
   *
   * Sums hit counts and scores; min/max searched status IDs are widened rather than summed.
   */
  public static void incrementCounts(ThriftSearchResults base,
                                     ThriftSearchResults delta) {
    if (delta.isSetNumHitsProcessed()) {
      base.setNumHitsProcessed(base.getNumHitsProcessed() + delta.getNumHitsProcessed());
    }
    if (delta.isSetNumPartitionsEarlyTerminated() && delta.getNumPartitionsEarlyTerminated() > 0) {
      // This currently used for merging results on a single earlybird, so we don't sum up all the
      // counts, just set it to 1 if we see one that was early terminated.
      base.setNumPartitionsEarlyTerminated(1);
    }
    if (delta.isSetMaxSearchedStatusID()) {
      long deltaMax = delta.getMaxSearchedStatusID();
      if (!base.isSetMaxSearchedStatusID() || deltaMax > base.getMaxSearchedStatusID()) {
        base.setMaxSearchedStatusID(deltaMax);
      }
    }
    if (delta.isSetMinSearchedStatusID()) {
      long deltaMin = delta.getMinSearchedStatusID();
      if (!base.isSetMinSearchedStatusID() || deltaMin < base.getMinSearchedStatusID()) {
        base.setMinSearchedStatusID(deltaMin);
      }
    }
    if (delta.isSetScore()) {
      if (base.isSetScore()) {
        base.setScore(base.getScore() + delta.getScore());
      } else {
        base.setScore(delta.getScore());
      }
    }
  }

  /**
   * Removes the duplicates from the given list of results.
   *
   * @param results The list of ThriftSearchResults.
   * @return The given list with duplicates removed.
   */
  public static List<ThriftSearchResult> removeDuplicates(List<ThriftSearchResult> results) {
    ActionChain<ThriftSearchResult> filterChain =
        ActionChainDebugManager
            .<ThriftSearchResult>createActionChainBuilder("RemoveDuplicatesFilters")
            .appendActions(new ExactDuplicateFilter())
            .build();
    return filterChain.apply(results);
  }

  /**
   * Returns ranking score from Earlybird shard-based ranking models if any, and 0 otherwise.
   */
  public static double getTweetScore(@Nullable ThriftSearchResult result) {
    if (result == null || !result.isSetMetadata() || !result.getMetadata().isSetScore()) {
      return 0.0;
    }
    return result.getMetadata().getScore();
  }
}

View File

@ -1,46 +0,0 @@
package com.twitter.search.common.util.earlybird;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/** Helpers for accumulating ThriftSearchResultsRelevanceStats across partial results. */
public final class ThriftSearchResultsRelevanceStatsUtil {
  private ThriftSearchResultsRelevanceStatsUtil() { }

  /**
   * Adding ThriftSearchResultsRelevanceStats from one set of results onto a base set.
   * Assumes all values are set on both of the inputs.
   *
   * All counters are summed; the oldest-scored-tweet age is a maximum, not a sum.
   *
   * @param base the stats to add to.
   * @param delta the stats to be added.
   */
  public static void addRelevanceStats(ThriftSearchResultsRelevanceStats base,
                                       ThriftSearchResultsRelevanceStats delta) {
    base.setNumScored(base.getNumScored() + delta.getNumScored());
    base.setNumSkipped(base.getNumSkipped() + delta.getNumSkipped());
    base.setNumSkippedForAntiGaming(
        base.getNumSkippedForAntiGaming() + delta.getNumSkippedForAntiGaming());
    base.setNumSkippedForLowReputation(
        base.getNumSkippedForLowReputation() + delta.getNumSkippedForLowReputation());
    base.setNumSkippedForLowTextScore(
        base.getNumSkippedForLowTextScore() + delta.getNumSkippedForLowTextScore());
    base.setNumSkippedForSocialFilter(
        base.getNumSkippedForSocialFilter() + delta.getNumSkippedForSocialFilter());
    base.setNumSkippedForLowFinalScore(
        base.getNumSkippedForLowFinalScore() + delta.getNumSkippedForLowFinalScore());
    // Keep the maximum age seen so far.
    if (delta.getOldestScoredTweetAgeInSeconds() > base.getOldestScoredTweetAgeInSeconds()) {
      base.setOldestScoredTweetAgeInSeconds(delta.getOldestScoredTweetAgeInSeconds());
    }
    base.setNumFromDirectFollows(base.getNumFromDirectFollows() + delta.getNumFromDirectFollows());
    base.setNumFromTrustedCircle(base.getNumFromTrustedCircle() + delta.getNumFromTrustedCircle());
    base.setNumReplies(base.getNumReplies() + delta.getNumReplies());
    base.setNumRepliesTrusted(base.getNumRepliesTrusted() + delta.getNumRepliesTrusted());
    base.setNumRepliesOutOfNetwork(
        base.getNumRepliesOutOfNetwork() + delta.getNumRepliesOutOfNetwork());
    base.setNumSelfTweets(base.getNumSelfTweets() + delta.getNumSelfTweets());
    base.setNumWithMedia(base.getNumWithMedia() + delta.getNumWithMedia());
    base.setNumWithNews(base.getNumWithNews() + delta.getNumWithNews());
    base.setNumSpamUser(base.getNumSpamUser() + delta.getNumSpamUser());
    base.setNumOffensive(base.getNumOffensive() + delta.getNumOffensive());
    base.setNumBot(base.getNumBot() + delta.getNumBot());
  }
}

View File

@ -1,18 +0,0 @@
# Library target for the common language utilities
# (presumably com.twitter.search.common.util.lang — confirm against package path).
java_library(
    sources = ["*.java"],
    platform = "java8",
    provides = artifact(
        org = "com.twitter.search.common.util",
        name = "lang",
        repo = artifactory,
    ),
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/text/language:locale-util",
        "src/thrift/com/twitter/search/common:constants-java",
    ],
)

View File

@ -1,141 +0,0 @@
package com.twitter.search.common.util.lang;
import java.lang.reflect.Field;
import java.util.Locale;
import java.util.Map;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.text.language.LocaleUtil;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
/**
 * This class can be used to convert ThriftLanguage to Locale object and vice versa.
 */
public final class ThriftLanguageUtil {
  private static final Logger LOG = LoggerFactory.getLogger(ThriftLanguageUtil.class.getName());

  // stores ThriftLanguage.id -> Locale mapping
  private static final Locale[] LOCALES;

  // stores Locale -> ThriftLanguage mapping
  private static final Map<Locale, ThriftLanguage> THRIFT_LANGUAGES;

  // Builds both mappings by reflecting over the ThriftLanguage enum constants and pairing
  // each with the same-named Locale constant declared on LocaleUtil. Mismatches in either
  // direction are logged but tolerated (the corresponding LOCALES slot stays null).
  static {
    LOCALES = new Locale[ThriftLanguage.values().length];
    Map<Locale, ThriftLanguage> thriftLanguageMap = Maps.newHashMap();
    // get all languages defined in ThriftLanguage
    Field[] fields = ThriftLanguage.class.getDeclaredFields();
    for (Field field : fields) {
      if (!field.isEnumConstant()) {
        continue;
      }
      try {
        ThriftLanguage thriftLang = (ThriftLanguage) field.get(null);
        String thriftLanguageName = field.getName();
        // get corresponding Locale declared in LocaleUtil
        try {
          Field localeUtilField = LocaleUtil.class.getDeclaredField(thriftLanguageName);
          Locale localeLang = (Locale) localeUtilField.get(null);
          LOCALES[thriftLang.getValue()] = localeLang;
          thriftLanguageMap.put(localeLang, thriftLang);
        } catch (NoSuchFieldException e) {
          LOG.warn("{} is defined in ThriftLanguage, but not in LocaleUtil.", thriftLanguageName);
        }
      } catch (IllegalAccessException e) {
        // shouldn't happen.
        LOG.warn("Could not get a declared field.", e);
      }
    }
    // Let's make sure that all Locales defined in LocaleUtil are also defined in ThriftLanguage
    for (Locale lang : LocaleUtil.getDefinedLanguages()) {
      if (!thriftLanguageMap.containsKey(lang)) {
        LOG.warn("{} is defined in LocaleUtil but not in ThriftLanguage.", lang.getLanguage());
      }
    }
    THRIFT_LANGUAGES = ImmutableMap.copyOf(thriftLanguageMap);
  }

  private ThriftLanguageUtil() {
  }

  /**
   * Returns a Locale object which corresponds to a given ThriftLanguage object.
   * Note: may return null for a language that had no LocaleUtil counterpart (see static init).
   * @param language ThriftLanguage object
   * @return a corresponding Locale object
   */
  public static Locale getLocaleOf(ThriftLanguage language) {
    // Note that ThriftLanguage.findByValue() can return null (thrift generated code).
    // So ThriftLanguageUtil.getLocaleOf needs to handle null correctly.
    if (language == null) {
      return LocaleUtil.UNKNOWN;
    }
    Preconditions.checkArgument(language.getValue() < LOCALES.length);
    return LOCALES[language.getValue()];
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given Locale object.
   *
   * @param language Locale object
   * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one.
   */
  public static ThriftLanguage getThriftLanguageOf(Locale language) {
    Preconditions.checkNotNull(language);
    ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(language);
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given language code.
   *
   * @param languageCode BCP-47 language code
   * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one.
   */
  public static ThriftLanguage getThriftLanguageOf(String languageCode) {
    Preconditions.checkNotNull(languageCode);
    ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(LocaleUtil.getLocaleOf(languageCode));
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given int value.
   * If value is not valid, returns ThriftLanguage.UNKNOWN
   * @param value value of language
   * @return a corresponding ThriftLanguage object
   */
  public static ThriftLanguage safeFindByValue(int value) {
    ThriftLanguage thriftLang = ThriftLanguage.findByValue(value);
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns the language code which corresponds to a given ThriftLanguage.
   *
   * Note that multiple ThriftLanguage entries can return the same language code.
   *
   * @param thriftLang ThriftLanguage object
   * @return Corresponding language or null if thriftLang is null.
   */
  @Nullable
  public static String getLanguageCodeOf(@Nullable ThriftLanguage thriftLang) {
    if (thriftLang == null) {
      return null;
    }
    return ThriftLanguageUtil.getLocaleOf(thriftLang).getLanguage();
  }
}

View File

@ -1,16 +0,0 @@
# Library target for the utility package in this directory
# (dependencies suggest file/IO helpers with hadoop + fastutil — confirm against sources).
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/it/unimi/dsi:fastutil",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/util/io",
    ],
)

Binary file not shown.

View File

@ -1,141 +0,0 @@
package com.twitter.search.common.util.ml;
import java.io.IOException;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;

import com.twitter.search.common.file.AbstractFile;
import com.twitter.search.common.util.io.TextFileLoadingUtils;
/**
 * Represents a linear model for scoring and classification.
 *
 * The list of features is defined by an Enum class. The model weights and instances are
 * represented as maps that must contain an entry for all the values of the enum.
 *
 */
public class EnumBasedLinearModel<K extends Enum<K>> implements MapBasedLinearModel<K> {
  private final EnumSet<K> features;
  private final EnumMap<K, Float> weights;

  /**
   * Creates a model from a map of weights.
   *
   * @param enumType Enum used for the keys
   * @param weights Feature weights; must provide a non-null weight for every enum value.
   * @throws IllegalArgumentException if any enum value is missing a (non-null) weight
   */
  public EnumBasedLinearModel(Class<K> enumType, Map<K, Float> weights) {
    features = EnumSet.allOf(enumType);
    // Null weights are filtered out first, so a null entry surfaces as a "missing feature"
    // in the precondition below instead of an NPE later during scoring.
    EnumMap<K, Float> enumWeights =
        new EnumMap<>(Maps.filterValues(weights, Predicates.notNull()));
    Preconditions.checkArgument(features.equals(enumWeights.keySet()),
        "The model does not include weights for all the available features");
    this.weights = enumWeights;
  }

  public ImmutableMap<K, Float> getWeights() {
    return Maps.immutableEnumMap(weights);
  }

  /**
   * Computes the weighted sum of the instance's feature values. Features absent from the
   * instance contribute nothing to the score.
   */
  @Override
  public float score(Map<K, Float> instance) {
    float total = 0;
    for (Map.Entry<K, Float> weightEntry : weights.entrySet()) {
      Float feature = instance.get(weightEntry.getKey());
      if (feature != null) {
        total += weightEntry.getValue() * feature;
      }
    }
    return total;
  }

  /**
   * Determines whether an instance is positive.
   */
  @Override
  public boolean classify(float threshold, Map<K, Float> instance) {
    return score(instance) > threshold;
  }

  @Override
  public boolean classify(Map<K, Float> instance) {
    return classify(0, instance);
  }

  @Override
  public String toString() {
    return String.format("EnumBasedLinearModel[%s]", weights);
  }

  /**
   * Creates a model where all the features have the same weight.
   * This method is useful for generating the feature vectors for training a new model.
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createWithEqualWeight(Class<T> enumType,
                                                                                 Float weight) {
    EnumSet<T> features = EnumSet.allOf(enumType);
    EnumMap<T, Float> weights = Maps.newEnumMap(enumType);
    for (T feature : features) {
      weights.put(feature, weight);
    }
    return new EnumBasedLinearModel<>(enumType, weights);
  }

  /**
   * Loads the model from a TSV file with the following format:
   *
   * feature_name \t weight
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createFromFile(
      Class<T> enumType, AbstractFile path) throws IOException {
    return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, true));
  }

  /**
   * Loads the model from a TSV file, using a default weight of 0 for missing features.
   *
   * File format:
   *
   * feature_name \t weight
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createFromFileSafe(
      Class<T> enumType, AbstractFile path) throws IOException {
    return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, false));
  }

  /**
   * Creates a map of (feature_name, weight) from a TSV file.
   *
   * If strictMode is true, it will throw an exception if the file doesn't contain all the
   * features declared in the enum. Otherwise, it will use zero as default value.
   *
   * @throws IllegalArgumentException in strict mode when a declared feature has no weight
   */
  private static <T extends Enum<T>> EnumMap<T, Float> loadWeights(
      Class<T> enumType, AbstractFile fileHandle, boolean strictMode) throws IOException {
    Map<String, Float> weightsFromFile =
        TextFileLoadingUtils.loadMapFromFile(fileHandle, input -> Float.parseFloat(input));
    EnumMap<T, Float> weights = Maps.newEnumMap(enumType);
    Set<T> expectedFeatures = EnumSet.allOf(enumType);
    if (!strictMode) {
      // Pre-seed every feature with 0 so missing entries are tolerated.
      for (T feature : expectedFeatures) {
        weights.put(feature, 0f);
      }
    }
    for (Map.Entry<String, Float> entry : weightsFromFile.entrySet()) {
      // Locale.ROOT makes the upper-casing locale-independent (e.g. avoids the Turkish
      // dotless-i problem breaking Enum.valueOf lookups). Entry iteration also avoids the
      // original's second map lookup per key.
      weights.put(Enum.valueOf(enumType, entry.getKey().toUpperCase(Locale.ROOT)),
          entry.getValue());
    }
    Preconditions.checkArgument(expectedFeatures.equals(weights.keySet()),
        "Model does not contain weights for all the features");
    return weights;
  }
}

View File

@ -1,120 +0,0 @@
package com.twitter.search.common.util.ml;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;
/**
 * Utilities for feature transformation and extraction.
 */
public final class FeatureUtils {
  private FeatureUtils() {
  }

  /**
   * Computes the signed relative difference between two values: (a - b) / min(a, b),
   * clamped to [-maxRatio, maxRatio]. Concretely:
   *
   * 1. if (a == b) return 0
   * 2. otherwise return sign(a - b) * min(|a - b| / |min(a, b)|, maxRatio)
   * 3. if min(a, b) == 0 the magnitude saturates at maxRatio
   *
   * (The previous javadoc stated "if (a > b) return a / b", which did not match the
   * implementation; the code's behavior is kept and the doc corrected.)
   *
   * This method is used to define a feature that tells how much larger or smaller is the
   * first value with respect to the second one.
   */
  public static float diffRatio(float a, float b, float maxRatio) {
    float diff = a - b;
    if (diff == 0) {
      return 0;
    }
    float denominator = Math.min(a, b);
    float ratio = denominator != 0 ? Math.abs(diff / denominator) : maxRatio;
    return Math.copySign(Math.min(ratio, maxRatio), diff);
  }

  /**
   * Computes the cosine similarity between two maps that represent sparse vectors.
   *
   * Missing keys and null values are treated as 0. Returns 0 when either vector is
   * null, empty, or has zero norm.
   */
  public static <K, V extends Number> double cosineSimilarity(
      Map<K, V> vector1, Map<K, V> vector2) {
    if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) {
      return 0;
    }
    double squaredSum1 = 0;
    double squaredSum2 = 0;
    double squaredCrossSum = 0;
    // Iterate each map once instead of materializing the key union (the original used
    // Guava Sets.union): keys absent from a map contribute 0 to every sum, so the cross
    // term only needs vector1's keys and each norm only needs its own map.
    for (Map.Entry<K, V> entry : vector1.entrySet()) {
      V rawValue1 = entry.getValue();
      double value1 = rawValue1 != null ? rawValue1.doubleValue() : 0;
      squaredSum1 += value1 * value1;
      V rawValue2 = vector2.get(entry.getKey());
      if (rawValue2 != null) {
        squaredCrossSum += value1 * rawValue2.doubleValue();
      }
    }
    for (V rawValue2 : vector2.values()) {
      double value2 = rawValue2 != null ? rawValue2.doubleValue() : 0;
      squaredSum2 += value2 * value2;
    }
    if (squaredSum1 == 0 || squaredSum2 == 0) {
      return 0;
    }
    return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2);
  }

  /**
   * Computes the cosine similarity between two (dense) vectors.
   *
   * @throws IllegalArgumentException if the vectors have different lengths
   */
  public static <V extends Number> double cosineSimilarity(
      List<V> vector1, List<V> vector2) {
    if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) {
      return 0;
    }
    if (vector1.size() != vector2.size()) {
      throw new IllegalArgumentException(
          "Vectors must have the same size: " + vector1.size() + " != " + vector2.size());
    }
    double squaredSum1 = 0;
    double squaredSum2 = 0;
    double squaredCrossSum = 0;
    for (int i = 0; i < vector1.size(); i++) {
      double value1 = vector1.get(i).doubleValue();
      double value2 = vector2.get(i).doubleValue();
      squaredSum1 += value1 * value1;
      squaredSum2 += value2 * value2;
      squaredCrossSum += value1 * value2;
    }
    if (squaredSum1 == 0 || squaredSum2 == 0) {
      return 0;
    } else {
      return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2);
    }
  }

  /**
   * Finds the key of the map with the highest value (compared in natural order)
   */
  @SuppressWarnings("unchecked")
  public static <K, V extends Comparable> Optional<K> findMaxKey(Map<K, V> map) {
    if (map == null || map.isEmpty()) {
      return Optional.empty();
    }
    Optional<Map.Entry<K, V>> maxEntry = map.entrySet().stream().max(Map.Entry.comparingByValue());
    return maxEntry.map(Map.Entry::getKey);
  }
}

View File

@ -1,32 +0,0 @@
package com.twitter.search.common.util.ml;
import java.util.Map;
/**
 * An interface for linear models that are backed by some sort of map, keyed by a feature
 * identifier of type {@code K} with float weights.
 */
public interface MapBasedLinearModel<K> {
  /**
   * Evaluate using this model given a feature vector.
   *
   * @param instance The feature vector in format of a hashmap.
   * @return true if the instance is classified as positive by this model.
   */
  boolean classify(Map<K, Float> instance);

  /**
   * Evaluate using this model given a classification threshold and a feature vector.
   *
   * @param threshold Score threshold used for classification.
   * @param instance The feature vector in format of a hashmap.
   * @return true if the instance's score exceeds the threshold.
   */
  boolean classify(float threshold, Map<K, Float> instance);

  /**
   * Computes the score of an instance as a linear combination of the features and the model
   * weights. 0 is used as default value for features or weights that are not present.
   *
   * @param instance The feature vector in format of a hashmap.
   * @return The instance score according to the model.
   */
  float score(Map<K, Float> instance);
}

View File

@ -1,125 +0,0 @@
package com.twitter.search.common.util.ml;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.base.Function;
import com.twitter.search.common.file.AbstractFile;
import com.twitter.search.common.util.io.TextFileLoadingUtils;
import it.unimi.dsi.fastutil.objects.Object2FloatMap;
import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap;
/**
 * A linear model for scoring and classification, keyed by arbitrary feature-name strings.
 *
 * <p>Both instances and weights are sparse (map-encoded), so this implementation suits models
 * with large feature sets where most features are inactive at any given time; the flexibility
 * costs a hash lookup per feature. Unknown features are treated as having weight 0.
 */
public class StringMapBasedLinearModel implements MapBasedLinearModel<String> {
  private static final Logger LOG = LoggerFactory.getLogger(StringMapBasedLinearModel.class);

  // Sparse weight vector; defaultReturnValue(0) makes absent features score as 0.
  protected final Object2FloatMap<String> model = new Object2FloatOpenHashMap<>();

  /**
   * Creates a model from a map of weights.
   *
   * @param weights Feature weights, keyed by feature name.
   */
  public StringMapBasedLinearModel(Map<String, Float> weights) {
    model.putAll(weights);
    model.defaultReturnValue(0.0f);
  }

  /**
   * Returns the weight of a feature, or 0 if the feature is unknown.
   */
  public float getWeight(String featureName) {
    return model.getFloat(featureName);
  }

  /**
   * Returns the full weight map (test-only accessor).
   */
  @VisibleForTesting
  protected Map<String, Float> getWeights() {
    return model;
  }

  /**
   * Scores a feature vector as the dot product of feature values and model weights.
   *
   * @param values The feature vector in format of a hashmap.
   * @return the linear score.
   */
  @Override
  public float score(Map<String, Float> values) {
    float total = 0.0f;
    for (Map.Entry<String, Float> entry : values.entrySet()) {
      String feature = entry.getKey();
      float weight = getWeight(feature);
      // Features with zero weight contribute nothing; skip them to avoid useless work/logging.
      if (weight != 0.0f) {
        float contribution = weight * entry.getValue();
        total += contribution;
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("%s = %.3f * %.3f = %.3f, ",
              feature, weight, entry.getValue(), contribution));
        }
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Score = %.3f", total));
    }
    return total;
  }

  /**
   * Determines whether an instance is positive, using a threshold of 0.
   */
  @Override
  public boolean classify(Map<String, Float> values) {
    return classify(0.0f, values);
  }

  /** Determines whether an instance's score exceeds the given threshold. */
  @Override
  public boolean classify(float threshold, Map<String, Float> values) {
    return score(values) > threshold;
  }

  /** Returns the number of features with weights in this model. */
  public int size() {
    return model.size();
  }

  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder();
    buf.append("StringMapBasedLinearModel[");
    for (Map.Entry<String, Float> entry : model.entrySet()) {
      buf.append(String.format("(%s = %.3f), ", entry.getKey(), entry.getValue()));
    }
    return buf.append("]").toString();
  }

  /**
   * Loads a model from a TSV file where each line is:
   *
   * feature_name \t weight
   */
  public static StringMapBasedLinearModel loadFromFile(AbstractFile fileHandle) {
    return new StringMapBasedLinearModel(
        TextFileLoadingUtils.loadMapFromFile(
            fileHandle,
            (Function<String, Float>) Float::parseFloat));
  }
}

View File

@ -1,14 +0,0 @@
# Library target for this package; strict_deps forces every compile-time
# dependency to be declared explicitly below.
java_library(
    sources = ["*.java"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/jvm/org/yaml:snakeyaml",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
    ],
)

View File

@ -1,293 +0,0 @@
package com.twitter.search.common.util.ml.models_manager;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.function.Function;
import java.util.function.Supplier;
import java.util.stream.Collectors;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.yaml.snakeyaml.Yaml;
import com.twitter.search.common.file.AbstractFile;
import com.twitter.search.common.file.FileUtils;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchLongGauge;
/**
 * Loads models from HDFS and provides an interface for reloading them periodically.
 *
 * There are 2 possible ways of detecting the active models:
 *
 * - DirectorySupplier: Uses all the subdirectories of a base path
 * - ConfigSupplier: Gets the list from a configuration file
 *
 * Models can be updated or added. Depending on the selected method, existing models can be removed
 * if they are no longer active.
 *
 * @param <T> the in-memory model type produced by {@link #readModelFromDirectory}.
 */
public abstract class BaseModelsManager<T> implements Runnable {
  private static final Logger LOG = LoggerFactory.getLogger(BaseModelsManager.class);

  // Last-modified timestamp (ms) of each model's directory at load time; a model is only
  // reloaded when its directory's timestamp changes.
  protected final Map<String, Long> lastModifiedMsByModel = new ConcurrentHashMap<>();

  // Currently loaded models, keyed by model name.
  protected final Map<String, T> loadedModels = new ConcurrentHashMap<>();

  // Source of truth for which models are active and where their directories live.
  protected final Supplier<Map<String, AbstractFile>> activeModelsSupplier;

  // Snapshot of the previously loaded model-name set, used to log only when the set changes.
  protected Map<String, T> prevLoadedModels = new ConcurrentHashMap<>();

  // This flag determines whether models are unloaded immediately when they're removed from
  // activeModelsSupplier. If false, old models stay in memory until the process is restarted.
  // This may be useful to safely change model configuration without restarting.
  protected final boolean shouldUnloadInactiveModels;

  protected final SearchLongGauge numModels;
  protected final SearchCounter numErrors;
  protected final SearchLongGauge lastLoadedMs;

  // Kill switches: disable serving (getModel returns empty) or loading (run() no-ops).
  protected Supplier<Boolean> shouldServeModels;
  protected Supplier<Boolean> shouldLoadModels;

  public BaseModelsManager(
      Supplier<Map<String, AbstractFile>> activeModelsSupplier,
      boolean shouldUnloadInactiveModels,
      String statsPrefix
  ) {
    // Default: always serve and always load.
    this(
        activeModelsSupplier,
        shouldUnloadInactiveModels,
        statsPrefix,
        () -> true,
        () -> true
    );
  }

  public BaseModelsManager(
      Supplier<Map<String, AbstractFile>> activeModelsSupplier,
      boolean shouldUnloadInactiveModels,
      String statsPrefix,
      Supplier<Boolean> shouldServeModels,
      Supplier<Boolean> shouldLoadModels
  ) {
    this.activeModelsSupplier = activeModelsSupplier;
    this.shouldUnloadInactiveModels = shouldUnloadInactiveModels;
    this.shouldServeModels = shouldServeModels;
    this.shouldLoadModels = shouldLoadModels;

    numModels = SearchLongGauge.export(
        String.format("model_loader_%s_num_models", statsPrefix));
    numErrors = SearchCounter.export(
        String.format("model_loader_%s_num_errors", statsPrefix));
    lastLoadedMs = SearchLongGauge.export(
        String.format("model_loader_%s_last_loaded_timestamp_ms", statsPrefix));
  }

  /**
   * Retrieves a particular model, or empty if it is not loaded or serving is disabled.
   */
  public Optional<T> getModel(String name) {
    if (shouldServeModels.get()) {
      return Optional.ofNullable(loadedModels.get(name));
    } else {
      return Optional.empty();
    }
  }

  /**
   * Reads a model instance from the directory file instance.
   *
   * @param modelBaseDir AbstractFile instance representing the directory.
   * @return Model instance parsed from the directory.
   */
  public abstract T readModelFromDirectory(AbstractFile modelBaseDir) throws Exception;

  /**
   * Cleans up any resources used by the model instance.
   * This method is called after removing the model from the in-memory map.
   * Sub-classes can provide custom overridden implementation as required.
   *
   * @param unloadedModel Model instance that would be unloaded from the manager.
   */
  protected void cleanUpUnloadedModel(T unloadedModel) { }

  @Override
  public void run() {
    // Get available models, either from the config file or by listing the base directory
    final Map<String, AbstractFile> modelPathsFromConfig;
    if (!shouldLoadModels.get()) {
      LOG.info("Loading models is currently disabled.");
      return;
    }
    modelPathsFromConfig = activeModelsSupplier.get();
    for (Map.Entry<String, AbstractFile> nameAndPath : modelPathsFromConfig.entrySet()) {
      String modelName = nameAndPath.getKey();
      try {
        AbstractFile modelDirectory = nameAndPath.getValue();
        // If a previously loaded model's directory disappeared, keep serving the loaded copy.
        if (!modelDirectory.exists() && loadedModels.containsKey(modelName)) {
          LOG.warn("Loaded model '{}' no longer exists at HDFS path {}, keeping loaded version; "
              + "replace directory in HDFS to update model.", modelName, modelDirectory);
          continue;
        }
        long previousModifiedTimestamp = lastModifiedMsByModel.getOrDefault(modelName, 0L);
        long lastModifiedMs = modelDirectory.getLastModified();
        if (previousModifiedTimestamp == lastModifiedMs) {
          // Directory unchanged since the last successful load; nothing to do.
          continue;
        }
        LOG.info("Starting to load model. name={} path={}", modelName, modelDirectory.getPath());
        T model = Preconditions.checkNotNull(readModelFromDirectory(modelDirectory));
        LOG.info("Model initialized: {}. Last modified: {} ({})",
            modelName, lastModifiedMs, new Date(lastModifiedMs));
        T previousModel = loadedModels.put(modelName, model);
        lastModifiedMsByModel.put(modelName, lastModifiedMs);
        if (previousModel != null) {
          cleanUpUnloadedModel(previousModel);
        }
      } catch (Exception e) {
        numErrors.increment();
        LOG.error("Error initializing model: {}", modelName, e);
      }
    }

    // Remove any currently loaded models not present in the latest list
    if (shouldUnloadInactiveModels) {
      Set<String> inactiveModels =
          Sets.difference(loadedModels.keySet(), modelPathsFromConfig.keySet()).immutableCopy();
      for (String modelName : inactiveModels) {
        T modelToUnload = loadedModels.get(modelName);
        loadedModels.remove(modelName);
        if (modelToUnload != null) {
          // We could have an inactive model key without a model (value) if the
          // initial readModelFromDirectory failed for the model entry.
          // Checking for null to avoid exception.
          cleanUpUnloadedModel(modelToUnload);
        }
        LOG.info("Unloaded model that is no longer active: {}", modelName);
      }
    }

    if (!prevLoadedModels.keySet().equals(loadedModels.keySet())) {
      LOG.info("Finished loading models: {}", loadedModels.keySet());
    }
    // BUGFIX: snapshot a copy rather than aliasing loadedModels. The previous code assigned
    // the live map itself, so after the first run prevLoadedModels and loadedModels were the
    // same object and the key-set comparison above could never differ again, silencing the
    // "Finished loading models" log for all later model-set changes.
    prevLoadedModels = new ConcurrentHashMap<>(loadedModels);
    numModels.set(loadedModels.size());
    lastLoadedMs.set(System.currentTimeMillis());
  }

  /**
   * Schedules the loader to run periodically on a single daemon thread.
   *
   * @param period Period between executions
   * @param timeUnit The time unit of the period parameter.
   */
  public final void scheduleAtFixedRate(
      long period, TimeUnit timeUnit, String builderThreadName) {
    Executors.newSingleThreadScheduledExecutor(
        new ThreadFactoryBuilder()
            .setDaemon(true)
            .setNameFormat(builderThreadName)
            .build())
        .scheduleAtFixedRate(this, 0, period, timeUnit);
  }

  /**
   * Gets the active list of models from the subdirectories in a base directory.
   *
   * Each model is identified by the name of the subdirectory.
   */
  @VisibleForTesting
  public static class DirectorySupplier implements Supplier<Map<String, AbstractFile>> {
    private static final Logger LOG = LoggerFactory.getLogger(DirectorySupplier.class);
    private final AbstractFile baseDir;

    public DirectorySupplier(AbstractFile baseDir) {
      this.baseDir = baseDir;
    }

    @Override
    public Map<String, AbstractFile> get() {
      try {
        LOG.info("Loading models from the directories in: {}", baseDir.getPath());
        List<AbstractFile> modelDirs =
            ImmutableList.copyOf(baseDir.listFiles(AbstractFile.IS_DIRECTORY));
        LOG.info("Found {} model directories: {}", modelDirs.size(), modelDirs);
        // Map each subdirectory name to its directory handle.
        return modelDirs.stream()
            .collect(Collectors.toMap(
                AbstractFile::getName,
                Function.identity()
            ));
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
  }

  /**
   * Gets the active list of models by reading a YAML config file.
   *
   * The keys are the model names, the values are dictionaries with a single entry for the path
   * of the model in HDFS (without the HDFS name node prefix). For example:
   *
   * model_a:
   *   path: /path/to/model_a
   * model_b:
   *   path: /path/to/model_b
   *
   */
  @VisibleForTesting
  public static class ConfigSupplier implements Supplier<Map<String, AbstractFile>> {
    private final AbstractFile configFile;

    public ConfigSupplier(AbstractFile configFile) {
      this.configFile = configFile;
    }

    @SuppressWarnings("unchecked")
    @Override
    public Map<String, AbstractFile> get() {
      try (BufferedReader configReader = configFile.getCharSource().openBufferedStream()) {
        Yaml yamlParser = new Yaml();
        //noinspection unchecked
        Map<String, Map<String, String>> config =
            (Map<String, Map<String, String>>) yamlParser.load(configReader);
        if (config == null || config.isEmpty()) {
          return Collections.emptyMap();
        }
        Map<String, AbstractFile> modelPaths = new HashMap<>();
        for (Map.Entry<String, Map<String, String>> nameAndConfig : config.entrySet()) {
          // Every model entry must declare a non-empty "path".
          String path = Strings.emptyToNull(nameAndConfig.getValue().get("path"));
          Preconditions.checkNotNull(path, "Missing path for model: %s", nameAndConfig.getKey());
          modelPaths.put(nameAndConfig.getKey(), FileUtils.getHdfsFileHandle(path));
        }
        return modelPaths;
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
  }
}

View File

@ -1,68 +0,0 @@
# Default target: the full prediction-engine package.
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/transform",
        "src/java/com/twitter/ml/common/base",
        "src/java/com/twitter/ml/prediction/core",
        "src/java/com/twitter/ml/tool/prediction:ModelInterpreter",
        "src/java/com/twitter/ml/vw/constant",
        "src/java/com/twitter/mlv2/trees/predictor",
        "src/java/com/twitter/mlv2/trees/scorer",
        "src/java/com/twitter/search/common/features",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/ml/models_manager",
        "src/java/com/twitter/search/modeling/common",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/search/common:features-java",
    ],
)

# Narrow target exposing only the model-building/accumulator sources, so downstream
# consumers can depend on these classes without pulling in the whole package.
java_library(
    name = "for-timelines",
    sources = [
        "BaseLegacyScoreAccumulator.java",
        "BaseModelBuilder.java",
        "BaseScoreAccumulator.java",
        "CompositeFeatureContext.java",
        "DiscretizedFeature.java",
        "DiscretizedFeatureRange.java",
        "LegacyModelBuilder.java",
        "LightweightLinearModel.java",
        "ModelBuilder.java",
        "SchemaBasedModelBuilder.java",
    ],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/transform:DiscretizerTransform",
        "src/java/com/twitter/ml/common/base",
        "src/java/com/twitter/ml/tool/prediction:ModelInterpreter",
        "src/java/com/twitter/ml/vw/constant",
        "src/java/com/twitter/search/common/features",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/ml/models_manager",
        "src/java/com/twitter/search/modeling/common",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/search/common:features-java",
    ],
)

View File

@ -1,64 +0,0 @@
package com.twitter.search.common.util.ml.prediction_engine;
import com.google.common.base.Preconditions;
import com.twitter.ml.api.Feature;
/**
 * Score accumulator for legacy (non-schema-based) features. It provides methods to add features
 * using Feature objects.
 *
 * @param <D> data record type accepted by the accumulator (see the BaseScoreAccumulator parent).
 * @deprecated This class is retired and we suggest to switch to schema-based features.
 */
@Deprecated
public abstract class BaseLegacyScoreAccumulator<D> extends BaseScoreAccumulator<D> {

  /**
   * Creates an accumulator backed by a legacy (non-schema-based) model.
   *
   * @throws IllegalStateException if the supplied model is schema-based.
   */
  public BaseLegacyScoreAccumulator(LightweightLinearModel model) {
    super(model);
    Preconditions.checkState(!model.isSchemaBased(),
        "Cannot create LegacyScoreAccumulator with a schema-based model: %s", model.getName());
  }

  /**
   * Add to the score the weight of a binary feature (if it's present).
   *
   * @return this accumulator, for chaining.
   * @deprecated This function is retired and we suggest to switch to addSchemaBooleanFeatures in
   * SchemaBasedScoreAccumulator.
   */
  @Deprecated
  protected BaseLegacyScoreAccumulator<D> addBinaryFeature(Feature<Boolean> feature,
      boolean value) {
    // Only a set (true) binary feature with a known weight contributes to the score.
    if (value) {
      Double weight = model.binaryFeatures.get(feature);
      if (weight != null) {
        score += weight;
      }
    }
    // Typed return (was the raw type BaseLegacyScoreAccumulator) so chained calls keep <D>.
    return this;
  }

  /**
   * Add to the score the weight of a continuous feature.
   * <p>
   * If the model uses real valued features, it multiplies its weight by the provided value.
   * Otherwise, it tries to find the discretized feature and adds its weight to the score.
   *
   * @return this accumulator, for chaining.
   * @deprecated This function is retired and we suggest to switch to addSchemaContinuousFeatures in
   * SchemaBasedScoreAccumulator.
   */
  @Deprecated
  protected BaseLegacyScoreAccumulator<D> addContinuousFeature(Feature<Double> feature,
      double value) {
    Double weightFromContinuous = model.continuousFeatures.get(feature);
    if (weightFromContinuous != null) {
      score += weightFromContinuous * value;
    } else {
      DiscretizedFeature discretizedFeature = model.discretizedFeatures.get(feature);
      if (discretizedFeature != null) {
        // Use only the weight of the discretized feature (there's no need to multiply it)
        score += discretizedFeature.getWeight(value);
      }
    }
    return this;
  }
}

View File

@ -1,111 +0,0 @@
package com.twitter.search.common.util.ml.prediction_engine;
import java.util.Collection;
import java.util.Comparator;
import java.util.List;
import com.google.common.collect.Lists;
import com.twitter.ml.api.FeatureParser;
import com.twitter.ml.api.transform.DiscretizerTransform;
import com.twitter.ml.tool.prediction.ModelInterpreter;
/**
 * The base model builder for LightweightLinearModels: accumulates per-feature weights and a
 * bias term from interpreted-model text lines, delegating feature storage to subclasses.
 */
public abstract class BaseModelBuilder implements ModelBuilder {
  // Ignore features that have an absolute weight lower than this value
  protected static final double MIN_WEIGHT = 1e-9;
  private static final String BIAS_FIELD_NAME = ModelInterpreter.BIAS_FIELD_NAME;
  static final String DISCRETIZER_NAME_SUFFIX =
      "." + DiscretizerTransform.DEFAULT_FEATURE_NAME_SUFFIX;

  protected final String modelName;
  protected double bias;

  public BaseModelBuilder(String modelName) {
    this.modelName = modelName;
    this.bias = 0.0;
  }

  /**
   * Collects all the ranges of a discretized feature, sorts them by their lower bound, and
   * packs them into a DiscretizedFeature.
   */
  static DiscretizedFeature buildFeature(Collection<DiscretizedFeatureRange> ranges) {
    List<DiscretizedFeatureRange> ordered = Lists.newArrayList(ranges);
    ordered.sort(Comparator.comparingDouble(range -> range.minValue));

    int count = ordered.size();
    double[] splits = new double[count];
    double[] weights = new double[count];
    for (int i = 0; i < count; i++) {
      DiscretizedFeatureRange range = ordered.get(i);
      splits[i] = range.minValue;
      weights[i] = range.weight;
    }
    return new DiscretizedFeature(splits, weights);
  }

  /**
   * Parses a line from the interpreted model text file. See the javadoc of the constructor for
   * more details about how to create the text file.
   * <p>
   * The file uses TSV format with 3 columns:
   * <p>
   * Model name (Generated by ML API, but ignored by this class)
   * Feature definition:
   * Name of the feature or definition from the MDL discretizer.
   * Weight:
   * Weight of the feature using LOGIT scale.
   * <p>
   * When it parses each line, it stores the weights for all the features defined in the context,
   * as well as the bias, but it ignores any other feature (e.g. label, prediction or
   * meta.record_weight) and features with a small absolute weight (see MIN_WEIGHT).
   * <p>
   * Example lines:
   * <p>
   * model_name bias 0.019735312089324074
   * model_name demo.binary_feature 0.06524706073105327
   * model_name demo.continuous_feature 0.0
   * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=-inf_3.58e-01 0.07155931927263737
   * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=3.58e-01_inf -0.08979256264865387
   *
   * @see ModelInterpreter
   * @see DiscretizerTransform
   */
  @Override
  public ModelBuilder parseLine(String line) {
    String[] fields = line.split("\t");
    if (fields.length != 3) {
      // Malformed line: ignore it entirely.
      return this;
    }
    // fields[0] carries the model name, which this builder does not need.
    String featureName = fields[1];
    double weight = Double.parseDouble(fields[2]);

    if (BIAS_FIELD_NAME.equals(featureName)) {
      bias = weight;
      return this;
    }

    FeatureParser parser = FeatureParser.parse(featureName);
    String baseName = parser.getBaseName();
    boolean isDiscretizedRange = baseName.endsWith(DISCRETIZER_NAME_SUFFIX);
    if (Math.abs(weight) < MIN_WEIGHT && !isDiscretizedRange) {
      // Negligible weight and not a discretized-feature range: skip it.
      // (Discretized features whose ranges are all zero are pruned later.)
      return this;
    }
    addFeature(baseName, weight, parser);
    return this;
  }

  /**
   * Adds feature to the model
   */
  protected abstract void addFeature(String baseName, double weight, FeatureParser parser);

  @Override
  public abstract LightweightLinearModel build();
}

Some files were not shown because too many files have changed in this diff Show More