mirror of https://github.com/twitter/the-algorithm.git
synced 2024-11-16 00:25:11 +01:00
[docx] split commit for file 4000
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
This commit is contained in:
parent 0e39f836ae
commit 47a8228a09
Binary file not shown.
@@ -1,148 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import com.google.common.base.Preconditions;

import com.twitter.search.common.encoding.features.IntegerEncodedFeatures;
import com.twitter.search.common.indexing.thriftjava.PackedFeatures;
import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures;
import com.twitter.search.common.schema.SchemaUtil;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;

/**
 * A class for encoding earlybird features in integers
 */
public abstract class EarlybirdEncodedFeatures extends IntegerEncodedFeatures {
  private final ImmutableSchemaInterface schema;
  private final EarlybirdFieldConstant baseField;

  public EarlybirdEncodedFeatures(ImmutableSchemaInterface schema,
                                  EarlybirdFieldConstant baseField) {
    this.schema = schema;
    this.baseField = baseField;
  }

  /**
   * Write this object into packedFeatures of the given VersionedTweetFeatures.
   */
  public void writeFeaturesToVersionedTweetFeatures(
      VersionedTweetFeatures versionedTweetFeatures) {
    if (!versionedTweetFeatures.isSetPackedFeatures()) {
      versionedTweetFeatures.setPackedFeatures(new PackedFeatures());
    }
    copyToPackedFeatures(versionedTweetFeatures.getPackedFeatures());
  }

  /**
   * Write this object into extendedPackedFeatures of the given VersionedTweetFeatures.
   */
  public void writeExtendedFeaturesToVersionedTweetFeatures(
      VersionedTweetFeatures versionedTweetFeatures) {
    if (!versionedTweetFeatures.isSetExtendedPackedFeatures()) {
      versionedTweetFeatures.setExtendedPackedFeatures(new PackedFeatures());
    }
    copyToPackedFeatures(versionedTweetFeatures.getExtendedPackedFeatures());
  }

  @Override
  public String toString() {
    StringBuilder ret = new StringBuilder();
    ret.append("Tweet features: \n");
    for (FeatureConfiguration feature
        : EarlybirdSchemaCreateTool.FEATURE_CONFIGURATION_MAP.values()) {
      ret.append(feature.getName()).append(": ").append(getFeatureValue(feature)).append("\n");
    }
    return ret.toString();
  }

  public boolean isFlagSet(EarlybirdFieldConstant field) {
    return isFlagSet(schema.getFeatureConfigurationById(field.getFieldId()));
  }

  public int getFeatureValue(EarlybirdFieldConstant field) {
    return getFeatureValue(schema.getFeatureConfigurationById(field.getFieldId()));
  }

  public EarlybirdEncodedFeatures setFlag(EarlybirdFieldConstant field) {
    setFlag(schema.getFeatureConfigurationById(field.getFieldId()));
    return this;
  }

  public EarlybirdEncodedFeatures clearFlag(EarlybirdFieldConstant field) {
    clearFlag(schema.getFeatureConfigurationById(field.getFieldId()));
    return this;
  }

  public EarlybirdEncodedFeatures setFlagValue(EarlybirdFieldConstant field,
                                               boolean value) {
    setFlagValue(schema.getFeatureConfigurationById(field.getFieldId()), value);
    return this;
  }

  public EarlybirdEncodedFeatures setFeatureValue(EarlybirdFieldConstant field,
                                                  int value) {
    setFeatureValue(schema.getFeatureConfigurationById(field.getFieldId()), value);
    return this;
  }

  public EarlybirdEncodedFeatures setFeatureValueIfGreater(EarlybirdFieldConstant field,
                                                           int value) {
    setFeatureValueIfGreater(schema.getFeatureConfigurationById(field.getFieldId()), value);
    return this;
  }

  public boolean incrementIfNotMaximum(EarlybirdFieldConstant field) {
    return incrementIfNotMaximum(schema.getFeatureConfigurationById(field.getFieldId()));
  }

  private static final class ArrayEncodedTweetFeatures extends EarlybirdEncodedFeatures {
    private final int[] encodedInts;

    private ArrayEncodedTweetFeatures(ImmutableSchemaInterface schema,
                                      EarlybirdFieldConstant baseField) {
      super(schema, baseField);

      final int numIntegers = SchemaUtil.getCSFFieldFixedLength(schema, baseField.getFieldId());
      Preconditions.checkState(numIntegers > 0);
      this.encodedInts = new int[numIntegers];
    }

    @Override
    public int getNumInts() {
      return encodedInts.length;
    }

    @Override
    public int getInt(int pos) {
      return encodedInts[pos];
    }

    @Override
    public void setInt(int pos, int value) {
      encodedInts[pos] = value;
    }
  }

  /**
   * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field.
   * @param schema the schema for all fields
   * @param baseField base field's constant value
   */
  public static EarlybirdEncodedFeatures newEncodedTweetFeatures(
      ImmutableSchemaInterface schema, EarlybirdFieldConstant baseField) {
    return new ArrayEncodedTweetFeatures(schema, baseField);
  }

  /**
   * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field name.
   * @param schema the schema for all fields
   * @param baseFieldName base field's name
   */
  public static EarlybirdEncodedFeatures newEncodedTweetFeatures(
      ImmutableSchemaInterface schema, String baseFieldName) {
    EarlybirdFieldConstant baseField = EarlybirdFieldConstants.getFieldConstant(baseFieldName);
    Preconditions.checkNotNull(baseField);
    return newEncodedTweetFeatures(schema, baseField);
  }
}
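The class above stores each feature as an (intIndex, bitStartPos, bitLength) view over a fixed-length int[] column; the concrete positions are declared in EarlybirdSchemaCreateTool further down this diff. As a rough standalone sketch of that packing scheme -- the class and method names here are illustrative, not part of the removed code:

// Illustrative sketch only: shows the (intIndex, bitStartPos, bitLength) packing
// that EarlybirdEncodedFeatures builds on. Names are hypothetical, not Twitter APIs.
public final class BitPackedFeaturesSketch {
  private final int[] encodedInts;

  public BitPackedFeaturesSketch(int numInts) {
    this.encodedInts = new int[numInts];
  }

  // Reads bitLength bits of encodedInts[intIndex], starting at bitStartPos.
  public int get(int intIndex, int bitStartPos, int bitLength) {
    int mask = (bitLength == 32) ? -1 : ((1 << bitLength) - 1);
    return (encodedInts[intIndex] >>> bitStartPos) & mask;
  }

  // Overwrites those bits with the low bitLength bits of value.
  public void set(int intIndex, int bitStartPos, int bitLength, int value) {
    int mask = (bitLength == 32) ? -1 : ((1 << bitLength) - 1);
    encodedInts[intIndex] =
        (encodedInts[intIndex] & ~(mask << bitStartPos)) | ((value & mask) << bitStartPos);
  }

  public static void main(String[] args) {
    BitPackedFeaturesSketch features = new BitPackedFeaturesSketch(5);
    features.set(0, 0, 1, 1);    // a one-bit flag such as IS_RETWEET_FLAG (int 0, bit 0)
    features.set(0, 8, 8, 100);  // an 8-bit value such as TEXT_SCORE (int 0, bits 8-15)
    System.out.println(features.get(0, 0, 1));  // 1
    System.out.println(features.get(0, 8, 8));  // 100
  }
}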
Binary file not shown.
@@ -1,36 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import com.twitter.search.common.encoding.docvalues.CSFTypeUtil;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;

public final class EarlybirdEncodedFeaturesUtil {
  private EarlybirdEncodedFeaturesUtil() {
  }

  /**
   * Returns a byte array that can be stored in a ThriftDocument as bytesField.
   */
  public static byte[] toBytesForThriftDocument(EarlybirdEncodedFeatures features) {
    int numInts = features.getNumInts();
    byte[] serializedFeatures = new byte[numInts * Integer.BYTES];
    for (int i = 0; i < numInts; i++) {
      CSFTypeUtil.convertToBytes(serializedFeatures, i, features.getInt(i));
    }
    return serializedFeatures;
  }

  /**
   * Converts data in a given byte array (starting at the provided offset) into
   * EarlybirdEncodedFeatures.
   */
  public static EarlybirdEncodedFeatures fromBytes(
      ImmutableSchemaInterface schema, EarlybirdFieldConstants.EarlybirdFieldConstant baseField,
      byte[] data, int offset) {
    EarlybirdEncodedFeatures features = EarlybirdEncodedFeatures.newEncodedTweetFeatures(
        schema, baseField);
    for (int idx = 0; idx < features.getNumInts(); ++idx) {
      features.setInt(idx, CSFTypeUtil.convertFromBytes(data, offset, idx));
    }
    return features;
  }
}
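The util above serializes the feature int[] to a byte[] (and back) through CSFTypeUtil, four bytes per int. A minimal sketch of what such a round trip looks like; CSFTypeUtil itself is not shown in this diff, so the big-endian byte order below is an assumption:

// Illustrative sketch only: a plausible int <-> byte round trip matching the loop
// structure above. The actual CSFTypeUtil encoding may differ.
public final class IntBytesRoundTripSketch {
  static void writeInt(byte[] dst, int intIndex, int value) {
    int base = intIndex * Integer.BYTES;
    dst[base] = (byte) (value >>> 24);
    dst[base + 1] = (byte) (value >>> 16);
    dst[base + 2] = (byte) (value >>> 8);
    dst[base + 3] = (byte) value;
  }

  static int readInt(byte[] src, int offset, int intIndex) {
    int base = offset + intIndex * Integer.BYTES;
    return ((src[base] & 0xFF) << 24)
        | ((src[base + 1] & 0xFF) << 16)
        | ((src[base + 2] & 0xFF) << 8)
        | (src[base + 3] & 0xFF);
  }

  public static void main(String[] args) {
    byte[] buffer = new byte[2 * Integer.BYTES];
    writeInt(buffer, 0, 12345);
    writeInt(buffer, 1, -7);
    System.out.println(readInt(buffer, 0, 0));  // 12345
    System.out.println(readInt(buffer, 0, 1));  // -7
  }
}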
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@@ -1,96 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;

import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.SchemaBuilder;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings;
import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer;

/**
 * Builder class used to build a ThriftSchema.
 */
public class EarlybirdSchemaBuilder extends SchemaBuilder {
  private final EarlybirdCluster cluster;

  public EarlybirdSchemaBuilder(FieldNameToIdMapping idMapping,
                                EarlybirdCluster cluster,
                                TokenStreamSerializer.Version tokenStreamSerializerVersion) {
    super(idMapping, tokenStreamSerializerVersion);
    this.cluster = cluster;
  }

  /**
   * Configures the specified field to be out-of-order.
   * In the realtime cluster, this causes Earlybird to use the skip list posting format.
   */
  public final EarlybirdSchemaBuilder withOutOfOrderEnabledForField(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings settings =
        schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
    Preconditions.checkState(settings.isSetIndexedFieldSettings(),
        "Out of order field must be indexed");
    settings.getIndexedFieldSettings().setSupportOutOfOrderAppends(true);
    return this;
  }

  /**
   * Turns on tweet-specific normalizations by enabling the following two token processors:
   * {@link com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter}
   * {@link com.twitter.search.common.util.text.filter.NormalizedTokenFilter}
   * <p/>
   * HashtagMentionPunctuationSplitter breaks a mention or hashtag like @ab_cd or #ab_cd into
   * tokens {ab, cd}.
   * NormalizedTokenFilter strips the # @ $ from the tokens.
   */
  public final EarlybirdSchemaBuilder withTweetSpecificNormalization(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings settings =
        schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
    Preconditions.checkState(settings.isSetIndexedFieldSettings(),
        "Tweet text field must be indexed.");
    settings.getIndexedFieldSettings().setDeprecated_performTweetSpecificNormalizations(true);
    return this;
  }

  /**
   * Adds a Twitter photo facet field.
   */
  public final EarlybirdSchemaBuilder withPhotoUrlFacetField(String fieldName) {
    if (!shouldIncludeField(fieldName)) {
      return this;
    }
    ThriftFieldSettings photoFieldSettings = getNoPositionNoFreqSettings();
    ThriftTokenStreamSerializer tokenStreamSerializer =
        new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
    tokenStreamSerializer.setAttributeSerializerClassNames(
        ImmutableList.<String>of(
            CharTermAttributeSerializer.class.getName(),
            TermPayloadAttributeSerializer.class.getName()));
    photoFieldSettings
        .getIndexedFieldSettings()
        .setTokenStreamSerializer(tokenStreamSerializer)
        .setTokenized(true);
    putIntoFieldConfigs(idMapping.getFieldID(fieldName),
        new ThriftFieldConfiguration(fieldName).setSettings(photoFieldSettings));
    return this;
  }

  /**
   * Returns whether the given field should be included or dropped.
   */
  @Override
  protected boolean shouldIncludeField(String fieldName) {
    return EarlybirdFieldConstants.getFieldConstant(fieldName).isValidFieldInCluster(cluster);
  }
}
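Every with*() method above guards on shouldIncludeField and returns this unchanged when the field is not valid in the target cluster, so one schema definition can be shared across clusters that index different field subsets. A minimal sketch of that guard pattern, with hypothetical names:

// Illustrative sketch only: the cluster-guard pattern used by the builder above.
import java.util.ArrayList;
import java.util.List;
import java.util.Set;

public final class GuardedSchemaBuilderSketch {
  private final Set<String> validFieldsForCluster;
  private final List<String> configuredFields = new ArrayList<>();

  public GuardedSchemaBuilderSketch(Set<String> validFieldsForCluster) {
    this.validFieldsForCluster = validFieldsForCluster;
  }

  public GuardedSchemaBuilderSketch withField(String fieldName) {
    if (!validFieldsForCluster.contains(fieldName)) {
      return this;  // field not indexed in this cluster; keep the fluent chain going
    }
    configuredFields.add(fieldName);
    return this;
  }

  public static void main(String[] args) {
    GuardedSchemaBuilderSketch builder =
        new GuardedSchemaBuilderSketch(Set.of("text", "hashtags"));
    builder.withField("text").withField("card_title").withField("hashtags");
    System.out.println(builder.configuredFields);  // [text, hashtags]
  }
}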
Binary file not shown.
@@ -1,702 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;

import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.AnalyzerFactory;
import com.twitter.search.common.schema.DynamicSchema;
import com.twitter.search.common.schema.ImmutableSchema;
import com.twitter.search.common.schema.SchemaBuilder;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
import com.twitter.search.common.schema.thriftjava.ThriftSchema;

import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_FEATURE_UNUSED_BITS_0_24_8;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_16;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_17;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_18;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_19;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_20;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_QUOTE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_TREND_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_NULLCAST_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_REPLY_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_RETWEET_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_TRENDING_NOW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_BOT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NEW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NSFW_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_SPAM_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LANGUAGE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_STOCKS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PARUS_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PBLOCK_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_EXISTS;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_LIVE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TOXICITY_SCORE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.USER_REPUTATION;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT;
import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT;

/**
 * Field configurations for Earlybird.
 */
public final class EarlybirdSchemaCreateTool {
  // How many times a schema is built
  private static final SearchCounter SCHEMA_BUILD_COUNT =
      SearchCounter.export("schema_build_count");

  // Number of integers for the column of ENCODED_TWEET_FEATURES_FIELD.
  @VisibleForTesting
  public static final int NUMBER_OF_INTEGERS_FOR_FEATURES = 5;

  // Number of integers for the column of EXTENDED_ENCODED_TWEET_FEATURES_FIELD:
  // an extra 80 bytes per document. In the realtime cluster, assuming 19 segments
  // total and 8388608 docs per segment, this amounts to about 12.75GB of memory.
  @VisibleForTesting
  public static final int NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES = 20;

  @VisibleForTesting
  public static final Map<String, FeatureConfiguration> FEATURE_CONFIGURATION_MAP
      = Maps.newLinkedHashMap();

  public static final String BASE_FIELD_NAME =
      EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName();

  private static String getBaseFieldName(String fullName) {
    int index = fullName.indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR);
    Preconditions.checkArgument(index > 0);
    return fullName.substring(0, index);
  }

  private static String getBaseFieldName(EarlybirdFieldConstant fieldConstant) {
    return getBaseFieldName(fieldConstant.getFieldName());
  }

  private static String getFeatureNameInField(EarlybirdFieldConstant fieldConstant) {
    int index = fieldConstant.getFieldName().indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR);
    Preconditions.checkArgument(index > 0);
    return fieldConstant.getFieldName().substring(index + 1);
  }

  // defining all features
  static {
    // Add individual tweet encoded features as views on top of
    // EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD

    // int intIndex, int bitStartPos, int bitLength
    newEarlybirdFeatureConfiguration(IS_RETWEET_FLAG, ThriftCSFType.BOOLEAN, 0, 0, 1);
    newEarlybirdFeatureConfiguration(IS_OFFENSIVE_FLAG, ThriftCSFType.BOOLEAN, 0, 1, 1);
    newEarlybirdFeatureConfiguration(HAS_LINK_FLAG, ThriftCSFType.BOOLEAN, 0, 2, 1);
    newEarlybirdFeatureConfiguration(HAS_TREND_FLAG, ThriftCSFType.BOOLEAN, 0, 3, 1);
    newEarlybirdFeatureConfiguration(IS_REPLY_FLAG, ThriftCSFType.BOOLEAN, 0, 4, 1);
    newEarlybirdFeatureConfiguration(IS_SENSITIVE_CONTENT, ThriftCSFType.BOOLEAN, 0, 5, 1);
    newEarlybirdFeatureConfiguration(HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG,
        ThriftCSFType.BOOLEAN, 0, 6, 1);
    newEarlybirdFeatureConfiguration(FROM_VERIFIED_ACCOUNT_FLAG, ThriftCSFType.BOOLEAN, 0, 7, 1);
    newEarlybirdFeatureConfiguration(TEXT_SCORE, ThriftCSFType.INT, 0, 8, 8);
    newEarlybirdFeatureConfiguration(LANGUAGE, ThriftCSFType.INT, 0, 16, 8);
    newEarlybirdFeatureConfiguration(LINK_LANGUAGE, ThriftCSFType.INT, 0, 24, 8);

    newEarlybirdFeatureConfiguration(HAS_IMAGE_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 0, 1);
    newEarlybirdFeatureConfiguration(HAS_VIDEO_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 1, 1);
    newEarlybirdFeatureConfiguration(HAS_NEWS_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 2, 1);
    newEarlybirdFeatureConfiguration(HAS_EXPANDO_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 3, 1);
    newEarlybirdFeatureConfiguration(HAS_MULTIPLE_MEDIA_FLAG, ThriftCSFType.BOOLEAN, 1, 4, 1);
    newEarlybirdFeatureConfiguration(PROFILE_IS_EGG_FLAG, ThriftCSFType.BOOLEAN, 1, 5, 1);
    newEarlybirdFeatureConfiguration(NUM_MENTIONS, ThriftCSFType.INT, 1, 6, 2); // 0, 1, 2, 3+
    newEarlybirdFeatureConfiguration(NUM_HASHTAGS, ThriftCSFType.INT, 1, 8, 2); // 0, 1, 2, 3+
    newEarlybirdFeatureConfiguration(HAS_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 10, 1);
    newEarlybirdFeatureConfiguration(HAS_VISIBLE_LINK_FLAG, ThriftCSFType.BOOLEAN, 1, 11, 1);
    newEarlybirdFeatureConfiguration(USER_REPUTATION, ThriftCSFType.INT, 1, 12, 8);
    newEarlybirdFeatureConfiguration(IS_USER_SPAM_FLAG, ThriftCSFType.BOOLEAN, 1, 20, 1);
    newEarlybirdFeatureConfiguration(IS_USER_NSFW_FLAG, ThriftCSFType.BOOLEAN, 1, 21, 1);
    newEarlybirdFeatureConfiguration(IS_USER_BOT_FLAG, ThriftCSFType.BOOLEAN, 1, 22, 1);
    newEarlybirdFeatureConfiguration(IS_USER_NEW_FLAG, ThriftCSFType.BOOLEAN, 1, 23, 1);
    newEarlybirdFeatureConfiguration(PREV_USER_TWEET_ENGAGEMENT, ThriftCSFType.INT, 1, 24, 6);
    newEarlybirdFeatureConfiguration(COMPOSER_SOURCE_IS_CAMERA_FLAG,
        ThriftCSFType.BOOLEAN, 1, 30, 1);
    newEarlybirdFeatureConfiguration(IS_NULLCAST_FLAG, ThriftCSFType.BOOLEAN, 1, 31, 1);

    newEarlybirdFeatureConfiguration(RETWEET_COUNT, ThriftCSFType.DOUBLE, 2, 0, 8,
        ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(FAVORITE_COUNT, ThriftCSFType.DOUBLE, 2, 8, 8,
        ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(REPLY_COUNT, ThriftCSFType.DOUBLE, 2, 16, 8,
        ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(PARUS_SCORE, ThriftCSFType.DOUBLE, 2, 24, 8);

    newEarlybirdFeatureConfiguration(HAS_CONSUMER_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 0, 1);
    newEarlybirdFeatureConfiguration(HAS_PRO_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 1, 1);
    newEarlybirdFeatureConfiguration(HAS_VINE_FLAG, ThriftCSFType.BOOLEAN, 3, 2, 1);
    newEarlybirdFeatureConfiguration(HAS_PERISCOPE_FLAG, ThriftCSFType.BOOLEAN, 3, 3, 1);
    newEarlybirdFeatureConfiguration(HAS_NATIVE_IMAGE_FLAG, ThriftCSFType.BOOLEAN, 3, 4, 1);
    // NOTE: There are 3 bits left in the first byte of INT 3, if possible, please reserve them
    // for future media types (SEARCH-9131)
    // newEarlybirdFeatureConfiguration(FUTURE_MEDIA_BITS, ThriftCSFType.INT, 3, 5, 3);

    newEarlybirdFeatureConfiguration(VISIBLE_TOKEN_RATIO, ThriftCSFType.INT, 3, 8, 4);
    newEarlybirdFeatureConfiguration(HAS_QUOTE_FLAG, ThriftCSFType.BOOLEAN, 3, 12, 1);
    newEarlybirdFeatureConfiguration(FROM_BLUE_VERIFIED_ACCOUNT_FLAG,
        ThriftCSFType.BOOLEAN, 3, 13, 1);
    // Unused bits from bit 14 to bit 31 (18 bits)
    // newEarlybirdFeatureConfiguration(UNUSED_BITS, ThriftCSFType.INT, 3, 14, 18);

    newEarlybirdFeatureConfiguration(TWEET_SIGNATURE, ThriftCSFType.INT, 4, 0, 32);

    newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT,
        ThriftCSFType.DOUBLE, 0, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT,
        ThriftCSFType.DOUBLE, 0, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT,
        ThriftCSFType.DOUBLE, 0, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);

    // Unused bits from bit 24 to bit 31 (8 bits).
    // This used to be a feature that was decommissioned (SEARCHQUAL-10321)
    newEarlybirdFeatureConfiguration(EXTENDED_FEATURE_UNUSED_BITS_0_24_8,
        ThriftCSFType.INT, 0, 24, 8);

    newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT,
        ThriftCSFType.INT, 1, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE);
    newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT,
        ThriftCSFType.INT, 2, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE);

    newEarlybirdFeatureConfiguration(RETWEET_COUNT_V2,
        ThriftCSFType.DOUBLE, 3, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(FAVORITE_COUNT_V2,
        ThriftCSFType.DOUBLE, 3, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(REPLY_COUNT_V2,
        ThriftCSFType.DOUBLE, 3, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT_V2,
        ThriftCSFType.DOUBLE, 3, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);

    newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT_V2,
        ThriftCSFType.DOUBLE, 4, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT_V2,
        ThriftCSFType.DOUBLE, 4, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(QUOTE_COUNT,
        ThriftCSFType.DOUBLE, 4, 16, 8);

    newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_FLAG, ThriftCSFType.BOOLEAN, 4, 24, 1);
    newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 25, 1);
    newEarlybirdFeatureConfiguration(LABEL_DUP_CONTENT_FLAG, ThriftCSFType.BOOLEAN, 4, 26, 1);
    newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_PRC_FLAG, ThriftCSFType.BOOLEAN, 4, 27, 1);
    newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 28, 1);
    newEarlybirdFeatureConfiguration(LABEL_SPAM_FLAG, ThriftCSFType.BOOLEAN, 4, 29, 1);
    newEarlybirdFeatureConfiguration(LABEL_SPAM_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 30, 1);

    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1,
        ThriftCSFType.INT, 4, 31, 1);

    newEarlybirdFeatureConfiguration(WEIGHTED_RETWEET_COUNT,
        ThriftCSFType.DOUBLE, 5, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(WEIGHTED_REPLY_COUNT,
        ThriftCSFType.DOUBLE, 5, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(WEIGHTED_FAVORITE_COUNT,
        ThriftCSFType.DOUBLE, 5, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(WEIGHTED_QUOTE_COUNT,
        ThriftCSFType.DOUBLE, 5, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);

    newEarlybirdFeatureConfiguration(PERISCOPE_EXISTS,
        ThriftCSFType.BOOLEAN, 6, 0, 1);
    newEarlybirdFeatureConfiguration(PERISCOPE_HAS_BEEN_FEATURED,
        ThriftCSFType.BOOLEAN, 6, 1, 1);
    newEarlybirdFeatureConfiguration(PERISCOPE_IS_CURRENTLY_FEATURED,
        ThriftCSFType.BOOLEAN, 6, 2, 1);
    newEarlybirdFeatureConfiguration(PERISCOPE_IS_FROM_QUALITY_SOURCE,
        ThriftCSFType.BOOLEAN, 6, 3, 1);
    newEarlybirdFeatureConfiguration(PERISCOPE_IS_LIVE,
        ThriftCSFType.BOOLEAN, 6, 4, 1);

    newEarlybirdFeatureConfiguration(IS_TRENDING_NOW_FLAG,
        ThriftCSFType.BOOLEAN, 6, 5, 1);

    // remaining bits for integer 6
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26,
        ThriftCSFType.INT, 6, 6, 26);

    // The decaying counters can become smaller
    newEarlybirdFeatureConfiguration(DECAYED_RETWEET_COUNT,
        ThriftCSFType.DOUBLE, 7, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(DECAYED_REPLY_COUNT,
        ThriftCSFType.DOUBLE, 7, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(DECAYED_FAVORITE_COUNT,
        ThriftCSFType.DOUBLE, 7, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(DECAYED_QUOTE_COUNT,
        ThriftCSFType.DOUBLE, 7, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);

    // The fake engagement counters.
    newEarlybirdFeatureConfiguration(FAKE_RETWEET_COUNT,
        ThriftCSFType.DOUBLE, 8, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(FAKE_REPLY_COUNT,
        ThriftCSFType.DOUBLE, 8, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(FAKE_FAVORITE_COUNT,
        ThriftCSFType.DOUBLE, 8, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(FAKE_QUOTE_COUNT,
        ThriftCSFType.DOUBLE, 8, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);

    newEarlybirdFeatureConfiguration(LAST_RETWEET_SINCE_CREATION_HRS,
        ThriftCSFType.INT, 9, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(LAST_REPLY_SINCE_CREATION_HRS,
        ThriftCSFType.INT, 9, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(LAST_FAVORITE_SINCE_CREATION_HRS,
        ThriftCSFType.INT, 9, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY);
    newEarlybirdFeatureConfiguration(LAST_QUOTE_SINCE_CREATION_HRS,
        ThriftCSFType.INT, 9, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY);

    newEarlybirdFeatureConfiguration(NUM_HASHTAGS_V2,
        ThriftCSFType.INT, 10, 0, 4);
    newEarlybirdFeatureConfiguration(NUM_MENTIONS_V2,
        ThriftCSFType.INT, 10, 4, 4);
    newEarlybirdFeatureConfiguration(NUM_STOCKS,
        ThriftCSFType.INT, 10, 8, 4);

    // Remaining bits for integer 10
    // Production Toxicity and PBlock score from HML (go/toxicity, go/pblock)
    newEarlybirdFeatureConfiguration(TOXICITY_SCORE,
        ThriftCSFType.DOUBLE, 10, 12, 10);
    newEarlybirdFeatureConfiguration(PBLOCK_SCORE,
        ThriftCSFType.DOUBLE, 10, 22, 10);

    // The blink engagement counters
    newEarlybirdFeatureConfiguration(BLINK_RETWEET_COUNT,
        ThriftCSFType.DOUBLE, 11, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(BLINK_REPLY_COUNT,
        ThriftCSFType.DOUBLE, 11, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(BLINK_FAVORITE_COUNT,
        ThriftCSFType.DOUBLE, 11, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE);
    newEarlybirdFeatureConfiguration(BLINK_QUOTE_COUNT,
        ThriftCSFType.DOUBLE, 11, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE);

    // Experimental health model scores from HML
    newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_1,
        ThriftCSFType.DOUBLE, 12, 0, 10);
    newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_2,
        ThriftCSFType.DOUBLE, 12, 10, 10);
    newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_3,
        ThriftCSFType.DOUBLE, 12, 20, 10);
    // remaining bits for integer 12
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2,
        ThriftCSFType.INT, 12, 30, 2);

    // Experimental health model scores from HML (cont.)
    newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_4,
        ThriftCSFType.DOUBLE, 13, 0, 10);
    // Production pSpammyTweet score from HML (go/pspammytweet)
    newEarlybirdFeatureConfiguration(P_SPAMMY_TWEET_SCORE,
        ThriftCSFType.DOUBLE, 13, 10, 10);
    // Production pReportedTweet score from HML (go/preportedtweet)
    newEarlybirdFeatureConfiguration(P_REPORTED_TWEET_SCORE,
        ThriftCSFType.DOUBLE, 13, 20, 10);
    // remaining bits for integer 13
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2,
        ThriftCSFType.INT, 13, 30, 2);

    // Experimental health model scores from HML (cont.)
    // Prod Spammy Tweet Content model score from Platform Manipulation (go/spammy-tweet-content)
    newEarlybirdFeatureConfiguration(SPAMMY_TWEET_CONTENT_SCORE,
        ThriftCSFType.DOUBLE, 14, 0, 10);
    // remaining bits for integer 14
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22,
        ThriftCSFType.INT, 14, 10, 22);

    // Note that the integer index below is 0-based, but the index j in UNUSED_BITS_{j} below
    // is 1-based.
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_16,
        ThriftCSFType.INT, 15, 0, 32);
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_17,
        ThriftCSFType.INT, 16, 0, 32);
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_18,
        ThriftCSFType.INT, 17, 0, 32);
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_19,
        ThriftCSFType.INT, 18, 0, 32);
    newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_20,
        ThriftCSFType.INT, 19, 0, 32);
  }

  private EarlybirdSchemaCreateTool() { }

  /**
   * Gets the schema for Earlybird.
   */
  public static DynamicSchema buildSchema(EarlybirdCluster cluster)
      throws Schema.SchemaValidationException {
    SCHEMA_BUILD_COUNT.increment();
    return new DynamicSchema(new ImmutableSchema(buildThriftSchema(cluster),
        new AnalyzerFactory(),
        cluster.getNameForStats()));
  }

  /**
   * Gets the schema for Earlybird; may throw a runtime exception. This is mostly for
   * static schema usage, which does not care about schema updates.
   */
  @VisibleForTesting
  public static DynamicSchema buildSchemaWithRuntimeException(EarlybirdCluster cluster) {
    try {
      return buildSchema(cluster);
    } catch (Schema.SchemaValidationException e) {
      throw new RuntimeException(e);
    }
  }

  private static FeatureConfiguration newEarlybirdFeatureConfiguration(
      EarlybirdFieldConstant fieldConstant,
      ThriftCSFType type,
      int intIndex, int bitStartPos, int bitLength,
      ThriftFeatureUpdateConstraint... constraints) {

    if (!fieldConstant.isFlagFeatureField() && type == ThriftCSFType.BOOLEAN) {
      throw new IllegalArgumentException(
          "Non-flag feature field configured with boolean Thrift type: " + fieldConstant);
    }
    if (fieldConstant.isFlagFeatureField() && type != ThriftCSFType.BOOLEAN) {
      throw new IllegalArgumentException(
          "Flag feature field configured with non-boolean Thrift type: " + fieldConstant);
    }

    String baseFieldName = getBaseFieldName(fieldConstant);
    String name = getFeatureNameInField(fieldConstant);
    FeatureConfiguration.Builder builder = FeatureConfiguration.builder()
        .withName(name)
        .withType(type)
        .withBitRange(intIndex, bitStartPos, bitLength);
    // remove the following line once we configure features purely by the schema
    builder.withBaseField(baseFieldName);

    if (!fieldConstant.isUnusedField()) {
      builder.withOutputType(type);
    }
    if (fieldConstant.getFeatureNormalizationType() != null) {
      builder.withFeatureNormalizationType(fieldConstant.getFeatureNormalizationType());
    }

    for (ThriftFeatureUpdateConstraint constraint : constraints) {
      builder.withFeatureUpdateConstraint(constraint);
    }
    FeatureConfiguration featureConfiguration = builder.build();
    FEATURE_CONFIGURATION_MAP.put(fieldConstant.getFieldName(), featureConfiguration);
    return featureConfiguration;
  }

  /**
   * Builds the ThriftSchema for Earlybird. Note that the schema returned can be used by
   * all Earlybird clusters. However, some clusters may not use all the field configurations.
   */
  @VisibleForTesting
  public static ThriftSchema buildThriftSchema(EarlybirdCluster cluster) {
    EarlybirdSchemaBuilder builder = new EarlybirdSchemaBuilder(
        new EarlybirdFieldConstants(), cluster, TokenStreamSerializer.Version.VERSION_2);

    builder.withSchemaVersion(
        FlushVersion.CURRENT_FLUSH_VERSION.getVersionNumber(),
        FlushVersion.CURRENT_FLUSH_VERSION.getMinorVersion(),
        FlushVersion.CURRENT_FLUSH_VERSION.getDescription(),
        FlushVersion.CURRENT_FLUSH_VERSION.isOfficial());

    // ID field, used for partitioning
    builder.withPartitionFieldId(0)
        .withSortableLongTermField(EarlybirdFieldConstant.ID_FIELD.getFieldName())
        // Text Fields that are searched by default
        .withTextField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), true)
        .withSearchFieldByDefault(
            EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), 0.1f)
        .withPretokenizedTextField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), true)
        .withSearchFieldByDefault(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), 1.0f);
    builder.withTweetSpecificNormalization(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())
        .withTextField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), true)
        .withSearchFieldByDefault(
            EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), 0.2f)

        // Text fields not searched by default
        .withTextField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), false)
        .withTextField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), false)

        // cards are not searched by default, and have weight 0.
        .withPretokenizedTextField(EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), false)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), false)
        .withTextField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), false)

        // Out-of-order append fields
        .withLongTermField(EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName())

        // No Position fields, sorted alphabetically
        .withPretokenizedNoPositionField(EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName())
        .withIntTermField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName())
        .withTextField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), false)
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName())
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName())
        .withTextField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(), false)
        .withTextField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(), false)
        .withTermTextLookup(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())
        .withTermTextLookup(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName())
        .withPretokenizedNoPositionField(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName())
        .withIndexedNotTokenizedField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD)
        .withIndexedNotTokenizedField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName())
        .withIntTermField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName())
        .withPretokenizedNoPositionField(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName())
        .withIndexedNotTokenizedField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName())
        .withIntTermField(NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
        .withIntTermField(NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())
        .withIntTermField(NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName())

        .withIntTermField(EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName())

        .withLongTermField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName())
        .withLongTermField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName())

        // Named entity fields
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD.getFieldName(), true)
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD.getFieldName(), true)
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), true)
        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), true)

        // camelCase-tokenized user handles and tokenized user names, not searchable by default
        .withPretokenizedTextField(
            EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(), false)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), false)

        .withIndexedNotTokenizedField(
            EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName())
        .withTextField(EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(), false)
        .withPretokenizedTextField(EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(), false)
        .withTextField(EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), true)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), false)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(), false)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(), false)
        .withPretokenizedTextField(
            EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(), false);

    builder
        .withPhotoUrlFacetField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName())
        .withOutOfOrderEnabledForField(
            EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName())
        .withOutOfOrderEnabledForField(
            EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName())
        .withOutOfOrderEnabledForField(
            EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName());

    // ColumnStrideFields.
    boolean loadCSFIntoRAMDefault = cluster != EarlybirdCluster.FULL_ARCHIVE;

    builder
        .withColumnStrideField(EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME,
            ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_FEATURES,
            true, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
        .withColumnStrideField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
            ThriftCSFType.BYTE, 1, false, loadCSFIntoRAMDefault)
        // CSF Used by archive mappers
        .withColumnStrideField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(),
            ThriftCSFType.INT, 1, false, /* the full archive loads this field into RAM */ true)
        .withColumnStrideField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(),
            ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true)
        .withColumnStrideField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(),
            ThriftCSFType.INT, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)
        .withColumnStrideField(
            EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(),
            ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault)

        /* Semicolon on separate line to preserve git blame. */;

    builder.withColumnStrideField(
        EarlybirdFieldConstants.EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME,
        ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES,
        true, loadCSFIntoRAMDefault);

    for (Map.Entry<String, FeatureConfiguration> entry : FEATURE_CONFIGURATION_MAP.entrySet()) {
      String fullName = entry.getKey();
      String baseName = getBaseFieldName(fullName);
      EarlybirdFieldConstant fieldConstant = EarlybirdFieldConstants.getFieldConstant(fullName);
      if (fieldConstant.isValidFieldInCluster(cluster)) {
        builder.withFeatureConfiguration(baseName, fullName, entry.getValue());
      }
    }
    // Add facet settings for facet fields
    // boolean args are respectively whether to use skiplist, whether offensive, whether to use CSF
    builder
        .withFacetConfigs(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(),
            EarlybirdFieldConstant.MENTIONS_FACET, true, false, false)
        .withFacetConfigs(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(),
            EarlybirdFieldConstant.HASHTAGS_FACET, true, false, false)
        .withFacetConfigs(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(),
            EarlybirdFieldConstant.STOCKS_FACET, true, false, false)
        .withFacetConfigs(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(),
            EarlybirdFieldConstant.IMAGES_FACET, true, true, false)
        .withFacetConfigs(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(),
            EarlybirdFieldConstant.VIDEOS_FACET, true, true, false)
        .withFacetConfigs(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(),
            EarlybirdFieldConstant.NEWS_FACET, true, false, false)
        .withFacetConfigs(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(),
            EarlybirdFieldConstant.LANGUAGES_FACET, false, false, false)
.withFacetConfigs(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(),
|
||||
EarlybirdFieldConstant.SOURCES_FACET, false, false, false)
|
||||
.withFacetConfigs(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(),
|
||||
EarlybirdFieldConstant.TWIMG_FACET, true, true, false)
|
||||
.withFacetConfigs(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(),
|
||||
EarlybirdFieldConstant.FROM_USER_ID_FACET, false, false, true /* facet on CSF */)
|
||||
.withFacetConfigs(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(),
|
||||
EarlybirdFieldConstant.RETWEETS_FACET, false, false, true /* facet on CSF */)
|
||||
.withFacetConfigs(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(),
|
||||
EarlybirdFieldConstant.LINKS_FACET, true, false, false)
|
||||
.withFacetConfigs(
|
||||
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(),
|
||||
true, false, false)
|
||||
.withFacetConfigs(
|
||||
EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(),
|
||||
true, false, false)
|
||||
.withFacetConfigs(
|
||||
EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(),
|
||||
true, false, false)
|
||||
.withFacetConfigs(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(),
|
||||
EarlybirdFieldConstant.SPACES_FACET, true, false, false);
|
||||
return builder.build();
|
||||
}
|
||||
}
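
// Illustrative sketch (not part of the original file): how a schema produced by
// the build() call above might be consumed. The "schema" variable is
// hypothetical; the lookup calls mirror APIs used elsewhere in this package.
//
//   ImmutableSchemaInterface schema = /* result of the build() call above */;
//   FeatureConfiguration numHashtagsConfig =
//       schema.getFeatureConfigurationById(
//           EarlybirdFieldConstant.NUM_HASHTAGS.getFieldId());
//   int maxHashtags = numHashtagsConfig.getMaxValue();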
Binary file not shown.
@ -1,897 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import java.io.IOException;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Sets;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.TokenStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.collections.Pair;
import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntity;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntityContext;
import com.twitter.cuad.ner.plain.thriftjava.NamedEntityInputSourceType;
import com.twitter.cuad.ner.thriftjava.WholeEntityType;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl;
import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource;
import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.ThriftDocumentBuilder;
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer;
import com.twitter.search.common.util.analysis.TwitterPhotoTokenStream;
import com.twitter.search.common.util.spatial.GeoUtil;
import com.twitter.search.common.util.text.TokenizerHelper;
import com.twitter.search.common.util.text.TweetTokenStreamSerializer;
import com.twitter.search.common.util.text.regex.Regex;
import com.twitter.search.common.util.url.LinkVisibilityUtils;
import com.twitter.search.common.util.url.URLUtils;

import geo.google.datamodel.GeoAddressAccuracy;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;

/**
 * Builder class for building a {@link ThriftDocument}.
 */
public final class EarlybirdThriftDocumentBuilder extends ThriftDocumentBuilder {
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdThriftDocumentBuilder.class);

  private static final SearchCounter SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT =
      SearchCounter.export("tokenstream_serialization_failure_non_penguin_dependent");

  private static final String HASHTAG_SYMBOL = "#";
  private static final String CASHTAG_SYMBOL = "$";
  private static final String MENTION_SYMBOL = "@";

  private static final SearchCounter BCP47_LANGUAGE_TAG_COUNTER =
      SearchCounter.export("bcp47_language_tag");

  /**
   * Used to check if a card is a video card.
   *
   * @see #withSearchCard
   */
  private static final String AMPLIFY_CARD_NAME = "amplify";
  private static final String PLAYER_CARD_NAME = "player";

  // Extra term indexed for native retweets, to ensure that the "-rt" query excludes them.
  public static final String RETWEET_TERM = "rt";
  public static final String QUESTION_MARK = "?";

  private static final Set<NamedEntityInputSourceType> NAMED_ENTITY_URL_SOURCE_TYPES =
      ImmutableSet.of(
          NamedEntityInputSourceType.URL_TITLE, NamedEntityInputSourceType.URL_DESCRIPTION);

  private final TokenStreamSerializer intTermAttributeSerializer =
      new TokenStreamSerializer(ImmutableList.of(
          new IntTermAttributeSerializer()));
  private final TokenStreamSerializer photoUrlSerializer =
      new TokenStreamSerializer(ImmutableList
          .<TokenStreamSerializer.AttributeSerializer>of(
              new CharTermAttributeSerializer(), new TermPayloadAttributeSerializer()));
  private final Schema schema;

  private boolean isSetLatLonCSF = false;
  private boolean addLatLonCSF = true;
  private boolean addEncodedTweetFeatures = true;

  @Nonnull
  private final EarlybirdEncodedFeatures encodedTweetFeatures;
  @Nullable
  private final EarlybirdEncodedFeatures extendedEncodedTweetFeatures;

  /**
   * Creates a builder for the given encoded features, field mapping, and schema.
   */
  public EarlybirdThriftDocumentBuilder(
      @Nonnull EarlybirdEncodedFeatures encodedTweetFeatures,
      @Nullable EarlybirdEncodedFeatures extendedEncodedTweetFeatures,
      FieldNameToIdMapping idMapping,
      Schema schema) {
    super(idMapping);
    this.schema = schema;
    this.encodedTweetFeatures = Preconditions.checkNotNull(encodedTweetFeatures);

    this.extendedEncodedTweetFeatures = extendedEncodedTweetFeatures;
  }

  /**
   * Get the internal {@link EarlybirdEncodedFeatures}.
   */
  public EarlybirdEncodedFeatures getEncodedTweetFeatures() {
    return encodedTweetFeatures;
  }

  /**
   * Add a skip list entry for the given field.
   * This adds a term __has_fieldName in the INTERNAL field.
   */
  public EarlybirdThriftDocumentBuilder addFacetSkipList(String fieldName) {
    withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
        EarlybirdFieldConstant.getFacetSkipFieldName(fieldName));
    return this;
  }

  /**
   * Add a filter term in the INTERNAL field.
   */
  public EarlybirdThriftDocumentBuilder addFilterInternalFieldTerm(String filterName) {
    withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
        EarlybirdThriftDocumentUtil.formatFilter(filterName));
    return this;
  }

  /**
   * Add the id field and the id CSF field.
   */
  public EarlybirdThriftDocumentBuilder withID(long id) {
    withLongField(EarlybirdFieldConstant.ID_FIELD.getFieldName(), id);
    withLongField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(), id);
    return this;
  }

  /**
   * Add the created at field and the created at CSF field.
   */
  public EarlybirdThriftDocumentBuilder withCreatedAt(int createdAt) {
    withIntField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), createdAt);
    withIntField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(), createdAt);
    return this;
  }

  /**
   * Add the tweet text field.
   */
  public EarlybirdThriftDocumentBuilder withTweetText(
      String text, byte[] textTokenStream) throws IOException {
    withTokenStreamField(EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_FIELD.getFieldName(),
        text, textTokenStream);
    return this;
  }

  public EarlybirdThriftDocumentBuilder withTweetText(String text) throws IOException {
    withTweetText(text, null);
    return this;
  }

  /**
   * Add a list of cashtags, e.g. $TWTR.
   */
  public EarlybirdThriftDocumentBuilder withStocksFields(List<String> cashTags) {
    if (isNotEmpty(cashTags)) {
      addFacetSkipList(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName());
      for (String cashTag : cashTags) {
        withStringField(
            EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(), CASHTAG_SYMBOL + cashTag);
      }
    }
    return this;
  }

  /**
   * Add a list of hashtags.
   */
  public EarlybirdThriftDocumentBuilder withHashtagsField(List<String> hashtags) {
    if (isNotEmpty(hashtags)) {
      int numHashtags = Math.min(
          hashtags.size(),
          schema.getFeatureConfigurationById(
              EarlybirdFieldConstant.NUM_HASHTAGS.getFieldId()).getMaxValue());
      encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.NUM_HASHTAGS, numHashtags);
      addFacetSkipList(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName());
      for (String hashtag : hashtags) {
        withStringField(
            EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(), HASHTAG_SYMBOL + hashtag);
      }
    }
    return this;
  }

  /**
   * Add a list of mentions.
   */
  public EarlybirdThriftDocumentBuilder withMentionsField(List<String> mentions) {
    if (isNotEmpty(mentions)) {
      // Cap the count by the NUM_MENTIONS feature's configured maximum.
      int numMentions = Math.min(
          mentions.size(),
          schema.getFeatureConfigurationById(
              EarlybirdFieldConstant.NUM_MENTIONS.getFieldId()).getMaxValue());
      encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS, numMentions);
      addFacetSkipList(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName());
      for (String mention : mentions) {
        withStringField(
            EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(), MENTION_SYMBOL + mention);
      }
    }
    return this;
  }

  /**
   * Add a list of Twitter Photo URLs (twimg URLs). These are different from regular URLs, because
   * we use the TwitterPhotoTokenStream to index them, and we also include the status ID as payload.
   */
  public EarlybirdThriftDocumentBuilder withTwimgURLs(
      List<TwitterPhotoUrl> urls) throws IOException {
    if (isNotEmpty(urls)) {
      for (TwitterPhotoUrl photoUrl : urls) {
        TokenStream ts = new TwitterPhotoTokenStream(photoUrl.getPhotoStatusId(),
            photoUrl.getMediaUrl());
        byte[] serializedTs = photoUrlSerializer.serialize(ts);
        withTokenStreamField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(),
            Long.toString(photoUrl.getPhotoStatusId()), serializedTs);
        addFacetSkipList(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName());
      }
      encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
      encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG);
    }
    return this;
  }

  /**
   * Add a list of URLs. This also adds facet skip list terms for news / images / videos if needed.
   */
  public EarlybirdThriftDocumentBuilder withURLs(List<ThriftExpandedUrl> urls) {
    if (isNotEmpty(urls)) {
      Set<String> dedupedLinks = Sets.newHashSet();

      for (ThriftExpandedUrl expandedUrl : urls) {
        if (expandedUrl.isSetOriginalUrl()) {
          String normalizedOriginalUrl = URLUtils.normalizePath(expandedUrl.getOriginalUrl());
          dedupedLinks.add(normalizedOriginalUrl);
        }
        if (expandedUrl.isSetExpandedUrl()) {
          dedupedLinks.add(URLUtils.normalizePath(expandedUrl.getExpandedUrl()));
        }

        if (expandedUrl.isSetCanonicalLastHopUrl()) {
          String url = URLUtils.normalizePath(expandedUrl.getCanonicalLastHopUrl());
          dedupedLinks.add(url);

          String facetUrl = URLUtils.normalizeFacetURL(url);

          if (expandedUrl.isSetMediaType()) {
            switch (expandedUrl.getMediaType()) {
              case NEWS:
                withStringField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(), url);
                addFacetSkipList(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName());
                encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG);
                break;
              case VIDEO:
                withStringField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(), facetUrl);
                addFacetSkipList(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
                encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG);
                break;
              case IMAGE:
                withStringField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(), facetUrl);
                addFacetSkipList(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName());
                encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG);
                break;
              case NATIVE_IMAGE:
                // Nothing done here. Native images are handled separately.
                // They are in PhotoUrls instead of expandedUrls.
                break;
              case UNKNOWN:
                break;
              default:
                throw new RuntimeException("Unknown Media Type: " + expandedUrl.getMediaType());
            }
          }

          if (expandedUrl.isSetLinkCategory()) {
            withIntField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName(),
                expandedUrl.getLinkCategory().getValue());
          }
        }
      }

      if (!dedupedLinks.isEmpty()) {
        encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG);

        addFacetSkipList(EarlybirdFieldConstant.LINKS_FIELD.getFieldName());

        for (String linkUrl : dedupedLinks) {
          withStringField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(), linkUrl);
        }
      }

      encodedTweetFeatures.setFlagValue(
          EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG,
          LinkVisibilityUtils.hasVisibleLink(urls));
    }

    return this;
  }

  /**
   * Add a list of places. The places are U64-encoded place IDs.
   */
  public EarlybirdThriftDocumentBuilder withPlacesField(List<String> places) {
    if (isNotEmpty(places)) {
      for (String place : places) {
        withStringField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName(), place);
      }
    }
    return this;
  }

  /**
   * Add the tweet text signature field.
   */
  public EarlybirdThriftDocumentBuilder withTweetSignature(int signature) {
    encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE, signature);
    return this;
  }

  /**
   * Add the geo hash field and the internal filter field.
   */
  public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon, int accuracy) {
    if (GeoUtil.validateGeoCoordinates(lat, lon)) {
      withGeoField(
          EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(),
          lat, lon, accuracy);
      withLatLonCSF(lat, lon);
    }
    return this;
  }

  public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon) {
    withGeoHash(lat, lon, GeoAddressAccuracy.UNKNOWN_LOCATION.getCode());
    return this;
  }

  /**
   * Add the geo location source to the internal field from a ThriftGeoLocationSource object.
   */
  public EarlybirdThriftDocumentBuilder withGeoLocationSource(
      ThriftGeoLocationSource geoLocationSource) {
    if (geoLocationSource != null) {
      withGeoLocationSource(EarlybirdFieldConstants.formatGeoType(geoLocationSource));
    }
    return this;
  }

  /**
   * Add the geo location source to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withGeoLocationSource(String geoLocationSource) {
    withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), geoLocationSource);
    return this;
  }

  /**
   * Add the encoded lat and lon to the LatLonCSF field.
   */
  public EarlybirdThriftDocumentBuilder withLatLonCSF(double lat, double lon) {
    isSetLatLonCSF = true;
    long encodedLatLon = GeoUtil.encodeLatLonIntoInt64((float) lat, (float) lon);
    withLongField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(), encodedLatLon);
    return this;
  }

  /**
   * Set the from-verified-account flag and add the filter term to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withFromVerifiedAccountFlag() {
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG);
    addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM);
    return this;
  }

  /**
   * Set the from-blue-verified-account flag and add the filter term to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withFromBlueVerifiedAccountFlag() {
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG);
    addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM);
    return this;
  }

  /**
   * Set the offensive flag and add the offensive term to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withOffensiveFlag() {
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG);
    withStringField(
        EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
        EarlybirdFieldConstant.IS_OFFENSIVE);
    return this;
  }

  /**
   * Add the user reputation value to the encoded features.
   */
  public EarlybirdThriftDocumentBuilder withUserReputation(byte score) {
    encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.USER_REPUTATION, score);
    return this;
  }

  /**
   * This method creates the fields related to document language.
   * For most languages, the isoLanguageCode and bcp47LanguageTag are the same.
   * For some languages with variants, these two fields are different.
   * E.g. for simplified Chinese, the isoLanguageCode is zh, but the bcp47LanguageTag is zh-cn.
   * <p>
   * This method adds fields for both the isoLanguageCode and the bcp47LanguageTag.
   */
  public EarlybirdThriftDocumentBuilder withLanguageCodes(
      String isoLanguageCode, String bcp47LanguageTag) {
    if (isoLanguageCode != null) {
      withISOLanguage(isoLanguageCode);
    }
    if (bcp47LanguageTag != null && !bcp47LanguageTag.equals(isoLanguageCode)) {
      BCP47_LANGUAGE_TAG_COUNTER.increment();
      withISOLanguage(bcp47LanguageTag);
    }
    return this;
  }
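
  // Illustrative example (not in the original source): for a simplified-Chinese
  // tweet, withLanguageCodes("zh", "zh-cn") indexes both "zh" and "zh-cn" into
  // the ISO language field, because the BCP-47 tag differs from the ISO code;
  // for plain English, withLanguageCodes("en", "en") indexes "en" only once.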

  /**
   * Adds a String field into the ISO_LANGUAGE_FIELD.
   */
  public EarlybirdThriftDocumentBuilder withISOLanguage(String languageString) {
    withStringField(
        EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), languageString.toLowerCase());
    return this;
  }

  /**
   * Add the from user ID fields.
   */
  public EarlybirdThriftDocumentBuilder withFromUserID(long fromUserId) {
    withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), fromUserId);
    withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), fromUserId);
    return this;
  }

  /**
   * Add from user information fields.
   */
  public EarlybirdThriftDocumentBuilder withFromUser(
      long fromUserId, String fromUser) {
    withFromUser(fromUserId, fromUser, null);
    return this;
  }

  /**
   * Add from user information fields.
   */
  public EarlybirdThriftDocumentBuilder withFromUser(String fromUser) {
    withFromUser(fromUser, null);
    return this;
  }

  /**
   * Add from user information fields.
   */
  public EarlybirdThriftDocumentBuilder withFromUser(
      String fromUser, String tokenizedFromUser) {
    withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), fromUser);
    withStringField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(),
        isNotBlank(tokenizedFromUser) ? tokenizedFromUser : fromUser);
    return this;
  }

  /**
   * Add from user information fields.
   */
  public EarlybirdThriftDocumentBuilder withFromUser(
      long fromUserId, String fromUser, String tokenizedFromUser) {
    withFromUserID(fromUserId);
    withFromUser(fromUser, tokenizedFromUser);
    return this;
  }

  /**
   * Add the to user field.
   */
  public EarlybirdThriftDocumentBuilder withToUser(
      String toUser) {
    withStringField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), toUser);
    return this;
  }

  /**
   * Add Escherbird annotation fields.
   */
  public EarlybirdThriftDocumentBuilder withAnnotationEntities(List<String> entities) {
    if (isNotEmpty(entities)) {
      for (String entity : entities) {
        withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), entity);
      }
    }
    return this;
  }

  /**
   * Add the replies filter term to the internal field and set the is-reply flag.
   */
  public EarlybirdThriftDocumentBuilder withReplyFlag() {
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_REPLY_FLAG);
    addFilterInternalFieldTerm(EarlybirdFieldConstant.REPLIES_FILTER_TERM);
    return this;
  }

  /**
   * Set the composer-source-is-camera flag.
   */
  public EarlybirdThriftDocumentBuilder withCameraComposerSourceFlag() {
    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG);
    return this;
  }

  /**
   * Add the in-reply-to user id.
   * <p>
   * Note that {@link #withReplyFlag} is not called automatically, since a retweet of a tweet
   * that is a reply to some other tweet is not itself considered a reply.
   * The caller should call {@link #withReplyFlag} separately if this tweet really is a reply.
   */
  public EarlybirdThriftDocumentBuilder withInReplyToUserID(long inReplyToUserID) {
    withLongField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName(), inReplyToUserID);
    return this;
  }

  /**
   * Add the reference tweet author id.
   */
  public EarlybirdThriftDocumentBuilder withReferenceAuthorID(long referenceAuthorID) {
    withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), referenceAuthorID);
    return this;
  }

  /**
   * Add all native-retweet-related fields and labels.
   */
  @VisibleForTesting
  public EarlybirdThriftDocumentBuilder withNativeRetweet(final long retweetUserID,
                                                          final long sharedStatusID) {
    withLongField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), sharedStatusID);

    withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName(),
        sharedStatusID);
    withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName(),
        retweetUserID);
    withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), retweetUserID);

    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_RETWEET_FLAG);

    // Add the native retweet label to the internal field.
    addFilterInternalFieldTerm(EarlybirdFieldConstant.NATIVE_RETWEETS_FILTER_TERM);
    withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), RETWEET_TERM);
    return this;
  }

  /**
   * Add the quoted tweet id and user id.
   */
  @VisibleForTesting
  public EarlybirdThriftDocumentBuilder withQuote(
      final long quotedStatusId, final long quotedUserId) {
    withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), quotedStatusId);
    withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), quotedUserId);

    withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(), quotedStatusId);
    withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(), quotedUserId);

    encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_QUOTE_FLAG);

    // Add the quote label to the internal field.
    addFilterInternalFieldTerm(EarlybirdFieldConstant.QUOTE_FILTER_TERM);
    return this;
  }

  /**
   * Add the resolved links text field.
   */
  public EarlybirdThriftDocumentBuilder withResolvedLinksText(String linksText) {
    withStringField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), linksText);
    return this;
  }

  /**
   * Add the source field.
   */
  public EarlybirdThriftDocumentBuilder withSource(String source) {
    withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(), source);
    return this;
  }

  /**
   * Add the normalized source field.
   */
  public EarlybirdThriftDocumentBuilder withNormalizedSource(String normalizedSource) {
    withStringField(
        EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(), normalizedSource);
    return this;
  }

  /**
   * Add the positive smiley term to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withPositiveSmiley() {
    withStringField(
        EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
        EarlybirdFieldConstant.HAS_POSITIVE_SMILEY);
    return this;
  }

  /**
   * Add the negative smiley term to the internal field.
   */
  public EarlybirdThriftDocumentBuilder withNegativeSmiley() {
    withStringField(
        EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(),
        EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY);
    return this;
  }

  /**
   * Add the question mark label to the text field.
   */
  public EarlybirdThriftDocumentBuilder withQuestionMark() {
    withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), QUESTION_MARK);
    return this;
  }

  /**
   * Add card-related fields.
   */
  public EarlybirdThriftDocumentBuilder withSearchCard(
      String name,
      String domain,
      String title, byte[] serializedTitleStream,
      String description, byte[] serializedDescriptionStream,
      String lang) {
    if (isNotBlank(title)) {
      withTokenStreamField(
          EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(),
          title, serializedTitleStream);
    }

    if (isNotBlank(description)) {
      withTokenStreamField(
          EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(),
          description, serializedDescriptionStream);
    }

    if (isNotBlank(lang)) {
      withStringField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), lang);
    }

    if (isNotBlank(domain)) {
      withStringField(
          EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(), domain);
    }

    if (isNotBlank(name)) {
      withStringField(
          EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), name);
      withIntField(
          EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(),
          SearchCardType.cardTypeFromStringName(name).getByteValue());
    }

    if (AMPLIFY_CARD_NAME.equalsIgnoreCase(name)
        || PLAYER_CARD_NAME.equalsIgnoreCase(name)) {
      // Add into the "internal" field so that this tweet is returned by filter:videos.
      addFacetSkipList(
          EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName());
    }

    return this;
  }

  /**
   * Add a normalized minimum engagement field.
   * See {@link EarlybirdThriftDocumentUtil#addNormalizedMinEngagementField}.
   */
  public EarlybirdThriftDocumentBuilder withNormalizedMinEngagementField(
      String fieldName, int normalizedNumEngagements) throws IOException {
    EarlybirdThriftDocumentUtil.addNormalizedMinEngagementField(doc, fieldName,
        normalizedNumEngagements);
    return this;
  }

  /**
   * Add a named entity with the given canonical name and type to the document.
   */
  public EarlybirdThriftDocumentBuilder withNamedEntity(NamedEntity namedEntity) {
    if (namedEntity.getContexts() == null) {
      // In this unlikely case, we don't have any context for named entity type or source,
      // so we can't properly index it in any of our fields. We'll just skip it in this case.
      return this;
    }

    // Keep track of the fields we've applied in the builder already, to ensure we only index
    // each term (field/value pair) once.
    Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied = new HashSet<>();
    for (NamedEntityContext context : namedEntity.getContexts()) {
      if (context.isSetInput_source()
          && NAMED_ENTITY_URL_SOURCE_TYPES.contains(context.getInput_source().getSource_type())) {
        // If the source is one of the URL* types, add the named entity to the "from_url" fields,
        // ensuring we add it only once.
        addNamedEntityFields(
            fieldsApplied,
            EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD,
            EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD,
            namedEntity.getCanonical_name(),
            context);
      } else {
        addNamedEntityFields(
            fieldsApplied,
            EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD,
            EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD,
            namedEntity.getCanonical_name(),
            context);
      }
    }

    return this;
  }

  /**
   * Add space id fields.
   */
  public EarlybirdThriftDocumentBuilder withSpaceIdFields(Set<String> spaceIds) {
    if (!spaceIds.isEmpty()) {
      addFacetSkipList(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName());
      for (String spaceId : spaceIds) {
        withStringField(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(), spaceId);
      }
    }
    return this;
  }

  /**
   * Add the directed-at user.
   */
  @VisibleForTesting
  public EarlybirdThriftDocumentBuilder withDirectedAtUser(final long directedAtUserId) {
    withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(),
        directedAtUserId);

    withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), directedAtUserId);

    return this;
  }

  /**
   * Add a whitespace-tokenized screen name field.
   *
   * Example:
   * screenName - "super_hero"
   * tokenized version - "super hero"
   */
  public EarlybirdThriftDocumentBuilder withWhiteSpaceTokenizedScreenNameField(
      String fieldName,
      String normalizedScreenName) {
    String whiteSpaceTokenizableScreenName = StringUtils.join(
        normalizedScreenName.split(Regex.HASHTAG_USERNAME_PUNCTUATION_REGEX), " ");
    withStringField(fieldName, whiteSpaceTokenizableScreenName);
    return this;
  }

  /**
   * Add a camelCase-tokenized screen name field.
   */
  public EarlybirdThriftDocumentBuilder withCamelCaseTokenizedScreenNameField(
      String fieldName,
      String screenName,
      String normalizedScreenName,
      TokenStream screenNameTokenStream) {

    // This normalized text is consistent with how the tokenized stream is created by
    // TokenizerHelper.getNormalizedCamelcaseTokenStream - i.e. just lowercasing.
    String camelCaseTokenizedScreenNameText =
        TokenizerHelper.getNormalizedCamelcaseTokenStreamText(screenName);
    try {
      // Reset the token stream in case it has been read before.
      screenNameTokenStream.reset();
      byte[] camelCaseTokenizedScreenName =
          TweetTokenStreamSerializer.getTweetTokenStreamSerializer()
              .serialize(screenNameTokenStream);

      withTokenStreamField(
          fieldName,
          camelCaseTokenizedScreenNameText.isEmpty()
              ? normalizedScreenName : camelCaseTokenizedScreenNameText,
          camelCaseTokenizedScreenName);
    } catch (IOException e) {
      LOG.error("TwitterTokenStream serialization error! Could not serialize: " + screenName);
      SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT.increment();
    }
    return this;
  }

  private void addNamedEntityFields(
      Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied,
      EarlybirdFieldConstant nameOnlyField,
      EarlybirdFieldConstant nameWithTypeField,
      String name,
      NamedEntityContext context) {
    withOneTimeStringField(fieldsApplied, nameOnlyField, name, false);
    if (context.isSetEntity_type()) {
      withOneTimeStringField(fieldsApplied, nameWithTypeField,
          formatNamedEntityString(name, context.getEntity_type()), true);
    }
  }

  private void withOneTimeStringField(
      Set<Pair<EarlybirdFieldConstant, String>> fieldsApplied, EarlybirdFieldConstant field,
      String value, boolean addToFacets) {
    Pair<EarlybirdFieldConstant, String> fieldValuePair = new Pair<>(field, value);
    if (!fieldsApplied.contains(fieldValuePair)) {
      if (addToFacets) {
        addFacetSkipList(field.getFieldName());
      }
      withStringField(field.getFieldName(), value);
      fieldsApplied.add(fieldValuePair);
    }
  }

  private String formatNamedEntityString(String name, WholeEntityType type) {
    return String.format("%s:%s", name, type).toLowerCase();
  }
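
  // Illustrative example (not in the original source), assuming a WholeEntityType
  // value that prints as "PERSON": formatNamedEntityString("Ada Lovelace", type)
  // would yield "ada lovelace:person", the term indexed into the
  // NAMED_ENTITY_WITH_TYPE_* fields above.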

  /**
   * Set whether a default LAT_LON_CSF_FIELD value should still be added before building
   * when LAT_LON_CSF_FIELD was not set explicitly.
   *
   * @see #prepareToBuild()
   */
  public EarlybirdThriftDocumentBuilder setAddLatLonCSF(boolean isSet) {
    addLatLonCSF = isSet;
    return this;
  }

  /**
   * Set whether the encoded tweet features field should be added at the end.
   *
   * @see #prepareToBuild()
   */
  public EarlybirdThriftDocumentBuilder setAddEncodedTweetFeatures(boolean isSet) {
    addEncodedTweetFeatures = isSet;
    return this;
  }

  @Override
  protected void prepareToBuild() {
    if (!isSetLatLonCSF && addLatLonCSF) {
      // In Lucene archives, this CSF is needed regardless of whether geoLocation is set.
      withLatLonCSF(GeoUtil.ILLEGAL_LATLON, GeoUtil.ILLEGAL_LATLON);
    }

    if (addEncodedTweetFeatures) {
      // Add encoded_tweet_features before building the document.
      withBytesField(
          EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName(),
          EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(encodedTweetFeatures));
    }

    if (extendedEncodedTweetFeatures != null) {
      // Add extended_encoded_tweet_features before building the document.
      withBytesField(
          EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName(),
          EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(extendedEncodedTweetFeatures));
    }
  }

  private static boolean isNotBlank(String value) {
    return value != null && !value.isEmpty();
  }

  private static boolean isNotEmpty(List<?> value) {
    return value != null && !value.isEmpty();
  }
}
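
// Illustrative usage sketch (not part of the original file). The variable names
// and field values are hypothetical; the chained calls are the ones defined
// above, and the final build step is assumed to come from the parent
// ThriftDocumentBuilder:
//
//   EarlybirdThriftDocumentBuilder builder = new EarlybirdThriftDocumentBuilder(
//       encodedFeatures, null /* no extended features */, idMapping, schema);
//   builder
//       .withID(tweetId)
//       .withCreatedAt(createdAtSec)
//       .withFromUser(fromUserId, "jack", "jack")
//       .withHashtagsField(ImmutableList.of("opensource"))
//       .withReplyFlag();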
Binary file not shown.
@ -1,377 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import java.io.IOException;
import java.util.Iterator;
import java.util.List;

import com.google.common.collect.ImmutableList;

import com.twitter.common.text.util.TokenStreamSerializer;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.ThriftDocumentUtil;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftDocument;
import com.twitter.search.common.schema.thriftjava.ThriftField;
import com.twitter.search.common.schema.thriftjava.ThriftFieldData;
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
import com.twitter.search.common.util.analysis.TwitterNormalizedMinEngagementTokenStream;

/**
 * Utility APIs for ThriftDocument used in Earlybird.
 */
public final class EarlybirdThriftDocumentUtil {
  private static final EarlybirdFieldConstants ID_MAPPING = new EarlybirdFieldConstants();

  private static final String FILTER_FORMAT_STRING = "__filter_%s";

  /**
   * Used to check whether a thrift document has the nullcast filter term set in the
   * internal field.
   * @see #isNullcastFilterSet(ThriftDocument)
   */
  private static final String NULLCAST_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.NULLCAST_FILTER_TERM);

  private static final String SELF_THREAD_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM);

  private static final String DIRECTED_AT_FILTER_TERM =
      formatFilter(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM);

  private EarlybirdThriftDocumentUtil() {
    // Cannot instantiate.
  }

  /**
   * Formats a regular, simple filter term. The 'filter' argument should correspond to a constant
   * from the Operator class, matching the operand (filter:links -> "links").
   */
  public static final String formatFilter(String filter) {
    return String.format(FILTER_FORMAT_STRING, filter);
  }
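
  // Illustrative example (not in the original source):
  //   formatFilter("links") -> "__filter_links"
  // This matches the terms written by
  // EarlybirdThriftDocumentBuilder#addFilterInternalFieldTerm.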

  /**
   * Get the status id.
   */
  public static long getID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.ID_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the card name.
   */
  public static String getCardName(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the card language.
   */
  public static String getCardLang(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.CARD_LANG.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the card language CSF.
   *
   * The card language CSF is represented internally as an integer ID for a ThriftLanguage.
   */
  public static int getCardLangCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getIntValue(
        document, EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the quoted tweet id.
   */
  public static long getQuotedTweetID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the quoted tweet user id.
   */
  public static long getQuotedUserID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the directed-at user id.
   */
  public static long getDirectedAtUserId(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the directed-at user id CSF.
   */
  public static long getDirectedAtUserIdCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the reference author id CSF.
   */
  public static long getReferenceAuthorIdCSF(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the links.
   */
  public static List<String> getLinks(ThriftDocument document) {
    return getStringValues(document, EarlybirdFieldConstant.LINKS_FIELD);
  }

  /**
   * Get the created at timestamp in seconds.
   */
  public static int getCreatedAtSec(ThriftDocument document) {
    return ThriftDocumentUtil.getIntValue(
        document, EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the created at timestamp in milliseconds.
   */
  public static long getCreatedAtMs(ThriftDocument document) {
    long createdAtSec = (long) getCreatedAtSec(document);
    return createdAtSec * 1000L;
  }

  /**
   * Get the from user id.
   */
  public static long getFromUserID(ThriftDocument document) {
    return ThriftDocumentUtil.getLongValue(
        document, EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the from user.
   */
  public static String getFromUser(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the tokenized from-user display name.
   */
  public static String getFromUserDisplayName(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the tokenized from user.
   */
  public static String getTokenizedFromUser(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the resolved links text.
   */
  public static String getResolvedLinksText(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValue(
        document, EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * Get the ISO language codes.
   */
  public static List<String> getISOLanguage(ThriftDocument document) {
    return ThriftDocumentUtil.getStringValues(
        document, EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), ID_MAPPING);
  }

  /**
   * First remove the old timestamps if they exist.
   * Then add the created at and created at CSF fields to the given thrift document.
   */
  public static void replaceCreatedAtAndCreatedAtCSF(ThriftDocument document, int value) {
    removeField(document, EarlybirdFieldConstant.CREATED_AT_FIELD);
    removeField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD);

    addIntField(document, EarlybirdFieldConstant.CREATED_AT_FIELD, value);
    addIntField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD, value);
  }

  /**
   * Add the given int value as the given field into the given document.
   */
  public static ThriftDocument addIntField(
      ThriftDocument document, EarlybirdFieldConstant fieldConstant, int value) {
    ThriftFieldData fieldData = new ThriftFieldData().setIntValue(value);
    ThriftField field =
        new ThriftField().setFieldConfigId(fieldConstant.getFieldId()).setFieldData(fieldData);
    document.addToFields(field);
    return document;
  }

  private static EarlybirdFieldConstant getFeatureField(EarlybirdFieldConstant field) {
    if (field.getFieldName().startsWith(
        EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName())) {
      return EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD;
    } else if (field.getFieldName().startsWith(
        EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName())) {
      return EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD;
    } else {
      throw new IllegalArgumentException("Not a feature field: " + field);
    }
  }

  /**
   * Get the feature value of a field.
   */
  public static int getFeatureValue(
      ImmutableSchemaInterface schema,
      ThriftDocument document,
      EarlybirdFieldConstant field) {

    EarlybirdFieldConstant featureField = getFeatureField(field);

    byte[] encodedFeaturesBytes =
        ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING);

    if (encodedFeaturesBytes == null) {
      // Treat the feature value as 0 if there is no encoded feature field.
      return 0;
    } else {
      EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes(
          schema, featureField, encodedFeaturesBytes, 0);
      return encodedFeatures.getFeatureValue(field);
    }
  }

  /**
   * Check whether the feature flag is set.
   */
  public static boolean isFeatureBitSet(
      ImmutableSchemaInterface schema,
      ThriftDocument document,
      EarlybirdFieldConstant field) {

    EarlybirdFieldConstant featureField = getFeatureField(field);

    byte[] encodedFeaturesBytes =
        ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING);

    if (encodedFeaturesBytes == null) {
      // Treat the bit as not set if there is no encoded feature field.
      return false;
    } else {
      EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes(
          schema, featureField, encodedFeaturesBytes, 0);
      return encodedFeatures.isFlagSet(field);
    }
  }

  /**
   * Check whether the nullcast flag is set in the encoded features field.
   */
  public static boolean isNullcastBitSet(ImmutableSchemaInterface schema, ThriftDocument document) {
    return isFeatureBitSet(schema, document, EarlybirdFieldConstant.IS_NULLCAST_FLAG);
  }

  /**
   * Remove all fields with the given field constant in a document.
   */
  public static void removeField(ThriftDocument document, EarlybirdFieldConstant fieldConstant) {
    List<ThriftField> fields = document.getFields();
    if (fields != null) {
      Iterator<ThriftField> fieldsIterator = fields.iterator();
      while (fieldsIterator.hasNext()) {
        if (fieldsIterator.next().getFieldConfigId() == fieldConstant.getFieldId()) {
          fieldsIterator.remove();
        }
      }
    }
  }

  /**
   * Remove a string field with the given fieldConstant and value.
   */
  public static void removeStringField(
      ThriftDocument document, EarlybirdFieldConstant fieldConstant, String value) {
    List<ThriftField> fields = document.getFields();
    if (fields != null) {
      for (ThriftField field : fields) {
        if (field.getFieldConfigId() == fieldConstant.getFieldId()
            && field.getFieldData().getStringValue().equals(value)) {
          fields.remove(field);
          return;
        }
      }
    }
  }

  /**
   * Adds a new TokenStream field for each engagement counter if normalizedNumEngagements >= 1.
   */
  public static void addNormalizedMinEngagementField(
      ThriftDocument doc,
      String fieldName,
      int normalizedNumEngagements) throws IOException {
    if (normalizedNumEngagements < 1) {
      return;
    }
    TokenStreamSerializer serializer =
        new TokenStreamSerializer(ImmutableList.of(new IntTermAttributeSerializer()));
    TwitterNormalizedMinEngagementTokenStream stream = new
        TwitterNormalizedMinEngagementTokenStream(normalizedNumEngagements);
    byte[] serializedStream = serializer.serialize(stream);
    ThriftFieldData fieldData = new ThriftFieldData().setTokenStreamValue(serializedStream);
    ThriftField field = new ThriftField().setFieldConfigId(ID_MAPPING.getFieldID(fieldName))
        .setFieldData(fieldData);
    doc.addToFields(field);
  }

  public static List<String> getStringValues(
      ThriftDocument document, EarlybirdFieldConstant field) {
    return ThriftDocumentUtil.getStringValues(document, field.getFieldName(), ID_MAPPING);
  }

  public static boolean isNullcastFilterSet(ThriftDocument document) {
    return isFilterSet(document, NULLCAST_FILTER_TERM);
  }

  public static boolean isSelfThreadFilterSet(ThriftDocument document) {
    return isFilterSet(document, SELF_THREAD_FILTER_TERM);
  }

  public static String getSelfThreadFilterTerm() {
    return SELF_THREAD_FILTER_TERM;
  }

  public static String getDirectedAtFilterTerm() {
    return DIRECTED_AT_FILTER_TERM;
  }

  public static boolean isDirectedAtFilterSet(ThriftDocument document) {
    return isFilterSet(document, DIRECTED_AT_FILTER_TERM);
  }

  /**
   * Check whether the given filter is set in the internal field.
   */
  private static boolean isFilterSet(ThriftDocument document, String filter) {
    List<String> terms = ThriftDocumentUtil.getStringValues(
        document, EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), ID_MAPPING);
    for (String term : terms) {
      if (filter.equals(term)) {
        return true;
      }
    }
    return false;
  }
}
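
// Illustrative usage sketch (not part of the original file); "document" is a
// hypothetical ThriftDocument assembled by EarlybirdThriftDocumentBuilder:
//
//   long tweetId = EarlybirdThriftDocumentUtil.getID(document);
//   long createdAtMs = EarlybirdThriftDocumentUtil.getCreatedAtMs(document);
//   boolean isNullcast = EarlybirdThriftDocumentUtil.isNullcastFilterSet(document);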
Binary file not shown.
@ -1,336 +0,0 @@
package com.twitter.search.common.schema.earlybird;

import javax.annotation.Nullable;

import com.twitter.search.common.config.Config;

public enum FlushVersion {
  /* =======================================================
   * Versions
   * ======================================================= */
  VERSION_0("Initial version of partition flushing."),
  VERSION_1("Added timestamps and corresponding mapper to SegmentData."),
  VERSION_2("Add column stride fields."),
  VERSION_3("Change facet field configuration."),
  VERSION_4("Add per term offensive counters to parallel posting arrays."),
  VERSION_5("Add native photo facet."),
  VERSION_6("Add UserFeature column stride field"),
  VERSION_7("Index segment optimizations; new facet data structures."),
  VERSION_8("Store statuses in memory in Earlybird."),
  VERSION_9("Index from_user_ids into a searchable field."),
  VERSION_10("Change from_user_id dictionary from fst to mphf"),
  VERSION_11("Write image and video facet in separate lucene field."),
  VERSION_12("Add retweeted status ID to the sparse CSF."),
  VERSION_13("Add isOffensive field for profanity filter."),
  VERSION_14("Fix features column stride field corruption."),
  VERSION_15("Upgrade Lucene version, which has a different FST serialization format."),
  VERSION_16("Remove maxDoc in favor of lastDocID"),
  VERSION_17("Added partition and timeslice identifiers to SegmentData."),
  VERSION_18("Per-term payloads"),
  VERSION_19("Multiple per-doc payload fields"),
  VERSION_20("Unify and fix hash codes"),
  VERSION_21("Super awesome new flexible realtime posting list format."),
  VERSION_22("Added new geo implementation."),
  VERSION_23("Upgrade to Lucene 4.0.0 Final"),
  VERSION_24("Added tweet topic ids."),
  VERSION_25("Turn on skip list for mention facet."),
  VERSION_26("Added new EncodedTweetFeaturesColumnStrideField."),
  VERSION_27("Topic ids facet field."),
  VERSION_28("From-user discover stories skiplist field."),
  VERSION_29("Move tokenized screen name to the new username field"),
  VERSION_30("Enable HF term pairs index."),
  VERSION_31("Remove reverse doc ids."),
  VERSION_32("Switch shared status id CSF to non-sparse long CSF index."),
  VERSION_33("New skip lists for optimized high df posting lists."),
  VERSION_34("Store tweet signature in EarlybirdEncodedFeatures."),
  VERSION_35("Don't store shared status id csf in archive indexes."),
  VERSION_36("Don't store norms."),
  VERSION_37("64 bit user ids."),
  VERSION_38("Index links in archive."),
  VERSION_39("Fix pic.twitter.com image link handling not setting the internal field correctly."),
  VERSION_40("Fix all archive tweets being marked as replies."),
  VERSION_41("Avoid flushing event_ids field; event clusters are applied as updates."),
  VERSION_42("No position fields refactoring; made a few fields to not use position."),
  VERSION_43("Index private geo coordinates"),
  VERSION_44("Materialize last doc id in HighDFCompressedPostinglists", true),
  VERSION_45("Removing from_user_id facets support", true),
  VERSION_46("Guard against badly out of order tweets in the search archive.", true),
  VERSION_47("Added card title and description fields.", true),
  VERSION_48("Added card type CSF.", true),
  VERSION_49("Lucene 4.4 upgrade", true),
  VERSION_50("Put mem-archive back on non-lucene optimized indexes", true),
  VERSION_51("Force index rebuild to fix blank text field. See SEARCH-2505.", true),
  VERSION_52("Refactoring of docValues/CSF.", true),
  VERSION_53("Remove SegmentData.Configuration", true),
  VERSION_54("Fix bad indices caused by SEARCH-2723.", true),
  VERSION_55("Fixed non-deterministic facetIds across restarts. SEARCH-2815.", true),
  VERSION_56("Flush FacetIDMap.", true),
  VERSION_57("Remove LatLonMapper and use standard DocValues instead.", true),
  VERSION_58("Longterm Attribute Optimization.", true),
  VERSION_59("Renamed archive segment names. Current segment is no longer mutable.", true),
  // Flush versions 60 and 59 have the same format.
  // The flush version was increased to trigger a rebuild, because we noticed incomplete segments.
  // More details can be found on SEARCH-3664.
  VERSION_60("Flush version change to trigger segment rebuild.", true),
  VERSION_61("Adding back from_user_id", true),
  VERSION_62("Add retweet facet.", true),
  VERSION_63("Switch to new index API in com.twitter.search.core.earlybird.", true),
  VERSION_64("Sort merge archive day and part-* data. SEARCH-4692.", true),
  VERSION_65("Fix ID_FIELD and CREATED_AT_FIELD sort order. SEARCH-4004 SEARCH-912 ", true),
  VERSION_66("Rebuild data for 1/5/2015. Data on HDFS fixed as part of SEARCH-5347.", true),
  VERSION_67("Upgrade to Lucene 4.10.3.", true),
  VERSION_68("Switching to Penguin v4", true),
  VERSION_69("Fix 16% archive segments: SEARCH-6073", true),
  VERSION_70("Switching to Penguin v4 for full archive cluster. SEARCH-5302", true),
  VERSION_71("Switching to Penguin v4 for ssd archive cluster.", true),
  VERSION_72("Added Escherbird annotations for full archive.", true),
  VERSION_73("Lucene 5.2.1 upgrade.", true, 0),
  VERSION_74("Handle geo scrubbed data and archive geo index accuracy", true, 0),
  VERSION_75("Delete from_user_id_stories from indices", true, 0),
  VERSION_76("Allow multiple index extensions.", true, 0),
  VERSION_77("Removed EarlybirdCodec", true, 0),
  // minor version 2: added embedded tweet features
  // minor version 3: change embedded tweet features to INC_ONLY
  VERSION_78("Added 80 bytes of extended features", true, 3),
  // minor version 1: SEARCH-8564 - Reference Tweet Author ID, using
|
||||
// EXTENDED_TEST_FEATURE_UNUSED_BITS_2 and EXTENDED_TEST_FEATURE_UNUSED_BITS_3
|
||||
VERSION_79("Renamed UNUSED_BIT to HAS_VISIBLE_LINK", true, 1),
|
||||
// minor version 2: SEARCH-8564 / http://go/rb/770373
|
||||
// Made REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT and
|
||||
// REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT immutable field
|
||||
VERSION_80("Facet for links: SEARCH-8331", true, 2),
|
||||
// minor version 1: added video view count
|
||||
VERSION_81("Adding LowDF posting list with packed ints", true, 1),
|
||||
VERSION_82("Enabling HighDF posting list with packed ints", true, 0),
|
||||
// minor version 1: SEARCH-9379 - Added bitset for nullcast tweets
|
||||
// minor version 2: SEARCH-8765 - Added visible token ratio
|
||||
VERSION_83("Add bits in encoded features for media type flags. SEARCH-9131", true, 2),
|
||||
VERSION_84("Enable archive rebuild for __has_links field. SEARCH-9635", true, 0),
|
||||
// minor version 1: SEARCHQUAL-8130, add engagement v2
|
||||
VERSION_85("New archive build gen for missing geo data. SEARCH-9894", true, 1),
|
||||
VERSION_86("Added new fields to the index", true, 0),
|
||||
// During this rebuild both the statuses and the engagement counts were regenerated.
|
||||
// minor version 1: added quote_count
|
||||
VERSION_87("Periodic archive full rebuild. SEARCH-9423", true, 1),
|
||||
// minor version 1: make new tokenized user name/handle fields textSearchable
|
||||
// (see go/rb/847134/)
|
||||
// minor version 2: added has_quote
|
||||
VERSION_88("Fixing missing day in the full archive index. SEARCH-11233", true, 2),
|
||||
VERSION_89("Index and store conversation ids.", true, 0),
|
||||
VERSION_90("Fixing inconsistent days in the full archive index. SEARCH-11744", true, 0),
|
||||
VERSION_91("Making in_reply_to_user_id field use MPH. SEARCH-10836", true, 0),
|
||||
VERSION_92("Allow searches by any field. SEARCH-11251", true, 0),
|
||||
// During this rebuild we regenerated engagement counts and merged the annotations in the
|
||||
// aggregate job.
|
||||
VERSION_93("Periodic archive full rebuild. SEARCH-11076", true, 0),
|
||||
// minor version 1: add ThriftCSFViewSettings.outputCSFType
|
||||
VERSION_94("Indexing a bunch of geo fields. SEARCH-10283", true, 1),
|
||||
VERSION_95("Removing topic ID fields. SEARCH-8616", true, 0),
|
||||
// minor version 1: add ThriftCSFViewSettings.normalizationType
|
||||
VERSION_96("Enabling conversation ID for all clusters. SEARCH-11989", true, 1),
|
||||
// minor version 1: set several feature configuration to be correct double type
|
||||
// minor version 2: set some more feature configuration to be correct double type
|
||||
// minor version 3: add safety labels SEARCHQUAL-9561
|
||||
// minor version 4: add weighted engagement counts SEARCHQUAL-9574
|
||||
// minor version 5: add Dopamine non personalized score SEARCHQUAL-9743
|
||||
VERSION_97("Changing CSF type to BOOLEAN for some has_* flags.", true, 5),
|
||||
VERSION_98("Periodic archive full rebuild. PCM-56871.", true, 1),
|
||||
VERSION_99("Removing named_entities field. SEARCH-13708", true, 0),
|
||||
// minor version 1: add periscope features (SEARCHQUAL-10008)
|
||||
// minor version 2: add raw_earlybird_score to TweetExternalFeatures (SEARCHQUAL-10347)
|
||||
VERSION_100("Upgrade Penguin Version from V4 to V6. SEARCH-12991", true, 2),
|
||||
// minor version 1: adjust for normalizer type for some engagement counters (SEARCHQUAL-9537)
|
||||
// minor version 2: add decaying engagement counts and last engaged timestamps (SEARCHQUAL-10532)
|
||||
VERSION_101("Add emoji to the index. SEARCH-12991", true, 2),
|
||||
VERSION_102("Periodic full archive rebuild. PCM-67851", true, 0),
|
||||
VERSION_103("Add liked_by_user_id field. SEARCH-15341", true, 0),
|
||||
// minor version 1: remove last engaged timestamp with 3-hour increment (SEARCHQUAL-10903)
|
||||
// minor version 2: add fake engagement counts (SEARCHQUAL-10795)
|
||||
// minor version 3: add last engaged timestamp with 1-hour increment (SEARCHQUAL-10942)
|
||||
VERSION_104("Reverting to the 20170109_pc100_par30 build gen. SEARCH-15731", true, 3),
|
||||
VERSION_105("Add 3 new fields to archive index for engagement features. SEARCH-16102", true, 0),
|
||||
// This is the last rebuild based on /tables/statuses. Starting 9/14 this build-gen is powered
|
||||
// by TweetSource. During this rebuild both statuses and engagement counts were rebuilt.
|
||||
VERSION_106("Periodic archive full rebuild. PCM-74652", true, 0),
|
||||
VERSION_107("Removing card fields from full archive index.", true, 0),
|
||||
VERSION_108("Removing the tms_id field from all schemas.", true, 0),
|
||||
VERSION_109("Removing LAT_LON_FIELD from all schemas.", true, 0),
|
||||
VERSION_110("Adding the card fields back to the full archive index.", true, 1),
|
||||
// minor version 1: Add composer source csf field (SEARCH-22494)
|
||||
VERSION_111("Adding composer_source to index. SEARCH-20377.", true, 1),
|
||||
VERSION_112("Partial rebuild to fix SEARCH-22529.", true, 0),
|
||||
VERSION_113("Full archive build gen 20180312_pc100_par30.", true, 0),
|
||||
VERSION_114("Fix for SEARCH-23761.", true, 0),
|
||||
VERSION_115("Add fields for quoted tweets. SEARCH-23919", true, 0),
|
||||
// minor version 1: Add 4 bit hashtag count, mention count and stock count (SEARCH-24336)
|
||||
VERSION_116("Bump flush version for scrubbing pipeline. SEARCH-24225", true, 1),
|
||||
VERSION_117("Add retweeted_by_user_id and replied_to_by_user_id fields. SEARCH-24463", true, 0),
|
||||
// minor version 1: Removed dopamine_non_personalized_score (SEARCHQUAL-10321)
|
||||
VERSION_118("Adding the reply and retweet source tweet IDs: SEARCH-23702, SEARCH-24502", true, 1),
|
||||
// minor version 1: add blink engagement counts (SEARCHQUAL-15176)
|
||||
VERSION_119("Remove public inferred location: SEARCH-24235", true, 1),
|
||||
VERSION_120("Flush extensions before fields when flushing segments.", true, 0),
|
||||
VERSION_121("Flush the startingDocIdForSearch field. SEARCH-25464.", true, 0),
|
||||
VERSION_122("Do not flush the startingDocIdForSearch field.", true, 0),
|
||||
VERSION_123("Renaming the largestDocID flushed property to firstAddedDocID.", true, 0),
|
||||
VERSION_124("Use the skip list posting list for all fields.", true, 0),
|
||||
VERSION_125("Use hashmap for tweet ID lookup.", true, 0),
|
||||
VERSION_126("Use the skip list posting list for all fields.", true, 0),
|
||||
VERSION_127("Flushing the min and max doc IDs in each segment.", true, 0),
|
||||
VERSION_128("Add card_lang to index. SEARCH-26539", true, 0),
|
||||
VERSION_129("Move the tweet ID mapper to the segment data.", true, 0),
|
||||
VERSION_130("Move the time mapper to the segment data.", true, 0),
|
||||
VERSION_131("Change the facets classes to work with any doc IDs.", true, 0),
|
||||
VERSION_132("Make the CSF classes work with any doc IDs.", true, 0),
|
||||
VERSION_133("Removing smallestDocID property.", true, 0),
|
||||
VERSION_134("Optimize DeletedDocs before flushing.", true, 0),
|
||||
VERSION_135("Add payloads to skiplists.", true, 0),
|
||||
VERSION_136("Add name to int pools.", true, 0),
|
||||
VERSION_137("Add unsorted stream offset.", true, 0),
|
||||
VERSION_138("Switch to the OutOfOrderRealtimeTweetIDMapper.", true, 0),
|
||||
VERSION_139("Remove realtime posting lists.", true, 0),
|
||||
VERSION_140("Add named_entity field. SEARCH-27547", true, 0),
|
||||
VERSION_141("Flush the out of order updates count.", true, 0),
|
||||
VERSION_142("Add named_entity facet support. SEARCH-28054", true, 0),
|
||||
VERSION_143("Index updates before optimizing segment.", true, 0),
|
||||
VERSION_144("Refactor TermsArray.", true, 0),
|
||||
VERSION_145("Remove SmallestDocID.", true, 0),
|
||||
VERSION_146("Add entity_id facet support. SEARCH-28071", true, 0),
|
||||
VERSION_147("Enable updating facets", true, 0),
|
||||
VERSION_148("Rename the counter for feature updates to partial updates", true, 0),
|
||||
VERSION_149("Stop flushing offsets for sorted updates DL streams.", true, 0),
|
||||
VERSION_150("Update the name of the property for the updates DL stream offset.", true, 0),
|
||||
VERSION_151("Upgrade Lucene version to 5.5.5.", true, 0),
|
||||
VERSION_152("Upgrade Lucene version to 6.0.0.", true, 0),
|
||||
VERSION_153("Upgrade Lucene version to 6.6.6.", true, 0),
|
||||
VERSION_154("Store the timeslice ID on EarlybirdIndexSegmentData.", true, 0),
|
||||
VERSION_155("Do not flush index extensions.", true, 0),
|
||||
VERSION_156("Deprecate ThriftIndexedFieldSettings.defaultFieldBoost.", true, 0),
|
||||
VERSION_157("Load CREATED_AT_CSF_FIELD into RAM in archive.", true, 0),
|
||||
VERSION_158("Added directed at user ID field and CSF.", true, 0),
|
||||
VERSION_159("Changing deleted docs serialization format.", true, 0),
|
||||
VERSION_160("Add fields for health model scores. SEARCH-31907, HML-2099", true, 0),
|
||||
VERSION_161("Switch to the 'search' Kafka cluster.", true, 0),
|
||||
VERSION_162("Update Lucene version to 7.0.0.", true, 0),
|
||||
VERSION_163("Update Lucene version to 7.7.2.", true, 0),
|
||||
// minor version 1: add IS_TRENDING_NOW_FLAG
|
||||
VERSION_164("Collect per-term stats in the realtime segments.", true, 1),
|
||||
VERSION_165("Update Lucene version to 8.5.2.", true, 0),
|
||||
VERSION_166("Serialize maxPosition field for InvertedRealtimeIndex", true, 0),
|
||||
VERSION_167("Add field for pSpammyTweetScore. HML-2557", true, 0),
|
||||
VERSION_168("Add field for pReportedTweetScore. HML-2644", true, 0),
|
||||
VERSION_169("Add field for spammyTweetContentScore. PFM-70", true, 0),
|
||||
VERSION_170("Add reference author id CSF. SEARCH-34715", true, 0),
|
||||
VERSION_171("Add space_id field. SEARCH-36156", true, 0),
|
||||
VERSION_172("Add facet support for space_id. SEARCH-36388", true, 0),
|
||||
VERSION_173("Add space admin and title fields. SEARCH-36986", true, 0),
|
||||
VERSION_174("Switching to Penguin v7 for realtime-exp0 cluster. SEARCH-36068", true, 0),
|
||||
VERSION_175("Adding exclusive conversation author id CSF", true, 0),
|
||||
VERSION_176("Adding card URI CSF", true, 0),
|
||||
// minor version 1: add FROM_BLUE_VERIFIED_ACCOUNT_FLAG
|
||||
// minor version 2: Adding new cluster REALTIME_CG. SEARCH-45692
|
||||
VERSION_177("Adding URL Description and Title fields. SEARCH-41641", true, 2),
|
||||
|
||||
/**
|
||||
* This semi colon is on a separate line to avoid polluting git blame history.
|
||||
* Put a comma after the new enum field you're adding.
|
||||
*/;
|
||||
|
||||
// The current version.
|
||||
public static final FlushVersion CURRENT_FLUSH_VERSION =
|
||||
FlushVersion.values()[FlushVersion.values().length - 1];
|
||||
|
||||
public static final String DELIMITER = "_v_";
|
||||
|
||||
/* =======================================================
|
||||
* Helper methods
|
||||
* ======================================================= */
|
||||
private final String description;
|
||||
private final boolean isOfficial;
|
||||
private final int minorVersion;
|
||||
|
||||
/**
|
||||
* A flush version is not official unless explicitly stated to be official.
|
||||
* An unofficial flush version is never uploaded to HDFS.
|
||||
*/
|
||||
private FlushVersion(String description) {
|
||||
this(description, false, 0);
|
||||
}
|
||||
|
||||
private FlushVersion(String description, boolean isOfficial) {
|
||||
this(description, isOfficial, 0);
|
||||
}
|
||||
|
||||
private FlushVersion(String description, boolean isOfficial, int minorVersion) {
|
||||
this.description = description;
|
||||
this.isOfficial = isOfficial;
|
||||
this.minorVersion = minorVersion;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns file extension with version number.
|
||||
*/
|
||||
public String getVersionFileExtension() {
|
||||
if (this == VERSION_0) {
|
||||
return "";
|
||||
} else {
|
||||
return DELIMITER + ordinal();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns file extension given flush version number.
|
||||
* If the flush version is unknown (e.g. higher than current flush version or lower than 0), null
|
||||
* is returned.
|
||||
*/
|
||||
@Nullable
|
||||
public static String getVersionFileExtension(int flushVersion) {
|
||||
if (flushVersion > CURRENT_FLUSH_VERSION.ordinal() || flushVersion < 0) {
|
||||
return null;
|
||||
} else {
|
||||
return FlushVersion.values()[flushVersion].getVersionFileExtension();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a string describing the current schema version.
|
||||
* @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getVersionDescription()}
|
||||
*/
|
||||
@Deprecated
|
||||
public String getDescription() {
|
||||
return description;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the schema's major version.
|
||||
* @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getMajorVersionNumber()}.
|
||||
*/
|
||||
@Deprecated
|
||||
public int getVersionNumber() {
|
||||
return this.ordinal();
|
||||
}
|
||||
|
||||
public boolean onOrAfter(FlushVersion other) {
|
||||
return compareTo(other) >= 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns whether the schema version is official. Only official segments are uploaded to HDFS.
|
||||
* @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#isVersionOfficial()}.
|
||||
*/
|
||||
@Deprecated
|
||||
public boolean isOfficial() {
|
||||
// We want the loading/flushing tests to pass locally even if the version is not meant
|
||||
// to be an official version.
|
||||
return isOfficial || Config.environmentIsTest();
|
||||
}
|
||||
|
||||
/**
|
||||
* As of now, this is hardcoded to 0. We will start using this soon.
|
||||
* @deprecated Please consult schema for minor version. This should only be used to build schema.
|
||||
*/
|
||||
@Deprecated
|
||||
public int getMinorVersion() {
|
||||
return minorVersion;
|
||||
}
|
||||
}
|
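A minimal sketch of how the ordinal-based versioning above behaves. The FlushVersion values and methods are from the enum itself; the main() harness is a hypothetical addition for illustration only.

// Hypothetical demo class, not part of the original diff.
public final class FlushVersionDemo {
  public static void main(String[] args) {
    // VERSION_0 flushes with no extension; later versions append DELIMITER + ordinal.
    System.out.println(FlushVersion.VERSION_0.getVersionFileExtension());    // ""
    System.out.println(FlushVersion.VERSION_177.getVersionFileExtension());  // "_v_177"
    // Out-of-range version numbers map to null rather than throwing.
    System.out.println(FlushVersion.getVersionFileExtension(-1));            // null
    // The newest declared value is always the current flush version.
    System.out.println(FlushVersion.CURRENT_FLUSH_VERSION);                  // VERSION_177
  }
}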
Binary file not shown.
@ -1,71 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;

import org.apache.lucene.search.DocIdSetIterator;

public class AndNotDocIdSetIterator extends DocIdSetIterator {
  private int nextDelDoc;
  private final DocIdSetIterator baseIter;
  private final DocIdSetIterator notIter;
  private int currID;

  /** Creates a new AndNotDocIdSetIterator instance. */
  public AndNotDocIdSetIterator(DocIdSetIterator baseIter, DocIdSetIterator notIter)
      throws IOException {
    nextDelDoc = notIter.nextDoc();
    this.baseIter = baseIter;
    this.notIter = notIter;
    currID = -1;
  }

  @Override
  public int advance(int target) throws IOException {
    currID = baseIter.advance(target);
    if (currID == DocIdSetIterator.NO_MORE_DOCS) {
      return currID;
    }

    if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
      if (currID < nextDelDoc) {
        return currID;
      } else if (currID == nextDelDoc) {
        return nextDoc();
      } else {
        nextDelDoc = notIter.advance(currID);
        if (currID == nextDelDoc) {
          return nextDoc();
        }
      }
    }
    return currID;
  }

  @Override
  public int docID() {
    return currID;
  }

  @Override
  public int nextDoc() throws IOException {
    currID = baseIter.nextDoc();
    if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) {
      while (currID != DocIdSetIterator.NO_MORE_DOCS) {
        if (currID < nextDelDoc) {
          return currID;
        } else {
          if (currID == nextDelDoc) {
            currID = baseIter.nextDoc();
          }
          nextDelDoc = notIter.advance(currID);
        }
      }
    }
    return currID;
  }

  @Override
  public long cost() {
    return baseIter.cost();
  }
}
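A usage sketch for the iterator above, using the IntArrayDocIdSetIterator defined later in this diff as a convenient test source; the doc ID values are made up.

// Hypothetical usage, not part of the original diff: iterate base docs minus "not" docs.
DocIdSetIterator base = new IntArrayDocIdSetIterator(new int[] {1, 3, 5, 7});
DocIdSetIterator not = new IntArrayDocIdSetIterator(new int[] {3, 7});
DocIdSetIterator andNot = new AndNotDocIdSetIterator(base, not);
int doc;
while ((doc = andNot.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  System.out.println(doc); // prints 1, then 5
}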
@ -1,33 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/com/twitter/elephantbird:core",
        "3rdparty/jvm/geo/google:geoGoogle",
        "3rdparty/jvm/log4j",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/lucene:lucene-facet",
        "3rdparty/jvm/org/apache/lucene:lucene-queries",
        "3rdparty/jvm/org/apache/lucene:lucene-spatial-extras",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/collections",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/query",
        "src/java/com/twitter/search/common/schema",
        "src/java/com/twitter/search/common/schema/base",
        "src/java/com/twitter/search/common/util/spatial",
        "src/java/com/twitter/search/queryparser",
        "src/thrift/com/twitter/search/common:facets-java",
        "src/thrift/com/twitter/search/common:query-java",
    ],
)
BIN
src/java/com/twitter/search/common/search/BUILD.docx
Normal file
Binary file not shown.
@ -1,75 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;
import java.util.List;

import javax.annotation.Nullable;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.apache.lucene.search.ScoreMode;

import com.twitter.common.util.Clock;
import com.twitter.search.common.query.thriftjava.CollectorParams;

/**
 * A {@link com.twitter.search.common.search.TwitterEarlyTerminationCollector}
 * that delegates actual hit collection to a sub collector.
 */
public final class DelegatingEarlyTerminationCollector
    extends TwitterEarlyTerminationCollector {
  private final Collector subCollector;
  private LeafCollector subLeafCollector;

  /** Creates a new DelegatingEarlyTerminationCollector instance. */
  public DelegatingEarlyTerminationCollector(Collector subCollector,
                                             CollectorParams collectorParams,
                                             TerminationTracker terminationTracker,
                                             @Nullable QueryCostProvider queryCostProvider,
                                             int numDocsBetweenTimeoutChecks,
                                             Clock clock) {
    super(
        collectorParams,
        terminationTracker,
        queryCostProvider,
        numDocsBetweenTimeoutChecks,
        clock);
    this.subCollector = subCollector;
  }

  @Override
  public void setScorer(Scorable scorer) throws IOException {
    super.setScorer(scorer);
    subLeafCollector.setScorer(scorer);
  }

  @Override
  protected void doCollect() throws IOException {
    subLeafCollector.collect(curDocId);
  }

  @Override
  protected void doFinishSegment(int lastSearchedDocID) throws IOException {
    if (subCollector instanceof TwitterCollector) {
      ((TwitterCollector) subCollector).finishSegment(lastSearchedDocID);
    }
  }

  @Override
  public void setNextReader(LeafReaderContext context) throws IOException {
    super.setNextReader(context);
    subLeafCollector = subCollector.getLeafCollector(context);
  }

  @Override
  public ScoreMode scoreMode() {
    return subCollector.scoreMode();
  }

  @Override
  public List<String> getDebugInfo() {
    return null;
  }
}
BIN
src/java/com/twitter/search/common/search/DocIdTracker.docx
Normal file
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.search.common.search;

/**
 * Provide an accessor for a doc ID. This is useful for classes that iterate through doc IDs
 * and maintain a "last seen" doc ID.
 */
public interface DocIdTracker {
  /**
   * Retrieve the current doc ID.
   */
  int getCurrentDocId();
}
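A hypothetical minimal implementation, for illustration only; the TerminationTracker later in this diff consumes trackers like this to find the last fully-searched doc ID.

// Hypothetical implementation, not part of the original diff.
class LastSeenDocIdTracker implements DocIdTracker {
  private int currentDocId = -1;

  // Called by an iterating owner as it moves through doc IDs in ascending order.
  void onDocSearched(int docId) {
    currentDocId = docId;
  }

  @Override
  public int getCurrentDocId() {
    return currentDocId;
  }
}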
Binary file not shown.
@ -1,51 +0,0 @@
package com.twitter.search.common.search;

import javax.annotation.Nonnull;

import com.google.common.base.Preconditions;

import com.twitter.search.common.metrics.SearchCounter;

/**
 * This is not an enum to allow different clusters to define their own EarlyTerminationStates.
 */
public final class EarlyTerminationState {
  private static final String STATS_PREFIX = "early_termination_";

  public static final EarlyTerminationState COLLECTING =
      new EarlyTerminationState("no_early_termination", false);
  public static final EarlyTerminationState TERMINATED_TIME_OUT_EXCEEDED =
      new EarlyTerminationState("terminated_timeout_exceeded", true);
  public static final EarlyTerminationState TERMINATED_MAX_QUERY_COST_EXCEEDED =
      new EarlyTerminationState("terminated_max_query_cost_exceeded", true);
  public static final EarlyTerminationState TERMINATED_MAX_HITS_EXCEEDED =
      new EarlyTerminationState("terminated_max_hits_exceeded", true);
  public static final EarlyTerminationState TERMINATED_NUM_RESULTS_EXCEEDED =
      new EarlyTerminationState("terminated_num_results_exceeded", true);

  // This string can be returned as a part of a search response, to tell the searcher
  // why the search got early terminated.
  private final String terminationReason;
  private final boolean terminated;
  private final SearchCounter count;

  public EarlyTerminationState(@Nonnull String terminationReason, boolean terminated) {
    this.terminationReason = Preconditions.checkNotNull(terminationReason);
    this.terminated = terminated;
    count = SearchCounter.export(STATS_PREFIX + terminationReason + "_count");
  }

  public boolean isTerminated() {
    return terminated;
  }

  public String getTerminationReason() {
    return terminationReason;
  }

  public void incrementCount() {
    count.increment();
  }
}
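Because EarlyTerminationState is a plain class rather than an enum, a cluster can declare additional states alongside the predefined ones. A hypothetical example:

// Hypothetical cluster-specific state, not part of the original diff. Constructing it
// exports an "early_termination_terminated_index_too_cold_count" counter as a side effect.
public static final EarlyTerminationState TERMINATED_INDEX_TOO_COLD =
    new EarlyTerminationState("terminated_index_too_cold", true);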
Binary file not shown.
@ -1,65 +0,0 @@
package com.twitter.search.common.search;

import java.util.LinkedHashSet;
import java.util.Set;

import org.apache.lucene.search.Query;
import org.apache.lucene.spatial.prefix.tree.Cell;
import org.apache.lucene.spatial.prefix.tree.CellIterator;
import org.apache.lucene.util.BytesRef;

import com.twitter.search.common.util.spatial.GeohashChunkImpl;
import com.twitter.search.queryparser.util.GeoCode;

import geo.google.datamodel.GeoAddressAccuracy;

public final class GeoQuadTreeQueryBuilderUtil {
  private GeoQuadTreeQueryBuilderUtil() {
  }

  /**
   * Build a geo quad tree query around the given geo code, based on the geo field.
   * @param geocode the geo location for the quad tree query
   * @param field the field where the geohash tokens are indexed
   * @return the corresponding geo quad tree query
   */
  public static Query buildGeoQuadTreeQuery(GeoCode geocode, String field) {
    Set<BytesRef> geoHashSet = new LinkedHashSet<>();

    // If accuracy is specified, add a term query based on accuracy.
    if (geocode.accuracy != GeoAddressAccuracy.UNKNOWN_LOCATION.getCode()) {
      BytesRef termRef = new BytesRef(GeohashChunkImpl.buildGeoStringWithAccuracy(geocode.latitude,
                                                                                  geocode.longitude,
                                                                                  geocode.accuracy));
      geoHashSet.add(termRef);
    }

    // If distance is specified, add term queries based on distance.
    if (geocode.distanceKm != GeoCode.DOUBLE_DISTANCE_NOT_SET) {
      // Build query based on distance
      int treeLevel = -1;
      // First find the block containing the query point with a diagonal greater than 2 * radius.
      Cell centerNode = GeohashChunkImpl.getGeoNodeByRadius(geocode.latitude, geocode.longitude,
                                                            geocode.distanceKm);
      // Add center node querying term
      if (centerNode != null) {
        geoHashSet.add(centerNode.getTokenBytesNoLeaf(new BytesRef()));
        treeLevel = centerNode.getLevel();
      }

      // This improves edge case recall, by adding cells also intersecting the query area.
      CellIterator nodes = GeohashChunkImpl.getNodesIntersectingCircle(geocode.latitude,
                                                                       geocode.longitude,
                                                                       geocode.distanceKm,
                                                                       treeLevel);
      // If there are other nodes intersecting the query circle, also add them in.
      if (nodes != null) {
        while (nodes.hasNext()) {
          geoHashSet.add(nodes.next().getTokenBytesNoLeaf(new BytesRef()));
        }
      }
    }

    return new com.twitter.search.common.query.MultiTermDisjunctionQuery(field, geoHashSet);
  }
}
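A hypothetical usage sketch. It assumes GeoCode's latitude/longitude/distanceKm/accuracy members are publicly settable, as the direct field accesses above suggest; the no-arg constructor and the field name "geohash" are made-up illustration details.

// Hypothetical usage, not part of the original diff.
GeoCode geocode = new GeoCode(); // assuming a no-arg constructor exists
geocode.latitude = 37.7749;
geocode.longitude = -122.4194;
geocode.distanceKm = 5.0; // 5 km radius around the point
geocode.accuracy = GeoAddressAccuracy.UNKNOWN_LOCATION.getCode(); // skip the accuracy term
Query query = GeoQuadTreeQueryBuilderUtil.buildGeoQuadTreeQuery(geocode, "geohash");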
Binary file not shown.
@ -1,76 +0,0 @@
package com.twitter.search.common.search;

import java.util.Arrays;

import org.apache.lucene.search.DocIdSetIterator;

/**
 * DocIdSetIterator implementation from a sorted list of non-negative integers. If the given list of
 * doc IDs is not sorted or contains negative doc IDs, the results are undefined.
 */
public class IntArrayDocIdSetIterator extends DocIdSetIterator {
  private final int[] docIds;
  private int docId;
  private int cursor;

  public IntArrayDocIdSetIterator(int[] ids) {
    docIds = ids;
    reset();
  }

  /** Used for testing. */
  public void reset() {
    docId = -1;
    cursor = -1;
  }

  @Override
  public int docID() {
    return docId;
  }

  @Override
  public int nextDoc() {
    return advance(docId);
  }

  @Override
  public int advance(int target) {
    if (docId == NO_MORE_DOCS) {
      return docId;
    }

    if (target < docId) {
      return docId;
    }

    if (cursor == docIds.length - 1) {
      docId = NO_MORE_DOCS;
      return docId;
    }

    if (target == docId) {
      docId = docIds[++cursor];
      return docId;
    }

    int toIndex = Math.min(cursor + (target - docId) + 1, docIds.length);
    int targetIndex = Arrays.binarySearch(docIds, cursor + 1, toIndex, target);
    if (targetIndex < 0) {
      targetIndex = -targetIndex - 1;
    }

    if (targetIndex == docIds.length) {
      docId = NO_MORE_DOCS;
    } else {
      cursor = targetIndex;
      docId = docIds[cursor];
    }
    return docId;
  }

  @Override
  public long cost() {
    return docIds == null ? 0 : docIds.length;
  }
}
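A short sketch of the advance() contract implemented above; the doc IDs are made up.

// Hypothetical usage, not part of the original diff.
IntArrayDocIdSetIterator it = new IntArrayDocIdSetIterator(new int[] {2, 4, 8, 16});
it.advance(5);   // lands on 8, the first doc ID >= 5
it.docID();      // 8
it.nextDoc();    // 16
it.nextDoc();    // DocIdSetIterator.NO_MORE_DOCS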
Binary file not shown.
@ -1,82 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;

import com.google.common.base.Preconditions;

import org.apache.lucene.search.DocIdSetIterator;

/**
 * Disjunction over 2 DocIdSetIterators. This should be faster than a disjunction over N since there
 * would be no need to adjust the heap.
 */
public class PairDocIdSetIterator extends DocIdSetIterator {

  private final DocIdSetIterator d1;
  private final DocIdSetIterator d2;

  private int doc = -1;

  /** Creates a new PairDocIdSetIterator instance. */
  public PairDocIdSetIterator(DocIdSetIterator d1, DocIdSetIterator d2) throws IOException {
    Preconditions.checkNotNull(d1);
    Preconditions.checkNotNull(d2);
    this.d1 = d1;
    this.d2 = d2;
    // position the iterators
    this.d1.nextDoc();
    this.d2.nextDoc();
  }

  @Override
  public int docID() {
    return doc;
  }

  @Override
  public int nextDoc() throws IOException {
    int doc1 = d1.docID();
    int doc2 = d2.docID();
    DocIdSetIterator iter = null;
    if (doc1 < doc2) {
      doc = doc1;
      //d1.nextDoc();
      iter = d1;
    } else if (doc1 > doc2) {
      doc = doc2;
      //d2.nextDoc();
      iter = d2;
    } else {
      doc = doc1;
      //d1.nextDoc();
      //d2.nextDoc();
    }

    if (doc != NO_MORE_DOCS) {
      if (iter != null) {
        iter.nextDoc();
      } else {
        d1.nextDoc();
        d2.nextDoc();
      }
    }
    return doc;
  }

  @Override
  public int advance(int target) throws IOException {
    if (d1.docID() < target) {
      d1.advance(target);
    }
    if (d2.docID() < target) {
      d2.advance(target);
    }
    return (doc != NO_MORE_DOCS) ? nextDoc() : doc;
  }

  @Override
  public long cost() {
    // very coarse estimate
    return d1.cost() + d2.cost();
  }

}
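A usage sketch for the pair disjunction above; the doc IDs are made up, and the IntArrayDocIdSetIterator from earlier in this diff is used as the source.

// Hypothetical usage, not part of the original diff: union of two posting lists.
DocIdSetIterator d1 = new IntArrayDocIdSetIterator(new int[] {1, 4, 9});
DocIdSetIterator d2 = new IntArrayDocIdSetIterator(new int[] {4, 7});
DocIdSetIterator union = new PairDocIdSetIterator(d1, d2); // constructor may throw IOException
int doc;
while ((doc = union.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  System.out.println(doc); // prints 1, 4, 7, 9
}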
BIN
src/java/com/twitter/search/common/search/QueryCostProvider.docx
Normal file
Binary file not shown.
@ -1,9 +0,0 @@
package com.twitter.search.common.search;

/**
 * Any class that can track and return query cost.
 */
public interface QueryCostProvider {
  /** Returns the total cost. */
  double getTotalCost();
}
Binary file not shown.
@ -1,202 +0,0 @@
package com.twitter.search.common.search;

import java.util.HashSet;
import java.util.Set;

import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;

/**
 * Used for tracking termination criteria for earlybird queries.
 *
 * Currently this tracks the query timeout and query cost, if they are set on the
 * {@link com.twitter.search.common.query.thriftjava.CollectorTerminationParams}.
 */
public class TerminationTracker {
  /** Query start time provided by the client. */
  private final long clientStartTimeMillis;

  /** Timeout end time, calculated from {@link #clientStartTimeMillis}. */
  private final long timeoutEndTimeMillis;

  /** Query start time recorded at the earlybird server. */
  private final long localStartTimeMillis;

  /** Tracking query cost */
  private final double maxQueryCost;

  // Sometimes we want to early terminate before timeoutEndTimeMillis, to reserve time for
  // work that needs to be done after early termination (e.g. merging results).
  private final int postTerminationOverheadMillis;

  // We don't check for early termination often enough. Sometimes requests time out in between
  // early termination checks. This buffer time is also subtracted from the deadline.
  // To illustrate how this is used, let's use a simple example:
  // If we spent 750ms searching 5 segments, a rough estimate is that we need 150ms to search
  // one segment. If the timeout is set to 800ms, we should not start searching the next segment.
  // In this case, one can set preTerminationSafeBufferTimeMillis to 150ms, so that when the early
  // termination check computes the deadline, this buffer is also subtracted. See SEARCH-29723.
  private int preTerminationSafeBufferTimeMillis = 0;

  private EarlyTerminationState earlyTerminationState = EarlyTerminationState.COLLECTING;

  // This flag determines whether the last searched doc ID trackers should be consulted when a
  // timeout occurs.
  private final boolean useLastSearchedDocIdOnTimeout;

  private final Set<DocIdTracker> lastSearchedDocIdTrackers = new HashSet<>();

  /**
   * Creates a new termination tracker that will not specify a timeout or max query cost.
   * Can be used for queries that explicitly do not want to use a timeout. Meant to be used for
   * tests, and background queries running for the query cache.
   */
  public TerminationTracker(Clock clock) {
    this.clientStartTimeMillis = clock.nowMillis();
    this.localStartTimeMillis = clientStartTimeMillis;
    this.timeoutEndTimeMillis = Long.MAX_VALUE;
    this.maxQueryCost = Double.MAX_VALUE;
    this.postTerminationOverheadMillis = 0;
    this.useLastSearchedDocIdOnTimeout = false;
  }

  /**
   * Convenience overload of
   * {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}.
   */
  public TerminationTracker(
      CollectorTerminationParams terminationParams, Clock clock,
      int postTerminationOverheadMillis) {
    this(terminationParams, clock.nowMillis(), clock, postTerminationOverheadMillis);
  }

  /**
   * Convenience overload of
   * {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}.
   */
  public TerminationTracker(
      CollectorTerminationParams terminationParams, int postTerminationOverheadMillis) {
    this(
        terminationParams,
        System.currentTimeMillis(),
        Clock.SYSTEM_CLOCK,
        postTerminationOverheadMillis);
  }

  /**
   * Creates a new TerminationTracker instance.
   *
   * @param terminationParams CollectorParams.CollectorTerminationParams carrying parameters
   *                          about early termination.
   * @param clientStartTimeMillis The query start time (in millis) specified by the client. This is
   *                              used to calculate the timeout end time,
   *                              {@link #timeoutEndTimeMillis}.
   * @param clock used to sample {@link #localStartTimeMillis}.
   * @param postTerminationOverheadMillis How much time should be reserved. E.g. if the request
   *                                      timeout is 800ms, and this is set to 200ms, early
   *                                      termination will kick in at the 600ms mark.
   */
  public TerminationTracker(
      CollectorTerminationParams terminationParams,
      long clientStartTimeMillis,
      Clock clock,
      int postTerminationOverheadMillis) {
    Preconditions.checkNotNull(terminationParams);
    Preconditions.checkArgument(postTerminationOverheadMillis >= 0);

    this.clientStartTimeMillis = clientStartTimeMillis;
    this.localStartTimeMillis = clock.nowMillis();

    if (terminationParams.isSetTimeoutMs()
        && terminationParams.getTimeoutMs() > 0) {
      Preconditions.checkState(terminationParams.getTimeoutMs() >= postTerminationOverheadMillis);
      this.timeoutEndTimeMillis = this.clientStartTimeMillis + terminationParams.getTimeoutMs();
    } else {
      // Effectively no timeout.
      this.timeoutEndTimeMillis = Long.MAX_VALUE;
    }

    // Tracking query cost
    if (terminationParams.isSetMaxQueryCost()
        && terminationParams.getMaxQueryCost() > 0) {
      maxQueryCost = terminationParams.getMaxQueryCost();
    } else {
      maxQueryCost = Double.MAX_VALUE;
    }

    this.useLastSearchedDocIdOnTimeout = terminationParams.isEnforceQueryTimeout();
    this.postTerminationOverheadMillis = postTerminationOverheadMillis;
  }

  /**
   * Returns the deadline used by early termination checks: the timeout end time, with the
   * post-termination overhead and the pre-termination safe buffer subtracted.
   */
  public long getTimeoutEndTimeWithReservation() {
    // Return a huge value if the timeout is disabled.
    if (timeoutEndTimeMillis == Long.MAX_VALUE) {
      return timeoutEndTimeMillis;
    } else {
      return timeoutEndTimeMillis
          - postTerminationOverheadMillis
          - preTerminationSafeBufferTimeMillis;
    }
  }

  public void setPreTerminationSafeBufferTimeMillis(int preTerminationSafeBufferTimeMillis) {
    Preconditions.checkArgument(preTerminationSafeBufferTimeMillis >= 0);

    this.preTerminationSafeBufferTimeMillis = preTerminationSafeBufferTimeMillis;
  }

  public long getLocalStartTimeMillis() {
    return localStartTimeMillis;
  }

  public long getClientStartTimeMillis() {
    return clientStartTimeMillis;
  }

  public double getMaxQueryCost() {
    return maxQueryCost;
  }

  public boolean isEarlyTerminated() {
    return earlyTerminationState.isTerminated();
  }

  public EarlyTerminationState getEarlyTerminationState() {
    return earlyTerminationState;
  }

  public void setEarlyTerminationState(EarlyTerminationState earlyTerminationState) {
    this.earlyTerminationState = earlyTerminationState;
  }

  /**
   * Returns the minimum searched doc ID amongst all registered trackers, or -1 if there aren't any
   * trackers. Doc IDs are stored in ascending order, and trackers update their doc IDs as they
   * search, so the minimum doc ID reflects the most recent fully searched doc ID.
   */
  int getLastSearchedDocId() {
    return lastSearchedDocIdTrackers.stream()
        .mapToInt(DocIdTracker::getCurrentDocId).min().orElse(-1);
  }

  void resetDocIdTrackers() {
    lastSearchedDocIdTrackers.clear();
  }

  /**
   * Adds a DocIdTracker, to keep track of the last fully-searched doc ID when early termination
   * occurs.
   */
  public void addDocIdTracker(DocIdTracker docIdTracker) {
    lastSearchedDocIdTrackers.add(docIdTracker);
  }

  public boolean useLastSearchedDocIdOnTimeout() {
    return useLastSearchedDocIdOnTimeout;
  }
}
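A worked sketch of the deadline arithmetic above, under made-up numbers: an 800 ms client timeout, 100 ms reserved for post-termination work, and a 150 ms per-segment safety buffer.

// Hypothetical usage, not part of the original diff.
CollectorTerminationParams params = new CollectorTerminationParams()
    .setTimeoutMs(800);
TerminationTracker tracker = new TerminationTracker(
    params, 10_000L /* clientStartTimeMillis */, Clock.SYSTEM_CLOCK, 100);
tracker.setPreTerminationSafeBufferTimeMillis(150);
// timeoutEndTimeMillis = 10_000 + 800 = 10_800
// deadline = 10_800 - 100 - 150 = 10_550
long deadline = tracker.getTimeoutEndTimeWithReservation(); // 10_550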
BIN
src/java/com/twitter/search/common/search/TwitterCollector.docx
Normal file
Binary file not shown.
@ -1,31 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;

import org.apache.lucene.search.Collector;

/**
 * Lucene Collectors throw CollectionTerminatedException to perform early termination.
 * We don't believe that throwing Exceptions to control execution flow is ideal, so we are adding
 * this class to be the base of all Twitter Collectors.
 *
 * {@link com.twitter.search.common.search.TwitterIndexSearcher} uses the {@link #isTerminated()}
 * method to perform early termination, instead of relying on CollectionTerminatedException.
 */
public abstract class TwitterCollector implements Collector {

  /**
   * Subclasses should return true if they want to perform early termination.
   * This method is called on every hit and should not be expensive.
   */
  public abstract boolean isTerminated() throws IOException;

  /**
   * The Lucene API only has a method that's called before searching a segment, setNextReader().
   * This hook is called after finishing searching a segment.
   * @param lastSearchedDocID is the last doc ID searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc need not be a hit,
   * and should not be collected here.
   */
  public abstract void finishSegment(int lastSearchedDocID) throws IOException;
}
Binary file not shown.
@ -1,328 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;
import java.util.List;
import javax.annotation.Nonnull;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;

/**
 * A TwitterCollector containing the most common early termination logic, based on
 * timeout, cost, and max hits. This class does not do any actual hit collection; it
 * is abstract and cannot be instantiated.
 *
 * If a Collector and all its subclasses need early termination, it should extend this class.
 *
 * However, if one just wants to add early termination to any single collector, they can just
 * use {@link DelegatingEarlyTerminationCollector} as a wrapper.
 */
public abstract class TwitterEarlyTerminationCollector
    extends TwitterCollector implements LeafCollector {
  private static final Logger LOG = LoggerFactory.getLogger(TwitterEarlyTerminationCollector.class);
  private static final SearchCounter NEGATIVE_TIME_PER_SEGMENT =
      SearchCounter.export("TwitterEarlyTerminationCollector_negative_time_per_segment");
  private static final SearchRateCounter QUERY_TIMEOUT_ENFORCED =
      SearchRateCounter.export("TwitterEarlyTerminationCollector_query_timeout_enforced");

  protected int curDocId = -1;

  protected Scorable scorer = null;
  private LeafReader curReader = null;
  private final long maxHitsToProcess;
  private long numHitsProcessed = 0;
  private int lastEarlyTerminationCheckDocId = -1;
  private final Clock clock;

  @Nullable
  private final QueryCostProvider queryCostProvider;

  private final TerminationTracker terminationTracker;

  // This determines how often the expensive early termination check is performed.
  // If set to a negative number, the expensive early termination check is only performed at
  // segment boundaries. If set to a positive number X, this check is performed every X docs
  // processed.
  private int numDocsBetweenTimeoutChecks;

  // Number of segments searched so far.
  // This is used to predictively early terminate.
  // Expensive early termination checks may not happen often enough. Sometimes the request
  // times out in between the termination checks.
  // After finishing searching a segment, we estimate how much time is needed to search one
  // segment on average. If searching the next segment would cause a timeout, we early terminate.
  private int numSearchedSegments = 0;

  /**
   * Creates a new TwitterEarlyTerminationCollector instance.
   *
   * @param collectorParams the parameters needed to guide early termination.
   * @param terminationTracker If null is passed in, a new TerminationTracker is created.
   *                           Otherwise, the one passed in is used.
   * @param numDocsBetweenTimeoutChecks TerminationTracker-based checks are performed upon a hit
   *                                    every numDocsBetweenTimeoutChecks docs. If a non-positive
   *                                    number is passed in, TerminationTracker-based checks are
   *                                    disabled. If collectorParams specifies a value as well,
   *                                    that value is used.
   */
  public TwitterEarlyTerminationCollector(
      CollectorParams collectorParams,
      TerminationTracker terminationTracker,
      @Nullable QueryCostProvider queryCostProvider,
      int numDocsBetweenTimeoutChecks,
      Clock clock) {
    CollectorTerminationParams terminationParams = collectorParams.getTerminationParams();

    if (terminationParams == null) {
      terminationParams = new CollectorTerminationParams()
          .setMaxHitsToProcess(Integer.MAX_VALUE)
          .setMaxQueryCost(Double.MAX_VALUE)
          .setTimeoutMs(Integer.MAX_VALUE);
    }

    if (!terminationParams.isSetMaxHitsToProcess() || terminationParams.getMaxHitsToProcess() < 0) {
      maxHitsToProcess = Integer.MAX_VALUE;
    } else {
      maxHitsToProcess = terminationParams.getMaxHitsToProcess();
    }

    if (terminationParams.isSetNumDocsBetweenTimeoutChecks()) {
      this.numDocsBetweenTimeoutChecks = terminationParams.getNumDocsBetweenTimeoutChecks();
    } else {
      this.numDocsBetweenTimeoutChecks = numDocsBetweenTimeoutChecks;
    }

    this.terminationTracker = Preconditions.checkNotNull(terminationTracker);
    this.queryCostProvider = queryCostProvider;
    this.clock = clock;
  }

  public final LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
    this.setNextReader(context);
    return this;
  }

  /**
   * Sub-classes may override this to add more collection logic.
   */
  protected abstract void doCollect() throws IOException;

  /**
   * Sub-classes may override this to add more segment completion logic.
   * @param lastSearchedDocID is the last doc ID searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit!
   */
  protected abstract void doFinishSegment(int lastSearchedDocID) throws IOException;

  /**
   * Subclasses can override this to perform more early termination checks.
   */
  public EarlyTerminationState innerShouldCollectMore() throws IOException {
    return EarlyTerminationState.COLLECTING;
  }

  /**
   * After early termination, this method can be used to retrieve the early termination reason.
   */
  @Nonnull
  public final EarlyTerminationState getEarlyTerminationState() {
    return terminationTracker.getEarlyTerminationState();
  }

  protected final EarlyTerminationState setEarlyTerminationState(
      EarlyTerminationState newEarlyTerminationState) {
    terminationTracker.setEarlyTerminationState(newEarlyTerminationState);
    return newEarlyTerminationState;
  }

  @Override
  public final boolean isTerminated() throws IOException {
    EarlyTerminationState earlyTerminationState = getEarlyTerminationState();

    if (earlyTerminationState.isTerminated()) {
      return true;
    }

    if (getNumHitsProcessed() >= getMaxHitsToProcess()) {
      collectedEnoughResults();
      if (shouldTerminate()) {
        return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED)
            .isTerminated();
      } else {
        return false;
      }
    }

    return innerShouldCollectMore().isTerminated();
  }

  /**
   * Note: subclasses overriding this method are expected to call "super.setNextReader"
   * in their setNextReader().
   * @deprecated Remove this method in favor of {@link #getLeafCollector(LeafReaderContext)}
   */
  @Deprecated
  public void setNextReader(LeafReaderContext context) throws IOException {
    if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
      expensiveEarlyTerminationCheck();
    }

    // Reset curDocId for next segment
    curDocId = -1;
    lastEarlyTerminationCheckDocId = -1;
    curReader = context.reader();
  }

  /**
   * Sub-classes overriding this method are expected to call super.setScorer()
   */
  @Override
  public void setScorer(Scorable scorer) throws IOException {
    this.scorer = scorer;
  }

  @Override
  public final void collect(int doc) throws IOException {
    curDocId = doc;
    doCollect();
    numHitsProcessed++;
    if (numDocsBetweenTimeoutChecks > 0
        && (curDocId - lastEarlyTerminationCheckDocId) >= numDocsBetweenTimeoutChecks) {
      lastEarlyTerminationCheckDocId = curDocId;

      if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
        expensiveEarlyTerminationCheck();
      }
    }
  }

  /**
   * Accounting for a segment searched.
   * @param lastSearchedDocID is the last doc ID searched before termination,
   * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit!
   */
  protected final void trackCompleteSegment(int lastSearchedDocID) throws IOException {
    doFinishSegment(lastSearchedDocID);
  }

  @Override
  public final void finishSegment(int lastSearchedDocID) throws IOException {
    // Finished searching a segment. Compute the average time needed to search a segment.
    Preconditions.checkState(curReader != null, "Did subclass call super.setNextReader()?");
    numSearchedSegments++;

    long totalTime = clock.nowMillis() - terminationTracker.getLocalStartTimeMillis();

    if (totalTime >= Integer.MAX_VALUE) {
      String msg = String.format(
          "%s: A query ran for %d ms, which is longer than Integer.MAX_VALUE ms. "
              + "lastSearchedDocID: %d",
          getClass().getSimpleName(), totalTime, lastSearchedDocID
      );
      LOG.error(msg);
      throw new IllegalStateException(msg);
    }

    int timePerSegment = ((int) totalTime) / numSearchedSegments;

    if (timePerSegment < 0) {
      NEGATIVE_TIME_PER_SEGMENT.increment();
      timePerSegment = 0;
    }

    // If we're enforcing timeout via the last searched doc ID, we don't need to add this buffer,
    // since we'll detect the timeout right away.
    if (!terminationTracker.useLastSearchedDocIdOnTimeout()) {
      terminationTracker.setPreTerminationSafeBufferTimeMillis(timePerSegment);
    }

    // Check whether we timed out and are checking for timeout at the leaves. If so, we should use
    // the captured lastSearchedDocId from the tracker instead, which is the most up-to-date amongst
    // the query nodes.
    if (terminationTracker.useLastSearchedDocIdOnTimeout()
        && EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED.equals(
            terminationTracker.getEarlyTerminationState())) {
      QUERY_TIMEOUT_ENFORCED.increment();
      trackCompleteSegment(terminationTracker.getLastSearchedDocId());
    } else {
      trackCompleteSegment(lastSearchedDocID);
    }

    // We finished a segment, so clear out the DocIdTrackers. The next segment will register its
    // own trackers, and we don't need to keep the trackers from the current segment.
    terminationTracker.resetDocIdTrackers();

    curDocId = -1;
    curReader = null;
    scorer = null;
  }

  /**
   * More expensive early termination checks, which are not called on every hit.
   * This sets the EarlyTerminationState if it decides that early termination should kick in.
   * See: SEARCH-29723.
   */
  private void expensiveEarlyTerminationCheck() {
    if (queryCostProvider != null) {
      double totalQueryCost = queryCostProvider.getTotalCost();
      double maxQueryCost = terminationTracker.getMaxQueryCost();
      if (totalQueryCost >= maxQueryCost) {
        setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_QUERY_COST_EXCEEDED);
      }
    }

    final long nowMillis = clock.nowMillis();
    if (nowMillis >= terminationTracker.getTimeoutEndTimeWithReservation()) {
      setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
    }
  }

  public long getMaxHitsToProcess() {
    return maxHitsToProcess;
  }

  public final void setNumHitsProcessed(long numHitsProcessed) {
    this.numHitsProcessed = numHitsProcessed;
  }

  protected final long getNumHitsProcessed() {
    return numHitsProcessed;
  }

  protected final int getNumSearchedSegments() {
    return numSearchedSegments;
  }

  protected final Clock getClock() {
    return clock;
  }

  @VisibleForTesting
  protected final TerminationTracker getTerminationTracker() {
    return this.terminationTracker;
  }

  protected void collectedEnoughResults() throws IOException {
  }

  protected boolean shouldTerminate() {
    return true;
  }

  /**
   * Debug info collected during execution.
   */
  public abstract List<String> getDebugInfo();
}
Binary file not shown.
@ -1,189 +0,0 @@
package com.twitter.search.common.search;

import java.io.IOException;
import java.util.List;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.Collector;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.LeafCollector;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.Weight;

/**
 * An IndexSearcher that works with TwitterEarlyTerminationCollector.
 * If a stock Lucene collector is passed into search(), this IndexSearcher.search() behaves the
 * same as Lucene's stock IndexSearcher. However, if a TwitterEarlyTerminationCollector is passed
 * in, this IndexSearcher performs early termination without relying on
 * CollectionTerminatedException.
 */
public class TwitterIndexSearcher extends IndexSearcher {
  public TwitterIndexSearcher(IndexReader r) {
    super(r);
  }

  /**
   * search() main loop.
   * This behaves exactly like IndexSearcher.search() if a stock Lucene collector is passed in.
   * However, if a TwitterCollector is passed in, this class performs Twitter-style early
   * termination without relying on
   * {@link org.apache.lucene.search.CollectionTerminatedException}.
   */
  @Override
  protected void search(List<LeafReaderContext> leaves, Weight weight, Collector coll)
      throws IOException {

    // If a TwitterCollector is passed in, we can do a few extra things in here, such
    // as early termination. Otherwise we can just fall back to IndexSearcher.search().
    if (coll instanceof TwitterCollector) {
      TwitterCollector collector = (TwitterCollector) coll;

      for (LeafReaderContext ctx : leaves) { // search each subreader
        if (collector.isTerminated()) {
          return;
        }

        // Notify the collector that we're starting this segment, and check the early
        // termination criteria again. getLeafCollector() performs 'expensive' early
        // termination checks in some implementations, such as TwitterEarlyTerminationCollector.
        LeafCollector leafCollector = collector.getLeafCollector(ctx);
        if (collector.isTerminated()) {
          return;
        }

        // Initialize the scorer - it should not be null. Note that constructing the scorer
        // may actually do real work, such as advancing to the first hit.
        Scorer scorer = weight.scorer(ctx);

        if (scorer == null) {
          collector.finishSegment(DocIdSetIterator.NO_MORE_DOCS);
          continue;
        }

        leafCollector.setScorer(scorer);

        // Start searching.
        DocIdSetIterator docIdSetIterator = scorer.iterator();
        int docID = docIdSetIterator.nextDoc();
        if (docID != DocIdSetIterator.NO_MORE_DOCS) {
          // Collect results. Note: check isTerminated() before calling nextDoc().
          do {
            leafCollector.collect(docID);
          } while (!collector.isTerminated()
              && (docID = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS);
        }

        // Always finish the segment, providing the last docID advanced to.
        collector.finishSegment(docID);
      }
    } else {
      // The collector given is not a TwitterCollector; just use stock Lucene search().
      super.search(leaves, weight, coll);
    }
  }

  /** Returns {@link NumericDocValues} for this field, or
   * null if no {@link NumericDocValues} were indexed for
   * this field. The returned instance should only be
   * used by a single thread. */
  public NumericDocValues getNumericDocValues(String field) throws IOException {
    return MultiDocValues.getNumericValues(getIndexReader(), field);
  }

  @Override
  public CollectionStatistics collectionStatistics(String field) throws IOException {
    return collectionStatistics(field, getIndexReader());
  }

  @Override
  public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) {
    return termStats(term, docFreq, totalTermFreq);
  }

  /**
   * Lucene relies on the fact that maxDocID is typically equal to the number of documents in the
   * index, which is false when we have sparse doc IDs or when we start from 8 million docs and
   * decrement, so in this class we pass in numDocs instead of the maximum assigned document ID.
   * Note that the comment on {@link CollectionStatistics#maxDoc()} says that it returns the number
   * of documents in the segment, not the maximum ID, and that it is only used this way. This is
   * necessary for all Lucene scoring methods, e.g.
   * {@link org.apache.lucene.search.similarities.TFIDFSimilarity#idfExplain}. This method body is
   * largely copied from {@link IndexSearcher#collectionStatistics(String)}.
   */
  public static CollectionStatistics collectionStatistics(String field, IndexReader indexReader)
      throws IOException {
    Preconditions.checkNotNull(field);

    int docsWithField = 0;
    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    for (LeafReaderContext leaf : indexReader.leaves()) {
      Terms terms = leaf.reader().terms(field);
      if (terms == null) {
        continue;
      }

      docsWithField += terms.getDocCount();
      sumTotalTermFreq += terms.getSumTotalTermFreq();
      sumDocFreq += terms.getSumDocFreq();
    }

    if (docsWithField == 0) {
      // The CollectionStatistics API in Lucene is designed poorly. On one hand, starting with
      // Lucene 8.0.0, searchers are expected to always produce valid CollectionStatistics
      // instances, and all int fields in these instances are expected to be strictly greater
      // than 0. On the other hand, Lucene itself produces null CollectionStatistics instances in
      // a few places. Also, there's no good placeholder value to indicate that a field is empty,
      // which is a very reasonable thing to happen (for example, the first few tweets in a new
      // segment might not have any links, so then the resolved_links_text field would be empty).
      // So to get around this issue, we do here what Lucene does: we return a
      // CollectionStatistics instance with all fields set to 1.
      return new CollectionStatistics(field, 1, 1, 1, 1);
    }

    // The writer could have added more docs to the index since this searcher started processing
    // this request, or could be in the middle of adding a doc, which could mean that only some of
    // the docsWithField, sumTotalTermFreq and sumDocFreq stats have been updated. I don't think
    // this is a big deal, as these stats are only used for computing a hit's score, and minor
    // inaccuracies should have very little effect on a hit's final score. But the
    // CollectionStatistics constructor has some strict asserts for the relationship between
    // these stats. So we need to make sure we cap the values of these stats appropriately.
    //
    // Adjust numDocs based on docsWithField (instead of doing the opposite), because:
    // 1. If new documents were added to this segment after the reader was created, it seems
    //    reasonable to take the more recent information into account.
    // 2. The termStats() method below will return the most recent docFreq (not the value that
    //    docFreq was set to when this reader was created). If this value is higher than numDocs,
    //    then Lucene might end up producing negative scores, which must never happen.
    int numDocs = Math.max(indexReader.numDocs(), docsWithField);
    sumDocFreq = Math.max(sumDocFreq, docsWithField);
    sumTotalTermFreq = Math.max(sumTotalTermFreq, sumDocFreq);
    return new CollectionStatistics(field, numDocs, docsWithField, sumTotalTermFreq, sumDocFreq);
  }

  /**
   * This method body is largely copied from {@link IndexSearcher#termStatistics(Term, int, long)}.
   * The only difference is that we make sure all parameters we pass to the TermStatistics instance
   * we create are set to at least 1 (because Lucene 8.0.0 expects them to be).
   */
  public static TermStatistics termStats(Term term, int docFreq, long totalTermFreq) {
    // Lucene expects the doc frequency and total term frequency to be at least 1. This assumption
    // doesn't always make sense (the segment can be empty -- see the comment above), but to make
    // Lucene happy, make sure to always set these parameters to at least 1.
    int adjustedDocFreq = Math.max(docFreq, 1);
    return new TermStatistics(
        term.bytes(),
        adjustedDocFreq,
        Math.max(totalTermFreq, adjustedDocFreq));
  }
}
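
The clamping at the end of collectionStatistics() is easiest to see with concrete numbers. The sketch below is illustrative only (the values are hypothetical, not from the original sources); it shows how the caps restore the invariants Lucene asserts, namely numDocs >= docsWithField, sumDocFreq >= docsWithField, and sumTotalTermFreq >= sumDocFreq.

public final class CollectionStatsClampingSketch {
  public static void main(String[] args) {
    // A point-in-time reader says 100 docs, but the per-field stats were read a bit
    // later and already include newer documents (hypothetical values).
    int readerNumDocs = 100;
    int docsWithField = 120;
    long sumDocFreq = 90;        // partially updated
    long sumTotalTermFreq = 80;  // partially updated

    int numDocs = Math.max(readerNumDocs, docsWithField);      // 120
    sumDocFreq = Math.max(sumDocFreq, docsWithField);          // 120
    sumTotalTermFreq = Math.max(sumTotalTermFreq, sumDocFreq); // 120

    System.out.printf("numDocs=%d docsWithField=%d sumDocFreq=%d sumTotalTermFreq=%d%n",
        numDocs, docsWithField, sumDocFreq, sumTotalTermFreq);
  }
}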
@ -1,20 +0,0 @@
java_library(
    name = "termination",
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/lucene:lucene-facet",
        "3rdparty/jvm/org/apache/lucene:lucene-queries",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/query",
        "src/java/com/twitter/search/common/search",
        "src/thrift/com/twitter/search:earlybird-java",
    ],
)
BIN
src/java/com/twitter/search/common/search/termination/BUILD.docx
Normal file
Binary file not shown.
@ -1,24 +0,0 @@
package com.twitter.search.common.search.termination;

import com.twitter.search.common.search.DocIdTracker;

/**
 * QueryTimeout provides a method for early termination of queries.
 */
public interface QueryTimeout {
  /**
   * Returns true if query processing should terminate, otherwise false.
   */
  boolean shouldExit();

  /**
   * Registers a DocIdTracker for the scope of the query, to determine the last fully-searched
   * doc ID after early termination.
   */
  void registerDocIdTracker(DocIdTracker docIdTracker);

  /**
   * Returns the client ID of the query.
   */
  String getClientId();
}
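
As a reading aid, here is a minimal sketch of the contract (not from the original sources; the class name and fixed-deadline policy are assumptions). The production implementation, QueryTimeoutImpl, appears later in this diff and delegates both the deadline and the doc-ID tracking to a TerminationTracker.

// (assume this sits in com.twitter.search.common.search.termination)
import com.twitter.search.common.search.DocIdTracker;

public final class FixedDeadlineQueryTimeout implements QueryTimeout {
  private final String clientId;
  private final long deadlineMillis;

  public FixedDeadlineQueryTimeout(String clientId, long deadlineMillis) {
    this.clientId = clientId;
    this.deadlineMillis = deadlineMillis;
  }

  @Override
  public boolean shouldExit() {
    // Terminate once the wall-clock deadline has passed.
    return System.currentTimeMillis() >= deadlineMillis;
  }

  @Override
  public void registerDocIdTracker(DocIdTracker docIdTracker) {
    // This sketch does not track doc IDs; the production implementation registers the
    // tracker with a TerminationTracker (see QueryTimeoutImpl below).
  }

  @Override
  public String getClientId() {
    return clientId;
  }
}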
Binary file not shown.
@ -1,34 +0,0 @@
package com.twitter.search.common.search.termination;

import com.twitter.common.util.Clock;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

public class QueryTimeoutFactory {
  /**
   * Creates a QueryTimeout instance for a given EarlybirdRequest and TerminationTracker, if the
   * required conditions for leaf-level timeout checking are met. Returns null otherwise.
   *
   * The conditions are:
   * 1) CollectorTerminationParams.isEnforceQueryTimeout()
   * 2) CollectorTerminationParams.isSetTimeoutMs()
   */
  public QueryTimeout createQueryTimeout(
      EarlybirdRequest request,
      TerminationTracker tracker,
      Clock clock) {
    if (tracker != null
        && request != null
        && request.isSetSearchQuery()
        && request.getSearchQuery().isSetCollectorParams()
        && request.getSearchQuery().getCollectorParams().isSetTerminationParams()
        && request.getSearchQuery().getCollectorParams().getTerminationParams()
            .isEnforceQueryTimeout()
        && request.getSearchQuery().getCollectorParams().getTerminationParams()
            .isSetTimeoutMs()) {
      return new QueryTimeoutImpl(request.getClientId(), tracker, clock);
    } else {
      return null;
    }
  }
}
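
A hypothetical call site for the factory might look like the sketch below (names other than the classes in this diff are assumptions; TerminationQuery is defined later in this diff). The null return is the signal to run the query unwrapped.

// (assume this sits in com.twitter.search.common.search.termination)
import org.apache.lucene.search.Query;

import com.twitter.common.util.Clock;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

final class QueryTimeoutWiringSketch {
  // Returns the query to execute: wrapped with a TerminationQuery when leaf-level
  // timeout enforcement is enabled for this request, or unchanged otherwise.
  static Query maybeWrap(Query luceneQuery, EarlybirdRequest request,
                         TerminationTracker tracker, Clock clock) {
    QueryTimeout timeout = new QueryTimeoutFactory().createQueryTimeout(request, tracker, clock);
    return (timeout != null) ? new TerminationQuery(luceneQuery, timeout) : luceneQuery;
  }
}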
Binary file not shown.
@ -1,65 +0,0 @@
package com.twitter.search.common.search.termination;

import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.search.DocIdTracker;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.search.TerminationTracker;

/**
 * QueryTimeoutImpl provides a method for early termination of queries based on time.
 */
public class QueryTimeoutImpl implements QueryTimeout {
  private final String clientId;
  private final TerminationTracker tracker;
  private final Clock clock;

  private final SearchRateCounter shouldTerminateCounter;

  public QueryTimeoutImpl(String clientId, TerminationTracker tracker, Clock clock) {
    this.clientId = Preconditions.checkNotNull(clientId);
    this.tracker = Preconditions.checkNotNull(tracker);
    this.clock = Preconditions.checkNotNull(clock);
    shouldTerminateCounter =
        SearchRateCounter.export("query_timeout_should_terminate_" + clientId);
  }

  /**
   * Returns true when the clock's time has met or exceeded the tracker's timeout end.
   */
  @Override
  public boolean shouldExit() {
    if (clock.nowMillis() >= tracker.getTimeoutEndTimeWithReservation()) {
      tracker.setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
      shouldTerminateCounter.increment();
      return true;
    }
    return false;
  }

  @Override
  public void registerDocIdTracker(DocIdTracker docIdTracker) {
    tracker.addDocIdTracker(docIdTracker);
  }

  @Override
  public String getClientId() {
    return clientId;
  }

  @Override
  public int hashCode() {
    return clientId.hashCode() * 13 + tracker.hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof QueryTimeoutImpl)) {
      return false;
    }

    QueryTimeoutImpl queryTimeout = QueryTimeoutImpl.class.cast(obj);
    return clientId.equals(queryTimeout.clientId) && tracker.equals(queryTimeout.tracker);
  }
}
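
The time-based behavior is easiest to pin down with an injected clock. The fragment below is a test-style sketch; FakeClock, Amount and Time refer to twitter-commons test utilities whose exact API is an assumption here, as is the tracker's deadline value.

// (assume this sits in com.twitter.search.common.search.termination)
import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.testing.FakeClock;
import com.twitter.search.common.search.TerminationTracker;

final class QueryTimeoutImplClockSketch {
  // tracker is assumed to report getTimeoutEndTimeWithReservation() == 1000.
  static void demo(TerminationTracker tracker) {
    FakeClock clock = new FakeClock(); // starts at t=0 (assumed)
    QueryTimeoutImpl timeout = new QueryTimeoutImpl("test-client", tracker, clock);

    assert !timeout.shouldExit(); // t=0: before the deadline, keep searching

    clock.advance(Amount.of(1500L, Time.MILLISECONDS)); // cross the deadline (API assumed)
    assert timeout.shouldExit(); // records TERMINATED_TIME_OUT_EXCEEDED on the tracker
  }
}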
Binary file not shown.
@ -1,66 +0,0 @@
package com.twitter.search.common.search.termination;

import java.io.IOException;
import java.util.Arrays;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

/**
 * Query implementation that can time out and return non-exhaustive results.
 */
public class TerminationQuery extends Query {
  private final Query inner;
  private final QueryTimeout timeout;

  public TerminationQuery(Query inner, QueryTimeout timeout) {
    this.inner = Preconditions.checkNotNull(inner);
    this.timeout = Preconditions.checkNotNull(timeout);
  }

  @Override
  public Weight createWeight(
      IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException {
    Weight innerWeight = inner.createWeight(searcher, scoreMode, boost);
    return new TerminationQueryWeight(this, innerWeight, timeout);
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    Query rewritten = inner.rewrite(reader);
    if (rewritten != inner) {
      return new TerminationQuery(rewritten, timeout);
    }
    return this;
  }

  public QueryTimeout getTimeout() {
    return timeout;
  }

  @Override
  public int hashCode() {
    return Arrays.hashCode(new Object[] {inner, timeout});
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof TerminationQuery)) {
      return false;
    }

    TerminationQuery terminationQuery = TerminationQuery.class.cast(obj);
    return Arrays.equals(new Object[] {inner, timeout},
        new Object[] {terminationQuery.inner, terminationQuery.timeout});
  }

  @Override
  public String toString(String field) {
    return inner.toString(field);
  }
}
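
One subtlety worth calling out: Lucene rewrites queries before searching, and rewrite() above re-wraps the rewritten inner query so the timeout is not lost. A small sketch (the helper class name is an assumption; the Lucene calls are standard):

// (assume this sits in com.twitter.search.common.search.termination)
import java.io.IOException;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class TerminationQueryRewriteSketch {
  // The result of searcher.rewrite() is still a TerminationQuery (possibly the same
  // instance), so the timeout applies to whatever the inner query rewrote to.
  static Query wrapAndRewrite(IndexSearcher searcher, QueryTimeout timeout) throws IOException {
    Query inner = new TermQuery(new Term("text", "hello"));
    return searcher.rewrite(new TerminationQuery(inner, timeout));
  }
}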
Binary file not shown.
@ -1,91 +0,0 @@
package com.twitter.search.common.search.termination;

import java.io.IOException;

import com.google.common.base.Preconditions;

import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.FilteredScorer;
import com.twitter.search.common.search.DocIdTracker;

/**
 * Scorer implementation that adds termination support for an underlying query.
 * Meant to be used in conjunction with {@link TerminationQuery}.
 */
public class TerminationQueryScorer extends FilteredScorer implements DocIdTracker {
  private final QueryTimeout timeout;
  private int lastSearchedDocId = -1;

  TerminationQueryScorer(Weight weight, Scorer inner, QueryTimeout timeout) {
    super(weight, inner);
    this.timeout = Preconditions.checkNotNull(timeout);
    this.timeout.registerDocIdTracker(this);
    SearchRateCounter.export(
        timeout.getClientId() + "_num_termination_query_scorers_created").increment();
  }

  @Override
  public DocIdSetIterator iterator() {
    final DocIdSetIterator superDISI = super.iterator();
    return new DocIdSetIterator() {
      // lastSearchedDocId is the ID of the last document that was traversed in the posting list.
      // docId is the current doc ID in this iterator. In most cases, lastSearchedDocId and docId
      // will be equal. They will be different only if the query needed to be terminated based on
      // the timeout. In that case, docId will be set to NO_MORE_DOCS, but lastSearchedDocId will
      // still be set to the last document that was actually traversed.
      private int docId = -1;

      @Override
      public int docID() {
        return docId;
      }

      @Override
      public int nextDoc() throws IOException {
        if (docId == NO_MORE_DOCS) {
          return NO_MORE_DOCS;
        }

        if (timeout.shouldExit()) {
          docId = NO_MORE_DOCS;
        } else {
          docId = superDISI.nextDoc();
          lastSearchedDocId = docId;
        }
        return docId;
      }

      @Override
      public int advance(int target) throws IOException {
        if (docId == NO_MORE_DOCS) {
          return NO_MORE_DOCS;
        }

        if (target == NO_MORE_DOCS) {
          docId = NO_MORE_DOCS;
          lastSearchedDocId = docId;
        } else if (timeout.shouldExit()) {
          docId = NO_MORE_DOCS;
        } else {
          docId = superDISI.advance(target);
          lastSearchedDocId = docId;
        }
        return docId;
      }

      @Override
      public long cost() {
        return superDISI.cost();
      }
    };
  }

  @Override
  public int getCurrentDocId() {
    return lastSearchedDocId;
  }
}
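
A worked trace of the iterator above (postings and timing are hypothetical):

// Postings list: 3, 7, 12. The timeout fires after doc 7 has been collected.
//
//   nextDoc() -> 3              lastSearchedDocId = 3
//   nextDoc() -> 7              lastSearchedDocId = 7
//   (timeout.shouldExit() now returns true)
//   nextDoc() -> NO_MORE_DOCS   lastSearchedDocId stays 7
//
// getCurrentDocId() therefore returns 7: doc 12 was never traversed, and the tracker
// records doc 7 as the last fully-searched document for this segment.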
Binary file not shown.
@ -1,53 +0,0 @@
package com.twitter.search.common.search.termination;

import java.io.IOException;
import java.util.Set;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.Weight;

/**
 * Weight implementation that adds termination support for an underlying query.
 * Meant to be used in conjunction with {@link TerminationQuery}.
 */
public class TerminationQueryWeight extends Weight {
  private final Weight inner;
  private final QueryTimeout timeout;

  TerminationQueryWeight(TerminationQuery query, Weight inner, QueryTimeout timeout) {
    super(query);
    this.inner = inner;
    this.timeout = Preconditions.checkNotNull(timeout);
  }

  @Override
  public Explanation explain(LeafReaderContext context, int doc)
      throws IOException {
    return inner.explain(context, doc);
  }

  @Override
  public Scorer scorer(LeafReaderContext context) throws IOException {
    Scorer innerScorer = inner.scorer(context);
    if (innerScorer != null) {
      return new TerminationQueryScorer(this, innerScorer, timeout);
    }

    return null;
  }

  @Override
  public void extractTerms(Set<Term> terms) {
    inner.extractTerms(terms);
  }

  @Override
  public boolean isCacheable(LeafReaderContext ctx) {
    return inner.isCacheable(ctx);
  }
}
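
Taken together, the three classes in this package form one decorator chain per query; the summary below just restates the flow already visible in this diff:

// TerminationQuery.createWeight(...)  -> TerminationQueryWeight wrapping the inner Weight
// TerminationQueryWeight.scorer(ctx)  -> TerminationQueryScorer wrapping the inner Scorer
// TerminationQueryScorer.iterator()   -> a DocIdSetIterator that consults timeout.shouldExit()
//
// Each per-segment scorer also registers itself as a DocIdTracker with the QueryTimeout,
// which is how the last fully-searched doc ID survives an early exit.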
@ -1,32 +0,0 @@
java_library(
    sources = ["**/*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/com/twitter/elephantbird:core",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/collections",
        "src/java/com/twitter/search/common/encoding/features",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/relevance:ranking",
        "src/java/com/twitter/search/common/relevance:text",
        "src/java/com/twitter/search/common/relevance/features",
        "src/java/com/twitter/search/common/runtime",
        "src/java/com/twitter/search/common/schema/base",
        "src/java/com/twitter/search/common/schema/earlybird",
        "src/thrift/com/twitter/search:earlybird-java",
        "src/thrift/com/twitter/search/adaptive:adaptive-results-java",
        "src/thrift/com/twitter/search/common:constants-java",
        "src/thrift/com/twitter/search/common:indexing-java",
        "src/thrift/com/twitter/search/common:query-java",
        "src/thrift/com/twitter/search/common:ranking-java",
        "util/util-core:scala",
    ],
)
BIN
src/java/com/twitter/search/common/util/earlybird/BUILD.docx
Normal file
Binary file not shown.
@ -1,269 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ExecutionException;

import com.google.common.base.Preconditions;
import com.google.common.cache.LoadingCache;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.collections.Pair;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.EarlybirdResponseCode;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;

/**
 * Utility methods to merge EarlybirdResponses.
 */
public final class EarlybirdResponseMergeUtil {
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdResponseMergeUtil.class);

  private static final String INVALID_RESPONSE_STATS_PREFIX = "invalid_response_stats_";

  // Stats for invalid Earlybird responses
  private static final ImmutableMap<EarlybirdResponseCode, SearchCounter> ERROR_EXCEPTIONS;

  public static final SearchCounter NULL_RESPONSE_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "null_response");
  public static final SearchCounter SEARCH_RESULTS_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_not_set");
  public static final SearchCounter SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_with_results_not_set");
  public static final SearchCounter MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "max_searched_status_id_not_set");
  public static final SearchCounter MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER =
      SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "min_searched_status_id_not_set");

  static {
    ImmutableMap.Builder<EarlybirdResponseCode, SearchCounter> builder = ImmutableMap.builder();

    for (EarlybirdResponseCode responseCode : EarlybirdResponseCode.values()) {
      if (responseCode != EarlybirdResponseCode.SUCCESS) {
        builder.put(responseCode, SearchCounter.export(
            INVALID_RESPONSE_STATS_PREFIX + responseCode.name().toLowerCase()));
      }
    }

    ERROR_EXCEPTIONS = builder.build();
  }

  private EarlybirdResponseMergeUtil() {
  }

  /**
   * Tags the results in the given EarlybirdResponse with the given ThriftTweetSource and adds them
   * to the given list of results.
   *
   * @param results The list of results to which the new results will be added.
   * @param earlybirdResponse The EarlybirdResponse whose results will be added to {@code results}.
   * @param tweetSource The ThriftTweetSource that will be used to mark all results in
   *                    {@code earlybirdResponse}.
   * @return {@code false} if {@code earlybirdResponse} is {@code null} or doesn't have any
   *         results; {@code true}, otherwise.
   */
  public static boolean addResultsToList(List<ThriftSearchResult> results,
                                         EarlybirdResponse earlybirdResponse,
                                         ThriftTweetSource tweetSource) {
    return EarlybirdResponseUtil.hasResults(earlybirdResponse)
        && addResultsToList(results,
            earlybirdResponse.getSearchResults().getResults(),
            tweetSource);
  }

  /**
   * Tags the results in the given list with the given ThriftTweetSource and adds them to the given
   * list of results.
   *
   * @param results The list of results to which the new results will be added.
   * @param resultsToAdd The list of results to add.
   * @param tweetSource The ThriftTweetSource that will be used to mark all results in
   *                    {@code resultsToAdd}.
   * @return {@code false} if {@code results} is {@code null} or if {@code resultsToAdd} is
   *         {@code null} or doesn't have any results; {@code true}, otherwise.
   */
  public static boolean addResultsToList(List<ThriftSearchResult> results,
                                         List<ThriftSearchResult> resultsToAdd,
                                         ThriftTweetSource tweetSource) {
    Preconditions.checkNotNull(results);
    if ((resultsToAdd == null) || resultsToAdd.isEmpty()) {
      return false;
    }

    markWithTweetSource(resultsToAdd, tweetSource);

    results.addAll(resultsToAdd);
    return true;
  }

  /**
   * Dedupes the input ThriftSearchResults by status ID. If there are duplicates, the first
   * instance of each duplicate is kept in the deduped result. If the deduped result is the same
   * size as the input result, the initial input result is returned; otherwise, the deduped result
   * is returned.
   *
   * @param results the input results
   * @param dupsStats stats counters tracking the sources of duplicate results
   * @return the input results if there are no duplicates; otherwise, the deduped results
   */
  public static List<ThriftSearchResult> distinctByStatusId(
      List<ThriftSearchResult> results,
      LoadingCache<Pair<ThriftTweetSource, ThriftTweetSource>, SearchCounter> dupsStats) {
    Map<Long, ThriftTweetSource> seenStatusIdToSourceMap = new HashMap<>();
    List<ThriftSearchResult> distinctResults = Lists.newArrayListWithCapacity(results.size());
    for (ThriftSearchResult result : results) {
      if (seenStatusIdToSourceMap.containsKey(result.getId())) {
        ThriftTweetSource source1 = seenStatusIdToSourceMap.get(result.getId());
        ThriftTweetSource source2 = result.getTweetSource();
        if (source1 != null && source2 != null) {
          try {
            dupsStats.get(Pair.of(source1, source2)).increment();
          } catch (ExecutionException e) {
            LOG.warn("Could not increment stat for duplicate results from clusters " + source1
                + " and " + source2, e);
          }
        }
      } else {
        distinctResults.add(result);
        seenStatusIdToSourceMap.put(result.getId(), result.getTweetSource());
      }
    }
    return results.size() == distinctResults.size() ? results : distinctResults;
  }

  /**
   * Tags the given results with the given ThriftTweetSource.
   *
   * @param results The results to be tagged.
   * @param tweetSource The ThriftTweetSource to be used to tag the given results.
   */
  public static void markWithTweetSource(List<ThriftSearchResult> results,
                                         ThriftTweetSource tweetSource) {
    if (results != null) {
      for (ThriftSearchResult result : results) {
        result.setTweetSource(tweetSource);
      }
    }
  }

  /**
   * Checks if an Earlybird response is valid.
   */
  public static boolean isValidResponse(final EarlybirdResponse response) {
    if (response == null) {
      NULL_RESPONSE_COUNTER.increment();
      return false;
    }

    if (!EarlybirdResponseUtil.isSuccessfulResponse(response)) {
      return false;
    }

    if (!response.isSetSearchResults()) {
      SEARCH_RESULTS_NOT_SET_COUNTER.increment();
      return true;
    }

    if (!response.getSearchResults().isSetResults()) {
      SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER.increment();
    }

    // In Earlybird, when a query terminates early (e.g., because of a timeout or a complex
    // query), we don't set the min/max searched status IDs.
    boolean isEarlyTerminated = response.isSetEarlyTerminationInfo()
        && response.getEarlyTerminationInfo().isEarlyTerminated();

    if (!isEarlyTerminated && !response.getSearchResults().isSetMinSearchedStatusID()) {
      MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment();
    }

    if (!isEarlyTerminated && !response.getSearchResults().isSetMaxSearchedStatusID()) {
      MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment();
    }

    return true;
  }

  /**
   * For an invalid (non-successful) Earlybird response, returns a failed response with a debug
   * message.
   */
  public static EarlybirdResponse transformInvalidResponse(final EarlybirdResponse response,
                                                           final String debugMsg) {
    if (response == null) {
      return failedEarlybirdResponse(EarlybirdResponseCode.PERSISTENT_ERROR,
          debugMsg + ", msg: null response from downstream");
    }
    Preconditions.checkState(response.getResponseCode() != EarlybirdResponseCode.SUCCESS);

    EarlybirdResponseCode newResponseCode;
    EarlybirdResponseCode responseCode = response.getResponseCode();
    switch (responseCode) {
      case TIER_SKIPPED:
        ERROR_EXCEPTIONS.get(responseCode).increment();
        return response;
      case REQUEST_BLOCKED_ERROR:
      case CLIENT_ERROR:
      case SERVER_TIMEOUT_ERROR:
      case QUOTA_EXCEEDED_ERROR:
      case CLIENT_CANCEL_ERROR:
      case TOO_MANY_PARTITIONS_FAILED_ERROR:
        ERROR_EXCEPTIONS.get(responseCode).increment();
        newResponseCode = responseCode;
        break;
      default:
        ERROR_EXCEPTIONS.get(responseCode).increment();
        newResponseCode = EarlybirdResponseCode.PERSISTENT_ERROR;
    }

    String newDebugMsg = debugMsg + ", downstream response code: " + responseCode
        + (response.isSetDebugString() ? ", downstream msg: " + response.getDebugString() : "");

    return failedEarlybirdResponse(newResponseCode, newDebugMsg);
  }

  /**
   * Creates a new failed EarlybirdResponse with the given response code and debug message.
   */
  public static EarlybirdResponse failedEarlybirdResponse(final EarlybirdResponseCode responseCode,
                                                          final String debugMsg) {
    EarlybirdResponse failedResponse = new EarlybirdResponse();
    failedResponse.setResponseCode(responseCode);
    failedResponse.setDebugString(debugMsg);
    return failedResponse;
  }

  /**
   * Returns the number of results to keep as part of merge-collection. Recency mode ignores
   * relevance options, in particular the returnAllResults flag inside the relevance options.
   */
  public static int computeNumResultsToKeep(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();

    if (searchQuery.getRankingMode() != ThriftSearchRankingMode.RECENCY
        && searchQuery.isSetRelevanceOptions()
        && searchQuery.getRelevanceOptions().isReturnAllResults()) {
      return Integer.MAX_VALUE;
    }

    if (request.isSetNumResultsToReturnAtRoot()) {
      return request.getNumResultsToReturnAtRoot();
    }

    if (searchQuery.isSetCollectorParams()) {
      return searchQuery.getCollectorParams().getNumResultsToReturn();
    }

    return searchQuery.getNumResults();
  }
}
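
The precedence in computeNumResultsToKeep() is easy to misread; the table below spells it out with hypothetical values:

// rankingMode != RECENCY and relevanceOptions.returnAllResults == true -> Integer.MAX_VALUE
// otherwise, numResultsToReturnAtRoot set on the request (e.g. 50)     -> 50
// otherwise, collectorParams.numResultsToReturn set (e.g. 20)          -> 20
// otherwise                                                            -> searchQuery.getNumResults()
//
// Note that RECENCY mode deliberately skips the returnAllResults shortcut.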
Binary file not shown.
@ -1,204 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;

import com.google.common.base.Preconditions;

import com.twitter.search.adaptive.adaptive_results.thriftjava.TweetSource;
import com.twitter.search.common.logging.ObjectKey;
import com.twitter.search.common.runtime.DebugManager;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.EarlybirdResponseCode;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResults;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;

/** Utility methods that work on EarlybirdResponses. */
public final class EarlybirdResponseUtil {
  private EarlybirdResponseUtil() {
  }

  /**
   * Returns the results in the given EarlybirdResponse.
   *
   * @param response The EarlybirdResponse.
   * @return The results in the given EarlybirdResponse, or {@code null} if the response is
   *         {@code null} or the results are not set.
   */
  public static ThriftSearchResults getResults(EarlybirdResponse response) {
    if ((response == null) || !response.isSetSearchResults()) {
      return null;
    }

    return response.getSearchResults();
  }

  /**
   * Determines if the given EarlybirdResponse has results.
   *
   * @param response The EarlybirdResponse.
   * @return {@code true} if the given EarlybirdResponse has results; {@code false} otherwise.
   */
  public static boolean hasResults(EarlybirdResponse response) {
    ThriftSearchResults results = getResults(response);
    return (results != null) && results.isSetResults() && !results.getResults().isEmpty();
  }

  /**
   * Returns the number of results in the given EarlybirdResponse.
   *
   * @param response The EarlybirdResponse.
   * @return The number of results in the given EarlybirdResponse.
   */
  public static int getNumResults(EarlybirdResponse response) {
    return hasResults(response) ? response.getSearchResults().getResultsSize() : 0;
  }

  /**
   * Determines whether the response is early-terminated.
   *
   * @param response The EarlybirdResponse.
   * @return {@code true} if the response is early-terminated; {@code false} otherwise.
   */
  public static boolean isEarlyTerminated(EarlybirdResponse response) {
    Preconditions.checkNotNull(response);
    return response.isSetEarlyTerminationInfo()
        && response.getEarlyTerminationInfo().isEarlyTerminated();
  }

  /**
   * Returns whether the response should be considered failed for purposes of stats and logging.
   */
  public static boolean responseConsideredFailed(EarlybirdResponseCode code) {
    return code != EarlybirdResponseCode.SUCCESS
        && code != EarlybirdResponseCode.REQUEST_BLOCKED_ERROR
        && code != EarlybirdResponseCode.TIER_SKIPPED;
  }

  /**
   * Extracts the results from an Earlybird response.
   */
  public static List<ThriftSearchResult> extractResultsFromEarlybirdResponse(
      EarlybirdResponse response) {
    return hasResults(response)
        ? response.getSearchResults().getResults() : Collections.emptyList();
  }

  /**
   * Logs the Earlybird response as a candidate source.
   */
  public static EarlybirdResponse debugLogAsCandidateSource(
      EarlybirdResponse response, TweetSource tweetSource) {
    List<ThriftSearchResult> results = extractResultsFromEarlybirdResponse(response);
    debugLogAsCandidateSourceHelper(results, tweetSource);
    return response;
  }

  /**
   * Logs a list of ThriftSearchResults as a candidate source.
   */
  public static List<ThriftSearchResult> debugLogAsCandidateSource(
      List<ThriftSearchResult> results, TweetSource tweetSource) {
    debugLogAsCandidateSourceHelper(results, tweetSource);
    return results;
  }

  private static void debugLogAsCandidateSourceHelper(
      List<ThriftSearchResult> results, TweetSource tweetSource) {
    // Debug message for the Earlybird relevance candidate source.
    List<String> strIds = results
        .stream()
        .map(ThriftSearchResult::getId)
        .map(Object::toString)
        .collect(Collectors.toList());
    ObjectKey debugMsgKey = ObjectKey.createTweetCandidateSourceKey(
        tweetSource.name());
    DebugManager.perObjectBasic(
        debugMsgKey,
        String.format("[%s][%s] results: %s", debugMsgKey.getType(), debugMsgKey.getId(), strIds));
  }

  /**
   * Extracts the realtime response from an existing response.
   */
  public static EarlybirdResponse extractRealtimeResponse(EarlybirdResponse response) {
    EarlybirdResponse realtimeResponse = response.deepCopy();
    if (EarlybirdResponseUtil.hasResults(response)) {
      List<ThriftSearchResult> realtimeResults = realtimeResponse.getSearchResults().getResults();
      realtimeResults.clear();
      for (ThriftSearchResult result : response.getSearchResults().getResults()) {
        if (result.getTweetSource() == ThriftTweetSource.REALTIME_CLUSTER) {
          realtimeResults.add(result);
        }
      }
    }

    return realtimeResponse;
  }

  /**
   * Returns an EarlybirdResponse that should be returned by roots when a tier was skipped.
   *
   * @param minId The minSearchedStatusID to be set on the response.
   * @param maxId The maxSearchedStatusID to be set on the response.
   * @param debugMsg The debug message to be set on the response.
   * @return A response that should be returned by roots when a tier was skipped.
   */
  public static EarlybirdResponse tierSkippedRootResponse(long minId, long maxId, String debugMsg) {
    return new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0)
        .setSearchResults(new ThriftSearchResults()
            .setResults(new ArrayList<>())
            .setMinSearchedStatusID(minId)
            .setMaxSearchedStatusID(maxId))
        .setDebugString(debugMsg);
  }

  /**
   * Determines if the given response is a success response.
   *
   * A response is considered successful if it's not null and has either a SUCCESS, TIER_SKIPPED or
   * REQUEST_BLOCKED_ERROR response code.
   *
   * @param response The response to check.
   * @return Whether the given response is successful or not.
   */
  public static boolean isSuccessfulResponse(EarlybirdResponse response) {
    return response != null
        && (response.getResponseCode() == EarlybirdResponseCode.SUCCESS
            || response.getResponseCode() == EarlybirdResponseCode.TIER_SKIPPED
            || response.getResponseCode() == EarlybirdResponseCode.REQUEST_BLOCKED_ERROR);
  }

  /**
   * Finds all unexpected nullcast statuses within the given results. A nullcast status is
   * unexpected iff:
   * 1. the tweet is a nullcast tweet.
   * 2. the tweet is NOT explicitly requested with {@link ThriftSearchQuery#searchStatusIds}
   */
  public static Set<Long> findUnexpectedNullcastStatusIds(
      ThriftSearchResults thriftSearchResults, EarlybirdRequest request) {
    Set<Long> statusIds = new HashSet<>();
    for (ThriftSearchResult result : thriftSearchResults.getResults()) {
      if (resultIsNullcast(result) && !isSearchStatusId(request, result.getId())) {
        statusIds.add(result.getId());
      }
    }
    return statusIds;
  }

  private static boolean isSearchStatusId(EarlybirdRequest request, long id) {
    return request.getSearchQuery().isSetSearchStatusIds()
        && request.getSearchQuery().getSearchStatusIds().contains(id);
  }

  private static boolean resultIsNullcast(ThriftSearchResult result) {
    return result.isSetMetadata() && result.getMetadata().isIsNullcast();
  }
}
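
A hedged sketch of how a root service might combine the helpers from the two classes above when merging responses from two clusters. The method and variable names are assumptions, and the archive ThriftTweetSource value is passed in rather than guessed; only the calls to the utility classes are taken from this diff.

// (assume this sits in com.twitter.search.common.util.earlybird)
import java.util.ArrayList;
import java.util.List;

import com.google.common.cache.LoadingCache;

import com.twitter.common.collections.Pair;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;

final class MergeSketch {
  static List<ThriftSearchResult> merge(
      EarlybirdResponse realtimeResponse,
      EarlybirdResponse archiveResponse,
      ThriftTweetSource archiveSource, // e.g. an archive-cluster enum value (assumed)
      LoadingCache<Pair<ThriftTweetSource, ThriftTweetSource>, SearchCounter> dupsStats) {
    List<ThriftSearchResult> merged = new ArrayList<>();
    EarlybirdResponseMergeUtil.addResultsToList(
        merged, realtimeResponse, ThriftTweetSource.REALTIME_CLUSTER);
    EarlybirdResponseMergeUtil.addResultsToList(merged, archiveResponse, archiveSource);
    // Keep the first copy of each status ID; count which cluster pair produced the dup.
    return EarlybirdResponseMergeUtil.distinctByStatusId(merged, dupsStats);
  }
}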
Binary file not shown.
@ -1,495 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;

import com.google.common.collect.Lists;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.logging.DebugMessageBuilder;
import com.twitter.search.common.ranking.thriftjava.ThriftFacetFinalSortOrder;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftFacetCount;
import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;
import com.twitter.search.earlybird.thrift.ThriftFacetRankingMode;
import com.twitter.search.earlybird.thrift.ThriftFacetRequest;
import com.twitter.search.earlybird.thrift.ThriftFacetResults;
import com.twitter.search.earlybird.thrift.ThriftTermResults;

/**
 * A utility class to provide some functions for facets results processing.
 */
public final class FacetsResultsUtils {

  private static final Logger LOG = LoggerFactory.getLogger(FacetsResultsUtils.class);

  private FacetsResultsUtils() {
  }

  public static class FacetFieldInfo {
    public ThriftFacetFieldRequest fieldRequest;
    public int totalCounts;
    public Map<String, ThriftFacetCount> topFacets;
    public List<Map.Entry<ThriftLanguage, Double>> languageHistogramEntries = Lists.newLinkedList();
  }

  // Only return the top languages in the language histogram which sum up to at least this
  // ratio; here we keep the first 80 percent.
  public static final double MIN_PERCENTAGE_SUM_REQUIRED = 0.8;
  // Once that sum is reached, languages whose ratio is below this number are not returned.
  public static final double MIN_PERCENTAGE = 0.01;

  /**
   * Prepares facet fields with empty entries and checks if we need termStats for filtering.
   * Returns true if termStats filtering is needed (and thus the termStats service call).
   * @param facetRequest The related facet request.
   * @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet
   *                          field results info.
   * @return {@code true} if a termStats request is needed afterwards.
   */
  public static boolean prepareFieldInfoMap(
      ThriftFacetRequest facetRequest,
      final Map<String, FacetsResultsUtils.FacetFieldInfo> facetFieldInfoMap) {
    boolean termStatsFilteringMode = false;

    for (ThriftFacetFieldRequest fieldRequest : facetRequest.getFacetFields()) {
      FacetsResultsUtils.FacetFieldInfo info = new FacetsResultsUtils.FacetFieldInfo();
      info.fieldRequest = fieldRequest;
      facetFieldInfoMap.put(fieldRequest.getFieldName(), info);
      if (fieldRequest.getRankingMode() == ThriftFacetRankingMode.FILTER_WITH_TERM_STATISTICS) {
        termStatsFilteringMode = true;
      }
    }

    return termStatsFilteringMode;
  }

  /**
   * Extracts information from one ThriftFacetResults into facetFieldInfoMap and userIDWhitelist.
   * @param facetResults The related facet results.
   * @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet
   *                          field results info.
   * @param userIDWhitelist The user whitelist to fill.
   */
  public static void fillFacetFieldInfo(
      final ThriftFacetResults facetResults,
      final Map<String, FacetsResultsUtils.FacetFieldInfo> facetFieldInfoMap,
      final Set<Long> userIDWhitelist) {

    for (String facetField : facetResults.getFacetFields().keySet()) {
      FacetsResultsUtils.FacetFieldInfo info = facetFieldInfoMap.get(facetField);
      if (info.topFacets == null) {
        info.topFacets = new HashMap<>();
      }

      ThriftFacetFieldResults results = facetResults.getFacetFields().get(facetField);
      if (results.isSetLanguageHistogram()) {
        info.languageHistogramEntries.addAll(results.getLanguageHistogram().entrySet());
      }
      for (ThriftFacetCount newCount : results.getTopFacets()) {
        ThriftFacetCount resultCount = info.topFacets.get(newCount.facetLabel);
        if (resultCount == null) {
          info.topFacets.put(newCount.facetLabel, new ThriftFacetCount(newCount));
        } else {
          resultCount.setFacetCount(resultCount.facetCount + newCount.facetCount);
          resultCount.setSimpleCount(resultCount.simpleCount + newCount.simpleCount);
          resultCount.setWeightedCount(resultCount.weightedCount + newCount.weightedCount);
          resultCount.setPenaltyCount(resultCount.penaltyCount + newCount.penaltyCount);
          // This could pass the old metadata object back, or a new merged one.
          resultCount.setMetadata(
              mergeFacetMetadata(resultCount.getMetadata(), newCount.getMetadata(),
                  userIDWhitelist));
        }
      }
      info.totalCounts += results.totalCount;
    }
  }

  /**
   * Merges a metadata update into an existing metadata object.
   * @param baseMetadata the metadata to merge into.
   * @param metadataUpdate the new metadata to merge.
   * @param userIDWhitelist the user ID whitelist to filter user IDs with.
   * @return The updated metadata.
   */
  public static ThriftFacetCountMetadata mergeFacetMetadata(
      final ThriftFacetCountMetadata baseMetadata,
      final ThriftFacetCountMetadata metadataUpdate,
      final Set<Long> userIDWhitelist) {
    ThriftFacetCountMetadata mergedMetadata = baseMetadata;
    if (metadataUpdate != null) {
      String mergedExplanation = null;
      if (mergedMetadata != null) {
        if (mergedMetadata.maxTweepCred < metadataUpdate.maxTweepCred) {
          mergedMetadata.setMaxTweepCred(metadataUpdate.maxTweepCred);
        }

        if (mergedMetadata.isSetExplanation()) {
          mergedExplanation = mergedMetadata.getExplanation();
          if (metadataUpdate.isSetExplanation()) {
            mergedExplanation += "\n" + metadataUpdate.getExplanation();
          }
        } else if (metadataUpdate.isSetExplanation()) {
          mergedExplanation = metadataUpdate.getExplanation();
        }

        if (mergedMetadata.getStatusId() == -1) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("status id in facet count metadata is -1: " + mergedMetadata);
          }
          mergedMetadata = metadataUpdate;
        } else if (metadataUpdate.getStatusId() != -1
            && metadataUpdate.getStatusId() < mergedMetadata.getStatusId()) {
          // Keep the oldest tweet, i.e. the lowest status ID.
          mergedMetadata = metadataUpdate;
        } else if (metadataUpdate.getStatusId() == mergedMetadata.getStatusId()) {
          if (mergedMetadata.getTwitterUserId() == -1) {
            // In this case we didn't find the user in a previous partition yet;
            // only update the user if the status ID matches.
            mergedMetadata.setTwitterUserId(metadataUpdate.getTwitterUserId());
            mergedMetadata.setDontFilterUser(metadataUpdate.isDontFilterUser());
          }
          if (!mergedMetadata.isSetStatusLanguage()) {
            mergedMetadata.setStatusLanguage(metadataUpdate.getStatusLanguage());
          }
        }
        if (!mergedMetadata.isSetNativePhotoUrl() && metadataUpdate.isSetNativePhotoUrl()) {
          mergedMetadata.setNativePhotoUrl(metadataUpdate.getNativePhotoUrl());
        }
      } else {
        mergedMetadata = metadataUpdate;
      }

      // This will not set an explanation if neither baseMetadata nor metadataUpdate
      // had an explanation.
      if (mergedExplanation != null) {
        mergedMetadata.setExplanation(mergedExplanation);
      }

      if (userIDWhitelist != null) {
        // mergedMetadata must not be null at this point because of the checks above.
        if (mergedMetadata.getTwitterUserId() != -1 && !mergedMetadata.isDontFilterUser()) {
          mergedMetadata.setDontFilterUser(
              userIDWhitelist.contains(mergedMetadata.getTwitterUserId()));
        }
      }
    }

    return mergedMetadata;
  }

  /**
   * Appends all twimg results to the image results. Optionally re-sorts the image results if
   * a comparator is passed in.
   * Also computes the sums of totalCount, totalScore and totalPenalty.
   */
  public static void mergeTwimgResults(ThriftFacetResults facetResults,
                                       Comparator<ThriftFacetCount> optionalSortComparator) {
    if (facetResults == null || !facetResults.isSetFacetFields()) {
      return;
    }

    ThriftFacetFieldResults imageResults =
        facetResults.getFacetFields().get(EarlybirdFieldConstant.IMAGES_FACET);
    ThriftFacetFieldResults twimgResults =
        facetResults.getFacetFields().remove(EarlybirdFieldConstant.TWIMG_FACET);
    if (imageResults == null) {
      if (twimgResults != null) {
        facetResults.getFacetFields().put(EarlybirdFieldConstant.IMAGES_FACET, twimgResults);
      }
      return;
    }

    if (twimgResults != null) {
      imageResults.setTotalCount(imageResults.getTotalCount() + twimgResults.getTotalCount());
      imageResults.setTotalPenalty(imageResults.getTotalPenalty() + twimgResults.getTotalPenalty());
      imageResults.setTotalScore(imageResults.getTotalScore() + twimgResults.getTotalScore());
      for (ThriftFacetCount count : twimgResults.getTopFacets()) {
        imageResults.addToTopFacets(count);
      }
      if (optionalSortComparator != null) {
        Collections.sort(imageResults.topFacets, optionalSortComparator);
      }
    }
  }

  /**
   * Dedups twimg facets.
   *
   * A twimg facet uses the status ID as the facet label, instead of the twimg URL, a.k.a. the
   * native photo URL. It is possible to have the same twimg URL appearing under two different
   * facet labels (RT style retweet? copy & paste of the twimg URL?). Therefore, to dedup twimg
   * facets correctly, we need to look at ThriftFacetCount.metadata.nativePhotoUrl.
   *
   * @param dedupSet A set holding the native URLs from the twimg facetFieldResults. By having
   *                 the caller pass in the set, it allows the caller to dedup the facets
   *                 across different ThriftFacetFieldResults.
   * @param facetFieldResults The twimg facet field results to be dedupped.
   * @param debugMessageBuilder The debug message builder used to record removed URLs.
   */
  public static void dedupTwimgFacet(Set<String> dedupSet,
                                     ThriftFacetFieldResults facetFieldResults,
                                     DebugMessageBuilder debugMessageBuilder) {
    if (facetFieldResults == null || facetFieldResults.getTopFacets() == null) {
      return;
    }

    Iterator<ThriftFacetCount> iterator = facetFieldResults.getTopFacetsIterator();

    while (iterator.hasNext()) {
      ThriftFacetCount count = iterator.next();
      if (count.isSetMetadata() && count.getMetadata().isSetNativePhotoUrl()) {
        String nativeUrl = count.getMetadata().getNativePhotoUrl();

        if (dedupSet.contains(nativeUrl)) {
          iterator.remove();
          debugMessageBuilder.detailed("dedupTwimgFacet removed %s", nativeUrl);
        } else {
          dedupSet.add(nativeUrl);
        }
      }
    }
  }

  private static final class LanguageCount {
    private final ThriftLanguage lang;
    private final double count;
    private LanguageCount(ThriftLanguage lang, double count) {
      this.lang = lang;
      this.count = count;
    }
  }

  /**
   * Calculates the top languages and stores them in the results.
   */
  public static void fillTopLanguages(FacetsResultsUtils.FacetFieldInfo info,
                                      final ThriftFacetFieldResults results) {
    double sumForLanguage = 0.0;
    double[] sums = new double[ThriftLanguage.values().length];
    for (Map.Entry<ThriftLanguage, Double> entry : info.languageHistogramEntries) {
      sumForLanguage += entry.getValue();
      if (entry.getKey() == null) {
        // EB might be setting a null key for an unknown language. SEARCH-1294
        continue;
      }
      sums[entry.getKey().getValue()] += entry.getValue();
    }
    if (sumForLanguage == 0.0) {
      return;
    }
    List<LanguageCount> langCounts = new ArrayList<>(ThriftLanguage.values().length);
    for (int i = 0; i < sums.length; i++) {
      if (sums[i] > 0.0) {
        // ThriftLanguage.findByValue() might return null, which should fall back to UNKNOWN.
        ThriftLanguage lang = ThriftLanguage.findByValue(i);
        lang = lang == null ? ThriftLanguage.UNKNOWN : lang;
        langCounts.add(new LanguageCount(lang, sums[i]));
      }
    }
    Collections.sort(langCounts, (left, right) -> Double.compare(right.count, left.count));
    double percentageSum = 0.0;
    Map<ThriftLanguage, Double> languageHistogramMap =
        new HashMap<>(langCounts.size());
    int numAdded = 0;
    for (LanguageCount langCount : langCounts) {
      if (langCount.count == 0.0) {
        break;
      }
      double percentage = langCount.count / sumForLanguage;
      if (percentageSum > MIN_PERCENTAGE_SUM_REQUIRED
          && percentage < MIN_PERCENTAGE && numAdded >= 3) {
        break;
      }
      languageHistogramMap.put(langCount.lang, percentage);
      percentageSum += percentage;
      numAdded++;
    }
    results.setLanguageHistogram(languageHistogramMap);
  }
|
||||
/**
|
||||
* Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/".
|
||||
* We need to do this because of blobstore and it's suppose to be a temporary measure. This
|
||||
* code should be removed once we verified that all native photo URL being sent to Search
|
||||
* are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains
|
||||
* "p.twimg.com/"
|
||||
*
|
||||
* Please see SEARCH-783 and EVENTS-539 for more details.
|
||||
*
|
||||
* @param response response containing the facet results
|
||||
*/
|
||||
public static void fixNativePhotoUrl(EarlybirdResponse response) {
|
||||
if (response == null
|
||||
|| !response.isSetFacetResults()
|
||||
|| !response.getFacetResults().isSetFacetFields()) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (Map.Entry<String, ThriftFacetFieldResults> facetMapEntry
|
||||
: response.getFacetResults().getFacetFields().entrySet()) {
|
||||
final String facetResultField = facetMapEntry.getKey();
|
||||
|
||||
if (EarlybirdFieldConstant.TWIMG_FACET.equals(facetResultField)
|
||||
|| EarlybirdFieldConstant.IMAGES_FACET.equals(facetResultField)) {
|
||||
ThriftFacetFieldResults facetFieldResults = facetMapEntry.getValue();
|
||||
for (ThriftFacetCount facetCount : facetFieldResults.getTopFacets()) {
|
||||
replacePhotoUrl(facetCount.getMetadata());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/".
|
||||
* We need to do this because of blobstore and it's suppose to be a temporary measure. This
|
||||
* code should be removed once we verified that all native photo URL being sent to Search
|
||||
* are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains
|
||||
* "p.twimg.com/"
|
||||
*
|
||||
* Please see SEARCH-783 and EVENTS-539 for more details.
|
||||
*
|
||||
* @param termResultsCollection collection of ThriftTermResults containing the native photo URL
|
||||
*/
|
||||
public static void fixNativePhotoUrl(Collection<ThriftTermResults> termResultsCollection) {
|
||||
if (termResultsCollection == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
for (ThriftTermResults termResults : termResultsCollection) {
|
||||
if (!termResults.isSetMetadata()) {
|
||||
continue;
|
||||
}
|
||||
replacePhotoUrl(termResults.getMetadata());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Helper function for fixNativePhotoUrl()
|
||||
*/
|
||||
private static void replacePhotoUrl(ThriftFacetCountMetadata metadata) {
|
||||
if (metadata != null
|
||||
&& metadata.isSetNativePhotoUrl()) {
|
||||
String nativePhotoUrl = metadata.getNativePhotoUrl();
|
||||
nativePhotoUrl = nativePhotoUrl.replace("://p.twimg.com/", "://pbs.twimg.com/media/");
|
||||
metadata.setNativePhotoUrl(nativePhotoUrl);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Deepcopy of an EarlybirdResponse without explanation
|
||||
*/
|
||||
public static EarlybirdResponse deepCopyWithoutExplanation(EarlybirdResponse facetsResponse) {
|
||||
if (facetsResponse == null) {
|
||||
return null;
|
||||
} else if (!facetsResponse.isSetFacetResults()
|
||||
|| facetsResponse.getFacetResults().getFacetFieldsSize() == 0) {
|
||||
return facetsResponse.deepCopy();
|
||||
}
|
||||
EarlybirdResponse copy = facetsResponse.deepCopy();
|
||||
for (Map.Entry<String, ThriftFacetFieldResults> entry
|
||||
: copy.getFacetResults().getFacetFields().entrySet()) {
|
||||
if (entry.getValue().getTopFacetsSize() > 0) {
|
||||
for (ThriftFacetCount fc : entry.getValue().getTopFacets()) {
|
||||
fc.getMetadata().unsetExplanation();
|
||||
}
|
||||
}
|
||||
}
|
||||
return copy;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a comparator used to compare facet counts by calling
|
||||
* getFacetCountComparator(ThriftFacetFinalSortOrder). The sort order is determined by
|
||||
* the facetRankingOptions on the facet request.
|
||||
*/
|
||||
public static Comparator<ThriftFacetCount> getFacetCountComparator(
|
||||
ThriftFacetRequest facetRequest) {
|
||||
|
||||
ThriftFacetFinalSortOrder sortOrder = ThriftFacetFinalSortOrder.SCORE;
|
||||
|
||||
if (facetRequest.isSetFacetRankingOptions()
|
||||
&& facetRequest.getFacetRankingOptions().isSetFinalSortOrder()) {
|
||||
sortOrder = facetRequest.getFacetRankingOptions().getFinalSortOrder();
|
||||
}
|
||||
|
||||
return getFacetCountComparator(sortOrder);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a comparator using the specified order.
|
||||
*/
|
||||
public static Comparator<ThriftFacetCount> getFacetCountComparator(
|
||||
ThriftFacetFinalSortOrder sortOrder) {
|
||||
|
||||
switch (sortOrder) {
|
||||
case SIMPLE_COUNT: return SIMPLE_COUNT_COMPARATOR;
|
||||
case SCORE: return SCORE_COMPARATOR;
|
||||
case CREATED_AT: return CREATED_AT_COMPARATOR;
|
||||
case WEIGHTED_COUNT: return WEIGHTED_COUNT_COMPARATOR;
|
||||
default: return SCORE_COMPARATOR;
|
||||
}
|
||||
}
|
||||
|
||||
private static final Comparator<ThriftFacetCount> SIMPLE_COUNT_COMPARATOR =
|
||||
(count1, count2) -> {
|
||||
if (count1.simpleCount > count2.simpleCount) {
|
||||
return 1;
|
||||
} else if (count1.simpleCount < count2.simpleCount) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return count1.facetLabel.compareTo(count2.facetLabel);
|
||||
};
|
||||
|
||||
private static final Comparator<ThriftFacetCount> WEIGHTED_COUNT_COMPARATOR =
|
||||
(count1, count2) -> {
|
||||
if (count1.weightedCount > count2.weightedCount) {
|
||||
return 1;
|
||||
} else if (count1.weightedCount < count2.weightedCount) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
return SIMPLE_COUNT_COMPARATOR.compare(count1, count2);
|
||||
};
|
||||
|
||||
private static final Comparator<ThriftFacetCount> SCORE_COMPARATOR =
|
||||
(count1, count2) -> {
|
||||
if (count1.score > count2.score) {
|
||||
return 1;
|
||||
} else if (count1.score < count2.score) {
|
||||
return -1;
|
||||
}
|
||||
return SIMPLE_COUNT_COMPARATOR.compare(count1, count2);
|
||||
};
|
||||
|
||||
private static final Comparator<ThriftFacetCount> CREATED_AT_COMPARATOR =
|
||||
(count1, count2) -> {
|
||||
if (count1.isSetMetadata() && count1.getMetadata().isSetCreated_at()
|
||||
&& count2.isSetMetadata() && count2.getMetadata().isSetCreated_at()) {
|
||||
// more recent items have higher created_at values
|
||||
if (count1.getMetadata().getCreated_at() > count2.getMetadata().getCreated_at()) {
|
||||
return 1;
|
||||
} else if (count1.getMetadata().getCreated_at() < count2.getMetadata().getCreated_at()) {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return SCORE_COMPARATOR.compare(count1, count2);
|
||||
};
|
||||
}
|
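// Illustrative usage sketch (not part of the original commit): picking the
// comparator for a facet request and sorting merged facet counts with it.
// "request" and "mergedCounts" are hypothetical stand-ins.
class FacetSortExample {
  static void sortByRequestedOrder(ThriftFacetRequest request,
                                   java.util.List<ThriftFacetCount> mergedCounts) {
    // Falls back to SCORE order when the request sets no finalSortOrder.
    java.util.Comparator<ThriftFacetCount> comparator =
        FacetsResultsUtils.getFacetCountComparator(request);
    // The comparators above sort ascending; reverse if the caller wants the
    // highest-ranked facet first.
    mergedCounts.sort(comparator.reversed());
  }
}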
Binary file not shown.
@ -1,45 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.List;
import java.util.Set;

import com.google.common.collect.Lists;
import com.google.common.collect.Sets;

import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class ResponseMergerUtils {

  // Utility class, disallow instantiation.
  private ResponseMergerUtils() {
  }

  /**
   * Merges early termination infos from several earlybird responses.
   *
   * @param responses earlybird responses to merge the early termination infos from
   * @return merged early termination info
   */
  public static EarlyTerminationInfo mergeEarlyTerminationInfo(List<EarlybirdResponse> responses) {
    EarlyTerminationInfo etInfo = new EarlyTerminationInfo(false);
    Set<String> etReasonSet = Sets.newHashSet();
    // Fill in EarlyTerminationStatus
    for (EarlybirdResponse ebResp : responses) {
      if (ebResp.isSetEarlyTerminationInfo()
          && ebResp.getEarlyTerminationInfo().isEarlyTerminated()) {
        etInfo.setEarlyTerminated(true);
        if (ebResp.getEarlyTerminationInfo().isSetEarlyTerminationReason()) {
          etReasonSet.add(ebResp.getEarlyTerminationInfo().getEarlyTerminationReason());
        }
        if (ebResp.getEarlyTerminationInfo().isSetMergedEarlyTerminationReasons()) {
          etReasonSet.addAll(ebResp.getEarlyTerminationInfo().getMergedEarlyTerminationReasons());
        }
      }
    }
    if (etInfo.isEarlyTerminated()) {
      etInfo.setMergedEarlyTerminationReasons(Lists.newArrayList(etReasonSet));
    }
    return etInfo;
  }
}
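// Illustrative usage sketch (not part of the original commit): attaching the
// merged early termination info to a merged response. "mergedResponse" and
// "partitionResponses" are hypothetical variables.
class EarlyTerminationMergeExample {
  static void fillEarlyTermination(EarlybirdResponse mergedResponse,
                                   java.util.List<EarlybirdResponse> partitionResponses) {
    // The merged info is terminated if any partition terminated early, and it
    // carries the union of all distinct termination reasons.
    mergedResponse.setEarlyTerminationInfo(
        ResponseMergerUtils.mergeEarlyTerminationInfo(partitionResponses));
  }
}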
Binary file not shown.
@ -1,36 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.Map;

import com.google.common.base.Function;
import com.google.common.collect.Iterables;
import com.google.common.collect.Maps;

/**
 * Utility class used to help merge results.
 */
public final class ResultsUtil {
  private ResultsUtil() { }

  /**
   * Aggregates a list of responses in the following way:
   * 1. For each response, mapGetter can turn the response into a map.
   * 2. Dump all entries from the above map into a "total" map, which accumulates entries from
   *    all the responses.
   */
  public static <T, V> Map<T, Integer> aggregateCountMap(
      Iterable<V> responses,
      Function<V, Map<T, Integer>> mapGetter) {
    Map<T, Integer> total = Maps.newHashMap();
    for (Map<T, Integer> map : Iterables.transform(responses, mapGetter)) {
      if (map != null) {
        for (Map.Entry<T, Integer> entry : map.entrySet()) {
          T key = entry.getKey();
          total.put(key, total.containsKey(key)
              ? total.get(key) + entry.getValue() : entry.getValue());
        }
      }
    }
    return total;
  }
}
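// Illustrative usage sketch (not part of the original commit): accumulating
// per-language hit counts across several ThriftSearchResults, using the
// LANG_MAP_GETTER defined in ThriftSearchResultUtil further below.
class AggregateExample {
  static java.util.Map<com.twitter.search.common.constants.thriftjava.ThriftLanguage, Integer>
      mergeLanguageHistograms(
          Iterable<com.twitter.search.earlybird.thrift.ThriftSearchResults> shardResults) {
    // Each response is turned into its language histogram map, and entries
    // with the same key are summed across responses.
    return ResultsUtil.aggregateCountMap(
        shardResults, ThriftSearchResultUtil.LANG_MAP_GETTER);
  }
}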
Binary file not shown.
@ -1,47 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.concurrent.TimeUnit;

import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;

/**
 * A utility class to provide some functions for TermStatistics request processing.
 */
public final class TermStatisticsUtil {

  private static final org.slf4j.Logger LOG =
      org.slf4j.LoggerFactory.getLogger(TermStatisticsUtil.class);

  private TermStatisticsUtil() {
  }

  /**
   * Determines the bin size based on the settings in ThriftHistogramSettings.granularity.
   */
  public static int determineBinSize(ThriftHistogramSettings histogramSettings) {
    final int DEFAULT_BINSIZE = (int) TimeUnit.HOURS.toSeconds(1);
    int binSize;
    switch (histogramSettings.getGranularity()) {
      case DAYS:
        binSize = (int) TimeUnit.DAYS.toSeconds(1);
        break;
      case HOURS:
        binSize = (int) TimeUnit.HOURS.toSeconds(1);
        break;
      case MINUTES:
        binSize = (int) TimeUnit.MINUTES.toSeconds(1);
        break;
      case CUSTOM:
        binSize = histogramSettings.isSetBinSizeInSeconds()
            ? histogramSettings.getBinSizeInSeconds()
            : DEFAULT_BINSIZE;
        break;
      default:
        binSize = DEFAULT_BINSIZE;
        LOG.warn("Unknown ThriftHistogramGranularityType {} using default binsize: {}",
            histogramSettings.getGranularity(), DEFAULT_BINSIZE);
    }

    return binSize;
  }
}
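// Illustrative usage sketch (not part of the original commit): DAYS maps to
// 86400-second bins; CUSTOM falls back to one hour when no explicit bin size
// is set. The thrift-generated setGranularity setter is assumed here.
class BinSizeExample {
  static int daysBinSize(ThriftHistogramSettings settings) {
    settings.setGranularity(ThriftHistogramGranularityType.DAYS);
    return TermStatisticsUtil.determineBinSize(settings); // 86400
  }
}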
Binary file not shown.
@ -1,29 +0,0 @@
package com.twitter.search.common.util.earlybird;

import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;

/**
 * Utility class for constructing ThriftSearchQuery.
 */
public final class ThriftSearchQueryUtil {
  private ThriftSearchQueryUtil() { }

  /**
   * Convenience method for constructing a ThriftSearchQuery.
   */
  public static ThriftSearchQuery newSearchQuery(String serializedQuery, int numResults) {
    ThriftSearchQuery searchQuery = new ThriftSearchQuery();
    searchQuery.setSerializedQuery(serializedQuery);
    searchQuery.setCollectorParams(new CollectorParams().setNumResultsToReturn(numResults));
    return searchQuery;
  }

  /** Determines if the given request was initiated by a logged-in user. */
  public static boolean requestInitiatedByLoggedInUser(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    return (searchQuery != null) && searchQuery.isSetSearcherId()
        && (searchQuery.getSearcherId() > 0);
  }
}
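// Illustrative usage sketch (not part of the original commit): building a
// minimal query and marking it as logged-in. The serialized query string is
// hypothetical; the thrift-generated setSearcherId setter is assumed.
class SearchQueryExample {
  static ThriftSearchQuery loggedInQuery(long searcherId) {
    ThriftSearchQuery query = ThriftSearchQueryUtil.newSearchQuery("(cat dog)", 20);
    // A positive searcher ID is exactly what requestInitiatedByLoggedInUser checks.
    query.setSearcherId(searcherId);
    return query;
  }
}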
Binary file not shown.
@ -1,209 +0,0 @@
package com.twitter.search.common.util.earlybird;

import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.base.Predicates;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.relevance.ranking.ActionChain;
import com.twitter.search.common.relevance.ranking.filters.ExactDuplicateFilter;
import com.twitter.search.common.relevance.text.VisibleTokenRatioNormalizer;
import com.twitter.search.common.runtime.ActionChainDebugManager;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;
import com.twitter.search.earlybird.thrift.ThriftFacetResults;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResults;
import com.twitter.search.earlybird.thrift.ThriftTweetSource;

/**
 * ThriftSearchResultUtil contains some simple static methods for constructing
 * ThriftSearchResult objects.
 */
public final class ThriftSearchResultUtil {
  private ThriftSearchResultUtil() { }

  private static final VisibleTokenRatioNormalizer NORMALIZER =
      VisibleTokenRatioNormalizer.createInstance();

  public static final Function<ThriftSearchResults, Map<ThriftLanguage, Integer>> LANG_MAP_GETTER =
      searchResults -> searchResults.getLanguageHistogram();
  public static final Function<ThriftSearchResults, Map<Long, Integer>> HIT_COUNTS_MAP_GETTER =
      searchResults -> searchResults.getHitCounts();

  // Some useful Predicates
  public static final Predicate<ThriftSearchResult> IS_OFFENSIVE_TWEET =
      result -> {
        if (result != null && result.isSetMetadata()) {
          ThriftSearchResultMetadata metadata = result.getMetadata();
          return metadata.isIsOffensive();
        } else {
          return false;
        }
      };

  public static final Predicate<ThriftSearchResult> IS_TOP_TWEET =
      result -> result != null
          && result.isSetMetadata()
          && result.getMetadata().isSetResultType()
          && result.getMetadata().getResultType() == ThriftSearchResultType.POPULAR;

  public static final Predicate<ThriftSearchResult> FROM_FULL_ARCHIVE =
      result -> result != null
          && result.isSetTweetSource()
          && result.getTweetSource() == ThriftTweetSource.FULL_ARCHIVE_CLUSTER;

  public static final Predicate<ThriftSearchResult> IS_FULL_ARCHIVE_TOP_TWEET =
      Predicates.and(FROM_FULL_ARCHIVE, IS_TOP_TWEET);

  public static final Predicate<ThriftSearchResult> IS_NSFW_BY_ANY_MEANS_TWEET =
      result -> {
        if (result != null && result.isSetMetadata()) {
          ThriftSearchResultMetadata metadata = result.getMetadata();
          return metadata.isIsUserNSFW()
              || metadata.isIsOffensive()
              || metadata.getExtraMetadata().isIsSensitiveContent();
        } else {
          return false;
        }
      };

  /**
   * Returns the number of underlying ThriftSearchResult results.
   */
  public static int numResults(ThriftSearchResults results) {
    if (results == null || !results.isSetResults()) {
      return 0;
    } else {
      return results.getResultsSize();
    }
  }

  /**
   * Returns the list of tweet IDs in ThriftSearchResults.
   * Returns null if there are no results.
   */
  @Nullable
  public static List<Long> getTweetIds(ThriftSearchResults results) {
    if (numResults(results) > 0) {
      return getTweetIds(results.getResults());
    } else {
      return null;
    }
  }

  /**
   * Returns the list of tweet IDs in a list of ThriftSearchResult.
   * Returns null if there are no results.
   */
  public static List<Long> getTweetIds(@Nullable List<ThriftSearchResult> results) {
    if (results != null && results.size() > 0) {
      return Lists.newArrayList(Iterables.transform(
          results,
          searchResult -> searchResult.getId()
      ));
    }
    return null;
  }

  /**
   * Given ThriftSearchResults, builds a map from tweet ID to the tweet's metadata.
   */
  public static Map<Long, ThriftSearchResultMetadata> getTweetMetadataMap(
      Schema schema, ThriftSearchResults results) {
    Map<Long, ThriftSearchResultMetadata> resultMap = Maps.newHashMap();
    if (results == null || results.getResultsSize() == 0) {
      return resultMap;
    }
    for (ThriftSearchResult searchResult : results.getResults()) {
      resultMap.put(searchResult.getId(), searchResult.getMetadata());
    }
    return resultMap;
  }

  /**
   * Returns the total number of facet results in ThriftFacetResults, by summing up the number
   * of facet results in each field.
   */
  public static int numFacetResults(ThriftFacetResults results) {
    if (results == null || !results.isSetFacetFields()) {
      return 0;
    } else {
      int numResults = 0;
      for (ThriftFacetFieldResults field : results.getFacetFields().values()) {
        if (field.isSetTopFacets()) {
          numResults += field.topFacets.size();
        }
      }
      return numResults;
    }
  }

  /**
   * Updates the search statistics on base, by adding the corresponding stats from delta.
   */
  public static void incrementCounts(ThriftSearchResults base,
                                     ThriftSearchResults delta) {
    if (delta.isSetNumHitsProcessed()) {
      base.setNumHitsProcessed(base.getNumHitsProcessed() + delta.getNumHitsProcessed());
    }
    if (delta.isSetNumPartitionsEarlyTerminated() && delta.getNumPartitionsEarlyTerminated() > 0) {
      // This is currently used for merging results on a single earlybird, so we don't sum up all
      // the counts; we just set it to 1 if we see one that was early terminated.
      base.setNumPartitionsEarlyTerminated(1);
    }
    if (delta.isSetMaxSearchedStatusID()) {
      long deltaMax = delta.getMaxSearchedStatusID();
      if (!base.isSetMaxSearchedStatusID() || deltaMax > base.getMaxSearchedStatusID()) {
        base.setMaxSearchedStatusID(deltaMax);
      }
    }
    if (delta.isSetMinSearchedStatusID()) {
      long deltaMin = delta.getMinSearchedStatusID();
      if (!base.isSetMinSearchedStatusID() || deltaMin < base.getMinSearchedStatusID()) {
        base.setMinSearchedStatusID(deltaMin);
      }
    }
    if (delta.isSetScore()) {
      if (base.isSetScore()) {
        base.setScore(base.getScore() + delta.getScore());
      } else {
        base.setScore(delta.getScore());
      }
    }
  }

  /**
   * Removes the duplicates from the given list of results.
   *
   * @param results The list of ThriftSearchResults.
   * @return The given list with duplicates removed.
   */
  public static List<ThriftSearchResult> removeDuplicates(List<ThriftSearchResult> results) {
    ActionChain<ThriftSearchResult> filterChain =
        ActionChainDebugManager
            .<ThriftSearchResult>createActionChainBuilder("RemoveDuplicatesFilters")
            .appendActions(new ExactDuplicateFilter())
            .build();
    return filterChain.apply(results);
  }

  /**
   * Returns the ranking score from Earlybird shard-based ranking models if any, and 0 otherwise.
   */
  public static double getTweetScore(@Nullable ThriftSearchResult result) {
    if (result == null || !result.isSetMetadata() || !result.getMetadata().isSetScore()) {
      return 0.0;
    }
    return result.getMetadata().getScore();
  }
}
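// Illustrative usage sketch (not part of the original commit): folding the
// search statistics of several per-segment responses into one accumulator,
// the way a merger might use incrementCounts. "segments" is hypothetical.
class MergeCountsExample {
  static ThriftSearchResults mergeStats(java.util.List<ThriftSearchResults> segments) {
    ThriftSearchResults merged = new ThriftSearchResults();
    for (ThriftSearchResults segment : segments) {
      // Sums hit counts, widens the min/max searched status ID range, and
      // accumulates scores.
      ThriftSearchResultUtil.incrementCounts(merged, segment);
    }
    return merged;
  }
}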
Binary file not shown.
@ -1,46 +0,0 @@
package com.twitter.search.common.util.earlybird;

import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

public final class ThriftSearchResultsRelevanceStatsUtil {
  private ThriftSearchResultsRelevanceStatsUtil() { }

  /**
   * Adding ThriftSearchResultsRelevanceStats from one set of results onto a base set.
   * Assumes all values are set on both of the inputs.
   *
   * @param base the stats to add to.
   * @param delta the stats to be added.
   */
  public static void addRelevanceStats(ThriftSearchResultsRelevanceStats base,
                                       ThriftSearchResultsRelevanceStats delta) {
    base.setNumScored(base.getNumScored() + delta.getNumScored());
    base.setNumSkipped(base.getNumSkipped() + delta.getNumSkipped());
    base.setNumSkippedForAntiGaming(
        base.getNumSkippedForAntiGaming() + delta.getNumSkippedForAntiGaming());
    base.setNumSkippedForLowReputation(
        base.getNumSkippedForLowReputation() + delta.getNumSkippedForLowReputation());
    base.setNumSkippedForLowTextScore(
        base.getNumSkippedForLowTextScore() + delta.getNumSkippedForLowTextScore());
    base.setNumSkippedForSocialFilter(
        base.getNumSkippedForSocialFilter() + delta.getNumSkippedForSocialFilter());
    base.setNumSkippedForLowFinalScore(
        base.getNumSkippedForLowFinalScore() + delta.getNumSkippedForLowFinalScore());
    if (delta.getOldestScoredTweetAgeInSeconds() > base.getOldestScoredTweetAgeInSeconds()) {
      base.setOldestScoredTweetAgeInSeconds(delta.getOldestScoredTweetAgeInSeconds());
    }

    base.setNumFromDirectFollows(base.getNumFromDirectFollows() + delta.getNumFromDirectFollows());
    base.setNumFromTrustedCircle(base.getNumFromTrustedCircle() + delta.getNumFromTrustedCircle());
    base.setNumReplies(base.getNumReplies() + delta.getNumReplies());
    base.setNumRepliesTrusted(base.getNumRepliesTrusted() + delta.getNumRepliesTrusted());
    base.setNumRepliesOutOfNetwork(
        base.getNumRepliesOutOfNetwork() + delta.getNumRepliesOutOfNetwork());
    base.setNumSelfTweets(base.getNumSelfTweets() + delta.getNumSelfTweets());
    base.setNumWithMedia(base.getNumWithMedia() + delta.getNumWithMedia());
    base.setNumWithNews(base.getNumWithNews() + delta.getNumWithNews());
    base.setNumSpamUser(base.getNumSpamUser() + delta.getNumSpamUser());
    base.setNumOffensive(base.getNumOffensive() + delta.getNumOffensive());
    base.setNumBot(base.getNumBot() + delta.getNumBot());
  }
}
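// Illustrative usage sketch (not part of the original commit): accumulating
// relevance stats across shard responses. addRelevanceStats assumes every
// field is set on both inputs; thrift-generated numeric getters default to 0
// on a freshly constructed base, so the fold below starts from all zeros.
class RelevanceStatsExample {
  static ThriftSearchResultsRelevanceStats fold(
      java.util.List<ThriftSearchResultsRelevanceStats> shardStats) {
    ThriftSearchResultsRelevanceStats base = new ThriftSearchResultsRelevanceStats();
    for (ThriftSearchResultsRelevanceStats delta : shardStats) {
      ThriftSearchResultsRelevanceStatsUtil.addRelevanceStats(base, delta);
    }
    return base;
  }
}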
@ -1,18 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    provides = artifact(
        org = "com.twitter.search.common.util",
        name = "lang",
        repo = artifactory,
    ),
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/text/language:locale-util",
        "src/thrift/com/twitter/search/common:constants-java",
    ],
)
BIN
src/java/com/twitter/search/common/util/lang/BUILD.docx
Normal file
Binary file not shown.
@ -1,141 +0,0 @@
package com.twitter.search.common.util.lang;

import java.lang.reflect.Field;
import java.util.Locale;
import java.util.Map;

import javax.annotation.Nullable;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.text.language.LocaleUtil;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;

/**
 * This class can be used to convert ThriftLanguage to Locale objects and vice versa.
 */
public final class ThriftLanguageUtil {
  private static final Logger LOG = LoggerFactory.getLogger(ThriftLanguageUtil.class.getName());

  // stores ThriftLanguage.id -> Locale mapping
  private static final Locale[] LOCALES;

  // stores Locale -> ThriftLanguage mapping
  private static final Map<Locale, ThriftLanguage> THRIFT_LANGUAGES;

  static {
    LOCALES = new Locale[ThriftLanguage.values().length];
    Map<Locale, ThriftLanguage> thriftLanguageMap = Maps.newHashMap();

    // get all languages defined in ThriftLanguage
    Field[] fields = ThriftLanguage.class.getDeclaredFields();
    for (Field field : fields) {
      if (!field.isEnumConstant()) {
        continue;
      }

      try {
        ThriftLanguage thriftLang = (ThriftLanguage) field.get(null);
        String thriftLanguageName = field.getName();

        // get the corresponding Locale declared in LocaleUtil
        try {
          Field localeUtilField = LocaleUtil.class.getDeclaredField(thriftLanguageName);
          Locale localeLang = (Locale) localeUtilField.get(null);

          LOCALES[thriftLang.getValue()] = localeLang;
          thriftLanguageMap.put(localeLang, thriftLang);
        } catch (NoSuchFieldException e) {
          LOG.warn("{} is defined in ThriftLanguage, but not in LocaleUtil.", thriftLanguageName);
        }
      } catch (IllegalAccessException e) {
        // shouldn't happen.
        LOG.warn("Could not get a declared field.", e);
      }
    }

    // Let's make sure that all Locales defined in LocaleUtil are also defined in ThriftLanguage
    for (Locale lang : LocaleUtil.getDefinedLanguages()) {
      if (!thriftLanguageMap.containsKey(lang)) {
        LOG.warn("{} is defined in LocaleUtil but not in ThriftLanguage.", lang.getLanguage());
      }
    }

    THRIFT_LANGUAGES = ImmutableMap.copyOf(thriftLanguageMap);
  }

  private ThriftLanguageUtil() {
  }

  /**
   * Returns a Locale object which corresponds to a given ThriftLanguage object.
   * @param language ThriftLanguage object
   * @return a corresponding Locale object
   */
  public static Locale getLocaleOf(ThriftLanguage language) {
    // Note that ThriftLanguage.findByValue() can return null (thrift generated code).
    // So ThriftLanguageUtil.getLocaleOf needs to handle null correctly.
    if (language == null) {
      return LocaleUtil.UNKNOWN;
    }

    Preconditions.checkArgument(language.getValue() < LOCALES.length);
    return LOCALES[language.getValue()];
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given Locale object.
   *
   * @param language Locale object
   * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one.
   */
  public static ThriftLanguage getThriftLanguageOf(Locale language) {
    Preconditions.checkNotNull(language);
    ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(language);
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given language code.
   *
   * @param languageCode BCP-47 language code
   * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one.
   */
  public static ThriftLanguage getThriftLanguageOf(String languageCode) {
    Preconditions.checkNotNull(languageCode);
    ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(LocaleUtil.getLocaleOf(languageCode));
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns a ThriftLanguage object which corresponds to a given int value.
   * If the value is not valid, returns ThriftLanguage.UNKNOWN.
   * @param value value of the language
   * @return a corresponding ThriftLanguage object
   */
  public static ThriftLanguage safeFindByValue(int value) {
    ThriftLanguage thriftLang = ThriftLanguage.findByValue(value);
    return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang;
  }

  /**
   * Returns the language code which corresponds to a given ThriftLanguage.
   *
   * Note that multiple ThriftLanguage entries can return the same language code.
   *
   * @param thriftLang ThriftLanguage object
   * @return the corresponding language code, or null if thriftLang is null.
   */
  @Nullable
  public static String getLanguageCodeOf(@Nullable ThriftLanguage thriftLang) {
    if (thriftLang == null) {
      return null;
    }
    return ThriftLanguageUtil.getLocaleOf(thriftLang).getLanguage();
  }
}
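// Illustrative usage sketch (not part of the original commit): the lookups
// fall back to UNKNOWN rather than returning null. Whether "ja" resolves to
// a defined language depends on LocaleUtil's table, so that part is assumed.
class LanguageLookupExample {
  static void demo() {
    // String lookup goes through LocaleUtil; unrecognized codes map to UNKNOWN.
    ThriftLanguage japanese = ThriftLanguageUtil.getThriftLanguageOf("ja");
    // Out-of-range int values also fall back to UNKNOWN instead of null.
    ThriftLanguage unknown = ThriftLanguageUtil.safeFindByValue(-1);
    // Null-safe language code extraction.
    String code = ThriftLanguageUtil.getLanguageCodeOf(japanese);
  }
}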
@ -1,16 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/it/unimi/dsi:fastutil",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/util/io",
    ],
)
BIN
src/java/com/twitter/search/common/util/ml/BUILD.docx
Normal file
Binary file not shown.
@ -1,141 +0,0 @@
package com.twitter.search.common.util.ml;

import java.io.IOException;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.Map;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicates;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Maps;

import com.twitter.search.common.file.AbstractFile;
import com.twitter.search.common.util.io.TextFileLoadingUtils;

/**
 * Represents a linear model for scoring and classification.
 *
 * The list of features is defined by an Enum class. The model weights and instances are
 * represented as maps that must contain an entry for all the values of the enum.
 */
public class EnumBasedLinearModel<K extends Enum<K>> implements MapBasedLinearModel<K> {

  private final EnumSet<K> features;
  private final EnumMap<K, Float> weights;

  /**
   * Creates a model from a map of weights.
   *
   * @param enumType Enum used for the keys
   * @param weights Feature weights.
   */
  public EnumBasedLinearModel(Class<K> enumType, Map<K, Float> weights) {
    features = EnumSet.allOf(enumType);
    EnumMap<K, Float> enumWeights =
        new EnumMap<>(Maps.filterValues(weights, Predicates.notNull()));
    Preconditions.checkArgument(features.equals(enumWeights.keySet()),
        "The model does not include weights for all the available features");

    this.weights = enumWeights;
  }

  public ImmutableMap<K, Float> getWeights() {
    return Maps.immutableEnumMap(weights);
  }

  @Override
  public float score(Map<K, Float> instance) {
    float total = 0;
    for (Map.Entry<K, Float> weightEntry : weights.entrySet()) {
      Float feature = instance.get(weightEntry.getKey());
      if (feature != null) {
        total += weightEntry.getValue() * feature;
      }
    }
    return total;
  }

  /**
   * Determines whether an instance is positive.
   */
  @Override
  public boolean classify(float threshold, Map<K, Float> instance) {
    return score(instance) > threshold;
  }

  @Override
  public boolean classify(Map<K, Float> instance) {
    return classify(0, instance);
  }

  @Override
  public String toString() {
    return String.format("EnumBasedLinearModel[%s]", weights);
  }

  /**
   * Creates a model where all the features have the same weight.
   * This method is useful for generating the feature vectors for training a new model.
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createWithEqualWeight(Class<T> enumType,
                                                                                  Float weight) {
    EnumSet<T> features = EnumSet.allOf(enumType);
    EnumMap<T, Float> weights = Maps.newEnumMap(enumType);
    for (T feature : features) {
      weights.put(feature, weight);
    }
    return new EnumBasedLinearModel<>(enumType, weights);
  }

  /**
   * Loads the model from a TSV file with the following format:
   *
   * feature_name \t weight
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createFromFile(
      Class<T> enumType, AbstractFile path) throws IOException {
    return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, true));
  }

  /**
   * Loads the model from a TSV file, using a default weight of 0 for missing features.
   *
   * File format:
   *
   * feature_name \t weight
   */
  public static <T extends Enum<T>> EnumBasedLinearModel<T> createFromFileSafe(
      Class<T> enumType, AbstractFile path) throws IOException {
    return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, false));
  }

  /**
   * Creates a map of (feature_name, weight) from a TSV file.
   *
   * If strictMode is true, it will throw an exception if the file doesn't contain all the
   * features declared in the enum. Otherwise, it will use zero as the default value.
   */
  private static <T extends Enum<T>> EnumMap<T, Float> loadWeights(
      Class<T> enumType, AbstractFile fileHandle, boolean strictMode) throws IOException {
    Map<String, Float> weightsFromFile =
        TextFileLoadingUtils.loadMapFromFile(fileHandle, input -> Float.parseFloat(input));
    EnumMap<T, Float> weights = Maps.newEnumMap(enumType);
    Set<T> expectedFeatures = EnumSet.allOf(enumType);
    if (!strictMode) {
      for (T feature : expectedFeatures) {
        weights.put(feature, 0f);
      }
    }
    for (String featureName : weightsFromFile.keySet()) {
      Float weight = weightsFromFile.get(featureName);
      weights.put(Enum.valueOf(enumType, featureName.toUpperCase()), weight);
    }
    Preconditions.checkArgument(expectedFeatures.equals(weights.keySet()),
        "Model does not contain weights for all the features");
    return weights;
  }
}
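// Illustrative usage sketch (not part of the original commit): a two-feature
// model scored against a sparse instance. The enum and weights are
// hypothetical.
class EnumModelExample {
  enum Feat { TEXT_SCORE, RECENCY }

  static boolean demo() {
    java.util.Map<Feat, Float> weights = new java.util.EnumMap<>(Feat.class);
    weights.put(Feat.TEXT_SCORE, 0.8f);
    weights.put(Feat.RECENCY, 0.2f);
    // The constructor requires a weight for every enum value.
    EnumBasedLinearModel<Feat> model = new EnumBasedLinearModel<>(Feat.class, weights);

    java.util.Map<Feat, Float> instance = new java.util.EnumMap<>(Feat.class);
    instance.put(Feat.TEXT_SCORE, 1.0f); // RECENCY is absent and contributes 0
    return model.classify(instance);     // score = 0.8 > 0 -> true
  }
}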
BIN
src/java/com/twitter/search/common/util/ml/FeatureUtils.docx
Normal file
Binary file not shown.
@ -1,120 +0,0 @@
package com.twitter.search.common.util.ml;

import java.util.List;
import java.util.Map;
import java.util.Optional;

import com.google.common.base.Preconditions;
import com.google.common.collect.Sets;

/**
 * Utilities for feature transformation and extraction.
 */
public final class FeatureUtils {

  private FeatureUtils() {
  }

  /**
   * Computes the difference between 2 values and returns the ratio of the difference over the
   * minimum of both, according to these cases:
   *
   * 1. if (a > b) return (a - b) / b
   * 2. if (a < b) return -(b - a) / a
   * 3. if (a == b) return 0
   *
   * The upper/lower limit is (-) maxRatio. For cases 1 and 2, if the denominator is 0,
   * it returns (-) maxRatio.
   *
   * This method is used to define a feature that tells how much larger or smaller the
   * first value is with respect to the second one.
   */
  public static float diffRatio(float a, float b, float maxRatio) {
    float diff = a - b;
    if (diff == 0) {
      return 0;
    }
    float denominator = Math.min(a, b);
    float ratio = denominator != 0 ? Math.abs(diff / denominator) : maxRatio;
    return Math.copySign(Math.min(ratio, maxRatio), diff);
  }

  /**
   * Computes the cosine similarity between two maps that represent sparse vectors.
   */
  public static <K, V extends Number> double cosineSimilarity(
      Map<K, V> vector1, Map<K, V> vector2) {
    if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) {
      return 0;
    }
    double squaredSum1 = 0;
    double squaredSum2 = 0;
    double squaredCrossSum = 0;

    for (K key : Sets.union(vector1.keySet(), vector2.keySet())) {
      double value1 = 0;
      double value2 = 0;

      V optValue1 = vector1.get(key);
      if (optValue1 != null) {
        value1 = optValue1.doubleValue();
      }
      V optValue2 = vector2.get(key);
      if (optValue2 != null) {
        value2 = optValue2.doubleValue();
      }

      squaredSum1 += value1 * value1;
      squaredSum2 += value2 * value2;
      squaredCrossSum += value1 * value2;
    }

    if (squaredSum1 == 0 || squaredSum2 == 0) {
      return 0;
    } else {
      return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2);
    }
  }

  /**
   * Computes the cosine similarity between two (dense) vectors.
   */
  public static <V extends Number> double cosineSimilarity(
      List<V> vector1, List<V> vector2) {
    if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) {
      return 0;
    }

    Preconditions.checkArgument(vector1.size() == vector2.size());
    double squaredSum1 = 0;
    double squaredSum2 = 0;
    double squaredCrossSum = 0;
    for (int i = 0; i < vector1.size(); i++) {
      double value1 = vector1.get(i).doubleValue();
      double value2 = vector2.get(i).doubleValue();
      squaredSum1 += value1 * value1;
      squaredSum2 += value2 * value2;
      squaredCrossSum += value1 * value2;
    }

    if (squaredSum1 == 0 || squaredSum2 == 0) {
      return 0;
    } else {
      return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2);
    }
  }

  /**
   * Finds the key of the map with the highest value (compared in natural order).
   */
  @SuppressWarnings("unchecked")
  public static <K, V extends Comparable> Optional<K> findMaxKey(Map<K, V> map) {
    if (map == null || map.isEmpty()) {
      return Optional.empty();
    }

    Optional<Map.Entry<K, V>> maxEntry = map.entrySet().stream().max(Map.Entry.comparingByValue());
    return maxEntry.map(Map.Entry::getKey);
  }
}
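// Illustrative sketch (not part of the original commit): worked values for
// the helpers above.
class FeatureUtilsExample {
  static void demo() {
    float r1 = FeatureUtils.diffRatio(6f, 2f, 10f); //  (6-2)/2 =  2.0
    float r2 = FeatureUtils.diffRatio(2f, 6f, 10f); // -(6-2)/2 = -2.0
    float r3 = FeatureUtils.diffRatio(5f, 0f, 10f); // denominator 0 -> 10.0

    java.util.Map<String, Double> v1 =
        com.google.common.collect.ImmutableMap.of("a", 1.0, "b", 2.0);
    java.util.Map<String, Double> v2 =
        com.google.common.collect.ImmutableMap.of("b", 2.0, "c", 3.0);
    // Only "b" overlaps: 4 / sqrt(5 * 13) ~= 0.496
    double sim = FeatureUtils.cosineSimilarity(v1, v2);
  }
}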
Binary file not shown.
@ -1,32 +0,0 @@
package com.twitter.search.common.util.ml;

import java.util.Map;

/**
 * An interface for linear models that are backed by some sort of map.
 */
public interface MapBasedLinearModel<K> {
  /**
   * Evaluates an instance using this model, given a feature vector.
   * @param instance The feature vector in the format of a hashmap.
   * @return Whether the instance is classified as positive.
   */
  boolean classify(Map<K, Float> instance);

  /**
   * Evaluates an instance using this model, given a classification threshold and a feature vector.
   * @param threshold Score threshold used for classification.
   * @param instance The feature vector in the format of a hashmap.
   * @return Whether the instance's score exceeds the threshold.
   */
  boolean classify(float threshold, Map<K, Float> instance);

  /**
   * Computes the score of an instance as a linear combination of the features and the model
   * weights. 0 is used as the default value for features or weights that are not present.
   *
   * @param instance The feature vector in the format of a hashmap.
   * @return The instance score according to the model.
   */
  float score(Map<K, Float> instance);
}
Binary file not shown.
@ -1,125 +0,0 @@
package com.twitter.search.common.util.ml;

import java.util.Map;

import com.google.common.annotations.VisibleForTesting;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.base.Function;
import com.twitter.search.common.file.AbstractFile;
import com.twitter.search.common.util.io.TextFileLoadingUtils;

import it.unimi.dsi.fastutil.objects.Object2FloatMap;
import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap;

/**
 * Represents a linear model for scoring and classification.
 *
 * Features are represented as arbitrary strings, making this a fairly flexible implementation
 * (at the cost of some performance, since all operations require hash lookups). Instances
 * and weights are both encoded sparsely (as maps), so this implementation is well suited to
 * models with large feature sets where most features are inactive at a given time. Weights
 * for unknown features are assumed to be 0.
 */
public class StringMapBasedLinearModel implements MapBasedLinearModel<String> {
  private static final Logger LOG = LoggerFactory.getLogger(StringMapBasedLinearModel.class);

  protected final Object2FloatMap<String> model = new Object2FloatOpenHashMap<>();

  /**
   * Creates a model from a map of weights.
   *
   * @param weights Feature weights.
   */
  public StringMapBasedLinearModel(Map<String, Float> weights) {
    model.putAll(weights);
    model.defaultReturnValue(0.0f);
  }

  /**
   * Gets the weight of a feature.
   * @param featureName The name of the feature.
   * @return The weight, or 0 if the feature is unknown.
   */
  public float getWeight(String featureName) {
    return model.getFloat(featureName);
  }

  /**
   * Gets the full weight map.
   */
  @VisibleForTesting
  protected Map<String, Float> getWeights() {
    return model;
  }

  /**
   * Evaluates an instance using this model, given a feature vector.
   * @param values The feature vector in the format of a hashmap.
   * @return The instance score according to the model.
   */
  @Override
  public float score(Map<String, Float> values) {
    float score = 0.0f;
    for (Map.Entry<String, Float> value : values.entrySet()) {
      String featureName = value.getKey();
      float weight = getWeight(featureName);
      if (weight != 0.0f) {
        score += weight * value.getValue();
        if (LOG.isDebugEnabled()) {
          LOG.debug(String.format("%s = %.3f * %.3f = %.3f, ",
              featureName, weight, value.getValue(),
              weight * value.getValue()));
        }
      }
    }
    if (LOG.isDebugEnabled()) {
      LOG.debug(String.format("Score = %.3f", score));
    }
    return score;
  }

  /**
   * Determines whether an instance is positive.
   */
  @Override
  public boolean classify(Map<String, Float> values) {
    return classify(0.0f, values);
  }

  @Override
  public boolean classify(float threshold, Map<String, Float> values) {
    return score(values) > threshold;
  }

  public int size() {
    return model.size();
  }

  @Override
  public String toString() {
    StringBuilder sb = new StringBuilder();
    sb.append("StringMapBasedLinearModel[");
    for (Map.Entry<String, Float> entry : model.entrySet()) {
      sb.append(String.format("(%s = %.3f), ", entry.getKey(), entry.getValue()));
    }
    sb.append("]");
    return sb.toString();
  }

  /**
   * Loads the model from a TSV file with the following format:
   *
   * feature_name \t weight
   */
  public static StringMapBasedLinearModel loadFromFile(AbstractFile fileHandle) {
    Map<String, Float> weights =
        TextFileLoadingUtils.loadMapFromFile(
            fileHandle,
            (Function<String, Float>) item -> Float.parseFloat(item));
    return new StringMapBasedLinearModel(weights);
  }
}
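// Illustrative usage sketch (not part of the original commit): string-keyed
// scoring where unknown features contribute 0. The weights are hypothetical.
class StringModelExample {
  static boolean demo() {
    StringMapBasedLinearModel model = new StringMapBasedLinearModel(
        com.google.common.collect.ImmutableMap.of("has_image", 1.5f, "is_reply", -0.5f));
    java.util.Map<String, Float> instance =
        com.google.common.collect.ImmutableMap.of("has_image", 1.0f, "unknown_feature", 3.0f);
    // score = 1.5 * 1.0 + 0 * 3.0 = 1.5
    return model.classify(1.0f, instance); // 1.5 > 1.0 -> true
  }
}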
@ -1,14 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    strict_deps = True,
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/jvm/org/yaml:snakeyaml",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
    ],
)
Binary file not shown.
@ -1,293 +0,0 @@
|
||||
package com.twitter.search.common.util.ml.models_manager;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Function;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Strings;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.collect.Sets;
|
||||
import com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.yaml.snakeyaml.Yaml;
|
||||
|
||||
import com.twitter.search.common.file.AbstractFile;
|
||||
import com.twitter.search.common.file.FileUtils;
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.metrics.SearchLongGauge;
|
||||
|
||||
/**
|
||||
* Loads models from HDFS and provides an interface for reloading them periodically.
|
||||
*
|
||||
* There are 2 possible ways of detecting the active models:
|
||||
*
|
||||
* - DirectorySupplier: Uses all the subdirectories of a base path
|
||||
* - ConfigSupplier: Gets the list from from a configuration file
|
||||
*
|
||||
* Models can be updated or added. Depending on the selected method, existing models can be removed
|
||||
* if they are no longer active.
|
||||
*/
|
||||
public abstract class BaseModelsManager<T> implements Runnable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(BaseModelsManager.class);
|
||||
|
||||
protected final Map<String, Long> lastModifiedMsByModel = new ConcurrentHashMap<>();
|
||||
protected final Map<String, T> loadedModels = new ConcurrentHashMap<>();
|
||||
protected final Supplier<Map<String, AbstractFile>> activeModelsSupplier;
|
||||
|
||||
protected Map<String, T> prevLoadedModels = new ConcurrentHashMap<>();
|
||||
|
||||
// This flag determines whether models are unloaded immediately when they're removed from
|
||||
// activeModelsSupplier. If false, old models stay in memory until the process is restarted.
|
||||
// This may be useful to safely change model configuration without restarting.
|
||||
protected final boolean shouldUnloadInactiveModels;
|
||||
|
||||
protected final SearchLongGauge numModels;
|
||||
protected final SearchCounter numErrors;
|
||||
protected final SearchLongGauge lastLoadedMs;
|
||||
|
||||
protected Supplier<Boolean> shouldServeModels;
|
||||
protected Supplier<Boolean> shouldLoadModels;
|
||||
|
||||
public BaseModelsManager(
|
||||
Supplier<Map<String, AbstractFile>> activeModelsSupplier,
|
||||
boolean shouldUnloadInactiveModels,
|
||||
String statsPrefix
|
||||
) {
|
||||
this(
|
||||
activeModelsSupplier,
|
||||
shouldUnloadInactiveModels,
|
||||
statsPrefix,
|
||||
() -> true,
|
||||
() -> true
|
||||
);
|
||||
}
|
||||
|
||||
public BaseModelsManager(
|
||||
Supplier<Map<String, AbstractFile>> activeModelsSupplier,
|
||||
boolean shouldUnloadInactiveModels,
|
||||
String statsPrefix,
|
||||
Supplier<Boolean> shouldServeModels,
|
||||
Supplier<Boolean> shouldLoadModels
|
||||
) {
|
||||
this.activeModelsSupplier = activeModelsSupplier;
|
||||
this.shouldUnloadInactiveModels = shouldUnloadInactiveModels;
|
||||
|
||||
this.shouldServeModels = shouldServeModels;
|
||||
this.shouldLoadModels = shouldLoadModels;
|
||||
|
||||
numModels = SearchLongGauge.export(
|
||||
String.format("model_loader_%s_num_models", statsPrefix));
|
||||
numErrors = SearchCounter.export(
|
||||
String.format("model_loader_%s_num_errors", statsPrefix));
|
||||
lastLoadedMs = SearchLongGauge.export(
|
||||
String.format("model_loader_%s_last_loaded_timestamp_ms", statsPrefix));
|
||||
}
|
||||
|
||||
/**
|
||||
* Retrieves a particular model.
|
||||
*/
|
||||
public Optional<T> getModel(String name) {
|
||||
if (shouldServeModels.get()) {
|
||||
return Optional.ofNullable(loadedModels.get(name));
|
||||
} else {
|
||||
return Optional.empty();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads a model instance from the directory file instance.
|
||||
*
|
||||
* @param modelBaseDir AbstractFile instance representing the directory.
|
||||
* @return Model instance parsed from the directory.
|
||||
*/
|
||||
public abstract T readModelFromDirectory(AbstractFile modelBaseDir) throws Exception;
|
||||
|
||||
/**
|
||||
* Cleans up any resources used by the model instance.
|
||||
* This method is called after removing the model from the in-memory map.
|
||||
* Sub-classes can provide custom overridden implementation as required.
|
||||
*
|
||||
* @param unloadedModel Model instance that would be unloaded from the manager.
|
||||
*/
|
||||
protected void cleanUpUnloadedModel(T unloadedModel) { }
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
// Get available models, either from the config file or by listing the base directory
|
||||
final Map<String, AbstractFile> modelPathsFromConfig;
|
||||
if (!shouldLoadModels.get()) {
|
||||
LOG.info("Loading models is currently disabled.");
|
||||
return;
|
||||
}
|
||||
|
||||
modelPathsFromConfig = activeModelsSupplier.get();
|
||||
for (Map.Entry<String, AbstractFile> nameAndPath : modelPathsFromConfig.entrySet()) {
|
||||
String modelName = nameAndPath.getKey();
|
||||
try {
|
||||
AbstractFile modelDirectory = nameAndPath.getValue();
|
||||
if (!modelDirectory.exists() && loadedModels.containsKey(modelName)) {
|
||||
LOG.warn("Loaded model '{}' no longer exists at HDFS path {}, keeping loaded version; "
|
||||
+ "replace directory in HDFS to update model.", modelName, modelDirectory);
|
||||
continue;
|
||||
}
|
||||
|
||||
long previousModifiedTimestamp = lastModifiedMsByModel.getOrDefault(modelName, 0L);
|
||||
long lastModifiedMs = modelDirectory.getLastModified();
|
||||
if (previousModifiedTimestamp == lastModifiedMs) {
|
||||
continue;
|
||||
}
|
||||
|
||||
LOG.info("Starting to load model. name={} path={}", modelName, modelDirectory.getPath());
|
||||
T model = Preconditions.checkNotNull(readModelFromDirectory(modelDirectory));
|
||||
LOG.info("Model initialized: {}. Last modified: {} ({})",
|
||||
modelName, lastModifiedMs, new Date(lastModifiedMs));
|
||||
T previousModel = loadedModels.put(modelName, model);
|
||||
lastModifiedMsByModel.put(modelName, lastModifiedMs);
|
||||
|
||||
if (previousModel != null) {
|
||||
cleanUpUnloadedModel(previousModel);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
numErrors.increment();
|
||||
LOG.error("Error initializing model: {}", modelName, e);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove any currently loaded models not present in the latest list
|
||||
if (shouldUnloadInactiveModels) {
|
||||
Set<String> inactiveModels =
|
||||
Sets.difference(loadedModels.keySet(), modelPathsFromConfig.keySet()).immutableCopy();
|
||||
|
||||
for (String modelName : inactiveModels) {
|
||||
T modelToUnload = loadedModels.get(modelName);
|
||||
loadedModels.remove(modelName);
|
||||
|
||||
if (modelToUnload != null) {
|
||||
// We could have an inactive model key without a model (value) if the
|
||||
// initial readModelFromDirectory failed for the model entry.
|
||||
// Checking for null to avoid exception.
|
||||
cleanUpUnloadedModel(modelToUnload);
|
||||
}
|
||||
LOG.info("Unloaded model that is no longer active: {}", modelName);
|
||||
}
|
||||
}
|
||||
|
||||
if (!prevLoadedModels.keySet().equals(loadedModels.keySet())) {
|
||||
LOG.info("Finished loading models: {}", loadedModels.keySet());
|
||||
}
|
||||
prevLoadedModels = loadedModels;
|
||||
numModels.set(loadedModels.size());
|
||||
lastLoadedMs.set(System.currentTimeMillis());
|
||||
}

  /**
   * Schedules the loader to run periodically.
   *
   * @param period Period between executions.
   * @param timeUnit The time unit of the period parameter.
   * @param builderThreadName Name format for the background loader thread.
   */
  public final void scheduleAtFixedRate(
      long period, TimeUnit timeUnit, String builderThreadName) {
    Executors.newSingleThreadScheduledExecutor(
        new ThreadFactoryBuilder()
            .setDaemon(true)
            .setNameFormat(builderThreadName)
            .build())
        .scheduleAtFixedRate(this, 0, period, timeUnit);
  }

  /**
   * Gets the active list of models from the subdirectories in a base directory.
   *
   * Each model is identified by the name of its subdirectory.
   */
  @VisibleForTesting
  public static class DirectorySupplier implements Supplier<Map<String, AbstractFile>> {
    private static final Logger LOG = LoggerFactory.getLogger(DirectorySupplier.class);
    private final AbstractFile baseDir;

    public DirectorySupplier(AbstractFile baseDir) {
      this.baseDir = baseDir;
    }

    @Override
    public Map<String, AbstractFile> get() {
      try {
        LOG.info("Loading models from the directories in: {}", baseDir.getPath());
        List<AbstractFile> modelDirs =
            ImmutableList.copyOf(baseDir.listFiles(AbstractFile.IS_DIRECTORY));
        LOG.info("Found {} model directories: {}", modelDirs.size(), modelDirs);
        return modelDirs.stream()
            .collect(Collectors.toMap(
                AbstractFile::getName,
                Function.identity()
            ));
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
  }

  /**
   * Gets the active list of models by reading a YAML config file.
   *
   * The keys are the model names; the values are dictionaries with a single entry for the path
   * of the model in HDFS (without the HDFS name node prefix). For example:
   *
   * model_a:
   *   path: /path/to/model_a
   * model_b:
   *   path: /path/to/model_b
   */
  @VisibleForTesting
  public static class ConfigSupplier implements Supplier<Map<String, AbstractFile>> {

    private final AbstractFile configFile;

    public ConfigSupplier(AbstractFile configFile) {
      this.configFile = configFile;
    }

    @SuppressWarnings("unchecked")
    @Override
    public Map<String, AbstractFile> get() {
      try (BufferedReader configReader = configFile.getCharSource().openBufferedStream()) {
        Yaml yamlParser = new Yaml();
        //noinspection unchecked
        Map<String, Map<String, String>> config =
            (Map<String, Map<String, String>>) yamlParser.load(configReader);

        if (config == null || config.isEmpty()) {
          return Collections.emptyMap();
        }

        Map<String, AbstractFile> modelPaths = new HashMap<>();
        for (Map.Entry<String, Map<String, String>> nameAndConfig : config.entrySet()) {
          String path = Strings.emptyToNull(nameAndConfig.getValue().get("path"));
          Preconditions.checkNotNull(path, "Missing path for model: %s", nameAndConfig.getKey());
          modelPaths.put(nameAndConfig.getKey(), FileUtils.getHdfsFileHandle(path));
        }
        return modelPaths;
      } catch (IOException e) {
        throw new UncheckedIOException(e);
      }
    }
  }
}
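
As a usage note, here is a minimal wiring sketch for this loader. The subclass name (DemoModelsManager), its constructor, and the HDFS path are hypothetical, inferred from the suppliers and scheduleAtFixedRate above rather than taken from this commit:

    // Hypothetical subclass of this models manager; only ConfigSupplier,
    // scheduleAtFixedRate, and FileUtils.getHdfsFileHandle come from this file.
    AbstractFile configFile = FileUtils.getHdfsFileHandle("/path/to/models.yaml");
    DemoModelsManager manager = new DemoModelsManager(new ConfigSupplier(configFile));
    // Load immediately, then re-check the config and model timestamps every 10 minutes
    // on a daemon thread named by the given ThreadFactoryBuilder format.
    manager.scheduleAtFixedRate(10, TimeUnit.MINUTES, "model-loader-%d");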
@ -1,68 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/transform",
        "src/java/com/twitter/ml/common/base",
        "src/java/com/twitter/ml/prediction/core",
        "src/java/com/twitter/ml/tool/prediction:ModelInterpreter",
        "src/java/com/twitter/ml/vw/constant",
        "src/java/com/twitter/mlv2/trees/predictor",
        "src/java/com/twitter/mlv2/trees/scorer",
        "src/java/com/twitter/search/common/features",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/ml/models_manager",
        "src/java/com/twitter/search/modeling/common",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/search/common:features-java",
    ],
)

java_library(
    name = "for-timelines",
    sources = [
        "BaseLegacyScoreAccumulator.java",
        "BaseModelBuilder.java",
        "BaseScoreAccumulator.java",
        "CompositeFeatureContext.java",
        "DiscretizedFeature.java",
        "DiscretizedFeatureRange.java",
        "LegacyModelBuilder.java",
        "LightweightLinearModel.java",
        "ModelBuilder.java",
        "SchemaBasedModelBuilder.java",
    ],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/ml/api:api-base",
        "src/java/com/twitter/ml/api/transform:DiscretizerTransform",
        "src/java/com/twitter/ml/common/base",
        "src/java/com/twitter/ml/tool/prediction:ModelInterpreter",
        "src/java/com/twitter/ml/vw/constant",
        "src/java/com/twitter/search/common/features",
        "src/java/com/twitter/search/common/file",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/ml/models_manager",
        "src/java/com/twitter/search/modeling/common",
        "src/thrift/com/twitter/ml/api:data-java",
        "src/thrift/com/twitter/search/common:features-java",
    ],
)
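
For reference, a consumer target would pick up the trimmed timelines-only subset by depending on the for-timelines label, as in the hypothetical sketch below; the package path is assumed from the sources listed in this diff, not confirmed by the repo:

java_library(
    name = "demo-consumer",
    sources = ["*.java"],
    dependencies = [
        # Assumed path; matches the prediction_engine package of the deleted sources.
        "src/java/com/twitter/search/common/util/ml/prediction_engine:for-timelines",
    ],
)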
Binary file not shown.
Binary file not shown.
@ -1,64 +0,0 @@
package com.twitter.search.common.util.ml.prediction_engine;

import com.google.common.base.Preconditions;

import com.twitter.ml.api.Feature;

/**
 * Score accumulator for legacy (non-schema-based) features. It provides methods to add features
 * using Feature objects.
 *
 * @deprecated This class is retired and we suggest switching to schema-based features.
 */
@Deprecated
public abstract class BaseLegacyScoreAccumulator<D> extends BaseScoreAccumulator<D> {

  public BaseLegacyScoreAccumulator(LightweightLinearModel model) {
    super(model);
    Preconditions.checkState(!model.isSchemaBased(),
        "Cannot create LegacyScoreAccumulator with a schema-based model: %s", model.getName());
  }

  /**
   * Adds to the score the weight of a binary feature (if it's present).
   *
   * @deprecated This function is retired and we suggest switching to addSchemaBooleanFeatures in
   * SchemaBasedScoreAccumulator.
   */
  @Deprecated
  protected BaseLegacyScoreAccumulator<D> addBinaryFeature(Feature<Boolean> feature,
                                                           boolean value) {
    if (value) {
      Double weight = model.binaryFeatures.get(feature);
      if (weight != null) {
        score += weight;
      }
    }
    return this;
  }

  /**
   * Adds to the score the weight of a continuous feature.
   * <p>
   * If the model uses real-valued features, it multiplies the feature's weight by the provided
   * value. Otherwise, it tries to find the discretized feature and adds its weight to the score.
   *
   * @deprecated This function is retired and we suggest switching to addSchemaContinuousFeatures
   * in SchemaBasedScoreAccumulator.
   */
  @Deprecated
  protected BaseLegacyScoreAccumulator<D> addContinuousFeature(Feature<Double> feature,
                                                               double value) {
    Double weightFromContinuous = model.continuousFeatures.get(feature);
    if (weightFromContinuous != null) {
      score += weightFromContinuous * value;
    } else {
      DiscretizedFeature discretizedFeature = model.discretizedFeatures.get(feature);
      if (discretizedFeature != null) {
        // Use only the weight of the discretized feature (there's no need to multiply it)
        score += discretizedFeature.getWeight(value);
      }
    }
    return this;
  }
}
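
For illustration, here is a minimal concrete subclass sketch. The record type, feature names, and the Feature.Binary/Feature.Continuous constructors are assumptions based on the abstract API above, not code from this commit:

    // Hypothetical legacy accumulator; feature constants are illustrative only.
    @Deprecated
    public class DemoLegacyScoreAccumulator extends BaseLegacyScoreAccumulator<Void> {
      private static final Feature<Boolean> HAS_LINK = new Feature.Binary("demo.has_link");
      private static final Feature<Double> TEXT_SCORE = new Feature.Continuous("demo.text_score");

      public DemoLegacyScoreAccumulator(LightweightLinearModel model) {
        super(model);  // rejects schema-based models, per the Preconditions check above
      }

      void accumulate(boolean hasLink, double textScore) {
        addBinaryFeature(HAS_LINK, hasLink);          // adds weight only when hasLink is true
        addContinuousFeature(TEXT_SCORE, textScore);  // real-valued weight or discretized lookup
      }
    }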
Binary file not shown.
@ -1,111 +0,0 @@
package com.twitter.search.common.util.ml.prediction_engine;

import java.util.Collection;
import java.util.Comparator;
import java.util.List;

import com.google.common.collect.Lists;

import com.twitter.ml.api.FeatureParser;
import com.twitter.ml.api.transform.DiscretizerTransform;
import com.twitter.ml.tool.prediction.ModelInterpreter;

/**
 * The base model builder for LightweightLinearModels.
 */
public abstract class BaseModelBuilder implements ModelBuilder {
  // Ignore features that have an absolute weight lower than this value
  protected static final double MIN_WEIGHT = 1e-9;
  private static final String BIAS_FIELD_NAME = ModelInterpreter.BIAS_FIELD_NAME;
  static final String DISCRETIZER_NAME_SUFFIX =
      "." + DiscretizerTransform.DEFAULT_FEATURE_NAME_SUFFIX;

  protected final String modelName;
  protected double bias;

  public BaseModelBuilder(String modelName) {
    this.modelName = modelName;
    this.bias = 0.0;
  }

  /**
   * Collects all the ranges of a discretized feature and sorts them.
   */
  static DiscretizedFeature buildFeature(Collection<DiscretizedFeatureRange> ranges) {
    List<DiscretizedFeatureRange> sortedRanges = Lists.newArrayList(ranges);
    sortedRanges.sort(Comparator.comparingDouble(a -> a.minValue));

    double[] splits = new double[ranges.size()];
    double[] weights = new double[ranges.size()];

    for (int i = 0; i < sortedRanges.size(); i++) {
      splits[i] = sortedRanges.get(i).minValue;
      weights[i] = sortedRanges.get(i).weight;
    }
    return new DiscretizedFeature(splits, weights);
  }

  /**
   * Parses a line from the interpreted model text file. See the javadoc of the constructor for
   * more details about how to create the text file.
   * <p>
   * The file uses TSV format with 3 columns:
   * <p>
   * 1. Model name (generated by the ML API, but ignored by this class).
   * 2. Feature definition: the name of the feature, or a definition from the MDL discretizer.
   * 3. Weight: the weight of the feature, in LOGIT scale.
   * <p>
   * When it parses each line, it stores the weights for all the features defined in the context,
   * as well as the bias, but it ignores any other feature (e.g. label, prediction or
   * meta.record_weight) and features with a small absolute weight (see MIN_WEIGHT).
   * <p>
   * Example lines:
   * <p>
   * model_name bias 0.019735312089324074
   * model_name demo.binary_feature 0.06524706073105327
   * model_name demo.continuous_feature 0.0
   * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=-inf_3.58e-01 0.07155931927263737
   * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=3.58e-01_inf -0.08979256264865387
   *
   * @see ModelInterpreter
   * @see DiscretizerTransform
   */
  @Override
  public ModelBuilder parseLine(String line) {
    String[] columns = line.split("\t");
    if (columns.length != 3) {
      return this;
    }

    // columns[0] has the model name, which we don't need
    String featureName = columns[1];
    double weight = Double.parseDouble(columns[2]);

    if (BIAS_FIELD_NAME.equals(featureName)) {
      bias = weight;
      return this;
    }

    FeatureParser parser = FeatureParser.parse(featureName);
    String baseName = parser.getBaseName();

    if (Math.abs(weight) < MIN_WEIGHT && !baseName.endsWith(DISCRETIZER_NAME_SUFFIX)) {
      // Skip, unless the line represents a range of a discretized feature.
      // Discretized features whose ranges are all zero should also be removed,
      // but that is handled later.
      return this;
    }

    addFeature(baseName, weight, parser);
    return this;
  }

  /**
   * Adds a feature to the model.
   */
  protected abstract void addFeature(String baseName, double weight, FeatureParser parser);

  @Override
  public abstract LightweightLinearModel build();
}
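
To tie the TSV contract above to the API, here is a short driver sketch. LegacyModelBuilder is named in the BUILD target earlier in this diff, but its constructor signature and the file-reading glue are assumptions; only parseLine and build come from the interface above:

    // Hypothetical driver: feed each interpreted-model TSV line through parseLine,
    // then materialize the model. modelFile is assumed to be an AbstractFile.
    ModelBuilder builder = new LegacyModelBuilder("model_name");  // constructor assumed
    try (BufferedReader reader = modelFile.getCharSource().openBufferedStream()) {
      String line;
      while ((line = reader.readLine()) != null) {
        builder.parseLine(line);  // skips malformed lines; bias and MIN_WEIGHT handled inside
      }
    }
    LightweightLinearModel model = builder.build();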