diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.docx new file mode 100644 index 000000000..68632d837 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.java deleted file mode 100644 index e3ea16c23..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeatures.java +++ /dev/null @@ -1,148 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.encoding.features.IntegerEncodedFeatures; -import com.twitter.search.common.indexing.thriftjava.PackedFeatures; -import com.twitter.search.common.indexing.thriftjava.VersionedTweetFeatures; -import com.twitter.search.common.schema.SchemaUtil; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; - -/** - * A class for encoding earlybird features in integers - */ -public abstract class EarlybirdEncodedFeatures extends IntegerEncodedFeatures { - private final ImmutableSchemaInterface schema; - private final EarlybirdFieldConstant baseField; - - public EarlybirdEncodedFeatures(ImmutableSchemaInterface schema, - EarlybirdFieldConstant baseField) { - this.schema = schema; - this.baseField = baseField; - } - - /** - * Write this object into packedFeatures of the given VersionedTweetFeatures. 
- */ - public void writeFeaturesToVersionedTweetFeatures( - VersionedTweetFeatures versionedTweetFeatures) { - if (!versionedTweetFeatures.isSetPackedFeatures()) { - versionedTweetFeatures.setPackedFeatures(new PackedFeatures()); - } - copyToPackedFeatures(versionedTweetFeatures.getPackedFeatures()); - } - - /** - * Write this object into extendedPackedFeatures of the given VersionedTweetFeatures. - */ - public void writeExtendedFeaturesToVersionedTweetFeatures( - VersionedTweetFeatures versionedTweetFeatures) { - if (!versionedTweetFeatures.isSetExtendedPackedFeatures()) { - versionedTweetFeatures.setExtendedPackedFeatures(new PackedFeatures()); - } - copyToPackedFeatures(versionedTweetFeatures.getExtendedPackedFeatures()); - } - - @Override - public String toString() { - StringBuilder ret = new StringBuilder(); - ret.append("Tweet features: \n"); - for (FeatureConfiguration feature - : EarlybirdSchemaCreateTool.FEATURE_CONFIGURATION_MAP.values()) { - ret.append(feature.getName()).append(": ").append(getFeatureValue(feature)).append("\n"); - } - return ret.toString(); - } - - public boolean isFlagSet(EarlybirdFieldConstant field) { - return isFlagSet(schema.getFeatureConfigurationById(field.getFieldId())); - } - - public int getFeatureValue(EarlybirdFieldConstant field) { - return getFeatureValue(schema.getFeatureConfigurationById(field.getFieldId())); - } - - public EarlybirdEncodedFeatures setFlag(EarlybirdFieldConstant field) { - setFlag(schema.getFeatureConfigurationById(field.getFieldId())); - return this; - } - - public EarlybirdEncodedFeatures clearFlag(EarlybirdFieldConstant field) { - clearFlag(schema.getFeatureConfigurationById(field.getFieldId())); - return this; - } - - public EarlybirdEncodedFeatures setFlagValue(EarlybirdFieldConstant field, - boolean value) { - setFlagValue(schema.getFeatureConfigurationById(field.getFieldId()), value); - return this; - } - - public EarlybirdEncodedFeatures setFeatureValue(EarlybirdFieldConstant field, - int value) 
{ - setFeatureValue(schema.getFeatureConfigurationById(field.getFieldId()), value); - return this; - } - - public EarlybirdEncodedFeatures setFeatureValueIfGreater(EarlybirdFieldConstant field, - int value) { - setFeatureValueIfGreater(schema.getFeatureConfigurationById(field.getFieldId()), value); - return this; - } - - public boolean incrementIfNotMaximum(EarlybirdFieldConstant field) { - return incrementIfNotMaximum(schema.getFeatureConfigurationById(field.getFieldId())); - } - - private static final class ArrayEncodedTweetFeatures extends EarlybirdEncodedFeatures { - private final int[] encodedInts; - - private ArrayEncodedTweetFeatures(ImmutableSchemaInterface schema, - EarlybirdFieldConstant baseField) { - super(schema, baseField); - - final int numIntegers = SchemaUtil.getCSFFieldFixedLength(schema, baseField.getFieldId()); - Preconditions.checkState(numIntegers > 0); - this.encodedInts = new int[numIntegers]; - } - - @Override - public int getNumInts() { - return encodedInts.length; - } - - @Override - public int getInt(int pos) { - return encodedInts[pos]; - } - - @Override - public void setInt(int pos, int value) { - encodedInts[pos] = value; - } - } - - /** - * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field. - * @param schema the schema for all fields - * @param baseField base field's constant value - */ - public static EarlybirdEncodedFeatures newEncodedTweetFeatures( - ImmutableSchemaInterface schema, EarlybirdFieldConstant baseField) { - return new ArrayEncodedTweetFeatures(schema, baseField); - } - - /** - * Create a new {@link EarlybirdEncodedFeatures} object based on schema and base field name. 
- * @param schema the schema for all fields - * @param baseFieldName base field's name - */ - public static EarlybirdEncodedFeatures newEncodedTweetFeatures( - ImmutableSchemaInterface schema, String baseFieldName) { - EarlybirdFieldConstant baseField = EarlybirdFieldConstants.getFieldConstant(baseFieldName); - Preconditions.checkNotNull(baseField); - return newEncodedTweetFeatures(schema, baseField); - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.docx new file mode 100644 index 000000000..39af957b1 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.java deleted file mode 100644 index d8330faca..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdEncodedFeaturesUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import com.twitter.search.common.encoding.docvalues.CSFTypeUtil; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; - -public final class EarlybirdEncodedFeaturesUtil { - private EarlybirdEncodedFeaturesUtil() { - } - - /** - * Returns a byte array that can be stored in a ThriftDocument as bytesField. - */ - public static byte[] toBytesForThriftDocument(EarlybirdEncodedFeatures features) { - int numInts = features.getNumInts(); - byte[] serializedFeatures = new byte[numInts * Integer.BYTES]; - for (int i = 0; i < numInts; i++) { - CSFTypeUtil.convertToBytes(serializedFeatures, i, features.getInt(i)); - } - return serializedFeatures; - } - - /** - * Converts data in a given byte array (starting at the provided offset) into - * EarlybirdEncodedFeatures. 
- */ - public static EarlybirdEncodedFeatures fromBytes( - ImmutableSchemaInterface schema, EarlybirdFieldConstants.EarlybirdFieldConstant baseField, - byte[] data, int offset) { - EarlybirdEncodedFeatures features = EarlybirdEncodedFeatures.newEncodedTweetFeatures( - schema, baseField); - for (int idx = 0; idx < features.getNumInts(); ++idx) { - features.setInt(idx, CSFTypeUtil.convertFromBytes(data, offset, idx)); - } - return features; - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.docx new file mode 100644 index 000000000..7f4fd47b1 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.java deleted file mode 100644 index 6ec044933..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdFieldConstants.java +++ /dev/null @@ -1,1132 +0,0 @@ - -package com.twitter.search.common.schema.earlybird; - -import java.util.Collection; -import java.util.EnumSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Sets; - -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.schema.ImmutableSchema; -import com.twitter.search.common.schema.SchemaBuilder; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import 
com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType; - -/** - * Field names, field IDs etc. - */ -public class EarlybirdFieldConstants extends FieldNameToIdMapping { - @VisibleForTesting - public static final String ENCODED_TWEET_FEATURES_FIELD_NAME = "encoded_tweet_features"; - - @VisibleForTesting - public static final String EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME = - "extended_encoded_tweet_features"; - - private enum FlagFeatureFieldType { - NON_FLAG_FEATURE_FIELD, - FLAG_FEATURE_FIELD - } - - private enum UnusedFeatureFieldType { - USED_FEATURE_FIELD, - UNUSED_FEATURE_FIELD - } - - /** - * CSF_NAME_TO_MIN_ENGAGEMENT_FIELD_MAP and MIN_ENGAGEMENT_FIELD_TO_CSF_NAME_MAP are used in - * EarlybirdLuceneQueryVisitor to map the CSFs REPLY_COUNT, RETWEET_COUNT, and FAVORITE_COUNT to - * their respective min engagement fields, and vice versa. - */ - public static final ImmutableMap - CSF_NAME_TO_MIN_ENGAGEMENT_FIELD_MAP = ImmutableMap.builder() - .put(EarlybirdFieldConstant.REPLY_COUNT.getFieldName(), - EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD) - .put(EarlybirdFieldConstant.RETWEET_COUNT.getFieldName(), - EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD) - .put(EarlybirdFieldConstant.FAVORITE_COUNT.getFieldName(), - EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD) - .build(); - - public static final ImmutableMap - MIN_ENGAGEMENT_FIELD_TO_CSF_NAME_MAP = ImmutableMap.builder() - .put(EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - EarlybirdFieldConstant.REPLY_COUNT) - .put(EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - EarlybirdFieldConstant.RETWEET_COUNT) - .put(EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD - .getFieldName(), - EarlybirdFieldConstant.FAVORITE_COUNT) - .build(); - - /** - * A list of Earlybird 
field names and field IDs, and the clusters that need them. - */ - public enum EarlybirdFieldConstant { - // These enums are grouped by category and sorted alphabetically. - // Next indexed field ID is 76 - // Next CSF field ID is 115 - // Next encoded_features CSF field ID is 185 - // Next extended_encoded_features CSF field ID is 284 - - // Text searchable fields - // Provides slow ID Mapping from tweet ID to doc ID through TermsEnum.seekExact(). - ID_FIELD("id", 0, EarlybirdCluster.ALL_CLUSTERS), - RESOLVED_LINKS_TEXT_FIELD("resolved_links_text", 1), - TEXT_FIELD("text", 2), - TOKENIZED_FROM_USER_FIELD("tokenized_from_user", 3), - - // Other indexed fields - CARD_TITLE_FIELD("card_title", 4), - CARD_DESCRIPTION_FIELD("card_description", 5), - // We require the createdAt field to be set so we can properly filter tweets based on time. - CREATED_AT_FIELD("created_at", 6, EarlybirdCluster.ALL_CLUSTERS), - // 7 was formerly EVENT_IDS_FIELD("event_ids", 7, EarlybirdCluster.REALTIME) - ENTITY_ID_FIELD("entity_id", 40), - // The screen name of the user that created the tweet. Should be set to the normalized value in - // the com.twitter.gizmoduck.thriftjava.Profile.screen_name field. - FROM_USER_FIELD("from_user", 8), - // The numeric ID of the user that created the tweet. - FROM_USER_ID_FIELD("from_user_id", 9, EarlybirdCluster.ALL_CLUSTERS), - CARD_DOMAIN_FIELD("card_domain", 11), - CARD_NAME_FIELD("card_name", 12), - GEO_HASH_FIELD("geo_hash", 13), - HASHTAGS_FIELD("hashtags", 14), - HF_PHRASE_PAIRS_FIELD(ImmutableSchema.HF_PHRASE_PAIRS_FIELD, 15), - HF_TERM_PAIRS_FIELD(ImmutableSchema.HF_TERM_PAIRS_FIELD, 16), - IMAGE_LINKS_FIELD("image_links", 17), - IN_REPLY_TO_TWEET_ID_FIELD("in_reply_to_tweet_id", 59), - IN_REPLY_TO_USER_ID_FIELD("in_reply_to_user_id", 38), - // The internal field is used for many purposes: - // 1. to store facet skiplists - // 2. to power the filter operator, by storing posting list for terms like __filter_twimg - // 3. 
to store posting lists for positive and negative smileys - // 4. to store geo location types. - // etc. - INTERNAL_FIELD("internal", 18, EarlybirdCluster.ALL_CLUSTERS), - ISO_LANGUAGE_FIELD("iso_lang", 19), - LINK_CATEGORY_FIELD("link_category", 36), - LINKS_FIELD("links", 21), - MENTIONS_FIELD("mentions", 22), - // Field 23 used to be NAMED_ENTITIES_FIELD - NEWS_LINKS_FIELD("news_links", 24), - NORMALIZED_SOURCE_FIELD("norm_source", 25), - PLACE_FIELD("place", 26), - // Field 37 used to be PUBLICLY_INFERRED_USER_LOCATION_PLACE_ID_FIELD - // The ID of the source tweet. Set for retweets only. - RETWEET_SOURCE_TWEET_ID_FIELD("retweet_source_tweet_id", 60, - EarlybirdCluster.ALL_CLUSTERS), - // The ID of the source tweet's author. Set for retweets only. - RETWEET_SOURCE_USER_ID_FIELD("retweet_source_user_id", 39), - SOURCE_FIELD("source", 29), - STOCKS_FIELD("stocks", 30), - // The screen name of the user that a tweet was directed at. - TO_USER_FIELD("to_user", 32), - // Field 33 used to be TOPIC_IDS_FIELD and is now unused. It can be reused later. - TWIMG_LINKS_FIELD("twimg_links", 34), - VIDEO_LINKS_FIELD("video_links", 35), - CAMELCASE_USER_HANDLE_FIELD("camelcase_tokenized_from_user", 41), - // This field should be set to the the tokenized and normalized value in the - // com.twitter.gizmoduck.thriftjava.Profile.name field. 
- TOKENIZED_USER_NAME_FIELD("tokenized_from_user_display_name", 42), - CONVERSATION_ID_FIELD("conversation_id", 43), - PLACE_ID_FIELD("place_id", 44), - PLACE_FULL_NAME_FIELD("place_full_name", 45), - PLACE_COUNTRY_CODE_FIELD("place_country_code", 46), - PROFILE_GEO_COUNTRY_CODE_FIELD("profile_geo_country_code", 47), - PROFILE_GEO_REGION_FIELD("profile_geo_region", 48), - PROFILE_GEO_LOCALITY_FIELD("profile_geo_locality", 49), - LIKED_BY_USER_ID_FIELD("liked_by_user_id", 50, EarlybirdCluster.REALTIME), - NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD( - "normalized_reply_count_greater_than_or_equal_to", 51, EarlybirdCluster.FULL_ARCHIVE), - NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD( - "normalized_retweet_count_greater_than_or_equal_to", 52, EarlybirdCluster.FULL_ARCHIVE), - NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD( - "normalized_favorite_count_greater_than_or_equal_to", 53, EarlybirdCluster.FULL_ARCHIVE), - COMPOSER_SOURCE("composer_source", 54), - QUOTED_TWEET_ID_FIELD("quoted_tweet_id", 55), - QUOTED_USER_ID_FIELD("quoted_user_id", 56), - RETWEETED_BY_USER_ID("retweeted_by_user_id", 57, EarlybirdCluster.REALTIME), - REPLIED_TO_BY_USER_ID("replied_to_by_user_id", 58, EarlybirdCluster.REALTIME), - CARD_LANG("card_lang", 61), - // SEARCH-27823: Field ID 62 used to be named_entity, which was the combination of all - // named_entity* fields below. We need to leave 62 unused for backwards compatibility. 
- NAMED_ENTITY_FROM_URL_FIELD("named_entity_from_url", 63), - NAMED_ENTITY_FROM_TEXT_FIELD("named_entity_from_text", 64), - NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD("named_entity_with_type_from_url", 65), - NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD("named_entity_with_type_from_text", 66), - DIRECTED_AT_USER_ID_FIELD("directed_at_user_id", 67), - SPACE_ID_FIELD("space_id", 68, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - SPACE_TITLE_FIELD("space_title", 69, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - - // Detailed description of the space admin fields can be found at go/earlybirdfields. - SPACE_ADMIN_FIELD("space_admin", 70, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - TOKENIZED_SPACE_ADMIN_FIELD("tokenized_space_admin", 71, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD("camelcase_tokenized_space_admin", 72, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD("tokenized_space_admin_display_name", 73, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_GENERAL_PURPOSE_CLUSTERS), - URL_DESCRIPTION_FIELD("url_description", 74), - URL_TITLE_FIELD("url_title", 75), - - // CSF - CARD_TYPE_CSF_FIELD("card_type_csf", 100), - ENCODED_TWEET_FEATURES_FIELD(ENCODED_TWEET_FEATURES_FIELD_NAME, 102, - EarlybirdCluster.ALL_CLUSTERS), - // Provides the doc ID -> original tweet ID mapping for retweets. - SHARED_STATUS_ID_CSF("shared_status_id_csf", 106, EarlybirdCluster.ALL_CLUSTERS), - // Provides the doc ID -> tweet author's user ID mapping. - FROM_USER_ID_CSF("from_user_id_csf", 103, EarlybirdCluster.ALL_CLUSTERS), - CREATED_AT_CSF_FIELD("created_at_csf", 101, EarlybirdCluster.ARCHIVE_CLUSTERS), - // Provides the doc ID -> tweet ID mapping. 
- ID_CSF_FIELD("id_csf", 104, EarlybirdCluster.ARCHIVE_CLUSTERS), - LAT_LON_CSF_FIELD("latlon_csf", 105), - CONVERSATION_ID_CSF("conversation_id_csf", 107, EarlybirdCluster.ALL_CLUSTERS), - QUOTED_TWEET_ID_CSF("quoted_tweet_id_csf", 108), - QUOTED_USER_ID_CSF("quoted_user_id_csf", 109), - CARD_LANG_CSF("card_lang_csf", 110), - DIRECTED_AT_USER_ID_CSF("directed_at_user_id_csf", 111), - REFERENCE_AUTHOR_ID_CSF("reference_author_id_csf", 112), - EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF("exclusive_conversation_author_id_csf", 113), - CARD_URI_CSF("card_uri_csf", 114), - - // CSF Views on top of ENCODED_TWEET_FEATURES_FIELD - IS_RETWEET_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_RETWEET_FLAG", 150, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_OFFENSIVE_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_OFFENSIVE_FLAG", 151, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_LINK_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_LINK_FLAG", 152, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_TREND_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_TREND_FLAG", 153, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_REPLY_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_REPLY_FLAG", 154, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_SENSITIVE_CONTENT(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_SENSITIVE_CONTENT", 155, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, - "HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG", 156, FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.ALL_CLUSTERS), - FROM_VERIFIED_ACCOUNT_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "FROM_VERIFIED_ACCOUNT_FLAG", - 157, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - TEXT_SCORE(ENCODED_TWEET_FEATURES_FIELD_NAME, "TEXT_SCORE", 158, - 
FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - LANGUAGE(ENCODED_TWEET_FEATURES_FIELD_NAME, "LANGUAGE", 159, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - LINK_LANGUAGE(ENCODED_TWEET_FEATURES_FIELD_NAME, "LINK_LANGUAGE", 160, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_IMAGE_URL_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_IMAGE_URL_FLAG", 161, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_VIDEO_URL_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_VIDEO_URL_FLAG", 162, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_NEWS_URL_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_NEWS_URL_FLAG", 163, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_EXPANDO_CARD_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_EXPANDO_CARD_FLAG", 164, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_MULTIPLE_MEDIA_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_MULTIPLE_MEDIA_FLAG", 165, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - PROFILE_IS_EGG_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "PROFILE_IS_EGG_FLAG", 166, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - NUM_MENTIONS(ENCODED_TWEET_FEATURES_FIELD_NAME, "NUM_MENTIONS", 167, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - NUM_HASHTAGS(ENCODED_TWEET_FEATURES_FIELD_NAME, "NUM_HASHTAGS", 168, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_CARD_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_CARD_FLAG", 169, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_VISIBLE_LINK_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_VISIBLE_LINK_FLAG", 170, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - USER_REPUTATION(ENCODED_TWEET_FEATURES_FIELD_NAME, 
"USER_REPUTATION", 171, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_USER_SPAM_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_USER_SPAM_FLAG", 172, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_USER_NSFW_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_USER_NSFW_FLAG", 173, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_USER_BOT_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_USER_BOT_FLAG", 174, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - IS_USER_NEW_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_USER_NEW_FLAG", 175, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - PREV_USER_TWEET_ENGAGEMENT(ENCODED_TWEET_FEATURES_FIELD_NAME, "PREV_USER_TWEET_ENGAGEMENT", - 176, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - COMPOSER_SOURCE_IS_CAMERA_FLAG( - ENCODED_TWEET_FEATURES_FIELD_NAME, - "COMPOSER_SOURCE_IS_CAMERA_FLAG", - 177, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.ALL_CLUSTERS), - RETWEET_COUNT( - ENCODED_TWEET_FEATURES_FIELD_NAME, - "RETWEET_COUNT", - 178, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER_WITH_LOG2), - FAVORITE_COUNT( - ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAVORITE_COUNT", - 179, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER_WITH_LOG2), - REPLY_COUNT( - ENCODED_TWEET_FEATURES_FIELD_NAME, - "REPLY_COUNT", - 180, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER_WITH_LOG2), - PARUS_SCORE(ENCODED_TWEET_FEATURES_FIELD_NAME, "PARUS_SCORE", 181, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - /** - * This is the rough percentage of the nth token at 140 divided by 
num tokens - * and is basically n / num tokens where n is the token starting before 140 characters - */ - VISIBLE_TOKEN_RATIO(ENCODED_TWEET_FEATURES_FIELD_NAME, "VISIBLE_TOKEN_RATIO", 182, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_QUOTE_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_QUOTE_FLAG", 183, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - FROM_BLUE_VERIFIED_ACCOUNT_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, - "FROM_BLUE_VERIFIED_ACCOUNT_FLAG", - 184, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - TWEET_SIGNATURE(ENCODED_TWEET_FEATURES_FIELD_NAME, "TWEET_SIGNATURE", 188, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - // MEDIA TYPES - HAS_CONSUMER_VIDEO_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_CONSUMER_VIDEO_FLAG", 189, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_PRO_VIDEO_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_PRO_VIDEO_FLAG", 190, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_VINE_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_VINE_FLAG", 191, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_PERISCOPE_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_PERISCOPE_FLAG", 192, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - HAS_NATIVE_IMAGE_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "HAS_NATIVE_IMAGE_FLAG", 193, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - // NOTE: if possible, please reserve field ID 194 to 196 for future media types (SEARCH-9131) - - IS_NULLCAST_FLAG(ENCODED_TWEET_FEATURES_FIELD_NAME, "IS_NULLCAST_FLAG", 197, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, EarlybirdCluster.ALL_CLUSTERS), - - // EXTENDED ENCODED TWEET FEATURES that's not available on archive clusters - EXTENDED_ENCODED_TWEET_FEATURES_FIELD(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, 
200, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - EMBEDS_IMPRESSION_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EMBEDS_IMPRESSION_COUNT", - 221, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER), - EMBEDS_URL_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EMBEDS_URL_COUNT", - 222, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER), - VIDEO_VIEW_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "VIDEO_VIEW_COUNT", - 223, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.LEGACY_BYTE_NORMALIZER), - - // empty bits in integer 0 (starting bit 24, 8 bits) - EXTENDED_FEATURE_UNUSED_BITS_0_24_8(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_0_24_8", 244, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // SEARCH-8564 - Reference Tweet Author ID - REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT", 202, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT", 203, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // SEARCHQUAL-8130: engagement counters v2 - // Integer 3 - RETWEET_COUNT_V2(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "RETWEET_COUNT_V2", 225, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - 
EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - FAVORITE_COUNT_V2(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAVORITE_COUNT_V2", 226, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - REPLY_COUNT_V2(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "REPLY_COUNT_V2", 227, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - EMBEDS_IMPRESSION_COUNT_V2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EMBEDS_IMPRESSION_COUNT_V2", - 228, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Integer 4 - EMBEDS_URL_COUNT_V2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EMBEDS_URL_COUNT_V2", - 229, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - VIDEO_VIEW_COUNT_V2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "VIDEO_VIEW_COUNT_V2", - 230, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - QUOTE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "QUOTE_COUNT", - 231, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Tweet Safety Labels - LABEL_ABUSIVE_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_ABUSIVE_FLAG", 232, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - 
LABEL_ABUSIVE_HI_RCL_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_ABUSIVE_HI_RCL_FLAG", 233, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - LABEL_DUP_CONTENT_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_DUP_CONTENT_FLAG", 234, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - LABEL_NSFW_HI_PRC_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_NSFW_HI_PRC_FLAG", 235, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - LABEL_NSFW_HI_RCL_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_NSFW_HI_RCL_FLAG", 236, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - LABEL_SPAM_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_SPAM_FLAG", 237, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - LABEL_SPAM_HI_RCL_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LABEL_SPAM_HI_RCL_FLAG", 238, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // please save this bit for other safety labels - EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_4_31_1", 239, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // Integer 5 - WEIGHTED_RETWEET_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "WEIGHTED_RETWEET_COUNT", - 240, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - WEIGHTED_REPLY_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "WEIGHTED_REPLY_COUNT", - 241, - 
FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - WEIGHTED_FAVORITE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "WEIGHTED_FAVORITE_COUNT", - 242, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - WEIGHTED_QUOTE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "WEIGHTED_QUOTE_COUNT", - 243, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Integer 6 - // Periscope features - PERISCOPE_EXISTS(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PERISCOPE_EXISTS", 245, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - PERISCOPE_HAS_BEEN_FEATURED(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PERISCOPE_HAS_BEEN_FEATURED", 246, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - PERISCOPE_IS_CURRENTLY_FEATURED(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PERISCOPE_IS_CURRENTLY_FEATURED", 247, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - PERISCOPE_IS_FROM_QUALITY_SOURCE(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PERISCOPE_IS_FROM_QUALITY_SOURCE", 248, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - PERISCOPE_IS_LIVE(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PERISCOPE_IS_LIVE", 249, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - IS_TRENDING_NOW_FLAG(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "IS_TRENDING_NOW_FLAG", 292, - FlagFeatureFieldType.FLAG_FEATURE_FIELD, - 
EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // remaining bits for integer 6 (starting bit 6, 26 remaining bits) - EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_7_6_26", 250, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // Decaying engagement counters - // Integer 7 - DECAYED_RETWEET_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "DECAYED_RETWEET_COUNT", - 251, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - DECAYED_REPLY_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "DECAYED_REPLY_COUNT", - 252, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - DECAYED_FAVORITE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "DECAYED_FAVORITE_COUNT", - 253, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - DECAYED_QUOTE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "DECAYED_QUOTE_COUNT", - 254, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Fake engagement counters. The fake here is in the sense of spam, not in the sense of testing. - // Refer to [JIRA SEARCHQUAL-10736 Remove Fake Engagements in Search] for more details. 
- // Integer 8 - FAKE_RETWEET_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAKE_RETWEET_COUNT", 269, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - FAKE_REPLY_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAKE_REPLY_COUNT", 270, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - FAKE_FAVORITE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAKE_FAVORITE_COUNT", 271, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - FAKE_QUOTE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "FAKE_QUOTE_COUNT", 272, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Last engagement timestamps. 
These features use the Tweet's creation time as base and - // are incremented every 1 hour - // Integer 9 - LAST_RETWEET_SINCE_CREATION_HRS( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LAST_RETWEET_SINCE_CREATION_HRS", - 273, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE), - LAST_REPLY_SINCE_CREATION_HRS( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LAST_REPLY_SINCE_CREATION_HRS", - 274, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE), - LAST_FAVORITE_SINCE_CREATION_HRS( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LAST_FAVORITE_SINCE_CREATION_HRS", - 275, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE), - LAST_QUOTE_SINCE_CREATION_HRS( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "LAST_QUOTE_SINCE_CREATION_HRS", - 276, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE), - - // 4 bits hashtag count, mention count and stock count (SEARCH-24336) - // Integer 10 - NUM_HASHTAGS_V2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "NUM_HASHTAGS_V2", - 277, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE - ), - NUM_MENTIONS_V2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "NUM_MENTIONS_V2", - 278, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.NONE - ), - NUM_STOCKS( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "NUM_STOCKS", - 279, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - 
ThriftFeatureNormalizationType.NONE - ), - - // Integer 11 - // Blink engagement counters - BLINK_RETWEET_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "BLINK_RETWEET_COUNT", - 280, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - BLINK_REPLY_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "BLINK_REPLY_COUNT", - 281, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - BLINK_FAVORITE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "BLINK_FAVORITE_COUNT", - 282, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - BLINK_QUOTE_COUNT( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "BLINK_QUOTE_COUNT", - 283, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.SMART_INTEGER_NORMALIZER), - - // Integer 10 (remaining) - // Production Toxicity and PBlock score from HML (go/toxicity, go/pblock) - TOXICITY_SCORE( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "TOXICITY_SCORE", 284, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - PBLOCK_SCORE( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "PBLOCK_SCORE", 285, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - - // Integer 12 - // Experimental health model scores from HML - EXPERIMENTAL_HEALTH_MODEL_SCORE_1( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EXPERIMENTAL_HEALTH_MODEL_SCORE_1", 286, - 
FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - EXPERIMENTAL_HEALTH_MODEL_SCORE_2( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EXPERIMENTAL_HEALTH_MODEL_SCORE_2", 287, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - EXPERIMENTAL_HEALTH_MODEL_SCORE_3( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EXPERIMENTAL_HEALTH_MODEL_SCORE_3", 288, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - // remaining bits for index 12 (unused_bits_12) - EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_12_30_2", 289, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - // Integer 13 - // Experimental health model scores from HML (cont.) 
- EXPERIMENTAL_HEALTH_MODEL_SCORE_4( - EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "EXPERIMENTAL_HEALTH_MODEL_SCORE_4", 290, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - // Production pSpammyTweet score from HML (go/pspammytweet) - P_SPAMMY_TWEET_SCORE(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "P_SPAMMY_TWEET_SCORE", 291, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - // Production pReportedTweet score from HML (go/preportedtweet) - P_REPORTED_TWEET_SCORE(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "P_REPORTED_TWEET_SCORE", 293, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - // remaining bits for index 13 (unused_bits_13) - EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_13_30_2", 294, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS - ), - - // Integer 14 - // Health model scores from HML (cont.) 
- // Prod Spammy Tweet Content model score from Platform Manipulation (go/spammy-tweet-content) - SPAMMY_TWEET_CONTENT_SCORE(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "SPAMMY_TWEET_CONTENT_SCORE", 295, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS, - ThriftFeatureNormalizationType.PREDICTION_SCORE_NORMALIZER - ), - // remaining bits for index 14 (unused_bits_14) - EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS_14_10_22", 296, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS - ), - - // Note that the integer block index i in the names UNUSED_BITS{i}" below is 1-based, but the - // index j in UNUSED_BITS_{j}_x_y above is 0-based. - EXTENDED_TEST_FEATURE_UNUSED_BITS_16(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS16", 216, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - EXTENDED_TEST_FEATURE_UNUSED_BITS_17(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS17", 217, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - EXTENDED_TEST_FEATURE_UNUSED_BITS_18(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS18", 218, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - EXTENDED_TEST_FEATURE_UNUSED_BITS_19(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS19", 219, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS), - - 
EXTENDED_TEST_FEATURE_UNUSED_BITS_20(EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - "UNUSED_BITS20", 220, - FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - UnusedFeatureFieldType.UNUSED_FEATURE_FIELD, - EarlybirdCluster.TWITTER_IN_MEMORY_INDEX_FORMAT_ALL_CLUSTERS); - - // Filter field terms. These end up as terms in the "internal" field (id=18). So for example - // you can have a doc with field(internal) = "__filter_nullcast", "__filter_vine" and that will - // be a nullcast tweet with a vine link in it. - public static final String NULLCAST_FILTER_TERM = "nullcast"; - public static final String VERIFIED_FILTER_TERM = "verified"; - public static final String BLUE_VERIFIED_FILTER_TERM = "blue_verified"; - public static final String NATIVE_RETWEETS_FILTER_TERM = "nativeretweets"; - public static final String QUOTE_FILTER_TERM = "quote"; - public static final String REPLIES_FILTER_TERM = "replies"; - public static final String CONSUMER_VIDEO_FILTER_TERM = "consumer_video"; - public static final String PRO_VIDEO_FILTER_TERM = "pro_video"; - public static final String VINE_FILTER_TERM = "vine"; - public static final String PERISCOPE_FILTER_TERM = "periscope"; - public static final String PROFILE_GEO_FILTER_TERM = "profile_geo"; - public static final String SELF_THREAD_FILTER_TERM = "self_threads"; - public static final String DIRECTED_AT_FILTER_TERM = "directed_at"; - public static final String EXCLUSIVE_FILTER_TERM = "exclusive"; - - // Reserved terms for the internal field. 
- public static final String HAS_POSITIVE_SMILEY = "__has_positive_smiley"; - public static final String HAS_NEGATIVE_SMILEY = "__has_negative_smiley"; - public static final String IS_OFFENSIVE = "__is_offensive"; - - // Facet fields - public static final String MENTIONS_FACET = "mentions"; - public static final String HASHTAGS_FACET = "hashtags"; - public static final String STOCKS_FACET = "stocks"; - public static final String VIDEOS_FACET = "videos"; - public static final String IMAGES_FACET = "images"; - public static final String NEWS_FACET = "news"; - public static final String LANGUAGES_FACET = "languages"; - public static final String SOURCES_FACET = "sources"; - public static final String TWIMG_FACET = "twimg"; - public static final String FROM_USER_ID_FACET = "user_id"; - public static final String RETWEETS_FACET = "retweets"; - public static final String LINKS_FACET = "links"; - public static final String SPACES_FACET = "spaces"; - - /** - * Used by the query parser to check that the operator of a [filter X] query is valid. - * Also used by blender, though it probably shouldn't be. - */ - public static final ImmutableSet FACETS = ImmutableSet.builder() - .add(MENTIONS_FACET) - .add(HASHTAGS_FACET) - .add(STOCKS_FACET) - .add(VIDEOS_FACET) - .add(IMAGES_FACET) - .add(NEWS_FACET) - .add(LINKS_FACET) - .add(LANGUAGES_FACET) - .add(SOURCES_FACET) - .add(TWIMG_FACET) - .add(SPACES_FACET) - .build(); - - /** - * Used by blender to convert facet names to field names. We should find a way to get the - * information we need in blender without needing this map. 
- */ - public static final ImmutableMap FACET_TO_FIELD_MAP = - ImmutableMap.builder() - .put(MENTIONS_FACET, MENTIONS_FIELD.getFieldName()) - .put(HASHTAGS_FACET, HASHTAGS_FIELD.getFieldName()) - .put(STOCKS_FACET, STOCKS_FIELD.getFieldName()) - .put(VIDEOS_FACET, VIDEO_LINKS_FIELD.getFieldName()) - .put(IMAGES_FACET, IMAGE_LINKS_FIELD.getFieldName()) - .put(NEWS_FACET, NEWS_LINKS_FIELD.getFieldName()) - .put(LANGUAGES_FACET, ISO_LANGUAGE_FIELD.getFieldName()) - .put(SOURCES_FACET, SOURCE_FIELD.getFieldName()) - .put(TWIMG_FACET, TWIMG_LINKS_FIELD.getFieldName()) - .put(LINKS_FACET, LINKS_FIELD.getFieldName()) - .put(SPACES_FACET, SPACE_ID_FIELD.getFieldName()) - .build(); - - public static String getFacetSkipFieldName(String fieldName) { - return "__has_" + fieldName; - } - - private final String fieldName; - private final int fieldId; - private final EnumSet clusters; - private final FlagFeatureFieldType flagFeatureField; - - private final UnusedFeatureFieldType unusedField; - - // Only set for feature fields. - @Nullable - private final FeatureConfiguration featureConfiguration; - - // Only set for feature fields. - private final ThriftFeatureNormalizationType featureNormalizationType; - - // To simplify field configurations and reduce duplicate code, we give clusters a default value - EarlybirdFieldConstant(String fieldName, int fieldId) { - this(fieldName, fieldId, EarlybirdCluster.GENERAL_PURPOSE_CLUSTERS, null); - } - - EarlybirdFieldConstant(String fieldName, int fieldId, Set clusters) { - this(fieldName, fieldId, clusters, null); - } - - EarlybirdFieldConstant(String fieldName, int fieldId, EarlybirdCluster cluster) { - this(fieldName, fieldId, ImmutableSet.of(cluster), null); - } - - /** - * Base field name is needed here in order to construct the full - * name of the feature. Our convention is that a feature should be named - * as: baseFieldName.featureName. For example: encoded_tweet_features.retweet_count. 
- */ - EarlybirdFieldConstant( - String baseName, - String fieldName, - int fieldId, - FlagFeatureFieldType flagFeatureField, - Set clusters) { - this((baseName + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR + fieldName).toLowerCase(), - fieldId, clusters, flagFeatureField, null); - } - - EarlybirdFieldConstant( - String baseName, - String fieldName, - int fieldId, - FlagFeatureFieldType flagFeatureField, - UnusedFeatureFieldType unusedField, - Set clusters) { - this((baseName + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR + fieldName).toLowerCase(), - fieldId, clusters, flagFeatureField, unusedField, null); - } - - EarlybirdFieldConstant( - String baseName, - String fieldName, - int fieldId, - FlagFeatureFieldType flagFeatureField, - Set clusters, - ThriftFeatureNormalizationType featureNormalizationType) { - this((baseName + SchemaBuilder.CSF_VIEW_NAME_SEPARATOR + fieldName).toLowerCase(), - fieldId, clusters, flagFeatureField, UnusedFeatureFieldType.USED_FEATURE_FIELD, - featureNormalizationType, null); - } - - /** - * Constructor. - */ - EarlybirdFieldConstant(String fieldName, int fieldId, Set clusters, - @Nullable FeatureConfiguration featureConfiguration) { - this(fieldName, fieldId, clusters, FlagFeatureFieldType.NON_FLAG_FEATURE_FIELD, - featureConfiguration); - } - - /** - * Constructor. - */ - EarlybirdFieldConstant(String fieldName, - int fieldId, - Set clusters, - FlagFeatureFieldType flagFeatureField, - @Nullable FeatureConfiguration featureConfiguration) { - this(fieldName, fieldId, clusters, flagFeatureField, - UnusedFeatureFieldType.USED_FEATURE_FIELD, featureConfiguration); - } - - /** - * Constructor. - */ - EarlybirdFieldConstant(String fieldName, - int fieldId, - Set clusters, - FlagFeatureFieldType flagFeatureField, - UnusedFeatureFieldType unusedField, - @Nullable FeatureConfiguration featureConfiguration) { - this(fieldName, fieldId, clusters, flagFeatureField, unusedField, null, featureConfiguration); - } - - /** - * Constructor. 
- */ - EarlybirdFieldConstant(String fieldName, - int fieldId, - Set clusters, - FlagFeatureFieldType flagFeatureField, - UnusedFeatureFieldType unusedField, - @Nullable ThriftFeatureNormalizationType featureNormalizationType, - @Nullable FeatureConfiguration featureConfiguration) { - this.fieldId = fieldId; - this.fieldName = fieldName; - this.clusters = EnumSet.copyOf(clusters); - this.flagFeatureField = flagFeatureField; - this.unusedField = unusedField; - this.featureNormalizationType = featureNormalizationType; - this.featureConfiguration = featureConfiguration; - } - - // Override toString to make replacing StatusConstant Easier. - @Override - public String toString() { - return fieldName; - } - - public boolean isValidFieldInCluster(EarlybirdCluster cluster) { - return clusters.contains(cluster); - } - - public String getFieldName() { - return fieldName; - } - - public int getFieldId() { - return fieldId; - } - - public FlagFeatureFieldType getFlagFeatureField() { - return flagFeatureField; - } - - public boolean isFlagFeatureField() { - return flagFeatureField == FlagFeatureFieldType.FLAG_FEATURE_FIELD; - } - - public boolean isUnusedField() { - return unusedField == UnusedFeatureFieldType.UNUSED_FEATURE_FIELD; - } - - @Nullable - public FeatureConfiguration getFeatureConfiguration() { - return featureConfiguration; - } - - @Nullable - public ThriftFeatureNormalizationType getFeatureNormalizationType() { - return featureNormalizationType; - } - } - - private static final Map NAME_TO_ID_MAP; - private static final Map ID_TO_FIELD_MAP; - static { - ImmutableMap.Builder nameToIdMapBuilder = - ImmutableMap.builder(); - ImmutableMap.Builder idToFieldMapBuilder = - ImmutableMap.builder(); - Set fieldNameDupDetector = Sets.newHashSet(); - Set fieldIdDupDetector = Sets.newHashSet(); - for (EarlybirdFieldConstant fc : EarlybirdFieldConstant.values()) { - if (fieldNameDupDetector.contains(fc.getFieldName())) { - throw new IllegalStateException("detected fields 
sharing field name: " + fc.getFieldName()); - } - if (fieldIdDupDetector.contains(fc.getFieldId())) { - throw new IllegalStateException("detected fields sharing field id: " + fc.getFieldId()); - } - - fieldNameDupDetector.add(fc.getFieldName()); - fieldIdDupDetector.add(fc.getFieldId()); - nameToIdMapBuilder.put(fc.getFieldName(), fc); - idToFieldMapBuilder.put(fc.getFieldId(), fc); - } - NAME_TO_ID_MAP = nameToIdMapBuilder.build(); - ID_TO_FIELD_MAP = idToFieldMapBuilder.build(); - } - - // This define the list of boolean features, but the name does not have "flag" inside. This - // definition is only for double checking purpose to prevent code change mistakes. The setting - // of the flag feature is based on FlagFeatureFieldType.FLAG_FEATURE_FIELD. - public static final Set EXTRA_FLAG_FIELDS = - Sets.newHashSet(EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT); - public static final String FLAG_STRING = "flag"; - - private static final List FLAG_FEATURE_FIELDS; - static { - ImmutableList.Builder flagFieldBuilder = ImmutableList.builder(); - for (EarlybirdFieldConstant fc : EarlybirdFieldConstant.values()) { - if (fc.getFlagFeatureField() == FlagFeatureFieldType.FLAG_FEATURE_FIELD - && !fc.isUnusedField()) { - flagFieldBuilder.add(fc); - } - } - FLAG_FEATURE_FIELDS = flagFieldBuilder.build(); - } - - /** - * Get all the flag features meaning that they are boolean features with only 1 bit in the packed - * feature encoding. - */ - public static Collection getFlagFeatureFields() { - return FLAG_FEATURE_FIELDS; - } - - /** - * Get the EarlybirdFieldConstant for the specified field. - */ - public static EarlybirdFieldConstant getFieldConstant(String fieldName) { - EarlybirdFieldConstant field = NAME_TO_ID_MAP.get(fieldName); - if (field == null) { - throw new IllegalArgumentException("Unknown field: " + fieldName); - } - return field; - } - - /** - * Get the EarlybirdFieldConstant for the specified field. 
- */ - public static EarlybirdFieldConstant getFieldConstant(int fieldId) { - EarlybirdFieldConstant field = ID_TO_FIELD_MAP.get(fieldId); - if (field == null) { - throw new IllegalArgumentException("Unknown field: " + fieldId); - } - return field; - } - - /** - * Determines if there's a field with the given ID. - */ - public static boolean hasFieldConstant(int fieldId) { - return ID_TO_FIELD_MAP.keySet().contains(fieldId); - } - - @Override - public final int getFieldID(String fieldName) { - return getFieldConstant(fieldName).getFieldId(); - } - - public static final String formatGeoType(ThriftGeoLocationSource source) { - return "__geo_location_type_" + source.name().toLowerCase(); - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.docx new file mode 100644 index 000000000..e230218f9 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.java deleted file mode 100644 index 095e00fe5..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaBuilder.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; - -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.schema.SchemaBuilder; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration; -import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings; -import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer; -import 
com.twitter.search.common.util.analysis.CharTermAttributeSerializer; -import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer; - -/** - * Build class used to build a ThriftSchema - */ -public class EarlybirdSchemaBuilder extends SchemaBuilder { - private final EarlybirdCluster cluster; - - public EarlybirdSchemaBuilder(FieldNameToIdMapping idMapping, - EarlybirdCluster cluster, - TokenStreamSerializer.Version tokenStreamSerializerVersion) { - super(idMapping, tokenStreamSerializerVersion); - this.cluster = cluster; - } - - /** - * Configure the specified field to be Out-of-order. - * In the realtime cluster, this causes Earlybird to used the skip list posting format. - */ - public final EarlybirdSchemaBuilder withOutOfOrderEnabledForField(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - Preconditions.checkState(settings.isSetIndexedFieldSettings(), - "Out of order field must be indexed"); - settings.getIndexedFieldSettings().setSupportOutOfOrderAppends(true); - return this; - } - - /** - * This turns on tweet specific normalizations. This turns on the following two token processors: - * {@link com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter} - * {@link com.twitter.search.common.util.text.filter.NormalizedTokenFilter} - *

- * HashtagMentionPunctuationSplitter would break a mention or hashtag like @ab_cd or #ab_cd into - * tokens {ab, cd}. - * NormalizedTokenFilter strips out the # @ $ from the tokens. - */ - public final EarlybirdSchemaBuilder withTweetSpecificNormalization(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings settings = - schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings(); - Preconditions.checkState(settings.isSetIndexedFieldSettings(), - "Tweet text field must be indexed."); - settings.getIndexedFieldSettings().setDeprecated_performTweetSpecificNormalizations(true); - return this; - } - - /** - * Add a twitter photo facet field. - */ - public final EarlybirdSchemaBuilder withPhotoUrlFacetField(String fieldName) { - if (!shouldIncludeField(fieldName)) { - return this; - } - ThriftFieldSettings photoFieldSettings = getNoPositionNoFreqSettings(); - ThriftTokenStreamSerializer tokenStreamSerializer = - new ThriftTokenStreamSerializer(tokenStreamSerializerVersion); - tokenStreamSerializer.setAttributeSerializerClassNames( - ImmutableList.of( - CharTermAttributeSerializer.class.getName(), - TermPayloadAttributeSerializer.class.getName())); - photoFieldSettings - .getIndexedFieldSettings() - .setTokenStreamSerializer(tokenStreamSerializer) - .setTokenized(true); - putIntoFieldConfigs(idMapping.getFieldID(fieldName), - new ThriftFieldConfiguration(fieldName).setSettings(photoFieldSettings)); - return this; - } - - /** - * Returns whether the given field should be included or dropped. 
- */ - @Override - protected boolean shouldIncludeField(String fieldName) { - return EarlybirdFieldConstants.getFieldConstant(fieldName).isValidFieldInCluster(cluster); - } -} - diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.docx new file mode 100644 index 000000000..b5faf58e0 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.java deleted file mode 100644 index f2376cf6b..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdSchemaCreateTool.java +++ /dev/null @@ -1,702 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.AnalyzerFactory; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.schema.ImmutableSchema; -import com.twitter.search.common.schema.SchemaBuilder; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint; -import com.twitter.search.common.schema.thriftjava.ThriftSchema; - -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_FAVORITE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_QUOTE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_REPLY_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.BLINK_RETWEET_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_FAVORITE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_QUOTE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_REPLY_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.DECAYED_RETWEET_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_IMPRESSION_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EMBEDS_URL_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_1; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_2; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_3; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXPERIMENTAL_HEALTH_MODEL_SCORE_4; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_FEATURE_UNUSED_BITS_0_24_8; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_16; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_17; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_18; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_19; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_20; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_FAVORITE_COUNT; -import 
static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_QUOTE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_REPLY_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAKE_RETWEET_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FAVORITE_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CARD_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_CONSUMER_VIDEO_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_EXPANDO_CARD_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_LINK_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_MULTIPLE_MEDIA_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_NEWS_URL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PERISCOPE_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_PRO_VIDEO_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_QUOTE_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_TREND_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VINE_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_NULLCAST_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_OFFENSIVE_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_REPLY_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_RETWEET_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_SENSITIVE_CONTENT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_TRENDING_NOW_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_BOT_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NEW_FLAG; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_NSFW_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.IS_USER_SPAM_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_ABUSIVE_HI_RCL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_DUP_CONTENT_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_PRC_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_NSFW_HI_RCL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LABEL_SPAM_HI_RCL_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LANGUAGE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_FAVORITE_SINCE_CREATION_HRS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_QUOTE_SINCE_CREATION_HRS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_REPLY_SINCE_CREATION_HRS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LAST_RETWEET_SINCE_CREATION_HRS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.LINK_LANGUAGE; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_HASHTAGS_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_MENTIONS_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.NUM_STOCKS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PARUS_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PBLOCK_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_EXISTS; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_HAS_BEEN_FEATURED; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_CURRENTLY_FEATURED; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_FROM_QUALITY_SOURCE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PERISCOPE_IS_LIVE; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PREV_USER_TWEET_ENGAGEMENT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.PROFILE_IS_EGG_FLAG; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_REPORTED_TWEET_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.P_SPAMMY_TWEET_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.QUOTE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.REPLY_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.RETWEET_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.SPAMMY_TWEET_CONTENT_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TOXICITY_SCORE; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.TWEET_SIGNATURE; -import static 
com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.USER_REPUTATION; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_VIEW_COUNT_V2; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.VISIBLE_TOKEN_RATIO; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_FAVORITE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_QUOTE_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_REPLY_COUNT; -import static com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant.WEIGHTED_RETWEET_COUNT; - -/** - * Field configurations for Earlybird. - */ -public final class EarlybirdSchemaCreateTool { - // How many times a schema is built - private static final SearchCounter SCHEMA_BUILD_COUNT = - SearchCounter.export("schema_build_count"); - - // Number of integers for the column of ENCODED_TWEET_FEATURES_FIELD. - @VisibleForTesting - public static final int NUMBER_OF_INTEGERS_FOR_FEATURES = 5; - - // Number of integers for the column of EXTENDED_ENCODED_TWEET_FEATURES_FIELD. 
- // extra 80 bytes - // In realtime cluster, assuming 19 segments total, and 8388608 docs per segment - // this would amount to about 12.75GB of memory needed - // - @VisibleForTesting - public static final int NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES = 20; - - @VisibleForTesting - public static final Map FEATURE_CONFIGURATION_MAP - = Maps.newLinkedHashMap(); - - public static final String BASE_FIELD_NAME = - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName(); - - private static String getBaseFieldName(String fullName) { - int index = fullName.indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR); - Preconditions.checkArgument(index > 0); - return fullName.substring(0, index); - } - - private static String getBaseFieldName(EarlybirdFieldConstant fieldConstant) { - return getBaseFieldName(fieldConstant.getFieldName()); - } - - private static String getFeatureNameInField(EarlybirdFieldConstant fieldConstant) { - int index = fieldConstant.getFieldName().indexOf(SchemaBuilder.CSF_VIEW_NAME_SEPARATOR); - Preconditions.checkArgument(index > 0); - return fieldConstant.getFieldName().substring(index + 1); - } - - // defining all features - static { - // Add individual tweet encoded features as views on top of - // EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD - - // int intIndex, int bitStartPos, int bitLength - newEarlybirdFeatureConfiguration(IS_RETWEET_FLAG, ThriftCSFType.BOOLEAN, 0, 0, 1); - newEarlybirdFeatureConfiguration(IS_OFFENSIVE_FLAG, ThriftCSFType.BOOLEAN, 0, 1, 1); - newEarlybirdFeatureConfiguration(HAS_LINK_FLAG, ThriftCSFType.BOOLEAN, 0, 2, 1); - newEarlybirdFeatureConfiguration(HAS_TREND_FLAG, ThriftCSFType.BOOLEAN, 0, 3, 1); - newEarlybirdFeatureConfiguration(IS_REPLY_FLAG, ThriftCSFType.BOOLEAN, 0, 4, 1); - newEarlybirdFeatureConfiguration(IS_SENSITIVE_CONTENT, ThriftCSFType.BOOLEAN, 0, 5, 1); - newEarlybirdFeatureConfiguration(HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG, - ThriftCSFType.BOOLEAN, 0, 6, 1); - 
newEarlybirdFeatureConfiguration(FROM_VERIFIED_ACCOUNT_FLAG, ThriftCSFType.BOOLEAN, 0, 7, 1); - newEarlybirdFeatureConfiguration(TEXT_SCORE, ThriftCSFType.INT, 0, 8, 8); - newEarlybirdFeatureConfiguration(LANGUAGE, ThriftCSFType.INT, 0, 16, 8); - newEarlybirdFeatureConfiguration(LINK_LANGUAGE, ThriftCSFType.INT, 0, 24, 8); - - newEarlybirdFeatureConfiguration(HAS_IMAGE_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 0, 1); - newEarlybirdFeatureConfiguration(HAS_VIDEO_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 1, 1); - newEarlybirdFeatureConfiguration(HAS_NEWS_URL_FLAG, ThriftCSFType.BOOLEAN, 1, 2, 1); - newEarlybirdFeatureConfiguration(HAS_EXPANDO_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 3, 1); - newEarlybirdFeatureConfiguration(HAS_MULTIPLE_MEDIA_FLAG, ThriftCSFType.BOOLEAN, 1, 4, 1); - newEarlybirdFeatureConfiguration(PROFILE_IS_EGG_FLAG, ThriftCSFType.BOOLEAN, 1, 5, 1); - newEarlybirdFeatureConfiguration(NUM_MENTIONS, ThriftCSFType.INT, 1, 6, 2); // 0, 1, 2, 3+ - newEarlybirdFeatureConfiguration(NUM_HASHTAGS, ThriftCSFType.INT, 1, 8, 2); // 0, 1, 2, 3+ - newEarlybirdFeatureConfiguration(HAS_CARD_FLAG, ThriftCSFType.BOOLEAN, 1, 10, 1); - newEarlybirdFeatureConfiguration(HAS_VISIBLE_LINK_FLAG, ThriftCSFType.BOOLEAN, 1, 11, 1); - newEarlybirdFeatureConfiguration(USER_REPUTATION, ThriftCSFType.INT, 1, 12, 8); - newEarlybirdFeatureConfiguration(IS_USER_SPAM_FLAG, ThriftCSFType.BOOLEAN, 1, 20, 1); - newEarlybirdFeatureConfiguration(IS_USER_NSFW_FLAG, ThriftCSFType.BOOLEAN, 1, 21, 1); - newEarlybirdFeatureConfiguration(IS_USER_BOT_FLAG, ThriftCSFType.BOOLEAN, 1, 22, 1); - newEarlybirdFeatureConfiguration(IS_USER_NEW_FLAG, ThriftCSFType.BOOLEAN, 1, 23, 1); - newEarlybirdFeatureConfiguration(PREV_USER_TWEET_ENGAGEMENT, ThriftCSFType.INT, 1, 24, 6); - newEarlybirdFeatureConfiguration(COMPOSER_SOURCE_IS_CAMERA_FLAG, - ThriftCSFType.BOOLEAN, 1, 30, 1); - newEarlybirdFeatureConfiguration(IS_NULLCAST_FLAG, ThriftCSFType.BOOLEAN, 1, 31, 1); - - newEarlybirdFeatureConfiguration(RETWEET_COUNT, 
ThriftCSFType.DOUBLE, 2, 0, 8, - ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(FAVORITE_COUNT, ThriftCSFType.DOUBLE, 2, 8, 8, - ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(REPLY_COUNT, ThriftCSFType.DOUBLE, 2, 16, 8, - ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(PARUS_SCORE, ThriftCSFType.DOUBLE, 2, 24, 8); - - newEarlybirdFeatureConfiguration(HAS_CONSUMER_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 0, 1); - newEarlybirdFeatureConfiguration(HAS_PRO_VIDEO_FLAG, ThriftCSFType.BOOLEAN, 3, 1, 1); - newEarlybirdFeatureConfiguration(HAS_VINE_FLAG, ThriftCSFType.BOOLEAN, 3, 2, 1); - newEarlybirdFeatureConfiguration(HAS_PERISCOPE_FLAG, ThriftCSFType.BOOLEAN, 3, 3, 1); - newEarlybirdFeatureConfiguration(HAS_NATIVE_IMAGE_FLAG, ThriftCSFType.BOOLEAN, 3, 4, 1); - // NOTE: There are 3 bits left in the first byte of INT 3, if possible, please reserve them - // for future media types (SEARCH-9131) - // newEarlybirdFeatureConfiguration(FUTURE_MEDIA_BITS, ThriftCSFType.INT, 3, 5, 3); - - newEarlybirdFeatureConfiguration(VISIBLE_TOKEN_RATIO, ThriftCSFType.INT, 3, 8, 4); - newEarlybirdFeatureConfiguration(HAS_QUOTE_FLAG, ThriftCSFType.BOOLEAN, 3, 12, 1); - newEarlybirdFeatureConfiguration(FROM_BLUE_VERIFIED_ACCOUNT_FLAG, - ThriftCSFType.BOOLEAN, 3, 13, 1); - // Unused bits from bit 14 to bit 31 (18 bits) - // newEarlybirdFeatureConfiguration(UNUSED_BITS, ThriftCSFType.INT, 3, 14, 18); - - newEarlybirdFeatureConfiguration(TWEET_SIGNATURE, ThriftCSFType.INT, 4, 0, 32); - - newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT, - ThriftCSFType.DOUBLE, 0, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT, - ThriftCSFType.DOUBLE, 0, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT, - ThriftCSFType.DOUBLE, 0, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - - // Unused bits from bit 24 to bit 31 (8 
bits). - // This used to be a feature that was decommissioned (SEARCHQUAL-10321) - newEarlybirdFeatureConfiguration(EXTENDED_FEATURE_UNUSED_BITS_0_24_8, - ThriftCSFType.INT, 0, 24, 8); - - newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT, - ThriftCSFType.INT, 1, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE); - newEarlybirdFeatureConfiguration(REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT, - ThriftCSFType.INT, 2, 0, 32, ThriftFeatureUpdateConstraint.IMMUTABLE); - - newEarlybirdFeatureConfiguration(RETWEET_COUNT_V2, - ThriftCSFType.DOUBLE, 3, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(FAVORITE_COUNT_V2, - ThriftCSFType.DOUBLE, 3, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(REPLY_COUNT_V2, - ThriftCSFType.DOUBLE, 3, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(EMBEDS_IMPRESSION_COUNT_V2, - ThriftCSFType.DOUBLE, 3, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - - newEarlybirdFeatureConfiguration(EMBEDS_URL_COUNT_V2, - ThriftCSFType.DOUBLE, 4, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(VIDEO_VIEW_COUNT_V2, - ThriftCSFType.DOUBLE, 4, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(QUOTE_COUNT, - ThriftCSFType.DOUBLE, 4, 16, 8); - - newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_FLAG, ThriftCSFType.BOOLEAN, 4, 24, 1); - newEarlybirdFeatureConfiguration(LABEL_ABUSIVE_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 25, 1); - newEarlybirdFeatureConfiguration(LABEL_DUP_CONTENT_FLAG, ThriftCSFType.BOOLEAN, 4, 26, 1); - newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_PRC_FLAG, ThriftCSFType.BOOLEAN, 4, 27, 1); - newEarlybirdFeatureConfiguration(LABEL_NSFW_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 28, 1); - newEarlybirdFeatureConfiguration(LABEL_SPAM_FLAG, ThriftCSFType.BOOLEAN, 4, 29, 1); - newEarlybirdFeatureConfiguration(LABEL_SPAM_HI_RCL_FLAG, ThriftCSFType.BOOLEAN, 4, 30, 1); 
- - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_4_31_1, - ThriftCSFType.INT, 4, 31, 1); - - newEarlybirdFeatureConfiguration(WEIGHTED_RETWEET_COUNT, - ThriftCSFType.DOUBLE, 5, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(WEIGHTED_REPLY_COUNT, - ThriftCSFType.DOUBLE, 5, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(WEIGHTED_FAVORITE_COUNT, - ThriftCSFType.DOUBLE, 5, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(WEIGHTED_QUOTE_COUNT, - ThriftCSFType.DOUBLE, 5, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - - newEarlybirdFeatureConfiguration(PERISCOPE_EXISTS, - ThriftCSFType.BOOLEAN, 6, 0, 1); - newEarlybirdFeatureConfiguration(PERISCOPE_HAS_BEEN_FEATURED, - ThriftCSFType.BOOLEAN, 6, 1, 1); - newEarlybirdFeatureConfiguration(PERISCOPE_IS_CURRENTLY_FEATURED, - ThriftCSFType.BOOLEAN, 6, 2, 1); - newEarlybirdFeatureConfiguration(PERISCOPE_IS_FROM_QUALITY_SOURCE, - ThriftCSFType.BOOLEAN, 6, 3, 1); - newEarlybirdFeatureConfiguration(PERISCOPE_IS_LIVE, - ThriftCSFType.BOOLEAN, 6, 4, 1); - - newEarlybirdFeatureConfiguration(IS_TRENDING_NOW_FLAG, - ThriftCSFType.BOOLEAN, 6, 5, 1); - - // remaining bits for integer 6 - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_7_6_26, - ThriftCSFType.INT, 6, 6, 26); - - // The decaying counters can become smaller - newEarlybirdFeatureConfiguration(DECAYED_RETWEET_COUNT, - ThriftCSFType.DOUBLE, 7, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(DECAYED_REPLY_COUNT, - ThriftCSFType.DOUBLE, 7, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(DECAYED_FAVORITE_COUNT, - ThriftCSFType.DOUBLE, 7, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(DECAYED_QUOTE_COUNT, - ThriftCSFType.DOUBLE, 7, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE); - - // The fake engagement counters. 
- newEarlybirdFeatureConfiguration(FAKE_RETWEET_COUNT, - ThriftCSFType.DOUBLE, 8, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(FAKE_REPLY_COUNT, - ThriftCSFType.DOUBLE, 8, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(FAKE_FAVORITE_COUNT, - ThriftCSFType.DOUBLE, 8, 16, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(FAKE_QUOTE_COUNT, - ThriftCSFType.DOUBLE, 8, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE); - - newEarlybirdFeatureConfiguration(LAST_RETWEET_SINCE_CREATION_HRS, - ThriftCSFType.INT, 9, 0, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(LAST_REPLY_SINCE_CREATION_HRS, - ThriftCSFType.INT, 9, 8, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(LAST_FAVORITE_SINCE_CREATION_HRS, - ThriftCSFType.INT, 9, 16, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - newEarlybirdFeatureConfiguration(LAST_QUOTE_SINCE_CREATION_HRS, - ThriftCSFType.INT, 9, 24, 8, ThriftFeatureUpdateConstraint.INC_ONLY); - - newEarlybirdFeatureConfiguration(NUM_HASHTAGS_V2, - ThriftCSFType.INT, 10, 0, 4); - newEarlybirdFeatureConfiguration(NUM_MENTIONS_V2, - ThriftCSFType.INT, 10, 4, 4); - newEarlybirdFeatureConfiguration(NUM_STOCKS, - ThriftCSFType.INT, 10, 8, 4); - - // Remaining bits for integer 10 - // Production Toxicity and PBlock score from HML (go/toxicity, go/pblock) - newEarlybirdFeatureConfiguration(TOXICITY_SCORE, - ThriftCSFType.DOUBLE, 10, 12, 10); - newEarlybirdFeatureConfiguration(PBLOCK_SCORE, - ThriftCSFType.DOUBLE, 10, 22, 10); - - // The blink engagement counters - newEarlybirdFeatureConfiguration(BLINK_RETWEET_COUNT, - ThriftCSFType.DOUBLE, 11, 0, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(BLINK_REPLY_COUNT, - ThriftCSFType.DOUBLE, 11, 8, 8, ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(BLINK_FAVORITE_COUNT, - ThriftCSFType.DOUBLE, 11, 16, 8, 
ThriftFeatureUpdateConstraint.POSITIVE); - newEarlybirdFeatureConfiguration(BLINK_QUOTE_COUNT, - ThriftCSFType.DOUBLE, 11, 24, 8, ThriftFeatureUpdateConstraint.POSITIVE); - - // Experimental health model scores from HML - newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_1, - ThriftCSFType.DOUBLE, 12, 0, 10); - newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_2, - ThriftCSFType.DOUBLE, 12, 10, 10); - newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_3, - ThriftCSFType.DOUBLE, 12, 20, 10); - // remaining bits for integer 12 - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_12_30_2, - ThriftCSFType.INT, 12, 30, 2); - - // Experimental health model scores from HML (cont.) - newEarlybirdFeatureConfiguration(EXPERIMENTAL_HEALTH_MODEL_SCORE_4, - ThriftCSFType.DOUBLE, 13, 0, 10); - // Production pSpammyTweet score from HML (go/pspammytweet) - newEarlybirdFeatureConfiguration(P_SPAMMY_TWEET_SCORE, - ThriftCSFType.DOUBLE, 13, 10, 10); - // Production pReportedTweet score from HML (go/preportedtweet) - newEarlybirdFeatureConfiguration(P_REPORTED_TWEET_SCORE, - ThriftCSFType.DOUBLE, 13, 20, 10); - // remaining bits for integer 13 - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_13_30_2, - ThriftCSFType.INT, 13, 30, 2); - - // Experimental health model scores from HML (cont.) - // Prod Spammy Tweet Content model score from Platform Manipulation (go/spammy-tweet-content) - newEarlybirdFeatureConfiguration(SPAMMY_TWEET_CONTENT_SCORE, - ThriftCSFType.DOUBLE, 14, 0, 10); - // remaining bits for integer 14 - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_14_10_22, - ThriftCSFType.INT, 14, 10, 22); - - // Note that the integer index below is 0-based, but the index j in UNUSED_BITS_{j} below - // is 1-based. 
- newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_16, - ThriftCSFType.INT, 15, 0, 32); - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_17, - ThriftCSFType.INT, 16, 0, 32); - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_18, - ThriftCSFType.INT, 17, 0, 32); - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_19, - ThriftCSFType.INT, 18, 0, 32); - newEarlybirdFeatureConfiguration(EXTENDED_TEST_FEATURE_UNUSED_BITS_20, - ThriftCSFType.INT, 19, 0, 32); - } - - private EarlybirdSchemaCreateTool() { } - - /** - * Get schema for the Earlybird. - */ - public static DynamicSchema buildSchema(EarlybirdCluster cluster) - throws Schema.SchemaValidationException { - SCHEMA_BUILD_COUNT.increment(); - return new DynamicSchema(new ImmutableSchema(buildThriftSchema(cluster), - new AnalyzerFactory(), - cluster.getNameForStats())); - } - - /** - * Get schema for the Earlybird, can throw runtime exception. This is mostly for static schema - * usage, which does not care about schema updates. - */ - @VisibleForTesting - public static DynamicSchema buildSchemaWithRuntimeException(EarlybirdCluster cluster) { - try { - return buildSchema(cluster); - } catch (Schema.SchemaValidationException e) { - throw new RuntimeException(e); - } - } - - private static FeatureConfiguration newEarlybirdFeatureConfiguration( - EarlybirdFieldConstant fieldConstant, - ThriftCSFType type, - int intIndex, int bitStartPos, int bitLength, - ThriftFeatureUpdateConstraint... 
constraints) { - - if (!fieldConstant.isFlagFeatureField() && type == ThriftCSFType.BOOLEAN) { - throw new IllegalArgumentException( - "Non-flag feature field configured with boolean Thrift type: " + fieldConstant); - } - if (fieldConstant.isFlagFeatureField() && type != ThriftCSFType.BOOLEAN) { - throw new IllegalArgumentException( - "Flag feature field configured with non-boolean Thrift type: " + fieldConstant); - } - - String baseFieldName = getBaseFieldName(fieldConstant); - String name = getFeatureNameInField(fieldConstant); - FeatureConfiguration.Builder builder = FeatureConfiguration.builder() - .withName(name) - .withType(type) - .withBitRange(intIndex, bitStartPos, bitLength); - // remove the following line once we configure features purely by the schema - builder.withBaseField(baseFieldName); - - if (!fieldConstant.isUnusedField()) { - builder.withOutputType(type); - } - if (fieldConstant.getFeatureNormalizationType() != null) { - builder.withFeatureNormalizationType(fieldConstant.getFeatureNormalizationType()); - } - - for (ThriftFeatureUpdateConstraint constraint : constraints) { - builder.withFeatureUpdateConstraint(constraint); - } - FeatureConfiguration featureConfiguration = builder.build(); - FEATURE_CONFIGURATION_MAP.put(fieldConstant.getFieldName(), featureConfiguration); - return featureConfiguration; - } - - /** - * Build ThriftSchema for the Earlybird. Note that the schema returned can be used - * all Earlybird clusters. However, some clusters may not use all the field configurations. 
- */ - @VisibleForTesting - public static ThriftSchema buildThriftSchema(EarlybirdCluster cluster) { - EarlybirdSchemaBuilder builder = new EarlybirdSchemaBuilder( - new EarlybirdFieldConstants(), cluster, TokenStreamSerializer.Version.VERSION_2); - - builder.withSchemaVersion( - FlushVersion.CURRENT_FLUSH_VERSION.getVersionNumber(), - FlushVersion.CURRENT_FLUSH_VERSION.getMinorVersion(), - FlushVersion.CURRENT_FLUSH_VERSION.getDescription(), - FlushVersion.CURRENT_FLUSH_VERSION.isOfficial()); - - // ID field, used for partitioning - builder.withPartitionFieldId(0) - .withSortableLongTermField(EarlybirdFieldConstant.ID_FIELD.getFieldName()) - // Text Fields that are searched by default - .withTextField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), true) - .withSearchFieldByDefault( - EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), 0.1f) - .withPretokenizedTextField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), true) - .withSearchFieldByDefault(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), 1.0f); - builder.withTweetSpecificNormalization(EarlybirdFieldConstant.TEXT_FIELD.getFieldName()) - .withTextField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), true) - .withSearchFieldByDefault( - EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), 0.2f) - - // Text fields not searched by default - .withTextField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), false) - .withTextField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), false) - - // cards are not searched by default, and have weight 0. 
- .withPretokenizedTextField(EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), false) - .withPretokenizedTextField( - EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), false) - .withTextField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), false) - - // Out-of-order append fields - .withLongTermField(EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName()) - - // No Position fields, sorted alphabetically - .withPretokenizedNoPositionField(EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName()) - .withIntTermField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName()) - .withTextField(EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), false) - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName()) - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.PROFILE_GEO_COUNTRY_CODE_FIELD.getFieldName()) - .withTextField(EarlybirdFieldConstant.PROFILE_GEO_REGION_FIELD.getFieldName(), false) 
- .withTextField(EarlybirdFieldConstant.PROFILE_GEO_LOCALITY_FIELD.getFieldName(), false) - .withTermTextLookup(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName()) - .withTermTextLookup(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName()) - .withPretokenizedNoPositionField(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName()) - .withIndexedNotTokenizedField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD) - .withIndexedNotTokenizedField(ImmutableSchema.HF_TERM_PAIRS_FIELD) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName()) - .withIntTermField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName()) - .withPretokenizedNoPositionField(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName()) - .withIndexedNotTokenizedField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName()) - .withIntTermField(NORMALIZED_FAVORITE_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName()) - .withIntTermField(NORMALIZED_REPLY_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName()) - .withIntTermField(NORMALIZED_RETWEET_COUNT_GREATER_THAN_OR_EQUAL_TO_FIELD.getFieldName()) - - .withIntTermField(EarlybirdFieldConstant.COMPOSER_SOURCE.getFieldName()) - - .withLongTermField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName()) - 
.withLongTermField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName()) - .withLongTermField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName()) - - // Named entity fields - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD.getFieldName(), true) - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD.getFieldName(), true) - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), true) - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), true) - - // camelCase-tokenized user handles and tokenized user names, not searchable by default - .withPretokenizedTextField( - EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName(), false) - .withPretokenizedTextField( - EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), false) - - .withIndexedNotTokenizedField( - EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName()) - .withTextField(EarlybirdFieldConstant.SPACE_ADMIN_FIELD.getFieldName(), false) - .withPretokenizedTextField(EarlybirdFieldConstant.SPACE_TITLE_FIELD.getFieldName(), false) - .withTextField(EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), true) - .withPretokenizedTextField( - EarlybirdFieldConstant.CAMELCASE_TOKENIZED_SPACE_ADMIN_FIELD.getFieldName(), false) - .withPretokenizedTextField( - EarlybirdFieldConstant.TOKENIZED_SPACE_ADMIN_DISPLAY_NAME_FIELD.getFieldName(), false) - .withPretokenizedTextField( - EarlybirdFieldConstant.URL_DESCRIPTION_FIELD.getFieldName(), false) - .withPretokenizedTextField( - EarlybirdFieldConstant.URL_TITLE_FIELD.getFieldName(), false); - - builder - .withPhotoUrlFacetField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName()) - .withOutOfOrderEnabledForField( - EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName()) - .withOutOfOrderEnabledForField( - 
EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName()) - .withOutOfOrderEnabledForField( - EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName()); - - // ColumnStrideFields. - boolean loadCSFIntoRAMDefault = cluster != EarlybirdCluster.FULL_ARCHIVE; - - builder - .withColumnStrideField(EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME, - ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_FEATURES, - true, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true) - .withColumnStrideField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(), - ThriftCSFType.BYTE, 1, false, loadCSFIntoRAMDefault) - // CSF Used by archive mappers - .withColumnStrideField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(), - ThriftCSFType.INT, 1, false, /* the full archive loads this field into RAM */ true) - .withColumnStrideField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(), - ThriftCSFType.LONG, 1, false, /* the full archive loads this field into RAM */ true) - .withColumnStrideField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.CONVERSATION_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(), - ThriftCSFType.INT, 1, false, loadCSFIntoRAMDefault) - 
.withColumnStrideField(EarlybirdFieldConstant.CARD_URI_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - .withColumnStrideField( - EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF.getFieldName(), - ThriftCSFType.LONG, 1, false, loadCSFIntoRAMDefault) - - /* Semicolon on separate line to preserve git blame. */; - - builder.withColumnStrideField( - EarlybirdFieldConstants.EXTENDED_ENCODED_TWEET_FEATURES_FIELD_NAME, - ThriftCSFType.INT, NUMBER_OF_INTEGERS_FOR_EXTENDED_FEATURES, - true, loadCSFIntoRAMDefault); - - for (Map.Entry entry : FEATURE_CONFIGURATION_MAP.entrySet()) { - String fullName = entry.getKey(); - String baseName = getBaseFieldName(fullName); - EarlybirdFieldConstant fieldConstant = EarlybirdFieldConstants.getFieldConstant(fullName); - if (fieldConstant.isValidFieldInCluster(cluster)) { - builder.withFeatureConfiguration(baseName, fullName, entry.getValue()); - } - } - // Add facet settings for facet fields - // boolean args are respectively whether to use skiplist, whether offensive, whether to use CSF - builder - .withFacetConfigs(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(), - EarlybirdFieldConstant.MENTIONS_FACET, true, false, false) - .withFacetConfigs(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(), - EarlybirdFieldConstant.HASHTAGS_FACET, true, false, false) - .withFacetConfigs(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(), - EarlybirdFieldConstant.STOCKS_FACET, true, false, false) - .withFacetConfigs(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(), - EarlybirdFieldConstant.IMAGES_FACET, true, true, false) - .withFacetConfigs(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(), - 
EarlybirdFieldConstant.VIDEOS_FACET, true, true, false) - .withFacetConfigs(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(), - EarlybirdFieldConstant.NEWS_FACET, true, false, false) - .withFacetConfigs(EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), - EarlybirdFieldConstant.LANGUAGES_FACET, false, false, false) - .withFacetConfigs(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(), - EarlybirdFieldConstant.SOURCES_FACET, false, false, false) - .withFacetConfigs(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(), - EarlybirdFieldConstant.TWIMG_FACET, true, true, false) - .withFacetConfigs(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), - EarlybirdFieldConstant.FROM_USER_ID_FACET, false, false, true /* facet on CSF */) - .withFacetConfigs(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), - EarlybirdFieldConstant.RETWEETS_FACET, false, false, true /* facet on CSF */) - .withFacetConfigs(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(), - EarlybirdFieldConstant.LINKS_FACET, true, false, false) - .withFacetConfigs( - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), - true, false, false) - .withFacetConfigs( - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), - true, false, false) - .withFacetConfigs( - EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), - true, false, false) - .withFacetConfigs(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(), - EarlybirdFieldConstant.SPACES_FACET, true, false, false); - return builder.build(); - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.docx new file mode 100644 index 000000000..c0a5bae7e Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.docx differ diff --git 
a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.java deleted file mode 100644 index 06666adc0..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentBuilder.java +++ /dev/null @@ -1,897 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import java.io.IOException; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Sets; - -import org.apache.commons.lang.StringUtils; -import org.apache.lucene.analysis.TokenStream; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntity; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntityContext; -import com.twitter.cuad.ner.plain.thriftjava.NamedEntityInputSourceType; -import com.twitter.cuad.ner.thriftjava.WholeEntityType; -import com.twitter.search.common.constants.SearchCardType; -import com.twitter.search.common.indexing.thriftjava.ThriftExpandedUrl; -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.indexing.thriftjava.TwitterPhotoUrl; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.ThriftDocumentBuilder; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import 
com.twitter.search.common.util.analysis.CharTermAttributeSerializer; -import com.twitter.search.common.util.analysis.IntTermAttributeSerializer; -import com.twitter.search.common.util.analysis.TermPayloadAttributeSerializer; -import com.twitter.search.common.util.analysis.TwitterPhotoTokenStream; -import com.twitter.search.common.util.spatial.GeoUtil; -import com.twitter.search.common.util.text.TokenizerHelper; -import com.twitter.search.common.util.text.TweetTokenStreamSerializer; -import com.twitter.search.common.util.text.regex.Regex; -import com.twitter.search.common.util.url.LinkVisibilityUtils; -import com.twitter.search.common.util.url.URLUtils; - -import geo.google.datamodel.GeoAddressAccuracy; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; - -/** - * Builder class for building a {@link ThriftDocument}. - */ -public final class EarlybirdThriftDocumentBuilder extends ThriftDocumentBuilder { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdThriftDocumentBuilder.class); - - private static final SearchCounter SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT = - SearchCounter.export("tokenstream_serialization_failure_non_penguin_dependent"); - - private static final String HASHTAG_SYMBOL = "#"; - private static final String CASHTAG_SYMBOL = "$"; - private static final String MENTION_SYMBOL = "@"; - - private static final SearchCounter BCP47_LANGUAGE_TAG_COUNTER = - SearchCounter.export("bcp47_language_tag"); - - /** - * Used to check if a card is video card. - * - * @see #withSearchCard - */ - private static final String AMPLIFY_CARD_NAME = "amplify"; - private static final String PLAYER_CARD_NAME = "player"; - - // Extra term indexed for native retweets, to ensure that the "-rt" query excludes them. 
- public static final String RETWEET_TERM = "rt"; - public static final String QUESTION_MARK = "?"; - - private static final Set NAMED_ENTITY_URL_SOURCE_TYPES = - ImmutableSet.of( - NamedEntityInputSourceType.URL_TITLE, NamedEntityInputSourceType.URL_DESCRIPTION); - - private final TokenStreamSerializer intTermAttributeSerializer = - new TokenStreamSerializer(ImmutableList.of( - new IntTermAttributeSerializer())); - private final TokenStreamSerializer photoUrlSerializer = - new TokenStreamSerializer(ImmutableList - .of( - new CharTermAttributeSerializer(), new TermPayloadAttributeSerializer())); - private final Schema schema; - - private boolean isSetLatLonCSF = false; - private boolean addLatLonCSF = true; - private boolean addEncodedTweetFeatures = true; - - @Nonnull - private final EarlybirdEncodedFeatures encodedTweetFeatures; - @Nullable - private final EarlybirdEncodedFeatures extendedEncodedTweetFeatures; - - /** - * Default constructor - */ - public EarlybirdThriftDocumentBuilder( - @Nonnull EarlybirdEncodedFeatures encodedTweetFeatures, - @Nullable EarlybirdEncodedFeatures extendedEncodedTweetFeatures, - FieldNameToIdMapping idMapping, - Schema schema) { - super(idMapping); - this.schema = schema; - this.encodedTweetFeatures = Preconditions.checkNotNull(encodedTweetFeatures); - - this.extendedEncodedTweetFeatures = extendedEncodedTweetFeatures; - } - - /** - * Get internal {@link EarlybirdEncodedFeatures} - */ - public EarlybirdEncodedFeatures getEncodedTweetFeatures() { - return encodedTweetFeatures; - } - - /** - * Add skip list entry for the given field. - * This adds a term __has_fieldName in the INTERNAL field. - */ - public EarlybirdThriftDocumentBuilder addFacetSkipList(String fieldName) { - withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.getFacetSkipFieldName(fieldName)); - return this; - } - - /** - * Add a filter term in the INTERNAL field. 
- */ - public EarlybirdThriftDocumentBuilder addFilterInternalFieldTerm(String filterName) { - withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdThriftDocumentUtil.formatFilter(filterName)); - return this; - } - - /** - * Add id field and id csf field. - */ - public EarlybirdThriftDocumentBuilder withID(long id) { - withLongField(EarlybirdFieldConstant.ID_FIELD.getFieldName(), id); - withLongField(EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName(), id); - return this; - } - - /** - * Add created at field and created at csf field. - */ - public EarlybirdThriftDocumentBuilder withCreatedAt(int createdAt) { - withIntField(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), createdAt); - withIntField(EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName(), createdAt); - return this; - } - - /** - * Add tweet text field. - */ - public EarlybirdThriftDocumentBuilder withTweetText( - String text, byte[] textTokenStream) throws IOException { - withTokenStreamField(EarlybirdFieldConstants.EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - text, textTokenStream); - return this; - } - - public EarlybirdThriftDocumentBuilder withTweetText(String text) throws IOException { - withTweetText(text, null); - return this; - } - - /** - * Add a list of cashTags. Like $TWTR. - */ - public EarlybirdThriftDocumentBuilder withStocksFields(List cashTags) { - if (isNotEmpty(cashTags)) { - addFacetSkipList(EarlybirdFieldConstant.STOCKS_FIELD.getFieldName()); - for (String cashTag : cashTags) { - withStringField( - EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(), CASHTAG_SYMBOL + cashTag); - } - } - return this; - } - - /** - * Add a list of hashtags. 
- */ - public EarlybirdThriftDocumentBuilder withHashtagsField(List hashtags) { - if (isNotEmpty(hashtags)) { - int numHashtags = Math.min( - hashtags.size(), - schema.getFeatureConfigurationById( - EarlybirdFieldConstant.NUM_HASHTAGS.getFieldId()).getMaxValue()); - encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.NUM_HASHTAGS, numHashtags); - addFacetSkipList(EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName()); - for (String hashtag : hashtags) { - withStringField( - EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(), HASHTAG_SYMBOL + hashtag); - } - } - return this; - } - - /** - * Added a list of mentions. - */ - public EarlybirdThriftDocumentBuilder withMentionsField(List mentions) { - if (isNotEmpty(mentions)) { - int numMentions = Math.min( - mentions.size(), - schema.getFeatureConfigurationById( - EarlybirdFieldConstant.NUM_HASHTAGS.getFieldId()).getMaxValue()); - encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.NUM_MENTIONS, numMentions); - addFacetSkipList(EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName()); - for (String mention : mentions) { - withStringField( - EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(), MENTION_SYMBOL + mention); - } - } - return this; - } - - /** - * Add a list of Twitter Photo URLs (twimg URLs). These are different from regular URLs, because - * we use the TwitterPhotoTokenStream to index them, and we also include the status ID as payload. 
- */ - public EarlybirdThriftDocumentBuilder withTwimgURLs( - List urls) throws IOException { - if (isNotEmpty(urls)) { - for (TwitterPhotoUrl photoUrl : urls) { - TokenStream ts = new TwitterPhotoTokenStream(photoUrl.getPhotoStatusId(), - photoUrl.getMediaUrl()); - byte[] serializedTs = photoUrlSerializer.serialize(ts); - withTokenStreamField(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName(), - Long.toString(photoUrl.getPhotoStatusId()), serializedTs); - addFacetSkipList(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName()); - } - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG); - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NATIVE_IMAGE_FLAG); - } - return this; - } - - /** - * Add a list of URLs. This also add facet skip list terms for news / images / videos if needed. - */ - public EarlybirdThriftDocumentBuilder withURLs(List urls) { - if (isNotEmpty(urls)) { - Set dedupedLinks = Sets.newHashSet(); - - for (ThriftExpandedUrl expandedUrl : urls) { - if (expandedUrl.isSetOriginalUrl()) { - String normalizedOriginalUrl = URLUtils.normalizePath(expandedUrl.getOriginalUrl()); - dedupedLinks.add(normalizedOriginalUrl); - } - if (expandedUrl.isSetExpandedUrl()) { - dedupedLinks.add(URLUtils.normalizePath(expandedUrl.getExpandedUrl())); - } - - if (expandedUrl.isSetCanonicalLastHopUrl()) { - String url = URLUtils.normalizePath(expandedUrl.getCanonicalLastHopUrl()); - dedupedLinks.add(url); - - String facetUrl = URLUtils.normalizeFacetURL(url); - - if (expandedUrl.isSetMediaType()) { - switch (expandedUrl.getMediaType()) { - case NEWS: - withStringField(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName(), url); - addFacetSkipList(EarlybirdFieldConstant.NEWS_LINKS_FIELD.getFieldName()); - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG); - break; - case VIDEO: - withStringField(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName(), facetUrl); - 
addFacetSkipList(EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName()); - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG); - break; - case IMAGE: - withStringField(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName(), facetUrl); - addFacetSkipList(EarlybirdFieldConstant.IMAGE_LINKS_FIELD.getFieldName()); - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG); - break; - case NATIVE_IMAGE: - // Nothing done here. Native images are handled separately. - // They are in PhotoUrls instead of expandedUrls. - break; - case UNKNOWN: - break; - default: - throw new RuntimeException("Unknown Media Type: " + expandedUrl.getMediaType()); - } - } - - if (expandedUrl.isSetLinkCategory()) { - withIntField(EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName(), - expandedUrl.getLinkCategory().getValue()); - } - } - } - - if (!dedupedLinks.isEmpty()) { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_LINK_FLAG); - - addFacetSkipList(EarlybirdFieldConstant.LINKS_FIELD.getFieldName()); - - for (String linkUrl : dedupedLinks) { - withStringField(EarlybirdFieldConstant.LINKS_FIELD.getFieldName(), linkUrl); - } - } - - encodedTweetFeatures.setFlagValue( - EarlybirdFieldConstant.HAS_VISIBLE_LINK_FLAG, - LinkVisibilityUtils.hasVisibleLink(urls)); - } - - return this; - } - - /** - * Add a list of places. The place are U64 encoded place IDs. - */ - public EarlybirdThriftDocumentBuilder withPlacesField(List places) { - if (isNotEmpty(places)) { - for (String place : places) { - withStringField(EarlybirdFieldConstant.PLACE_FIELD.getFieldName(), place); - } - } - return this; - } - - /** - * Add tweet text signature field. - */ - public EarlybirdThriftDocumentBuilder withTweetSignature(int signature) { - encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE, signature); - return this; - } - - /** - * Add geo hash field and internal filter field. 
- */ - public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon, int accuracy) { - if (GeoUtil.validateGeoCoordinates(lat, lon)) { - withGeoField( - EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), - lat, lon, accuracy); - withLatLonCSF(lat, lon); - } - return this; - } - - public EarlybirdThriftDocumentBuilder withGeoHash(double lat, double lon) { - withGeoHash(lat, lon, GeoAddressAccuracy.UNKNOWN_LOCATION.getCode()); - return this; - } - - /** - * Add geo location source to the internal field with ThriftGeoLocationSource object. - */ - public EarlybirdThriftDocumentBuilder withGeoLocationSource( - ThriftGeoLocationSource geoLocationSource) { - if (geoLocationSource != null) { - withGeoLocationSource(EarlybirdFieldConstants.formatGeoType(geoLocationSource)); - } - return this; - } - - /** - * Add geo location source to the internal field. - */ - public EarlybirdThriftDocumentBuilder withGeoLocationSource(String geoLocationSource) { - withStringField(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), geoLocationSource); - return this; - } - - /** - * Add encoded lat and lon to LatLonCSF field. - */ - public EarlybirdThriftDocumentBuilder withLatLonCSF(double lat, double lon) { - isSetLatLonCSF = true; - long encodedLatLon = GeoUtil.encodeLatLonIntoInt64((float) lat, (float) lon); - withLongField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName(), encodedLatLon); - return this; - } - - /** - * Add from verified account flag to internal field. - */ - public EarlybirdThriftDocumentBuilder withFromVerifiedAccountFlag() { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG); - addFilterInternalFieldTerm(EarlybirdFieldConstant.VERIFIED_FILTER_TERM); - return this; - } - - /** - * Add from blue-verified account flag to internal field. 
- */ - public EarlybirdThriftDocumentBuilder withFromBlueVerifiedAccountFlag() { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.FROM_BLUE_VERIFIED_ACCOUNT_FLAG); - addFilterInternalFieldTerm(EarlybirdFieldConstant.BLUE_VERIFIED_FILTER_TERM); - return this; - } - - /** - * Add offensive flag to internal field. - */ - public EarlybirdThriftDocumentBuilder withOffensiveFlag() { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG); - withStringField( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.IS_OFFENSIVE); - return this; - } - - /** - * Add user reputation value to encoded feature. - */ - public EarlybirdThriftDocumentBuilder withUserReputation(byte score) { - encodedTweetFeatures.setFeatureValue(EarlybirdFieldConstant.USER_REPUTATION, score); - return this; - } - - /** - * This method creates the fields related to document language. - * For most languages, their isoLanguageCode and bcp47LanguageTag are the same. - * For some languages with variants, these two fields are different. - * E.g. for simplified Chinese, their isoLanguageCode is zh, but their bcp47LanguageTag is zh-cn. - *

- * This method adds fields for both the isoLanguageCode and bcp47LanguageTag. - */ - public EarlybirdThriftDocumentBuilder withLanguageCodes( - String isoLanguageCode, String bcp47LanguageTag) { - if (isoLanguageCode != null) { - withISOLanguage(isoLanguageCode); - } - if (bcp47LanguageTag != null && !bcp47LanguageTag.equals(isoLanguageCode)) { - BCP47_LANGUAGE_TAG_COUNTER.increment(); - withISOLanguage(bcp47LanguageTag); - } - return this; - } - - /** - * Adds a String field into the ISO_LANGUAGE_FIELD. - */ - public EarlybirdThriftDocumentBuilder withISOLanguage(String languageString) { - withStringField( - EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), languageString.toLowerCase()); - return this; - } - - /** - * Add from user ID fields. - */ - public EarlybirdThriftDocumentBuilder withFromUserID(long fromUserId) { - withLongField(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), fromUserId); - withLongField(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName(), fromUserId); - return this; - } - - /** - * Add from user information fields. - */ - public EarlybirdThriftDocumentBuilder withFromUser( - long fromUserId, String fromUser) { - withFromUser(fromUserId, fromUser, null); - return this; - } - - /** - * Add from user information fields. - */ - public EarlybirdThriftDocumentBuilder withFromUser(String fromUser) { - withFromUser(fromUser, null); - return this; - } - - /** - * Add from user information fields. - */ - public EarlybirdThriftDocumentBuilder withFromUser( - String fromUser, String tokenizedFromUser) { - withStringField(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), fromUser); - withStringField(EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), - isNotBlank(tokenizedFromUser) ? tokenizedFromUser : fromUser); - return this; - } - - /** - * Add from user information fields. 
- */ - public EarlybirdThriftDocumentBuilder withFromUser( - long fromUserId, String fromUser, String tokenizedFromUser) { - withFromUserID(fromUserId); - withFromUser(fromUser, tokenizedFromUser); - return this; - } - - /** - * Add to user field. - */ - public EarlybirdThriftDocumentBuilder withToUser( - String toUser) { - withStringField(EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), toUser); - return this; - } - - /** - * Add escherbird annotation fields. - */ - public EarlybirdThriftDocumentBuilder withAnnotationEntities(List entities) { - if (isNotEmpty(entities)) { - for (String entity : entities) { - withStringField(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), entity); - } - } - return this; - } - - /** - * Add replies to internal field and set is reply flag. - */ - public EarlybirdThriftDocumentBuilder withReplyFlag() { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_REPLY_FLAG); - addFilterInternalFieldTerm(EarlybirdFieldConstant.REPLIES_FILTER_TERM); - return this; - } - - public EarlybirdThriftDocumentBuilder withCameraComposerSourceFlag() { - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.COMPOSER_SOURCE_IS_CAMERA_FLAG); - return this; - } - - /** - * Add in reply to user id. - *

- * Notice {@link #withReplyFlag} is not automatically called since retweet a tweet that is - * a reply to some other tweet is not considered a reply. - * The caller should call {@link #withReplyFlag} separately if this tweet is really a reply tweet. - */ - public EarlybirdThriftDocumentBuilder withInReplyToUserID(long inReplyToUserID) { - withLongField(EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName(), inReplyToUserID); - return this; - } - - /** - * Add reference tweet author id. - */ - public EarlybirdThriftDocumentBuilder withReferenceAuthorID(long referenceAuthorID) { - withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), referenceAuthorID); - return this; - } - - /** - * Add all native retweet related fields/label - */ - @VisibleForTesting - public EarlybirdThriftDocumentBuilder withNativeRetweet(final long retweetUserID, - final long sharedStatusID) { - withLongField(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF.getFieldName(), sharedStatusID); - - withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName(), - sharedStatusID); - withLongField(EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName(), - retweetUserID); - withLongField(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), retweetUserID); - - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.IS_RETWEET_FLAG); - - // Add native retweet label to the internal field. - addFilterInternalFieldTerm(EarlybirdFieldConstant.NATIVE_RETWEETS_FILTER_TERM); - withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), RETWEET_TERM); - return this; - } - - /** - * Add quoted tweet id and user id. 
- */ - @VisibleForTesting - public EarlybirdThriftDocumentBuilder withQuote( - final long quotedStatusId, final long quotedUserId) { - withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), quotedStatusId); - withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), quotedUserId); - - withLongField(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF.getFieldName(), quotedStatusId); - withLongField(EarlybirdFieldConstant.QUOTED_USER_ID_CSF.getFieldName(), quotedUserId); - - encodedTweetFeatures.setFlag(EarlybirdFieldConstant.HAS_QUOTE_FLAG); - - // Add quote label to the internal field. - addFilterInternalFieldTerm(EarlybirdFieldConstant.QUOTE_FILTER_TERM); - return this; - } - - /** - * Add resolved links text field. - */ - public EarlybirdThriftDocumentBuilder withResolvedLinksText(String linksText) { - withStringField(EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), linksText); - return this; - } - - /** - * Add source field. - */ - public EarlybirdThriftDocumentBuilder withSource(String source) { - withStringField(EarlybirdFieldConstant.SOURCE_FIELD.getFieldName(), source); - return this; - } - - /** - * Add normalized source field. - */ - public EarlybirdThriftDocumentBuilder withNormalizedSource(String normalizedSource) { - withStringField( - EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(), normalizedSource); - return this; - } - - /** - * Add positive smiley to internal field. - */ - public EarlybirdThriftDocumentBuilder withPositiveSmiley() { - withStringField( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.HAS_POSITIVE_SMILEY); - return this; - } - - /** - * Add negative smiley to internal field. - */ - public EarlybirdThriftDocumentBuilder withNegativeSmiley() { - withStringField( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.HAS_NEGATIVE_SMILEY); - return this; - } - - /** - * Add question mark label to a text field. 
- */ - public EarlybirdThriftDocumentBuilder withQuestionMark() { - withStringField(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), QUESTION_MARK); - return this; - } - - /** - * Add card related fields. - */ - public EarlybirdThriftDocumentBuilder withSearchCard( - String name, - String domain, - String title, byte[] serializedTitleStream, - String description, byte[] serializedDescriptionStream, - String lang) { - if (isNotBlank(title)) { - withTokenStreamField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), - title, serializedTitleStream); - } - - if (isNotBlank(description)) { - withTokenStreamField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName(), - description, serializedDescriptionStream); - } - - if (isNotBlank(lang)) { - withStringField(EarlybirdFieldConstant.CARD_LANG.getFieldName(), lang); - } - - if (isNotBlank(domain)) { - withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(), domain); - } - - if (isNotBlank(name)) { - withStringField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), name); - withIntField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD.getFieldName(), - SearchCardType.cardTypeFromStringName(name).getByteValue()); - } - - if (AMPLIFY_CARD_NAME.equalsIgnoreCase(name) - || PLAYER_CARD_NAME.equalsIgnoreCase(name)) { - // Add into "internal" field so that this tweet is returned by filter:videos. - addFacetSkipList( - EarlybirdFieldConstants.EarlybirdFieldConstant.VIDEO_LINKS_FIELD.getFieldName()); - } - - return this; - } - - public EarlybirdThriftDocumentBuilder withNormalizedMinEngagementField( - String fieldName, int normalizedNumEngagements) throws IOException { - EarlybirdThriftDocumentUtil.addNormalizedMinEngagementField(doc, fieldName, - normalizedNumEngagements); - return this; - } - - /** - * Add named entity with given canonical name and type to document. 
- */ - public EarlybirdThriftDocumentBuilder withNamedEntity(NamedEntity namedEntity) { - if (namedEntity.getContexts() == null) { - // In this unlikely case, we don't have any context for named entity type or source, - // so we can't properly index it in any of our fields. We'll just skip it in this case. - return this; - } - - // Keep track of the fields we've applied in the builder already, to ensure we only index - // each term (field/value pair) once - Set> fieldsApplied = new HashSet<>(); - for (NamedEntityContext context : namedEntity.getContexts()) { - if (context.isSetInput_source() - && NAMED_ENTITY_URL_SOURCE_TYPES.contains(context.getInput_source().getSource_type())) { - // If the source is one of the URL* types, add the named entity to the "from_url" fields, - // ensuring we add it only once - addNamedEntityFields( - fieldsApplied, - EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD, - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD, - namedEntity.getCanonical_name(), - context); - } else { - addNamedEntityFields( - fieldsApplied, - EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD, - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD, - namedEntity.getCanonical_name(), - context); - } - } - - return this; - } - - /** - * Add space id fields. - */ - public EarlybirdThriftDocumentBuilder withSpaceIdFields(Set spaceIds) { - if (!spaceIds.isEmpty()) { - addFacetSkipList(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName()); - for (String spaceId : spaceIds) { - withStringField(EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(), spaceId); - } - } - return this; - } - - /** - * Add directed at user. 
- */ - @VisibleForTesting - public EarlybirdThriftDocumentBuilder withDirectedAtUser(final long directedAtUserId) { - withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(), - directedAtUserId); - - withLongField(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), directedAtUserId); - - return this; - } - - /** - * Add a white space tokenized screen name field. - * - * Example: - * screenName - "super_hero" - * tokenized version - "super hero" - */ - public EarlybirdThriftDocumentBuilder withWhiteSpaceTokenizedScreenNameField( - String fieldName, - String normalizedScreenName) { - String whiteSpaceTokenizableScreenName = StringUtils.join( - normalizedScreenName.split(Regex.HASHTAG_USERNAME_PUNCTUATION_REGEX), " "); - withStringField(fieldName, whiteSpaceTokenizableScreenName); - return this; - } - - /** - * Add a camel case tokenized screen name field. - */ - public EarlybirdThriftDocumentBuilder withCamelCaseTokenizedScreenNameField( - String fieldName, - String screenName, - String normalizedScreenName, - TokenStream screenNameTokenStream) { - - // this normalized text is consistent to how the tokenized stream is created from - // TokenizerHelper.getNormalizedCamelcaseTokenStream - ie. just lowercasing. - String camelCaseTokenizedScreenNameText = - TokenizerHelper.getNormalizedCamelcaseTokenStreamText(screenName); - try { - // Reset the token stream in case it has been read before. - screenNameTokenStream.reset(); - byte[] camelCaseTokenizedScreenName = - TweetTokenStreamSerializer.getTweetTokenStreamSerializer() - .serialize(screenNameTokenStream); - - withTokenStreamField( - fieldName, - camelCaseTokenizedScreenNameText.isEmpty() - ? normalizedScreenName : camelCaseTokenizedScreenNameText, - camelCaseTokenizedScreenName); - } catch (IOException e) { - LOG.error("TwitterTokenStream serialization error! 
Could not serialize: " + screenName); - SERIALIZE_FAILURE_COUNT_NONPENGUIN_DEPENDENT.increment(); - } - return this; - } - - private void addNamedEntityFields( - Set> fieldsApplied, - EarlybirdFieldConstant nameOnlyField, - EarlybirdFieldConstant nameWithTypeField, - String name, - NamedEntityContext context) { - withOneTimeStringField(fieldsApplied, nameOnlyField, name, false); - if (context.isSetEntity_type()) { - withOneTimeStringField(fieldsApplied, nameWithTypeField, - formatNamedEntityString(name, context.getEntity_type()), true); - } - } - - private void withOneTimeStringField( - Set> fieldsApplied, EarlybirdFieldConstant field, - String value, boolean addToFacets) { - Pair fieldValuePair = new Pair<>(field, value); - if (!fieldsApplied.contains(fieldValuePair)) { - if (addToFacets) { - addFacetSkipList(field.getFieldName()); - } - withStringField(field.getFieldName(), value); - fieldsApplied.add(fieldValuePair); - } - } - - private String formatNamedEntityString(String name, WholeEntityType type) { - return String.format("%s:%s", name, type).toLowerCase(); - } - - /** - * Set whether set LAT_LON_CSF_FIELD or not before build - * if LAT_LON_CSF_FIELD is not set deliberately. - * - * @see #prepareToBuild() - */ - public EarlybirdThriftDocumentBuilder setAddLatLonCSF(boolean isSet) { - addLatLonCSF = isSet; - return this; - } - - /** - * Set if add encoded tweet feature field in the end. - * - * @see #prepareToBuild() - */ - public EarlybirdThriftDocumentBuilder setAddEncodedTweetFeatures(boolean isSet) { - addEncodedTweetFeatures = isSet; - return this; - } - - @Override - protected void prepareToBuild() { - if (!isSetLatLonCSF && addLatLonCSF) { - // In lucene archives, this CSF is needed regardless of whether geoLocation is set. - withLatLonCSF(GeoUtil.ILLEGAL_LATLON, GeoUtil.ILLEGAL_LATLON); - } - - if (addEncodedTweetFeatures) { - // Add encoded_tweet_features before building the document. 
- withBytesField( - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName(), - EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(encodedTweetFeatures)); - } - - if (extendedEncodedTweetFeatures != null) { - // Add extended_encoded_tweet_features before building the document. - withBytesField( - EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName(), - EarlybirdEncodedFeaturesUtil.toBytesForThriftDocument(extendedEncodedTweetFeatures)); - } - } - - private static boolean isNotBlank(String value) { - return value != null && !value.isEmpty(); - } - - private static boolean isNotEmpty(List value) { - return value != null && !value.isEmpty(); - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.docx b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.docx new file mode 100644 index 000000000..95e6c96d4 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.java b/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.java deleted file mode 100644 index b8a13722b..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/EarlybirdThriftDocumentUtil.java +++ /dev/null @@ -1,377 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; - -import com.google.common.collect.ImmutableList; - -import com.twitter.common.text.util.TokenStreamSerializer; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import 
com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftFieldData; -import com.twitter.search.common.util.analysis.IntTermAttributeSerializer; -import com.twitter.search.common.util.analysis.TwitterNormalizedMinEngagementTokenStream; - -/** - * Utility APIs for ThriftDocument used in Earlybird. - */ -public final class EarlybirdThriftDocumentUtil { - private static final EarlybirdFieldConstants ID_MAPPING = new EarlybirdFieldConstants(); - - private static final String FILTER_FORMAT_STRING = "__filter_%s"; - - /** - * Used to check whether a thrift document has filter nullcast internal field set. - * @see #isNullcastFilterSet(ThriftDocument) - */ - private static final String NULLCAST_FILTER_TERM = - formatFilter(EarlybirdFieldConstant.NULLCAST_FILTER_TERM); - - private static final String SELF_THREAD_FILTER_TERM = - formatFilter(EarlybirdFieldConstant.SELF_THREAD_FILTER_TERM); - - private static final String DIRECTED_AT_FILTER_TERM = - formatFilter(EarlybirdFieldConstant.DIRECTED_AT_FILTER_TERM); - - private EarlybirdThriftDocumentUtil() { - // Cannot instantiate. - } - - /** - * Formats a regular, simple filter term. The 'filter' argument should correspond to a constant - * from the Operator class, matching the operand (filter:links -> "links"). - */ - public static final String formatFilter(String filter) { - return String.format(FILTER_FORMAT_STRING, filter); - } - - /** - * Get status id. - */ - public static long getID(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.ID_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get Card name. - */ - public static String getCardName(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get Card language. 
- */ - public static String getCardLang(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.CARD_LANG.getFieldName(), ID_MAPPING); - } - - /** - * Get Card language CSF. - * - * card language CSF is represented internally as an integer ID for a ThriftLanguage. - */ - public static int getCardLangCSF(ThriftDocument document) { - return ThriftDocumentUtil.getIntValue( - document, EarlybirdFieldConstant.CARD_LANG_CSF.getFieldName(), ID_MAPPING); - } - - /** - * Get quoted tweet id. - */ - public static long getQuotedTweetID(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get quoted tweet user id. - */ - public static long getQuotedUserID(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get directed at user id. - */ - public static long getDirectedAtUserId(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get directed at user id CSF. - */ - public static long getDirectedAtUserIdCSF(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName(), ID_MAPPING); - } - - /** - * Get reference author id CSF. - */ - public static long getReferenceAuthorIdCSF(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF.getFieldName(), ID_MAPPING); - } - - /** - * Get links. - */ - public static List getLinks(ThriftDocument document) { - return getStringValues(document, EarlybirdFieldConstant.LINKS_FIELD); - } - - /** - * Get created at timestamp in sec. 
- */ - public static int getCreatedAtSec(ThriftDocument document) { - return ThriftDocumentUtil.getIntValue( - document, EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get created at timestamp in ms. - */ - public static long getCreatedAtMs(ThriftDocument document) { - long createdAtSec = (long) getCreatedAtSec(document); - return createdAtSec * 1000L; - } - - /** - * Get from user id. - */ - public static long getFromUserID(ThriftDocument document) { - return ThriftDocumentUtil.getLongValue( - document, EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get from user. - */ - public static String getFromUser(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get tokenized from user display name. - */ - public static String getFromUserDisplayName(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get tokenized from user. - */ - public static String getTokenizedFromUser(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.TOKENIZED_FROM_USER_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get resolved links text. - */ - public static String getResolvedLinksText(ThriftDocument document) { - return ThriftDocumentUtil.getStringValue( - document, EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * Get iso language code. - */ - public static List getISOLanguage(ThriftDocument document) { - return ThriftDocumentUtil.getStringValues( - document, EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), ID_MAPPING); - } - - /** - * First remove the old timestamp if they exist. - * Then add the created at and created at csf fields to the given thrift document. 
- */ - public static void replaceCreatedAtAndCreatedAtCSF(ThriftDocument document, int value) { - removeField(document, EarlybirdFieldConstant.CREATED_AT_FIELD); - removeField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD); - - addIntField(document, EarlybirdFieldConstant.CREATED_AT_FIELD, value); - addIntField(document, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD, value); - } - - /** - * Add the given int value as the given field into the given document. - */ - public static ThriftDocument addIntField( - ThriftDocument document, EarlybirdFieldConstant fieldConstant, int value) { - ThriftFieldData fieldData = new ThriftFieldData().setIntValue(value); - ThriftField field = - new ThriftField().setFieldConfigId(fieldConstant.getFieldId()).setFieldData(fieldData); - document.addToFields(field); - return document; - } - - private static EarlybirdFieldConstant getFeatureField(EarlybirdFieldConstant field) { - if (field.getFieldName().startsWith( - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD.getFieldName())) { - return EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD; - } else if (field.getFieldName().startsWith( - EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD.getFieldName())) { - return EarlybirdFieldConstant.EXTENDED_ENCODED_TWEET_FEATURES_FIELD; - } else { - throw new IllegalArgumentException("Not a feature field: " + field); - } - } - - /** - * Get the feature value of a field. - */ - public static int getFeatureValue( - ImmutableSchemaInterface schema, - ThriftDocument document, - EarlybirdFieldConstant field) { - - EarlybirdFieldConstant featureField = getFeatureField(field); - - byte[] encodedFeaturesBytes = - ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING); - - if (encodedFeaturesBytes == null) { - // Treat the feature value as 0 if there is no encoded feature field. 
- return 0; - } else { - EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes( - schema, featureField, encodedFeaturesBytes, 0); - return encodedFeatures.getFeatureValue(field); - } - } - - /** - * Check whether the feature flag is set. - */ - public static boolean isFeatureBitSet( - ImmutableSchemaInterface schema, - ThriftDocument document, - EarlybirdFieldConstant field) { - - EarlybirdFieldConstant featureField = getFeatureField(field); - - byte[] encodedFeaturesBytes = - ThriftDocumentUtil.getBytesValue(document, featureField.getFieldName(), ID_MAPPING); - - if (encodedFeaturesBytes == null) { - // Treat the bit as not set if there is no encoded feature field. - return false; - } else { - EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes( - schema, featureField, encodedFeaturesBytes, 0); - return encodedFeatures.isFlagSet(field); - } - } - - /** - * Check whether nullcast flag is set in the encoded features field. - */ - public static boolean isNullcastBitSet(ImmutableSchemaInterface schema, ThriftDocument document) { - return isFeatureBitSet(schema, document, EarlybirdFieldConstant.IS_NULLCAST_FLAG); - } - - /** - * Remove all fields with the given field constant in a document. - */ - public static void removeField(ThriftDocument document, EarlybirdFieldConstant fieldConstant) { - List fields = document.getFields(); - if (fields != null) { - Iterator fieldsIterator = fields.iterator(); - while (fieldsIterator.hasNext()) { - if (fieldsIterator.next().getFieldConfigId() == fieldConstant.getFieldId()) { - fieldsIterator.remove(); - } - } - } - } - - /** - * Remove a string field with given fieldConstant and value. 
- */ - public static void removeStringField( - ThriftDocument document, EarlybirdFieldConstant fieldConstant, String value) { - List fields = document.getFields(); - if (fields != null) { - for (ThriftField field : fields) { - if (field.getFieldConfigId() == fieldConstant.getFieldId() - && field.getFieldData().getStringValue().equals(value)) { - fields.remove(field); - return; - } - } - } - } - - /** - * Adds a new TokenStream field for each engagement counter if normalizedNumEngagements >= 1. - */ - public static void addNormalizedMinEngagementField( - ThriftDocument doc, - String fieldName, - int normalizedNumEngagements) throws IOException { - if (normalizedNumEngagements < 1) { - return; - } - TokenStreamSerializer serializer = - new TokenStreamSerializer(ImmutableList.of(new IntTermAttributeSerializer())); - TwitterNormalizedMinEngagementTokenStream stream = new - TwitterNormalizedMinEngagementTokenStream(normalizedNumEngagements); - byte[] serializedStream = serializer.serialize(stream); - ThriftFieldData fieldData = new ThriftFieldData().setTokenStreamValue(serializedStream); - ThriftField field = new ThriftField().setFieldConfigId(ID_MAPPING.getFieldID(fieldName)) - .setFieldData(fieldData); - doc.addToFields(field); - } - - public static List getStringValues( - ThriftDocument document, EarlybirdFieldConstant field) { - return ThriftDocumentUtil.getStringValues(document, field.getFieldName(), ID_MAPPING); - } - - public static boolean isNullcastFilterSet(ThriftDocument document) { - return isFilterSet(document, NULLCAST_FILTER_TERM); - } - - public static boolean isSelfThreadFilterSet(ThriftDocument document) { - return isFilterSet(document, SELF_THREAD_FILTER_TERM); - } - - public static String getSelfThreadFilterTerm() { - return SELF_THREAD_FILTER_TERM; - } - - public static String getDirectedAtFilterTerm() { - return DIRECTED_AT_FILTER_TERM; - } - - public static boolean isDirectedAtFilterSet(ThriftDocument document) { - return isFilterSet(document, 
DIRECTED_AT_FILTER_TERM); - } - - /** - * Check whether given filter is set in the internal field. - */ - private static boolean isFilterSet(ThriftDocument document, String filter) { - List terms = ThriftDocumentUtil.getStringValues( - document, EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), ID_MAPPING); - for (String term : terms) { - if (filter.equals(term)) { - return true; - } - } - return false; - } -} diff --git a/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.docx b/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.docx new file mode 100644 index 000000000..9c3f78708 Binary files /dev/null and b/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.docx differ diff --git a/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.java b/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.java deleted file mode 100644 index dd935c90e..000000000 --- a/src/java/com/twitter/search/common/schema/earlybird/FlushVersion.java +++ /dev/null @@ -1,336 +0,0 @@ -package com.twitter.search.common.schema.earlybird; - -import javax.annotation.Nullable; - -import com.twitter.search.common.config.Config; - -public enum FlushVersion { - /* ======================================================= - * Versions - * ======================================================= */ - VERSION_0("Initial version of partition flushing."), - VERSION_1("Added timestamps and corresponding mapper to SegmentData."), - VERSION_2("Add column stride fields."), - VERSION_3("Change facet field configuration."), - VERSION_4("Add per term offensive counters to parallel posting arrays."), - VERSION_5("Add native photo facet."), - VERSION_6("Add UserFeature column stride field"), - VERSION_7("Index segment optimizations; new facet data structures."), - VERSION_8("Store statuses in memory in Earlybird."), - VERSION_9("Index from_user_ids into a searchable field."), - VERSION_10("Change from_user_id dictionary from fst to mphf"), - 
VERSION_11("Write image and video facet in separate lucene field."), - VERSION_12("Add retweeted status ID to the sparse CSF."), - VERSION_13("Add isOffensive field for profanity filter."), - VERSION_14("Fix features column stride field corruption."), - VERSION_15("Upgrade Lucene version, which has a different FST serialization format."), - VERSION_16("Remove maxDoc in favor of lastDocID"), - VERSION_17("Added partition and timeslice identifiers to SegmentData."), - VERSION_18("Per-term payloads"), - VERSION_19("Multiple per-doc payload fields"), - VERSION_20("Unify and fix hash codes"), - VERSION_21("Super awesome new flexible realtime posting list format."), - VERSION_22("Added new geo implementation."), - VERSION_23("Upgrade to Lucene 4.0.0 Final"), - VERSION_24("Added tweet topic ids."), - VERSION_25("Turn on skip list for mention facet."), - VERSION_26("Added new EncodedTweetFeaturesColumnStrideField."), - VERSION_27("Topic ids facet field."), - VERSION_28("From-user discover stories skiplist field."), - VERSION_29("Move tokenized screen name to the new username field"), - VERSION_30("Enable HF term pairs index."), - VERSION_31("Remove reverse doc ids."), - VERSION_32("Switch shared status id CSF to non-sparse long CSF index."), - VERSION_33("New skip lists for optimized high df posting lists."), - VERSION_34("Store tweet signature in EarlybirdEncodedFeatures."), - VERSION_35("Don't store shared status id csf in archive indexes."), - VERSION_36("Don't store norms."), - VERSION_37("64 bit user ids."), - VERSION_38("Index links in archive."), - VERSION_39("Fix pic.twitter.com image link handling not setting the internal field correctly."), - VERSION_40("Fix all archive tweets being marked as replies."), - VERSION_41("Avoid flushing event_ids field; event clusters are applied as updates."), - VERSION_42("No position fields refactoring; made a few fields to not use position."), - VERSION_43("Index private geo coordinates"), - VERSION_44("Materialize last doc id in 
HighDFCompressedPostinglists", true), - VERSION_45("Removing from_user_id facets support", true), - VERSION_46("Guard against badly out of order tweets in the search archive.", true), - VERSION_47("Added card title and description fields.", true), - VERSION_48("Added card type CSF.", true), - VERSION_49("Lucene 4.4 upgrade", true), - VERSION_50("Put mem-archive back on non-lucene optimized indexes", true), - VERSION_51("Force index rebuild to fix blank text field. See SEARCH-2505.", true), - VERSION_52("Refactoring of docValues/CSF.", true), - VERSION_53("Remove SegmentData.Configuration", true), - VERSION_54("Fix bad indices caused by SEARCH-2723.", true), - VERSION_55("Fixed non-deterministic facetIds across restarts. SEARCH-2815.", true), - VERSION_56("Flush FacetIDMap.", true), - VERSION_57("Remove LatLonMapper and use standard DocValues instead.", true), - VERSION_58("Longterm Attribute Optimization.", true), - VERSION_59("Renamed archive segment names. Current segment is no longer mutable.", true), - // Flush version 60 and 59 have the same format. - // Flush version is increased to trigger a rebuild, because we noticed incomplete segments. - // More details can be found on SEARCH-3664 - VERSION_60("Flush version change to trigger segment rebuild.", true), - VERSION_61("Adding back from_user_id", true), - VERSION_62("Add retweet facet.", true), - VERSION_63("Switch to new index API in com.twitter.search.core.earlybird.", true), - VERSION_64("Sort merge archive day and part-* data. SEARCH-4692.", true), - VERSION_65("Fix ID_FIELD and CREATED_AT_FIELD sort order. SEARCH-4004 SEARCH-912 ", true), - VERSION_66("Rebuild data for 1/5/2015. Data on HDFS fixed as part of SEARCH-5347.", true), - VERSION_67("Upgrade to Lucene 4.10.3.", true), - VERSION_68("Switching to Penguin v4", true), - VERSION_69("Fix 16% archive segments: SEARCH-6073", true), - VERSION_70("Switching to Penguin v4 for full archive cluster. 
SEARCH-5302", true), - VERSION_71("Switching to Penguin v4 for ssd archive cluster.", true), - VERSION_72("Added Escherbird annotations for full archive.", true), - VERSION_73("Lucene 5.2.1 upgrade.", true, 0), - VERSION_74("Hanndle geo scurbbed data and archive geo index accuracy", true, 0), - VERSION_75("Delete from_user_id_stories from indices", true, 0), - VERSION_76("Allow multiple index extensions.", true, 0), - VERSION_77("Removed EarlybirdCodec", true, 0), - // minor version 2: added embedded tweet features - // minor version 3: change embedded tweet features to INC_ONLY - VERSION_78("Added 80 bytes of extended features", true, 3), - // minor version 1: SEARCH-8564 - Reference Tweet Author ID, using - // EXTENDED_TEST_FEATURE_UNUSED_BITS_2 and EXTENDED_TEST_FEATURE_UNUSED_BITS_3 - VERSION_79("Renamed UNUSED_BIT to HAS_VISIBLE_LINK", true, 1), - // minor version 2: SEARCH-8564 / http://go/rb/770373 - // Made REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT and - // REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT immutable field - VERSION_80("Facet for links: SEARCH-8331", true, 2), - // minor version 1: added video view count - VERSION_81("Adding LowDF posting list with packed ints", true, 1), - VERSION_82("Enabling HighDF posting list with packed ints", true, 0), - // minor version 1: SEARCH-9379 - Added bitset for nullcast tweets - // minor version 2: SEARCH-8765 - Added visible token ratio - VERSION_83("Add bits in encoded features for media type flags. SEARCH-9131", true, 2), - VERSION_84("Enable archive rebuild for __has_links field. SEARCH-9635", true, 0), - // minor version 1: SEARCHQUAL-8130, add engagement v2 - VERSION_85("New archive build gen for missing geo data. SEARCH-9894", true, 1), - VERSION_86("Added new fields to the index", true, 0), - // During this rebuild both the statuses and the engagement counts were regenerated. - // minor version 1: added quote_count - VERSION_87("Periodic archive full rebuild. 
SEARCH-9423", true, 1), - // minor version 1: make new tokenized user name/handle fields textSearchable - // (see go/rb/847134/) - // minor version 2: added has_quote - VERSION_88("Fixing missing day in the full archive index. SEARCH-11233", true, 2), - VERSION_89("Index and store conversation ids.", true, 0), - VERSION_90("Fixing inconsistent days in the full archive index. SEARCH-11744", true, 0), - VERSION_91("Making in_reply_to_user_id field use MPH. SEARCH-10836", true, 0), - VERSION_92("Allow searches by any field. SEARCH-11251", true, 0), - // During this rebuild we regenerated engagement counts and merged the annotations in the - // aggregate job. - VERSION_93("Periodic archive full rebuild. SEARCH-11076", true, 0), - // minor version 1: add ThriftCSFViewSettings.outputCSFType - VERSION_94("Indexing a bunch of geo fields. SEARCH-10283", true, 1), - VERSION_95("Removing topic ID fields. SEARCH-8616", true, 0), - // minor version 1: add ThriftCSFViewSettings.normalizationType - VERSION_96("Enabling conversation ID for all clusters. SEARCH-11989", true, 1), - // minor version 1: set several feature configuration to be correct double type - // minor version 2: set some more feature configuration to be correct double type - // minor version 3: add safety labels SEARCHQUAL-9561 - // minor version 4: add weighted engagement counts SEARCHQUAL-9574 - // minor version 5: add Dopamine non personalized score SEARCHQUAL-9743 - VERSION_97("Changing CSF type to BOOLEAN for some has_* flags.", true, 5), - VERSION_98("Periodic archive full rebuild. PCM-56871.", true, 1), - VERSION_99("Removing named_entities field. SEARCH-13708", true, 0), - // minor version 1: add periscope features (SEARCHQUAL-10008) - // minor version 2: add raw_earlybird_score to TweetExternalFeatures (SEARCHQUAL-10347) - VERSION_100("Upgrade Penguin Version from V4 to V6. 
SEARCH-12991", true, 2), - // minor version 1: adjust for normalizer type for some engagement counters (SEARCHQUAL-9537) - // minor version 2: add decaying engagement counts and last engaged timestamps (SEARCHQUAL-10532) - VERSION_101("Add emoji to the index. SEARCH-12991", true, 2), - VERSION_102("Periodic full archive rebuild. PCM-67851", true, 0), - VERSION_103("Add liked_by_user_id field. SEARCH-15341", true, 0), - // minor version 1: remove last engaged timestamp with 3-hour increment (SEARCHQUAL-10903) - // minor version 2: add fake engagement counts (SEARCHQUAL-10795) - // minor version 3: add last engaged timestamp with 1-hour increment (SEARCHQUAL-10942) - VERSION_104("Reverting to the 20170109_pc100_par30 build gen. SEARCH-15731", true, 3), - VERSION_105("Add 3 new fields to archive index for engagement features. SEARCH-16102", true, 0), - // This is the last rebuild based on /tables/statuses. Starting 9/14 this build-gen is powered - // by TweetSource. During this rebuild both statuses and engagement counts were rebuilt. - VERSION_106("Periodic archive full rebuild. PCM-74652", true, 0), - VERSION_107("Removing card fields from full archive index.", true, 0), - VERSION_108("Removing the tms_id field from all schemas.", true, 0), - VERSION_109("Removing LAT_LON_FIELD from all schemas.", true, 0), - VERSION_110("Adding the card fields back to the full archive index.", true, 1), - // minor version 1: Add composer source csf field (SEARCH-22494) - VERSION_111("Adding composer_source to index. SEARCH-20377.", true, 1), - VERSION_112("Partial rebuild to fix SEARCH-22529.", true, 0), - VERSION_113("Full archive build gen 20180312_pc100_par30.", true, 0), - VERSION_114("Fix for SEARCH-23761.", true, 0), - VERSION_115("Add fields for quoted tweets. SEARCH-23919", true, 0), - // minor version 1: Add 4 bit hashtag count, mention count and stock count (SEARCH-24336) - VERSION_116("Bump flush version for scrubbing pipeline. 
SEARCH-24225", true, 1), - VERSION_117("Add retweeted_by_user_id and replied_to_by_user_id fields. SEARCH-24463", true, 0), - // minor version 1: Removed dopamine_non_personalized_score (SEARCHQUAL-10321) - VERSION_118("Adding the reply and retweet source tweet IDs: SEARCH-23702, SEARCH-24502", true, 1), - // minor version 1: add blink engagement counts (SEARCHQUAL-15176) - VERSION_119("Remove public inferred location: SEARCH-24235", true, 1), - VERSION_120("Flush extensions before fields when flushing segments.", true, 0), - VERSION_121("Flush the startingDocIdForSearch field. SEARCH-25464.", true, 0), - VERSION_122("Do not flush the startingDocIdForSearch field.", true, 0), - VERSION_123("Renaming the largestDocID flushed property to firstAddedDocID.", true, 0), - VERSION_124("Use the skip list posting list for all fields.", true, 0), - VERSION_125("Use hashmap for tweet ID lookup.", true, 0), - VERSION_126("Use the skip list posting list for all fields.", true, 0), - VERSION_127("Flushing the min and max doc IDs in each segment.", true, 0), - VERSION_128("Add card_lang to index. SEARCH-26539", true, 0), - VERSION_129("Move the tweet ID mapper to the segment data.", true, 0), - VERSION_130("Move the time mapper to the segment data.", true, 0), - VERSION_131("Change the facets classes to work with any doc IDs.", true, 0), - VERSION_132("Make the CSF classes work with any doc IDs.", true, 0), - VERSION_133("Removing smallestDocID property.", true, 0), - VERSION_134("Optimize DeletedDocs before flushing.", true, 0), - VERSION_135("Add payloads to skiplists.", true, 0), - VERSION_136("Add name to int pools.", true, 0), - VERSION_137("Add unsorted stream offset.", true, 0), - VERSION_138("Switch to the OutOfOrderRealtimeTweetIDMapper.", true, 0), - VERSION_139("Remove realtime posting lists.", true, 0), - VERSION_140("Add named_entity field. 
SEARCH-27547", true, 0), - VERSION_141("Flush the out of order updates count.", true, 0), - VERSION_142("Add named_entity facet support. SEARCH-28054", true, 0), - VERSION_143("Index updates before optimizing segment.", true, 0), - VERSION_144("Refactor TermsArray.", true, 0), - VERSION_145("Remove SmallestDocID.", true, 0), - VERSION_146("Add entity_id facet support. SEARCH-28071", true, 0), - VERSION_147("Enable updating facets", true, 0), - VERSION_148("Rename the counter for feature updates to partial updates", true, 0), - VERSION_149("Stop flushing offsets for sorted updates DL streams.", true, 0), - VERSION_150("Update the name of the property for the updates DL stream offset.", true, 0), - VERSION_151("Upgrade Lucene version to 5.5.5.", true, 0), - VERSION_152("Upgrade Lucene version to 6.0.0.", true, 0), - VERSION_153("Upgrade Lucene version to 6.6.6.", true, 0), - VERSION_154("Store the timeslice ID on EarlybirdIndexSegmentData.", true, 0), - VERSION_155("Do not flush index extensions.", true, 0), - VERSION_156("Deprecate ThriftIndexedFieldSettings.defaultFieldBoost.", true, 0), - VERSION_157("Load CREATED_AT_CSF_FIELD into RAM in archive.", true, 0), - VERSION_158("Added directed at user ID field and CSF.", true, 0), - VERSION_159("Changing deleted docs serialization format.", true, 0), - VERSION_160("Add fields for health model scores. SEARCH-31907, HML-2099", true, 0), - VERSION_161("Switch to the 'search' Kafka cluster.", true, 0), - VERSION_162("Update Lucene version to 7.0.0.", true, 0), - VERSION_163("Update Lucene version to 7.7.2.", true, 0), - // minor version 1: add IS_TRENDING_NOW_FLAG - VERSION_164("Collect per-term stats in the realtime segments.", true, 1), - VERSION_165("Update Lucene version to 8.5.2.", true, 0), - VERSION_166("Serialize maxPosition field for InvertedRealtimeIndex", true, 0), - VERSION_167("Add field for pSpammyTweetScore. HML-2557", true, 0), - VERSION_168("Add field for pReportedTweetScore. 
HML-2644", true, 0), - VERSION_169("Add field for spammyTweetContentScore. PFM-70", true, 0), - VERSION_170("Add reference author id CSF. SEARCH-34715", true, 0), - VERSION_171("Add space_id field. SEARCH-36156", true, 0), - VERSION_172("Add facet support for space_id. SEARCH-36388", true, 0), - VERSION_173("Add space admin and title fields. SEARCH-36986", true, 0), - VERSION_174("Switching to Penguin v7 for realtime-exp0 cluster. SEARCH-36068", true, 0), - VERSION_175("Adding exclusive conversation author id CSF", true, 0), - VERSION_176("Adding card URI CSF", true, 0), - // minor version 1: add FROM_BLUE_VERIFIED_ACCOUNT_FLAG - // minor version 2: Adding new cluster REALTIME_CG. SEARCH-45692 - VERSION_177("Adding URL Description and Title fields. SEARCH-41641", true, 2), - - /** - * This semi colon is on a separate line to avoid polluting git blame history. - * Put a comma after the new enum field you're adding. - */; - - // The current version. - public static final FlushVersion CURRENT_FLUSH_VERSION = - FlushVersion.values()[FlushVersion.values().length - 1]; - - public static final String DELIMITER = "_v_"; - - /* ======================================================= - * Helper methods - * ======================================================= */ - private final String description; - private final boolean isOfficial; - private final int minorVersion; - - /** - * A flush version is not official unless explicitly stated to be official. - * An unofficial flush version is never uploaded to HDFS. - */ - private FlushVersion(String description) { - this(description, false, 0); - } - - private FlushVersion(String description, boolean isOfficial) { - this(description, isOfficial, 0); - } - - private FlushVersion(String description, boolean isOfficial, int minorVersion) { - this.description = description; - this.isOfficial = isOfficial; - this.minorVersion = minorVersion; - } - - /** - * Returns file extension with version number. 
- */ - public String getVersionFileExtension() { - if (this == VERSION_0) { - return ""; - } else { - return DELIMITER + ordinal(); - } - } - - /** - * Returns file extension given flush version number. - * If the flush version is unknown (e.g. higher than current flush version or lower than 0), null - * is returned. - */ - @Nullable - public static String getVersionFileExtension(int flushVersion) { - if (flushVersion > CURRENT_FLUSH_VERSION.ordinal() || flushVersion < 0) { - return null; - } else { - return FlushVersion.values()[flushVersion].getVersionFileExtension(); - } - } - - /** - * Returns a string describing the current schema version. - * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getVersionDescription()} - */ - @Deprecated - public String getDescription() { - return description; - } - - /** - * Returns the schema's major version. - * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#getMajorVersionNumber()}. - */ - @Deprecated - public int getVersionNumber() { - return this.ordinal(); - } - - public boolean onOrAfter(FlushVersion other) { - return compareTo(other) >= 0; - } - - /** - * Returns whether the schema version is official. Only official segments are uploaded to HDFS. - * @deprecated Please use {@link com.twitter.search.common.schema.base.Schema#isVersionOfficial()}. - */ - @Deprecated - public boolean isOfficial() { - // We want the loading/flushing tests to pass locally even if the version is not meant - // to be an official version. - return isOfficial || Config.environmentIsTest(); - } - - /** - * As of now, this is hardcoded to 0. We will start using this soon. - * @deprecated Please consult schema for minor version. This should only be used to build schema. 
- */ - @Deprecated - public int getMinorVersion() { - return minorVersion; - } -} diff --git a/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.docx b/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.docx new file mode 100644 index 000000000..369258fb5 Binary files /dev/null and b/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.docx differ diff --git a/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.java b/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.java deleted file mode 100644 index 5fc221ba7..000000000 --- a/src/java/com/twitter/search/common/search/AndNotDocIdSetIterator.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; - -import org.apache.lucene.search.DocIdSetIterator; - -public class AndNotDocIdSetIterator extends DocIdSetIterator { - private int nextDelDoc; - private final DocIdSetIterator baseIter; - private final DocIdSetIterator notIter; - private int currID; - - /** Creates a new AndNotDocIdSetIterator instance. 
*/ - public AndNotDocIdSetIterator(DocIdSetIterator baseIter, DocIdSetIterator notIter) - throws IOException { - nextDelDoc = notIter.nextDoc(); - this.baseIter = baseIter; - this.notIter = notIter; - currID = -1; - } - - @Override - public int advance(int target) throws IOException { - currID = baseIter.advance(target); - if (currID == DocIdSetIterator.NO_MORE_DOCS) { - return currID; - } - - if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) { - if (currID < nextDelDoc) { - return currID; - } else if (currID == nextDelDoc) { - return nextDoc(); - } else { - nextDelDoc = notIter.advance(currID); - if (currID == nextDelDoc) { - return nextDoc(); - } - } - } - return currID; - } - - @Override - public int docID() { - return currID; - } - - @Override - public int nextDoc() throws IOException { - currID = baseIter.nextDoc(); - if (nextDelDoc != DocIdSetIterator.NO_MORE_DOCS) { - while (currID != DocIdSetIterator.NO_MORE_DOCS) { - if (currID < nextDelDoc) { - return currID; - } else { - if (currID == nextDelDoc) { - currID = baseIter.nextDoc(); - } - nextDelDoc = notIter.advance(currID); - } - } - } - return currID; - } - - @Override - public long cost() { - return baseIter.cost(); - } -} diff --git a/src/java/com/twitter/search/common/search/BUILD b/src/java/com/twitter/search/common/search/BUILD deleted file mode 100644 index ac5fe14b7..000000000 --- a/src/java/com/twitter/search/common/search/BUILD +++ /dev/null @@ -1,33 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/log4j", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - 
"3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "3rdparty/jvm/org/apache/lucene:lucene-spatial-extras", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/query", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/queryparser", - "src/thrift/com/twitter/search/common:facets-java", - "src/thrift/com/twitter/search/common:query-java", - ], -) diff --git a/src/java/com/twitter/search/common/search/BUILD.docx b/src/java/com/twitter/search/common/search/BUILD.docx new file mode 100644 index 000000000..680b0c504 Binary files /dev/null and b/src/java/com/twitter/search/common/search/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.docx b/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.docx new file mode 100644 index 000000000..c417be3f3 Binary files /dev/null and b/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.docx differ diff --git a/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.java b/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.java deleted file mode 100644 index 977f4a0a5..000000000 --- a/src/java/com/twitter/search/common/search/DelegatingEarlyTerminationCollector.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; -import java.util.List; - -import javax.annotation.Nullable; - -import org.apache.lucene.index.LeafReaderContext; -import 
org.apache.lucene.search.Collector; -import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.Scorable; -import org.apache.lucene.search.ScoreMode; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.query.thriftjava.CollectorParams; - -/** - * A {@link com.twitter.search.common.search.TwitterEarlyTerminationCollector} - * that delegates actual hit collection to a sub collector. - */ -public final class DelegatingEarlyTerminationCollector - extends TwitterEarlyTerminationCollector { - private final Collector subCollector; - private LeafCollector subLeafCollector; - - /** Creates a new DelegatingEarlyTerminationCollector instance. */ - public DelegatingEarlyTerminationCollector(Collector subCollector, - CollectorParams collectorParams, - TerminationTracker terminationTracker, - @Nullable QueryCostProvider queryCostProvider, - int numDocsBetweenTimeoutChecks, - Clock clock) { - super( - collectorParams, - terminationTracker, - queryCostProvider, - numDocsBetweenTimeoutChecks, - clock); - this.subCollector = subCollector; - } - - @Override - public void setScorer(Scorable scorer) throws IOException { - super.setScorer(scorer); - subLeafCollector.setScorer(scorer); - } - - @Override - protected void doCollect() throws IOException { - subLeafCollector.collect(curDocId); - } - - @Override - protected void doFinishSegment(int lastSearchedDocID) throws IOException { - if (subCollector instanceof TwitterCollector) { - ((TwitterCollector) subCollector).finishSegment(lastSearchedDocID); - } - } - - @Override - public void setNextReader(LeafReaderContext context) throws IOException { - super.setNextReader(context); - subLeafCollector = subCollector.getLeafCollector(context); - } - - @Override - public ScoreMode scoreMode() { - return subCollector.scoreMode(); - } - - @Override - public List getDebugInfo() { - return null; - } -} diff --git a/src/java/com/twitter/search/common/search/DocIdTracker.docx 
b/src/java/com/twitter/search/common/search/DocIdTracker.docx new file mode 100644 index 000000000..7de1b8f66 Binary files /dev/null and b/src/java/com/twitter/search/common/search/DocIdTracker.docx differ diff --git a/src/java/com/twitter/search/common/search/DocIdTracker.java b/src/java/com/twitter/search/common/search/DocIdTracker.java deleted file mode 100644 index 97546315e..000000000 --- a/src/java/com/twitter/search/common/search/DocIdTracker.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.common.search; - -/** - * Provide an accessor for a doc ID. This is useful for classes that iterate through doc IDs - * and maintain a "last seen" doc ID. - */ -public interface DocIdTracker { - /** - * Retrieve current doc ID - */ - int getCurrentDocId(); -} diff --git a/src/java/com/twitter/search/common/search/EarlyTerminationState.docx b/src/java/com/twitter/search/common/search/EarlyTerminationState.docx new file mode 100644 index 000000000..da39e6e6e Binary files /dev/null and b/src/java/com/twitter/search/common/search/EarlyTerminationState.docx differ diff --git a/src/java/com/twitter/search/common/search/EarlyTerminationState.java b/src/java/com/twitter/search/common/search/EarlyTerminationState.java deleted file mode 100644 index 31a1731e6..000000000 --- a/src/java/com/twitter/search/common/search/EarlyTerminationState.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.common.search; - -import javax.annotation.Nonnull; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.metrics.SearchCounter; - -/** - * This is not an enum to allow different clusters to define their own EarlyTerminationStates. 
- */ -public final class EarlyTerminationState { - private static final String STATS_PREFIX = "early_termination_"; - - public static final EarlyTerminationState COLLECTING = - new EarlyTerminationState("no_early_termination", false); - public static final EarlyTerminationState TERMINATED_TIME_OUT_EXCEEDED = - new EarlyTerminationState("terminated_timeout_exceeded", true); - public static final EarlyTerminationState TERMINATED_MAX_QUERY_COST_EXCEEDED = - new EarlyTerminationState("terminated_max_query_cost_exceeded", true); - public static final EarlyTerminationState TERMINATED_MAX_HITS_EXCEEDED = - new EarlyTerminationState("terminated_max_hits_exceeded", true); - public static final EarlyTerminationState TERMINATED_NUM_RESULTS_EXCEEDED = - new EarlyTerminationState("terminated_num_results_exceeded", true); - - - // This string can be returned as a part of a search response, to tell the searcher - // why the search got early terminated. - private final String terminationReason; - private final boolean terminated; - private final SearchCounter count; - - public EarlyTerminationState(@Nonnull String terminationReason, boolean terminated) { - this.terminationReason = Preconditions.checkNotNull(terminationReason); - this.terminated = terminated; - count = SearchCounter.export(STATS_PREFIX + terminationReason + "_count"); - - } - - public boolean isTerminated() { - return terminated; - } - - public String getTerminationReason() { - return terminationReason; - } - - public void incrementCount() { - count.increment(); - } -} diff --git a/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.docx b/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.docx new file mode 100644 index 000000000..90dfab8b6 Binary files /dev/null and b/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.docx differ diff --git a/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.java 
b/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.java deleted file mode 100644 index 43475e9b7..000000000 --- a/src/java/com/twitter/search/common/search/GeoQuadTreeQueryBuilderUtil.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.search.common.search; - -import java.util.LinkedHashSet; -import java.util.Set; - -import org.apache.lucene.search.Query; -import org.apache.lucene.spatial.prefix.tree.Cell; -import org.apache.lucene.spatial.prefix.tree.CellIterator; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.util.spatial.GeohashChunkImpl; -import com.twitter.search.queryparser.util.GeoCode; - -import geo.google.datamodel.GeoAddressAccuracy; - -public final class GeoQuadTreeQueryBuilderUtil { - private GeoQuadTreeQueryBuilderUtil() { - } - - /** - * Build a geo quad tree query based around the geo code based on the geo field. - * @param geocode the geo location for the quad tree query - * @param field the field where the geohash tokens are indexed - * @return the corresponding for the geo quad tree query - */ - public static Query buildGeoQuadTreeQuery(GeoCode geocode, String field) { - Set geoHashSet = new LinkedHashSet<>(); - - // if accuracy is specified. Add a term query based on accuracy. - if (geocode.accuracy != GeoAddressAccuracy.UNKNOWN_LOCATION.getCode()) { - BytesRef termRef = new BytesRef(GeohashChunkImpl.buildGeoStringWithAccuracy(geocode.latitude, - geocode.longitude, - geocode.accuracy)); - geoHashSet.add(termRef); - } - - // If distance is specified. Add term queries based on distance - if (geocode.distanceKm != GeoCode.DOUBLE_DISTANCE_NOT_SET) { - // Build query based on distance - int treeLevel = -1; - // First find block containing query point with diagonal greater than 2 * radius. 
- Cell centerNode = GeohashChunkImpl.getGeoNodeByRadius(geocode.latitude, geocode.longitude, - geocode.distanceKm); - // Add center node querying term - if (centerNode != null) { - geoHashSet.add(centerNode.getTokenBytesNoLeaf(new BytesRef())); - treeLevel = centerNode.getLevel(); - } - - // This improves edge case recall, by adding cells also intersecting the query area. - CellIterator nodes = GeohashChunkImpl.getNodesIntersectingCircle(geocode.latitude, - geocode.longitude, - geocode.distanceKm, - treeLevel); - // If there are other nodes intersecting query circle, also add them in. - if (nodes != null) { - while (nodes.hasNext()) { - geoHashSet.add(nodes.next().getTokenBytesNoLeaf(new BytesRef())); - } - } - } - - return new com.twitter.search.common.query.MultiTermDisjunctionQuery(field, geoHashSet); - } -} diff --git a/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.docx b/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.docx new file mode 100644 index 000000000..73e5d6fb7 Binary files /dev/null and b/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.docx differ diff --git a/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.java b/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.java deleted file mode 100644 index ea370ce9d..000000000 --- a/src/java/com/twitter/search/common/search/IntArrayDocIdSetIterator.java +++ /dev/null @@ -1,76 +0,0 @@ -package com.twitter.search.common.search; - -import java.util.Arrays; - -import org.apache.lucene.search.DocIdSetIterator; - -/** - * DocIdSetIterator implementation from a sorted list of non-negative integers. If the given list of - * doc IDs is not sorted or contains negative doc IDs, the results are undefined. 
- */ -public class IntArrayDocIdSetIterator extends DocIdSetIterator { - private final int[] docIds; - private int docId; - private int cursor; - - public IntArrayDocIdSetIterator(int[] ids) { - docIds = ids; - reset(); - } - - /** Used for testing. */ - public void reset() { - docId = -1; - cursor = -1; - } - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() { - return advance(docId); - } - - @Override - public int advance(int target) { - if (docId == NO_MORE_DOCS) { - return docId; - } - - if (target < docId) { - return docId; - } - - if (cursor == docIds.length - 1) { - docId = NO_MORE_DOCS; - return docId; - } - - if (target == docId) { - docId = docIds[++cursor]; - return docId; - } - - int toIndex = Math.min(cursor + (target - docId) + 1, docIds.length); - int targetIndex = Arrays.binarySearch(docIds, cursor + 1, toIndex, target); - if (targetIndex < 0) { - targetIndex = -targetIndex - 1; - } - - if (targetIndex == docIds.length) { - docId = NO_MORE_DOCS; - } else { - cursor = targetIndex; - docId = docIds[cursor]; - } - return docId; - } - - @Override - public long cost() { - return docIds == null ? 
0 : docIds.length; - } -} diff --git a/src/java/com/twitter/search/common/search/PairDocIdSetIterator.docx b/src/java/com/twitter/search/common/search/PairDocIdSetIterator.docx new file mode 100644 index 000000000..e4c8c5ae2 Binary files /dev/null and b/src/java/com/twitter/search/common/search/PairDocIdSetIterator.docx differ diff --git a/src/java/com/twitter/search/common/search/PairDocIdSetIterator.java b/src/java/com/twitter/search/common/search/PairDocIdSetIterator.java deleted file mode 100644 index 0ed125923..000000000 --- a/src/java/com/twitter/search/common/search/PairDocIdSetIterator.java +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.search.DocIdSetIterator; -/** - * Disjunction over 2 DocIdSetIterators. This should be faster than a disjunction over N since there - * would be no need to adjust the heap. - */ -public class PairDocIdSetIterator extends DocIdSetIterator { - - private final DocIdSetIterator d1; - private final DocIdSetIterator d2; - - private int doc = -1; - - /** Creates a new PairDocIdSetIterator instance. 
*/ - public PairDocIdSetIterator(DocIdSetIterator d1, DocIdSetIterator d2) throws IOException { - Preconditions.checkNotNull(d1); - Preconditions.checkNotNull(d2); - this.d1 = d1; - this.d2 = d2; - // position the iterators - this.d1.nextDoc(); - this.d2.nextDoc(); - } - - @Override - public int docID() { - return doc; - } - - @Override - public int nextDoc() throws IOException { - int doc1 = d1.docID(); - int doc2 = d2.docID(); - DocIdSetIterator iter = null; - if (doc1 < doc2) { - doc = doc1; - //d1.nextDoc(); - iter = d1; - } else if (doc1 > doc2) { - doc = doc2; - //d2.nextDoc(); - iter = d2; - } else { - doc = doc1; - //d1.nextDoc(); - //d2.nextDoc(); - } - - if (doc != NO_MORE_DOCS) { - if (iter != null) { - iter.nextDoc(); - } else { - d1.nextDoc(); - d2.nextDoc(); - } - } - return doc; - } - - @Override - public int advance(int target) throws IOException { - if (d1.docID() < target) { - d1.advance(target); - } - if (d2.docID() < target) { - d2.advance(target); - } - return (doc != NO_MORE_DOCS) ? nextDoc() : doc; - } - - @Override - public long cost() { - // very coarse estimate - return d1.cost() + d2.cost(); - } - -} diff --git a/src/java/com/twitter/search/common/search/QueryCostProvider.docx b/src/java/com/twitter/search/common/search/QueryCostProvider.docx new file mode 100644 index 000000000..dfa3f8a8e Binary files /dev/null and b/src/java/com/twitter/search/common/search/QueryCostProvider.docx differ diff --git a/src/java/com/twitter/search/common/search/QueryCostProvider.java b/src/java/com/twitter/search/common/search/QueryCostProvider.java deleted file mode 100644 index 7e5d72433..000000000 --- a/src/java/com/twitter/search/common/search/QueryCostProvider.java +++ /dev/null @@ -1,9 +0,0 @@ -package com.twitter.search.common.search; - -/** - * Any class that can track and return query cost. - */ -public interface QueryCostProvider { - /** Returns the total cost. 
*/ - double getTotalCost(); -} diff --git a/src/java/com/twitter/search/common/search/TerminationTracker.docx b/src/java/com/twitter/search/common/search/TerminationTracker.docx new file mode 100644 index 000000000..3a2c3466b Binary files /dev/null and b/src/java/com/twitter/search/common/search/TerminationTracker.docx differ diff --git a/src/java/com/twitter/search/common/search/TerminationTracker.java b/src/java/com/twitter/search/common/search/TerminationTracker.java deleted file mode 100644 index 916415078..000000000 --- a/src/java/com/twitter/search/common/search/TerminationTracker.java +++ /dev/null @@ -1,202 +0,0 @@ -package com.twitter.search.common.search; - -import java.util.HashSet; -import java.util.Set; - -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; - -/** - * Used for tracking termination criteria for earlybird queries. - * - * Currently this tracks the query time out and query cost, if they are set on the - * {@link com.twitter.search.common.query.thriftjava.CollectorTerminationParams}. - */ -public class TerminationTracker { - /** Query start time provided by client. */ - private final long clientStartTimeMillis; - - /** Timeout end times, calculated from {@link #clientStartTimeMillis}. */ - private final long timeoutEndTimeMillis; - - /** Query start time recorded at earlybird server. */ - private final long localStartTimeMillis; - - /** Tracking query cost */ - private final double maxQueryCost; - - // Sometimes, we want to early terminate before timeoutEndTimeMillis, to reserve time for - // work that needs to be done after early termination (E.g. merging results). - private final int postTerminationOverheadMillis; - - // We don't check for early termination often enough. Some times requests timeout in between - // early termination checks. This buffer time is also substracted from deadline. 
- // To illustrate how this is used, let's use a simple example: - // If we spent 750ms searching 5 segments, a rough estimate is that we need 150ms to search - // one segment. If the timeout is set to 800ms, we should not starting searching the next segment. - // In this case, on can set preTerminationSafeBufferTimeMillis to 150ms, so that when early - // termination check computes the deadline, this buffer is also subtracted. See SEARCH-29723. - private int preTerminationSafeBufferTimeMillis = 0; - - private EarlyTerminationState earlyTerminationState = EarlyTerminationState.COLLECTING; - - // This flag determines whether the last searched doc ID trackers should be consulted when a - // timeout occurs. - private final boolean useLastSearchedDocIdOnTimeout; - - private final Set lastSearchedDocIdTrackers = new HashSet<>(); - - /** - * Creates a new termination tracker that will not specify a timeout or max query cost. - * Can be used for queries that explicitly do not want to use a timeout. Meant to be used for - * tests, and background queries running for the query cache. - */ - public TerminationTracker(Clock clock) { - this.clientStartTimeMillis = clock.nowMillis(); - this.localStartTimeMillis = clientStartTimeMillis; - this.timeoutEndTimeMillis = Long.MAX_VALUE; - this.maxQueryCost = Double.MAX_VALUE; - this.postTerminationOverheadMillis = 0; - this.useLastSearchedDocIdOnTimeout = false; - } - - /** - * Convenient method overloading for - * {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}. - */ - public TerminationTracker( - CollectorTerminationParams terminationParams, Clock clock, - int postTerminationOverheadMillis) { - this(terminationParams, clock.nowMillis(), clock, postTerminationOverheadMillis); - } - - /** - * Convenient method overloading for - * {@link #TerminationTracker(CollectorTerminationParams, long, Clock, int)}. 
- */ - public TerminationTracker( - CollectorTerminationParams terminationParams, int postTerminationOverheadMillis) { - this( - terminationParams, - System.currentTimeMillis(), - Clock.SYSTEM_CLOCK, - postTerminationOverheadMillis); - } - - /** - * Creates a new TerminationTracker instance. - * - * @param terminationParams CollectorParams.CollectorTerminationParams carrying parameters - * about early termination. - * @param clientStartTimeMillis The query start time (in millis) specified by client. This is used - * to calculate timeout end time, like {@link #timeoutEndTimeMillis}. - * @param clock used to sample {@link #localStartTimeMillis}. - * @param postTerminationOverheadMillis How much time should be reserved. E.g. if request time - * out is 800ms, and this is set to 200ms, early termination - * will kick in at 600ms mark. - */ - public TerminationTracker( - CollectorTerminationParams terminationParams, - long clientStartTimeMillis, - Clock clock, - int postTerminationOverheadMillis) { - Preconditions.checkNotNull(terminationParams); - Preconditions.checkArgument(postTerminationOverheadMillis >= 0); - - this.clientStartTimeMillis = clientStartTimeMillis; - this.localStartTimeMillis = clock.nowMillis(); - - if (terminationParams.isSetTimeoutMs() - && terminationParams.getTimeoutMs() > 0) { - Preconditions.checkState(terminationParams.getTimeoutMs() >= postTerminationOverheadMillis); - this.timeoutEndTimeMillis = this.clientStartTimeMillis + terminationParams.getTimeoutMs(); - } else { - // Effectively no timeout. 
- this.timeoutEndTimeMillis = Long.MAX_VALUE; - } - - // Tracking query cost - if (terminationParams.isSetMaxQueryCost() - && terminationParams.getMaxQueryCost() > 0) { - maxQueryCost = terminationParams.getMaxQueryCost(); - } else { - maxQueryCost = Double.MAX_VALUE; - } - - this.useLastSearchedDocIdOnTimeout = terminationParams.isEnforceQueryTimeout(); - this.postTerminationOverheadMillis = postTerminationOverheadMillis; - } - - /** - * Returns the reserve time to perform post termination work. Return the deadline timestamp - * with postTerminationWorkEstimate subtracted. - */ - public long getTimeoutEndTimeWithReservation() { - // Return huge value if time out is disabled. - if (timeoutEndTimeMillis == Long.MAX_VALUE) { - return timeoutEndTimeMillis; - } else { - return timeoutEndTimeMillis - - postTerminationOverheadMillis - - preTerminationSafeBufferTimeMillis; - } - } - - public void setPreTerminationSafeBufferTimeMillis(int preTerminationSafeBufferTimeMillis) { - Preconditions.checkArgument(preTerminationSafeBufferTimeMillis >= 0); - - this.preTerminationSafeBufferTimeMillis = preTerminationSafeBufferTimeMillis; - } - - public long getLocalStartTimeMillis() { - return localStartTimeMillis; - } - - public long getClientStartTimeMillis() { - return clientStartTimeMillis; - } - - public double getMaxQueryCost() { - return maxQueryCost; - } - - public boolean isEarlyTerminated() { - return earlyTerminationState.isTerminated(); - } - - public EarlyTerminationState getEarlyTerminationState() { - return earlyTerminationState; - } - - public void setEarlyTerminationState(EarlyTerminationState earlyTerminationState) { - this.earlyTerminationState = earlyTerminationState; - } - - /** - * Return the minimum searched doc ID amongst all registered trackers, or -1 if there aren't any - * trackers. Doc IDs are stored in ascending order, and trackers update their doc IDs as they - * search, so the minimum doc ID reflects the most recent fully searched doc ID. 
- */ - int getLastSearchedDocId() { - return lastSearchedDocIdTrackers.stream() - .mapToInt(DocIdTracker::getCurrentDocId).min().orElse(-1); - } - - void resetDocIdTrackers() { - lastSearchedDocIdTrackers.clear(); - } - - /** - * Add a DocIdTracker, to keep track of the last fully-searched doc ID when early termination - * occurs. - */ - public void addDocIdTracker(DocIdTracker docIdTracker) { - lastSearchedDocIdTrackers.add(docIdTracker); - } - - public boolean useLastSearchedDocIdOnTimeout() { - return useLastSearchedDocIdOnTimeout; - } -} diff --git a/src/java/com/twitter/search/common/search/TwitterCollector.docx b/src/java/com/twitter/search/common/search/TwitterCollector.docx new file mode 100644 index 000000000..8c421d08c Binary files /dev/null and b/src/java/com/twitter/search/common/search/TwitterCollector.docx differ diff --git a/src/java/com/twitter/search/common/search/TwitterCollector.java b/src/java/com/twitter/search/common/search/TwitterCollector.java deleted file mode 100644 index 0661db8fc..000000000 --- a/src/java/com/twitter/search/common/search/TwitterCollector.java +++ /dev/null @@ -1,31 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; - -import org.apache.lucene.search.Collector; - -/** - * Lucene Collectors throw CollectionTerminatedException to perform early termination. - * We don't believe that throwing Exceptions to control execution flow is ideal, so we are adding - * this class to be a base of all Twitter Collectors. - * - * {@link com.twitter.search.common.search.TwitterIndexSearcher} uses the {@link #isTerminated()} - * method to perform early termination, instead of relying on CollectionTerminatedException. - */ -public abstract class TwitterCollector implements Collector { - - /** - * Subclasses should return true if they want to perform early termination. - * This method is called every hit and should not be expensive. 
- */ - public abstract boolean isTerminated() throws IOException; - - /** - * Lucene API only has a method that's called before searching a segment setNextReader(). - * This hook is called after finishing searching a segment. - * @param lastSearchedDocID is the last docid searched before termination, - * or NO_MORE_DOCS if there was no early termination. This doc need not be a hit, - * and should not be collected here. - */ - public abstract void finishSegment(int lastSearchedDocID) throws IOException; -} diff --git a/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.docx b/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.docx new file mode 100644 index 000000000..17241efe0 Binary files /dev/null and b/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.docx differ diff --git a/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.java b/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.java deleted file mode 100644 index bc7711e7d..000000000 --- a/src/java/com/twitter/search/common/search/TwitterEarlyTerminationCollector.java +++ /dev/null @@ -1,328 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; -import java.util.List; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.Scorable; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.query.thriftjava.CollectorParams; -import 
com.twitter.search.common.query.thriftjava.CollectorTerminationParams; - -/** - * A TwitterCollector containing the most common early termination logic based on - * timeout, cost, and max hits. This class does not do any actual hit collection---this class - * is abstract and cannot be instantiated. - * - * If a Collector and all its subclasses need early termination, it should extend this class. - * - * However, if one just wants to add EarlyTermination to any single collector, he can just - * use {@link DelegatingEarlyTerminationCollector} - * as a wrapper. - */ -public abstract class TwitterEarlyTerminationCollector - extends TwitterCollector implements LeafCollector { - private static final Logger LOG = LoggerFactory.getLogger(TwitterEarlyTerminationCollector.class); - private static final SearchCounter NEGATIVE_TIME_PER_SEGMENT = - SearchCounter.export("TwitterEarlyTerminationCollector_negative_time_per_segment"); - private static final SearchRateCounter QUERY_TIMEOUT_ENFORCED = - SearchRateCounter.export("TwitterEarlyTerminationCollector_query_timeout_enforced"); - - protected int curDocId = -1; - - protected Scorable scorer = null; - private LeafReader curReader = null; - private final long maxHitsToProcess; - private long numHitsProcessed = 0; - private int lastEarlyTerminationCheckDocId = -1; - private final Clock clock; - - @Nullable - private final QueryCostProvider queryCostProvider; - - private final TerminationTracker terminationTracker; - - // This determines how often the expensive early termination check is performed. - // If set to be negative, expensive early termination check only performed at segment boundaries. - // If set to a positive number X, this check is performed every X docs processed. - private int numDocsBetweenTimeoutChecks; - - // Number of segments searched so far. - // This is used to predicatively early terminate. - // Expensive early termination checks may not happen often enough. 
Sometimes the request - // times out in between the termination checks. - // After finishing searching a segment, we estimate how much time is needed to search one - // segment on average. If searching the next segment would cause a timeout, we early terminate. - private int numSearchedSegments = 0; - - /** - * Creates a new TwitterEarlyTerminationCollector instance. - * - * @param collectorParams the parameters needed to guide early termination. - * @param terminationTracker If null is passed in, a new TerminationTrack is created. Otherwise, - * the one passed in is used. - * @param numDocsBetweenTimeoutChecks TerminationTracker based check are performed upon a hit - * every numDocsBetweenTimeoutChecks docs. If a non-positive number is passed - * in, TerminationTracker based checks are disabled. - * If collectorParams specifies a value as well, that value is used. - */ - public TwitterEarlyTerminationCollector( - CollectorParams collectorParams, - TerminationTracker terminationTracker, - @Nullable QueryCostProvider queryCostProvider, - int numDocsBetweenTimeoutChecks, - Clock clock) { - CollectorTerminationParams terminationParams = collectorParams.getTerminationParams(); - - if (terminationParams == null) { - terminationParams = new CollectorTerminationParams() - .setMaxHitsToProcess(Integer.MAX_VALUE) - .setMaxQueryCost(Double.MAX_VALUE) - .setTimeoutMs(Integer.MAX_VALUE); - } - - if (!terminationParams.isSetMaxHitsToProcess() || terminationParams.getMaxHitsToProcess() < 0) { - maxHitsToProcess = Integer.MAX_VALUE; - } else { - maxHitsToProcess = terminationParams.getMaxHitsToProcess(); - } - - if (terminationParams.isSetNumDocsBetweenTimeoutChecks()) { - this.numDocsBetweenTimeoutChecks = terminationParams.getNumDocsBetweenTimeoutChecks(); - } else { - this.numDocsBetweenTimeoutChecks = numDocsBetweenTimeoutChecks; - } - - this.terminationTracker = Preconditions.checkNotNull(terminationTracker); - this.queryCostProvider = queryCostProvider; - this.clock = 
clock; - } - - public final LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { - this.setNextReader(context); - return this; - } - - /** - * Sub-classes may override this to add more collection logic. - */ - protected abstract void doCollect() throws IOException; - - /** - * Sub-classes may override this to add more segment completion logic. - * @param lastSearchedDocID is the last docid searched before termination, - * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit! - */ - protected abstract void doFinishSegment(int lastSearchedDocID) throws IOException; - - /** - * sub classes can override this to perform more early termination checks. - */ - public EarlyTerminationState innerShouldCollectMore() throws IOException { - return EarlyTerminationState.COLLECTING; - } - - /** - * After early termination, this method can be used to retrieve early termination reason. - */ - @Nonnull - public final EarlyTerminationState getEarlyTerminationState() { - return terminationTracker.getEarlyTerminationState(); - } - - protected final EarlyTerminationState setEarlyTerminationState( - EarlyTerminationState newEarlyTerminationState) { - terminationTracker.setEarlyTerminationState(newEarlyTerminationState); - return newEarlyTerminationState; - } - - @Override - public final boolean isTerminated() throws IOException { - EarlyTerminationState earlyTerminationState = getEarlyTerminationState(); - - if (earlyTerminationState.isTerminated()) { - return true; - } - - if (getNumHitsProcessed() >= getMaxHitsToProcess()) { - collectedEnoughResults(); - if (shouldTerminate()) { - return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED) - .isTerminated(); - } else { - return false; - } - } - - return innerShouldCollectMore().isTerminated(); - } - - /** - * Note: subclasses overriding this method are expected to call "super.setNextReader" - * in their setNextReader(). 
- * @deprecated Remove this methods in favor of {@link #getLeafCollector(LeafReaderContext)} - */ - @Deprecated - public void setNextReader(LeafReaderContext context) throws IOException { - if (!terminationTracker.useLastSearchedDocIdOnTimeout()) { - expensiveEarlyTerminationCheck(); - } - - // Reset curDocId for next segment - curDocId = -1; - lastEarlyTerminationCheckDocId = -1; - curReader = context.reader(); - } - - /** - * Sub-classes overriding this method are expected to call super.setScorer() - */ - @Override - public void setScorer(Scorable scorer) throws IOException { - this.scorer = scorer; - } - - @Override - public final void collect(int doc) throws IOException { - curDocId = doc; - doCollect(); - numHitsProcessed++; - if (numDocsBetweenTimeoutChecks > 0 - && (curDocId - lastEarlyTerminationCheckDocId) >= numDocsBetweenTimeoutChecks) { - lastEarlyTerminationCheckDocId = curDocId; - - if (!terminationTracker.useLastSearchedDocIdOnTimeout()) { - expensiveEarlyTerminationCheck(); - } - } - } - - /** - * Accounting for a segment searched. - * @param lastSearchedDocID is the last docid searched before termination, - * or NO_MORE_DOCS if there was no early termination. This doc may not be a hit! - */ - protected final void trackCompleteSegment(int lastSearchedDocID) throws IOException { - doFinishSegment(lastSearchedDocID); - } - - @Override - public final void finishSegment(int lastSearchedDocID) throws IOException { - // finished searching a segment. Computer average time needed to search a segment. - Preconditions.checkState(curReader != null, "Did subclass call super.setNextReader()?"); - numSearchedSegments++; - - long totalTime = clock.nowMillis() - terminationTracker.getLocalStartTimeMillis(); - - if (totalTime >= Integer.MAX_VALUE) { - String msg = String.format( - "%s: A query runs for %d that is longer than Integer.MAX_VALUE ms. 
lastSearchedDocID: %d", - getClass().getSimpleName(), totalTime, lastSearchedDocID - ); - LOG.error(msg); - throw new IllegalStateException(msg); - } - - int timePerSegment = ((int) totalTime) / numSearchedSegments; - - if (timePerSegment < 0) { - NEGATIVE_TIME_PER_SEGMENT.increment(); - timePerSegment = 0; - } - - // If we're enforcing timeout via the last searched doc ID, we don't need to add this buffer, - // since we'll detect the timeout right away. - if (!terminationTracker.useLastSearchedDocIdOnTimeout()) { - terminationTracker.setPreTerminationSafeBufferTimeMillis(timePerSegment); - } - - // Check whether we timed out and are checking for timeout at the leaves. If so, we should use - // the captured lastSearchedDocId from the tracker instead, which is the most up-to-date amongst - // the query nodes. - if (terminationTracker.useLastSearchedDocIdOnTimeout() - && EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED.equals( - terminationTracker.getEarlyTerminationState())) { - QUERY_TIMEOUT_ENFORCED.increment(); - trackCompleteSegment(terminationTracker.getLastSearchedDocId()); - } else { - trackCompleteSegment(lastSearchedDocID); - } - - // We finished a segment, so clear out the DocIdTrackers. The next segment will register its - // own trackers, and we don't need to keep the trackers from the current segment. - terminationTracker.resetDocIdTrackers(); - - curDocId = -1; - curReader = null; - scorer = null; - } - - /** - * More expensive Early Termination checks, which are not called every hit. - * This sets EarlyTerminationState if it decides that early termination should kick in. - * See: SEARCH-29723. 
- */ - private void expensiveEarlyTerminationCheck() { - if (queryCostProvider != null) { - double totalQueryCost = queryCostProvider.getTotalCost(); - double maxQueryCost = terminationTracker.getMaxQueryCost(); - if (totalQueryCost >= maxQueryCost) { - setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_QUERY_COST_EXCEEDED); - } - } - - final long nowMillis = clock.nowMillis(); - if (nowMillis >= terminationTracker.getTimeoutEndTimeWithReservation()) { - setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED); - } - } - - public long getMaxHitsToProcess() { - return maxHitsToProcess; - } - - public final void setNumHitsProcessed(long numHitsProcessed) { - this.numHitsProcessed = numHitsProcessed; - } - - protected final long getNumHitsProcessed() { - return numHitsProcessed; - } - - protected final int getNumSearchedSegments() { - return numSearchedSegments; - } - - protected final Clock getClock() { - return clock; - } - - @VisibleForTesting - protected final TerminationTracker getTerminationTracker() { - return this.terminationTracker; - } - - protected void collectedEnoughResults() throws IOException { - } - - protected boolean shouldTerminate() { - return true; - } - - /** - * Debug info collected during execution. 
- */ - public abstract List getDebugInfo(); -} diff --git a/src/java/com/twitter/search/common/search/TwitterIndexSearcher.docx b/src/java/com/twitter/search/common/search/TwitterIndexSearcher.docx new file mode 100644 index 000000000..c4898a857 Binary files /dev/null and b/src/java/com/twitter/search/common/search/TwitterIndexSearcher.docx differ diff --git a/src/java/com/twitter/search/common/search/TwitterIndexSearcher.java b/src/java/com/twitter/search/common/search/TwitterIndexSearcher.java deleted file mode 100644 index 97f10160a..000000000 --- a/src/java/com/twitter/search/common/search/TwitterIndexSearcher.java +++ /dev/null @@ -1,189 +0,0 @@ -package com.twitter.search.common.search; - -import java.io.IOException; -import java.util.List; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.MultiDocValues; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.search.CollectionStatistics; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.Weight; - -/** - * An IndexSearch that works with TwitterEarlyTerminationCollector. - * If a stock Lucene collector is passed into search(), this IndexSearch.search() behaves the - * same as Lucene's stock IndexSearcher. However, if a TwitterEarlyTerminationCollector is passed - * in, this IndexSearcher performs early termination without relying on - * CollectionTerminatedException. - */ -public class TwitterIndexSearcher extends IndexSearcher { - public TwitterIndexSearcher(IndexReader r) { - super(r); - } - - /** - * search() main loop. 
- * This behaves exactly like IndexSearcher.search() if a stock Lucene collector passed in. - * However, if a TwitterCollector is passed in, this class performs Twitter style early - * termination without relying on - * {@link org.apache.lucene.search.CollectionTerminatedException}. - */ - @Override - protected void search(List leaves, Weight weight, Collector coll) - throws IOException { - - // If an TwitterCollector is passed in, we can do a few extra things in here, such - // as early termination. Otherwise we can just fall back to IndexSearcher.search(). - if (coll instanceof TwitterCollector) { - TwitterCollector collector = (TwitterCollector) coll; - - for (LeafReaderContext ctx : leaves) { // search each subreader - if (collector.isTerminated()) { - return; - } - - // Notify the collector that we're starting this segment, and check for early - // termination criteria again. setNextReader() performs 'expensive' early - // termination checks in some implementations such as TwitterEarlyTerminationCollector. - LeafCollector leafCollector = collector.getLeafCollector(ctx); - if (collector.isTerminated()) { - return; - } - - // Initialize the scorer - it should not be null. Note that constructing the scorer - // may actually do real work, such as advancing to the first hit. - Scorer scorer = weight.scorer(ctx); - - if (scorer == null) { - collector.finishSegment(DocIdSetIterator.NO_MORE_DOCS); - continue; - } - - leafCollector.setScorer(scorer); - - // Start searching. - DocIdSetIterator docIdSetIterator = scorer.iterator(); - int docID = docIdSetIterator.nextDoc(); - if (docID != DocIdSetIterator.NO_MORE_DOCS) { - // Collect results. Note: check isTerminated() before calling nextDoc(). - do { - leafCollector.collect(docID); - } while (!collector.isTerminated() - && (docID = docIdSetIterator.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS); - } - - // Always finish the segment, providing the last docID advanced to. 
- collector.finishSegment(docID); - } - } else { - // The collector given is not a TwitterCollector, just use stock lucene search(). - super.search(leaves, weight, coll); - } - } - - /** Returns {@link NumericDocValues} for this field, or - * null if no {@link NumericDocValues} were indexed for - * this field. The returned instance should only be - * used by a single thread. */ - public NumericDocValues getNumericDocValues(String field) throws IOException { - return MultiDocValues.getNumericValues(getIndexReader(), field); - } - - @Override - public CollectionStatistics collectionStatistics(String field) throws IOException { - return collectionStatistics(field, getIndexReader()); - } - - @Override - public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) { - return termStats(term, docFreq, totalTermFreq); - } - - /** - * Lucene relies on the fact that maxDocID is typically equal to the number of documents in the - * index, which is false when we have sparse doc IDs or when we start from 8 million docs and - * decrement, so in this class we pass in numDocs instead of the maximum assigned document ID. - * Note that the comment on {@link CollectionStatistics#maxDoc()} says that it returns the number - * of documents in the segment, not the maximum ID, and that it is only used this way. This is - * necessary for all lucene scoring methods, e.g. - * {@link org.apache.lucene.search.similarities.TFIDFSimilarity#idfExplain}. This method body is - * largely copied from {@link IndexSearcher#collectionStatistics(String)}. 
- */ - public static CollectionStatistics collectionStatistics(String field, IndexReader indexReader) - throws IOException { - Preconditions.checkNotNull(field); - - int docsWithField = 0; - long sumTotalTermFreq = 0; - long sumDocFreq = 0; - for (LeafReaderContext leaf : indexReader.leaves()) { - Terms terms = leaf.reader().terms(field); - if (terms == null) { - continue; - } - - docsWithField += terms.getDocCount(); - sumTotalTermFreq += terms.getSumTotalTermFreq(); - sumDocFreq += terms.getSumDocFreq(); - } - - if (docsWithField == 0) { - // The CollectionStatistics API in Lucene is designed poorly. On one hand, starting with - // Lucene 8.0.0, searchers are expected to always produce valid CollectionStatistics instances - // and all int fields in these instances are expected to be strictly greater than 0. On the - // other hand, Lucene itself produces null CollectionStatistics instances in a few places. - // Also, there's no good placeholder value to indicate that a field is empty, which is a very - // reasonable thing to happen (for example, the first few tweets in a new segment might not - // have any links, so then the resolved_links_text would be empty). So to get around this - // issue, we do here what Lucene does: we return a CollectionStatistics instance with all - // fields set to 1. - return new CollectionStatistics(field, 1, 1, 1, 1); - } - - // The writer could have added more docs to the index since this searcher started processing - // this request, or could be in the middle of adding a doc, which could mean that only some of - // the docsWithField, sumTotalTermFreq and sumDocFreq stats have been updated. I don't think - // this is a big deal, as these stats are only used for computing a hit's score, and minor - // inaccuracies should have very little effect on a hit's final score. But CollectionStatistic's - // constructor has some strict asserts for the relationship between these stats. 
So we need to - // make sure we cap the values of these stats appropriately. - // - // Adjust numDocs based on docsWithField (instead of doing the opposite), because: - // 1. If new documents were added to this segment after the reader was created, it seems - // reasonable to take the more recent information into account. - // 2. The termStats() method below will return the most recent docFreq (not the value that - // docFreq was set to when this reader was created). If this value is higher than numDocs, - // then Lucene might end up producing negative scores, which must never happen. - int numDocs = Math.max(indexReader.numDocs(), docsWithField); - sumDocFreq = Math.max(sumDocFreq, docsWithField); - sumTotalTermFreq = Math.max(sumTotalTermFreq, sumDocFreq); - return new CollectionStatistics(field, numDocs, docsWithField, sumTotalTermFreq, sumDocFreq); - } - - /** - * This method body is largely copied from {@link IndexSearcher#termStatistics(Term, int, long)}. - * The only difference is that we make sure all parameters we pass to the TermStatistics instance - * we create are set to at least 1 (because Lucene 8.0.0 expects them to be). - */ - public static TermStatistics termStats(Term term, int docFreq, long totalTermFreq) { - // Lucene expects the doc frequency and total term frequency to be at least 1. This assumption - // doesn't always make sense (the segment can be empty -- see comment above), but to make Lucene - // happy, make sure to always set these parameters to at least 1. 
- int adjustedDocFreq = Math.max(docFreq, 1); - return new TermStatistics( - term.bytes(), - adjustedDocFreq, - Math.max(totalTermFreq, adjustedDocFreq)); - } -} diff --git a/src/java/com/twitter/search/common/search/termination/BUILD b/src/java/com/twitter/search/common/search/termination/BUILD deleted file mode 100644 index 913bb480e..000000000 --- a/src/java/com/twitter/search/common/search/termination/BUILD +++ /dev/null @@ -1,20 +0,0 @@ -java_library( - name = "termination", - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/query", - "src/java/com/twitter/search/common/search", - "src/thrift/com/twitter/search:earlybird-java", - ], -) diff --git a/src/java/com/twitter/search/common/search/termination/BUILD.docx b/src/java/com/twitter/search/common/search/termination/BUILD.docx new file mode 100644 index 000000000..f784a73c8 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeout.docx b/src/java/com/twitter/search/common/search/termination/QueryTimeout.docx new file mode 100644 index 000000000..a15461c79 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/QueryTimeout.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeout.java b/src/java/com/twitter/search/common/search/termination/QueryTimeout.java deleted file mode 100644 index 52ffa2b54..000000000 --- 
a/src/java/com/twitter/search/common/search/termination/QueryTimeout.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.common.search.termination; - -import com.twitter.search.common.search.DocIdTracker; - -/** - * QueryTimeout provides a method for early termination of queries. - */ -public interface QueryTimeout { - /** - * Returns true if query processing should terminate, otherwise false. - */ - boolean shouldExit(); - - /** - * Register a DocIdTracker for the scope of the query, to determine the last fully-searched - * doc ID after early termination. - */ - void registerDocIdTracker(DocIdTracker docIdTracker); - - /** - * Return client ID of query. - */ - String getClientId(); -} diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.docx b/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.docx new file mode 100644 index 000000000..8cec84e45 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.java b/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.java deleted file mode 100644 index 8ac2e0ec7..000000000 --- a/src/java/com/twitter/search/common/search/termination/QueryTimeoutFactory.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.search.common.search.termination; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; - -public class QueryTimeoutFactory { - /** - * Creates a QueryTimeout instance for a given EarlybirdRequest and TerminationTracker, if the - * required conditions for leaf-level timeout checking are met. Returns null otherwise. 
- * - * The conditions are: - * 1) CollectorTerminationParams.isEnforceQueryTimeout() - * 2) CollectorTerminationParams.isSetTimeoutMs() - */ - public QueryTimeout createQueryTimeout( - EarlybirdRequest request, - TerminationTracker tracker, - Clock clock) { - if (tracker != null - && request != null - && request.isSetSearchQuery() - && request.getSearchQuery().isSetCollectorParams() - && request.getSearchQuery().getCollectorParams().isSetTerminationParams() - && request.getSearchQuery().getCollectorParams().getTerminationParams() - .isEnforceQueryTimeout() - && request.getSearchQuery().getCollectorParams().getTerminationParams() - .isSetTimeoutMs()) { - return new QueryTimeoutImpl(request.getClientId(), tracker, clock); - } else { - return null; - } - } -} diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.docx b/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.docx new file mode 100644 index 000000000..cadeebb02 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.java b/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.java deleted file mode 100644 index 252b57db1..000000000 --- a/src/java/com/twitter/search/common/search/termination/QueryTimeoutImpl.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.search.common.search.termination; - -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.search.DocIdTracker; -import com.twitter.search.common.search.EarlyTerminationState; -import com.twitter.search.common.search.TerminationTracker; - -/** - * QueryTimeoutImpl provides a method for early termination of queries based on time. 
- */ -public class QueryTimeoutImpl implements QueryTimeout { - private final String clientId; - private final TerminationTracker tracker; - private final Clock clock; - - private final SearchRateCounter shouldTerminateCounter; - - public QueryTimeoutImpl(String clientId, TerminationTracker tracker, Clock clock) { - this.clientId = Preconditions.checkNotNull(clientId); - this.tracker = Preconditions.checkNotNull(tracker); - this.clock = Preconditions.checkNotNull(clock); - shouldTerminateCounter = - SearchRateCounter.export("query_timeout_should_terminate_" + clientId); - } - - /** - * Returns true when the clock's time has met or exceeded the tracker's timeout end. - */ - public boolean shouldExit() { - if (clock.nowMillis() >= tracker.getTimeoutEndTimeWithReservation()) { - tracker.setEarlyTerminationState(EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED); - shouldTerminateCounter.increment(); - return true; - } - return false; - } - - @Override - public void registerDocIdTracker(DocIdTracker docIdTracker) { - tracker.addDocIdTracker(docIdTracker); - } - - @Override - public String getClientId() { - return clientId; - } - - @Override - public int hashCode() { - return clientId.hashCode() * 13 + tracker.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof QueryTimeoutImpl)) { - return false; - } - - QueryTimeoutImpl queryTimeout = QueryTimeoutImpl.class.cast(obj); - return clientId.equals(queryTimeout.clientId) && tracker.equals(queryTimeout.tracker); - } -} diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQuery.docx b/src/java/com/twitter/search/common/search/termination/TerminationQuery.docx new file mode 100644 index 000000000..f5e172cb7 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/TerminationQuery.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQuery.java 
b/src/java/com/twitter/search/common/search/termination/TerminationQuery.java deleted file mode 100644 index a91ae074a..000000000 --- a/src/java/com/twitter/search/common/search/termination/TerminationQuery.java +++ /dev/null @@ -1,66 +0,0 @@ -package com.twitter.search.common.search.termination; - -import java.io.IOException; -import java.util.Arrays; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -/** - * Query implementation that can timeout and return non-exhaustive results. - */ -public class TerminationQuery extends Query { - private final Query inner; - private final QueryTimeout timeout; - - public TerminationQuery(Query inner, QueryTimeout timeout) { - this.inner = Preconditions.checkNotNull(inner); - this.timeout = Preconditions.checkNotNull(timeout); - } - - @Override - public Weight createWeight( - IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { - Weight innerWeight = inner.createWeight(searcher, scoreMode, boost); - return new TerminationQueryWeight(this, innerWeight, timeout); - } - - @Override - public Query rewrite(IndexReader reader) throws IOException { - Query rewritten = inner.rewrite(reader); - if (rewritten != inner) { - return new TerminationQuery(rewritten, timeout); - } - return this; - } - - public QueryTimeout getTimeout() { - return timeout; - } - - @Override - public int hashCode() { - return Arrays.hashCode(new Object[] {inner, timeout}); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof TerminationQuery)) { - return false; - } - - TerminationQuery terminationQuery = TerminationQuery.class.cast(obj); - return Arrays.equals(new Object[] {inner, timeout}, - new Object[] {terminationQuery.inner, terminationQuery.timeout}); - } - - @Override - public String 
toString(String field) { - return inner.toString(field); - } -} diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.docx b/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.docx new file mode 100644 index 000000000..21788e5c4 Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.java b/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.java deleted file mode 100644 index d6d8af04d..000000000 --- a/src/java/com/twitter/search/common/search/termination/TerminationQueryScorer.java +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.search.common.search.termination; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.query.FilteredScorer; -import com.twitter.search.common.search.DocIdTracker; - -/** - * Scorer implementation that adds termination support for an underlying query. - * Meant to be used in conjunction with {@link TerminationQuery}. 
- */ -public class TerminationQueryScorer extends FilteredScorer implements DocIdTracker { - private final QueryTimeout timeout; - private int lastSearchedDocId = -1; - - TerminationQueryScorer(Weight weight, Scorer inner, QueryTimeout timeout) { - super(weight, inner); - this.timeout = Preconditions.checkNotNull(timeout); - this.timeout.registerDocIdTracker(this); - SearchRateCounter.export( - timeout.getClientId() + "_num_termination_query_scorers_created").increment(); - } - - @Override - public DocIdSetIterator iterator() { - final DocIdSetIterator superDISI = super.iterator(); - return new DocIdSetIterator() { - // lastSearchedDocId is the ID of the last document that was traversed in the posting list. - // docId is the current doc ID in this iterator. In most cases, lastSearchedDocId and docId - // will be equal. They will be different only if the query needed to be terminated based on - // the timeout. In that case, docId will be set to NO_MORE_DOCS, but lastSearchedDocId will - // still be set to the last document that was actually traversed. 
- private int docId = -1; - - @Override - public int docID() { - return docId; - } - - @Override - public int nextDoc() throws IOException { - if (docId == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } - - if (timeout.shouldExit()) { - docId = NO_MORE_DOCS; - } else { - docId = superDISI.nextDoc(); - lastSearchedDocId = docId; - } - return docId; - } - - @Override - public int advance(int target) throws IOException { - if (docId == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } - - if (target == NO_MORE_DOCS) { - docId = NO_MORE_DOCS; - lastSearchedDocId = docId; - } else if (timeout.shouldExit()) { - docId = NO_MORE_DOCS; - } else { - docId = superDISI.advance(target); - lastSearchedDocId = docId; - } - return docId; - } - - @Override - public long cost() { - return superDISI.cost(); - } - }; - } - - @Override - public int getCurrentDocId() { - return lastSearchedDocId; - } -} diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.docx b/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.docx new file mode 100644 index 000000000..e40e9a22f Binary files /dev/null and b/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.docx differ diff --git a/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.java b/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.java deleted file mode 100644 index 41aee0e7b..000000000 --- a/src/java/com/twitter/search/common/search/termination/TerminationQueryWeight.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.search.common.search.termination; - -import java.io.IOException; -import java.util.Set; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.Weight; - -/** - * Weight implementation that adds 
termination support for an underlying query. - * Meant to be used in conjunction with {@link TerminationQuery}. - */ -public class TerminationQueryWeight extends Weight { - private final Weight inner; - private final QueryTimeout timeout; - - TerminationQueryWeight(TerminationQuery query, Weight inner, QueryTimeout timeout) { - super(query); - this.inner = inner; - this.timeout = Preconditions.checkNotNull(timeout); - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) - throws IOException { - return inner.explain(context, doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - Scorer innerScorer = inner.scorer(context); - if (innerScorer != null) { - return new TerminationQueryScorer(this, innerScorer, timeout); - } - - return null; - } - - @Override - public void extractTerms(Set terms) { - inner.extractTerms(terms); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return inner.isCacheable(ctx); - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/BUILD b/src/java/com/twitter/search/common/util/earlybird/BUILD deleted file mode 100644 index ac7f561d9..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/BUILD +++ /dev/null @@ -1,32 +0,0 @@ -java_library( - sources = ["**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - 
"src/java/com/twitter/search/common/relevance:ranking", - "src/java/com/twitter/search/common/relevance:text", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/runtime", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/thrift/com/twitter/search:earlybird-java", - "src/thrift/com/twitter/search/adaptive:adaptive-results-java", - "src/thrift/com/twitter/search/common:constants-java", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:query-java", - "src/thrift/com/twitter/search/common:ranking-java", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/common/util/earlybird/BUILD.docx b/src/java/com/twitter/search/common/util/earlybird/BUILD.docx new file mode 100644 index 000000000..52e600761 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.docx b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.docx new file mode 100644 index 000000000..e545330d9 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.java b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.java deleted file mode 100644 index c41003e7d..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseMergeUtil.java +++ /dev/null @@ -1,269 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ExecutionException; - -import com.google.common.base.Preconditions; -import com.google.common.cache.LoadingCache; -import com.google.common.collect.ImmutableMap; 
-import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftTweetSource; - -/** - * Utility methods to merge EarlybirdResponses. - */ -public final class EarlybirdResponseMergeUtil { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdResponseMergeUtil.class); - - private static final String INVALID_RESPONSE_STATS_PREFIX = "invalid_response_stats_"; - - // Stats for invalid earlybird response - private static final ImmutableMap ERROR_EXCEPTIONS; - - public static final SearchCounter NULL_RESPONSE_COUNTER = - SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "null_response"); - public static final SearchCounter SEARCH_RESULTS_NOT_SET_COUNTER = - SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_not_set"); - public static final SearchCounter SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER = - SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "search_results_with_results_not_set"); - public static final SearchCounter MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER = - SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "max_searched_status_id_not_set"); - public static final SearchCounter MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER = - SearchCounter.export(INVALID_RESPONSE_STATS_PREFIX + "min_searched_status_id_not_set"); - - static { - ImmutableMap.Builder builder = ImmutableMap.builder(); - - for (EarlybirdResponseCode responseCode : EarlybirdResponseCode.values()) { - 
if (responseCode != EarlybirdResponseCode.SUCCESS) { - builder.put(responseCode, SearchCounter.export( - INVALID_RESPONSE_STATS_PREFIX + responseCode.name().toLowerCase())); - } - } - - ERROR_EXCEPTIONS = builder.build(); - } - - private EarlybirdResponseMergeUtil() { - } - - /** - * Tags the results in the given EarlybirdResponse with the given ThriftTweetSource and adds them - * to the given list of results. - * - * @param results The list of results to which the new results will be added. - * @param earlybirdResponse The EarlybirdResponse whose results will be added to {@code results}. - * @param tweetSource The ThriftTweetSource that will be used to mark all results in - * {@code earlybirdResponse}. - * @return {@code false} if {@code earlybirdResponse} is {@code null} or doesn't have any results; - * {@code true}, otherwise. - */ - public static boolean addResultsToList(List results, - EarlybirdResponse earlybirdResponse, - ThriftTweetSource tweetSource) { - return EarlybirdResponseUtil.hasResults(earlybirdResponse) - && addResultsToList(results, - earlybirdResponse.getSearchResults().getResults(), - tweetSource); - } - - /** - * Tags the results in the given list with the given ThriftTweetSource and adds them to the given - * list of results. - * - * @param results The list of results to which the new results will be added. - * @param resultsToAdd The list of results to add. - * @param tweetSource The ThriftTweetSource that will be used to mark all results in - * {@code resultsToAdd}. - * @return {@code false} if {@code results} is {@code null} or if {@code resultsToAdd} is - * {@code null} or doesn't have any results; {@code true}, otherwise. 
- */ - public static boolean addResultsToList(List results, - List resultsToAdd, - ThriftTweetSource tweetSource) { - Preconditions.checkNotNull(results); - if ((resultsToAdd == null) || resultsToAdd.isEmpty()) { - return false; - } - - markWithTweetSource(resultsToAdd, tweetSource); - - results.addAll(resultsToAdd); - return true; - } - - /** - * Distinct the input ThriftSearchResult by its status id. If there are duplicates, the first - * instance of the duplicates is returned in the distinct result. If the distinct result is the - * same as the input result, the initial input result is returned; otherwise, the distinct result - * is returned. - * - * @param results the input result - * @param dupsStats stats counter track duplicates source - * @return the input result if there is no duplicate; otherwise, return the distinct result - */ - public static List distinctByStatusId( - List results, - LoadingCache, SearchCounter> dupsStats) { - Map seenStatusIdToSourceMap = new HashMap<>(); - List distinctResults = Lists.newArrayListWithCapacity(results.size()); - for (ThriftSearchResult result : results) { - if (seenStatusIdToSourceMap.containsKey(result.getId())) { - ThriftTweetSource source1 = seenStatusIdToSourceMap.get(result.getId()); - ThriftTweetSource source2 = result.getTweetSource(); - if (source1 != null && source2 != null) { - try { - dupsStats.get(Pair.of(source1, source2)).increment(); - } catch (ExecutionException e) { - LOG.warn("Could not increment stat for duplicate results from clusters " + source1 - + " and " + source2, e); - } - } - } else { - distinctResults.add(result); - seenStatusIdToSourceMap.put(result.getId(), result.getTweetSource()); - } - } - return results.size() == distinctResults.size() ? results : distinctResults; - } - - /** - * Tags the given results with the given ThriftTweetSource. - * - * @param results The results to be tagged. - * @param tweetSource The ThriftTweetSource to be used to tag the given results. 
- */ - public static void markWithTweetSource(List results, - ThriftTweetSource tweetSource) { - if (results != null) { - for (ThriftSearchResult result : results) { - result.setTweetSource(tweetSource); - } - } - } - - /** - * Check if an Earlybird response is valid - */ - public static boolean isValidResponse(final EarlybirdResponse response) { - if (response == null) { - NULL_RESPONSE_COUNTER.increment(); - return false; - } - - if (!EarlybirdResponseUtil.isSuccessfulResponse(response)) { - return false; - } - - if (!response.isSetSearchResults()) { - SEARCH_RESULTS_NOT_SET_COUNTER.increment(); - return true; - } - - if (!response.getSearchResults().isSetResults()) { - SEARCH_RESULTS_WITH_RESULTS_NOT_SET_COUNTER.increment(); - } - - // In earlybird, when earlybird terminated, e.g., time out, complex queries - we don't set the - // min/max searched status id. - boolean isEarlyTerminated = response.isSetEarlyTerminationInfo() - && response.getEarlyTerminationInfo().isEarlyTerminated(); - - if (!isEarlyTerminated && !response.getSearchResults().isSetMinSearchedStatusID()) { - MIN_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment(); - } - - if (!isEarlyTerminated && !response.getSearchResults().isSetMaxSearchedStatusID()) { - MAX_SEARCHED_STATUS_ID_NOT_SET_COUNTER.increment(); - } - - return true; - } - - /** - * For invalid successful Earlybird Response, return a failed response with debug msg. 
- */ - public static EarlybirdResponse transformInvalidResponse(final EarlybirdResponse response, - final String debugMsg) { - if (response == null) { - return failedEarlybirdResponse(EarlybirdResponseCode.PERSISTENT_ERROR, - debugMsg + ", msg: null response from downstream"); - } - Preconditions.checkState(response.getResponseCode() != EarlybirdResponseCode.SUCCESS); - - EarlybirdResponseCode newResponseCode; - EarlybirdResponseCode responseCode = response.getResponseCode(); - switch (responseCode) { - case TIER_SKIPPED: - ERROR_EXCEPTIONS.get(responseCode).increment(); - return response; - case REQUEST_BLOCKED_ERROR: - case CLIENT_ERROR: - case SERVER_TIMEOUT_ERROR: - case QUOTA_EXCEEDED_ERROR: - case CLIENT_CANCEL_ERROR: - case TOO_MANY_PARTITIONS_FAILED_ERROR: - ERROR_EXCEPTIONS.get(responseCode).increment(); - newResponseCode = responseCode; - break; - default: - ERROR_EXCEPTIONS.get(responseCode).increment(); - newResponseCode = EarlybirdResponseCode.PERSISTENT_ERROR; - } - - String newDebugMsg = debugMsg + ", downstream response code: " + responseCode - + (response.isSetDebugString() ? ", downstream msg: " + response.getDebugString() : ""); - - - return failedEarlybirdResponse(newResponseCode, newDebugMsg); - } - - /** - * Create a new EarlybirdResponse with debug msg - */ - public static EarlybirdResponse failedEarlybirdResponse(final EarlybirdResponseCode responseCode, - final String debugMsg) { - EarlybirdResponse failedResponse = new EarlybirdResponse(); - failedResponse.setResponseCode(responseCode); - failedResponse.setDebugString(debugMsg); - return failedResponse; - } - - /** - * Returns the number of results to keep as part of merge-collection. Recency mode should ignore - * relevance options. In particular, the flag returnAllResults inside relevance options. 
- */ - public static int computeNumResultsToKeep(EarlybirdRequest request) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - - if (searchQuery.getRankingMode() != ThriftSearchRankingMode.RECENCY - && searchQuery.isSetRelevanceOptions() - && searchQuery.getRelevanceOptions().isReturnAllResults()) { - return Integer.MAX_VALUE; - } - - if (request.isSetNumResultsToReturnAtRoot()) { - return request.getNumResultsToReturnAtRoot(); - } - - if (searchQuery.isSetCollectorParams()) { - return searchQuery.getCollectorParams().getNumResultsToReturn(); - } - - return searchQuery.getNumResults(); - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.docx b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.docx new file mode 100644 index 000000000..3aff400ad Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.java b/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.java deleted file mode 100644 index 51c81edfa..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/EarlybirdResponseUtil.java +++ /dev/null @@ -1,204 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -import com.google.common.base.Preconditions; - -import com.twitter.search.adaptive.adaptive_results.thriftjava.TweetSource; -import com.twitter.search.common.logging.ObjectKey; -import com.twitter.search.common.runtime.DebugManager; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; 
-import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTweetSource; - -/** Utility methods that work on EarlybirdResponses. */ -public final class EarlybirdResponseUtil { - private EarlybirdResponseUtil() { - } - - /** - * Returns the results in the given EarlybirdResponse. - * - * @param response The EarlybirdResponse. - * @return The results in the given EarlybirdResponse, or {@code null} if the response is - * {@code null} or the results are not set. - */ - public static ThriftSearchResults getResults(EarlybirdResponse response) { - if ((response == null) || !response.isSetSearchResults()) { - return null; - } - - return response.getSearchResults(); - } - - /** - * Determines if the given EarlybirdResponse has results. - * - * @param response The EarlybirdResponse. - * @return {@code true} if the given EarlybirdResponse has results; {@code false} otherwise. - */ - public static boolean hasResults(EarlybirdResponse response) { - ThriftSearchResults results = getResults(response); - return (results != null) && results.isSetResults() && !results.getResults().isEmpty(); - } - - /** - * Returns the number of results in the given EarlybirdResponse. - * - * @param response The EarlybirdResponse. - * @return The number of results in the given EarlybirdResponse. - */ - public static int getNumResults(EarlybirdResponse response) { - return hasResults(response) ? response.getSearchResults().getResultsSize() : 0; - } - - /** - * Determines the response is early-terminated. - * - * @param response The EarlybirdResponse. - * @return {@code true} if the response is early-terminated; {@code false} otherwise. 
- */ - public static boolean isEarlyTerminated(EarlybirdResponse response) { - Preconditions.checkNotNull(response); - return response.isSetEarlyTerminationInfo() - && response.getEarlyTerminationInfo().isEarlyTerminated(); - } - - /** - * Returns if the response should be considered failed for purposes of stats and logging. - */ - public static boolean responseConsideredFailed(EarlybirdResponseCode code) { - return code != EarlybirdResponseCode.SUCCESS - && code != EarlybirdResponseCode.REQUEST_BLOCKED_ERROR - && code != EarlybirdResponseCode.TIER_SKIPPED; - } - - /** - * Extract results from Earlybird response. - */ - public static List extractResultsFromEarlybirdResponse( - EarlybirdResponse response) { - return hasResults(response) - ? response.getSearchResults().getResults() : Collections.emptyList(); - } - - /** - * Log the Earlybird response as a candidate source. - */ - public static EarlybirdResponse debugLogAsCandidateSource( - EarlybirdResponse response, TweetSource tweetSource) { - List results = extractResultsFromEarlybirdResponse(response); - debugLogAsCandidateSourceHelper(results, tweetSource); - return response; - } - - /** - * Log a list of ThriftSearchResult as a candidate source. 
- */ - public static List debugLogAsCandidateSource( - List results, TweetSource tweetSource) { - debugLogAsCandidateSourceHelper(results, tweetSource); - return results; - } - - private static void debugLogAsCandidateSourceHelper( - List results, TweetSource tweetSource) { - // debug message for Earlybird relevance candidate source - List strIds = results - .stream() - .map(ThriftSearchResult::getId) - .map(Object::toString) - .collect(Collectors.toList()); - ObjectKey debugMsgKey = ObjectKey.createTweetCandidateSourceKey( - tweetSource.name()); - DebugManager.perObjectBasic( - debugMsgKey, - String.format("[%s][%s] results: %s", debugMsgKey.getType(), debugMsgKey.getId(), strIds)); - } - - /** - * Extract the real time response from an existing response - */ - public static EarlybirdResponse extractRealtimeResponse(EarlybirdResponse response) { - EarlybirdResponse realtimeResponse = response.deepCopy(); - if (EarlybirdResponseUtil.hasResults(response)) { - List realtimeResults = realtimeResponse.getSearchResults().getResults(); - realtimeResults.clear(); - for (ThriftSearchResult result : response.getSearchResults().getResults()) { - if (result.getTweetSource() == ThriftTweetSource.REALTIME_CLUSTER) { - realtimeResults.add(result); - } - } - } - - return realtimeResponse; - } - - /** - * Returns an EarlybirdResponse that should be returned by roots when a tier was skipped. - * - * @param minId The minSearchedStatusID to be set on the response. - * @param maxId The maxSearchedStatusID to be set on the response. - * @param debugMsg The debug message to be set on the response. - * @return A response that should be returned by roots when a tier was skipped. 
- */ - public static EarlybirdResponse tierSkippedRootResponse(long minId, long maxId, String debugMsg) { - return new EarlybirdResponse(EarlybirdResponseCode.SUCCESS, 0) - .setSearchResults(new ThriftSearchResults() - .setResults(new ArrayList<>()) - .setMinSearchedStatusID(minId) - .setMaxSearchedStatusID(maxId)) - .setDebugString(debugMsg); - } - - /** - * Determines if the given response is a success response. - * - * A response is considered successful if it's not null and has either a SUCCESS, TIER_SKIPPED or - * REQUEST_BLOCKED_ERROR response code. - * - * @param response The response to check. - * @return Whether the given response is successful or not. - */ - public static boolean isSuccessfulResponse(EarlybirdResponse response) { - return response != null - && (response.getResponseCode() == EarlybirdResponseCode.SUCCESS - || response.getResponseCode() == EarlybirdResponseCode.TIER_SKIPPED - || response.getResponseCode() == EarlybirdResponseCode.REQUEST_BLOCKED_ERROR); - } - - /** - * Finds all unexpected nullcast statuses within the given result. A nullcast status is - * unexpected iff: - * 1. the tweet is a nullcast tweet. - * 2. 
the tweet is NOT explicitly requested with {@link ThriftSearchQuery#searchStatusIds} - */ - public static Set findUnexpectedNullcastStatusIds( - ThriftSearchResults thriftSearchResults, EarlybirdRequest request) { - Set statusIds = new HashSet<>(); - for (ThriftSearchResult result : thriftSearchResults.getResults()) { - if (resultIsNullcast(result) && !isSearchStatusId(request, result.getId())) { - statusIds.add(result.getId()); - } - } - return statusIds; - } - - private static boolean isSearchStatusId(EarlybirdRequest request, long id) { - return request.getSearchQuery().isSetSearchStatusIds() - && request.getSearchQuery().getSearchStatusIds().contains(id); - } - - private static boolean resultIsNullcast(ThriftSearchResult result) { - return result.isSetMetadata() && result.getMetadata().isIsNullcast(); - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.docx b/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.docx new file mode 100644 index 000000000..7f45b870b Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.java b/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.java deleted file mode 100644 index 43d5732e4..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/FacetsResultsUtils.java +++ /dev/null @@ -1,495 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import 
com.twitter.search.common.logging.DebugMessageBuilder; -import com.twitter.search.common.ranking.thriftjava.ThriftFacetFinalSortOrder; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftFacetRankingMode; -import com.twitter.search.earlybird.thrift.ThriftFacetRequest; -import com.twitter.search.earlybird.thrift.ThriftFacetResults; -import com.twitter.search.earlybird.thrift.ThriftTermResults; - -/** - * A utility class to provide some functions for facets results processing. - */ -public final class FacetsResultsUtils { - - private static final Logger LOG = LoggerFactory.getLogger(FacetsResultsUtils.class); - - private FacetsResultsUtils() { - } - - public static class FacetFieldInfo { - public ThriftFacetFieldRequest fieldRequest; - public int totalCounts; - public Map topFacets; - public List> languageHistogramEntries = Lists.newLinkedList(); - } - - // Only return top languages in the language histogram which sum up to at least this much - // ratio, here we get first 80 percentiles. - public static final double MIN_PERCENTAGE_SUM_REQUIRED = 0.8; - // if a language ratio is over this number, we already return. - public static final double MIN_PERCENTAGE = 0.01; - - /** - * Prepare facet fields with empty entries and check if we need termStats for filtering. - * Returns true if termStats filtering is needed (thus the termStats servie call). - * @param facetRequest The related facet request. - * @param facetFieldInfoMap The facet field info map to fill, a map from facet type to the facet - * fiels results info. 
- * @return {@code true} if termstats request is needed afterwards. - */ - public static boolean prepareFieldInfoMap( - ThriftFacetRequest facetRequest, - final Map facetFieldInfoMap) { - boolean termStatsFilteringMode = false; - - for (ThriftFacetFieldRequest fieldRequest : facetRequest.getFacetFields()) { - FacetsResultsUtils.FacetFieldInfo info = new FacetsResultsUtils.FacetFieldInfo(); - info.fieldRequest = fieldRequest; - facetFieldInfoMap.put(fieldRequest.getFieldName(), info); - if (fieldRequest.getRankingMode() == ThriftFacetRankingMode.FILTER_WITH_TERM_STATISTICS) { - termStatsFilteringMode = true; - } - } - - return termStatsFilteringMode; - } - - /** - * Extract information from one ThriftFacetResults into facetFieldInfoMap and userIDWhitelist. - * @param facetResults Related facets results. - * @param facetFieldInfoMap The facets field info map to fill, a map from facet type to the facet - * fiels results info. - * @param userIDWhitelist The user whitelist to fill. - */ - public static void fillFacetFieldInfo( - final ThriftFacetResults facetResults, - final Map facetFieldInfoMap, - final Set userIDWhitelist) { - - for (String facetField : facetResults.getFacetFields().keySet()) { - FacetsResultsUtils.FacetFieldInfo info = facetFieldInfoMap.get(facetField); - if (info.topFacets == null) { - info.topFacets = new HashMap<>(); - } - - ThriftFacetFieldResults results = facetResults.getFacetFields().get(facetField); - if (results.isSetLanguageHistogram()) { - info.languageHistogramEntries.addAll(results.getLanguageHistogram().entrySet()); - } - for (ThriftFacetCount newCount : results.getTopFacets()) { - ThriftFacetCount resultCount = info.topFacets.get(newCount.facetLabel); - if (resultCount == null) { - info.topFacets.put(newCount.facetLabel, new ThriftFacetCount(newCount)); - } else { - resultCount.setFacetCount(resultCount.facetCount + newCount.facetCount); - resultCount.setSimpleCount(resultCount.simpleCount + newCount.simpleCount); - 
resultCount.setWeightedCount(resultCount.weightedCount + newCount.weightedCount); - resultCount.setPenaltyCount(resultCount.penaltyCount + newCount.penaltyCount); - // this could pass the old metadata object back or a new merged one. - resultCount.setMetadata( - mergeFacetMetadata(resultCount.getMetadata(), newCount.getMetadata(), - userIDWhitelist)); - } - } - info.totalCounts += results.totalCount; - } - } - - /** - * Merge a metadata into an existing one. - * @param baseMetadata the metadata to merge into. - * @param metadataUpdate the new metadata to merge. - * @param userIDWhitelist user id whitelist to filter user id with. - * @return The updated metadata. - */ - public static ThriftFacetCountMetadata mergeFacetMetadata( - final ThriftFacetCountMetadata baseMetadata, - final ThriftFacetCountMetadata metadataUpdate, - final Set userIDWhitelist) { - ThriftFacetCountMetadata mergedMetadata = baseMetadata; - if (metadataUpdate != null) { - String mergedExplanation = null; - if (mergedMetadata != null) { - if (mergedMetadata.maxTweepCred < metadataUpdate.maxTweepCred) { - mergedMetadata.setMaxTweepCred(metadataUpdate.maxTweepCred); - } - - if (mergedMetadata.isSetExplanation()) { - mergedExplanation = mergedMetadata.getExplanation(); - if (metadataUpdate.isSetExplanation()) { - mergedExplanation += "\n" + metadataUpdate.getExplanation(); - } - } else if (metadataUpdate.isSetExplanation()) { - mergedExplanation = metadataUpdate.getExplanation(); - } - - if (mergedMetadata.getStatusId() == -1) { - if (LOG.isDebugEnabled()) { - LOG.debug("status id in facet count metadata is -1: " + mergedMetadata); - } - mergedMetadata = metadataUpdate; - } else if (metadataUpdate.getStatusId() != -1 - && metadataUpdate.getStatusId() < mergedMetadata.getStatusId()) { - // keep the oldest tweet, ie. 
the lowest status ID - mergedMetadata = metadataUpdate; - } else if (metadataUpdate.getStatusId() == mergedMetadata.getStatusId()) { - if (mergedMetadata.getTwitterUserId() == -1) { - // in this case we didn't find the user in a previous partition yet - // only update the user if the status id matches - mergedMetadata.setTwitterUserId(metadataUpdate.getTwitterUserId()); - mergedMetadata.setDontFilterUser(metadataUpdate.isDontFilterUser()); - } - if (!mergedMetadata.isSetStatusLanguage()) { - mergedMetadata.setStatusLanguage(metadataUpdate.getStatusLanguage()); - } - } - if (!mergedMetadata.isSetNativePhotoUrl() && metadataUpdate.isSetNativePhotoUrl()) { - mergedMetadata.setNativePhotoUrl(metadataUpdate.getNativePhotoUrl()); - } - } else { - mergedMetadata = metadataUpdate; - } - - // this will not set an explanation if neither oldMetadata nor metadataUpdate - // had an explanation - if (mergedExplanation != null) { - mergedMetadata.setExplanation(mergedExplanation); - } - - if (userIDWhitelist != null) { - // result must not be null now because of the if above - if (mergedMetadata.getTwitterUserId() != -1 && !mergedMetadata.isDontFilterUser()) { - mergedMetadata.setDontFilterUser( - userIDWhitelist.contains(mergedMetadata.getTwitterUserId())); - } - } - } - - return mergedMetadata; - } - - /** - * Appends all twimg results to the image results. Optionally resorts the image results if - * a comparator is passed in. - * Also computes the sums of totalCount, totalScore, totalPenalty. 
- */ - public static void mergeTwimgResults(ThriftFacetResults facetResults, - Comparator optionalSortComparator) { - if (facetResults == null || !facetResults.isSetFacetFields()) { - return; - } - - ThriftFacetFieldResults imageResults = - facetResults.getFacetFields().get(EarlybirdFieldConstant.IMAGES_FACET); - ThriftFacetFieldResults twimgResults = - facetResults.getFacetFields().remove(EarlybirdFieldConstant.TWIMG_FACET); - if (imageResults == null) { - if (twimgResults != null) { - facetResults.getFacetFields().put(EarlybirdFieldConstant.IMAGES_FACET, twimgResults); - } - return; - } - - if (twimgResults != null) { - imageResults.setTotalCount(imageResults.getTotalCount() + twimgResults.getTotalCount()); - imageResults.setTotalPenalty(imageResults.getTotalPenalty() + twimgResults.getTotalPenalty()); - imageResults.setTotalScore(imageResults.getTotalScore() + twimgResults.getTotalScore()); - for (ThriftFacetCount count : twimgResults.getTopFacets()) { - imageResults.addToTopFacets(count); - } - if (optionalSortComparator != null) { - Collections.sort(imageResults.topFacets, optionalSortComparator); - } - } - } - - /** - * Dedup twimg facets. - * - * Twimg facet uses the status ID as the facet label, instead of the twimg URL, a.k.a. - * native photo URL. It is possible to have the same twimg URL appearing in two different - * facet label (RT style retweet? copy & paste the twimg URL?). Therefore, to dedup twimg - * facet correctly, we need to look at ThriftFacetCount.metadata.nativePhotoUrl - * - * @param dedupSet A set holding the native URLs from the twimg facetFieldResults. By having - * the caller passing in the set, it allows the caller to dedup the facet - * across different ThriftFacetFieldResults. 
- * @param facetFieldResults The twimg facet field results to be debupped - * @param debugMessageBuilder - */ - public static void dedupTwimgFacet(Set dedupSet, - ThriftFacetFieldResults facetFieldResults, - DebugMessageBuilder debugMessageBuilder) { - if (facetFieldResults == null || facetFieldResults.getTopFacets() == null) { - return; - } - - Iterator iterator = facetFieldResults.getTopFacetsIterator(); - - while (iterator.hasNext()) { - ThriftFacetCount count = iterator.next(); - if (count.isSetMetadata() && count.getMetadata().isSetNativePhotoUrl()) { - String nativeUrl = count.getMetadata().getNativePhotoUrl(); - - if (dedupSet.contains(nativeUrl)) { - iterator.remove(); - debugMessageBuilder.detailed("dedupTwimgFacet removed %s", nativeUrl); - } else { - dedupSet.add(nativeUrl); - } - } - } - - - } - - private static final class LanguageCount { - private final ThriftLanguage lang; - private final double count; - private LanguageCount(ThriftLanguage lang, double count) { - this.lang = lang; - this.count = count; - } - } - - /** - * Calculate the top languages and store them in the results. - */ - public static void fillTopLanguages(FacetsResultsUtils.FacetFieldInfo info, - final ThriftFacetFieldResults results) { - double sumForLanguage = 0.0; - double[] sums = new double[ThriftLanguage.values().length]; - for (Map.Entry entry : info.languageHistogramEntries) { - sumForLanguage += entry.getValue(); - if (entry.getKey() == null) { - // EB might be setting null key for unknown language. SEARCH-1294 - continue; - } - sums[entry.getKey().getValue()] += entry.getValue(); - } - if (sumForLanguage == 0.0) { - return; - } - List langCounts = new ArrayList<>(ThriftLanguage.values().length); - for (int i = 0; i < sums.length; i++) { - if (sums[i] > 0.0) { - // ThriftLanguage.findByValue() might return null, which should fall back to UNKNOWN. - ThriftLanguage lang = ThriftLanguage.findByValue(i); - lang = lang == null ? 
ThriftLanguage.UNKNOWN : lang; - langCounts.add(new LanguageCount(lang, sums[i])); - } - } - Collections.sort(langCounts, (left, right) -> Double.compare(right.count, left.count)); - double percentageSum = 0.0; - Map languageHistogramMap = - new HashMap<>(langCounts.size()); - int numAdded = 0; - for (LanguageCount langCount : langCounts) { - if (langCount.count == 0.0) { - break; - } - double percentage = langCount.count / sumForLanguage; - if (percentageSum > MIN_PERCENTAGE_SUM_REQUIRED - && percentage < MIN_PERCENTAGE && numAdded >= 3) { - break; - } - languageHistogramMap.put(langCount.lang, percentage); - percentageSum += percentage; - numAdded++; - } - results.setLanguageHistogram(languageHistogramMap); - } - - /** - * Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/". - * We need to do this because of blobstore and it's suppose to be a temporary measure. This - * code should be removed once we verified that all native photo URL being sent to Search - * are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains - * "p.twimg.com/" - * - * Please see SEARCH-783 and EVENTS-539 for more details. 
- * - * @param response response containing the facet results - */ - public static void fixNativePhotoUrl(EarlybirdResponse response) { - if (response == null - || !response.isSetFacetResults() - || !response.getFacetResults().isSetFacetFields()) { - return; - } - - for (Map.Entry facetMapEntry - : response.getFacetResults().getFacetFields().entrySet()) { - final String facetResultField = facetMapEntry.getKey(); - - if (EarlybirdFieldConstant.TWIMG_FACET.equals(facetResultField) - || EarlybirdFieldConstant.IMAGES_FACET.equals(facetResultField)) { - ThriftFacetFieldResults facetFieldResults = facetMapEntry.getValue(); - for (ThriftFacetCount facetCount : facetFieldResults.getTopFacets()) { - replacePhotoUrl(facetCount.getMetadata()); - } - } - } - } - - /** - * Replace "p.twimg.com/" part of the native photo (twimg) URL with "pbs.twimg.com/media/". - * We need to do this because of blobstore and it's suppose to be a temporary measure. This - * code should be removed once we verified that all native photo URL being sent to Search - * are prefixed with "pbs.twimg.com/media/" and no native photo URL in our index contains - * "p.twimg.com/" - * - * Please see SEARCH-783 and EVENTS-539 for more details. 
- * - * @param termResultsCollection collection of ThriftTermResults containing the native photo URL - */ - public static void fixNativePhotoUrl(Collection termResultsCollection) { - if (termResultsCollection == null) { - return; - } - - for (ThriftTermResults termResults : termResultsCollection) { - if (!termResults.isSetMetadata()) { - continue; - } - replacePhotoUrl(termResults.getMetadata()); - } - } - - /** - * Helper function for fixNativePhotoUrl() - */ - private static void replacePhotoUrl(ThriftFacetCountMetadata metadata) { - if (metadata != null - && metadata.isSetNativePhotoUrl()) { - String nativePhotoUrl = metadata.getNativePhotoUrl(); - nativePhotoUrl = nativePhotoUrl.replace("://p.twimg.com/", "://pbs.twimg.com/media/"); - metadata.setNativePhotoUrl(nativePhotoUrl); - } - } - - /** - * Deepcopy of an EarlybirdResponse without explanation - */ - public static EarlybirdResponse deepCopyWithoutExplanation(EarlybirdResponse facetsResponse) { - if (facetsResponse == null) { - return null; - } else if (!facetsResponse.isSetFacetResults() - || facetsResponse.getFacetResults().getFacetFieldsSize() == 0) { - return facetsResponse.deepCopy(); - } - EarlybirdResponse copy = facetsResponse.deepCopy(); - for (Map.Entry entry - : copy.getFacetResults().getFacetFields().entrySet()) { - if (entry.getValue().getTopFacetsSize() > 0) { - for (ThriftFacetCount fc : entry.getValue().getTopFacets()) { - fc.getMetadata().unsetExplanation(); - } - } - } - return copy; - } - - /** - * Returns a comparator used to compare facet counts by calling - * getFacetCountComparator(ThriftFacetFinalSortOrder). The sort order is determined by - * the facetRankingOptions on the facet request. 
- */ - public static Comparator getFacetCountComparator( - ThriftFacetRequest facetRequest) { - - ThriftFacetFinalSortOrder sortOrder = ThriftFacetFinalSortOrder.SCORE; - - if (facetRequest.isSetFacetRankingOptions() - && facetRequest.getFacetRankingOptions().isSetFinalSortOrder()) { - sortOrder = facetRequest.getFacetRankingOptions().getFinalSortOrder(); - } - - return getFacetCountComparator(sortOrder); - } - - /** - * Returns a comparator using the specified order. - */ - public static Comparator getFacetCountComparator( - ThriftFacetFinalSortOrder sortOrder) { - - switch (sortOrder) { - case SIMPLE_COUNT: return SIMPLE_COUNT_COMPARATOR; - case SCORE: return SCORE_COMPARATOR; - case CREATED_AT: return CREATED_AT_COMPARATOR; - case WEIGHTED_COUNT: return WEIGHTED_COUNT_COMPARATOR; - default: return SCORE_COMPARATOR; - } - } - - private static final Comparator SIMPLE_COUNT_COMPARATOR = - (count1, count2) -> { - if (count1.simpleCount > count2.simpleCount) { - return 1; - } else if (count1.simpleCount < count2.simpleCount) { - return -1; - } - - return count1.facetLabel.compareTo(count2.facetLabel); - }; - - private static final Comparator WEIGHTED_COUNT_COMPARATOR = - (count1, count2) -> { - if (count1.weightedCount > count2.weightedCount) { - return 1; - } else if (count1.weightedCount < count2.weightedCount) { - return -1; - } - - return SIMPLE_COUNT_COMPARATOR.compare(count1, count2); - }; - - private static final Comparator SCORE_COMPARATOR = - (count1, count2) -> { - if (count1.score > count2.score) { - return 1; - } else if (count1.score < count2.score) { - return -1; - } - return SIMPLE_COUNT_COMPARATOR.compare(count1, count2); - }; - - private static final Comparator CREATED_AT_COMPARATOR = - (count1, count2) -> { - if (count1.isSetMetadata() && count1.getMetadata().isSetCreated_at() - && count2.isSetMetadata() && count2.getMetadata().isSetCreated_at()) { - // more recent items have higher created_at values - if (count1.getMetadata().getCreated_at() > 
count2.getMetadata().getCreated_at()) { - return 1; - } else if (count1.getMetadata().getCreated_at() < count2.getMetadata().getCreated_at()) { - return -1; - } - } - - return SCORE_COMPARATOR.compare(count1, count2); - }; -} diff --git a/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.docx b/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.docx new file mode 100644 index 000000000..c1ceb98cc Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.java b/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.java deleted file mode 100644 index a0931a1da..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/ResponseMergerUtils.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.List; -import java.util.Set; - -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - -import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; - -public final class ResponseMergerUtils { - - // Utility class, disallow instantiation. - private ResponseMergerUtils() { - } - - /** - * Merges early termination infos from several earlybird responses. 
- * - * @param responses earlybird responses to merge the early termination infos from - * @return merged early termination info - */ - public static EarlyTerminationInfo mergeEarlyTerminationInfo(List responses) { - EarlyTerminationInfo etInfo = new EarlyTerminationInfo(false); - Set etReasonSet = Sets.newHashSet(); - // Fill in EarlyTerminationStatus - for (EarlybirdResponse ebResp : responses) { - if (ebResp.isSetEarlyTerminationInfo() - && ebResp.getEarlyTerminationInfo().isEarlyTerminated()) { - etInfo.setEarlyTerminated(true); - if (ebResp.getEarlyTerminationInfo().isSetEarlyTerminationReason()) { - etReasonSet.add(ebResp.getEarlyTerminationInfo().getEarlyTerminationReason()); - } - if (ebResp.getEarlyTerminationInfo().isSetMergedEarlyTerminationReasons()) { - etReasonSet.addAll(ebResp.getEarlyTerminationInfo().getMergedEarlyTerminationReasons()); - } - } - } - if (etInfo.isEarlyTerminated()) { - etInfo.setMergedEarlyTerminationReasons(Lists.newArrayList(etReasonSet)); - } - return etInfo; - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.docx b/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.docx new file mode 100644 index 000000000..890855c07 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.java b/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.java deleted file mode 100644 index e314ca553..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/ResultsUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.Map; - -import com.google.common.base.Function; -import com.google.common.collect.Iterables; -import com.google.common.collect.Maps; - -/** - * Utility class used to help merging results. 
- */ -public final class ResultsUtil { - private ResultsUtil() { } - - /** - * Aggregate a list of responses in the following way. - * 1. For each response, mapGetter can turn the response into a map. - * 2. Dump all entries from the above map into a "total" map, which accumulates entries from - * all the responses. - */ - public static Map aggregateCountMap( - Iterable responses, - Function> mapGetter) { - Map total = Maps.newHashMap(); - for (Map map : Iterables.transform(responses, mapGetter)) { - if (map != null) { - for (Map.Entry entry : map.entrySet()) { - T key = entry.getKey(); - total.put(key, total.containsKey(key) - ? total.get(key) + entry.getValue() : entry.getValue()); - } - } - } - return total; - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.docx b/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.docx new file mode 100644 index 000000000..3a67a0c3d Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.java b/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.java deleted file mode 100644 index e599d5cf3..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/TermStatisticsUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.concurrent.TimeUnit; - -import com.twitter.search.earlybird.thrift.ThriftHistogramSettings; - -/** - * A utility class to provide some functions for TermStatistics request processing - */ -public final class TermStatisticsUtil { - - private static final org.slf4j.Logger LOG = - org.slf4j.LoggerFactory.getLogger(TermStatisticsUtil.class); - - private TermStatisticsUtil() { - } - - /** - * Determine the binsize base on settings in ThriftHistogramSettings.granularity - */ - public static int determineBinSize(ThriftHistogramSettings histogramSettings) 
{ - final int DEFAULT_BINSIZE = (int) TimeUnit.HOURS.toSeconds(1); - int binSize; - switch (histogramSettings.getGranularity()) { - case DAYS: - binSize = (int) TimeUnit.DAYS.toSeconds(1); - break; - case HOURS: - binSize = (int) TimeUnit.HOURS.toSeconds(1); - break; - case MINUTES: - binSize = (int) TimeUnit.MINUTES.toSeconds(1); - break; - case CUSTOM: - binSize = histogramSettings.isSetBinSizeInSeconds() - ? histogramSettings.getBinSizeInSeconds() - : DEFAULT_BINSIZE; - break; - default: - binSize = DEFAULT_BINSIZE; - LOG.warn("Unknown ThriftHistogramGranularityType {} using default binsize: {}", - histogramSettings.getGranularity(), DEFAULT_BINSIZE); - } - - return binSize; - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.docx b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.docx new file mode 100644 index 000000000..d44f42381 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.java b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.java deleted file mode 100644 index 441349715..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchQueryUtil.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import com.twitter.search.common.query.thriftjava.CollectorParams; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; - -/** - * Utility class from constructing ThriftSearchQuery. - */ -public final class ThriftSearchQueryUtil { - private ThriftSearchQueryUtil() { } - - /** - * Convenience methods for constructing a ThriftSearchQuery. 
- */ - public static ThriftSearchQuery newSearchQuery(String serializedQuery, int numResults) { - ThriftSearchQuery searchQuery = new ThriftSearchQuery(); - searchQuery.setSerializedQuery(serializedQuery); - searchQuery.setCollectorParams(new CollectorParams().setNumResultsToReturn(numResults)); - return searchQuery; - } - - /** Determines if the given request was initiated by a logged in user. */ - public static boolean requestInitiatedByLoggedInUser(EarlybirdRequest request) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - return (searchQuery != null) && searchQuery.isSetSearcherId() - && (searchQuery.getSearcherId() > 0); - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.docx b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.docx new file mode 100644 index 000000000..d1b726263 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.java b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.java deleted file mode 100644 index 3b9661c6e..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultUtil.java +++ /dev/null @@ -1,209 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import java.util.List; -import java.util.Map; -import javax.annotation.Nullable; - -import com.google.common.base.Function; -import com.google.common.base.Predicate; -import com.google.common.base.Predicates; -import com.google.common.collect.Iterables; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.relevance.ranking.ActionChain; -import com.twitter.search.common.relevance.ranking.filters.ExactDuplicateFilter; -import 
com.twitter.search.common.relevance.text.VisibleTokenRatioNormalizer; -import com.twitter.search.common.runtime.ActionChainDebugManager; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftFacetResults; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResultType; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTweetSource; - -/** - * ThriftSearchResultUtil contains some simple static methods for constructing - * ThriftSearchResult objects. - */ -public final class ThriftSearchResultUtil { - private ThriftSearchResultUtil() { } - - private static final VisibleTokenRatioNormalizer NORMALIZER = - VisibleTokenRatioNormalizer.createInstance(); - - public static final Function> LANG_MAP_GETTER = - searchResults -> searchResults.getLanguageHistogram(); - public static final Function> HIT_COUNTS_MAP_GETTER = - searchResults -> searchResults.getHitCounts(); - - // Some useful Predicates - public static final Predicate IS_OFFENSIVE_TWEET = - result -> { - if (result != null && result.isSetMetadata()) { - ThriftSearchResultMetadata metadata = result.getMetadata(); - return metadata.isIsOffensive(); - } else { - return false; - } - }; - - public static final Predicate IS_TOP_TWEET = - result -> result != null - && result.isSetMetadata() - && result.getMetadata().isSetResultType() - && result.getMetadata().getResultType() == ThriftSearchResultType.POPULAR; - - public static final Predicate FROM_FULL_ARCHIVE = - result -> result != null - && result.isSetTweetSource() - && result.getTweetSource() == ThriftTweetSource.FULL_ARCHIVE_CLUSTER; - - public static final Predicate IS_FULL_ARCHIVE_TOP_TWEET = - Predicates.and(FROM_FULL_ARCHIVE, IS_TOP_TWEET); - - 
public static final Predicate IS_NSFW_BY_ANY_MEANS_TWEET = - result -> { - if (result != null && result.isSetMetadata()) { - ThriftSearchResultMetadata metadata = result.getMetadata(); - return metadata.isIsUserNSFW() - || metadata.isIsOffensive() - || metadata.getExtraMetadata().isIsSensitiveContent(); - } else { - return false; - } - }; - - /** - * Returns the number of underlying ThriftSearchResult results. - */ - public static int numResults(ThriftSearchResults results) { - if (results == null || !results.isSetResults()) { - return 0; - } else { - return results.getResultsSize(); - } - } - - /** - * Returns the list of tweet IDs in ThriftSearchResults. - * Returns null if there's no results. - */ - @Nullable - public static List getTweetIds(ThriftSearchResults results) { - if (numResults(results) > 0) { - return getTweetIds(results.getResults()); - } else { - return null; - } - } - - /** - * Returns the list of tweet IDs in a list of ThriftSearchResult. - * Returns null if there's no results. - */ - public static List getTweetIds(@Nullable List results) { - if (results != null && results.size() > 0) { - return Lists.newArrayList(Iterables.transform( - results, - searchResult -> searchResult.getId() - )); - } - return null; - } - - /** - * Given ThriftSearchResults, build a map from tweet ID to the tweets metadata. - */ - public static Map getTweetMetadataMap( - Schema schema, ThriftSearchResults results) { - Map resultMap = Maps.newHashMap(); - if (results == null || results.getResultsSize() == 0) { - return resultMap; - } - for (ThriftSearchResult searchResult : results.getResults()) { - resultMap.put(searchResult.getId(), searchResult.getMetadata()); - } - return resultMap; - } - - /** - * Return the total number of facet results in ThriftFacetResults, by summing up the number - * of facet results in each field. 
- */ - public static int numFacetResults(ThriftFacetResults results) { - if (results == null || !results.isSetFacetFields()) { - return 0; - } else { - int numResults = 0; - for (ThriftFacetFieldResults field : results.getFacetFields().values()) { - if (field.isSetTopFacets()) { - numResults += field.topFacets.size(); - } - } - return numResults; - } - } - - /** - * Updates the search statistics on base, by adding the corresponding stats from delta. - */ - public static void incrementCounts(ThriftSearchResults base, - ThriftSearchResults delta) { - if (delta.isSetNumHitsProcessed()) { - base.setNumHitsProcessed(base.getNumHitsProcessed() + delta.getNumHitsProcessed()); - } - if (delta.isSetNumPartitionsEarlyTerminated() && delta.getNumPartitionsEarlyTerminated() > 0) { - // This currently used for merging results on a single earlybird, so we don't sum up all the - // counts, just set it to 1 if we see one that was early terminated. - base.setNumPartitionsEarlyTerminated(1); - } - if (delta.isSetMaxSearchedStatusID()) { - long deltaMax = delta.getMaxSearchedStatusID(); - if (!base.isSetMaxSearchedStatusID() || deltaMax > base.getMaxSearchedStatusID()) { - base.setMaxSearchedStatusID(deltaMax); - } - } - if (delta.isSetMinSearchedStatusID()) { - long deltaMin = delta.getMinSearchedStatusID(); - if (!base.isSetMinSearchedStatusID() || deltaMin < base.getMinSearchedStatusID()) { - base.setMinSearchedStatusID(deltaMin); - } - } - if (delta.isSetScore()) { - if (base.isSetScore()) { - base.setScore(base.getScore() + delta.getScore()); - } else { - base.setScore(delta.getScore()); - } - } - } - - /** - * Removes the duplicates from the given list of results. - * - * @param results The list of ThriftSearchResults. - * @return The given list with duplicates removed. 
- */ - public static List removeDuplicates(List results) { - ActionChain filterChain = - ActionChainDebugManager - .createActionChainBuilder("RemoveDuplicatesFilters") - .appendActions(new ExactDuplicateFilter()) - .build(); - return filterChain.apply(results); - } - - /** - * Returns ranking score from Earlybird shard-based ranking models if any, and 0 otherwise. - */ - public static double getTweetScore(@Nullable ThriftSearchResult result) { - if (result == null || !result.isSetMetadata() || !result.getMetadata().isSetScore()) { - return 0.0; - } - return result.getMetadata().getScore(); - } -} diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.docx b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.docx new file mode 100644 index 000000000..66303a936 Binary files /dev/null and b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.java b/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.java deleted file mode 100644 index 182dd1274..000000000 --- a/src/java/com/twitter/search/common/util/earlybird/ThriftSearchResultsRelevanceStatsUtil.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.search.common.util.earlybird; - -import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats; - -public final class ThriftSearchResultsRelevanceStatsUtil { - private ThriftSearchResultsRelevanceStatsUtil() { } - - /** - * Adding ThriftSearchResultsRelevanceStats from one set of results onto a base set. - * Assumes all values are set on both of the inputs. - * - * @param base the stats to add to. - * @param delta the stats to be added. 
- */ - public static void addRelevanceStats(ThriftSearchResultsRelevanceStats base, - ThriftSearchResultsRelevanceStats delta) { - base.setNumScored(base.getNumScored() + delta.getNumScored()); - base.setNumSkipped(base.getNumSkipped() + delta.getNumSkipped()); - base.setNumSkippedForAntiGaming( - base.getNumSkippedForAntiGaming() + delta.getNumSkippedForAntiGaming()); - base.setNumSkippedForLowReputation( - base.getNumSkippedForLowReputation() + delta.getNumSkippedForLowReputation()); - base.setNumSkippedForLowTextScore( - base.getNumSkippedForLowTextScore() + delta.getNumSkippedForLowTextScore()); - base.setNumSkippedForSocialFilter( - base.getNumSkippedForSocialFilter() + delta.getNumSkippedForSocialFilter()); - base.setNumSkippedForLowFinalScore( - base.getNumSkippedForLowFinalScore() + delta.getNumSkippedForLowFinalScore()); - if (delta.getOldestScoredTweetAgeInSeconds() > base.getOldestScoredTweetAgeInSeconds()) { - base.setOldestScoredTweetAgeInSeconds(delta.getOldestScoredTweetAgeInSeconds()); - } - - base.setNumFromDirectFollows(base.getNumFromDirectFollows() + delta.getNumFromDirectFollows()); - base.setNumFromTrustedCircle(base.getNumFromTrustedCircle() + delta.getNumFromTrustedCircle()); - base.setNumReplies(base.getNumReplies() + delta.getNumReplies()); - base.setNumRepliesTrusted(base.getNumRepliesTrusted() + delta.getNumRepliesTrusted()); - base.setNumRepliesOutOfNetwork( - base.getNumRepliesOutOfNetwork() + delta.getNumRepliesOutOfNetwork()); - base.setNumSelfTweets(base.getNumSelfTweets() + delta.getNumSelfTweets()); - base.setNumWithMedia(base.getNumWithMedia() + delta.getNumWithMedia()); - base.setNumWithNews(base.getNumWithNews() + delta.getNumWithNews()); - base.setNumSpamUser(base.getNumSpamUser() + delta.getNumSpamUser()); - base.setNumOffensive(base.getNumOffensive() + delta.getNumOffensive()); - base.setNumBot(base.getNumBot() + delta.getNumBot()); - } -} diff --git a/src/java/com/twitter/search/common/util/lang/BUILD 
b/src/java/com/twitter/search/common/util/lang/BUILD deleted file mode 100644 index e88e63360..000000000 --- a/src/java/com/twitter/search/common/util/lang/BUILD +++ /dev/null @@ -1,18 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - provides = artifact( - org = "com.twitter.search.common.util", - name = "lang", - repo = artifactory, - ), - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/text/language:locale-util", - "src/thrift/com/twitter/search/common:constants-java", - ], -) diff --git a/src/java/com/twitter/search/common/util/lang/BUILD.docx b/src/java/com/twitter/search/common/util/lang/BUILD.docx new file mode 100644 index 000000000..ef531b5d2 Binary files /dev/null and b/src/java/com/twitter/search/common/util/lang/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.docx b/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.docx new file mode 100644 index 000000000..c26a59c1d Binary files /dev/null and b/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.docx differ diff --git a/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.java b/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.java deleted file mode 100644 index 2ede4c2f0..000000000 --- a/src/java/com/twitter/search/common/util/lang/ThriftLanguageUtil.java +++ /dev/null @@ -1,141 +0,0 @@ -package com.twitter.search.common.util.lang; - -import java.lang.reflect.Field; -import java.util.Locale; -import java.util.Map; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import 
com.twitter.common.text.language.LocaleUtil; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; - -/** - * This class can be used to convert ThriftLanguage to Locale object and vise versa. - */ -public final class ThriftLanguageUtil { - private static final Logger LOG = LoggerFactory.getLogger(ThriftLanguageUtil.class.getName()); - - // stores ThriftLanguage.id -> Locale mapping - private static final Locale[] LOCALES; - - // stores Locale -> ThriftLanguage mapping - private static final Map THRIFT_LANGUAGES; - - static { - LOCALES = new Locale[ThriftLanguage.values().length]; - Map thriftLanguageMap = Maps.newHashMap(); - - // get all languages defined in ThriftLanguage - Field[] fields = ThriftLanguage.class.getDeclaredFields(); - for (Field field : fields) { - if (!field.isEnumConstant()) { - continue; - } - - try { - ThriftLanguage thriftLang = (ThriftLanguage) field.get(null); - String thriftLanguageName = field.getName(); - - // get corresponding Locale declared in LocaleUtil - try { - Field localeUtilField = LocaleUtil.class.getDeclaredField(thriftLanguageName); - Locale localeLang = (Locale) localeUtilField.get(null); - - LOCALES[thriftLang.getValue()] = localeLang; - thriftLanguageMap.put(localeLang, thriftLang); - } catch (NoSuchFieldException e) { - LOG.warn("{} is defined in ThriftLanguage, but not in LocaleUtil.", thriftLanguageName); - } - } catch (IllegalAccessException e) { - // shouldn't happen. 
- LOG.warn("Could not get a declared field.", e); - } - } - - // Let's make sure that all Locales defined in LocaleUtil are also defined in ThriftLanguage - for (Locale lang : LocaleUtil.getDefinedLanguages()) { - if (!thriftLanguageMap.containsKey(lang)) { - LOG.warn("{} is defined in LocaleUtil but not in ThriftLanguage.", lang.getLanguage()); - } - } - - THRIFT_LANGUAGES = ImmutableMap.copyOf(thriftLanguageMap); - } - - private ThriftLanguageUtil() { - } - - /** - * Returns a Locale object which corresponds to a given ThriftLanguage object. - * @param language ThriftLanguage object - * @return a corresponding Locale object - */ - public static Locale getLocaleOf(ThriftLanguage language) { - // Note that ThriftLanguage.findByValue() can return null (thrift generated code). - // So ThriftLanguageUtil.getLocaleOf needs to handle null correctly. - if (language == null) { - return LocaleUtil.UNKNOWN; - } - - Preconditions.checkArgument(language.getValue() < LOCALES.length); - return LOCALES[language.getValue()]; - } - - /** - * Returns a ThriftLanguage object which corresponds to a given Locale object. - * - * @param language Locale object - * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one. - */ - public static ThriftLanguage getThriftLanguageOf(Locale language) { - Preconditions.checkNotNull(language); - ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(language); - return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang; - } - - /** - * Returns a ThriftLanguage object which corresponds to a given language code. - * - * @param languageCode BCP-47 language code - * @return a corresponding ThriftLanguage object, or UNKNOWN if there's no corresponding one. - */ - public static ThriftLanguage getThriftLanguageOf(String languageCode) { - Preconditions.checkNotNull(languageCode); - ThriftLanguage thriftLang = THRIFT_LANGUAGES.get(LocaleUtil.getLocaleOf(languageCode)); - return thriftLang == null ? 
ThriftLanguage.UNKNOWN : thriftLang; - } - - /** - * Returns a ThriftLanguage object which corresponds to a given int value. - * If value is not valid, returns ThriftLanguage.UNKNOWN - * @param value value of language - * @return a corresponding ThriftLanguage object - */ - public static ThriftLanguage safeFindByValue(int value) { - ThriftLanguage thriftLang = ThriftLanguage.findByValue(value); - return thriftLang == null ? ThriftLanguage.UNKNOWN : thriftLang; - } - - /** - * Returns the language code which corresponds to a given ThriftLanguage. - * - * Note that multiple ThriftLanguage entries can return the same language code. - * - * @param thriftLang ThriftLanguage object - * @return Corresponding language or null if thriftLang is null. - */ - @Nullable - public static String getLanguageCodeOf(@Nullable ThriftLanguage thriftLang) { - if (thriftLang == null) { - return null; - } - return ThriftLanguageUtil.getLocaleOf(thriftLang).getLanguage(); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/BUILD b/src/java/com/twitter/search/common/util/ml/BUILD deleted file mode 100644 index b6c67753a..000000000 --- a/src/java/com/twitter/search/common/util/ml/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/util/io", - ], -) diff --git a/src/java/com/twitter/search/common/util/ml/BUILD.docx b/src/java/com/twitter/search/common/util/ml/BUILD.docx new file mode 100644 index 000000000..267886688 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/BUILD.docx differ diff 
--git a/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.docx b/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.docx new file mode 100644 index 000000000..4089b45f0 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.java b/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.java deleted file mode 100644 index 50b2fc46a..000000000 --- a/src/java/com/twitter/search/common/util/ml/EnumBasedLinearModel.java +++ /dev/null @@ -1,141 +0,0 @@ -package com.twitter.search.common.util.ml; - -import java.io.IOException; -import java.util.EnumMap; -import java.util.EnumSet; -import java.util.Map; -import java.util.Set; - -import com.google.common.base.Preconditions; -import com.google.common.base.Predicates; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.util.io.TextFileLoadingUtils; - -/** - * Represents a linear model for scoring and classification. - * - * The list of features is defined by an Enum class. The model weights and instances are - * represented as maps that must contain an entry for all the values of the enum. - * - */ -public class EnumBasedLinearModel> implements MapBasedLinearModel { - - private final EnumSet features; - private final EnumMap weights; - - /** - * Creates a model from a map of weights. - * - * @param enumType Enum used for the keys - * @param weights Feature weights. 
- */ - public EnumBasedLinearModel(Class enumType, Map weights) { - features = EnumSet.allOf(enumType); - EnumMap enumWeights = - new EnumMap<>(Maps.filterValues(weights, Predicates.notNull())); - Preconditions.checkArgument(features.equals(enumWeights.keySet()), - "The model does not include weights for all the available features"); - - this.weights = enumWeights; - } - - public ImmutableMap getWeights() { - return Maps.immutableEnumMap(weights); - } - - @Override - public float score(Map instance) { - float total = 0; - for (Map.Entry weightEntry : weights.entrySet()) { - Float feature = instance.get(weightEntry.getKey()); - if (feature != null) { - total += weightEntry.getValue() * feature; - } - } - return total; - } - - /** - * Determines whether an instance is positive. - */ - @Override - public boolean classify(float threshold, Map instance) { - return score(instance) > threshold; - } - - @Override - public boolean classify(Map instance) { - return classify(0, instance); - } - - @Override - public String toString() { - return String.format("EnumBasedLinearModel[%s]", weights); - } - - /** - * Creates a model where all the features have the same weight. - * This method is useful for generating the feature vectors for training a new model. - */ - public static > EnumBasedLinearModel createWithEqualWeight(Class enumType, - Float weight) { - EnumSet features = EnumSet.allOf(enumType); - EnumMap weights = Maps.newEnumMap(enumType); - for (T feature : features) { - weights.put(feature, weight); - } - return new EnumBasedLinearModel<>(enumType, weights); - } - - /** - * Loads the model from a TSV file with the following format: - * - * feature_name \t weight - */ - public static > EnumBasedLinearModel createFromFile( - Class enumType, AbstractFile path) throws IOException { - return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, true)); - } - - /** - * Loads the model from a TSV file, using a default weight of 0 for missing features. 
- * - * File format: - * - * feature_name \t weight - */ - public static > EnumBasedLinearModel createFromFileSafe( - Class enumType, AbstractFile path) throws IOException { - return new EnumBasedLinearModel<>(enumType, loadWeights(enumType, path, false)); - } - - /** - * Creates a map of (feature_name, weight) from a TSV file. - * - * If strictMode is true, it will throw an exception if the file doesn't contain all the - * features declared in the enum. Otherwise, it will use zero as default value. - * - */ - private static > EnumMap loadWeights( - Class enumType, AbstractFile fileHandle, boolean strictMode) throws IOException { - Map weightsFromFile = - TextFileLoadingUtils.loadMapFromFile(fileHandle, input -> Float.parseFloat(input)); - EnumMap weights = Maps.newEnumMap(enumType); - Set expectedFeatures = EnumSet.allOf(enumType); - if (!strictMode) { - for (T feature : expectedFeatures) { - weights.put(feature, 0f); - } - } - for (String featureName : weightsFromFile.keySet()) { - Float weight = weightsFromFile.get(featureName); - weights.put(Enum.valueOf(enumType, featureName.toUpperCase()), weight); - } - Preconditions.checkArgument(expectedFeatures.equals(weights.keySet()), - "Model does not contain weights for all the features"); - return weights; - } -} diff --git a/src/java/com/twitter/search/common/util/ml/FeatureUtils.docx b/src/java/com/twitter/search/common/util/ml/FeatureUtils.docx new file mode 100644 index 000000000..f52453c23 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/FeatureUtils.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/FeatureUtils.java b/src/java/com/twitter/search/common/util/ml/FeatureUtils.java deleted file mode 100644 index fef79620d..000000000 --- a/src/java/com/twitter/search/common/util/ml/FeatureUtils.java +++ /dev/null @@ -1,120 +0,0 @@ -package com.twitter.search.common.util.ml; - -import java.util.List; -import java.util.Map; -import java.util.Optional; - -import 
com.google.common.base.Preconditions; -import com.google.common.collect.Sets; - -/** - * Utilities for feature transformation and extraction. - */ -public final class FeatureUtils { - - private FeatureUtils() { - } - - /** - * Computes the difference between 2 values and returns the ratio of the difference over the - * minimum of both, according to these cases: - * - * 1. if (a > b) return a / b - * 2. if (a < b) return - b / a - * 3. if (a == b == 0) return 0 - * - * The upper/lower limit is (-) maxRatio. For cases 1 and 2, if the denominator is 0, - * it returns maxRatio. - * - * This method is used to define a feature that tells how much larger or smaller is the - * first value with respect to the second one.. - */ - public static float diffRatio(float a, float b, float maxRatio) { - float diff = a - b; - if (diff == 0) { - return 0; - } - float denominator = Math.min(a, b); - float ratio = denominator != 0 ? Math.abs(diff / denominator) : maxRatio; - return Math.copySign(Math.min(ratio, maxRatio), diff); - } - - /** - * Computes the cosine similarity between two maps that represent sparse vectors. 
- */ - public static double cosineSimilarity( - Map vector1, Map vector2) { - if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) { - return 0; - } - double squaredSum1 = 0; - double squaredSum2 = 0; - double squaredCrossSum = 0; - - for (K key : Sets.union(vector1.keySet(), vector2.keySet())) { - double value1 = 0; - double value2 = 0; - - V optValue1 = vector1.get(key); - if (optValue1 != null) { - value1 = optValue1.doubleValue(); - } - V optValue2 = vector2.get(key); - if (optValue2 != null) { - value2 = optValue2.doubleValue(); - } - - squaredSum1 += value1 * value1; - squaredSum2 += value2 * value2; - squaredCrossSum += value1 * value2; - } - - if (squaredSum1 == 0 || squaredSum2 == 0) { - return 0; - } else { - return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2); - } - } - - /** - * Computes the cosine similarity between two (dense) vectors. - */ - public static double cosineSimilarity( - List vector1, List vector2) { - if (vector1 == null || vector1.isEmpty() || vector2 == null || vector2.isEmpty()) { - return 0; - } - - Preconditions.checkArgument(vector1.size() == vector2.size()); - double squaredSum1 = 0; - double squaredSum2 = 0; - double squaredCrossSum = 0; - for (int i = 0; i < vector1.size(); i++) { - double value1 = vector1.get(i).doubleValue(); - double value2 = vector2.get(i).doubleValue(); - squaredSum1 += value1 * value1; - squaredSum2 += value2 * value2; - squaredCrossSum += value1 * value2; - } - - if (squaredSum1 == 0 || squaredSum2 == 0) { - return 0; - } else { - return squaredCrossSum / Math.sqrt(squaredSum1 * squaredSum2); - } - } - - /** - * Finds the key of the map with the highest value (compared in natural order) - */ - @SuppressWarnings("unchecked") - public static Optional findMaxKey(Map map) { - if (map == null || map.isEmpty()) { - return Optional.empty(); - } - - Optional> maxEntry = map.entrySet().stream().max(Map.Entry.comparingByValue()); - return maxEntry.map(Map.Entry::getKey); - } - 
-} diff --git a/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.docx b/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.docx new file mode 100644 index 000000000..719b3b99d Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.java b/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.java deleted file mode 100644 index 0f8899271..000000000 --- a/src/java/com/twitter/search/common/util/ml/MapBasedLinearModel.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.common.util.ml; - -import java.util.Map; - -/** - * An interface for linear models that are backed by some sort of map - */ -public interface MapBasedLinearModel { - /** - * Evaluate using this model given a feature vector. - * @param instance The feature vector in format of a hashmap. - * @return - */ - boolean classify(Map instance); - - /** - * Evaluate using this model given a classification threshold and a feature vector. - * @param threshold Score threshold used for classification. - * @param instance The feature vector in format of a hashmap. - * @return - */ - boolean classify(float threshold, Map instance); - - /** - * Computes the score of an instance as a linear combination of the features and the model - * weights. 0 is used as default value for features or weights that are not present. - * - * @param instance The feature vector in format of a hashmap. - * @return The instance score according to the model. 
- */ - float score(Map instance); -} diff --git a/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.docx b/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.docx new file mode 100644 index 000000000..e3edfdeaa Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.java b/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.java deleted file mode 100644 index cc0686ef4..000000000 --- a/src/java/com/twitter/search/common/util/ml/StringMapBasedLinearModel.java +++ /dev/null @@ -1,125 +0,0 @@ -package com.twitter.search.common.util.ml; - -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.base.Function; -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.util.io.TextFileLoadingUtils; - -import it.unimi.dsi.fastutil.objects.Object2FloatMap; -import it.unimi.dsi.fastutil.objects.Object2FloatOpenHashMap; - -/** - * Represents a linear model for scoring and classification. - * - * Features are represented as arbitrary strings, making this a fairly flexible implementation - * (at the cost of some performance, since all operations require hash lookups). Instances - * and weights are both encoded sparsely (as maps) so this implementation is well suited to - * models with large feature sets where most features are inactive at a given time. Weights - * for unknown features are assumed to be 0. - * - */ -public class StringMapBasedLinearModel implements MapBasedLinearModel { - private static final Logger LOG = LoggerFactory.getLogger(StringMapBasedLinearModel.class); - - protected final Object2FloatMap model = new Object2FloatOpenHashMap<>(); - - /** - * Creates a model from a map of weights. - * - * @param weights Feature weights. 
- */ - public StringMapBasedLinearModel(Map weights) { - model.putAll(weights); - model.defaultReturnValue(0.0f); - } - - /** - * Get the weight of a feature - * @param featureName - * @return - */ - public float getWeight(String featureName) { - return model.getFloat(featureName); - } - - /** - * Get the full weight map - */ - @VisibleForTesting - protected Map getWeights() { - return model; - } - - /** - * Evaluate using this model given a feature vector. - * @param values The feature vector in format of a hashmap. - * @return - */ - @Override - public float score(Map values) { - float score = 0.0f; - for (Map.Entry value : values.entrySet()) { - String featureName = value.getKey(); - float weight = getWeight(featureName); - if (weight != 0.0f) { - score += weight * value.getValue(); - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("%s = %.3f * %.3f = %.3f, ", - featureName, weight, value.getValue(), - weight * value.getValue())); - } - } - } - if (LOG.isDebugEnabled()) { - LOG.debug(String.format("Score = %.3f", score)); - } - return score; - } - - /** - * Determines whether an instance is positive. 
- */ - @Override - public boolean classify(Map values) { - return classify(0.0f, values); - } - - @Override - public boolean classify(float threshold, Map values) { - return score(values) > threshold; - } - - public int size() { - return model.size(); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("StringMapBasedLinearModel["); - for (Map.Entry entry : model.entrySet()) { - sb.append(String.format("(%s = %.3f), ", entry.getKey(), entry.getValue())); - } - sb.append("]"); - return sb.toString(); - } - - /** - * Loads the model from a TSV file with the following format: - * - * feature_name \t weight - */ - public static StringMapBasedLinearModel loadFromFile(AbstractFile fileHandle) { - Map weights = - TextFileLoadingUtils.loadMapFromFile( - fileHandle, - (Function) item -> Float.parseFloat(item)); - return new StringMapBasedLinearModel(weights); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/models_manager/BUILD b/src/java/com/twitter/search/common/util/ml/models_manager/BUILD deleted file mode 100644 index ba62e194c..000000000 --- a/src/java/com/twitter/search/common/util/ml/models_manager/BUILD +++ /dev/null @@ -1,14 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - strict_deps = True, - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/slf4j:slf4j-api", - "3rdparty/jvm/org/yaml:snakeyaml", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/metrics", - ], -) diff --git a/src/java/com/twitter/search/common/util/ml/models_manager/BUILD.docx b/src/java/com/twitter/search/common/util/ml/models_manager/BUILD.docx new file mode 100644 index 000000000..fd9ef5eef Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/models_manager/BUILD.docx differ diff --git 
a/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.docx b/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.docx new file mode 100644 index 000000000..3c6eab70f Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.java b/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.java deleted file mode 100644 index 5c94c7988..000000000 --- a/src/java/com/twitter/search/common/util/ml/models_manager/BaseModelsManager.java +++ /dev/null @@ -1,293 +0,0 @@ -package com.twitter.search.common.util.ml.models_manager; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.UncheckedIOException; -import java.util.Collections; -import java.util.Date; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.function.Function; -import java.util.function.Supplier; -import java.util.stream.Collectors; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Strings; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Sets; -import com.google.common.util.concurrent.ThreadFactoryBuilder; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.yaml.snakeyaml.Yaml; - -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.file.FileUtils; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; - -/** - * Loads models from HDFS and provides an interface for reloading them periodically. 
- * - * There are 2 possible ways of detecting the active models: - * - * - DirectorySupplier: Uses all the subdirectories of a base path - * - ConfigSupplier: Gets the list from from a configuration file - * - * Models can be updated or added. Depending on the selected method, existing models can be removed - * if they are no longer active. - */ -public abstract class BaseModelsManager implements Runnable { - private static final Logger LOG = LoggerFactory.getLogger(BaseModelsManager.class); - - protected final Map lastModifiedMsByModel = new ConcurrentHashMap<>(); - protected final Map loadedModels = new ConcurrentHashMap<>(); - protected final Supplier> activeModelsSupplier; - - protected Map prevLoadedModels = new ConcurrentHashMap<>(); - - // This flag determines whether models are unloaded immediately when they're removed from - // activeModelsSupplier. If false, old models stay in memory until the process is restarted. - // This may be useful to safely change model configuration without restarting. 
- protected final boolean shouldUnloadInactiveModels; - - protected final SearchLongGauge numModels; - protected final SearchCounter numErrors; - protected final SearchLongGauge lastLoadedMs; - - protected Supplier shouldServeModels; - protected Supplier shouldLoadModels; - - public BaseModelsManager( - Supplier> activeModelsSupplier, - boolean shouldUnloadInactiveModels, - String statsPrefix - ) { - this( - activeModelsSupplier, - shouldUnloadInactiveModels, - statsPrefix, - () -> true, - () -> true - ); - } - - public BaseModelsManager( - Supplier> activeModelsSupplier, - boolean shouldUnloadInactiveModels, - String statsPrefix, - Supplier shouldServeModels, - Supplier shouldLoadModels - ) { - this.activeModelsSupplier = activeModelsSupplier; - this.shouldUnloadInactiveModels = shouldUnloadInactiveModels; - - this.shouldServeModels = shouldServeModels; - this.shouldLoadModels = shouldLoadModels; - - numModels = SearchLongGauge.export( - String.format("model_loader_%s_num_models", statsPrefix)); - numErrors = SearchCounter.export( - String.format("model_loader_%s_num_errors", statsPrefix)); - lastLoadedMs = SearchLongGauge.export( - String.format("model_loader_%s_last_loaded_timestamp_ms", statsPrefix)); - } - - /** - * Retrieves a particular model. - */ - public Optional getModel(String name) { - if (shouldServeModels.get()) { - return Optional.ofNullable(loadedModels.get(name)); - } else { - return Optional.empty(); - } - } - - /** - * Reads a model instance from the directory file instance. - * - * @param modelBaseDir AbstractFile instance representing the directory. - * @return Model instance parsed from the directory. - */ - public abstract T readModelFromDirectory(AbstractFile modelBaseDir) throws Exception; - - /** - * Cleans up any resources used by the model instance. - * This method is called after removing the model from the in-memory map. - * Sub-classes can provide custom overridden implementation as required. 
- * - * @param unloadedModel Model instance that would be unloaded from the manager. - */ - protected void cleanUpUnloadedModel(T unloadedModel) { } - - @Override - public void run() { - // Get available models, either from the config file or by listing the base directory - final Map modelPathsFromConfig; - if (!shouldLoadModels.get()) { - LOG.info("Loading models is currently disabled."); - return; - } - - modelPathsFromConfig = activeModelsSupplier.get(); - for (Map.Entry nameAndPath : modelPathsFromConfig.entrySet()) { - String modelName = nameAndPath.getKey(); - try { - AbstractFile modelDirectory = nameAndPath.getValue(); - if (!modelDirectory.exists() && loadedModels.containsKey(modelName)) { - LOG.warn("Loaded model '{}' no longer exists at HDFS path {}, keeping loaded version; " - + "replace directory in HDFS to update model.", modelName, modelDirectory); - continue; - } - - long previousModifiedTimestamp = lastModifiedMsByModel.getOrDefault(modelName, 0L); - long lastModifiedMs = modelDirectory.getLastModified(); - if (previousModifiedTimestamp == lastModifiedMs) { - continue; - } - - LOG.info("Starting to load model. name={} path={}", modelName, modelDirectory.getPath()); - T model = Preconditions.checkNotNull(readModelFromDirectory(modelDirectory)); - LOG.info("Model initialized: {}. 
Last modified: {} ({})", - modelName, lastModifiedMs, new Date(lastModifiedMs)); - T previousModel = loadedModels.put(modelName, model); - lastModifiedMsByModel.put(modelName, lastModifiedMs); - - if (previousModel != null) { - cleanUpUnloadedModel(previousModel); - } - } catch (Exception e) { - numErrors.increment(); - LOG.error("Error initializing model: {}", modelName, e); - } - } - - // Remove any currently loaded models not present in the latest list - if (shouldUnloadInactiveModels) { - Set inactiveModels = - Sets.difference(loadedModels.keySet(), modelPathsFromConfig.keySet()).immutableCopy(); - - for (String modelName : inactiveModels) { - T modelToUnload = loadedModels.get(modelName); - loadedModels.remove(modelName); - - if (modelToUnload != null) { - // We could have an inactive model key without a model (value) if the - // initial readModelFromDirectory failed for the model entry. - // Checking for null to avoid exception. - cleanUpUnloadedModel(modelToUnload); - } - LOG.info("Unloaded model that is no longer active: {}", modelName); - } - } - - if (!prevLoadedModels.keySet().equals(loadedModels.keySet())) { - LOG.info("Finished loading models: {}", loadedModels.keySet()); - } - prevLoadedModels = loadedModels; - numModels.set(loadedModels.size()); - lastLoadedMs.set(System.currentTimeMillis()); - } - - /** - * Schedules the loader to run periodically. - * @param period Period between executions - * @param timeUnit The time unit the period parameter. - */ - public final void scheduleAtFixedRate( - long period, TimeUnit timeUnit, String builderThreadName) { - Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setDaemon(true) - .setNameFormat(builderThreadName) - .build()) - .scheduleAtFixedRate(this, 0, period, timeUnit); - } - - /** - * Gets the active list of models from the subdirectories in a base directory. - * - * Each model is identified by the name of the subdirectory. 
- */ - @VisibleForTesting - public static class DirectorySupplier implements Supplier> { - private static final Logger LOG = LoggerFactory.getLogger(DirectorySupplier.class); - private final AbstractFile baseDir; - - public DirectorySupplier(AbstractFile baseDir) { - this.baseDir = baseDir; - } - - @Override - public Map get() { - try { - LOG.info("Loading models from the directories in: {}", baseDir.getPath()); - List modelDirs = - ImmutableList.copyOf(baseDir.listFiles(AbstractFile.IS_DIRECTORY)); - LOG.info("Found {} model directories: {}", modelDirs.size(), modelDirs); - return modelDirs.stream() - .collect(Collectors.toMap( - AbstractFile::getName, - Function.identity() - )); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } - - /** - * Gets the active list of models by reading a YAML config file. - * - * The keys are the model names, the values are dictionaries with a single entry for the path - * of the model in HDFS (without the HDFS name node prefix). 
For example: - * - * model_a: - * path: /path/to/model_a - * model_b: - * path: /path/to/model_b - * - */ - @VisibleForTesting - public static class ConfigSupplier implements Supplier> { - - private final AbstractFile configFile; - - public ConfigSupplier(AbstractFile configFile) { - this.configFile = configFile; - } - - @SuppressWarnings("unchecked") - @Override - public Map get() { - try (BufferedReader configReader = configFile.getCharSource().openBufferedStream()) { - Yaml yamlParser = new Yaml(); - //noinspection unchecked - Map> config = - (Map>) yamlParser.load(configReader); - - if (config == null || config.isEmpty()) { - return Collections.emptyMap(); - } - - Map modelPaths = new HashMap<>(); - for (Map.Entry> nameAndConfig : config.entrySet()) { - String path = Strings.emptyToNull(nameAndConfig.getValue().get("path")); - Preconditions.checkNotNull(path, "Missing path for model: %s", nameAndConfig.getKey()); - modelPaths.put(nameAndConfig.getKey(), FileUtils.getHdfsFileHandle(path)); - } - return modelPaths; - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD b/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD deleted file mode 100644 index 45513f511..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD +++ /dev/null @@ -1,68 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common_internal/hadoop", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/transform", - "src/java/com/twitter/ml/common/base", - 
"src/java/com/twitter/ml/prediction/core", - "src/java/com/twitter/ml/tool/prediction:ModelInterpreter", - "src/java/com/twitter/ml/vw/constant", - "src/java/com/twitter/mlv2/trees/predictor", - "src/java/com/twitter/mlv2/trees/scorer", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/ml/models_manager", - "src/java/com/twitter/search/modeling/common", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/search/common:features-java", - ], -) - -java_library( - name = "for-timelines", - sources = [ - "BaseLegacyScoreAccumulator.java", - "BaseModelBuilder.java", - "BaseScoreAccumulator.java", - "CompositeFeatureContext.java", - "DiscretizedFeature.java", - "DiscretizedFeatureRange.java", - "LegacyModelBuilder.java", - "LightweightLinearModel.java", - "ModelBuilder.java", - "SchemaBasedModelBuilder.java", - ], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common_internal/hadoop", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/ml/api/transform:DiscretizerTransform", - "src/java/com/twitter/ml/common/base", - "src/java/com/twitter/ml/tool/prediction:ModelInterpreter", - "src/java/com/twitter/ml/vw/constant", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/ml/models_manager", - "src/java/com/twitter/search/modeling/common", - "src/thrift/com/twitter/ml/api:data-java", - "src/thrift/com/twitter/search/common:features-java", - ], -) diff --git 
a/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD.docx new file mode 100644 index 000000000..2983e03a6 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.docx new file mode 100644 index 000000000..51a325248 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.java deleted file mode 100644 index 02c92b0d6..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseLegacyScoreAccumulator.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import com.google.common.base.Preconditions; - -import com.twitter.ml.api.Feature; - -/** - * Score accumulator for legacy (non-schema-based) features. It provides methods to add features - * using Feature objects. - * - * @deprecated This class is retired and we suggest to switch to schema-based features. - */ -@Deprecated -public abstract class BaseLegacyScoreAccumulator extends BaseScoreAccumulator { - - public BaseLegacyScoreAccumulator(LightweightLinearModel model) { - super(model); - Preconditions.checkState(!model.isSchemaBased(), - "Cannot create LegacyScoreAccumulator with a schema-based model: %s", model.getName()); - } - - /** - * Add to the score the weight of a binary feature (if it's present). - * - * @deprecated This function is retired and we suggest to switch to addSchemaBooleanFeatures in - * SchemaBasedScoreAccumulator. 
- */ - @Deprecated - protected BaseLegacyScoreAccumulator addBinaryFeature(Feature feature, - boolean value) { - if (value) { - Double weight = model.binaryFeatures.get(feature); - if (weight != null) { - score += weight; - } - } - return this; - } - - /** - * Add to the score the weight of a continuous feature. - *

- * If the model uses real valued features, it multiplies its weight by the provided value. - * Otherwise, it tries to find the discretized feature and adds its weight to the score. - * - * @deprecated This function is retired and we suggest to switch to addSchemaContinuousFeatures in - * SchemaBasedScoreAccumulator. - */ - @Deprecated - protected BaseLegacyScoreAccumulator addContinuousFeature(Feature feature, - double value) { - Double weightFromContinuous = model.continuousFeatures.get(feature); - if (weightFromContinuous != null) { - score += weightFromContinuous * value; - } else { - DiscretizedFeature discretizedFeature = model.discretizedFeatures.get(feature); - if (discretizedFeature != null) { - // Use only the weight of the discretized feature (there's no need to multiply it) - score += discretizedFeature.getWeight(value); - } - } - return this; - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.docx new file mode 100644 index 000000000..d8bb334dc Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.java deleted file mode 100644 index 2d4d539ee..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseModelBuilder.java +++ /dev/null @@ -1,111 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Collection; -import java.util.Comparator; -import java.util.List; - -import com.google.common.collect.Lists; - -import com.twitter.ml.api.FeatureParser; -import com.twitter.ml.api.transform.DiscretizerTransform; -import com.twitter.ml.tool.prediction.ModelInterpreter; - -/** - * The base model builder for LightweightLinearModels. 
- */ -public abstract class BaseModelBuilder implements ModelBuilder { - // Ignore features that have an absolute weight lower than this value - protected static final double MIN_WEIGHT = 1e-9; - private static final String BIAS_FIELD_NAME = ModelInterpreter.BIAS_FIELD_NAME; - static final String DISCRETIZER_NAME_SUFFIX = - "." + DiscretizerTransform.DEFAULT_FEATURE_NAME_SUFFIX; - - protected final String modelName; - protected double bias; - - public BaseModelBuilder(String modelName) { - this.modelName = modelName; - this.bias = 0.0; - } - - /** - * Collects all the ranges of a discretized feature and sorts them. - */ - static DiscretizedFeature buildFeature(Collection ranges) { - List sortedRanges = Lists.newArrayList(ranges); - sortedRanges.sort(Comparator.comparingDouble(a -> a.minValue)); - - double[] splits = new double[ranges.size()]; - double[] weights = new double[ranges.size()]; - - for (int i = 0; i < sortedRanges.size(); i++) { - splits[i] = sortedRanges.get(i).minValue; - weights[i] = sortedRanges.get(i).weight; - } - return new DiscretizedFeature(splits, weights); - } - - /** - * Parses a line from the interpreted model text file. See the javadoc of the constructor for - * more details about how to create the text file. - *

- * The file uses TSV format with 3 columns: - *

- * Model name (Generated by ML API, but ignored by this class) - * Feature definition: - * Name of the feature or definition from the MDL discretizer. - * Weight: - * Weight of the feature using LOGIT scale. - *

- * When it parses each line, it stores the weights for all the features defined in the context, - * as well as the bias, but it ignores any other feature (e.g. label, prediction or - * meta.record_weight) and features with a small absolute weight (see MIN_WEIGHT). - *

- * Example lines: - *

- * model_name bias 0.019735312089324074 - * model_name demo.binary_feature 0.06524706073105327 - * model_name demo.continuous_feature 0.0 - * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=-inf_3.58e-01 0.07155931927263737 - * model_name demo.continuous_feature.dz/dz_model=mdl/dz_range=3.58e-01_inf -0.08979256264865387 - * - * @see ModelInterpreter - * @see DiscretizerTransform - */ - @Override - public ModelBuilder parseLine(String line) { - String[] columns = line.split("\t"); - if (columns.length != 3) { - return this; - } - - // columns[0] has the model name, which we don't need - String featureName = columns[1]; - double weight = Double.parseDouble(columns[2]); - - if (BIAS_FIELD_NAME.equals(featureName)) { - bias = weight; - return this; - } - - FeatureParser parser = FeatureParser.parse(featureName); - String baseName = parser.getBaseName(); - - if (Math.abs(weight) < MIN_WEIGHT && !baseName.endsWith(DISCRETIZER_NAME_SUFFIX)) { - // skip, unless it represents a range of a discretized feature. 
- // discretized features with all zeros should also be removed, but will handle that later - return this; - } - - addFeature(baseName, weight, parser); - return this; - } - - /** - * Adds feature to the model - */ - protected abstract void addFeature(String baseName, double weight, FeatureParser parser); - - @Override - public abstract LightweightLinearModel build(); -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.docx new file mode 100644 index 000000000..9c43606a2 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.java deleted file mode 100644 index 1be1c4872..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/BaseScoreAccumulator.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -/** - * The base class for a lightweight scorer based on a model and some feature data. - * - * @param The type of feature data to be scored with - */ -public abstract class BaseScoreAccumulator { - protected final LightweightLinearModel model; - protected double score; - - public BaseScoreAccumulator(LightweightLinearModel model) { - this.model = model; - this.score = model.bias; - } - - /** - * Compute score with a model and feature data - */ - public final double scoreWith(D featureData, boolean useLogitScore) { - updateScoreWithFeatures(featureData); - return useLogitScore ? getLogitScore() : getSigmoidScore(); - } - - public final void reset() { - this.score = model.bias; - } - - /** - * Update the accumulator score with features, after this function the score should already - * be computed. 
- */ - protected abstract void updateScoreWithFeatures(D data); - - /** - * Get the already accumulated score - */ - protected final double getLogitScore() { - return score; - } - - /** - * Returns the score as a value mapped between 0 and 1. - */ - protected final double getSigmoidScore() { - return 1 / (1 + Math.exp(-score)); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.docx new file mode 100644 index 000000000..0d076e9c9 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.java deleted file mode 100644 index 5da921b13..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/CompositeFeatureContext.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.function.Supplier; -import javax.annotation.Nullable; - -import com.twitter.ml.api.FeatureContext; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; - -/** - * An object to store feature context information to build models with. 
- */ -public class CompositeFeatureContext { - // legacy static feature context - private final FeatureContext legacyContext; - // a supplier for the context (well the schema itself) of the schema-based features - private final Supplier schemaSupplier; - - public CompositeFeatureContext( - FeatureContext legacyContext, - @Nullable Supplier schemaSupplier) { - this.legacyContext = legacyContext; - this.schemaSupplier = schemaSupplier; - } - - FeatureContext getLegacyContext() { - return legacyContext; - } - - ThriftSearchFeatureSchema getFeatureSchema() { - if (schemaSupplier == null) { - throw new UnsupportedOperationException("Feature schema was not initialized"); - } - return schemaSupplier.get(); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.docx new file mode 100644 index 000000000..2219622a5 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.java deleted file mode 100644 index 7b9d84ebf..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/DecisionForestModelsManager.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.io.IOException; -import java.util.Collections; -import java.util.Map; -import java.util.function.Supplier; - -import com.google.common.base.Preconditions; - -import com.twitter.ml.api.FeatureContext; -import com.twitter.mlv2.trees.predictor.CartTree; -import com.twitter.mlv2.trees.scorer.DecisionForestScorer; -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.util.ml.models_manager.BaseModelsManager; 
- -/** - * Loads Decision Forest based models and keep them in memory. Can also be scheduled to reload - * models periodically. - * - * Note: Each instance is tied to a single {@link FeatureContext} instance. So, to load models - * for different tasks, you should use different instances of the this class. - */ -public class DecisionForestModelsManager extends BaseModelsManager> { - private static final String MODEL_FILE_NAME = "model.json"; - - private final FeatureContext featureContext; - - DecisionForestModelsManager( - Supplier> activeModelsSupplier, - FeatureContext featureContext, - boolean shouldUnloadInactiveModels, - String statsPrefix - ) { - super(activeModelsSupplier, shouldUnloadInactiveModels, statsPrefix); - this.featureContext = featureContext; - } - - @Override - public DecisionForestScorer readModelFromDirectory(AbstractFile modelBaseDir) - throws IOException { - String modelFilePath = modelBaseDir.getChild(MODEL_FILE_NAME).getPath(); - return DecisionForestScorer.createCartTreeScorer(modelFilePath, featureContext); - } - - /** - * Creates an instance that loads the models specified in a configuration file. - * - * Note that if the configuration file changes and it doesn't include a model that was present - * before, the model will be removed (i.e. it unloads models that are not active anymore). - */ - public static DecisionForestModelsManager createUsingConfigFile( - AbstractFile configFile, FeatureContext featureContext, String statsPrefix) { - Preconditions.checkArgument( - configFile.canRead(), "Config file is not readable: %s", configFile.getPath()); - return new DecisionForestModelsManager( - new ConfigSupplier(configFile), featureContext, true, statsPrefix); - } - - /** - * Creates a no-op instance. It can be used for tests or when the models are disabled. 
- */ - public static DecisionForestModelsManager createNoOp(String statsPrefix) { - return new DecisionForestModelsManager( - Collections::emptyMap, new FeatureContext(), false, statsPrefix) { - @Override - public void run() { } - }; - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.docx new file mode 100644 index 000000000..c7a995452 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.java deleted file mode 100644 index 562535c48..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeature.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Arrays; - -import com.google.common.base.Preconditions; - -/** - * Represents a continuous feature that has been discretized into a set of disjoint ranges. - * - * Each range [a, b) is represented by the lower split point (a) and its associated weight. - */ -class DiscretizedFeature { - - protected final double[] splitPoints; - protected final double[] weights; - - /** - * Creates an instance from a list of split points and their corresponding weights. - * - * @param splitPoints Lower values of the ranges. The first entry must be Double.NEGATIVE_INFINITY - * They must be sorted (in ascending order). - * @param weights Weights for the splits. 
- */ - protected DiscretizedFeature(double[] splitPoints, double[] weights) { - Preconditions.checkArgument(splitPoints.length == weights.length); - Preconditions.checkArgument(splitPoints.length > 1); - Preconditions.checkArgument(splitPoints[0] == Double.NEGATIVE_INFINITY, - "First split point must be Double.NEGATIVE_INFINITY"); - this.splitPoints = splitPoints; - this.weights = weights; - } - - public double getWeight(double value) { - // binarySearch returns (- insertionPoint - 1) - int index = Math.abs(Arrays.binarySearch(splitPoints, value) + 1) - 1; - return weights[index]; - } - - public boolean allValuesBelowThreshold(double minWeight) { - for (double weight : weights) { - if (Math.abs(weight) > minWeight) { - return false; - } - } - return true; - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.docx new file mode 100644 index 000000000..c47873294 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.java deleted file mode 100644 index 725009ab0..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/DiscretizedFeatureRange.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import com.google.common.base.Preconditions; - -/** - * The discretized value range for a continous feature. After discretization a continuous feature - * may become multiple discretized binary features, each occupying a range. This class stores this - * range and a weight for it. 
- */ -public class DiscretizedFeatureRange { - protected final double minValue; - protected final double maxValue; - protected final double weight; - - DiscretizedFeatureRange(double weight, String range) { - String[] limits = range.split("_"); - Preconditions.checkArgument(limits.length == 2); - - this.minValue = parseRangeValue(limits[0]); - this.maxValue = parseRangeValue(limits[1]); - this.weight = weight; - } - - private static double parseRangeValue(String value) { - if ("inf".equals(value)) { - return Double.POSITIVE_INFINITY; - } else if ("-inf".equals(value)) { - return Double.NEGATIVE_INFINITY; - } else { - return Double.parseDouble(value); - } - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.docx new file mode 100644 index 000000000..82c51ebfb Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.java deleted file mode 100644 index 4cb87f556..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/LegacyModelBuilder.java +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Map; - -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Maps; -import com.google.common.collect.Multimap; - -import com.twitter.ml.api.Feature; -import com.twitter.ml.api.FeatureContext; -import com.twitter.ml.api.FeatureParser; -import com.twitter.ml.api.transform.DiscretizerTransform; - -/** - * The builder for a model based on the legacy (non-schema-based) features. - * See also SchemaBasedModelBuilder. 
- */ -public final class LegacyModelBuilder extends BaseModelBuilder { - - private final Map featuresByName; - // for legacy features - private final Map, Double> binaryFeatures; - private final Map, Double> continuousFeatures; - private final Multimap, DiscretizedFeatureRange> discretizedFeatureRanges; - - LegacyModelBuilder(String modelName, FeatureContext context) { - super(modelName); - featuresByName = getFeaturesByName(context); - binaryFeatures = Maps.newHashMap(); - continuousFeatures = Maps.newHashMap(); - discretizedFeatureRanges = HashMultimap.create(); - } - - private static Map getFeaturesByName(FeatureContext featureContext) { - Map featuresByName = Maps.newHashMap(); - for (Feature feature : featureContext.getAllFeatures()) { - featuresByName.put(feature.getFeatureName(), feature); - } - return featuresByName; - } - - @Override - protected void addFeature(String baseName, double weight, FeatureParser parser) { - Feature feature = featuresByName.get(baseName); - if (feature != null) { - switch (feature.getFeatureType()) { - case BINARY: - binaryFeatures.put(feature, weight); - break; - case CONTINUOUS: - continuousFeatures.put(feature, weight); - break; - default: - throw new IllegalArgumentException( - String.format("Unsupported feature type: %s", feature)); - } - } else if (baseName.endsWith(DISCRETIZER_NAME_SUFFIX) - && parser.getExtension().containsKey(DiscretizerTransform.DEFAULT_RANGE_EXT)) { - - String featureName = - baseName.substring(0, baseName.length() - DISCRETIZER_NAME_SUFFIX.length()); - - feature = featuresByName.get(featureName); - if (feature == null) { - return; - } - - String rangeSpec = parser.getExtension().get(DiscretizerTransform.DEFAULT_RANGE_EXT); - discretizedFeatureRanges.put(feature, new DiscretizedFeatureRange(weight, rangeSpec)); - } - } - - @Override - public LightweightLinearModel build() { - Map, DiscretizedFeature> discretizedFeatures = Maps.newHashMap(); - for (Feature feature : discretizedFeatureRanges.keySet()) { 
- DiscretizedFeature discretizedFeature = - BaseModelBuilder.buildFeature(discretizedFeatureRanges.get(feature)); - if (!discretizedFeature.allValuesBelowThreshold(MIN_WEIGHT)) { - discretizedFeatures.put(feature, discretizedFeature); - } - } - return LightweightLinearModel.createForLegacy( - modelName, bias, binaryFeatures, continuousFeatures, discretizedFeatures); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.docx new file mode 100644 index 000000000..6355ce4e9 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.java deleted file mode 100644 index 57324120b..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/LightweightLinearModel.java +++ /dev/null @@ -1,187 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.io.BufferedReader; -import java.io.FileReader; -import java.io.IOException; -import java.util.Map; -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; - -import com.twitter.ml.api.Feature; -import com.twitter.search.common.file.AbstractFile; - -/** - * Provides an interface to the weights associated to the features of a linear model trained - * with Prediction Engine. - * - * This class is used along with ScoreAccumulator to efficiently score instances. It supports only - * a limited set of features: - * - * - Only linear models are supported. - * - Only binary and continuous features (i.e. it doesn't support discrete/categorical features). - * - It supports the MDL discretizer (but not the one based on trees). - * - It doesn't support feature crossings. 
- * - * Instances of this class should be created using only the load methods (loadFromHdfs and - * loadFromLocalFile). - * - * IMPORTANT: - * - * Use this class, and ScoreAccumulator, ONLY when runtime is a major concern. Otherwise, consider - * using Prediction Engine as a library. Ideally, we should access directly the structures that - * Prediction Engine creates when it loads a model, instead of parsing a text file with the - * feature weights. - * - * The discretized feature bins created by MDL may be too fine to be displayed properly in the - * parsed text file and there may be bins with the same min value. A binary search finding the - * bin for a same feature value therefore may end up with different bins/scores in different runs, - * producing unstable scores. See SEARCHQUAL-15957 for more detail. - * - * @see com.twitter.ml.tool.prediction.ModelInterpreter - */ -public class LightweightLinearModel { - protected final double bias; - protected final boolean schemaBased; - protected final String name; - - // for legacy metadata based model - protected final Map, Double> binaryFeatures; - protected final Map, Double> continuousFeatures; - protected final Map, DiscretizedFeature> discretizedFeatures; - - // for schema-based model - protected final Map binaryFeaturesById; - protected final Map continuousFeaturesById; - protected final Map discretizedFeaturesById; - - private static final String SCHEMA_BASED_SUFFIX = ".schema_based"; - - LightweightLinearModel( - String modelName, - double bias, - boolean schemaBased, - @Nullable Map, Double> binaryFeatures, - @Nullable Map, Double> continuousFeatures, - @Nullable Map, DiscretizedFeature> discretizedFeatures, - @Nullable Map binaryFeaturesById, - @Nullable Map continuousFeaturesById, - @Nullable Map discretizedFeaturesById) { - - this.name = modelName; - this.bias = bias; - this.schemaBased = schemaBased; - - // legacy feature maps - this.binaryFeatures = - schemaBased ? 
null : Preconditions.checkNotNull(binaryFeatures); - this.continuousFeatures = - schemaBased ? null : Preconditions.checkNotNull(continuousFeatures); - this.discretizedFeatures = - schemaBased ? null : Preconditions.checkNotNull(discretizedFeatures); - - // schema based feature maps - this.binaryFeaturesById = - schemaBased ? Preconditions.checkNotNull(binaryFeaturesById) : null; - this.continuousFeaturesById = - schemaBased ? Preconditions.checkNotNull(continuousFeaturesById) : null; - this.discretizedFeaturesById = - schemaBased ? Preconditions.checkNotNull(discretizedFeaturesById) : null; - } - - public String getName() { - return name; - } - - /** - * Create model for legacy features - */ - protected static LightweightLinearModel createForLegacy( - String modelName, - double bias, - Map, Double> binaryFeatures, - Map, Double> continuousFeatures, - Map, DiscretizedFeature> discretizedFeatures) { - return new LightweightLinearModel(modelName, bias, false, - binaryFeatures, continuousFeatures, discretizedFeatures, - null, null, null); - } - - /** - * Create model for schema-based features - */ - protected static LightweightLinearModel createForSchemaBased( - String modelName, - double bias, - Map binaryFeaturesById, - Map continuousFeaturesById, - Map discretizedFeaturesById) { - return new LightweightLinearModel(modelName, bias, true, - null, null, null, - binaryFeaturesById, continuousFeaturesById, discretizedFeaturesById); - } - - public boolean isSchemaBased() { - return schemaBased; - } - - /** - * Loads a model from a text file. - * - * See the javadoc of the constructor for more details on how to create the file from a trained - * Prediction Engine model. - * - * If schemaBased is true, the featureContext is ignored. - */ - public static LightweightLinearModel load( - String modelName, - BufferedReader reader, - boolean schemaBased, - CompositeFeatureContext featureContext) throws IOException { - - ModelBuilder builder = schemaBased - ? 
new SchemaBasedModelBuilder(modelName, featureContext.getFeatureSchema()) - : new LegacyModelBuilder(modelName, featureContext.getLegacyContext()); - String line; - while ((line = reader.readLine()) != null) { - builder.parseLine(line); - } - return builder.build(); - } - - /** - * Loads a model from a local text file. - * - * See the javadoc of the constructor for more details on how to create the file from a trained - * Prediction Engine model. - */ - public static LightweightLinearModel loadFromLocalFile( - String modelName, - CompositeFeatureContext featureContext, - String fileName) throws IOException { - try (BufferedReader reader = new BufferedReader(new FileReader(fileName))) { - boolean schemaBased = modelName.endsWith(SCHEMA_BASED_SUFFIX); - return load(modelName, reader, schemaBased, featureContext); - } - } - - /** - * Loads a model from a file in the local filesystem or in HDFS. - * - * See the javadoc of the constructor for more details on how to create the file from a trained - * Prediction Engine model. - */ - public static LightweightLinearModel load( - String modelName, CompositeFeatureContext featureContext, AbstractFile modelFile) - throws IOException { - try (BufferedReader reader = modelFile.getCharSource().openBufferedStream()) { - boolean schemaBased = modelName.endsWith(SCHEMA_BASED_SUFFIX); - return load(modelName, reader, schemaBased, featureContext); - } - } - - public String toString() { - return String.format("LightweightLinearModel. 
{bias=%s binary=%s continuous=%s discrete=%s}", - this.bias, this.binaryFeatures, this.continuousFeatures, this.discretizedFeatures); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.docx new file mode 100644 index 000000000..3019f7b23 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.java deleted file mode 100644 index f0c6612a5..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelBuilder.java +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -/** - * A builder interface to build a LightweightLinearModel. - */ -public interface ModelBuilder { - /** - * parses a line of the model file and updates the build state - */ - ModelBuilder parseLine(String line); - - /** - * builds the model - */ - LightweightLinearModel build(); -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.docx new file mode 100644 index 000000000..43768b3b3 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.java deleted file mode 100644 index 7809161b0..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/ModelLoader.java +++ /dev/null @@ -1,178 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -import 
com.google.common.base.Optional; -import com.google.common.base.Supplier; -import com.google.common.base.Suppliers; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.file.FileUtils; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; - -/** - * Loads LightweightLinearModel objects from a directory and provides an interface for reloading - * them periodically. - * - * All the models must support the same features (defined by a FeatureContext) and they are - * identified by the name of the subdirectory. This is the required directory structure: - * - * /path/to/base-directory - * one-model/model.tsv - * another-model/model.tsv - * experimental-model/model.tsv - * - * Each subdirectory must contain a file named 'model.tsv' in the format required by - * LightweightLinearModel. - */ -public class ModelLoader implements Runnable { - - private static final Logger LOG = LoggerFactory.getLogger(ModelLoader.class); - private static final String MODEL_FILE_NAME = "model.tsv"; - - private final CompositeFeatureContext featureContext; - private final Supplier directorySupplier; - - private final Map models; - private final Map lastModifiedMsByModel; - - private final SearchLongGauge lastModelLoadedAtMs; - private final SearchLongGauge numModels; - private final SearchCounter numLoads; - private final SearchCounter numErrors; - - /** - * Creates a new instance for a feature context and a base directory. - * - * It exports 4 counters: - * - * ${counterPrefix}_last_loaded: - * Timestamp (in ms) when the last model was loaded. - * ${counterPrefix}_num_models: - * Number of models currently loaded. - * ${counterPrefix}_num_loads: - * Number of succesful model loads. 
- * ${counterPrefix}_num_errors: - * Number of errors occurred while loading the models. - */ - protected ModelLoader( - CompositeFeatureContext featureContext, - Supplier directorySupplier, - String counterPrefix, - SearchStatsReceiver statsReceiver) { - this.featureContext = featureContext; - - // This function returns the base directory every time we call 'run'. We use a function instead - // of using directly an AbstractFile instance, in case that we can't obtain an instance at - // initialization time (e.g. if there's an issue with HDFS). - this.directorySupplier = directorySupplier; - this.models = Maps.newConcurrentMap(); - this.lastModifiedMsByModel = Maps.newConcurrentMap(); - - this.lastModelLoadedAtMs = statsReceiver.getLongGauge(counterPrefix + "last_loaded"); - this.numModels = statsReceiver.getLongGauge(counterPrefix + "num_models"); - this.numLoads = statsReceiver.getCounter(counterPrefix + "num_loads"); - this.numErrors = statsReceiver.getCounter(counterPrefix + "num_errors"); - } - - public Optional getModel(String name) { - return Optional.fromNullable(models.get(name)); - } - - /** - * Loads the models from the base directory. - * - * It doesn't load a model if its file has not been modified since the last time it was loaded. - * - * This method doesn't delete previously loaded models if their directories are not available. - */ - @Override - public void run() { - try { - AbstractFile baseDirectory = directorySupplier.get(); - List modelDirectories = - Lists.newArrayList(baseDirectory.listFiles(IS_MODEL_DIR)); - for (AbstractFile directory : modelDirectories) { - try { - // Note that the modelName is the directory name, if it ends with ".schema_based", the - // model will be loaded as a schema-based model. 
- String modelName = directory.getName(); - AbstractFile modelFile = directory.getChild(MODEL_FILE_NAME); - long currentLastModified = modelFile.getLastModified(); - Long lastModified = lastModifiedMsByModel.get(modelName); - if (lastModified == null || lastModified < currentLastModified) { - LightweightLinearModel model = - LightweightLinearModel.load(modelName, featureContext, modelFile); - if (!models.containsKey(modelName)) { - LOG.info("Loading model {}.", modelName); - } - models.put(modelName, model); - lastModifiedMsByModel.put(modelName, currentLastModified); - lastModelLoadedAtMs.set(System.currentTimeMillis()); - numLoads.increment(); - LOG.debug("Model: {}", model); - } else { - LOG.debug("Directory for model {} has not changed.", modelName); - } - } catch (Exception e) { - LOG.error("Error loading model from directory: " + directory.getPath(), e); - this.numErrors.increment(); - } - } - if (numModels.get() != models.size()) { - LOG.info("Finished loading models. Model names: {}", models.keySet()); - } - this.numModels.set(models.size()); - } catch (IOException e) { - LOG.error("Error loading models", e); - this.numErrors.increment(); - } - } - - /** - * Creates an instance that loads models from a directory (local or from HDFS). - */ - public static ModelLoader forDirectory( - final AbstractFile directory, - CompositeFeatureContext featureContext, - String counterPrefix, - SearchStatsReceiver statsReceiver) { - Supplier directorySupplier = Suppliers.ofInstance(directory); - return new ModelLoader(featureContext, directorySupplier, counterPrefix, statsReceiver); - } - - /** - * Creates an instance that loads models from HDFS. 
- */ - public static ModelLoader forHdfsDirectory( - final String nameNode, - final String directory, - CompositeFeatureContext featureContext, - String counterPrefix, - SearchStatsReceiver statsReceiver) { - Supplier directorySupplier = - () -> FileUtils.getHdfsFileHandle(directory, nameNode); - return new ModelLoader(featureContext, directorySupplier, counterPrefix, statsReceiver); - } - - private static final AbstractFile.Filter IS_MODEL_DIR = file -> { - try { - if (file.isDirectory()) { - AbstractFile modelFile = file.getChild(MODEL_FILE_NAME); - return (modelFile != null) && modelFile.canRead(); - } - } catch (IOException e) { - LOG.error("Error reading file: " + file, e); - } - return false; - }; -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.docx new file mode 100644 index 000000000..5abb76611 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.java deleted file mode 100644 index b2a96fd42..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/PredictionEngineModelsManager.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Collections; -import java.util.Map; -import java.util.function.Supplier; - -import com.google.common.base.Preconditions; - -import com.twitter.ml.prediction.core.PredictionEngine; -import com.twitter.ml.prediction.core.PredictionEngineFactory; -import com.twitter.ml.prediction.core.PredictionEngineLoadingException; -import com.twitter.ml.vw.constant.SnapshotConstants; -import 
com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.util.ml.models_manager.BaseModelsManager; - -/** - * Loads PredictionEngine models from a model provider (config or fixed directory) - * and keeps them in memory. Can also reload models periodically by querying the - * same model provider source. - */ -public class PredictionEngineModelsManager extends BaseModelsManager { - - PredictionEngineModelsManager( - Supplier> activeModelsSupplier, - boolean shouldUnloadInactiveModels, - String statsPrefix) { - super(activeModelsSupplier, shouldUnloadInactiveModels, statsPrefix); - } - - @Override - public PredictionEngine readModelFromDirectory(AbstractFile modelBaseDir) - throws PredictionEngineLoadingException { - // We need to add the 'hdfs://' prefix, otherwise PredictionEngine will treat it as a - // path in the local filesystem. - PredictionEngine predictionEngine = new PredictionEngineFactory() - .createFromSnapshot( - "hdfs://" + modelBaseDir.getPath(), SnapshotConstants.FIXED_PATH); - - predictionEngine.initialize(); - - return predictionEngine; - } - - /** - * Creates an instance that loads the models specified in a configuration file. - * - * Note that if the configuration file changes and it doesn't include a model that was present - * before, the model will be removed (i.e. it unloads models that are not active anymore). - */ - public static PredictionEngineModelsManager createUsingConfigFile( - AbstractFile configFile, String statsPrefix) { - Preconditions.checkArgument( - configFile.canRead(), "Config file is not readable: %s", configFile.getPath()); - return new PredictionEngineModelsManager(new ConfigSupplier(configFile), true, statsPrefix); - } - - /** - * Creates a no-op instance. It can be used for tests or when the models are disabled. 
- */ - public static PredictionEngineModelsManager createNoOp(String statsPrefix) { - return new PredictionEngineModelsManager(Collections::emptyMap, false, statsPrefix) { - @Override - public void run() { } - }; - } - -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.docx new file mode 100644 index 000000000..855bd5ef2 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.java deleted file mode 100644 index 3b8483f1c..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedModelBuilder.java +++ /dev/null @@ -1,105 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Map; -import java.util.stream.Collectors; - -import com.google.common.collect.HashMultimap; -import com.google.common.collect.Maps; -import com.google.common.collect.Multimap; - -import com.twitter.ml.api.FeatureParser; -import com.twitter.ml.api.transform.DiscretizerTransform; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaEntry; - -/** - * Builds a model with schema-based features, here all features are tracked by Id. - * This class is very similar to LegacyModelBuilder, which will eventually be deprecated. 
- */ -public class SchemaBasedModelBuilder extends BaseModelBuilder { - private final Map featuresByName; - private final Map binaryFeatures; - private final Map continuousFeatures; - private final Multimap discretizedFeatureRanges; - - /** - * a class to hold feature information - */ - static class FeatureData { - private final ThriftSearchFeatureSchemaEntry entry; - private final int id; - - public FeatureData(ThriftSearchFeatureSchemaEntry entry, int id) { - this.entry = entry; - this.id = id; - } - } - - SchemaBasedModelBuilder(String modelName, ThriftSearchFeatureSchema featureSchema) { - super(modelName); - featuresByName = getFeatureDataMap(featureSchema); - binaryFeatures = Maps.newHashMap(); - continuousFeatures = Maps.newHashMap(); - discretizedFeatureRanges = HashMultimap.create(); - } - - /** - * Creates a map from feature name to thrift entries - */ - private static Map getFeatureDataMap( - ThriftSearchFeatureSchema schema) { - return schema.getEntries().entrySet().stream() - .collect(Collectors.toMap( - e -> e.getValue().getFeatureName(), - e -> new FeatureData(e.getValue(), e.getKey()) - )); - } - - @Override - protected void addFeature(String baseName, double weight, FeatureParser parser) { - FeatureData feature = featuresByName.get(baseName); - if (feature != null) { - switch (feature.entry.getFeatureType()) { - case BOOLEAN_VALUE: - binaryFeatures.put(feature.id, weight); - break; - case INT32_VALUE: - case LONG_VALUE: - case DOUBLE_VALUE: - continuousFeatures.put(feature.id, weight); - break; - default: - // other values are not supported yet - throw new IllegalArgumentException( - String.format("Unsupported feature type: %s", feature)); - } - } else if (baseName.endsWith(DISCRETIZER_NAME_SUFFIX) - && parser.getExtension().containsKey(DiscretizerTransform.DEFAULT_RANGE_EXT)) { - - String featureName = - baseName.substring(0, baseName.length() - DISCRETIZER_NAME_SUFFIX.length()); - - feature = featuresByName.get(featureName); - if (feature == 
null) { - return; - } - - String rangeSpec = parser.getExtension().get(DiscretizerTransform.DEFAULT_RANGE_EXT); - discretizedFeatureRanges.put(feature.id, new DiscretizedFeatureRange(weight, rangeSpec)); - } - } - - @Override - public LightweightLinearModel build() { - Map discretizedFeatures = Maps.newHashMap(); - for (Integer feature : discretizedFeatureRanges.keySet()) { - DiscretizedFeature discretizedFeature = - BaseModelBuilder.buildFeature(discretizedFeatureRanges.get(feature)); - if (!discretizedFeature.allValuesBelowThreshold(MIN_WEIGHT)) { - discretizedFeatures.put(feature, discretizedFeature); - } - } - return LightweightLinearModel.createForSchemaBased( - modelName, bias, binaryFeatures, continuousFeatures, discretizedFeatures); - } -} diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.docx b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.docx new file mode 100644 index 000000000..252195579 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.java b/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.java deleted file mode 100644 index 68742211f..000000000 --- a/src/java/com/twitter/search/common/util/ml/prediction_engine/SchemaBasedScoreAccumulator.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.search.common.util.ml.prediction_engine; - -import java.util.Map; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures; -import com.twitter.search.modeling.common.TweetFeaturesUtils; - -/** - * Score accumulator for schema-based features. 
- */ -public class SchemaBasedScoreAccumulator extends BaseScoreAccumulator { - - public SchemaBasedScoreAccumulator(LightweightLinearModel model) { - super(model); - Preconditions.checkState(model.isSchemaBased(), - "Cannot create SchemaBasedScoreAccumulator with a non-schema-based model: %s", - model.getName()); - } - - @Override - protected final void updateScoreWithFeatures(ThriftSearchResultFeatures featureData) { - // go through all features available and apply all those available in the model - addSchemaBooleanFeatures(featureData.getBoolValues()); - addSchemaContinuousFeatures(featureData.getIntValues()); - addSchemaContinuousFeatures(featureData.getLongValues()); - addSchemaContinuousFeatures(featureData.getDoubleValues()); - } - - private void addSchemaBooleanFeatures(Map booleanMap) { - if (booleanMap == null || booleanMap.isEmpty()) { - return; - } - for (Map.Entry entry : booleanMap.entrySet()) { - if (entry.getValue()) { - score += model.binaryFeaturesById.getOrDefault(entry.getKey(), 0.0); - } - } - } - - private void addSchemaContinuousFeatures(Map valueMap) { - if (valueMap == null || valueMap.isEmpty()) { - return; - } - for (Map.Entry entry : valueMap.entrySet()) { - Integer id = entry.getKey(); - if (TweetFeaturesUtils.isFeatureDiscrete(id)) { - continue; // we don't process any discrete features now - } - Double weight = model.continuousFeaturesById.get(id); - if (weight != null) { - // found non-discretized entry - score += weight * entry.getValue().doubleValue(); - } else { - DiscretizedFeature discretizedFeature = model.discretizedFeaturesById.get(id); - if (discretizedFeature != null) { - // Use only the weight of the discretized feature (there's no need to multiply it) - score += discretizedFeature.getWeight(entry.getValue().doubleValue()); - } - } - } - } -} diff --git a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD deleted file mode 100644 index 
56923850e..000000000 --- a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/slf4j:slf4j-api", - "3rdparty/jvm/org/tensorflow", - "finatra/inject/inject-slf4j/src/main/scala/com/twitter/inject", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util/ml/models_manager", - "src/thrift/com/twitter/search/common:features-java", - "tensorflow/tfcompute-java/src/main/java/com/twitter/tfcompute_java", - "twml/runtime/src/main/scala/com/twitter/twml/runtime/lib", - "twml/runtime/src/main/scala/com/twitter/twml/runtime/models", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD.docx b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD.docx new file mode 100644 index 000000000..f18644d05 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/BUILD.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.docx b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.docx new file mode 100644 index 000000000..4953465d7 Binary files /dev/null and b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.docx differ diff --git a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.java b/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.java deleted file mode 100644 index 3028a3395..000000000 --- a/src/java/com/twitter/search/common/util/ml/tensorflow_engine/TensorflowModelsManager.java +++ /dev/null @@ -1,189 
+0,0 @@ -package com.twitter.search.common.util.ml.tensorflow_engine; - -import java.io.IOException; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; -import java.util.function.Supplier; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.tensorflow.SavedModelBundle; -import org.tensorflow.Session; - -import com.twitter.ml.api.FeatureUtil; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchemaEntry; -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.util.ml.models_manager.BaseModelsManager; -import com.twitter.tfcompute_java.TFModelRunner; -import com.twitter.tfcompute_java.TFSessionInit; -import com.twitter.twml.runtime.lib.TwmlLoader; -import com.twitter.twml.runtime.models.ModelLocator; -import com.twitter.twml.runtime.models.ModelLocator$; -import com.twitter.util.Await; - -/** - * TensorflowModelsManager manages the lifecyle of TF models. 
- */ -public class TensorflowModelsManager extends BaseModelsManager { - - private static final Logger LOG = LoggerFactory.getLogger(TensorflowModelsManager.class); - - private static final String[] TF_TAGS = new String[] {"serve"}; - - private volatile Map featureSchemaIdToMlApiId = new HashMap(); - - static { - TwmlLoader.load(); - } - - public static final TensorflowModelsManager NO_OP_MANAGER = - createNoOp("no_op_manager"); - - public TensorflowModelsManager( - Supplier> activeModelsSupplier, - boolean shouldUnloadInactiveModels, - String statsPrefix - ) { - this( - activeModelsSupplier, - shouldUnloadInactiveModels, - statsPrefix, - () -> true, - () -> true, - null - ); - } - - public TensorflowModelsManager( - Supplier> activeModelsSupplier, - boolean shouldUnloadInactiveModels, - String statsPrefix, - Supplier serveModels, - Supplier loadModels, - DynamicSchema dynamicSchema - ) { - super( - activeModelsSupplier, - shouldUnloadInactiveModels, - statsPrefix, - serveModels, - loadModels - ); - if (dynamicSchema != null) { - updateFeatureSchemaIdToMlIdMap(dynamicSchema.getSearchFeatureSchema()); - } - } - - /** - * The ML API feature ids for tensorflow scoring are hashes of their feature names. This hashing - * could be expensive to do for every search request. Instead, allow the map from schema feature - * id to ML API id to be updated whenever the schema is reloaded. 
- */ - public void updateFeatureSchemaIdToMlIdMap(ThriftSearchFeatureSchema schema) { - HashMap newFeatureSchemaIdToMlApiId = new HashMap(); - Map featureEntries = schema.getEntries(); - for (Map.Entry entry : featureEntries.entrySet()) { - long mlApiFeatureId = FeatureUtil.featureIdForName(entry.getValue().getFeatureName()); - newFeatureSchemaIdToMlApiId.put(entry.getKey(), mlApiFeatureId); - } - - featureSchemaIdToMlApiId = newFeatureSchemaIdToMlApiId; - } - - public Map getFeatureSchemaIdToMlApiId() { - return featureSchemaIdToMlApiId; - } - - /** - * If the manager is not enabled, it won't fetch TF models. - */ - public boolean isEnabled() { - return true; - } - - /** - * Load an individual model and make it available for inference. - */ - public TFModelRunner readModelFromDirectory( - AbstractFile modelDir) throws IOException { - - ModelLocator modelLocator = - ModelLocator$.MODULE$.apply( - modelDir.toString(), - modelDir.toURI() - ); - - try { - Await.result(modelLocator.ensureLocalPresent(true)); - } catch (Exception e) { - LOG.error("Couldn't find model " + modelDir.toString(), e); - throw new IOException("Couldn't find model " + modelDir.toString()); - } - - Session session = SavedModelBundle.load(modelLocator.localPath(), TF_TAGS).session(); - - return new TFModelRunner(session); - } - - - /** - * Initialize Tensorflow intra and inter op thread pools. - * See `ConfigProto.[intra|inter]_op_parallelism_threads` documentation for more information: - * https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/protobuf/config.proto - * Initialization should happen only once. - * Default values for Tensorflow are: - * intraOpParallelismThreads = 0 which means that TF will pick an appropriate default. - * interOpParallelismThreads = 0 which means that TF will pick an appropriate default. - * operation_timeout_in_ms = 0 which means that no timeout will be applied. 
- */ - public static void initTensorflowThreadPools( - int intraOpParallelismThreads, - int interOpParallelismThreads) { - new TFSessionInit(intraOpParallelismThreads, interOpParallelismThreads, 0); - } - - /** - * Creates a no-op instance. It can be used for tests or when the models are disabled. - */ - public static TensorflowModelsManager createNoOp(String statsPrefix) { - return new TensorflowModelsManager(Collections::emptyMap, false, statsPrefix) { - @Override - public void run() { } - - @Override - public boolean isEnabled() { - return false; - } - - @Override - public void updateFeatureSchemaIdToMlIdMap(ThriftSearchFeatureSchema schema) { } - }; - } - - /** - * Creates an instance that loads the models based on a ConfigSupplier. - */ - public static TensorflowModelsManager createUsingConfigFile( - AbstractFile configFile, - boolean shouldUnloadInactiveModels, - String statsPrefix, - Supplier serveModels, - Supplier loadModels, - DynamicSchema dynamicSchema) { - Preconditions.checkArgument( - configFile.canRead(), "Config file is not readable: %s", configFile.getPath()); - return new TensorflowModelsManager( - new ConfigSupplier(configFile), - shouldUnloadInactiveModels, - statsPrefix, - serveModels, - loadModels, - dynamicSchema - ); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/BUILD b/src/java/com/twitter/search/core/earlybird/BUILD deleted file mode 100644 index a8432dfe0..000000000 --- a/src/java/com/twitter/search/core/earlybird/BUILD +++ /dev/null @@ -1,38 +0,0 @@ -java_library( - sources = ["**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - 
"3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/facets", - "src/java/com/twitter/search/common/hashtable", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/search", - "src/java/com/twitter/search/common/util:log_format_util", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/hash", - "src/java/com/twitter/search/common/util/io:flushable", - "src/thrift/com/twitter/search/common:constants-java", - "src/thrift/com/twitter/search/common:facets-java", - "src/thrift/com/twitter/search/common:schema-java", - ], -) diff --git a/src/java/com/twitter/search/core/earlybird/BUILD.docx b/src/java/com/twitter/search/core/earlybird/BUILD.docx new file mode 100644 index 000000000..c3065e4d2 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/BUILD.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/README.docx b/src/java/com/twitter/search/core/earlybird/README.docx new file mode 100644 index 000000000..9b4a310b8 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/README.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/README.md b/src/java/com/twitter/search/core/earlybird/README.md deleted file mode 100644 index 337d327fb..000000000 --- a/src/java/com/twitter/search/core/earlybird/README.md 
+++ /dev/null @@ -1,21 +0,0 @@ -# Search Index (Earlybird) core classes - -> **TL;DR** Earlybird (Search Index) find tweets from people you follow, rank them, and serve tweets to Home. - -## What is Earlybird (Search Index) - -[Earlybird](http://notes.stephenholiday.com/Earlybird.pdf) is a **real-time search system** based on [Apache Lucene](https://lucene.apache.org/) to support the high volume of queries and content updates. The major use cases are Relevance Search (specifically, Text search) and Timeline In-network Tweet retrieval (or UserID based search). It is designed to enable the efficient indexing and querying of billions of tweets, and to provide low-latency search results, even with heavy query loads. - -## Directory Structure -The project consists of several packages and files, which can be summarized as follows: - - -* `facets/`: This subdirectory contains classes responsible for facet counting and processing. Some key classes include EarlybirdFacets, EarlybirdFacetsFactory, FacetAccumulator, and FacetCountAggregator. The classes handle facet counting, facet iterators, facet label providers, and facet response rewriting. -* `index/`: This directory contains the indexing and search infra files, with several subdirectories for specific components. - * `column/`: This subdirectory contains classes related to column-stride field indexes, including ColumnStrideByteIndex, ColumnStrideIntIndex, ColumnStrideLongIndex, and various optimized versions of these indexes. These classes deal with managing and updating doc values. - * `extensions/`: This subdirectory contains classes for index extensions, including EarlybirdIndexExtensionsData, EarlybirdIndexExtensionsFactory, and EarlybirdRealtimeIndexExtensionsData. - * `inverted/`: This subdirectory focuses on the inverted index and its components, such as InMemoryFields, IndexOptimizer, InvertedIndex, and InvertedRealtimeIndex. 
It also contains classes for managing and processing posting lists and term dictionaries, like EarlybirdPostingsEnum, FSTTermDictionary, and MPHTermDictionary. - * `util/`: This subdirectory contains utility classes for managing search iterators and filters, such as AllDocsIterator, RangeDISI, RangeFilterDISI, and SearchSortUtils. The system appears to be designed to handle search indexing and facet counting efficiently. Key components include an inverted index, various types of posting lists, and term dictionaries. Facet counting and processing is handled by specialized classes within the facets subdirectory. The overall structure indicates a well-organized and modular search indexing system that can be maintained and extended as needed. - -## Related Services -* The Earlybirds main classes. See `src/java/com/twitter/search/earlybird/` diff --git a/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.docx b/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.docx new file mode 100644 index 000000000..62e1bbc96 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.java b/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.java deleted file mode 100644 index c587c5470..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/AbstractFacetCountingArray.java +++ /dev/null @@ -1,231 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.Flushable; -import 
com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -/** - * AbstractFacetCountingArray implements a lookup from a doc ID to an unordered list of facets. - * A facet is a pair of (term ID, field ID), which could represent, - * for example ("http://twitter.com", "links"). - * - * Internally, we have two data structures: A map from doc ID to an int and a pool of ints. We refer - * to the values contained in these structures as packed values. A packed value can either be a - * pointer to a location in the pool, an encoded facet or a sentinel value. Pointers always have - * their high bit set to 1. - * - * If a document has just one facet, we will store the encoded facet in the map, and nothing in the - * pool. Otherwise, the map will contain a pointer into the int pool. - * - * The int pool is encoded in a block-allocated linked list. - * See {@link AbstractFacetCountingArray#collectForDocId} for details on how to traverse the list. 
- */ -public abstract class AbstractFacetCountingArray implements Flushable { - private static final Logger LOG = LoggerFactory.getLogger(AbstractFacetCountingArray.class); - - private static final FacetCountIterator EMPTY_ITERATOR = new FacetCountIterator() { - @Override - public void collect(int docID) { - // noop - } - }; - - public static final AbstractFacetCountingArray EMPTY_ARRAY = new AbstractFacetCountingArray() { - @Override - public final FacetCountIterator getIterator(EarlybirdIndexSegmentAtomicReader reader, - FacetCountState countState, - FacetCountIteratorFactory iteratorFactory) { - return EMPTY_ITERATOR; - } - - @Override - public final int getFacet(int docID) { - return UNASSIGNED; - } - - @Override - public final void setFacet(int docID, int facetID) { - } - - @Override - public final AbstractFacetCountingArray rewriteAndMapIDs( - Map termIDMapper, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) { - return this; - } - - @Override - public Handler getFlushHandler() { - return null; - } - }; - - protected class ArrayFacetCountIterator extends FacetCountIterator { - @Override - public void collect(int docID) { - collectForDocId(docID, this); - } - } - - private static final int NUM_BITS_TERM_ID = 27; - private static final int TERM_ID_MASK = (1 << NUM_BITS_TERM_ID) - 1; - - private static final int NUM_BITS_FIELD_ID = 4; - private static final int FIELD_ID_MASK = (1 << NUM_BITS_FIELD_ID) - 1; - - private static final int HIGHEST_ORDER_BIT = Integer.MIN_VALUE; // 1L << 31 - private static final int HIGHEST_ORDER_BIT_INVERSE_MASK = HIGHEST_ORDER_BIT - 1; - - protected static final int UNASSIGNED = Integer.MAX_VALUE; - - protected static final int decodeTermID(int facetID) { - if (facetID != UNASSIGNED) { - int termID = facetID & TERM_ID_MASK; - return termID; - } - - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - protected static final int decodeFieldID(int facetID) { - return (facetID >>> 
NUM_BITS_TERM_ID) & FIELD_ID_MASK; - } - - protected static final int encodeFacetID(int fieldID, int termID) { - return ((fieldID & FIELD_ID_MASK) << NUM_BITS_TERM_ID) | (termID & TERM_ID_MASK); - } - - protected static final int decodePointer(int value) { - return value & HIGHEST_ORDER_BIT_INVERSE_MASK; - } - - protected static final int encodePointer(int value) { - return value | HIGHEST_ORDER_BIT; - } - - protected static final boolean isPointer(int value) { - return (value & HIGHEST_ORDER_BIT) != 0; - } - - private final IntBlockPool facetsPool; - - protected AbstractFacetCountingArray() { - facetsPool = new IntBlockPool("facets"); - } - - protected AbstractFacetCountingArray(IntBlockPool facetsPool) { - this.facetsPool = facetsPool; - } - - /** - * Returns an iterator to iterate all docs/facets stored in this FacetCountingArray. - */ - public FacetCountIterator getIterator( - EarlybirdIndexSegmentAtomicReader reader, - FacetCountState countState, - FacetCountIteratorFactory iteratorFactory) { - Preconditions.checkNotNull(countState); - Preconditions.checkNotNull(reader); - - List iterators = new ArrayList<>(); - for (Schema.FieldInfo fieldInfo : countState.getSchema().getCsfFacetFields()) { - if (countState.isCountField(fieldInfo)) { - // Rather than rely on the normal facet counting array, we read from a column stride - // field using a custom implementation of FacetCountIterator. - // This optimization is due to two factors: - // 1) for the from_user_id_csf facet, every document has a from user id, - // but many documents contain no other facets. - // 2) we require from_user_id and shared_status_id to be in a column stride field - // for other uses. 
- try { - iterators.add(iteratorFactory.getFacetCountIterator(reader, fieldInfo)); - } catch (IOException e) { - String facetName = fieldInfo.getFieldType().getFacetName(); - LOG.error("Failed to construct iterator for " + facetName + " facet", e); - } - } - } - if (iterators.size() == 0) { - return new ArrayFacetCountIterator(); - } - if (iterators.size() < countState.getNumFieldsToCount()) { - iterators.add(new ArrayFacetCountIterator()); - } - return new CompositeFacetCountIterator(iterators); - } - - /** - * Collects facets of the document with the provided docID. - * See {@link FacetCountingArrayWriter#addFacet} for details on the format of the int pool. - */ - public void collectForDocId(int docID, FacetTermCollector collector) { - int firstValue = getFacet(docID); - if (firstValue == UNASSIGNED) { - return; // no facet - } - if (!isPointer(firstValue)) { - // highest order bit not set, only one facet for this document. - collector.collect(docID, decodeTermID(firstValue), decodeFieldID(firstValue)); - return; - } - - // multiple facets, traverse the linked list to find all of the facets for this document. - int pointer = decodePointer(firstValue); - while (true) { - int packedValue = facetsPool.get(pointer); - // UNASSIGNED is a sentinel value indicating that we have reached the end of the linked list. - if (packedValue == UNASSIGNED) { - return; - } - - if (isPointer(packedValue)) { - // If the packedValue is a pointer, we need to skip over some ints to reach the facets for - // this document. - pointer = decodePointer(packedValue); - } else { - // If the packedValue is not a pointer, it is an encoded facet, and we can simply decrement - // the pointer to collect the next value. 
- collector.collect(docID, decodeTermID(packedValue), decodeFieldID(packedValue)); - pointer--; - } - } - } - - /** - * This method can return one of three values for each given doc ID: - * - UNASSIGNED, if the document has no facets - * - If the highest-order bit is not set, then the (negated) returned value is the single facet - * for this document. - * - If the highest-order bit is set, then the document has multiple facets, and the returned - * values is a pointer into facetsPool. - */ - protected abstract int getFacet(int docID); - - protected abstract void setFacet(int docID, int facetID); - - /** - * Called during segment optimization to map term ids that have changed as a - * result of the optimization. - */ - public abstract AbstractFacetCountingArray rewriteAndMapIDs( - Map termIDMapper, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException; - - IntBlockPool getFacetsPool() { - return facetsPool; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.docx b/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.docx new file mode 100644 index 000000000..d9c1adc3b Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.java b/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.java deleted file mode 100644 index efe8cb7be..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/CSFFacetCountIterator.java +++ /dev/null @@ -1,56 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * An iterator that 
looks up the termID from the appropriate CSF - */ -public class CSFFacetCountIterator extends FacetCountIterator { - private final int fieldID; - private final NumericDocValues numericDocValues; - - /** - * Creates a new iterator for the given facet csf field. - */ - public CSFFacetCountIterator( - EarlybirdIndexSegmentAtomicReader reader, - Schema.FieldInfo facetFieldInfo) throws IOException { - FacetIDMap.FacetField facetField = reader.getFacetIDMap().getFacetField(facetFieldInfo); - Preconditions.checkNotNull(facetField); - this.fieldID = facetField.getFacetId(); - numericDocValues = reader.getNumericDocValues(facetFieldInfo.getName()); - Preconditions.checkNotNull(numericDocValues); - } - - @Override - public void collect(int internalDocID) throws IOException { - if (numericDocValues.advanceExact(internalDocID)) { - long termID = numericDocValues.longValue(); - if (shouldCollect(internalDocID, termID)) { - collect(internalDocID, termID, fieldID); - } - } - } - - /** - * Subclasses should override if they need to restrict the docs or termIDs - * that they collect on. 
For example, these may need to override if - * 1) Not all docs set this field, so we should not collect on - * the default value of 0 - * 2) The same CSF field means different things (in particular, shared_status_id means - * retweet OR reply parent id) so we need to do some other check to determine if we should - * collect - * - * @return whether we should collect on this doc/termID - */ - protected boolean shouldCollect(int internalDocID, long termID) throws IOException { - return true; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.docx b/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.docx new file mode 100644 index 000000000..8eeee95d5 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.java b/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.java deleted file mode 100644 index 4aa6e8748..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/CompositeFacetCountIterator.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.Collection; -import java.util.List; - -import com.twitter.common.collections.Pair; - -/** - * Calls multiple FacetCountIterators. Currently this is used for calling the - * default FacetCountingArray iterator and the CSF and retweet iterators - */ -public class CompositeFacetCountIterator extends FacetCountIterator { - private final Collection iterators; - - /** - * Creates a new composite iterator on the provided collection of iterators. 
- */ - public CompositeFacetCountIterator(Collection iterators) { - this.iterators = iterators; - for (FacetCountIterator iterator : iterators) { - iterator.setIncrementData(this.incrementData); - } - } - - @Override - public void collect(int docID) throws IOException { - for (FacetCountIterator iterator : iterators) { - iterator.collect(docID); - } - } - - @Override - protected void addProof(int docID, long termID, int fieldID) { - for (FacetCountIterator iterator : iterators) { - iterator.addProof(docID, termID, fieldID); - } - } - - @Override - public void setProofs(List> proof) { - for (FacetCountIterator iterator : iterators) { - iterator.setProofs(proof); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.docx b/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.docx new file mode 100644 index 000000000..d49188eab Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.java b/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.java deleted file mode 100644 index 2395b790e..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/DummyFacetAccumulator.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -/** - * This accumulator does not accumulate the facet counts when {@link #add(long, int, int, int)} - * is called. 
- */ -public class DummyFacetAccumulator extends FacetAccumulator { - - @Override - public int add(long termID, int scoreIncrement, int penaltyCount, int tweepCred) { - return 0; - } - - @Override - public R getAllFacets() { - return null; - } - - @Override - public R getTopFacets(int n) { - return null; - } - - @Override - public void reset(FacetLabelProvider facetLabelProvider) { - } - -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.docx b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.docx new file mode 100644 index 000000000..996d25c7f Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.java b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.java deleted file mode 100644 index ae7be787a..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetDocValueSet.java +++ /dev/null @@ -1,153 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.util.Map; -import java.util.Map.Entry; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.ReaderUtil; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefBuilder; - -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -public class EarlybirdFacetDocValueSet extends SortedSetDocValues { - private final AbstractFacetCountingArray countingArray; - private final InvertedIndex[] labelProviders; - private final String[] fieldNames; - private final int[] starts; - private final BytesRefBuilder ordCache; - private int totalTerms; - private int docID = -1; - private int currentFacet = FacetCountingArray.UNASSIGNED; - private int pointer = -1; - private boolean hasMoreOrds 
= false; - - public static final String FIELD_NAME = FacetsConfig.DEFAULT_INDEX_FIELD_NAME; - - /** - * Creates a new EarlybirdFacetDocValueSet from the provided FacetCountingArray. - */ - public EarlybirdFacetDocValueSet(AbstractFacetCountingArray countingArray, - Map labelProviderMap, - FacetIDMap facetIdMap) { - this.countingArray = countingArray; - labelProviders = new InvertedIndex[facetIdMap.getNumberOfFacetFields()]; - fieldNames = new String[facetIdMap.getNumberOfFacetFields()]; - for (Entry entry : labelProviderMap.entrySet()) { - FacetLabelProvider labelProvider = entry.getValue(); - if (labelProvider instanceof InvertedIndex) { - FacetIDMap.FacetField facetField = facetIdMap.getFacetFieldByFacetName(entry.getKey()); - if (facetField != null) { - labelProviders[facetField.getFacetId()] = (InvertedIndex) labelProvider; - fieldNames[facetField.getFacetId()] = entry.getKey(); - } - } - } - - starts = new int[labelProviders.length + 1]; // build starts array - ordCache = new BytesRefBuilder(); - totalTerms = 0; - - for (int i = 0; i < labelProviders.length; ++i) { - if (labelProviders[i] != null) { - starts[i] = totalTerms; - int termCount = labelProviders[i].getNumTerms(); - totalTerms += termCount; - } - } - - // added to so that mapping from ord to index works via ReaderUtil.subIndex - starts[labelProviders.length] = totalTerms; - } - - private long encodeOrd(int fieldId, int termId) { - assert starts[fieldId] + termId < starts[fieldId + 1]; - return starts[fieldId] + termId; - } - - @Override - public long nextOrd() { - if (!hasMoreOrds || currentFacet == FacetCountingArray.UNASSIGNED) { - return SortedSetDocValues.NO_MORE_ORDS; - } - - // only 1 facet val - if (!FacetCountingArray.isPointer(currentFacet)) { - int termId = FacetCountingArray.decodeTermID(currentFacet); - int fieldId = FacetCountingArray.decodeFieldID(currentFacet); - hasMoreOrds = false; - return encodeOrd(fieldId, termId); - } - - // multiple facets, follow the pointer to find all facets 
in the facetsPool. - if (pointer == -1) { - pointer = FacetCountingArray.decodePointer(currentFacet); - } - int facetID = countingArray.getFacetsPool().get(pointer); - int termId = FacetCountingArray.decodeTermID(facetID); - int fieldId = FacetCountingArray.decodeFieldID(facetID); - - hasMoreOrds = FacetCountingArray.isPointer(facetID); - pointer++; - return encodeOrd(fieldId, termId); - } - - @Override - public BytesRef lookupOrd(long ord) { - int idx = ReaderUtil.subIndex((int) ord, this.starts); - if (labelProviders[idx] != null) { - int termID = (int) ord - starts[idx]; - BytesRef term = new BytesRef(); - labelProviders[idx].getTerm(termID, term); - String name = fieldNames[idx]; - String val = FacetsConfig.pathToString(new String[] {name, term.utf8ToString()}); - ordCache.copyChars(val); - } else { - ordCache.copyChars(""); - } - return ordCache.get(); - } - - @Override - public long lookupTerm(BytesRef key) { - throw new UnsupportedOperationException(); - } - - @Override - public long getValueCount() { - return totalTerms; - } - - @Override - public int docID() { - return docID; - } - - @Override - public int nextDoc() { - return ++docID; - } - - @Override - public int advance(int target) { - Preconditions.checkState(target >= docID); - docID = target; - currentFacet = countingArray.getFacet(docID); - pointer = -1; - hasMoreOrds = true; - return docID; - } - - @Override - public boolean advanceExact(int target) { - return advance(target) != FacetCountingArray.UNASSIGNED; - } - - @Override - public long cost() { - return totalTerms; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.docx b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.docx new file mode 100644 index 000000000..34ed2ab3b Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.java 
b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.java deleted file mode 100644 index 8872d2049..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacets.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.List; -import java.util.Map; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.apache.lucene.facet.FacetResult; -import org.apache.lucene.facet.Facets; -import org.apache.lucene.facet.FacetsCollector; -import org.apache.lucene.facet.FacetsCollector.MatchingDocs; -import org.apache.lucene.util.BitDocIdSet; -import org.apache.lucene.util.BitSet; - -import com.twitter.search.common.facets.FacetSearchParam; -import com.twitter.search.common.facets.thriftjava.FacetFieldRequest; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * Lucene accumulator implementation that counts on our facet counting array data structure. - * - */ -public class EarlybirdFacets extends Facets { - - private final AbstractFacetCountingArray countingArray; - private final FacetCountAggregator aggregator; - private final EarlybirdIndexSegmentAtomicReader reader; - private final MatchingDocs matchingDocs; - private final Map resultMapping; - - /** - * Constructs an EarlybirdFacets accumulator. 
- */ - public EarlybirdFacets( - List facetSearchParams, - FacetsCollector facetsCollector, - EarlybirdIndexSegmentAtomicReader reader) throws IOException { - - Preconditions.checkArgument(facetSearchParams != null && !facetSearchParams.isEmpty()); - Preconditions.checkArgument( - facetsCollector != null - && facetsCollector.getMatchingDocs() != null - && facetsCollector.getMatchingDocs().size() == 1); - Preconditions.checkNotNull(reader); - - this.countingArray = reader.getSegmentData().getFacetCountingArray(); - this.reader = reader; - this.aggregator = new FacetCountAggregator(facetSearchParams, - reader.getSegmentData().getSchema(), - reader.getFacetIDMap(), - reader.getSegmentData().getPerFieldMap()); - this.matchingDocs = facetsCollector.getMatchingDocs().get(0); - - this.resultMapping = count(); - } - - private Map count() throws IOException { - Preconditions.checkState(matchingDocs.bits instanceof BitDocIdSet, - "Assuming BitDocIdSet"); - final BitSet bits = ((BitDocIdSet) matchingDocs.bits).bits(); - final int length = bits.length(); - int doc = reader.getSmallestDocID(); - if (doc != -1) { - while (doc < length && (doc = bits.nextSetBit(doc)) != -1) { - countingArray.collectForDocId(doc, aggregator); - doc++; - } - } - return aggregator.getTop(); - } - - @Override - public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException { - FacetFieldRequest facetFieldRequest = new FacetFieldRequest(dim, topN); - if (path.length > 0) { - facetFieldRequest.setPath(Lists.newArrayList(path)); - } - - FacetResult result = resultMapping.get(facetFieldRequest); - - Preconditions.checkNotNull( - result, - "Illegal facet field request: %s, supported requests are: %s", - facetFieldRequest, - resultMapping.keySet()); - - return result; - } - - @Override - public Number getSpecificValue(String dim, String... 
path) { - throw new UnsupportedOperationException("Not supported"); - } - - @Override - public List getAllDims(int topN) throws IOException { - throw new UnsupportedOperationException("Not supported"); - } - -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.docx b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.docx new file mode 100644 index 000000000..7128b119f Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.java b/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.java deleted file mode 100644 index a790cd6f7..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/EarlybirdFacetsFactory.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.List; - -import org.apache.lucene.facet.Facets; -import org.apache.lucene.facet.FacetsCollector; - -import com.twitter.search.common.facets.CountFacetSearchParam; -import com.twitter.search.common.facets.FacetSearchParam; -import com.twitter.search.common.facets.FacetsFactory; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * Factory for EarlybirdFacets - */ -public class EarlybirdFacetsFactory implements FacetsFactory { - private final EarlybirdIndexSegmentAtomicReader reader; - - public EarlybirdFacetsFactory(EarlybirdIndexSegmentAtomicReader reader) { - this.reader = reader; - } - - @Override - public Facets create( - List facetSearchParams, - FacetsCollector facetsCollector) throws IOException { - - return new EarlybirdFacets(facetSearchParams, facetsCollector, reader); - } - - @Override - public boolean accept(FacetSearchParam facetSearchParam) { - if (!(facetSearchParam instanceof 
CountFacetSearchParam) - || (facetSearchParam.getFacetFieldRequest().getPath() != null - && !facetSearchParam.getFacetFieldRequest().getPath().isEmpty())) { - return false; - } - - String field = facetSearchParam.getFacetFieldRequest().getField(); - Schema.FieldInfo facetInfo = reader.getSegmentData().getSchema() - .getFacetFieldByFacetName(field); - - return facetInfo != null - && reader.getSegmentData().getPerFieldMap().containsKey(facetInfo.getName()); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.docx new file mode 100644 index 000000000..6f50a2d19 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.java b/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.java deleted file mode 100644 index e38d3f7c0..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetAccumulator.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - - -/** - * Counts facet occurrences and provides the top items - * at the end. Actual subclass can implement this functionality differently: e.g. by using - * a heap (priority queue) or a hashmap with pruning step. - * The type R represents the facet results, which can e.g. be a thrift class. - */ -public abstract class FacetAccumulator { - /** Called to notify the accumulator that the given termID has occurred in a document - * Returns the current count of the given termID. 
- */ - public abstract int add(long termID, int scoreIncrement, int penaltyIncrement, int tweepCred); - - /** After hit collection is done this can be called to - * retrieve the items that occurred most often */ - public abstract R getTopFacets(int n); - - /** After hit collection is done this can be called to retrieve all the items accumulated - * (which may not be all that occurred) */ - public abstract R getAllFacets(); - - /** Called to reset a facet accumulator for re-use. This is an optimization - * which takes advantage of the fact that these accumulators may allocate - * large hash-tables, and we use one per-segment, which may be as many as 10-20 **/ - public abstract void reset(FacetLabelProvider facetLabelProvider); - - /** Language histogram accumulation and retrieval. They both have no-op default implementations. - */ - public void recordLanguage(int languageId) { } - - public LanguageHistogram getLanguageHistogram() { - return LanguageHistogram.EMPTY_HISTOGRAM; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.docx new file mode 100644 index 000000000..5209ebc46 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.java deleted file mode 100644 index 36bf3598e..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountAggregator.java +++ /dev/null @@ -1,93 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.util.List; -import java.util.Map; -import java.util.Map.Entry; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; - -import org.apache.lucene.facet.FacetResult; - -import com.twitter.search.common.facets.CountFacetSearchParam; -import 
com.twitter.search.common.facets.FacetSearchParam; -import com.twitter.search.common.facets.thriftjava.FacetFieldRequest; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -/** - * Global facet aggregator across all fields. - * - */ -public class FacetCountAggregator implements FacetTermCollector { - - // keys for the following aggregators are fieldIds - private final Map aggregators; - private final Map facetSearchParamMap; - - /** - * Creates a new facet aggregator. - */ - public FacetCountAggregator( - List facetSearchParams, - Schema schema, - FacetIDMap facetIDMap, - Map labelProviderMap) { - - aggregators = Maps.newHashMap(); - facetSearchParamMap = Maps.newHashMap(); - - // Check params: - for (FacetSearchParam facetSearchParam : facetSearchParams) { - if (!(facetSearchParam instanceof CountFacetSearchParam)) { - throw new IllegalArgumentException( - "this collector only supports CountFacetSearchParam; got " + facetSearchParam); - } - if (facetSearchParam.getFacetFieldRequest().getPath() != null - && !facetSearchParam.getFacetFieldRequest().getPath().isEmpty()) { - throw new IllegalArgumentException( - "this collector dosen't support hierarchical facets: " - + facetSearchParam.getFacetFieldRequest().getPath()); - } - - String field = facetSearchParam.getFacetFieldRequest().getField(); - Schema.FieldInfo facetField = - schema == null ? 
null : schema.getFacetFieldByFacetName(field); - - if (facetField == null || !labelProviderMap.containsKey(facetField.getName())) { - throw new IllegalStateException("facet field: " + field + " is not defined"); - } - - int fieldId = facetIDMap.getFacetField(facetField).getFacetId(); - Preconditions.checkState(!aggregators.containsKey(fieldId)); - Preconditions.checkState(!facetSearchParamMap.containsKey(fieldId)); - aggregators.put(fieldId, new PerfieldFacetCountAggregator(field, - labelProviderMap.get(facetField.getName()))); - facetSearchParamMap.put(fieldId, facetSearchParam); - } - } - - /** - * Returns the top facets. - */ - public Map getTop() { - Map map = Maps.newHashMap(); - for (Entry entry : aggregators.entrySet()) { - FacetSearchParam facetSearchParam = facetSearchParamMap.get(entry.getKey()); - map.put(facetSearchParam.getFacetFieldRequest(), entry.getValue().getTop(facetSearchParam)); - } - return map; - } - - @Override - public boolean collect(int docID, long termID, int fieldID) { - PerfieldFacetCountAggregator perfieldAggregator = aggregators.get(fieldID); - if (perfieldAggregator != null) { - perfieldAggregator.collect((int) termID); - return true; - } else { - return false; - } - } - -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.docx new file mode 100644 index 000000000..b0eb4993e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.java deleted file mode 100644 index b70b5c560..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIterator.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.List; - -import 
com.twitter.common.collections.Pair; - -/** - * The collect() method is called for every document for which facets shall be counted. - * This iterator then calls the FacetAccumulators for all facets that belong to the - * current document. - */ -public abstract class FacetCountIterator implements FacetTermCollector { - - public static class IncrementData { - public FacetAccumulator[] accumulators; - public int weightedCountIncrement; - public int penaltyIncrement; - public int tweepCred; - public int languageId; - } - - public IncrementData incrementData = new IncrementData(); - - private List> proofs = null; - - void setIncrementData(IncrementData incrementData) { - this.incrementData = incrementData; - } - - public void setProofs(List> proofs) { - this.proofs = proofs; - } - - // interface method that collects a specific term in a specific field for this document. - @Override - public boolean collect(int docID, long termID, int fieldID) { - FacetAccumulator accumulator = incrementData.accumulators[fieldID]; - accumulator.add(termID, incrementData.weightedCountIncrement, incrementData.penaltyIncrement, - incrementData.tweepCred); - accumulator.recordLanguage(incrementData.languageId); - - if (proofs != null) { - addProof(docID, termID, fieldID); - } - return true; - } - - protected void addProof(int docID, long termID, int fieldID) { - proofs.add(new Pair<>(fieldID, termID)); - } - - /** - * Collected facets for the given document. 
- */ - public abstract void collect(int docID) throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.docx new file mode 100644 index 000000000..92d3c4db5 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.java deleted file mode 100644 index 91df9649b..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountIteratorFactory.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * A factory for {@link FacetCountIterator}s. - */ -public abstract class FacetCountIteratorFactory { - /** - * For a field that is being faceted on and for which we should use a CSF for facet counting, - * return the iterator we should use for counting. 
- * - * @param reader The reader to use when getting CSF values - * @param fieldInfo The Schema.FieldInfo corresponding to the facet we're counting - * @return An iterator for this field - */ - public abstract FacetCountIterator getFacetCountIterator( - EarlybirdIndexSegmentAtomicReader reader, - Schema.FieldInfo fieldInfo) throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.docx new file mode 100644 index 000000000..0f9387f6d Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.java deleted file mode 100644 index 920868312..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountState.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.util.HashMap; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Sets; - -import com.twitter.search.common.schema.base.Schema; - -/** - * Maintains internal state during one facet count request. - */ -public final class FacetCountState { - private final Set fieldsToCount = new HashSet<>(); - private final Map> facetfieldResults = - new HashMap<>(); - private final int minNumFacetResults; - private final Schema schema; - - public FacetCountState(Schema schema, int minNumFacetResults) { - this.schema = schema; - this.minNumFacetResults = minNumFacetResults; - } - - /** - * Adds a facet to be counted in this request. 
- */ - public void addFacet(String facetName, int numResultsRequested) { - facetfieldResults.put(facetName, new FacetFieldResults(facetName, - Math.max(numResultsRequested, minNumFacetResults))); - Schema.FieldInfo field = schema.getFacetFieldByFacetName(facetName); - fieldsToCount.add(field); - } - - public Schema getSchema() { - return schema; - } - - public int getNumFieldsToCount() { - return fieldsToCount.size(); - } - - /** - * Returns whether or not there is a field to be counted for which no skip list is stored - */ - public boolean hasFieldToCountWithoutSkipList() { - for (Schema.FieldInfo facetField: fieldsToCount) { - if (!facetField.getFieldType().isStoreFacetSkiplist()) { - return true; - } - } - return false; - } - - public Set getFacetFieldsToCountWithSkipLists() { - return Sets.filter( - fieldsToCount, - facetField -> facetField.getFieldType().isStoreFacetSkiplist()); - } - - public boolean isCountField(Schema.FieldInfo field) { - return fieldsToCount.contains(field); - } - - public Iterator> getFacetFieldResultsIterator() { - return facetfieldResults.values().iterator(); - } - - public static final class FacetFieldResults { - public final String facetName; - public final int numResultsRequested; - public R results; - public int numResultsFound; - public boolean finished = false; - - private FacetFieldResults(String facetName, int numResultsRequested) { - this.facetName = facetName; - this.numResultsRequested = numResultsRequested; - } - - public boolean isFinished() { - return finished || results != null && numResultsFound >= numResultsRequested; - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.docx new file mode 100644 index 000000000..6c3bc1f6c Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.docx differ diff --git 
a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.java deleted file mode 100644 index cd6098d22..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArray.java +++ /dev/null @@ -1,156 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.Map; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -public class FacetCountingArray extends AbstractFacetCountingArray { - private static final Logger LOG = LoggerFactory.getLogger(FacetCountingArray.class); - - private final Int2IntOpenHashMap facetsMap; - - /** - * Creates a new, empty FacetCountingArray with the given size. 
- */ - public FacetCountingArray(int maxSegmentSize) { - super(); - facetsMap = new Int2IntOpenHashMap(maxSegmentSize); - facetsMap.defaultReturnValue(UNASSIGNED); - } - - private FacetCountingArray(Int2IntOpenHashMap facetsMap, IntBlockPool facetsPool) { - super(facetsPool); - this.facetsMap = facetsMap; - } - - @Override - protected int getFacet(int docID) { - return facetsMap.get(docID); - } - - @Override - protected void setFacet(int docID, int facetID) { - facetsMap.put(docID, facetID); - } - - @Override - public AbstractFacetCountingArray rewriteAndMapIDs( - Map termIDMapper, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - Preconditions.checkNotNull(originalTweetIdMapper); - Preconditions.checkNotNull(optimizedTweetIdMapper); - - // We need to rewrite the facet array, because the term ids have to be mapped to the - // key space of the minimum perfect hash function that replaces the hash table. - // We also need to remap tweet IDs to the optimized doc IDs. - int maxDocID = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - AbstractFacetCountingArray newArray = new OptimizedFacetCountingArray(maxDocID + 1); - final FacetCountingArrayWriter writer = new FacetCountingArrayWriter(newArray); - FacetCountIterator iterator = new ArrayFacetCountIterator() { - @Override - public boolean collect(int docID, long termID, int fieldID) { - int[] termIDMap = termIDMapper.get(fieldID); - int mappedTermID; - // If there isn't a map for this term, we are using the original term IDs and can continue - // with that term ID. If there is a term ID map, then we need to use the new term ID, - // because the new index will use an MPH term dictionary with new term IDs. 
- if (termIDMap == null) { - mappedTermID = (int) termID; - } else if (termID < termIDMap.length) { - mappedTermID = termIDMap[(int) termID]; - } else { - // During segment optimization we might index a new term after the termIDMap is created - // in IndexOptimizer.optimizeInvertedIndexes(). We can safely ignore these terms, as - // they will be re-indexed later. - return false; - } - - try { - long tweetId = originalTweetIdMapper.getTweetID(docID); - int newDocId = optimizedTweetIdMapper.getDocID(tweetId); - Preconditions.checkState(newDocId != DocIDToTweetIDMapper.ID_NOT_FOUND, - "Did not find a mapping in the new tweet ID mapper for doc ID " - + newDocId + ", tweet ID " + tweetId); - - writer.addFacet(newDocId, fieldID, mappedTermID); - } catch (IOException e) { - LOG.error("Caught an unexpected IOException while optimizing facet.", e); - } - - return true; - } - }; - - // We want to iterate the facets in increasing tweet ID order. This might not correspond to - // decreasing doc ID order in the original mapper (see OutOfOrderRealtimeTweetIDMapper). - // However, the optimized mapper should be sorted both by tweet IDs and by doc IDs (in reverse - // order). So we need to iterate here over the doc IDs in the optimized mapper, convert them - // to doc IDs in the original mapper, and pass those doc IDs to collect(). 
- int docId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - long tweetId = optimizedTweetIdMapper.getTweetID(docId); - int originalDocId = originalTweetIdMapper.getDocID(tweetId); - iterator.collect(originalDocId); - docId = optimizedTweetIdMapper.getPreviousDocID(docId); - } - return newArray; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String FACETS_POOL_PROP_NAME = "facetsPool"; - private final int maxSegmentSize; - - public FlushHandler(int maxSegmentSize) { - this.maxSegmentSize = maxSegmentSize; - } - - public FlushHandler(FacetCountingArray objectToFlush) { - super(objectToFlush); - maxSegmentSize = -1; - } - - @Override - public void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - FacetCountingArray array = getObjectToFlush(); - out.writeInt(array.facetsMap.size()); - for (Int2IntOpenHashMap.Entry entry : array.facetsMap.int2IntEntrySet()) { - out.writeInt(entry.getIntKey()); - out.writeInt(entry.getIntValue()); - } - array.getFacetsPool().getFlushHandler().flush( - flushInfo.newSubProperties(FACETS_POOL_PROP_NAME), out); - } - - @Override - public FacetCountingArray doLoad(FlushInfo flushInfo, DataDeserializer in) throws IOException { - int size = in.readInt(); - Int2IntOpenHashMap facetsMap = new Int2IntOpenHashMap(maxSegmentSize); - facetsMap.defaultReturnValue(UNASSIGNED); - for (int i = 0; i < size; i++) { - facetsMap.put(in.readInt(), in.readInt()); - } - IntBlockPool facetsPool = new IntBlockPool.FlushHandler().load( - flushInfo.getSubProperties(FACETS_POOL_PROP_NAME), in); - return new FacetCountingArray(facetsMap, facetsPool); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.docx 
new file mode 100644 index 000000000..24cc1d03e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.java b/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.java deleted file mode 100644 index f02d52bfb..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetCountingArrayWriter.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -public class FacetCountingArrayWriter { - private final AbstractFacetCountingArray facetCountingArray; - private int previousDocID = -1; - - public FacetCountingArrayWriter(AbstractFacetCountingArray array) { - facetCountingArray = array; - } - - /** - * Adds a facet for the given doc, field and term tuple. - * - * The layout of the packedValues in the term pool is: - * - * index |0 |1 |2 |3 |4 |5 |6 |7 |8 |9 | - * value |U |1a|1b|1c|U |2b|2c|P3|1d|1f| - * - * Where U is UNASSIGNED, P+X is a pointer to index X (e.g. P3 means pointer to index 3), - * or a doc ID and facet (e.g. doc ID 1 and facet a would be 1a). - */ - public void addFacet(int docID, int fieldID, int termID) { - IntBlockPool facetsPool = facetCountingArray.getFacetsPool(); - int packedValue = facetCountingArray.getFacet(docID); - - if (packedValue == AbstractFacetCountingArray.UNASSIGNED) { - // first facet for this doc. - // keep it in the array and don't add it to the map. - facetCountingArray.setFacet(docID, AbstractFacetCountingArray.encodeFacetID(fieldID, termID)); - return; - } - - if (!FacetCountingArray.isPointer(packedValue)) { - // If the packedValue is not a pointer, we know that we have exactly one facet in the index - // for this document, so copy the existing facet into the pool. 
- facetsPool.add(AbstractFacetCountingArray.UNASSIGNED); - facetsPool.add(packedValue); - } else if (previousDocID != docID) { - // We have seen this document ID in a different document. Store the pointer to the first facet - // for this doc ID in the pool so that we can traverse the linked list. - facetsPool.add(packedValue); - } - - previousDocID = docID; - - // Add the new facet to the end of the FacetCountingArray. - facetsPool.add(AbstractFacetCountingArray.encodeFacetID(fieldID, termID)); - - // Set the facetValue for this document to the pointer to the facet we just added to the array. - int poolPointer = AbstractFacetCountingArray.encodePointer(facetsPool.length() - 1); - facetCountingArray.setFacet(docID, poolPointer); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.docx new file mode 100644 index 000000000..bfc3a3829 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.java b/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.java deleted file mode 100644 index 4254abd89..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetIDMap.java +++ /dev/null @@ -1,161 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Collection; -import java.util.Map; - -import com.google.common.collect.Maps; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -/** - * Currently a facet is configured by: - * - Index field name: The Lucene field name which stores the indexed terms of this 
facet - * - Facet name: The name of the facet that the search API specifies to request facet counts. - * - Facet id: An internal id which is used to store the facet forward mapping in the facet counting - * data structures. - * - * This is a multi-map with two different mappings: - * Facet name -> Facet id - * Facet id -> FieldInfo - */ -public final class FacetIDMap implements Flushable { - private final FacetField[] facetIDToFieldMap; - private final Map facetNameToIDMap; - - private FacetIDMap(FacetField[] facetIDToFieldMap) { - this.facetIDToFieldMap = facetIDToFieldMap; - - facetNameToIDMap = Maps.newHashMapWithExpectedSize(facetIDToFieldMap.length); - for (int i = 0; i < facetIDToFieldMap.length; i++) { - facetNameToIDMap.put(facetIDToFieldMap[i].getFacetName(), i); - } - } - - public FacetField getFacetField(Schema.FieldInfo fieldInfo) { - return fieldInfo != null && fieldInfo.getFieldType().isFacetField() - ? getFacetFieldByFacetName(fieldInfo.getFieldType().getFacetName()) : null; - } - - public FacetField getFacetFieldByFacetName(String facetName) { - Integer facetID = facetNameToIDMap.get(facetName); - return facetID != null ? facetIDToFieldMap[facetID] : null; - } - - public FacetField getFacetFieldByFacetID(int facetID) { - return facetIDToFieldMap[facetID]; - } - - public Collection getFacetFields() { - return Arrays.asList(facetIDToFieldMap); - } - - public int getNumberOfFacetFields() { - return facetIDToFieldMap.length; - } - - /** - * Builds a new FacetIDMap from the given schema. 
- */ - public static FacetIDMap build(Schema schema) { - FacetField[] facetIDToFieldMap = new FacetField[schema.getNumFacetFields()]; - - int facetId = 0; - - for (Schema.FieldInfo fieldInfo : schema.getFieldInfos()) { - if (fieldInfo.getFieldType().isFacetField()) { - facetIDToFieldMap[facetId] = new FacetField(facetId, fieldInfo); - facetId++; - } - } - - return new FacetIDMap(facetIDToFieldMap); - } - - public static final class FacetField { - private final int facetId; - private final Schema.FieldInfo fieldInfo; - - private FacetField(int facetId, Schema.FieldInfo fieldInfo) { - this.facetId = facetId; - this.fieldInfo = fieldInfo; - } - - public int getFacetId() { - return facetId; - } - - public Schema.FieldInfo getFieldInfo() { - return fieldInfo; - } - - public String getFacetName() { - return fieldInfo.getFieldType().getFacetName(); - } - - public String getDescription() { - return String.format( - "(FacetField [facetId: %d, fieldInfo: %s])", - getFacetId(), fieldInfo.getDescription()); - } - } - - @SuppressWarnings("unchecked") - @Override - public FacetIDMap.FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NUM_FACET_FIELDS_PROP_NAME = "numFacetFields"; - - private final Schema schema; - - public FlushHandler(Schema schema) { - this.schema = schema; - } - - public FlushHandler(FacetIDMap objectToFlush) { - super(objectToFlush); - // schema only needed here for loading, not for flushing - this.schema = null; - } - - @Override - public void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - FacetIDMap toFlush = getObjectToFlush(); - int[] idMap = new int[toFlush.facetIDToFieldMap.length]; - for (int i = 0; i < toFlush.facetIDToFieldMap.length; i++) { - idMap[i] = toFlush.facetIDToFieldMap[i].getFieldInfo().getFieldId(); - } - out.writeIntArray(idMap); - - flushInfo.addIntProperty(NUM_FACET_FIELDS_PROP_NAME, 
idMap.length); - } - - - @Override - public FacetIDMap doLoad(FlushInfo flushInfo, DataDeserializer in) throws IOException { - int[] idMap = in.readIntArray(); - if (idMap.length != schema.getNumFacetFields()) { - throw new IOException("Wrong number of facet fields. Expected by schema: " - + schema.getNumFacetFields() - + ", but found in serialized segment: " + idMap.length); - } - - FacetField[] facetIDToFieldMap = new FacetField[schema.getNumFacetFields()]; - - for (int i = 0; i < idMap.length; i++) { - int fieldConfigId = idMap[i]; - facetIDToFieldMap[i] = new FacetField(i, schema.getFieldInfo(fieldConfigId)); - } - - return new FacetIDMap(facetIDToFieldMap); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.docx new file mode 100644 index 000000000..45c111506 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.java b/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.java deleted file mode 100644 index 8f653e0eb..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetLabelProvider.java +++ /dev/null @@ -1,206 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.hashtable.HashTable; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.analysis.IntTermAttributeImpl; -import com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -/** - * Given a termID this accessor can be used to retrieve the term bytesref and text - * that corresponds to the termID. 
- */ -public interface FacetLabelProvider { - /** - * Returns a {@link FacetLabelAccessor} for this provider. - */ - FacetLabelAccessor getLabelAccessor(); - - abstract class FacetLabelAccessor { - private int currentTermID = -1; - - protected final BytesRef termRef = new BytesRef(); - protected boolean hasTermPayload = false; - protected final BytesRef termPayload = new BytesRef(); - protected int offensiveCount = 0; - - protected final boolean maybeSeek(long termID) { - if (termID == currentTermID) { - return true; - } - - if (seek(termID)) { - currentTermID = (int) termID; - return true; - } else { - currentTermID = -1; - return false; - } - } - - // Seek to term id provided. Returns true if term found. Should update termRef, - // hasTermPayload, and termPayload as appropriate. - protected abstract boolean seek(long termID); - - public final BytesRef getTermRef(long termID) { - return maybeSeek(termID) ? termRef : null; - } - - public String getTermText(long termID) { - return maybeSeek(termID) ? termRef.utf8ToString() : null; - } - - public final BytesRef getTermPayload(long termID) { - return maybeSeek(termID) && hasTermPayload ? termPayload : null; - } - - public final int getOffensiveCount(long termID) { - return maybeSeek(termID) ? offensiveCount : 0; - } - } - - /** - * Assumes the term is stored as an IntTermAttribute, and uses this to convert - * the term bytesref to an integer string facet label. 
- */ - class IntTermFacetLabelProvider implements FacetLabelProvider { - private final InvertedIndex invertedIndex; - - public IntTermFacetLabelProvider(InvertedIndex invertedIndex) { - this.invertedIndex = invertedIndex; - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - if (termID != HashTable.EMPTY_SLOT) { - invertedIndex.getTerm((int) termID, termRef); - return true; - } - return false; - } - - @Override - public String getTermText(long termID) { - return maybeSeek(termID) - ? Integer.toString(IntTermAttributeImpl.copyBytesRefToInt(termRef)) - : null; - } - }; - } - } - - /** - * Assumes the term is stored as an LongTermAttribute, and uses this to convert - * the term bytesref to an long string facet label. - */ - class LongTermFacetLabelProvider implements FacetLabelProvider { - private final InvertedIndex invertedIndex; - - public LongTermFacetLabelProvider(InvertedIndex invertedIndex) { - this.invertedIndex = invertedIndex; - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - if (termID != HashTable.EMPTY_SLOT) { - invertedIndex.getTerm((int) termID, termRef); - return true; - } - return false; - } - - @Override - public String getTermText(long termID) { - return maybeSeek(termID) - ? 
Long.toString(LongTermAttributeImpl.copyBytesRefToLong(termRef)) - : null; - } - }; - } - } - - class SortedLongTermFacetLabelProvider implements FacetLabelProvider { - private final InvertedIndex invertedIndex; - - public SortedLongTermFacetLabelProvider(InvertedIndex invertedIndex) { - this.invertedIndex = invertedIndex; - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - if (termID != HashTable.EMPTY_SLOT) { - invertedIndex.getTerm((int) termID, termRef); - return true; - } - return false; - } - - @Override - public String getTermText(long termID) { - return maybeSeek(termID) - ? Long.toString(SortableLongTermAttributeImpl.copyBytesRefToLong(termRef)) - : null; - } - }; - } - } - - class IdentityFacetLabelProvider implements FacetLabelProvider { - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - return true; - } - - @Override - public String getTermText(long termID) { - return Long.toString(termID); - } - }; - } - } - - /** - * The methods on this provider should NOT be called under normal circumstances! - * - * When a facet misses inverted index and does not use CSF, this InaccessibleFacetLabelProvider - * will be used as a dummy provider. Then, unexptectedFacetLabelAccess counter will be - * incremented when this provider is used later. 
- * - * Also see: - * {@link FacetUtil} - */ - class InaccessibleFacetLabelProvider implements FacetLabelProvider { - private final SearchCounter unexptectedFacetLabelAccess; - - public InaccessibleFacetLabelProvider(String fieldName) { - this.unexptectedFacetLabelAccess = - SearchCounter.export("unexpected_facet_label_access_for_field_" + fieldName); - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - unexptectedFacetLabelAccess.increment(); - return false; - } - }; - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.docx new file mode 100644 index 000000000..c7e488b1f Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.java b/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.java deleted file mode 100644 index 349805d2d..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetResponseRewriter.java +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import com.twitter.search.common.facets.thriftjava.FacetResponse; - -/** - * Rewrite facet responses - */ -public interface FacetResponseRewriter { - /** - * Do the response rewrite - * - * @param facetResponse the response before the rewriting - * @return the rewrited response - */ - FacetResponse rewrite(FacetResponse facetResponse); -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.docx new file mode 100644 index 000000000..e7731e498 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.docx differ diff --git 
a/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.java b/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.java deleted file mode 100644 index 668b079d3..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetTermCollector.java +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -/** - * An interface for collecting all facets in an document. - */ -public interface FacetTermCollector { - /** - * Collect one facet term. - * @param docID The docID for which the facets are being collected. - * @param termID The termID for this facet item. - * @param fieldID The fieldID for this facet item. - * @return True if anything has actually been collected, false if this has been skipped. - * Currently, this return value is not used. - */ - boolean collect(int docID, long termID, int fieldID); -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.docx b/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.docx new file mode 100644 index 000000000..6462b3ca7 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.java b/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.java deleted file mode 100644 index 7105e7728..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/FacetUtil.java +++ /dev/null @@ -1,106 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.util.HashMap; -import java.util.Map; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.IndexedNumericFieldSettings; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.thriftjava.ThriftNumericType; -import 
com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -/** - * A utility class for selecting iterators and label providers - * for facets. - * - */ -public abstract class FacetUtil { - private static final Logger LOG = LoggerFactory.getLogger(FacetUtil.class); - - private FacetUtil() { - // unused - } - - /** - * A utility method for choosing the right facet label provider based on the EarlybirdFieldType. - * Takes in a InvertedIndex since some facet label providers are or depend on the inverted - * index. - * Should never return null. - * - * @param fieldType A FieldType for the facet - * @param invertedField The inverted index associated with the facet. May be null. - * @return A non-null FacetLabelProvider - */ - public static FacetLabelProvider chooseFacetLabelProvider( - EarlybirdFieldType fieldType, - InvertedIndex invertedField) { - Preconditions.checkNotNull(fieldType); - - // In the case neither inverted index existing nor using CSF, - // return FacetLabelProvider.InaccessibleFacetLabelProvider to throw exception - // more meaningfully and explicitly. - if (invertedField == null && !fieldType.isUseCSFForFacetCounting()) { - return new FacetLabelProvider.InaccessibleFacetLabelProvider(fieldType.getFacetName()); - } - - if (fieldType.isUseCSFForFacetCounting()) { - return new FacetLabelProvider.IdentityFacetLabelProvider(); - } - IndexedNumericFieldSettings numericSettings = fieldType.getNumericFieldSettings(); - if (numericSettings != null && numericSettings.isUseTwitterFormat()) { - if (numericSettings.getNumericType() == ThriftNumericType.INT) { - return new FacetLabelProvider.IntTermFacetLabelProvider(invertedField); - } else if (numericSettings.getNumericType() == ThriftNumericType.LONG) { - return numericSettings.isUseSortableEncoding() - ? 
new FacetLabelProvider.SortedLongTermFacetLabelProvider(invertedField) - : new FacetLabelProvider.LongTermFacetLabelProvider(invertedField); - } else { - Preconditions.checkState(false, - "Should never be reached, indicates incomplete handling of different kinds of facets"); - return null; - } - } else { - return invertedField; - } - } - - /** - * Get segment-specific facet label providers based on the schema - * and on the fieldToInvertedIndexMapping for the segment. - * These will be used by facet accumulators to get the text of the termIDs - * - * @param schema the schema, for info on fields and facets - * @param fieldToInvertedIndexMapping map of fields to their inverted indices - * @return facet label provider map - */ - public static Map getFacetLabelProviders( - Schema schema, - Map fieldToInvertedIndexMapping) { - - HashMap facetLabelProviderBuilder - = new HashMap<>(); - - for (Schema.FieldInfo fieldInfo : schema.getFacetFields()) { - EarlybirdFieldType fieldType = fieldInfo.getFieldType(); - Preconditions.checkNotNull(fieldType); - String fieldName = fieldInfo.getName(); - String facetName = fieldType.getFacetName(); - InvertedIndex invertedIndex = fieldToInvertedIndexMapping.get(fieldName); - if (invertedIndex == null && !fieldType.isUseCSFForFacetCounting()) { - LOG.warn("No docs in segment had field " + fieldName - + " indexed for facet " + facetName - + " so InaccessibleFacetLabelProvider will be provided." 
- ); - } - facetLabelProviderBuilder.put(facetName, Preconditions.checkNotNull( - chooseFacetLabelProvider(fieldType, invertedIndex))); - } - - return facetLabelProviderBuilder; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.docx b/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.docx new file mode 100644 index 000000000..79219e184 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.java b/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.java deleted file mode 100644 index 213519e4e..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/LanguageHistogram.java +++ /dev/null @@ -1,104 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.util.Arrays; -import java.util.Map; - -import com.google.common.collect.ImmutableMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; - -/** - * A util class to build a language histogram - */ -public class LanguageHistogram { - private static final Logger LOG = LoggerFactory.getLogger(LanguageHistogram.class); - - public static final LanguageHistogram EMPTY_HISTOGRAM = new LanguageHistogram() { - // Let's make this immutable for safety. 
- @Override public void clear() { - throw new UnsupportedOperationException(); - } - - @Override public void increment(int languageID) { - throw new UnsupportedOperationException(); - } - - @Override public void add(int languageID, int value) { - throw new UnsupportedOperationException(); - } - - @Override public void addAll(LanguageHistogram histogram) { - throw new UnsupportedOperationException(); - } - }; - - private final int[] languageHistogram = new int[ThriftLanguage.values().length]; - - public int[] getLanguageHistogram() { - return languageHistogram; - } - - /** - * Returns this histogram represented as a language->count map. - */ - public Map getLanguageHistogramAsMap() { - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (int i = 0; i < languageHistogram.length; i++) { - // ThriftLanguage.findByValue() might return null, which should fall back to UNKNOWN. - ThriftLanguage lang = ThriftLanguage.findByValue(i); - lang = lang == null ? ThriftLanguage.UNKNOWN : lang; - builder.put(lang, languageHistogram[i]); - } - return builder.build(); - } - - public void clear() { - Arrays.fill(languageHistogram, 0); - } - - public void increment(int languageId) { - if (isValidLanguageId(languageId)) { - languageHistogram[languageId]++; - } - } - - public void increment(ThriftLanguage language) { - increment(language.getValue()); - } - - public void add(int languageId, int value) { - if (isValidLanguageId(languageId)) { - languageHistogram[languageId] += value; - } - } - - public void add(ThriftLanguage language, int value) { - add(language.getValue(), value); - } - - /** - * Adds all entries from the provided histogram to this histogram. - */ - public void addAll(LanguageHistogram histogram) { - if (histogram == EMPTY_HISTOGRAM) { - return; - } - for (int i = 0; i < languageHistogram.length; i++) { - languageHistogram[i] += histogram.languageHistogram[i]; - } - } - - // Check for out of bound languages. 
If a language is out of bounds, we don't want it - // to cause the entire search to fail. - private boolean isValidLanguageId(int languageId) { - if (languageId < languageHistogram.length) { - return true; - } else { - LOG.error("Language id " + languageId + " out of range"); - return false; - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.docx b/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.docx new file mode 100644 index 000000000..d4f5336e5 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.java b/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.java deleted file mode 100644 index 622ccc69f..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/OptimizedFacetCountingArray.java +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Map; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -public class OptimizedFacetCountingArray extends AbstractFacetCountingArray { - private final int[] facetsMap; - - /** - * Creates a new, empty FacetCountingArray with the given size. 
- */ - public OptimizedFacetCountingArray(int maxDocIdInclusive) { - super(); - facetsMap = new int[maxDocIdInclusive]; - Arrays.fill(facetsMap, UNASSIGNED); - } - - private OptimizedFacetCountingArray(int[] facetsMap, IntBlockPool facetsPool) { - super(facetsPool); - this.facetsMap = facetsMap; - } - - @Override - protected int getFacet(int docID) { - return facetsMap[docID]; - } - - @Override - protected void setFacet(int docID, int facetID) { - facetsMap[docID] = facetID; - } - - @Override - public AbstractFacetCountingArray rewriteAndMapIDs( - Map termIDMapper, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) { - throw new UnsupportedOperationException( - "OptimizedFacetCountingArray instances should never be rewritten."); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String FACETS_POOL_PROP_NAME = "facetsPool"; - - public FlushHandler() { - } - - public FlushHandler(OptimizedFacetCountingArray objectToFlush) { - super(objectToFlush); - } - - @Override - public void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedFacetCountingArray objectToFlush = getObjectToFlush(); - out.writeIntArray(objectToFlush.facetsMap); - objectToFlush.getFacetsPool().getFlushHandler().flush( - flushInfo.newSubProperties(FACETS_POOL_PROP_NAME), out); - } - - @Override - public OptimizedFacetCountingArray doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int[] facetsMap = in.readIntArray(); - IntBlockPool facetsPool = new IntBlockPool.FlushHandler().load( - flushInfo.getSubProperties(FACETS_POOL_PROP_NAME), in); - return new OptimizedFacetCountingArray(facetsMap, facetsPool); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.docx 
b/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.docx new file mode 100644 index 000000000..961ccd34e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.java b/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.java deleted file mode 100644 index 7da65a031..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/PerfieldFacetCountAggregator.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.facet.FacetResult; -import org.apache.lucene.facet.LabelAndValue; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.PriorityQueue; - -import com.twitter.search.common.facets.FacetSearchParam; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider.FacetLabelAccessor; - -import it.unimi.dsi.fastutil.ints.Int2IntMap.Entry; -import it.unimi.dsi.fastutil.ints.Int2IntMap.FastEntrySet; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -public class PerfieldFacetCountAggregator { - - private final Int2IntOpenHashMap countMap; - private final FacetLabelAccessor facetLabelAccessor; - private final String name; - - /** - * Creates a new per-field facet aggregator. - */ - public PerfieldFacetCountAggregator(String name, FacetLabelProvider facetLabelProvider) { - this.name = name; - this.countMap = new Int2IntOpenHashMap(); - this.countMap.defaultReturnValue(0); - this.facetLabelAccessor = facetLabelProvider.getLabelAccessor(); - } - - public void collect(int termId) { - countMap.put(termId, countMap.get(termId) + 1); - } - - /** - * Returns the top facets. 
- */ - public FacetResult getTop(FacetSearchParam facetSearchParam) { - Preconditions.checkArgument( - facetSearchParam != null - && facetSearchParam.getFacetFieldRequest().getField().equals(name) - && (facetSearchParam.getFacetFieldRequest().getPath() == null - || facetSearchParam.getFacetFieldRequest().getPath().isEmpty())); - - PriorityQueue pq = new PriorityQueue( - facetSearchParam.getFacetFieldRequest().getNumResults()) { - - private BytesRef buffer = new BytesRef(); - - @Override - protected boolean lessThan(Entry a, Entry b) { - // first by count desc - int r = Integer.compare(a.getIntValue(), b.getIntValue()); - if (r != 0) { - return r < 0; - } - - // and then by label asc - BytesRef label1 = facetLabelAccessor.getTermRef(a.getIntKey()); - buffer.bytes = label1.bytes; - buffer.offset = label1.offset; - buffer.length = label1.length; - - return buffer.compareTo(facetLabelAccessor.getTermRef(b.getIntKey())) > 0; - } - - }; - - final FastEntrySet entrySet = countMap.int2IntEntrySet(); - - int numValid = 0; - for (Entry entry : entrySet) { - long val = entry.getIntValue(); - if (val > 0) { - numValid++; - pq.insertWithOverflow(entry); - } - } - - int numVals = pq.size(); - LabelAndValue[] labelValues = new LabelAndValue[numVals]; - - // Priority queue pops out "least" element first (that is the root). - // Least in our definition regardless of how we define what that is should be the last element. 
- for (int i = labelValues.length - 1; i >= 0; i--) { - Entry entry = pq.pop(); - labelValues[i] = new LabelAndValue( - facetLabelAccessor.getTermText(entry.getIntKey()), - entry.getValue()); - } - - return new FacetResult(name, null, 0, labelValues, numValid); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.docx b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.docx new file mode 100644 index 000000000..c83a70b30 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.java b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.java deleted file mode 100644 index 272e3749e..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesFacetsFactory.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import java.io.IOException; -import java.util.List; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.facet.Facets; -import org.apache.lucene.facet.FacetsCollector; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; - -import com.twitter.search.common.facets.CountFacetSearchParam; -import com.twitter.search.common.facets.FacetSearchParam; -import com.twitter.search.common.facets.FacetsFactory; - -/** - * Factory for SortedSetDocValuesFacetCounts - */ -public class SortedSetDocValuesFacetsFactory implements FacetsFactory { - private final SortedSetDocValuesReaderState state; - - public SortedSetDocValuesFacetsFactory(SortedSetDocValuesReaderState state) { - this.state = state; - } - - @Override - public Facets create( - List facetSearchParams, - FacetsCollector facetsCollector) throws IOException { - - 
Preconditions.checkNotNull(facetsCollector); - - return new SortedSetDocValuesFacetCounts(state, facetsCollector); - } - - @Override - public boolean accept(FacetSearchParam facetSearchParam) { - return facetSearchParam instanceof CountFacetSearchParam - && (facetSearchParam.getFacetFieldRequest().getPath() == null - || facetSearchParam.getFacetFieldRequest().getPath().isEmpty()) - && SortedSetDocValuesReaderStateHelper.isDimSupported( - state, facetSearchParam.getFacetFieldRequest().getField()); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.docx b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.docx new file mode 100644 index 000000000..3ae51b7ab Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.java b/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.java deleted file mode 100644 index de9c58548..000000000 --- a/src/java/com/twitter/search/core/earlybird/facets/SortedSetDocValuesReaderStateHelper.java +++ /dev/null @@ -1,14 +0,0 @@ -package com.twitter.search.core.earlybird.facets; - -import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState; - -/** - * We have to check if the facet field (dim called by lucene) is supported or - * not by the SortedSetDocValuesReaderState. The method we have to call is - * private to the lucene package, so we have this helper to do the call for us. 
- */ -public abstract class SortedSetDocValuesReaderStateHelper { - public static boolean isDimSupported(SortedSetDocValuesReaderState state, String dim) { - return state.getOrdRange(dim) != null; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.docx b/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.docx new file mode 100644 index 000000000..56805ec0d Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.java b/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.java deleted file mode 100644 index 187213dab..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/DocIDToTweetIDMapper.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; - -/** - * An interface for mapping the doc IDs in our indexes to the corresponding tweet IDs. - */ -public interface DocIDToTweetIDMapper { - /** A constant indicating that a doc ID was not found in the mapper. */ - int ID_NOT_FOUND = -1; - - /** - * Returns the tweet ID corresponding to the given doc ID. - * - * @param docID The doc ID stored in our indexes. - * @return The tweet ID corresponding to the given doc ID. - */ - long getTweetID(int docID); - - /** - * Returns the internal doc ID corresponding to the given tweet ID. Returns ID_NOT_FOUND if the - * given tweet ID cannot be found in the index. - * - * @param tweetID The tweet ID. - * @return The doc ID corresponding to the given tweet ID. - */ - int getDocID(long tweetID) throws IOException; - - /** - * Returns the smallest valid doc ID in this mapper that's strictly higher than the given doc ID. - * If no such doc ID exists, ID_NOT_FOUND is returned. - * - * @param docID The current doc ID. 
- * @return The smallest valid doc ID in this mapper that's strictly higher than the given doc ID, - * or a negative number, if no such doc ID exists. - */ - int getNextDocID(int docID); - - /** - * Returns the largest valid doc ID in this mapper that's strictly smaller than the given doc ID. - * If no such doc ID exists, ID_NOT_FOUND is returned. - * - * @param docID The current doc ID. - * @return The largest valid doc ID in this mapper that's strictly smaller than the given doc ID, - * or a negative number, if no such doc ID exists. - */ - int getPreviousDocID(int docID); - - /** - * Returns the total number of documents stored in this mapper. - * - * @return The total number of documents stored in this mapper. - */ - int getNumDocs(); - - /** - * Adds a mapping for the given tweet ID. Returns the doc ID assigned to this tweet ID. - * This method does not check if the tweet ID is already present in the mapper. It always assigns - * a new doc ID to the given tweet. - * - * @param tweetID The tweet ID to be added to the mapper. - * @return The doc ID assigned to the given tweet ID, or ID_NOT_FOUND if a doc ID could not be - * assigned to this tweet. - */ - int addMapping(long tweetID); - - /** - * Converts the current DocIDToTweetIDMapper to a DocIDToTweetIDMapper instance with the same - * tweet IDs. The tweet IDs in the original and optimized instances can be mapped to different - * doc IDs. However, we expect doc IDs to be assigned such that tweets created later have smaller - * have smaller doc IDs. - * - * This method should be called when an earlybird segment is being optimized, right before - * flushing it to disk. - * - * @return An optimized DocIDToTweetIDMapper with the same tweet IDs. 
- */ - DocIDToTweetIDMapper optimize() throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.docx new file mode 100644 index 000000000..0b7344185 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.java deleted file mode 100644 index 5d960f049..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentAtomicReader.java +++ /dev/null @@ -1,139 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; -import java.util.Map; -import java.util.Set; - -import com.google.common.collect.Sets; - -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSetIterator; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.inverted.DeletedDocs; - -/** - * Base class for atomic Earlybird segment readers. 
- */ -public abstract class EarlybirdIndexSegmentAtomicReader extends LeafReader { - public static final int TERM_NOT_FOUND = -1; - - private final DeletedDocs.View deletesView; - private final EarlybirdIndexSegmentData segmentData; - protected final EarlybirdIndexSegmentData.SyncData syncData; - - private FieldInfos fieldInfos; - - /** - * Creates a new atomic reader for this Earlybird segment. - */ - public EarlybirdIndexSegmentAtomicReader(EarlybirdIndexSegmentData segmentData) { - super(); - this.segmentData = segmentData; - this.syncData = segmentData.getSyncData(); - this.deletesView = segmentData.getDeletedDocs().getView(); - // fieldInfos will be initialized lazily if required - this.fieldInfos = null; - } - - public int getSmallestDocID() { - return syncData.getSmallestDocID(); - } - - public final FacetIDMap getFacetIDMap() { - return segmentData.getFacetIDMap(); - } - - public final Map getFacetLabelProviders() { - return segmentData.getFacetLabelProviders(); - } - - public AbstractFacetCountingArray getFacetCountingArray() { - return segmentData.getFacetCountingArray(); - } - - public final FacetLabelProvider getFacetLabelProviders(Schema.FieldInfo field) { - String facetName = field.getFieldType().getFacetName(); - return facetName != null && segmentData.getFacetLabelProviders() != null - ? segmentData.getFacetLabelProviders().get(facetName) : null; - } - - @Override - public FieldInfos getFieldInfos() { - if (fieldInfos == null) { - // TwitterInMemoryIndexReader is constructed per query, and this call is only needed for - // optimize. We wouldn't want to create a new FieldInfos per search, so we deffer it. 
- Schema schema = segmentData.getSchema(); - final Set fieldSet = Sets.newHashSet(segmentData.getPerFieldMap().keySet()); - fieldSet.addAll(segmentData.getDocValuesManager().getDocValueNames()); - fieldInfos = schema.getLuceneFieldInfos(input -> input != null && fieldSet.contains(input)); - } - return fieldInfos; - } - - /** - * Returns the ID that was assigned to the given term in - * {@link com.twitter.search.core.earlybird.index.inverted.InvertedRealtimeIndex} - */ - public abstract int getTermID(Term t) throws IOException; - - /** - * Returns the oldest posting for the given term - * NOTE: This method may return a deleted doc id. - */ - public abstract int getOldestDocID(Term t) throws IOException; - - @Override - public abstract NumericDocValues getNumericDocValues(String field) throws IOException; - - /** - * Determines if this reader has any documents to traverse. Note that it is possible for the tweet - * ID mapper to have documents, but for this reader to not see them yet. In this case, this method - * will return false. - */ - public boolean hasDocs() { - return segmentData.numDocs() > 0; - } - - /** - * Returns the newest posting for the given term - */ - public final int getNewestDocID(Term term) throws IOException { - PostingsEnum td = postings(term); - if (td == null) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - if (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - return td.docID(); - } else { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - } - - public final DeletedDocs.View getDeletesView() { - return deletesView; - } - - @Override - public final Fields getTermVectors(int docID) { - // Earlybird does not use term vectors. 
- return null; - } - - public EarlybirdIndexSegmentData getSegmentData() { - return segmentData; - } - - public Schema getSchema() { - return segmentData.getSchema(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.docx new file mode 100644 index 000000000..f01ae4faa Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.java deleted file mode 100644 index fefb1b4d1..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentData.java +++ /dev/null @@ -1,474 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.store.Directory; - -import com.twitter.common.collections.Pair; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import 
com.twitter.search.core.earlybird.facets.FacetCountingArrayWriter; -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.column.ColumnStrideByteIndex; -import com.twitter.search.core.earlybird.index.column.DocValuesManager; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsData; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.core.earlybird.index.inverted.DeletedDocs; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; -import com.twitter.search.core.earlybird.index.inverted.InvertedRealtimeIndex; -import com.twitter.search.core.earlybird.index.inverted.OptimizedMemoryIndex; -import com.twitter.search.core.earlybird.index.inverted.TermPointerEncoding; - -/** - * Base class that references data structures belonging to an Earlybird segment. - */ -public abstract class EarlybirdIndexSegmentData implements Flushable { - /** - * This class has a map which contains a snapshot of max published pointers, to distinguish the - * documents in the skip lists that are fully indexed, and safe to return to searchers and those - * that are in progress and should not be returned to searchers. See - * "Earlybird Indexing Latency Design Document" - * for rationale and design. - * - * It also has the smallestDocID, which determines the smallest assigned doc ID in the tweet ID - * mapper that is safe to traverse. - * - * The pointer map and smallestDocID need to be updated atomically. See SEARCH-27650. 
- */ - public static class SyncData { - private final Map indexPointers; - private final int smallestDocID; - - public SyncData(Map indexPointers, int smallestDocID) { - this.indexPointers = indexPointers; - this.smallestDocID = smallestDocID; - } - - public Map getIndexPointers() { - return indexPointers; - } - - public int getSmallestDocID() { - return smallestDocID; - } - } - - private volatile SyncData syncData; - - private final int maxSegmentSize; - private final long timeSliceID; - - private final ConcurrentHashMap queryCacheMap = - new ConcurrentHashMap<>(); - private final AbstractFacetCountingArray facetCountingArray; - private final boolean isOptimized; - private final ConcurrentHashMap perFieldMap; - private final ConcurrentHashMap normsMap; - - private final Map facetLabelProviders; - private final FacetIDMap facetIDMap; - - private final Schema schema; - private final DocValuesManager docValuesManager; - - private final DeletedDocs deletedDocs; - - private final DocIDToTweetIDMapper docIdToTweetIdMapper; - private final TimeMapper timeMapper; - - static LeafReader getLeafReaderFromOptimizedDirectory(Directory directory) throws IOException { - List leaves = DirectoryReader.open(directory).getContext().leaves(); - int leavesSize = leaves.size(); - Preconditions.checkState(1 == leavesSize, - "Expected one leaf reader in directory %s, but found %s", directory, leavesSize); - return leaves.get(0).reader(); - } - - /** - * Creates a new SegmentData instance using the provided data. 
- */ - public EarlybirdIndexSegmentData( - int maxSegmentSize, - long timeSliceID, - Schema schema, - boolean isOptimized, - int smallestDocID, - ConcurrentHashMap perFieldMap, - ConcurrentHashMap normsMap, - AbstractFacetCountingArray facetCountingArray, - DocValuesManager docValuesManager, - Map facetLabelProviders, - FacetIDMap facetIDMap, - DeletedDocs deletedDocs, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper) { - this.maxSegmentSize = maxSegmentSize; - this.timeSliceID = timeSliceID; - this.schema = schema; - this.isOptimized = isOptimized; - this.facetCountingArray = facetCountingArray; - this.perFieldMap = perFieldMap; - this.syncData = new SyncData(buildIndexPointers(), smallestDocID); - this.normsMap = normsMap; - this.docValuesManager = docValuesManager; - this.facetLabelProviders = facetLabelProviders; - this.facetIDMap = facetIDMap; - this.deletedDocs = deletedDocs; - this.docIdToTweetIdMapper = docIdToTweetIdMapper; - this.timeMapper = timeMapper; - - Preconditions.checkNotNull(schema); - } - - public final Schema getSchema() { - return schema; - } - - /** - * Returns all {@link EarlybirdIndexExtensionsData} instances contained in this segment. - * Since index extensions are optional, the returned map might be null or empty. - */ - public abstract S getIndexExtensionsData(); - - public DocIDToTweetIDMapper getDocIDToTweetIDMapper() { - return docIdToTweetIdMapper; - } - - public TimeMapper getTimeMapper() { - return timeMapper; - } - - public final DocValuesManager getDocValuesManager() { - return docValuesManager; - } - - public Map getFacetLabelProviders() { - return facetLabelProviders; - } - - public FacetIDMap getFacetIDMap() { - return facetIDMap; - } - - /** - * Returns the QueryCacheResult for the given filter for this segment. 
- */ - public QueryCacheResultForSegment getQueryCacheResult(String queryCacheFilterName) { - return queryCacheMap.get(queryCacheFilterName); - } - - public long getQueryCachesCardinality() { - return queryCacheMap.values().stream().mapToLong(q -> q.getCardinality()).sum(); - } - - /** - * Get cache cardinality for each query cache. - * @return - */ - public List> getPerQueryCacheCardinality() { - ArrayList> result = new ArrayList<>(); - - queryCacheMap.forEach((cacheName, queryCacheResult) -> { - result.add(Pair.of(cacheName, queryCacheResult.getCardinality())); - }); - return result; - } - - /** - * Updates the QueryCacheResult stored for the given filter for this segment - */ - public QueryCacheResultForSegment updateQueryCacheResult( - String queryCacheFilterName, QueryCacheResultForSegment queryCacheResultForSegment) { - return queryCacheMap.put(queryCacheFilterName, queryCacheResultForSegment); - } - - /** - * Subclasses are allowed to return null here to disable writing to a FacetCountingArray. - */ - public FacetCountingArrayWriter createFacetCountingArrayWriter() { - return getFacetCountingArray() != null - ? 
new FacetCountingArrayWriter(getFacetCountingArray()) : null; - } - - public int getMaxSegmentSize() { - return maxSegmentSize; - } - - public long getTimeSliceID() { - return timeSliceID; - } - - public void updateSmallestDocID(int smallestDocID) { - // Atomic swap - syncData = new SyncData(Collections.unmodifiableMap(buildIndexPointers()), smallestDocID); - } - - private Map buildIndexPointers() { - Map newIndexPointers = new HashMap<>(); - for (InvertedIndex index : perFieldMap.values()) { - if (index.hasMaxPublishedPointer()) { - newIndexPointers.put(index, index.getMaxPublishedPointer()); - } - } - - return newIndexPointers; - } - - public SyncData getSyncData() { - return syncData; - } - - public AbstractFacetCountingArray getFacetCountingArray() { - return facetCountingArray; - } - - public void addField(String fieldName, InvertedIndex field) { - perFieldMap.put(fieldName, field); - } - - public Map getPerFieldMap() { - return Collections.unmodifiableMap(perFieldMap); - } - - public InvertedIndex getFieldIndex(String fieldName) { - return perFieldMap.get(fieldName); - } - - public Map getNormsMap() { - return Collections.unmodifiableMap(normsMap); - } - - public DeletedDocs getDeletedDocs() { - return deletedDocs; - } - - /** - * Returns the norms index for the given field name. - */ - public ColumnStrideByteIndex getNormIndex(String fieldName) { - return normsMap == null ? null : normsMap.get(fieldName); - } - - /** - * Returns the norms index for the given field name, add if not exist. - */ - public ColumnStrideByteIndex createNormIndex(String fieldName) { - if (normsMap == null) { - return null; - } - ColumnStrideByteIndex csf = normsMap.get(fieldName); - if (csf == null) { - csf = new ColumnStrideByteIndex(fieldName, maxSegmentSize); - normsMap.put(fieldName, csf); - } - return csf; - } - - /** - * Flushes this segment to disk. 
- */ - public void flushSegment(FlushInfo flushInfo, DataSerializer out) throws IOException { - getFlushHandler().flush(flushInfo, out); - } - - public final boolean isOptimized() { - return this.isOptimized; - } - - /** - * Returns a new atomic reader for this segment. - */ - public EarlybirdIndexSegmentAtomicReader createAtomicReader() throws IOException { - EarlybirdIndexSegmentAtomicReader reader = doCreateAtomicReader(); - EarlybirdIndexExtensionsData indexExtension = getIndexExtensionsData(); - if (indexExtension != null) { - indexExtension.setupExtensions(reader); - } - return reader; - } - - /** - * Creates a new atomic reader for this segment. - */ - protected abstract EarlybirdIndexSegmentAtomicReader doCreateAtomicReader() throws IOException; - - /** - * Creates a new segment writer for this segment. - */ - public abstract EarlybirdIndexSegmentWriter createEarlybirdIndexSegmentWriter( - IndexWriterConfig indexWriterConfig) throws IOException; - - public abstract static class AbstractSegmentDataFlushHandler - - extends Flushable.Handler { - protected static final String MAX_SEGMENT_SIZE_PROP_NAME = "maxSegmentSize"; - protected static final String TIME_SLICE_ID_PROP_NAME = "time_slice_id"; - protected static final String SMALLEST_DOCID_PROP_NAME = "smallestDocID"; - protected static final String DOC_ID_MAPPER_SUBPROPS_NAME = "doc_id_mapper"; - protected static final String TIME_MAPPER_SUBPROPS_NAME = "time_mapper"; - public static final String IS_OPTIMIZED_PROP_NAME = "isOptimized"; - - // Abstract methods child classes should implement: - // 1. How to additional data structures - protected abstract void flushAdditionalDataStructures( - FlushInfo flushInfo, DataSerializer out, EarlybirdIndexSegmentData toFlush) - throws IOException; - - // 2. Load additional data structures and construct SegmentData. - // Common data structures should be passed into this method to avoid code duplication. 
- // Subclasses should load additional data structures and construct a SegmentData. - protected abstract EarlybirdIndexSegmentData constructSegmentData( - FlushInfo flushInfo, - ConcurrentHashMap perFieldMap, - int maxSegmentSize, - S indexExtension, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - DataDeserializer in) throws IOException; - - protected abstract S newIndexExtension(); - - protected final Schema schema; - protected final EarlybirdIndexExtensionsFactory indexExtensionsFactory; - private final Flushable.Handler docIdMapperFlushHandler; - private final Flushable.Handler timeMapperFlushHandler; - - public AbstractSegmentDataFlushHandler( - Schema schema, - EarlybirdIndexExtensionsFactory indexExtensionsFactory, - Flushable.Handler docIdMapperFlushHandler, - Flushable.Handler timeMapperFlushHandler) { - super(); - this.schema = schema; - this.indexExtensionsFactory = indexExtensionsFactory; - this.docIdMapperFlushHandler = docIdMapperFlushHandler; - this.timeMapperFlushHandler = timeMapperFlushHandler; - } - - public AbstractSegmentDataFlushHandler(EarlybirdIndexSegmentData objectToFlush) { - super(objectToFlush); - this.schema = objectToFlush.schema; - this.indexExtensionsFactory = null; // factory only needed for loading SegmentData from disk - this.docIdMapperFlushHandler = null; // docIdMapperFlushHandler needed only for loading data - this.timeMapperFlushHandler = null; // timeMapperFlushHandler needed only for loading data - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - EarlybirdIndexSegmentData segmentData = getObjectToFlush(); - - Preconditions.checkState(segmentData.docIdToTweetIdMapper instanceof Flushable); - ((Flushable) segmentData.docIdToTweetIdMapper).getFlushHandler().flush( - flushInfo.newSubProperties(DOC_ID_MAPPER_SUBPROPS_NAME), out); - - if (segmentData.timeMapper != null) { - segmentData.timeMapper.getFlushHandler() - 
.flush(flushInfo.newSubProperties(TIME_MAPPER_SUBPROPS_NAME), out); - } - - flushInfo.addBooleanProperty(IS_OPTIMIZED_PROP_NAME, segmentData.isOptimized()); - flushInfo.addIntProperty(MAX_SEGMENT_SIZE_PROP_NAME, segmentData.getMaxSegmentSize()); - flushInfo.addLongProperty(TIME_SLICE_ID_PROP_NAME, segmentData.getTimeSliceID()); - flushInfo.addIntProperty(SMALLEST_DOCID_PROP_NAME, - segmentData.getSyncData().getSmallestDocID()); - - flushIndexes(flushInfo, out, segmentData); - - // Flush cluster specific data structures: - // FacetCountingArray, TweetIDMapper, LatLonMapper, and TimeMapper - flushAdditionalDataStructures(flushInfo, out, segmentData); - } - - private void flushIndexes( - FlushInfo flushInfo, - DataSerializer out, - EarlybirdIndexSegmentData segmentData) throws IOException { - Map perFieldMap = segmentData.getPerFieldMap(); - FlushInfo fieldProps = flushInfo.newSubProperties("fields"); - long sizeBeforeFlush = out.length(); - for (Map.Entry entry : perFieldMap.entrySet()) { - String fieldName = entry.getKey(); - entry.getValue().getFlushHandler().flush(fieldProps.newSubProperties(fieldName), out); - } - fieldProps.setSizeInBytes(out.length() - sizeBeforeFlush); - } - - @Override - protected EarlybirdIndexSegmentData doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - DocIDToTweetIDMapper docIdToTweetIdMapper = docIdMapperFlushHandler.load( - flushInfo.getSubProperties(DOC_ID_MAPPER_SUBPROPS_NAME), in); - - FlushInfo timeMapperFlushInfo = flushInfo.getSubProperties(TIME_MAPPER_SUBPROPS_NAME); - TimeMapper timeMapper = - timeMapperFlushInfo != null ? 
timeMapperFlushHandler.load(timeMapperFlushInfo, in) : null; - - final int maxSegmentSize = flushInfo.getIntProperty(MAX_SEGMENT_SIZE_PROP_NAME); - ConcurrentHashMap perFieldMap = loadIndexes(flushInfo, in); - return constructSegmentData( - flushInfo, - perFieldMap, - maxSegmentSize, - newIndexExtension(), - docIdToTweetIdMapper, - timeMapper, - in); - } - - // Move this method into EarlybirdRealtimeIndexSegmentData (careful, - // we may need to increment FlushVersion because EarlybirdLuceneIndexSegmentData - // currently has the 'fields' subproperty in its FlushInfo as well) - private ConcurrentHashMap loadIndexes( - FlushInfo flushInfo, DataDeserializer in) throws IOException { - ConcurrentHashMap perFieldMap = new ConcurrentHashMap<>(); - - FlushInfo fieldProps = flushInfo.getSubProperties("fields"); - Iterator fieldIterator = fieldProps.getKeyIterator(); - while (fieldIterator.hasNext()) { - String fieldName = fieldIterator.next(); - EarlybirdFieldType fieldType = schema.getFieldInfo(fieldName).getFieldType(); - FlushInfo subProp = fieldProps.getSubProperties(fieldName); - boolean isOptimized = subProp.getBooleanProperty( - OptimizedMemoryIndex.FlushHandler.IS_OPTIMIZED_PROP_NAME); - final InvertedIndex invertedIndex; - if (isOptimized) { - if (!fieldType.becomesImmutable()) { - throw new IOException("Tried to load an optimized field that is not immutable: " - + fieldName); - } - invertedIndex = (new OptimizedMemoryIndex.FlushHandler(fieldType)).load(subProp, in); - } else { - invertedIndex = (new InvertedRealtimeIndex.FlushHandler( - fieldType, TermPointerEncoding.DEFAULT_ENCODING)) - .load(subProp, in); - } - perFieldMap.put(fieldName, invertedIndex); - } - return perFieldMap; - } - } - - public int numDocs() { - return docIdToTweetIdMapper.getNumDocs(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.docx new file mode 100644 
index 000000000..3665c3516 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.java deleted file mode 100644 index 697a95bb2..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexSegmentWriter.java +++ /dev/null @@ -1,130 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.Closeable; -import java.io.IOException; - -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorable; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.store.Directory; - -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.column.DocValuesUpdate; - -/** - * IndexSegmentWriter combines some common functionality between the Lucene and Realtime index - * segment writers. - */ -public abstract class EarlybirdIndexSegmentWriter implements Closeable { - - public EarlybirdIndexSegmentWriter() { - } - - /** - * Gets the segment data this segment write is associated with. - * @return - */ - public abstract EarlybirdIndexSegmentData getSegmentData(); - - /** - * Appends terms from the document to the document matching the query. Does not replace a field or - * document, actually adds to the the field in the segment. 
- */ - public final void appendOutOfOrder(Query query, Document doc) throws IOException { - runQuery(query, docID -> appendOutOfOrder(doc, docID)); - } - - protected abstract void appendOutOfOrder(Document doc, int docId) throws IOException; - - /** - * Deletes a document in this segment that matches this query. - */ - public void deleteDocuments(Query query) throws IOException { - runQuery(query, docID -> getSegmentData().getDeletedDocs().deleteDoc(docID)); - } - - /** - * Updates the docvalues of a document in this segment that matches this query. - */ - public void updateDocValues(Query query, String field, DocValuesUpdate update) - throws IOException { - runQuery(query, docID -> { - ColumnStrideFieldIndex docValues = - getSegmentData().getDocValuesManager().getColumnStrideFieldIndex(field); - if (docValues == null) { - return; - } - - update.update(docValues, docID); - }); - } - - private void runQuery(final Query query, final OnHit onHit) throws IOException { - try (IndexReader reader = getSegmentData().createAtomicReader()) { - new IndexSearcher(reader).search(query, new Collector() { - @Override - public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException { - return new LeafCollector() { - @Override - public void setScorer(Scorable scorer) { - } - - @Override - public void collect(int docID) throws IOException { - onHit.hit(docID); - } - }; - } - - @Override - public ScoreMode scoreMode() { - return ScoreMode.COMPLETE_NO_SCORES; - } - }); - } - } - - private interface OnHit { - void hit(int docID) throws IOException; - } - - /** - * Adds a new document to this segment. In production, this method should be called only by - * Expertsearch. - */ - public abstract void addDocument(Document doc) throws IOException; - - /** - * Adds a new tweet to this segment. This method should be called only by Earlybird. 
- */ - public abstract void addTweet(Document doc, long tweetId, boolean docIsOffensive) - throws IOException; - - /** - * Returns the total number of documents in the segment. - */ - public abstract int numDocs() throws IOException; - - /** - * Returns the number of documents in this segment without taking deleted docs into account. - * E.g. if 10 documents were added to this segments, and 5 were deleted, - * this method still returns 10. - */ - public abstract int numDocsNoDelete() throws IOException; - - /** - * Forces the underlying index to be merged down to a single segment. - */ - public abstract void forceMerge() throws IOException; - - /** - * Appends the provides Lucene indexes to this segment. - */ - public abstract void addIndexes(Directory... dirs) throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.docx new file mode 100644 index 000000000..1d6d0b6df Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.java deleted file mode 100644 index 4bc5558f4..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdIndexableField.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import org.apache.lucene.document.Field; -import org.apache.lucene.index.DocValuesType; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; - -public class EarlybirdIndexableField extends Field { - - /** - * Creates a new indexable field with the given name, value and {@link EarlybirdFieldType}. 
- */ - public EarlybirdIndexableField(String name, Object value, EarlybirdFieldType fieldType) { - super(name, fieldType); - if (fieldType.docValuesType() == DocValuesType.NUMERIC) { - if (value instanceof Number) { - super.fieldsData = ((Number) value).longValue(); - } else { - throw new IllegalArgumentException("value not a number: " + value.getClass()); - } - } - } - -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.docx new file mode 100644 index 000000000..6d3e182eb Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.java deleted file mode 100644 index 63c811449..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentAtomicReader.java +++ /dev/null @@ -1,336 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.FieldInfos; -import org.apache.lucene.index.FilterLeafReader; -import org.apache.lucene.index.LeafMetaData; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PointValues; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedNumericDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.StoredFieldVisitor; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import 
org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.encoding.docvalues.CSFTypeUtil; -import com.twitter.search.common.encoding.features.IntegerEncodedFeatures; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.Schema.FieldInfo; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldDocValues; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; - -public final class EarlybirdLuceneIndexSegmentAtomicReader - extends EarlybirdIndexSegmentAtomicReader { - private abstract static class DocIdSetIteratorWrapper extends NumericDocValues { - private final DocIdSetIterator delegate; - - public DocIdSetIteratorWrapper(DocIdSetIterator delegate) { - this.delegate = Preconditions.checkNotNull(delegate); - } - - @Override - public int docID() { - return delegate.docID(); - } - - @Override - public int nextDoc() throws IOException { - return delegate.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return delegate.advance(target); - } - - @Override - public long cost() { - return delegate.cost(); - } - } - - private static class BytesRefBasedIntegerEncodedFeatures extends IntegerEncodedFeatures { - private final BytesRef bytesRef; - private final int numInts; - - public BytesRefBasedIntegerEncodedFeatures(BytesRef bytesRef, int numInts) { - this.bytesRef = bytesRef; - this.numInts = numInts; - } - - @Override - public int getInt(int pos) { - return CSFTypeUtil.convertFromBytes(bytesRef.bytes, bytesRef.offset, pos); - } - - @Override - public void setInt(int pos, int value) { - throw new UnsupportedOperationException(); - } - - @Override - public int getNumInts() { - return numInts; - } - } - - private static final int 
OLDEST_DOC_SKIP_INTERVAL = 256; - - private final LeafReader delegate; - - /** - * Do not add public constructors to this class. EarlybirdLuceneIndexSegmentAtomicReader instances - * should be created only by calling EarlybirdLuceneIndexSegmentData.createAtomicReader(), to make - * sure everything is set up properly (such as CSF readers). - */ - EarlybirdLuceneIndexSegmentAtomicReader( - EarlybirdIndexSegmentData segmentData, Directory directory) throws IOException { - super(segmentData); - this.delegate = getDelegateReader(directory); - } - - private LeafReader getDelegateReader(Directory directory) throws IOException { - LeafReader directoryReader = - EarlybirdIndexSegmentData.getLeafReaderFromOptimizedDirectory(directory); - return new FilterLeafReader(directoryReader) { - @Override - public NumericDocValues getNumericDocValues(String field) throws IOException { - EarlybirdFieldType type = getSchema().getFieldInfo(field).getFieldType(); - if ((type == null) || !type.isCsfViewField()) { - return in.getNumericDocValues(field); - } - - // Compute as many things as possible once, outside the NumericDocValues.get() call. - String baseFieldName = getSchema().getFieldInfo(type.getCsfViewBaseFieldId()).getName(); - FieldInfo baseFieldInfo = - Preconditions.checkNotNull(getSchema().getFieldInfo(baseFieldName)); - EarlybirdFieldType baseFieldType = baseFieldInfo.getFieldType(); - Preconditions.checkState(!baseFieldType.isCsfVariableLength()); - int numInts = baseFieldType.getCsfFixedLengthNumValuesPerDoc(); - FeatureConfiguration featureConfiguration = - Preconditions.checkNotNull(type.getCsfViewFeatureConfiguration()); - Preconditions.checkArgument(featureConfiguration.getValueIndex() < numInts); - - if (numInts == 1) { - // All encoded tweet features are encoded in a single integer. 
- NumericDocValues numericDocValues = in.getNumericDocValues(baseFieldName); - return new DocIdSetIteratorWrapper(numericDocValues) { - @Override - public long longValue() throws IOException { - return (numericDocValues.longValue() & featureConfiguration.getBitMask()) - >> featureConfiguration.getBitStartPosition(); - } - - @Override - public boolean advanceExact(int target) throws IOException { - return numericDocValues.advanceExact(target); - } - }; - } - - BinaryDocValues binaryDocValues = - Preconditions.checkNotNull(in.getBinaryDocValues(baseFieldName)); - return new DocIdSetIteratorWrapper(binaryDocValues) { - @Override - public long longValue() throws IOException { - BytesRef data = binaryDocValues.binaryValue(); - IntegerEncodedFeatures encodedFeatures = - new BytesRefBasedIntegerEncodedFeatures(data, numInts); - return encodedFeatures.getFeatureValue(featureConfiguration); - } - - @Override - public boolean advanceExact(int target) throws IOException { - return binaryDocValues.advanceExact(target); - } - }; - } - - @Override - public CacheHelper getCoreCacheHelper() { - return in.getCoreCacheHelper(); - } - - @Override - public CacheHelper getReaderCacheHelper() { - return in.getReaderCacheHelper(); - } - }; - } - - private TermsEnum getTermsEnumAtTerm(Term term) throws IOException { - Terms terms = terms(term.field()); - if (terms == null) { - return null; - } - - TermsEnum termsEnum = terms.iterator(); - return termsEnum.seekExact(term.bytes()) ? 
termsEnum : null; - } - - @Override - public int getOldestDocID(Term term) throws IOException { - TermsEnum termsEnum = getTermsEnumAtTerm(term); - if (termsEnum == null) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - PostingsEnum td = termsEnum.postings(null); - int oldestDocID = td.nextDoc(); - if (oldestDocID == DocIdSetIterator.NO_MORE_DOCS) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - final int docFreq = termsEnum.docFreq(); - if (docFreq > OLDEST_DOC_SKIP_INTERVAL * 16) { - final int skipSize = docFreq / OLDEST_DOC_SKIP_INTERVAL; - do { - oldestDocID = td.docID(); - } while (td.advance(oldestDocID + skipSize) != DocIdSetIterator.NO_MORE_DOCS); - - td = delegate.postings(term); - td.advance(oldestDocID); - } - - do { - oldestDocID = td.docID(); - } while (td.nextDoc() != DocIdSetIterator.NO_MORE_DOCS); - - return oldestDocID; - } - - @Override - public int getTermID(Term term) throws IOException { - TermsEnum termsEnum = getTermsEnumAtTerm(term); - return termsEnum != null - ? 
(int) termsEnum.ord() - : EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - @Override - public Terms terms(String field) throws IOException { - return delegate.terms(field); - } - - @Override - public FieldInfos getFieldInfos() { - return delegate.getFieldInfos(); - } - - @Override - public Bits getLiveDocs() { - return getDeletesView().getLiveDocs(); - } - - @Override - public int numDocs() { - return delegate.numDocs(); - } - - @Override - public int maxDoc() { - return delegate.maxDoc(); - } - - @Override - public void document(int docID, StoredFieldVisitor visitor) throws IOException { - delegate.document(docID, visitor); - } - - @Override - public boolean hasDeletions() { - return getDeletesView().hasDeletions(); - } - - @Override - protected void doClose() throws IOException { - delegate.close(); - } - - @Override - public NumericDocValues getNumericDocValues(String field) throws IOException { - FieldInfo fieldInfo = getSegmentData().getSchema().getFieldInfo(field); - if (fieldInfo == null) { - return null; - } - - // If this field is a CSF view field or if it's not loaded in memory, get the NumericDocValues - // from the delegate. - EarlybirdFieldType fieldType = fieldInfo.getFieldType(); - if (fieldType.isCsfViewField() || !fieldInfo.getFieldType().isCsfLoadIntoRam()) { - NumericDocValues delegateVals = delegate.getNumericDocValues(field); - if (delegateVals != null) { - return delegateVals; - } - } - - // The field is either loaded in memory, or the delegate doesn't have NumericDocValues for it. - // Return the NumericDocValues for this field stored in the DocValuesManager. - ColumnStrideFieldIndex csf = - getSegmentData().getDocValuesManager().getColumnStrideFieldIndex(field); - return csf != null ? 
new ColumnStrideFieldDocValues(csf, this) : null; - } - - @Override - public BinaryDocValues getBinaryDocValues(String field) throws IOException { - return delegate.getBinaryDocValues(field); - } - - @Override - public SortedDocValues getSortedDocValues(String field) throws IOException { - return delegate.getSortedDocValues(field); - } - - @Override - public SortedSetDocValues getSortedSetDocValues(String field) throws IOException { - return delegate.getSortedSetDocValues(field); - } - - @Override - public NumericDocValues getNormValues(String field) throws IOException { - return delegate.getNormValues(field); - } - - @Override - public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException { - return delegate.getSortedNumericDocValues(field); - } - - @Override - public void checkIntegrity() throws IOException { - delegate.checkIntegrity(); - } - - @Override - public PointValues getPointValues(String field) throws IOException { - return delegate.getPointValues(field); - } - - @Override - public LeafMetaData getMetaData() { - return delegate.getMetaData(); - } - - @Override - public CacheHelper getCoreCacheHelper() { - return delegate.getCoreCacheHelper(); - } - - @Override - public CacheHelper getReaderCacheHelper() { - return delegate.getReaderCacheHelper(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.docx new file mode 100644 index 000000000..f956a2f57 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.java deleted file mode 100644 index 82c858e69..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentData.java +++ /dev/null @@ -1,197 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; -import java.util.concurrent.ConcurrentHashMap; - -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.store.Directory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import com.twitter.search.core.earlybird.facets.FacetCountingArrayWriter; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.column.DocValuesManager; -import com.twitter.search.core.earlybird.index.column.OptimizedDocValuesManager; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsData; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.core.earlybird.index.inverted.DeletedDocs; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -/** - * Implements {@link EarlybirdIndexSegmentData} for Lucene-based on-disk Earlybird segments. - */ -public final class EarlybirdLuceneIndexSegmentData extends EarlybirdIndexSegmentData { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdLuceneIndexSegmentData.class); - - private final Directory directory; - private final EarlybirdIndexExtensionsData indexExtension; - - /** - * Creates a new Lucene-based SegmentData instance from a lucene directory. 
- */ - public EarlybirdLuceneIndexSegmentData( - Directory directory, - int maxSegmentSize, - long timeSliceID, - Schema schema, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - EarlybirdIndexExtensionsFactory indexExtensionsFactory) { - this( - directory, - maxSegmentSize, - timeSliceID, - schema, - false, // isOptimized - 0, // smallestDocId - new ConcurrentHashMap<>(), - AbstractFacetCountingArray.EMPTY_ARRAY, - new OptimizedDocValuesManager(schema, maxSegmentSize), - docIdToTweetIdMapper, - timeMapper, - indexExtensionsFactory == null - ? null : indexExtensionsFactory.newLuceneIndexExtensionsData()); - } - - public EarlybirdLuceneIndexSegmentData( - Directory directory, - int maxSegmentSize, - long timeSliceID, - Schema schema, - boolean isOptimized, - int smallestDocID, - ConcurrentHashMap perFieldMap, - AbstractFacetCountingArray facetCountingArray, - DocValuesManager docValuesManager, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - EarlybirdIndexExtensionsData indexExtension) { - super(maxSegmentSize, - timeSliceID, - schema, - isOptimized, - smallestDocID, - perFieldMap, - new ConcurrentHashMap<>(), - facetCountingArray, - docValuesManager, - null, // facetLabelProviders - null, // facetIDMap - DeletedDocs.NO_DELETES, - docIdToTweetIdMapper, - timeMapper); - this.directory = directory; - this.indexExtension = indexExtension; - } - - public Directory getLuceneDirectory() { - return directory; - } - - @Override - public EarlybirdIndexExtensionsData getIndexExtensionsData() { - return indexExtension; - } - - @Override - public FacetCountingArrayWriter createFacetCountingArrayWriter() { - return null; - } - - @Override - protected EarlybirdIndexSegmentAtomicReader doCreateAtomicReader() throws IOException { - // EarlybirdSegment creates one single EarlybirdIndexSegmentAtomicReader instance per segment - // and caches it, and the cached instance is recreated only when the segment's data changes. 
- // This is why this is a good place to reload all CSFs that should be loaded in RAM. Also, it's - // easier and less error-prone to do it here, than trying to track down all places that mutate - // the segment data and do it there. - LeafReader reader = getLeafReaderFromOptimizedDirectory(directory); - for (Schema.FieldInfo fieldInfo : getSchema().getFieldInfos()) { - // Load CSF into RAM based on configurations in the schema. - if (fieldInfo.getFieldType().getCsfType() != null - && fieldInfo.getFieldType().isCsfLoadIntoRam()) { - if (reader.getNumericDocValues(fieldInfo.getName()) != null) { - ColumnStrideFieldIndex index = getDocValuesManager().addColumnStrideField( - fieldInfo.getName(), fieldInfo.getFieldType()); - index.load(reader, fieldInfo.getName()); - } else { - LOG.warn("Field {} does not have NumericDocValues.", fieldInfo.getName()); - } - } - } - - return new EarlybirdLuceneIndexSegmentAtomicReader(this, directory); - } - - @Override - public EarlybirdIndexSegmentWriter createEarlybirdIndexSegmentWriter( - IndexWriterConfig indexWriterConfig) throws IOException { - return new EarlybirdLuceneIndexSegmentWriter(this, indexWriterConfig); - } - - @Override - public EarlybirdIndexSegmentData.AbstractSegmentDataFlushHandler getFlushHandler() { - return new OnDiskSegmentDataFlushHandler(this); - } - - public static class OnDiskSegmentDataFlushHandler - extends AbstractSegmentDataFlushHandler { - private final Directory directory; - - public OnDiskSegmentDataFlushHandler(EarlybirdLuceneIndexSegmentData objectToFlush) { - super(objectToFlush); - this.directory = objectToFlush.directory; - } - - public OnDiskSegmentDataFlushHandler( - Schema schema, - Directory directory, - EarlybirdIndexExtensionsFactory indexExtensionsFactory, - Flushable.Handler docIdMapperFlushHandler, - Flushable.Handler timeMapperFlushHandler) { - super(schema, indexExtensionsFactory, docIdMapperFlushHandler, timeMapperFlushHandler); - this.directory = directory; - } - - @Override - 
protected EarlybirdIndexExtensionsData newIndexExtension() { - return indexExtensionsFactory.newLuceneIndexExtensionsData(); - } - - @Override - protected void flushAdditionalDataStructures( - FlushInfo flushInfo, DataSerializer out, EarlybirdIndexSegmentData toFlush) { - } - - @Override - protected EarlybirdIndexSegmentData constructSegmentData( - FlushInfo flushInfo, - ConcurrentHashMap perFieldMap, - int maxSegmentSize, - EarlybirdIndexExtensionsData indexExtension, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - DataDeserializer in) { - return new EarlybirdLuceneIndexSegmentData( - directory, - maxSegmentSize, - flushInfo.getLongProperty(TIME_SLICE_ID_PROP_NAME), - schema, - flushInfo.getBooleanProperty(IS_OPTIMIZED_PROP_NAME), - flushInfo.getIntProperty(SMALLEST_DOCID_PROP_NAME), - perFieldMap, - AbstractFacetCountingArray.EMPTY_ARRAY, - new OptimizedDocValuesManager(schema, maxSegmentSize), - docIdToTweetIdMapper, - timeMapper, - indexExtension); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.docx new file mode 100644 index 000000000..902e2fe44 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.java deleted file mode 100644 index 6f73c3c32..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdLuceneIndexSegmentWriter.java +++ /dev/null @@ -1,170 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.File; -import java.io.IOException; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import 
org.slf4j.Marker; -import org.slf4j.MarkerFactory; - -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.search.Query; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.LockObtainFailedException; - -/** - * EarlybirdIndexWriter implementation that's a wrapper around Lucene's {@link IndexWriter} - * and writes Lucene segments into a {@link Directory}. - */ -public class EarlybirdLuceneIndexSegmentWriter extends EarlybirdIndexSegmentWriter { - private static final Logger LOG = - LoggerFactory.getLogger(EarlybirdLuceneIndexSegmentWriter.class); - private static final Marker FATAL = MarkerFactory.getMarker("FATAL"); - - private final EarlybirdLuceneIndexSegmentData segmentData; - private final IndexWriter indexWriter; - - @Override - public EarlybirdIndexSegmentData getSegmentData() { - return segmentData; - } - - /** - * Construct a lucene IndexWriter-based Earlybird segment writer. - * This will open a Lucene IndexWriter on segmentData.getLuceneDirectory(). - * This constructor will throw LockObtainFailedException if it cannot obtain the "write.lock" - * inside the directory segmentData.getLuceneDirectory(). - * - * Don't add public constructors to this class. EarlybirdLuceneIndexSegmentWriter instances should - * be created only by calling EarlybirdLuceneIndexSegmentData.createEarlybirdIndexSegmentWriter(), - * to make sure everything is set up properly (such as CSF readers). 
- */ - EarlybirdLuceneIndexSegmentWriter( - EarlybirdLuceneIndexSegmentData segmentData, - IndexWriterConfig indexWriterConfig) throws IOException { - Preconditions.checkNotNull(segmentData); - this.segmentData = segmentData; - try { - this.indexWriter = new IndexWriter(segmentData.getLuceneDirectory(), indexWriterConfig); - } catch (LockObtainFailedException e) { - logDebuggingInfoUponFailureToObtainLuceneWriteLock(segmentData, e); - // Rethrow the exception, and this Earlybird will trigger critical alerts - throw e; - } - } - - private void logDebuggingInfoUponFailureToObtainLuceneWriteLock( - EarlybirdLuceneIndexSegmentData luceneIndexSegmentData, - LockObtainFailedException e) throws IOException { - // Every day, we create a new Lucene dir---we do not append into existing Lucene dirs. - // Supposedly, we should never fail to obtain the write lock from a fresh and empty - // Lucene directory. - // Adding debugging information for SEARCH-4454, where a timeslice roll failed because - // Earlybird failed to get the write lock for a new timeslice. - Directory dir = luceneIndexSegmentData.getLuceneDirectory(); - LOG.error( - FATAL, - "Unable to obtain write.lock for Lucene directory. The Lucene directory is: " + dir, - e); - - if (dir instanceof FSDirectory) { // this check should always be true in our current setup. - FSDirectory fsDir = (FSDirectory) dir; - // Log if the underlying directory on disk does not exist. 
- File underlyingDir = fsDir.getDirectory().toFile(); - if (underlyingDir.exists()) { - LOG.info("Lucene directory contains the following files: " - + Lists.newArrayList(fsDir.listAll())); - } else { - LOG.error( - FATAL, - "Directory " + underlyingDir + " does not exist on disk.", - e); - } - - if (!underlyingDir.canWrite()) { - LOG.error( - FATAL, - "Cannot write into directory " + underlyingDir, - e); - } - - File writeLockFile = new File(underlyingDir, "write.lock"); - if (writeLockFile.exists()) { - LOG.error( - FATAL, - "Write lock file " + writeLockFile + " already exists.", - e); - } - - if (!writeLockFile.canWrite()) { - LOG.error( - FATAL, - "No write access to lock file: " + writeLockFile - + " Usable space: " + underlyingDir.getUsableSpace(), - e); - } - - // List all files in the segment directory - File segmentDir = underlyingDir.getParentFile(); - LOG.warn("Segment directory contains the following files: " - + Lists.newArrayList(segmentDir.list())); - } else { - LOG.warn("Unable to log debugging info upon failing to acquire Lucene write lock." 
- + "The class of the directory is: " + dir.getClass().getName()); - } - } - - @Override - public void addDocument(Document doc) throws IOException { - indexWriter.addDocument(doc); - } - - @Override - public void addTweet(Document doc, long tweetId, boolean docIdOffensive) throws IOException { - indexWriter.addDocument(doc); - } - - @Override - protected void appendOutOfOrder(Document doc, int docId) throws IOException { - throw new UnsupportedOperationException("This Lucene-based IndexWriter does not support " - + "updates and out-of-order appends."); - } - - @Override - public int numDocs() { - return indexWriter.getDocStats().maxDoc; - } - - @Override - public int numDocsNoDelete() throws IOException { - return numDocs(); - } - - @Override - public void deleteDocuments(Query query) throws IOException { - super.deleteDocuments(query); - indexWriter.deleteDocuments(query); - } - - @Override - public void addIndexes(Directory... dirs) throws IOException { - indexWriter.addIndexes(dirs); - } - - @Override - public void forceMerge() throws IOException { - indexWriter.forceMerge(1); - } - - @Override - public void close() throws IOException { - indexWriter.close(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.docx new file mode 100644 index 000000000..046cbb705 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.java deleted file mode 100644 index 78c1f6d45..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentAtomicReader.java +++ /dev/null @@ -1,175 +0,0 @@ -package 
com.twitter.search.core.earlybird.index; - -import java.io.IOException; - -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.LeafMetaData; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PointValues; -import org.apache.lucene.index.SortedDocValues; -import org.apache.lucene.index.SortedNumericDocValues; -import org.apache.lucene.index.SortedSetDocValues; -import org.apache.lucene.index.StoredFieldVisitor; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.Terms; -import org.apache.lucene.search.Sort; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.Version; - -import com.twitter.search.core.earlybird.facets.EarlybirdFacetDocValueSet; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldDocValues; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.inverted.InMemoryFields; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -public final class EarlybirdRealtimeIndexSegmentAtomicReader - extends EarlybirdIndexSegmentAtomicReader { - private final Fields fields; - private final int maxDocId; - private final int numDocs; - - /** - * Creates a new real-time reader for the given segment. Do not add public constructors to this - * class. EarlybirdRealtimeIndexSegmentAtomicReader instances should be created only by calling - * EarlybirdRealtimeIndexSegmentData.createAtomicReader(), to make sure everything is set up - * properly (such as CSF readers). 
- */ - EarlybirdRealtimeIndexSegmentAtomicReader(EarlybirdRealtimeIndexSegmentData segmentData) { - super(segmentData); - - this.fields = new InMemoryFields(segmentData.getPerFieldMap(), syncData.getIndexPointers()); - - // We cache the highest doc ID and the number of docs, because the reader must return the same - // values for its entire lifetime, and the segment will get more tweets over time. - // These values could be slightly out of sync with 'fields', because we don't update these - // values atomically with the fields. - this.maxDocId = segmentData.getDocIDToTweetIDMapper().getPreviousDocID(Integer.MAX_VALUE); - this.numDocs = segmentData.getDocIDToTweetIDMapper().getNumDocs(); - } - - @Override - public int maxDoc() { - return maxDocId + 1; - } - - @Override - public int numDocs() { - return numDocs; - } - - @Override - protected void doClose() { - // nothing to do - } - - @Override - public void document(int docID, StoredFieldVisitor visitor) { - // not supported - } - - @Override - public int getOldestDocID(Term t) throws IOException { - InvertedIndex perField = getSegmentData().getPerFieldMap().get(t.field()); - if (perField == null) { - return TERM_NOT_FOUND; - } - return perField.getLargestDocIDForTerm(t.bytes()); - } - - @Override - public int getTermID(Term t) throws IOException { - InvertedIndex perField = getSegmentData().getPerFieldMap().get(t.field()); - if (perField == null) { - return TERM_NOT_FOUND; - } - return perField.lookupTerm(t.bytes()); - } - - @Override - public Bits getLiveDocs() { - // liveDocs contains inverted (decreasing) docIDs. 
- return getDeletesView().getLiveDocs(); - } - - @Override - public boolean hasDeletions() { - return getDeletesView().hasDeletions(); - } - - @Override - public Terms terms(String field) throws IOException { - return fields.terms(field); - } - - @Override - public NumericDocValues getNumericDocValues(String field) throws IOException { - ColumnStrideFieldIndex csf = - getSegmentData().getDocValuesManager().getColumnStrideFieldIndex(field); - return csf != null ? new ColumnStrideFieldDocValues(csf, this) : null; - } - - @Override - public boolean hasDocs() { - // smallestDocID is the smallest document ID that was available when this reader was created. - // So we need to check its value in order to decide if this reader can see any documents, - // because in the meantime other documents might've been added to the tweet ID mapper. - return getSmallestDocID() != Integer.MAX_VALUE; - } - - @Override - public BinaryDocValues getBinaryDocValues(String field) { - return null; - } - - @Override - public SortedDocValues getSortedDocValues(String field) { - return null; - } - - @Override - public SortedSetDocValues getSortedSetDocValues(String field) { - // special handling for facet field - if (EarlybirdFacetDocValueSet.FIELD_NAME.equals(field)) { - return ((EarlybirdRealtimeIndexSegmentData) getSegmentData()).getFacetDocValueSet(); - } - - return null; - } - - @Override - public NumericDocValues getNormValues(String field) throws IOException { - ColumnStrideFieldIndex csf = getSegmentData().getNormIndex(field); - return csf != null ? 
new ColumnStrideFieldDocValues(csf, this) : null; - } - - @Override - public SortedNumericDocValues getSortedNumericDocValues(String field) { - return null; - } - - @Override - public void checkIntegrity() { - // nothing to do - } - - @Override - public PointValues getPointValues(String field) { - return null; - } - - @Override - public LeafMetaData getMetaData() { - return new LeafMetaData(Version.LATEST.major, Version.LATEST, Sort.RELEVANCE); - } - - @Override - public CacheHelper getCoreCacheHelper() { - return null; - } - - @Override - public CacheHelper getReaderCacheHelper() { - return null; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.docx new file mode 100644 index 000000000..b7142c1a9 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.java deleted file mode 100644 index 58ea6f3bf..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentData.java +++ /dev/null @@ -1,251 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.collect.Maps; - -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.search.IndexSearcher; - -import com.twitter.search.common.schema.SearchWhitespaceAnalyzer; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import 
com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import com.twitter.search.core.earlybird.facets.EarlybirdFacetDocValueSet; -import com.twitter.search.core.earlybird.facets.FacetCountingArray; -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetUtil; -import com.twitter.search.core.earlybird.facets.OptimizedFacetCountingArray; -import com.twitter.search.core.earlybird.index.column.DocValuesManager; -import com.twitter.search.core.earlybird.index.column.OptimizedDocValuesManager; -import com.twitter.search.core.earlybird.index.column.UnoptimizedDocValuesManager; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdRealtimeIndexExtensionsData; -import com.twitter.search.core.earlybird.index.inverted.DeletedDocs; -import com.twitter.search.core.earlybird.index.inverted.IndexOptimizer; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; - -/** - * Implements {@link EarlybirdIndexSegmentData} for real-time in-memory Earlybird segments. - */ -public class EarlybirdRealtimeIndexSegmentData extends EarlybirdIndexSegmentData { - private final EarlybirdRealtimeIndexExtensionsData indexExtension; - - private EarlybirdFacetDocValueSet facetDocValueSet; - - /** - * Creates a new empty real-time SegmentData instance. 
- */ - public EarlybirdRealtimeIndexSegmentData( - int maxSegmentSize, - long timeSliceID, - Schema schema, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - EarlybirdIndexExtensionsFactory indexExtensionsFactory) { - this( - maxSegmentSize, - timeSliceID, - schema, - false, // isOptimized - Integer.MAX_VALUE, - new ConcurrentHashMap<>(), - new FacetCountingArray(maxSegmentSize), - new UnoptimizedDocValuesManager(schema, maxSegmentSize), - Maps.newHashMapWithExpectedSize(schema.getNumFacetFields()), - FacetIDMap.build(schema), - new DeletedDocs.Default(maxSegmentSize), - docIdToTweetIdMapper, - timeMapper, - indexExtensionsFactory == null - ? null - : indexExtensionsFactory.newRealtimeIndexExtensionsData()); - } - - /** - * Creates a new real-time SegmentData instance using the passed in data structures. Usually this - * constructor is used by the FlushHandler after a segment was loaded from disk, but also the - * {@link IndexOptimizer} uses it to create an - * optimized segment. 
- */ - public EarlybirdRealtimeIndexSegmentData( - int maxSegmentSize, - long timeSliceID, - Schema schema, - boolean isOptimized, - int smallestDocID, - ConcurrentHashMap perFieldMap, - AbstractFacetCountingArray facetCountingArray, - DocValuesManager docValuesManager, - Map facetLabelProviders, - FacetIDMap facetIDMap, - DeletedDocs deletedDocs, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - EarlybirdRealtimeIndexExtensionsData indexExtension) { - super(maxSegmentSize, - timeSliceID, - schema, - isOptimized, - smallestDocID, - perFieldMap, - new ConcurrentHashMap<>(), - facetCountingArray, - docValuesManager, - facetLabelProviders, - facetIDMap, - deletedDocs, - docIdToTweetIdMapper, - timeMapper); - this.indexExtension = indexExtension; - this.facetDocValueSet = null; - } - - @Override - public EarlybirdRealtimeIndexExtensionsData getIndexExtensionsData() { - return indexExtension; - } - - /** - * For realtime segments, this wraps a facet datastructure into a SortedSetDocValues to - * comply to Lucene facet api. - */ - public EarlybirdFacetDocValueSet getFacetDocValueSet() { - if (facetDocValueSet == null) { - AbstractFacetCountingArray facetCountingArray = getFacetCountingArray(); - if (facetCountingArray != null) { - facetDocValueSet = new EarlybirdFacetDocValueSet( - facetCountingArray, getFacetLabelProviders(), getFacetIDMap()); - } - } - return facetDocValueSet; - } - - @Override - protected EarlybirdIndexSegmentAtomicReader doCreateAtomicReader() { - return new EarlybirdRealtimeIndexSegmentAtomicReader(this); - } - - /** - * Convenience method for creating an EarlybirdIndexSegmentWriter for this segment with a default - * IndexSegmentWriter config. 
- */ - public EarlybirdIndexSegmentWriter createEarlybirdIndexSegmentWriter() { - return createEarlybirdIndexSegmentWriter( - new IndexWriterConfig(new SearchWhitespaceAnalyzer()).setSimilarity( - IndexSearcher.getDefaultSimilarity())); - } - - @Override - public EarlybirdIndexSegmentWriter createEarlybirdIndexSegmentWriter( - IndexWriterConfig indexWriterConfig) { - // Prepare the in-memory segment with all enabled CSF fields. - DocValuesManager docValuesManager = getDocValuesManager(); - for (Schema.FieldInfo fieldInfo : getSchema().getFieldInfos()) { - if (fieldInfo.getFieldType().getCsfType() != null) { - docValuesManager.addColumnStrideField(fieldInfo.getName(), fieldInfo.getFieldType()); - } - } - - return new EarlybirdRealtimeIndexSegmentWriter( - this, - indexWriterConfig.getAnalyzer(), - indexWriterConfig.getSimilarity()); - } - - @Override - public EarlybirdIndexSegmentData.AbstractSegmentDataFlushHandler getFlushHandler() { - return new InMemorySegmentDataFlushHandler(this); - } - - public static class InMemorySegmentDataFlushHandler - extends AbstractSegmentDataFlushHandler { - public InMemorySegmentDataFlushHandler(EarlybirdIndexSegmentData objectToFlush) { - super(objectToFlush); - } - - public InMemorySegmentDataFlushHandler( - Schema schema, - EarlybirdIndexExtensionsFactory factory, - Flushable.Handler docIdMapperFlushHandler, - Flushable.Handler timeMapperFlushHandler) { - super(schema, factory, docIdMapperFlushHandler, timeMapperFlushHandler); - } - - @Override - protected EarlybirdRealtimeIndexExtensionsData newIndexExtension() { - return indexExtensionsFactory.newRealtimeIndexExtensionsData(); - } - - @Override - protected void flushAdditionalDataStructures( - FlushInfo flushInfo, - DataSerializer out, - EarlybirdIndexSegmentData segmentData) throws IOException { - segmentData.getFacetCountingArray().getFlushHandler() - .flush(flushInfo.newSubProperties("facet_counting_array"), out); - - // flush all column stride fields - 
segmentData.getDocValuesManager().getFlushHandler() - .flush(flushInfo.newSubProperties("doc_values"), out); - - segmentData.getFacetIDMap().getFlushHandler() - .flush(flushInfo.newSubProperties("facet_id_map"), out); - - segmentData.getDeletedDocs().getFlushHandler() - .flush(flushInfo.newSubProperties("deleted_docs"), out); - } - - @Override - protected EarlybirdIndexSegmentData constructSegmentData( - FlushInfo flushInfo, - ConcurrentHashMap perFieldMap, - int maxSegmentSize, - EarlybirdRealtimeIndexExtensionsData indexExtension, - DocIDToTweetIDMapper docIdToTweetIdMapper, - TimeMapper timeMapper, - DataDeserializer in) throws IOException { - boolean isOptimized = flushInfo.getBooleanProperty(IS_OPTIMIZED_PROP_NAME); - - Flushable.Handler facetLoader = isOptimized - ? new OptimizedFacetCountingArray.FlushHandler() - : new FacetCountingArray.FlushHandler(maxSegmentSize); - AbstractFacetCountingArray facetCountingArray = - facetLoader.load(flushInfo.getSubProperties("facet_counting_array"), in); - - Flushable.Handler docValuesLoader = isOptimized - ? 
new OptimizedDocValuesManager.OptimizedFlushHandler(schema) - : new UnoptimizedDocValuesManager.UnoptimizedFlushHandler(schema); - DocValuesManager docValuesManager = - docValuesLoader.load(flushInfo.getSubProperties("doc_values"), in); - - FacetIDMap facetIDMap = new FacetIDMap.FlushHandler(schema) - .load(flushInfo.getSubProperties("facet_id_map"), in); - - DeletedDocs.Default deletedDocs = new DeletedDocs.Default.FlushHandler(maxSegmentSize) - .load(flushInfo.getSubProperties("deleted_docs"), in); - - return new EarlybirdRealtimeIndexSegmentData( - maxSegmentSize, - flushInfo.getLongProperty(TIME_SLICE_ID_PROP_NAME), - schema, - isOptimized, - flushInfo.getIntProperty(SMALLEST_DOCID_PROP_NAME), - perFieldMap, - facetCountingArray, - docValuesManager, - FacetUtil.getFacetLabelProviders(schema, perFieldMap), - facetIDMap, - deletedDocs, - docIdToTweetIdMapper, - timeMapper, - indexExtension); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.docx b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.docx new file mode 100644 index 000000000..6c846d9f1 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.java b/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.java deleted file mode 100644 index 049d33ce8..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/EarlybirdRealtimeIndexSegmentWriter.java +++ /dev/null @@ -1,789 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -import 
javax.annotation.Nullable; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.FieldInvertState; -import org.apache.lucene.index.IndexOptions; -import org.apache.lucene.index.IndexableField; -import org.apache.lucene.index.IndexableFieldType; -import org.apache.lucene.search.similarities.Similarity; -import org.apache.lucene.store.Directory; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.Version; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.core.earlybird.facets.FacetCountingArrayWriter; -import com.twitter.search.core.earlybird.facets.FacetIDMap.FacetField; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetUtil; -import com.twitter.search.core.earlybird.index.column.ColumnStrideByteIndex; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdRealtimeIndexExtensionsData; -import com.twitter.search.core.earlybird.index.inverted.EarlybirdCSFDocValuesProcessor; -import 
com.twitter.search.core.earlybird.index.inverted.InvertedRealtimeIndex; -import com.twitter.search.core.earlybird.index.inverted.InvertedRealtimeIndexWriter; -import com.twitter.search.core.earlybird.index.inverted.TermPointerEncoding; -import com.twitter.search.core.earlybird.index.util.AllDocsIterator; - -/** - * EarlybirdIndexWriter implementation that writes realtime in-memory segments. - * Note that it is used by both Earlybirds and ExpertSearch. - */ -public final class EarlybirdRealtimeIndexSegmentWriter extends EarlybirdIndexSegmentWriter { - private static final Logger LOG = - LoggerFactory.getLogger(EarlybirdRealtimeIndexSegmentWriter.class); - /** - * Maximum tweet length is 10k, setting maximum token position to 25k in case of weird unicode. - */ - private static final int MAX_POSITION = 25000; - - private static final String OUT_OF_ORDER_APPEND_UNSUPPORTED_STATS_PATTERN = - "out_of_order_append_unsupported_for_field_%s"; - private static final ConcurrentHashMap - UNSUPPORTED_OUT_OF_ORDER_APPEND_MAP = new ConcurrentHashMap<>(); - private static final SearchRateCounter NUM_TWEETS_DROPPED = - SearchRateCounter.export("EarlybirdRealtimeIndexSegmentWriter_num_tweets_dropped"); - - private long nextFieldGen; - - private HashMap fields = new HashMap<>(); - private List fieldsInDocument = new ArrayList<>(); - - private final EarlybirdCSFDocValuesProcessor docValuesProcessor; - - private Map termHashSync = new HashMap<>(); - private Set appendedFields = new HashSet<>(); - - private final Analyzer analyzer; - private final Similarity similarity; - - private final EarlybirdRealtimeIndexSegmentData segmentData; - - private final Field allDocsField; - - @Nullable - private final FacetCountingArrayWriter facetCountingArrayWriter; - - /** - * Creates a new writer for a real-time in-memory Earlybird segment. - * - * Do not add public constructors to this class. 
EarlybirdRealtimeIndexSegmentWriter instances - * should be created only by calling - * EarlybirdRealtimeIndexSegmentData.createEarlybirdIndexSegmentWriter(), to make sure everything - * is set up properly (such as CSF readers). - */ - EarlybirdRealtimeIndexSegmentWriter( - EarlybirdRealtimeIndexSegmentData segmentData, - Analyzer analyzer, - Similarity similarity) { - Preconditions.checkNotNull(segmentData); - this.segmentData = segmentData; - this.facetCountingArrayWriter = segmentData.createFacetCountingArrayWriter(); - this.docValuesProcessor = new EarlybirdCSFDocValuesProcessor(segmentData.getDocValuesManager()); - this.analyzer = analyzer; - this.similarity = similarity; - this.allDocsField = buildAllDocsField(segmentData); - } - - @Override - public EarlybirdRealtimeIndexSegmentData getSegmentData() { - return segmentData; - } - - @Override - public int numDocsNoDelete() { - return segmentData.getDocIDToTweetIDMapper().getNumDocs(); - } - - @Override - public void addDocument(Document doc) throws IOException { - // This method should be called only from Expertsearch, not tweets Earlybirds. - DocIDToTweetIDMapper docIdToTweetIdMapper = segmentData.getDocIDToTweetIDMapper(); - Preconditions.checkState(docIdToTweetIdMapper instanceof SequentialDocIDMapper); - - // Make sure we have space for a new doc in this segment. - Preconditions.checkState(docIdToTweetIdMapper.getNumDocs() < segmentData.getMaxSegmentSize(), - "Cannot add a new document to the segment, because it's full."); - - addDocument(doc, docIdToTweetIdMapper.addMapping(-1L), false); - } - - @Override - public void addTweet(Document doc, long tweetId, boolean docIsOffensive) throws IOException { - DocIDToTweetIDMapper docIdToTweetIdMapper = segmentData.getDocIDToTweetIDMapper(); - Preconditions.checkState(!(docIdToTweetIdMapper instanceof SequentialDocIDMapper)); - - // Make sure we have space for a new doc in this segment. 
- Preconditions.checkState(docIdToTweetIdMapper.getNumDocs() < segmentData.getMaxSegmentSize(), - "Cannot add a new document to the segment, because it's full."); - - Preconditions.checkNotNull(doc.getField( - EarlybirdFieldConstants.EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName())); - - addAllDocsField(doc); - - int docId = docIdToTweetIdMapper.addMapping(tweetId); - // Make sure we successfully assigned a doc ID to the new document/tweet before proceeding. - // If the docId is DocIDToTweetIDMapper.ID_NOT_FOUND then either: - // 1. the tweet is older than the OutOfOrderRealtimeTweetIDMapper.segmentBoundaryTimestamp and - // is too old for this segment - // 2. the OutOfOrderRealtimeTweetIDMapper does not have any available doc ids left - if (docId == DocIDToTweetIDMapper.ID_NOT_FOUND) { - LOG.info("Could not assign doc id for tweet. Dropping tweet id " + tweetId - + " for segment with timeslice: " + segmentData.getTimeSliceID()); - NUM_TWEETS_DROPPED.increment(); - return; - } - - addDocument(doc, docId, docIsOffensive); - } - - private void addDocument(Document doc, - int docId, - boolean docIsOffensive) throws IOException { - fieldsInDocument.clear(); - - long fieldGen = nextFieldGen++; - - // NOTE: we need two passes here, in case there are - // multi-valued fields, because we must process all - // instances of a given field at once, since the - // analyzer is free to reuse TokenStream across fields - // (i.e., we cannot have more than one TokenStream - // running "at once"): - - try { - for (IndexableField field : doc) { - if (!skipField(field.name())) { - processField(docId, field, fieldGen, docIsOffensive); - } - } - } finally { - // Finish each indexed field name seen in the document: - for (PerField field : fieldsInDocument) { - field.finish(docId); - } - - // When indexing a dummy document for out-of-order updates into a loaded segment, that - // document gets docID set as maxSegment size. 
So we have to make sure that we never - // sync backwards in document order. - int smallestDocID = Math.min(docId, segmentData.getSyncData().getSmallestDocID()); - segmentData.updateSmallestDocID(smallestDocID); - } - } - - @Override - protected void appendOutOfOrder(Document doc, int internalDocID) throws IOException { - Preconditions.checkNotNull(doc); - fieldsInDocument.clear(); - - long fieldGen = nextFieldGen++; - - try { - for (IndexableField indexableField : doc) { - if (!skipField(indexableField.name())) { - Schema.FieldInfo fi = segmentData.getSchema().getFieldInfo(indexableField.name()); - if (fi == null) { - LOG.error("FieldInfo for " + indexableField.name() + " is null!"); - continue; - } - if (segmentData.isOptimized() && fi.getFieldType().becomesImmutable()) { - UNSUPPORTED_OUT_OF_ORDER_APPEND_MAP.computeIfAbsent( - indexableField.name(), - f -> SearchRateCounter.export( - String.format(OUT_OF_ORDER_APPEND_UNSUPPORTED_STATS_PATTERN, f)) - ).increment(); - continue; - } - processField(internalDocID, indexableField, fieldGen, false); - appendedFields.add(indexableField.name()); - } - } - } finally { - // Finish each indexed field name seen in the document: - for (PerField field : fieldsInDocument) { - field.finish(internalDocID); - } - // force sync - segmentData.updateSmallestDocID(segmentData.getSyncData().getSmallestDocID()); - } - } - - @Override - public void addIndexes(Directory... 
dirs) { - throw new UnsupportedOperationException("In realtime mode addIndexes() is currently " - + "not supported."); - } - - @Override - public void forceMerge() { - // we always have a single segment in realtime-mode - } - - @Override - public void close() { - // nothing to close - } - - private void processField( - int docId, - IndexableField field, - long fieldGen, - boolean currentDocIsOffensive) throws IOException { - String fieldName = field.name(); - IndexableFieldType fieldType = field.fieldType(); - - // Invert indexed fields: - if (fieldType.indexOptions() != IndexOptions.NONE) { - PerField perField = getOrAddField(fieldName, fieldType); - - // Whether this is the first time we have seen this field in this document. - boolean first = perField.fieldGen != fieldGen; - perField.invert(field, docId, first, currentDocIsOffensive); - - if (first) { - fieldsInDocument.add(perField); - perField.fieldGen = fieldGen; - } - } else { - Schema.FieldInfo facetFieldInfo = - segmentData.getSchema().getFacetFieldByFieldName(fieldName); - FacetField facetField = facetFieldInfo != null - ? segmentData.getFacetIDMap().getFacetField(facetFieldInfo) : null; - EarlybirdFieldType facetFieldType = facetFieldInfo != null - ? 
facetFieldInfo.getFieldType() : null; - Preconditions.checkState( - facetFieldInfo == null || (facetField != null && facetFieldType != null)); - if (facetField != null && facetFieldType.isUseCSFForFacetCounting()) { - segmentData.getFacetLabelProviders().put( - facetField.getFacetName(), - Preconditions.checkNotNull( - FacetUtil.chooseFacetLabelProvider(facetFieldType, null))); - } - } - - if (fieldType.docValuesType() != DocValuesType.NONE) { - StoredFieldsConsumerBuilder consumerBuilder = new StoredFieldsConsumerBuilder( - fieldName, (EarlybirdFieldType) fieldType); - EarlybirdRealtimeIndexExtensionsData indexExtension = segmentData.getIndexExtensionsData(); - if (indexExtension != null) { - indexExtension.createStoredFieldsConsumer(consumerBuilder); - } - if (consumerBuilder.isUseDefaultConsumer()) { - consumerBuilder.addConsumer(docValuesProcessor); - } - - StoredFieldsConsumer storedFieldsConsumer = consumerBuilder.build(); - if (storedFieldsConsumer != null) { - storedFieldsConsumer.addField(docId, field); - } - } - } - - /** Returns a previously created {@link PerField}, absorbing the type information from - * {@link org.apache.lucene.document.FieldType}, and creates a new {@link PerField} if this field - * name wasn't seen yet. */ - private PerField getOrAddField(String name, IndexableFieldType fieldType) { - // Note that this could be a computeIfAbsent, but that allocates a closure in the hot path and - // slows down indexing. - PerField perField = fields.get(name); - if (perField == null) { - boolean omitNorms = fieldType.omitNorms() || fieldType.indexOptions() == IndexOptions.NONE; - perField = new PerField(this, name, fieldType.indexOptions(), omitNorms); - fields.put(name, perField); - } - return perField; - } - - /** NOTE: not static: accesses at least docState, termsHash. 
*/ - private static final class PerField implements Comparable { - - private final EarlybirdRealtimeIndexSegmentWriter indexSegmentWriter; - - private final String fieldName; - private final IndexOptions indexOptions; - private final boolean omitNorms; - - private InvertedRealtimeIndex invertedField; - private InvertedDocConsumer indexWriter; - - /** We use this to know when a PerField is seen for the - * first time in the current document. */ - private long fieldGen = -1; - - // reused - private TokenStream tokenStream; - - private int currentPosition; - private int currentOffset; - private int currentLength; - private int currentOverlap; - private int lastStartOffset; - private int lastPosition; - - public PerField( - EarlybirdRealtimeIndexSegmentWriter indexSegmentWriter, - String fieldName, - IndexOptions indexOptions, - boolean omitNorms) { - this.indexSegmentWriter = indexSegmentWriter; - this.fieldName = fieldName; - this.indexOptions = indexOptions; - this.omitNorms = omitNorms; - - initInvertState(); - } - - void initInvertState() { - // it's okay if this is null - in that case TwitterTermHashPerField - // will not add it to the facet array - final Schema.FieldInfo facetFieldInfo - = indexSegmentWriter.segmentData.getSchema().getFacetFieldByFieldName(fieldName); - final FacetField facetField = facetFieldInfo != null - ? indexSegmentWriter.segmentData.getFacetIDMap().getFacetField(facetFieldInfo) : null; - final EarlybirdFieldType facetFieldType - = facetFieldInfo != null ? 
facetFieldInfo.getFieldType() : null; - Preconditions.checkState( - facetFieldInfo == null || (facetField != null && facetFieldType != null)); - - if (facetField != null && facetFieldType.isUseCSFForFacetCounting()) { - indexSegmentWriter.segmentData.getFacetLabelProviders().put( - facetField.getFacetName(), - Preconditions.checkNotNull( - FacetUtil.chooseFacetLabelProvider(facetFieldType, null))); - return; - } - - Schema.FieldInfo fi = indexSegmentWriter.segmentData.getSchema().getFieldInfo(fieldName); - final EarlybirdFieldType fieldType = fi.getFieldType(); - - InvertedDocConsumerBuilder consumerBuilder = new InvertedDocConsumerBuilder( - indexSegmentWriter.segmentData, fieldName, fieldType); - EarlybirdRealtimeIndexExtensionsData indexExtension = - indexSegmentWriter.segmentData.getIndexExtensionsData(); - if (indexExtension != null) { - indexExtension.createInvertedDocConsumer(consumerBuilder); - } - - if (consumerBuilder.isUseDefaultConsumer()) { - if (indexSegmentWriter.segmentData.getPerFieldMap().containsKey(fieldName)) { - invertedField = (InvertedRealtimeIndex) indexSegmentWriter - .segmentData.getPerFieldMap().get(fieldName); - } else { - invertedField = new InvertedRealtimeIndex( - fieldType, - TermPointerEncoding.DEFAULT_ENCODING, - fieldName); - } - - InvertedRealtimeIndexWriter fieldWriter = new InvertedRealtimeIndexWriter( - invertedField, facetField, indexSegmentWriter.facetCountingArrayWriter); - - if (facetField != null) { - Map providerMap = - indexSegmentWriter.segmentData.getFacetLabelProviders(); - if (!providerMap.containsKey(facetField.getFacetName())) { - providerMap.put( - facetField.getFacetName(), - Preconditions.checkNotNull( - FacetUtil.chooseFacetLabelProvider(facetFieldType, invertedField))); - } - } - - indexSegmentWriter.segmentData.addField(fieldName, invertedField); - - if (indexSegmentWriter.appendedFields.contains(fieldName)) { - indexSegmentWriter.termHashSync.put(fieldName, fieldWriter); - } - - 
consumerBuilder.addConsumer(fieldWriter); - } - - indexWriter = consumerBuilder.build(); - } - - @Override - public int compareTo(PerField other) { - return this.fieldName.compareTo(other.fieldName); - } - - @Override - public boolean equals(Object other) { - if (!(other instanceof PerField)) { - return false; - } - - return this.fieldName.equals(((PerField) other).fieldName); - } - - @Override - public int hashCode() { - return fieldName.hashCode(); - } - - public void finish(int docId) { - if (indexWriter != null) { - indexWriter.finish(); - } - - if (!omitNorms) { - FieldInvertState state = new FieldInvertState( - Version.LATEST.major, - fieldName, - indexOptions, - currentPosition, - currentLength, - currentOverlap, - currentOffset, - 0, // maxTermFrequency - 0); // uniqueTermCount - ColumnStrideByteIndex normsIndex = - indexSegmentWriter.segmentData.createNormIndex(fieldName); - if (normsIndex != null) { - normsIndex.setValue(docId, (byte) indexSegmentWriter.similarity.computeNorm(state)); - } - } - } - - /** Inverts one field for one document; first is true - * if this is the first time we are seeing this field - * name in this document. 
*/ - public void invert(IndexableField field, - int docId, - boolean first, - boolean currentDocIsOffensive) throws IOException { - if (indexWriter == null) { - return; - } - if (first) { - currentPosition = -1; - currentOffset = 0; - lastPosition = 0; - lastStartOffset = 0; - - if (invertedField != null) { - invertedField.incrementNumDocs(); - } - } - - IndexableFieldType fieldType = field.fieldType(); - final boolean analyzed = fieldType.tokenized() && indexSegmentWriter.analyzer != null; - boolean succeededInProcessingField = false; - try { - tokenStream = field.tokenStream(indexSegmentWriter.analyzer, tokenStream); - tokenStream.reset(); - - PositionIncrementAttribute posIncrAttribute = - tokenStream.addAttribute(PositionIncrementAttribute.class); - OffsetAttribute offsetAttribute = tokenStream.addAttribute(OffsetAttribute.class); - TermToBytesRefAttribute termAtt = tokenStream.addAttribute(TermToBytesRefAttribute.class); - - Set seenTerms = new HashSet<>(); - indexWriter.start(tokenStream, currentDocIsOffensive); - while (tokenStream.incrementToken()) { - // If we hit an exception in stream.next below - // (which is fairly common, e.g. 
if analyzer - // chokes on a given document), then it's - // non-aborting and (above) this one document - // will be marked as deleted, but still - // consume a docID - - int posIncr = posIncrAttribute.getPositionIncrement(); - currentPosition += posIncr; - if (currentPosition < lastPosition) { - if (posIncr == 0) { - throw new IllegalArgumentException( - "first position increment must be > 0 (got 0) for field '" + field.name() + "'"); - } else if (posIncr < 0) { - throw new IllegalArgumentException( - "position increments (and gaps) must be >= 0 (got " + posIncr + ") for field '" - + field.name() + "'"); - } else { - throw new IllegalArgumentException( - "position overflowed Integer.MAX_VALUE (got posIncr=" + posIncr + " lastPosition=" - + lastPosition + " position=" + currentPosition + ") for field '" + field.name() - + "'"); - } - } else if (currentPosition > MAX_POSITION) { - throw new IllegalArgumentException( - "position " + currentPosition + " is too large for field '" + field.name() - + "': max allowed position is " + MAX_POSITION); - } - lastPosition = currentPosition; - if (posIncr == 0) { - currentOverlap++; - } - - int startOffset = currentOffset + offsetAttribute.startOffset(); - int endOffset = currentOffset + offsetAttribute.endOffset(); - if (startOffset < lastStartOffset || endOffset < startOffset) { - throw new IllegalArgumentException( - "startOffset must be non-negative, and endOffset must be >= startOffset, and " - + "offsets must not go backwards startOffset=" + startOffset + ",endOffset=" - + endOffset + ",lastStartOffset=" + lastStartOffset + " for field '" + field.name() - + "'"); - } - lastStartOffset = startOffset; - indexWriter.add(docId, currentPosition); - currentLength++; - - BytesRef term = termAtt.getBytesRef(); - if (seenTerms.add(term) && (invertedField != null)) { - invertedField.incrementSumTermDocFreq(); - } - } - - tokenStream.end(); - - currentPosition += posIncrAttribute.getPositionIncrement(); - currentOffset += 
offsetAttribute.endOffset(); - succeededInProcessingField = true; - } catch (BytesRefHash.MaxBytesLengthExceededException e) { - byte[] prefix = new byte[30]; - BytesRef bigTerm = tokenStream.getAttribute(TermToBytesRefAttribute.class).getBytesRef(); - System.arraycopy(bigTerm.bytes, bigTerm.offset, prefix, 0, 30); - String msg = "Document contains at least one immense term in field=\"" + fieldName - + "\" (whose UTF8 encoding is longer than the max length), all of " - + "which were skipped." + "Please correct the analyzer to not produce such terms. " - + "The prefix of the first immense term is: '" + Arrays.toString(prefix) - + "...', original message: " + e.getMessage(); - LOG.warn(msg); - // Document will be deleted above: - throw new IllegalArgumentException(msg, e); - } finally { - if (!succeededInProcessingField) { - LOG.warn("An exception was thrown while processing field " + fieldName); - } - if (tokenStream != null) { - try { - tokenStream.close(); - } catch (IOException e) { - if (succeededInProcessingField) { - // only throw this exception if no other exception already occurred above - throw e; - } else { - LOG.warn("Exception while trying to close TokenStream.", e); - } - } - } - } - - if (analyzed) { - currentPosition += indexSegmentWriter.analyzer.getPositionIncrementGap(fieldName); - currentOffset += indexSegmentWriter.analyzer.getOffsetGap(fieldName); - } - } - } - - @Override - public int numDocs() { - return segmentData.getDocIDToTweetIDMapper().getNumDocs(); - } - - public interface InvertedDocConsumer { - /** - * Called for each document before inversion starts. - */ - void start(AttributeSource attributeSource, boolean currentDocIsOffensive); - - /** - * Called for each token in the current document. - * @param docID Document id. - * @param position Position in the token stream for this document. 
- */ - void add(int docID, int position) throws IOException; - - /** - * Called after the last token was added and before the next document is processed. - */ - void finish(); - } - - public interface StoredFieldsConsumer { - /** - * Adds a new stored fields. - */ - void addField(int docID, IndexableField field) throws IOException; - } - - /** - * This Builder allows registering listeners for a particular field of an indexable document. - * For each field name any number of listeners can be added. - * - * Using {@link #useDefaultConsumer} it can be specified whether this index writer will use - * the default consumer in addition to any additionally registered consumers. - */ - public abstract static class ConsumerBuilder { - private boolean useDefaultConsumer; - private final List consumers; - private final EarlybirdFieldType fieldType; - private final String fieldName; - - private ConsumerBuilder(String fieldName, EarlybirdFieldType fieldType) { - useDefaultConsumer = true; - consumers = Lists.newArrayList(); - this.fieldName = fieldName; - this.fieldType = fieldType; - } - - public String getFieldName() { - return fieldName; - } - - public EarlybirdFieldType getFieldType() { - return fieldType; - } - - /** - * If set to true, {@link EarlybirdRealtimeIndexSegmentWriter} will use the default consumer - * (e.g. build a default inverted index for an inverted field) in addition to any consumers - * added via {@link #addConsumer(Object)}. - */ - public void setUseDefaultConsumer(boolean useDefaultConsumer) { - this.useDefaultConsumer = useDefaultConsumer; - } - - public boolean isUseDefaultConsumer() { - return useDefaultConsumer; - } - - /** - * Allows registering any number of additional consumers for the field associated with this - * builder. 
- */ - public void addConsumer(T consumer) { - consumers.add(consumer); - } - - T build() { - if (consumers.isEmpty()) { - return null; - } else if (consumers.size() == 1) { - return consumers.get(0); - } else { - return build(consumers); - } - } - - abstract T build(List consumerList); - } - - public static final class StoredFieldsConsumerBuilder - extends ConsumerBuilder { - private StoredFieldsConsumerBuilder(String fieldName, EarlybirdFieldType fieldType) { - super(fieldName, fieldType); - } - - @Override - StoredFieldsConsumer build(final List consumers) { - return (docID, field) -> { - for (StoredFieldsConsumer consumer : consumers) { - consumer.addField(docID, field); - } - }; - } - } - - public static final class InvertedDocConsumerBuilder - extends ConsumerBuilder { - private final EarlybirdIndexSegmentData segmentData; - - private InvertedDocConsumerBuilder( - EarlybirdIndexSegmentData segmentData, String fieldName, EarlybirdFieldType fieldType) { - super(fieldName, fieldType); - this.segmentData = segmentData; - } - - @Override - InvertedDocConsumer build(final List consumers) { - return new InvertedDocConsumer() { - @Override - public void start(AttributeSource attributeSource, boolean currentDocIsOffensive) { - for (InvertedDocConsumer consumer : consumers) { - consumer.start(attributeSource, currentDocIsOffensive); - } - } - - @Override - public void finish() { - for (InvertedDocConsumer consumer : consumers) { - consumer.finish(); - } - } - - @Override - public void add(int docID, int position) throws IOException { - for (InvertedDocConsumer consumer : consumers) { - consumer.add(docID, position); - } - } - }; - } - - public EarlybirdIndexSegmentData getSegmentData() { - return segmentData; - } - } - - /** - * Returns true, if a field should not be indexed. - * @deprecated This writer should be able to process all fields in the future. 
- */ - @Deprecated - private static boolean skipField(String fieldName) { - // ignore lucene facet fields for realtime index, we are handling it differently for now. - return fieldName.startsWith(FacetsConfig.DEFAULT_INDEX_FIELD_NAME); - } - - private static Field buildAllDocsField(EarlybirdRealtimeIndexSegmentData segmentData) { - String fieldName = EarlybirdFieldConstants.EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(); - if (segmentData.getSchema().hasField(fieldName)) { - Schema.FieldInfo fi = Preconditions.checkNotNull( - segmentData.getSchema().getFieldInfo(fieldName)); - return new Field(fi.getName(), AllDocsIterator.ALL_DOCS_TERM, fi.getFieldType()); - } - - return null; - } - - /** - * Every document must have this field and term, so that we can safely iterate through documents - * using {@link AllDocsIterator}. This is to prevent the problem of adding a tweet to the doc ID - * mapper, and returning it for a match-all query when the rest of the document hasn't been - * published. This could lead to queries returning incorrect results for queries that are only - * negations. 
- * */ - private void addAllDocsField(Document doc) { - if (allDocsField != null) { - doc.add(allDocsField); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.docx b/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.docx new file mode 100644 index 000000000..403c520c4 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.java b/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.java deleted file mode 100644 index 21d2e0d29..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/QueryCacheResultForSegment.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import org.apache.lucene.search.DocIdSet; - -/** - * Class to hold the actual cache which provides a doc id iterator to walk through the cache/result. - * - * An instance holds the results for a single query of the different ones defined in querycache.yml. - */ -public class QueryCacheResultForSegment { - private final DocIdSet docIdSet; - private final int smallestDocID; - private final long cardinality; - - /** - * Stores query cache results. - * - * @param docIdSet Documents in the cache. - * @param cardinality Size of the cache. - * @param smallestDocID The most recently posted document contained in the cache. 
- */ - public QueryCacheResultForSegment(DocIdSet docIdSet, long cardinality, int smallestDocID) { - this.docIdSet = docIdSet; - this.smallestDocID = smallestDocID; - this.cardinality = cardinality; - } - - public DocIdSet getDocIdSet() { - return docIdSet; - } - - public int getSmallestDocID() { - return smallestDocID; - } - - public long getCardinality() { - return cardinality; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.docx b/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.docx new file mode 100644 index 000000000..6a3323ab9 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.java b/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.java deleted file mode 100644 index 3d029dcb2..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/SequentialDocIDMapper.java +++ /dev/null @@ -1,87 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -/** - * A doc ID mapper that assigns doc IDs sequentially in decreasing order, starting with the given - * max ID. Used by Expertsearch, which doesn't index tweets. - */ -public class SequentialDocIDMapper implements DocIDToTweetIDMapper { - private final int maxSegmentSize; - private int lastAssignedDocId; - - public SequentialDocIDMapper(int maxSegmentSize) { - this.maxSegmentSize = maxSegmentSize; - lastAssignedDocId = maxSegmentSize; - } - - @Override - public long getTweetID(int docID) { - // Should be used only at segment optimization time and in tests. - if ((docID < lastAssignedDocId) || (docID >= maxSegmentSize)) { - return ID_NOT_FOUND; - } - - return docID; - } - - @Override - public int getDocID(long tweetID) { - // Should be used only at segment optimization time and in tests. 
- if ((tweetID < lastAssignedDocId) || (tweetID >= maxSegmentSize)) { - return ID_NOT_FOUND; - } - - return (int) tweetID; - } - - @Override - public int getNumDocs() { - return maxSegmentSize - lastAssignedDocId; - } - - @Override - public int getNextDocID(int docID) { - int nextDocID = docID + 1; - - // nextDocID is larger than any doc ID that can be assigned by this mapper. - if (nextDocID >= maxSegmentSize) { - return ID_NOT_FOUND; - } - - // nextDocID is smaller than any doc ID assigned by this mapper so far. - if (nextDocID < lastAssignedDocId) { - return lastAssignedDocId; - } - - // nextDocID is in the range of doc IDs assigned by this mapper. - return nextDocID; - } - - @Override - public int getPreviousDocID(int docID) { - int previousDocID = docID - 1; - - // previousDocID is larger than any doc ID that can be assigned by this mapper. - if (previousDocID >= maxSegmentSize) { - return maxSegmentSize - 1; - } - - // previousDocID is smaller than any doc ID assigned by this mapper so far. - if (previousDocID < lastAssignedDocId) { - return ID_NOT_FOUND; - } - - // previousDocID is in the range of doc IDs assigned by this mapper. - return previousDocID; - } - - @Override - public int addMapping(final long tweetID) { - return --lastAssignedDocId; - } - - @Override - public DocIDToTweetIDMapper optimize() { - // Segments that use this DocIDToTweetIDMapper should never be optimized. 
- throw new UnsupportedOperationException("SequentialDocIDMapper cannot be optimized."); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/TimeMapper.docx b/src/java/com/twitter/search/core/earlybird/index/TimeMapper.docx new file mode 100644 index 000000000..6189662a8 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/TimeMapper.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/TimeMapper.java b/src/java/com/twitter/search/core/earlybird/index/TimeMapper.java deleted file mode 100644 index e2f609168..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/TimeMapper.java +++ /dev/null @@ -1,80 +0,0 @@ -package com.twitter.search.core.earlybird.index; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.Flushable; - -/** - * Maps timestamps to the doc IDs assigned to the documents that are indexed (tweets, users, etc.). - */ -public interface TimeMapper extends Flushable { - // Unless specified, all time fields are seconds-since-epoch. - int ILLEGAL_TIME = Integer.MIN_VALUE; - - /** - * Returns the time of the newest tweet in the index. - * - * @return The time of the newest tweet in the index. - */ - int getLastTime(); - - /** - * Returns the time of the oldest tweet in the index. - * - * @return The time of the oldest tweet in the index. - */ - int getFirstTime(); - - /** - * Returns the timestamp of the document mapped to the given doc ID, or ILLEGAL_TIME if this - * mapper doesn't know about this doc ID. - * - * @param docID The document's internal ID. - * @return The timestamp of the document mapped to the given doc ID. - */ - int getTime(int docID); - - /** - * Returns the doc ID of the first indexed document with a timestamp equal to or greater than the - * given timestamp. - * - * If timeSeconds is larger than the max timestamp in this mapper, smallestDocID is returned. 
- * If timeSeconds is smaller than the min timestamp in the mapper, the largest docID is returned. - * - * Note that when tweets are indexed out of order, this method might return the doc ID of a tweet - * with a timestamp greater than timeSeconds, even if there's a tweet with a timestamp of - * timeSeconds. So the callers of this method can use the returned doc ID as a starting point for - * iteration purposes, but should have a check that the traversed doc IDs have a timestamp in the - * desired range. See SinceUntilFilter.getDocIdSet() for an example. - * - * Example: - * DocIds: 6, 5, 4, 3, 2, 1, 0 - * Times: 1, 5, 3, 4, 4, 3, 6 - * With that data: - * findFirstDocId(1, 0) should return 6. - * findFirstDocId(3, 0) should return 5. - * findFirstDocId(4, 0) should return 5. - * findFirstDocId(5, 0) should return 5. - * findFirstDocId(6, 0) should return 0. - * - * @param timeSeconds The boundary timestamp, in seconds. - * @param smallestDocID The doc ID to return if the given time boundary is larger than the max - * timestamp in this mapper. - */ - int findFirstDocId(int timeSeconds, int smallestDocID) throws IOException; - - /** - * Optimizes this time mapper. - * - * At segment optimization time, the doc IDs assigned to the documents in that segment might - * change (they might be mapped to a more compact space for performance reasons, for example). - * When that happens, we need to remap accordingly the doc IDs stored in the time mapper for that - * segment too. It would also be a good time to optimize the data stored in the time mapper. - * - * @param originalDocIdMapper The doc ID mapper used by this segment before it was optimized. - * @param optimizedDocIdMapper The doc ID mapper used by this segment after it was optimized. - * @return An optimized TimeMapper with the same tweet IDs. 
- */ - TimeMapper optimize(DocIDToTweetIDMapper originalDocIdMapper, - DocIDToTweetIDMapper optimizedDocIdMapper) throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.docx new file mode 100644 index 000000000..5b87ac3c5 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.java deleted file mode 100644 index d7dc63910..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/AbstractColumnStrideMultiIntIndex.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.BinaryDocValues; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.encoding.docvalues.CSFTypeUtil; -import com.twitter.search.common.util.io.flushable.Flushable; - -public abstract class AbstractColumnStrideMultiIntIndex - extends ColumnStrideFieldIndex implements Flushable { - private static final int NUM_BYTES_PER_INT = java.lang.Integer.SIZE / java.lang.Byte.SIZE; - - private final int numIntsPerField; - - protected AbstractColumnStrideMultiIntIndex(String name, int numIntsPerField) { - super(name); - this.numIntsPerField = numIntsPerField; - } - - public int getNumIntsPerField() { - return numIntsPerField; - } - - @Override - public long get(int docID) { - throw new UnsupportedOperationException(); - } - - /** - * Returns the value stored at the given index for the given doc ID. 
- */ - public abstract int get(int docID, int valueIndex); - - /** - * Sets the value stored at the given index for the given doc ID. - */ - public abstract void setValue(int docID, int valueIndex, int val); - - @Override - public void load(LeafReader atomicReader, String field) throws IOException { - BinaryDocValues docValues = atomicReader.getBinaryDocValues(field); - int numBytesPerDoc = numIntsPerField * NUM_BYTES_PER_INT; - - for (int docID = 0; docID < atomicReader.maxDoc(); docID++) { - Preconditions.checkState(docValues.advanceExact(docID)); - BytesRef scratch = docValues.binaryValue(); - Preconditions.checkState( - scratch.length == numBytesPerDoc, - "Unexpected doc value length for field " + field - + ": Should be " + numBytesPerDoc + ", but was " + scratch.length); - - scratch.length = NUM_BYTES_PER_INT; - for (int i = 0; i < numIntsPerField; i++) { - setValue(docID, i, asInt(scratch)); - scratch.offset += NUM_BYTES_PER_INT; - } - } - } - - public void updateDocValues(BytesRef ref, int docID) { - for (int i = 0; i < numIntsPerField; i++) { - setValue(docID, i, CSFTypeUtil.convertFromBytes(ref.bytes, ref.offset, i)); - } - } - - private static int asInt(BytesRef b) { - return asInt(b, b.offset); - } - - private static int asInt(BytesRef b, int pos) { - int p = pos; - return (b.bytes[p++] << 24) | (b.bytes[p++] << 16) | (b.bytes[p++] << 8) | (b.bytes[p] & 0xFF); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.docx new file mode 100644 index 000000000..8acc1e3cf Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.java deleted file mode 100644 index 8dd783504..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideByteIndex.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap; - -public class ColumnStrideByteIndex extends ColumnStrideFieldIndex implements Flushable { - private final Int2ByteOpenHashMap values; - private final int maxSize; - - public ColumnStrideByteIndex(String name, int maxSize) { - super(name); - values = new Int2ByteOpenHashMap(maxSize); // default unset value is 0 - this.maxSize = maxSize; - } - - private ColumnStrideByteIndex(String name, Int2ByteOpenHashMap values, int maxSize) { - super(name); - this.values = values; - this.maxSize = maxSize; - } - - @Override - public void setValue(int docID, long value) { - values.put(docID, (byte) value); - } - - @Override - public long get(int docID) { - return values.get(docID); - } - - @Override - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedColumnStrideByteIndex(this, originalTweetIdMapper, optimizedTweetIdMapper); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - private static final String MAX_SIZE_PROP = "maxSize"; - - public FlushHandler() { - super(); - } - - public FlushHandler(ColumnStrideByteIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void 
doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - ColumnStrideByteIndex index = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, index.getName()); - flushInfo.addIntProperty(MAX_SIZE_PROP, index.maxSize); - - out.writeInt(index.values.size()); - for (Int2ByteOpenHashMap.Entry entry : index.values.int2ByteEntrySet()) { - out.writeInt(entry.getIntKey()); - out.writeByte(entry.getByteValue()); - } - } - - @Override - protected ColumnStrideByteIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int size = in.readInt(); - int maxSize = flushInfo.getIntProperty(MAX_SIZE_PROP); - Int2ByteOpenHashMap map = new Int2ByteOpenHashMap(maxSize); - for (int i = 0; i < size; i++) { - map.put(in.readInt(), in.readByte()); - } - return new ColumnStrideByteIndex(flushInfo.getStringProperty(NAME_PROP_NAME), map, maxSize); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.docx new file mode 100644 index 000000000..c3bf06e27 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.java deleted file mode 100644 index 9b6a1d6d6..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldDocValues.java +++ /dev/null @@ -1,76 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.search.core.earlybird.index.util.AllDocsIterator; - -/** - * A NumericDocValues implementation that uses an 
AllDocsIterator to iterate through all docs, and - * gets its values from a ColumnStrideFieldIndex instance. - */ -public class ColumnStrideFieldDocValues extends NumericDocValues { - private final ColumnStrideFieldIndex csf; - private final AllDocsIterator iterator; - - public ColumnStrideFieldDocValues(ColumnStrideFieldIndex csf, LeafReader reader) - throws IOException { - this.csf = Preconditions.checkNotNull(csf); - this.iterator = new AllDocsIterator(Preconditions.checkNotNull(reader)); - } - - @Override - public long longValue() { - return csf.get(docID()); - } - - @Override - public int docID() { - return iterator.docID(); - } - - @Override - public int nextDoc() throws IOException { - return iterator.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return iterator.advance(target); - } - - @Override - public boolean advanceExact(int target) throws IOException { - // The javadocs for advance() and advanceExact() are inconsistent. advance() allows the target - // to be smaller than the current doc ID, and requires the iterator to advance the current doc - // ID past the target, and past the current doc ID. So essentially, advance(target) returns - // max(target, currentDocId + 1). At the same time, advanceExact() is undefined if the target is - // smaller than the current do ID (or if it's an invalid doc ID), and always returns the target. - // So essentially, advanceExact(target) should always set the current doc ID to the given target - // and if target == currentDocId, then currentDocId should not be advanced. This is why we have - // these extra checks here instead of moving them to advance(). 
- Preconditions.checkState( - target >= docID(), - "ColumnStrideFieldDocValues.advance() for field %s called with target %s, " - + "but the current doc ID is %s.", - csf.getName(), - target, - docID()); - if (target == docID()) { - return true; - } - - // We don't need to check if we have a value for 'target', because a ColumnStrideFieldIndex - // instance has a value for every doc ID (though that value might be 0). - return advance(target) == target; - } - - @Override - public long cost() { - return iterator.cost(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.docx new file mode 100644 index 000000000..62be6be0e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.java deleted file mode 100644 index 0f5261f0a..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideFieldIndex.java +++ /dev/null @@ -1,64 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -/** - * Get an underlying data for a field by calling - * EarlybirdIndexSegmentAtomicReader#getNumericDocValues(String). - */ -public abstract class ColumnStrideFieldIndex { - private final String name; - - public ColumnStrideFieldIndex(String name) { - this.name = name; - } - - public String getName() { - return name; - } - - /** - * Returns the CSF value for the given doc ID. - */ - public abstract long get(int docID); - - /** - * Updates the CSF value for the given doc ID to the given value. 
- */ - public void setValue(int docID, long value) { - throw new UnsupportedOperationException(); - } - - /** - * Loads the CSF from an AtomicReader. - */ - public void load(LeafReader atomicReader, String field) throws IOException { - NumericDocValues docValues = atomicReader.getNumericDocValues(field); - if (docValues != null) { - for (int i = 0; i < atomicReader.maxDoc(); i++) { - if (docValues.advanceExact(i)) { - setValue(i, docValues.longValue()); - } - } - } - } - - /** - * Optimizes the representation of this column stride field, and remaps its doc IDs, if necessary. - * - * @param originalTweetIdMapper The original tweet ID mapper. - * @param optimizedTweetIdMapper The optimized tweet ID mapper. - * @return An optimized column stride field equivalent to this CSF, - * with possibly remapped doc IDs. - */ - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return this; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.docx new file mode 100644 index 000000000..ddd5dc479 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.java deleted file mode 100644 index 7bb0d1b02..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntIndex.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import 
com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -public class ColumnStrideIntIndex extends ColumnStrideFieldIndex implements Flushable { - private final Int2IntOpenHashMap values; - private final int maxSize; - - public ColumnStrideIntIndex(String name, int maxSize) { - super(name); - values = new Int2IntOpenHashMap(maxSize); // default unset value is 0 - this.maxSize = maxSize; - } - - public ColumnStrideIntIndex(String name, Int2IntOpenHashMap values, int maxSize) { - super(name); - this.values = values; - this.maxSize = maxSize; - } - - @Override - public void setValue(int docID, long value) { - values.put(docID, (int) value); - } - - @Override - public long get(int docID) { - return values.get(docID); - } - - @Override - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedColumnStrideIntIndex(this, originalTweetIdMapper, optimizedTweetIdMapper); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - private static final String MAX_SIZE_PROP = "maxSize"; - - public FlushHandler() { - super(); - } - - public FlushHandler(ColumnStrideIntIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - ColumnStrideIntIndex index = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, index.getName()); - flushInfo.addIntProperty(MAX_SIZE_PROP, index.maxSize); - - out.writeInt(index.values.size()); - for (Int2IntOpenHashMap.Entry entry : index.values.int2IntEntrySet()) { - out.writeInt(entry.getIntKey()); - out.writeInt(entry.getIntValue()); 
- } - } - - @Override - protected ColumnStrideIntIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int size = in.readInt(); - int maxSize = flushInfo.getIntProperty(MAX_SIZE_PROP); - Int2IntOpenHashMap map = new Int2IntOpenHashMap(maxSize); - for (int i = 0; i < size; i++) { - map.put(in.readInt(), in.readInt()); - } - return new ColumnStrideIntIndex(flushInfo.getStringProperty(NAME_PROP_NAME), map, maxSize); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.docx new file mode 100644 index 000000000..e0514a2a1 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.java deleted file mode 100644 index 9084bc0ed..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideIntViewIndex.java +++ /dev/null @@ -1,71 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import com.twitter.search.common.encoding.features.IntegerEncodedFeatures; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -/** - * An Int CSF view on top of an {@link AbstractColumnStrideMultiIntIndex}. - * - * Used for decoding encoded packed features and exposing them as - * {@link org.apache.lucene.index.NumericDocValues}. 
- */ -public class ColumnStrideIntViewIndex extends ColumnStrideFieldIndex { - private static class IntViewIntegerEncodedFeatures extends IntegerEncodedFeatures { - private final AbstractColumnStrideMultiIntIndex baseIndex; - private final int docID; - - public IntViewIntegerEncodedFeatures(AbstractColumnStrideMultiIntIndex baseIndex, int docID) { - this.baseIndex = baseIndex; - this.docID = docID; - } - - @Override - public int getInt(int pos) { - return baseIndex.get(docID, pos); - } - - @Override - public void setInt(int pos, int value) { - baseIndex.setValue(docID, pos, value); - } - - @Override - public int getNumInts() { - return baseIndex.getNumIntsPerField(); - } - } - - private final AbstractColumnStrideMultiIntIndex baseIndex; - private final FeatureConfiguration featureConfiguration; - - /** - * Creates a new ColumnStrideIntViewIndex on top of an existing AbstractColumnStrideMultiIntIndex. - */ - public ColumnStrideIntViewIndex(Schema.FieldInfo info, - AbstractColumnStrideMultiIntIndex baseIndex) { - super(info.getName()); - this.baseIndex = baseIndex; - this.featureConfiguration = info.getFieldType().getCsfViewFeatureConfiguration(); - } - - @Override - public long get(int docID) { - IntegerEncodedFeatures encodedFeatures = new IntViewIntegerEncodedFeatures(baseIndex, docID); - return encodedFeatures.getFeatureValue(featureConfiguration); - } - - @Override - public void setValue(int docID, long value) { - IntegerEncodedFeatures encodedFeatures = new IntViewIntegerEncodedFeatures(baseIndex, docID); - encodedFeatures.setFeatureValue(featureConfiguration, (int) value); - } - - @Override - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, DocIDToTweetIDMapper optimizedTweetIdMapper) { - throw new UnsupportedOperationException( - "ColumnStrideIntViewIndex instances do not support optimization"); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.docx 
b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.docx new file mode 100644 index 000000000..7fc85c08f Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.java deleted file mode 100644 index 37321b8b8..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideLongIndex.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.ints.Int2LongOpenHashMap; - -public class ColumnStrideLongIndex extends ColumnStrideFieldIndex implements Flushable { - private final Int2LongOpenHashMap values; - private final int maxSize; - - public ColumnStrideLongIndex(String name, int maxSize) { - super(name); - values = new Int2LongOpenHashMap(maxSize); // default unset value is 0 - this.maxSize = maxSize; - } - - private ColumnStrideLongIndex(String name, Int2LongOpenHashMap values, int maxSize) { - super(name); - this.values = values; - this.maxSize = maxSize; - } - - @Override - public void setValue(int docID, long value) { - values.put(docID, value); - } - - @Override - public long get(int docID) { - return values.get(docID); - } - - @Override - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedColumnStrideLongIndex(this, 
originalTweetIdMapper, optimizedTweetIdMapper); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - private static final String MAX_SIZE_PROP = "maxSize"; - - public FlushHandler() { - super(); - } - - public FlushHandler(ColumnStrideLongIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - ColumnStrideLongIndex index = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, index.getName()); - flushInfo.addIntProperty(MAX_SIZE_PROP, index.maxSize); - - out.writeInt(index.values.size()); - for (Int2LongOpenHashMap.Entry entry : index.values.int2LongEntrySet()) { - out.writeInt(entry.getIntKey()); - out.writeLong(entry.getLongValue()); - } - } - - @Override - protected ColumnStrideLongIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int size = in.readInt(); - int maxSize = flushInfo.getIntProperty(MAX_SIZE_PROP); - Int2LongOpenHashMap map = new Int2LongOpenHashMap(maxSize); - for (int i = 0; i < size; i++) { - map.put(in.readInt(), in.readLong()); - } - return new ColumnStrideLongIndex(flushInfo.getStringProperty(NAME_PROP_NAME), map, maxSize); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.docx new file mode 100644 index 000000000..195caa1d9 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.java deleted file mode 100644 index ccbf99a29..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/column/ColumnStrideMultiIntIndex.java +++ /dev/null @@ -1,102 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -public class ColumnStrideMultiIntIndex extends AbstractColumnStrideMultiIntIndex { - private final Int2IntOpenHashMap[] values; - private final int maxSize; - - public ColumnStrideMultiIntIndex(String name, int maxSize, int numIntsPerField) { - super(name, numIntsPerField); - values = new Int2IntOpenHashMap[numIntsPerField]; - for (int i = 0; i < numIntsPerField; i++) { - values[i] = new Int2IntOpenHashMap(maxSize); // default unset value is 0 - } - this.maxSize = maxSize; - } - - public ColumnStrideMultiIntIndex(String name, Int2IntOpenHashMap[] values, int maxSize) { - super(name, values.length); - this.values = values; - this.maxSize = maxSize; - } - - @Override - public void setValue(int docID, int valueIndex, int value) { - values[valueIndex].put(docID, value); - } - - @Override - public int get(int docID, int valueIndex) { - return values[valueIndex].get(docID); - } - - @Override - public ColumnStrideFieldIndex optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedColumnStrideMultiIntIndex( - this, originalTweetIdMapper, optimizedTweetIdMapper); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - 
private static final String MAX_SIZE_PROP = "maxSize"; - - public FlushHandler() { - super(); - } - - public FlushHandler(ColumnStrideMultiIntIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - ColumnStrideMultiIntIndex index = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, index.getName()); - flushInfo.addIntProperty(MAX_SIZE_PROP, index.maxSize); - - out.writeInt(index.values.length); - for (int i = 0; i < index.values.length; i++) { - Int2IntOpenHashMap map = index.values[i]; - out.writeInt(map.size()); - for (Int2IntOpenHashMap.Entry entry : map.int2IntEntrySet()) { - out.writeInt(entry.getIntKey()); - out.writeInt(entry.getIntValue()); - } - } - } - - @Override - protected ColumnStrideMultiIntIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int numIntsPerField = in.readInt(); - int maxSize = flushInfo.getIntProperty(MAX_SIZE_PROP); - Int2IntOpenHashMap[] values = new Int2IntOpenHashMap[numIntsPerField]; - for (int i = 0; i < numIntsPerField; i++) { - int size = in.readInt(); - Int2IntOpenHashMap map = new Int2IntOpenHashMap(maxSize); - for (int j = 0; j < size; j++) { - map.put(in.readInt(), in.readInt()); - } - values[i] = map; - } - return new ColumnStrideMultiIntIndex( - flushInfo.getStringProperty(NAME_PROP_NAME), values, maxSize); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.docx new file mode 100644 index 000000000..d1d3235aa Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.java 
deleted file mode 100644 index 3da67aec3..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/ConstantColumnStrideFieldIndex.java +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -/** - * A ColumnStrideFieldIndex implementation that always returns the same value. - */ -public class ConstantColumnStrideFieldIndex extends ColumnStrideFieldIndex { - private final long defaultValue; - - public ConstantColumnStrideFieldIndex(String name, long defaultValue) { - super(name); - this.defaultValue = defaultValue; - } - - @Override - public long get(int docID) { - return defaultValue; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.docx b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.docx new file mode 100644 index 000000000..a9a81c986 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.java b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.java deleted file mode 100644 index 2e1e61b4b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesManager.java +++ /dev/null @@ -1,248 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; -import java.util.Iterator; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import 
com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public abstract class DocValuesManager implements Flushable { - protected final Schema schema; - protected final int segmentSize; - protected final ConcurrentHashMap columnStrideFields; - - public DocValuesManager(Schema schema, int segmentSize) { - this(schema, segmentSize, new ConcurrentHashMap<>()); - } - - protected DocValuesManager(Schema schema, - int segmentSize, - ConcurrentHashMap columnStrideFields) { - this.schema = Preconditions.checkNotNull(schema); - this.segmentSize = segmentSize; - this.columnStrideFields = columnStrideFields; - } - - protected abstract ColumnStrideFieldIndex newByteCSF(String field); - protected abstract ColumnStrideFieldIndex newIntCSF(String field); - protected abstract ColumnStrideFieldIndex newLongCSF(String field); - protected abstract ColumnStrideFieldIndex newMultiIntCSF(String field, int numIntsPerField); - - /** - * Optimize this doc values manager, and return a doc values manager a more compact and fast - * encoding for doc values (but that we can't add new doc IDs to). - */ - public abstract DocValuesManager optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException; - - public Set getDocValueNames() { - return columnStrideFields.keySet(); - } - - /** - * Creates a new {@link ColumnStrideFieldIndex} for the given field and returns it. - */ - public ColumnStrideFieldIndex addColumnStrideField(String field, EarlybirdFieldType fieldType) { - // For CSF view fields, we will perform the same check on the base field when we try to create - // a ColumnStrideFieldIndex for them in newIntViewCSF(). 
- if (!fieldType.isCsfViewField()) { - Preconditions.checkState( - fieldType.isCsfLoadIntoRam(), "Field %s is not loaded in RAM", field); - } - - if (columnStrideFields.containsKey(field)) { - return columnStrideFields.get(field); - } - - final ColumnStrideFieldIndex index; - switch (fieldType.getCsfType()) { - case BYTE: - index = newByteCSF(field); - break; - case INT: - if (fieldType.getCsfFixedLengthNumValuesPerDoc() > 1) { - index = newMultiIntCSF(field, fieldType.getCsfFixedLengthNumValuesPerDoc()); - } else if (fieldType.isCsfViewField()) { - index = newIntViewCSF(field); - } else { - index = newIntCSF(field); - } - break; - case LONG: - index = newLongCSF(field); - break; - default: - throw new RuntimeException("Invalid CsfType."); - } - - columnStrideFields.put(field, index); - return index; - } - - protected ColumnStrideFieldIndex newIntViewCSF(String field) { - Schema.FieldInfo info = Preconditions.checkNotNull(schema.getFieldInfo(field)); - Schema.FieldInfo baseFieldInfo = Preconditions.checkNotNull( - schema.getFieldInfo(info.getFieldType().getCsfViewBaseFieldId())); - - Preconditions.checkState( - baseFieldInfo.getFieldType().isCsfLoadIntoRam(), - "Field %s has a base field (%s) that is not loaded in RAM", - field, baseFieldInfo.getName()); - - // We might not have a CSF for the base field yet. - ColumnStrideFieldIndex baseFieldIndex = - addColumnStrideField(baseFieldInfo.getName(), baseFieldInfo.getFieldType()); - Preconditions.checkNotNull(baseFieldIndex); - Preconditions.checkState(baseFieldIndex instanceof AbstractColumnStrideMultiIntIndex); - return new ColumnStrideIntViewIndex(info, (AbstractColumnStrideMultiIntIndex) baseFieldIndex); - } - - /** - * Returns the ColumnStrideFieldIndex instance for the given field. 
- */ - public ColumnStrideFieldIndex getColumnStrideFieldIndex(String field) { - ColumnStrideFieldIndex docValues = columnStrideFields.get(field); - if (docValues == null) { - Schema.FieldInfo info = schema.getFieldInfo(field); - if (info != null && info.getFieldType().isCsfDefaultValueSet()) { - return new ConstantColumnStrideFieldIndex(field, info.getFieldType().getCsfDefaultValue()); - } - } - - return docValues; - } - - private static final String CSF_INDEX_CLASS_NAME_PROP_NAME = "csfIndexClassName"; - private static final String CSF_PROP_NAME = "column_stride_fields"; - protected static final String MAX_SEGMENT_SIZE_PROP_NAME = "maxSegmentSize"; - - private static Map> getIntViewFields(Schema schema) { - Map> intViewFields = Maps.newHashMap(); - for (Schema.FieldInfo fieldInfo : schema.getFieldInfos()) { - if (fieldInfo.getFieldType().isCsfViewField()) { - Schema.FieldInfo baseFieldInfo = Preconditions.checkNotNull( - schema.getFieldInfo(fieldInfo.getFieldType().getCsfViewBaseFieldId())); - String baseFieldName = baseFieldInfo.getName(); - Set intViewFieldsForBaseField = - intViewFields.computeIfAbsent(baseFieldName, k -> Sets.newHashSet()); - intViewFieldsForBaseField.add(fieldInfo); - } - } - return intViewFields; - } - - public abstract static class FlushHandler extends Handler { - private final Schema schema; - - public FlushHandler(Schema schema) { - this.schema = schema; - } - - public FlushHandler(DocValuesManager docValuesManager) { - super(docValuesManager); - this.schema = docValuesManager.schema; - } - - @Override - public void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - long startTime = getClock().nowMillis(); - - DocValuesManager docValuesManager = getObjectToFlush(); - flushInfo.addIntProperty(MAX_SEGMENT_SIZE_PROP_NAME, docValuesManager.segmentSize); - long sizeBeforeFlush = out.length(); - FlushInfo csfProps = flushInfo.newSubProperties(CSF_PROP_NAME); - for (ColumnStrideFieldIndex csf : 
docValuesManager.columnStrideFields.values()) { - if (!(csf instanceof ColumnStrideIntViewIndex)) { - Preconditions.checkState( - csf instanceof Flushable, - "Cannot flush column stride field {} of type {}", - csf.getName(), csf.getClass().getCanonicalName()); - FlushInfo info = csfProps.newSubProperties(csf.getName()); - info.addStringProperty(CSF_INDEX_CLASS_NAME_PROP_NAME, csf.getClass().getCanonicalName()); - ((Flushable) csf).getFlushHandler().flush(info, out); - } - } - csfProps.setSizeInBytes(out.length() - sizeBeforeFlush); - getFlushTimerStats().timerIncrement(getClock().nowMillis() - startTime); - } - - @Override - public DocValuesManager doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - long startTime = getClock().nowMillis(); - Map> intViewFields = getIntViewFields(schema); - - FlushInfo csfProps = flushInfo.getSubProperties(CSF_PROP_NAME); - ConcurrentHashMap columnStrideFields = - new ConcurrentHashMap<>(); - - Iterator csfPropIter = csfProps.getKeyIterator(); - while (csfPropIter.hasNext()) { - String fieldName = csfPropIter.next(); - try { - FlushInfo info = csfProps.getSubProperties(fieldName); - String className = info.getStringProperty(CSF_INDEX_CLASS_NAME_PROP_NAME); - Class fieldIndexType = - (Class) Class.forName(className); - Preconditions.checkNotNull( - fieldIndexType, - "Invalid field configuration: field " + fieldName + " not found in config."); - - for (Class c : fieldIndexType.getDeclaredClasses()) { - if (Handler.class.isAssignableFrom(c)) { - @SuppressWarnings("rawtypes") - Handler handler = (Handler) c.newInstance(); - ColumnStrideFieldIndex index = (ColumnStrideFieldIndex) handler.load( - csfProps.getSubProperties(fieldName), in); - columnStrideFields.put(fieldName, index); - - // If this is a base field, create ColumnStrideIntViewIndex instances for all the - // view fields based on it. 
- if (index instanceof AbstractColumnStrideMultiIntIndex) { - AbstractColumnStrideMultiIntIndex multiIntIndex = - (AbstractColumnStrideMultiIntIndex) index; - - // We should have AbstractColumnStrideMultiIntIndex instances only for base fields - // and all our base fields have views defined on top of them. - for (Schema.FieldInfo intViewFieldInfo : intViewFields.get(fieldName)) { - columnStrideFields.put( - intViewFieldInfo.getName(), - new ColumnStrideIntViewIndex(intViewFieldInfo, multiIntIndex)); - } - } - - break; - } - } - } catch (ClassNotFoundException | IllegalAccessException | InstantiationException e) { - throw new IOException( - "Invalid field configuration for column stride field: " + fieldName, e); - } - } - getLoadTimerStats().timerIncrement(getClock().nowMillis() - startTime); - - return createDocValuesManager( - schema, - flushInfo.getIntProperty(MAX_SEGMENT_SIZE_PROP_NAME), - columnStrideFields); - } - - protected abstract DocValuesManager createDocValuesManager( - Schema docValuesSchema, - int maxSegmentSize, - ConcurrentHashMap columnStrideFields); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.docx b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.docx new file mode 100644 index 000000000..057894290 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.java b/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.java deleted file mode 100644 index 1f49cc3c7..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/DocValuesUpdate.java +++ /dev/null @@ -1,8 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -public interface DocValuesUpdate { - /** - * Performs an doc values update on the given document. 
- */ - void update(ColumnStrideFieldIndex docValues, int docID); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.docx new file mode 100644 index 000000000..fd4e43a7c Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.java deleted file mode 100644 index 93a6faea3..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideByteIndex.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class OptimizedColumnStrideByteIndex extends ColumnStrideFieldIndex implements Flushable { - private final byte[] values; - - public OptimizedColumnStrideByteIndex(String name, int maxSize) { - super(name); - values = new byte[maxSize]; - } - - public OptimizedColumnStrideByteIndex( - ColumnStrideByteIndex columnStrideByteIndex, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(columnStrideByteIndex.getName()); - int maxDocId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - values = new byte[maxDocId + 1]; - - int docId = optimizedTweetIdMapper.getNextDocID(Integer.MIN_VALUE); - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - int 
originalDocId = originalTweetIdMapper.getDocID(optimizedTweetIdMapper.getTweetID(docId)); - setValue(docId, columnStrideByteIndex.get(originalDocId)); - docId = optimizedTweetIdMapper.getNextDocID(docId); - } - } - - private OptimizedColumnStrideByteIndex(String name, byte[] values) { - super(name); - this.values = values; - } - - @Override - public void setValue(int docID, long value) { - this.values[docID] = (byte) value; - } - - @Override - public long get(int docID) { - return values[docID]; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedColumnStrideByteIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedColumnStrideByteIndex columnStrideByteIndex = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, columnStrideByteIndex.getName()); - out.writeByteArray(columnStrideByteIndex.values); - } - - @Override - protected OptimizedColumnStrideByteIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - byte[] values = in.readByteArray(); - return new OptimizedColumnStrideByteIndex( - flushInfo.getStringProperty(NAME_PROP_NAME), values); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.docx new file mode 100644 index 000000000..881690037 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.java 
b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.java deleted file mode 100644 index 725b54746..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideIntIndex.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class OptimizedColumnStrideIntIndex extends ColumnStrideFieldIndex implements Flushable { - private final int[] values; - - public OptimizedColumnStrideIntIndex(String name, int maxSize) { - super(name); - values = new int[maxSize]; - } - - public OptimizedColumnStrideIntIndex( - ColumnStrideIntIndex columnStrideIntIndex, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(columnStrideIntIndex.getName()); - int maxDocId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - values = new int[maxDocId + 1]; - - int docId = optimizedTweetIdMapper.getNextDocID(Integer.MIN_VALUE); - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - int originalDocId = originalTweetIdMapper.getDocID(optimizedTweetIdMapper.getTweetID(docId)); - setValue(docId, columnStrideIntIndex.get(originalDocId)); - docId = optimizedTweetIdMapper.getNextDocID(docId); - } - } - - private OptimizedColumnStrideIntIndex(String name, int[] values) { - super(name); - this.values = values; - } - - @Override - public void setValue(int docID, long value) { - this.values[docID] = (int) value; - } - - @Override - public long get(int docID) { - return values[docID]; - } - - @Override - public FlushHandler getFlushHandler() { - return new 
FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedColumnStrideIntIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedColumnStrideIntIndex columnStrideIntIndex = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, columnStrideIntIndex.getName()); - out.writeIntArray(columnStrideIntIndex.values); - } - - @Override - protected OptimizedColumnStrideIntIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int[] values = in.readIntArray(); - return new OptimizedColumnStrideIntIndex( - flushInfo.getStringProperty(NAME_PROP_NAME), values); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.docx new file mode 100644 index 000000000..611a54690 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.java deleted file mode 100644 index df74a7e4e..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideLongIndex.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import 
com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class OptimizedColumnStrideLongIndex extends ColumnStrideFieldIndex implements Flushable { - private final long[] values; - - public OptimizedColumnStrideLongIndex(String name, int maxSize) { - super(name); - values = new long[maxSize]; - } - - public OptimizedColumnStrideLongIndex( - ColumnStrideLongIndex columnStrideLongIndex, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(columnStrideLongIndex.getName()); - int maxDocId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - values = new long[maxDocId + 1]; - - int docId = optimizedTweetIdMapper.getNextDocID(Integer.MIN_VALUE); - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - int originalDocId = originalTweetIdMapper.getDocID(optimizedTweetIdMapper.getTweetID(docId)); - setValue(docId, columnStrideLongIndex.get(originalDocId)); - docId = optimizedTweetIdMapper.getNextDocID(docId); - } - } - - private OptimizedColumnStrideLongIndex(String name, long[] values) { - super(name); - this.values = values; - } - - @Override - public void setValue(int docID, long value) { - this.values[docID] = value; - } - - @Override - public long get(int docID) { - return values[docID]; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String NAME_PROP_NAME = "fieldName"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedColumnStrideLongIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedColumnStrideLongIndex columnStrideLongIndex = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, columnStrideLongIndex.getName()); - out.writeLongArray(columnStrideLongIndex.values); - } - - @Override - 
protected OptimizedColumnStrideLongIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - long[] values = in.readLongArray(); - return new OptimizedColumnStrideLongIndex( - flushInfo.getStringProperty(NAME_PROP_NAME), values); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.docx b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.docx new file mode 100644 index 000000000..7de1cc568 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.java b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.java deleted file mode 100644 index 82f233ad8..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedColumnStrideMultiIntIndex.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class OptimizedColumnStrideMultiIntIndex - extends AbstractColumnStrideMultiIntIndex implements Flushable { - private final int[] values; - - public OptimizedColumnStrideMultiIntIndex(String name, int maxSize, int numIntsPerField) { - super(name, numIntsPerField); - values = new int[Math.multiplyExact(maxSize, numIntsPerField)]; - } - - public OptimizedColumnStrideMultiIntIndex( - ColumnStrideMultiIntIndex columnStrideMultiIntIndex, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper 
optimizedTweetIdMapper) throws IOException { - super(columnStrideMultiIntIndex.getName(), columnStrideMultiIntIndex.getNumIntsPerField()); - int maxDocId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - values = new int[columnStrideMultiIntIndex.getNumIntsPerField() * (maxDocId + 1)]; - - int docId = optimizedTweetIdMapper.getNextDocID(Integer.MIN_VALUE); - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - int originalDocId = originalTweetIdMapper.getDocID(optimizedTweetIdMapper.getTweetID(docId)); - for (int i = 0; i < columnStrideMultiIntIndex.getNumIntsPerField(); ++i) { - setValue(docId, i, columnStrideMultiIntIndex.get(originalDocId, i)); - } - docId = optimizedTweetIdMapper.getNextDocID(docId); - } - } - - private OptimizedColumnStrideMultiIntIndex(String name, int numIntsPerField, int[] values) { - super(name, numIntsPerField); - this.values = values; - } - - @Override - public void setValue(int docID, int valueIndex, int value) { - values[docID * getNumIntsPerField() + valueIndex] = value; - } - - @Override - public int get(int docID, int valueIndex) { - return values[docID * getNumIntsPerField() + valueIndex]; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler - extends Flushable.Handler { - private static final String INTS_PER_FIELD_PROP_NAME = "intsPerField"; - private static final String NAME_PROP_NAME = "fieldName"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedColumnStrideMultiIntIndex objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedColumnStrideMultiIntIndex columnStrideMultiIntIndex = getObjectToFlush(); - flushInfo.addStringProperty(NAME_PROP_NAME, columnStrideMultiIntIndex.getName()); - flushInfo.addIntProperty(INTS_PER_FIELD_PROP_NAME, - columnStrideMultiIntIndex.getNumIntsPerField()); - 
out.writeIntArray(columnStrideMultiIntIndex.values); - } - - @Override - protected OptimizedColumnStrideMultiIntIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int[] values = in.readIntArray(); - return new OptimizedColumnStrideMultiIntIndex( - flushInfo.getStringProperty(NAME_PROP_NAME), - flushInfo.getIntProperty(INTS_PER_FIELD_PROP_NAME), - values); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.docx b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.docx new file mode 100644 index 000000000..6d7472211 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.java b/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.java deleted file mode 100644 index 2053d7ce5..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/OptimizedDocValuesManager.java +++ /dev/null @@ -1,97 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; -import java.util.Set; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.collect.Sets; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class OptimizedDocValuesManager extends DocValuesManager { - public OptimizedDocValuesManager(Schema schema, int segmentSize) { - super(schema, segmentSize); - } - - public OptimizedDocValuesManager(DocValuesManager docValuesManager, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(docValuesManager.schema, docValuesManager.segmentSize); - Set intViewIndexes = Sets.newHashSet(); - for (String fieldName : docValuesManager.columnStrideFields.keySet()) { - 
ColumnStrideFieldIndex originalColumnStrideField = - docValuesManager.columnStrideFields.get(fieldName); - if (originalColumnStrideField instanceof ColumnStrideIntViewIndex) { - intViewIndexes.add((ColumnStrideIntViewIndex) originalColumnStrideField); - } else { - ColumnStrideFieldIndex optimizedColumnStrideField = - originalColumnStrideField.optimize(originalTweetIdMapper, optimizedTweetIdMapper); - columnStrideFields.put(fieldName, optimizedColumnStrideField); - } - } - - // We have to process the ColumnStrideIntViewIndex instances after we process all other CSFs, - // because we need to make sure we've optimized the CSFs for the base fields. - for (ColumnStrideIntViewIndex intViewIndex : intViewIndexes) { - String fieldName = intViewIndex.getName(); - columnStrideFields.put(fieldName, newIntViewCSF(fieldName)); - } - } - - private OptimizedDocValuesManager( - Schema schema, - int segmentSize, - ConcurrentHashMap columnStrideFields) { - super(schema, segmentSize, columnStrideFields); - } - - @Override - protected ColumnStrideFieldIndex newByteCSF(String field) { - return new OptimizedColumnStrideByteIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newIntCSF(String field) { - return new OptimizedColumnStrideIntIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newLongCSF(String field) { - return new OptimizedColumnStrideLongIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newMultiIntCSF(String field, int numIntsPerField) { - return new OptimizedColumnStrideMultiIntIndex(field, segmentSize, numIntsPerField); - } - - @Override - public DocValuesManager optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return this; - } - - @Override - public FlushHandler getFlushHandler() { - return new OptimizedFlushHandler(this); - } - - public static class OptimizedFlushHandler extends FlushHandler { - public 
OptimizedFlushHandler(Schema schema) { - super(schema); - } - - private OptimizedFlushHandler(DocValuesManager docValuesManager) { - super(docValuesManager); - } - - @Override - protected DocValuesManager createDocValuesManager( - Schema schema, - int maxSegmentSize, - ConcurrentHashMap columnStrideFields) { - return new OptimizedDocValuesManager(schema, maxSegmentSize, columnStrideFields); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.docx b/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.docx new file mode 100644 index 000000000..bc150983d Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.java b/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.java deleted file mode 100644 index 840fe7cfe..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/column/UnoptimizedDocValuesManager.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.core.earlybird.index.column; - -import java.io.IOException; -import java.util.concurrent.ConcurrentHashMap; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class UnoptimizedDocValuesManager extends DocValuesManager { - public UnoptimizedDocValuesManager(Schema schema, int segmentSize) { - super(schema, segmentSize); - } - - private UnoptimizedDocValuesManager( - Schema schema, - int segmentSize, - ConcurrentHashMap columnStrideFields) { - super(schema, segmentSize, columnStrideFields); - } - - @Override - protected ColumnStrideFieldIndex newByteCSF(String field) { - return new ColumnStrideByteIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newIntCSF(String field) { - return new 
ColumnStrideIntIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newLongCSF(String field) { - return new ColumnStrideLongIndex(field, segmentSize); - } - - @Override - protected ColumnStrideFieldIndex newMultiIntCSF(String field, int numIntsPerField) { - return new ColumnStrideMultiIntIndex(field, segmentSize, numIntsPerField); - } - - @Override - public DocValuesManager optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedDocValuesManager(this, originalTweetIdMapper, optimizedTweetIdMapper); - } - - @Override - public FlushHandler getFlushHandler() { - return new UnoptimizedFlushHandler(this); - } - - public static class UnoptimizedFlushHandler extends FlushHandler { - public UnoptimizedFlushHandler(Schema schema) { - super(schema); - } - - private UnoptimizedFlushHandler(DocValuesManager docValuesManager) { - super(docValuesManager); - } - - @Override - protected DocValuesManager createDocValuesManager( - Schema schema, - int maxSegmentSize, - ConcurrentHashMap columnStrideFields) { - return new UnoptimizedDocValuesManager(schema, maxSegmentSize, columnStrideFields); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.docx b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.docx new file mode 100644 index 000000000..9d075d043 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.java b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.java deleted file mode 100644 index b63defe36..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsData.java +++ /dev/null @@ -1,15 +0,0 @@ -package 
com.twitter.search.core.earlybird.index.extensions; - -import java.io.IOException; - -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * Base index extensions class. - */ -public interface EarlybirdIndexExtensionsData { - /** - * Sets up the extensions for the given reader. - */ - void setupExtensions(EarlybirdIndexSegmentAtomicReader atomicReader) throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.docx b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.docx new file mode 100644 index 000000000..2b746847a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.java b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.java deleted file mode 100644 index 6b9d30687..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdIndexExtensionsFactory.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.core.earlybird.index.extensions; - -/** - * Base class to implement factories that create realtime and Lucene index extensions. - * - * The factory needs to be able to create instances for new segments, as well as load - * index extensions of existing segments from disk. - */ -public abstract class EarlybirdIndexExtensionsFactory { - /** - * Returns the {@link EarlybirdRealtimeIndexExtensionsData} instance to be used for a new segment. - */ - public abstract EarlybirdRealtimeIndexExtensionsData newRealtimeIndexExtensionsData(); - - /** - * Returns the {@link EarlybirdIndexExtensionsData} instance to be used for a new Lucene segment. 
- */ - public abstract EarlybirdIndexExtensionsData newLuceneIndexExtensionsData(); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.docx b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.docx new file mode 100644 index 000000000..313c63d6e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.java b/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.java deleted file mode 100644 index 284475566..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/extensions/EarlybirdRealtimeIndexExtensionsData.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.twitter.search.core.earlybird.index.extensions; - -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter; - -/** - * An index extensions implementation for real-time Earlybird indexes. - */ -public interface EarlybirdRealtimeIndexExtensionsData extends EarlybirdIndexExtensionsData { - /** - * Optionally, an implementing class can provide a custom consumer for inverted fields (i.e. streams of tokens). - */ - void createInvertedDocConsumer( - EarlybirdRealtimeIndexSegmentWriter.InvertedDocConsumerBuilder builder); - - /** - * Optionally, an implementing class can provide a custom consumer for stored fields (e.g. doc values fields). 
- */ - void createStoredFieldsConsumer( - EarlybirdRealtimeIndexSegmentWriter.StoredFieldsConsumerBuilder builder); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.docx new file mode 100644 index 000000000..f99ad9ac1 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.java b/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.java deleted file mode 100644 index 332ce3872..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/BaseByteBlockPool.java +++ /dev/null @@ -1,373 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Arrays; - -import org.apache.lucene.store.DataInput; -import org.apache.lucene.store.DataOutput; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.ByteBlockPool; -import org.apache.lucene.util.BytesRef; - -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF; - -/** - * Base class for BlockPools backed by byte[] arrays. - */ -public abstract class BaseByteBlockPool { - /** - * The extra object with final array is necessary to guarantee visibility to - * other threads without synchronization/using volatile. - * - * From 'Java Concurrency in practice' by Brian Goetz, p. 349: - * - * "Initialization safety guarantees that for properly constructed objects, all - * threads will see the correct values of final fields that were set by the con- - * structor, regardless of how the object is published. 
Further, any variables - * that can be reached through a final field of a properly constructed object - * (such as the elements of a final array or the contents of a HashMap refer- - * enced by a final field) are also guaranteed to be visible to other threads." - */ - public static final class Pool { - public final byte[][] buffers; - - public Pool(byte[][] buffers) { - this.buffers = buffers; - } - - public byte[][] getBlocks() { - return buffers; - } - } - - public Pool pool = new Pool(new byte[10][]); - // The index of the current buffer in pool.buffers. - public int bufferUpto = -1; - // The number of bytes that have been written in the current buffer. - public int byteUpto = ByteBlockPool.BYTE_BLOCK_SIZE; - // The current buffer, i.e. a reference to pool.buffers[bufferUpto] - public byte[] buffer; - // The total number of bytes that have been used up to now, excluding the current buffer. - public int byteOffset = -ByteBlockPool.BYTE_BLOCK_SIZE; - // The one and only WriteStream for this pool. - private WriteStream writeStream = new WriteStream(); - - protected BaseByteBlockPool() { } - - /** - * Used for loading flushed pool. - */ - protected BaseByteBlockPool(Pool pool, int bufferUpto, int byteUpTo, int byteOffset) { - this.pool = pool; - this.bufferUpto = bufferUpto; - this.byteUpto = byteUpTo; - this.byteOffset = byteOffset; - if (bufferUpto >= 0) { - this.buffer = pool.buffers[bufferUpto]; - } - } - - /** - * Resets the index of the pool to 0 in the first buffer and resets the byte arrays of - * all previously allocated buffers to 0s. 
- */ - public void reset() { - if (bufferUpto != -1) { - // We allocated at least one buffer - - for (int i = 0; i < bufferUpto; i++) { - // Fully zero fill buffers that we fully used - Arrays.fill(pool.buffers[i], (byte) 0); - } - - // Partial zero fill the final buffer - Arrays.fill(pool.buffers[bufferUpto], 0, byteUpto, (byte) 0); - - bufferUpto = 0; - byteUpto = 0; - byteOffset = 0; - buffer = pool.buffers[0]; - } - } - - /** - * Switches to the next buffer and positions the index at its beginning. - */ - public void nextBuffer() { - if (1 + bufferUpto == pool.buffers.length) { - byte[][] newBuffers = new byte[ArrayUtil.oversize(pool.buffers.length + 1, - NUM_BYTES_OBJECT_REF)][]; - System.arraycopy(pool.buffers, 0, newBuffers, 0, pool.buffers.length); - pool = new Pool(newBuffers); - } - buffer = pool.buffers[1 + bufferUpto] = new byte[ByteBlockPool.BYTE_BLOCK_SIZE]; - bufferUpto++; - - byteUpto = 0; - byteOffset += ByteBlockPool.BYTE_BLOCK_SIZE; - } - - /** - * Returns the start offset of the next data that will be added to the pool, UNLESS the data is - * added using addBytes and avoidSplitting = true - */ - public int getOffset() { - return byteOffset + byteUpto; - } - - /** - * Returns the start offset of b in the pool - * @param b byte to put - */ - public int addByte(byte b) { - int initOffset = byteOffset + byteUpto; - int remainingBytesInBuffer = ByteBlockPool.BYTE_BLOCK_SIZE - byteUpto; - // If the buffer is full, move on to the next one. - if (remainingBytesInBuffer <= 0) { - nextBuffer(); - } - buffer[byteUpto] = b; - byteUpto++; - return initOffset; - } - - /** - * Returns the start offset of the bytes in the pool. 
- * If avoidSplitting is false, this is guaranteed to return the same value that would be - * returned by getOffset() - * @param bytes source array - * @param length number of bytes to put - * @param avoidSplitting if possible (the length is less than ByteBlockPool.BYTE_BLOCK_SIZE), - * the bytes will not be split across buffer boundaries. This is useful for small data - * that will be read a lot (small amount of space wasted in return for avoiding copying - * memory when calling getBytes). - */ - public int addBytes(byte[] bytes, int offset, int length, boolean avoidSplitting) { - // The first time this is called, there may not be an existing buffer yet. - if (buffer == null) { - nextBuffer(); - } - - int remainingBytesInBuffer = ByteBlockPool.BYTE_BLOCK_SIZE - byteUpto; - - if (avoidSplitting && length < ByteBlockPool.BYTE_BLOCK_SIZE) { - if (remainingBytesInBuffer < length) { - nextBuffer(); - } - int initOffset = byteOffset + byteUpto; - System.arraycopy(bytes, offset, buffer, byteUpto, length); - byteUpto += length; - return initOffset; - } else { - int initOffset = byteOffset + byteUpto; - if (remainingBytesInBuffer < length) { - // Must split the bytes across buffers. - int remainingLength = length; - while (remainingLength > ByteBlockPool.BYTE_BLOCK_SIZE - byteUpto) { - int lengthToCopy = ByteBlockPool.BYTE_BLOCK_SIZE - byteUpto; - System.arraycopy(bytes, length - remainingLength + offset, - buffer, byteUpto, lengthToCopy); - remainingLength -= lengthToCopy; - nextBuffer(); - } - System.arraycopy(bytes, length - remainingLength + offset, - buffer, byteUpto, remainingLength); - byteUpto += remainingLength; - } else { - // Just add all bytes to the current buffer. - System.arraycopy(bytes, offset, buffer, byteUpto, length); - byteUpto += length; - } - return initOffset; - } - } - - /** - * Default addBytes. Does not avoid splitting. 
- * @see #addBytes(byte[], int, boolean) - */ - public int addBytes(byte[] bytes, int length) { - return addBytes(bytes, 0, length, false); - } - - /** - * Default addBytes. Does not avoid splitting. - * @see #addBytes(byte[], int, boolean) - */ - public int addBytes(byte[] bytes, int offset, int length) { - return addBytes(bytes, offset, length, false); - } - - /** - * Reads one byte from the pool. - * @param offset location to read byte from - */ - public byte getByte(int offset) { - int bufferIndex = offset >>> ByteBlockPool.BYTE_BLOCK_SHIFT; - int bufferOffset = offset & ByteBlockPool.BYTE_BLOCK_MASK; - return pool.buffers[bufferIndex][bufferOffset]; - } - - /** - * Returns false if offset is invalid or there aren't these many bytes - * available in the pool. - * @param offset location to start reading bytes from - * @param length number of bytes to read - * @param output the object to write the output to. MUST be non null. - */ - public boolean getBytesToBytesRef(int offset, int length, BytesRef output) { - if (offset < 0 || offset + length > byteUpto + byteOffset) { - return false; - } - int currentBuffer = offset >>> ByteBlockPool.BYTE_BLOCK_SHIFT; - int currentOffset = offset & ByteBlockPool.BYTE_BLOCK_MASK; - // If the requested bytes are split across pools, we have to make a new array of bytes - // to copy them into and return a ref to that. 
- if (currentOffset + length <= ByteBlockPool.BYTE_BLOCK_SIZE) { - output.bytes = pool.buffers[currentBuffer]; - output.offset = currentOffset; - output.length = length; - } else { - byte[] bytes = new byte[length]; - int remainingLength = length; - while (remainingLength > ByteBlockPool.BYTE_BLOCK_SIZE - currentOffset) { - int lengthToCopy = ByteBlockPool.BYTE_BLOCK_SIZE - currentOffset; - System.arraycopy(pool.buffers[currentBuffer], currentOffset, bytes, - length - remainingLength, lengthToCopy); - remainingLength -= lengthToCopy; - currentBuffer++; - currentOffset = 0; - } - System.arraycopy(pool.buffers[currentBuffer], currentOffset, bytes, length - remainingLength, - remainingLength); - output.bytes = bytes; - output.length = bytes.length; - output.offset = 0; - } - return true; - - } - - /** - * Returns the read bytes, or null if offset is invalid or there aren't these many bytes - * available in the pool. - * @param offset location to start reading bytes from - * @param length number of bytes to read - */ - public BytesRef getBytes(int offset, int length) { - BytesRef result = new BytesRef(); - if (getBytesToBytesRef(offset, length, result)) { - return result; - } else { - return null; - } - } - - /** - * get a new readStream at a given offset for this pool. - * - * Notice that individual ReadStreams are not threadsafe, but you can get as many ReadStreams as - * you want. - */ - public ReadStream getReadStream(int offset) { - return new ReadStream(offset); - } - - /** - * get the (one and only) WriteStream for this pool. - * - * Notice that there is exactly one WriteStream per pool, and it is not threadsafe. - */ - public WriteStream getWriteStream() { - return writeStream; - } - - /** - * A DataOutput-like interface for writing "contiguous" data to a ByteBlockPool. - * - * This is not threadsafe. 
- */ - public final class WriteStream extends DataOutput { - private WriteStream() { } - - /** - * Returns the start offset of the next data that will be added to the pool, UNLESS the data is - * added using addBytes and avoidSplitting = true - */ - public int getOffset() { - return BaseByteBlockPool.this.getOffset(); - } - - /** - * Write bytes to the pool. - * @param bytes source array - * @param offset offset in bytes of the data to write - * @param length number of bytes to put - * @param avoidSplitting same as {link ByteBlockPool.addBytes} - * @return the start offset of the bytes in the pool - */ - public int writeBytes(byte[] bytes, int offset, int length, boolean avoidSplitting) { - return addBytes(bytes, offset, length, avoidSplitting); - } - - @Override - public void writeBytes(byte[] b, int offset, int length) throws IOException { - addBytes(b, offset, length); - } - - @Override - public void writeByte(byte b) { - addByte(b); - } - } - - /** - * A DataInput-like interface for reading "contiguous" data from a ByteBlockPool. - * - * This is not threadsafe. - * - * This does not fully implement the DataInput interface - its DataInput.readBytes method throws - * UnsupportedOperationException because this class provides a facility for no-copy reading. - */ - public final class ReadStream extends DataInput { - private int offset; - - private ReadStream(int offset) { - this.offset = offset; - } - - public BytesRef readBytes(int n) { - return readBytes(n, false); - } - - /** - * read n bytes that were written with a given value of avoidSplitting - * @param n number of bytes to read. - * @param avoidSplitting this should be the same that was used at writeBytes time. - * @return a reference to the bytes read or null. 
- */ - public BytesRef readBytes(int n, boolean avoidSplitting) { - int currentBuffer = offset >>> ByteBlockPool.BYTE_BLOCK_SHIFT; - int currentOffset = offset & ByteBlockPool.BYTE_BLOCK_MASK; - if (avoidSplitting && n < ByteBlockPool.BYTE_BLOCK_SIZE - && currentOffset + n > ByteBlockPool.BYTE_BLOCK_SIZE) { - ++currentBuffer; - currentOffset = 0; - offset = currentBuffer << ByteBlockPool.BYTE_BLOCK_SHIFT; - } - BytesRef result = getBytes(offset, n); - this.offset += n; - return result; - } - - @Override - public byte readByte() { - return getByte(offset++); - } - - @Override - public void readBytes(byte[] b, int off, int len) throws IOException { - throw new UnsupportedOperationException("Use the no-copies version of ReadBytes instead."); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.docx new file mode 100644 index 000000000..067e196a8 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.java b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.java deleted file mode 100644 index 1401b4003..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteBlockPool.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -public class ByteBlockPool extends BaseByteBlockPool implements Flushable { - - public ByteBlockPool() { - } - - /** - * Used for loading flushed pool. 
- */ - private ByteBlockPool(Pool pool, int bufferUpto, int byteUpTo, int byteOffset) { - super(pool, bufferUpto, byteUpTo, byteOffset); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String BUFFER_UP_TO_PROP_NAME = "bufferUpto"; - private static final String BYTE_UP_TO_PROP_NAME = "byteUpto"; - private static final String BYTE_OFFSET_PROP_NAME = "byteOffset"; - - public FlushHandler(ByteBlockPool objectToFlush) { - super(objectToFlush); - } - - public FlushHandler() { - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - ByteBlockPool objectToFlush = getObjectToFlush(); - out.writeByteArray2D(objectToFlush.pool.buffers, objectToFlush.bufferUpto + 1); - flushInfo.addIntProperty(BUFFER_UP_TO_PROP_NAME, objectToFlush.bufferUpto); - flushInfo.addIntProperty(BYTE_UP_TO_PROP_NAME, objectToFlush.byteUpto); - flushInfo.addIntProperty(BYTE_OFFSET_PROP_NAME, objectToFlush.byteOffset); - } - - @Override - protected ByteBlockPool doLoad(FlushInfo flushInfo, - DataDeserializer in) throws IOException { - return new ByteBlockPool( - new BaseByteBlockPool.Pool(in.readByteArray2D()), - flushInfo.getIntProperty(BUFFER_UP_TO_PROP_NAME), - flushInfo.getIntProperty(BYTE_UP_TO_PROP_NAME), - flushInfo.getIntProperty(BYTE_OFFSET_PROP_NAME)); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.docx new file mode 100644 index 000000000..ff697ec29 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.java b/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.java deleted file mode 100644 index c246caa68..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/inverted/ByteTermUtils.java +++ /dev/null @@ -1,126 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import org.apache.lucene.util.ByteBlockPool; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.StringHelper; - -/** - * Utility class for BytePools which have each term's length encoded before the contents in the - * ByteBlockPool - * Another solution is to have a class that encapsulates both textStarts and the byteBlockPool and - * knows how the byteBlockPool is used to store the strings - **/ -public abstract class ByteTermUtils { - /** - * Fill in a BytesRef from term's length & bytes encoded in byte block - */ - public static int setBytesRef(final BaseByteBlockPool byteBlockPool, - BytesRef term, - final int textStart) { - final byte[] block = term.bytes = - byteBlockPool.pool.buffers[textStart >>> ByteBlockPool.BYTE_BLOCK_SHIFT]; - final int start = textStart & ByteBlockPool.BYTE_BLOCK_MASK; - int pos = start; - - byte b = block[pos++]; - term.length = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = block[pos++]; - term.length |= (b & 0x7F) << shift; - } - term.offset = pos; - - assert term.length >= 0; - return textStart + (pos - start) + term.length; - } - - /** - * Test whether the text for current RawPostingList p equals - * current tokenText in utf8. 
- */ - public static boolean postingEquals(final BaseByteBlockPool termPool, - final int textStart, final BytesRef other) { - final byte[] block = termPool.pool.getBlocks()[textStart >>> ByteBlockPool.BYTE_BLOCK_SHIFT]; - assert block != null; - - int pos = textStart & ByteBlockPool.BYTE_BLOCK_MASK; - - byte b = block[pos++]; - int len = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = block[pos++]; - len |= (b & 0x7F) << shift; - } - - if (len == other.length) { - final byte[] utf8Bytes = other.bytes; - for (int tokenPos = other.offset; - tokenPos < other.length + other.offset; pos++, tokenPos++) { - if (utf8Bytes[tokenPos] != block[pos]) { - return false; - } - } - return true; - } else { - return false; - } - } - - /** - * Returns the hashCode of the term stored at the given position in the block pool. - */ - public static int hashCode( - final BaseByteBlockPool termPool, final int textStart) { - final byte[] block = termPool.pool.getBlocks()[textStart >>> ByteBlockPool.BYTE_BLOCK_SHIFT]; - final int start = textStart & ByteBlockPool.BYTE_BLOCK_MASK; - - int pos = start; - - byte b = block[pos++]; - int len = b & 0x7F; - for (int shift = 7; (b & 0x80) != 0; shift += 7) { - b = block[pos++]; - len |= (b & 0x7F) << shift; - } - - // Hash code returned here must be consistent with the one used in TermHashTable.lookupItem, so - // use the fixed hash seed. See TermHashTable.lookupItem for explanation of fixed hash seed. - return StringHelper.murmurhash3_x86_32(block, pos, len, InvertedRealtimeIndex.FIXED_HASH_SEED); - } - - /** - * Copies the utf8 encoded byte ref to the termPool. - * @param termPool - * @param utf8 - * @return The text's start position in the termPool - */ - public static int copyToTermPool(BaseByteBlockPool termPool, BytesRef bytes) { - // Maybe grow the termPool before we write. Assume we need 5 bytes in - // the worst case to store the VInt. 
- if (bytes.length + 5 + termPool.byteUpto > ByteBlockPool.BYTE_BLOCK_SIZE) { - // Not enough room in current block - termPool.nextBuffer(); - } - - final int textStart = termPool.byteUpto + termPool.byteOffset; - - writeVInt(termPool, bytes.length); - System.arraycopy(bytes.bytes, bytes.offset, termPool.buffer, termPool.byteUpto, bytes.length); - termPool.byteUpto += bytes.length; - - return textStart; - } - - private static void writeVInt(final BaseByteBlockPool termPool, final int v) { - int value = v; - final byte[] block = termPool.buffer; - int blockUpto = termPool.byteUpto; - - while ((value & ~0x7F) != 0) { - block[blockUpto++] = (byte) ((value & 0x7f) | 0x80); - value >>>= 7; - } - block[blockUpto++] = (byte) value; - termPool.byteUpto = blockUpto; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.docx new file mode 100644 index 000000000..cf38e3f98 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.java b/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.java deleted file mode 100644 index 264d105fa..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/DeletedDocs.java +++ /dev/null @@ -1,245 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.util.Bits; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import 
it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -public abstract class DeletedDocs implements Flushable { - private static final Logger LOG = LoggerFactory.getLogger(DeletedDocs.class); - - /** - * Deletes the given document. - */ - public abstract boolean deleteDoc(int docID); - - /** - * Returns a point-in-time view of the deleted docs. Calling {@link #deleteDoc(int)} afterwards - * will not alter this View. - */ - public abstract View getView(); - - /** - * Number of deletions. - */ - public abstract int numDeletions(); - - /** - * Returns a DeletedDocs instance that has the same deleted tweet IDs, but mapped to the doc IDs - * in the optimizedTweetIdMapper. - * - * @param originalTweetIdMapper The original DocIDToTweetIDMapper instance that was used to add - * doc IDs to this DeletedDocs instance. - * @param optimizedTweetIdMapper The new DocIDToTweetIDMapper instance. - * @return An DeletedDocs instance that has the same tweets deleted, but mapped to the doc IDs in - * optimizedTweetIdMapper. - */ - public abstract DeletedDocs optimize( - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException; - - public abstract class View { - /** - * Returns true, if the given document was deleted. - */ - public abstract boolean isDeleted(int docID); - - /** - * Returns true, if there are any deleted documents in this View. - */ - public abstract boolean hasDeletions(); - - /** - * Returns {@link Bits} where all deleted documents have their bit set to 0, and - * all non-deleted documents have their bits set to 1. - */ - public abstract Bits getLiveDocs(); - } - - public static class Default extends DeletedDocs { - private static final int KEY_NOT_FOUND = -1; - - private final int size; - private final Int2IntOpenHashMap deletes; - - // Each delete is marked with a unique, consecutively-increasing sequence ID. 
- private int sequenceID = 0; - - public Default(int size) { - this.size = size; - deletes = new Int2IntOpenHashMap(size); - deletes.defaultReturnValue(KEY_NOT_FOUND); - } - - /** - * Returns false, if this call was a noop, i.e. if the document was already deleted. - */ - @Override - public boolean deleteDoc(int docID) { - if (deletes.putIfAbsent(docID, sequenceID) == KEY_NOT_FOUND) { - sequenceID++; - return true; - } - return false; - } - - private boolean isDeleted(int internalID, int readerSequenceID) { - int deletedSequenceId = deletes.get(internalID); - return (deletedSequenceId >= 0) && (deletedSequenceId < readerSequenceID); - } - - private boolean hasDeletions(int readerSequenceID) { - return readerSequenceID > 0; - } - - @Override - public int numDeletions() { - return sequenceID; - } - - @Override - public View getView() { - return new View() { - private final int readerSequenceID = sequenceID; - - // liveDocs bitset contains inverted (decreasing) docids. - public final Bits liveDocs = !hasDeletions() ? null : new Bits() { - @Override - public final boolean get(int docID) { - return !isDeleted(docID); - } - - @Override - public final int length() { - return size; - } - }; - - @Override - public Bits getLiveDocs() { - return liveDocs; - } - - - // Operates on internal (increasing) docids. 
- @Override - public final boolean isDeleted(int internalID) { - return DeletedDocs.Default.this.isDeleted(internalID, readerSequenceID); - } - - @Override - public final boolean hasDeletions() { - return DeletedDocs.Default.this.hasDeletions(readerSequenceID); - } - }; - } - - @Override - public DeletedDocs optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - DeletedDocs optimizedDeletedDocs = new Default(size); - for (int deletedDocID : deletes.keySet()) { - long tweetID = originalTweetIdMapper.getTweetID(deletedDocID); - int optimizedDeletedDocID = optimizedTweetIdMapper.getDocID(tweetID); - optimizedDeletedDocs.deleteDoc(optimizedDeletedDocID); - } - return optimizedDeletedDocs; - } - - @SuppressWarnings("unchecked") - @Override - public Default.FlushHandler getFlushHandler() { - return new Default.FlushHandler(this, size); - } - - public static final class FlushHandler extends Flushable.Handler { - private final int size; - - public FlushHandler(Default objectToFlush, int size) { - super(objectToFlush); - this.size = size; - } - - public FlushHandler(int size) { - this.size = size; - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - long startTime = getClock().nowMillis(); - - Int2IntOpenHashMap deletes = getObjectToFlush().deletes; - out.writeIntArray(deletes.keySet().toIntArray()); - - getFlushTimerStats().timerIncrement(getClock().nowMillis() - startTime); - } - - @Override - protected Default doLoad(FlushInfo flushInfo, DataDeserializer in) throws IOException { - Default deletedDocs = new Default(size); - long startTime = getClock().nowMillis(); - - int[] deletedDocIDs = in.readIntArray(); - for (int docID : deletedDocIDs) { - deletedDocs.deleteDoc(docID); - } - - getLoadTimerStats().timerIncrement(getClock().nowMillis() - startTime); - return deletedDocs; - } - } - } - - public static final DeletedDocs NO_DELETES = new DeletedDocs() 
{ - @Override - public Handler getFlushHandler() { - return null; - } - - @Override - public boolean deleteDoc(int docID) { - return false; - } - - @Override - public DeletedDocs optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) { - return this; - } - - @Override - public int numDeletions() { - return 0; - } - - @Override - public View getView() { - return new View() { - @Override - public boolean isDeleted(int docID) { - return false; - } - - @Override - public boolean hasDeletions() { - return false; - } - - @Override - public Bits getLiveDocs() { - return null; - } - - }; - } - }; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.docx new file mode 100644 index 000000000..e0d7fa7c2 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.java b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.java deleted file mode 100644 index 45fec2f5f..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdCSFDocValuesProcessor.java +++ /dev/null @@ -1,74 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.facet.FacetsConfig; -import org.apache.lucene.index.DocValuesType; -import org.apache.lucene.index.IndexableField; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter; -import com.twitter.search.core.earlybird.index.column.AbstractColumnStrideMultiIntIndex; -import 
com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.column.DocValuesManager; - -/** - * Handler for docvalues in the indexing chain. - */ -public class EarlybirdCSFDocValuesProcessor - implements EarlybirdRealtimeIndexSegmentWriter.StoredFieldsConsumer { - - private final DocValuesManager docValuesManager; - - public EarlybirdCSFDocValuesProcessor(DocValuesManager docValuesManager) { - this.docValuesManager = docValuesManager; - } - - @Override - public void addField(int docID, IndexableField field) throws IOException { - final DocValuesType dvType = field.fieldType().docValuesType(); - if (dvType != null) { - - // ignore lucene facet fields for realtime index, we are handling it differently - if (field.name().startsWith(FacetsConfig.DEFAULT_INDEX_FIELD_NAME)) { - return; - } - if (!(field.fieldType() instanceof EarlybirdFieldType)) { - throw new RuntimeException( - "fieldType must be an EarlybirdFieldType instance for field " + field.name()); - } - EarlybirdFieldType fieldType = (EarlybirdFieldType) field.fieldType(); - - if (dvType == DocValuesType.NUMERIC) { - if (!(field.numericValue() instanceof Long)) { - throw new IllegalArgumentException( - "illegal type " + field.numericValue().getClass() - + ": DocValues types must be Long"); - } - - ColumnStrideFieldIndex csfIndex = - docValuesManager.addColumnStrideField(field.name(), fieldType); - if (fieldType.getCsfFixedLengthNumValuesPerDoc() > 1) { - throw new UnsupportedOperationException("unsupported multi numeric values"); - } else { - csfIndex.setValue(docID, field.numericValue().longValue()); - } - - } else if (dvType == DocValuesType.BINARY) { - ColumnStrideFieldIndex csfIndex = - docValuesManager.addColumnStrideField(field.name(), fieldType); - if (fieldType.getCsfFixedLengthNumValuesPerDoc() > 1) { - Preconditions.checkArgument( - csfIndex instanceof AbstractColumnStrideMultiIntIndex, - "Unsupported multi-value binary CSF class: " + 
csfIndex); - ((AbstractColumnStrideMultiIntIndex) csfIndex).updateDocValues( - field.binaryValue(), docID); - } - } else { - throw new UnsupportedOperationException("unsupported DocValues.Type: " + dvType); - } - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.docx new file mode 100644 index 000000000..595523d46 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.java b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.java deleted file mode 100644 index a60562c5b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdOptimizedPostingsEnum.java +++ /dev/null @@ -1,178 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.util.BytesRef; - -/** - * Extend {@link EarlybirdPostingsEnum} to add more functionalities for docs (and positions) - * enumerator of {@link OptimizedPostingLists}. - */ -public abstract class EarlybirdOptimizedPostingsEnum extends EarlybirdPostingsEnum { - /** Current doc and its frequency. */ - private int currentDocID = -1; - private int currentFreq = 0; - - /** - * Next doc and its frequency. - * These values should be set at {@link #loadNextPosting()}. - */ - protected int nextDocID; - protected int nextFreq; - - /** Pointer to the enumerated posting list. */ - protected final int postingListPointer; - - /** Total number of postings in the enumerated posting list. */ - protected final int numPostingsTotal; - - /** Query cost tracker. */ - protected final QueryCostTracker queryCostTracker; - - /** - * Sole constructor. 
- * - * @param postingListPointer pointer to the posting list for which this enumerator is created - * @param numPostings number of postings in the posting list for which this enumerator is created - */ - public EarlybirdOptimizedPostingsEnum(int postingListPointer, int numPostings) { - this.postingListPointer = postingListPointer; - this.numPostingsTotal = numPostings; - - // Get the thread local query cost tracker. - this.queryCostTracker = QueryCostTracker.getTracker(); - } - - /** - * Set {@link #currentDocID} and {@link #currentFreq} and load next posting. - * This method will de-dup if duplicate doc IDs are stored. - * - * @return {@link #currentDocID} - * @see {@link #nextDoc()} - */ - @Override - protected final int nextDocNoDel() throws IOException { - currentDocID = nextDocID; - - // Return immediately if exhausted. - if (currentDocID == NO_MORE_DOCS) { - return NO_MORE_DOCS; - } - - currentFreq = nextFreq; - loadNextPosting(); - - // In case duplicate doc ID is stored. - while (currentDocID == nextDocID) { - currentFreq += nextFreq; - loadNextPosting(); - } - - startCurrentDoc(); - return currentDocID; - } - - /** - * Called when {@link #nextDocNoDel()} advances to a new docID. - * Subclasses can do extra accounting as needed. - */ - protected void startCurrentDoc() { - // No-op in this class. - } - - /** - * Loads the next posting, setting the nextDocID and nextFreq. - * - * @see #nextDocNoDel() - */ - protected abstract void loadNextPosting(); - - /** - * Subclass should implement {@link #skipTo(int)}. - * - * @see org.apache.lucene.search.DocIdSetIterator#advance(int) - */ - @Override - public final int advance(int target) throws IOException { - // Skipping to NO_MORE_DOCS or beyond largest doc ID. - if (target == NO_MORE_DOCS || target > getLargestDocID()) { - currentDocID = nextDocID = NO_MORE_DOCS; - currentFreq = nextFreq = 0; - return NO_MORE_DOCS; - } - - // Skip as close as possible. 
- skipTo(target); - - // Calling nextDoc to reach the target, or go beyond it if target does not exist. - int doc; - do { - doc = nextDoc(); - } while (doc < target); - - return doc; - } - - /** - * Used in {@link #advance(int)}. - * This method should skip to the given target as close as possible, but NOT reach the target. - * - * @see #advance(int) - */ - protected abstract void skipTo(int target); - - /** - * Return loaded {@link #currentFreq}. - * - * @see org.apache.lucene.index.PostingsEnum#freq() - * @see #nextDocNoDel() - */ - @Override - public final int freq() throws IOException { - return currentFreq; - } - - /** - * Return loaded {@link #currentDocID}. - * - * @see org.apache.lucene.index.PostingsEnum#docID() () - * @see #nextDocNoDel() - */ - @Override - public final int docID() { - return currentDocID; - } - - /********************************************* - * Not Supported Information * - * @see org.apache.lucene.index.PostingsEnum * - *********************************************/ - - @Override - public int nextPosition() throws IOException { - return -1; - } - - @Override - public int startOffset() throws IOException { - return -1; - } - - @Override - public int endOffset() throws IOException { - return -1; - } - - @Override - public BytesRef getPayload() throws IOException { - return null; - } - - /********************************* - * Helper methods for subclasses * - *********************************/ - - protected int getCurrentFreq() { - return currentFreq; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.docx new file mode 100644 index 000000000..f08e496f6 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.java 
b/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.java deleted file mode 100644 index 535c8b55d..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/EarlybirdPostingsEnum.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.index.PostingsEnum; - -/** - * Extension of Lucene's PostingsEnum interface that adds additional funcionality. - */ -public abstract class EarlybirdPostingsEnum extends PostingsEnum { - @Override - public final int nextDoc() throws IOException { - // SEARCH-7008 - return nextDocNoDel(); - } - - /** - * Advances to the next doc without paying attention to liveDocs. - */ - protected abstract int nextDocNoDel() throws IOException; - - /** - * Returns the largest docID contained in this posting list. - */ - public abstract int getLargestDocID() throws IOException; -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.docx new file mode 100644 index 000000000..7e219fc5a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.java b/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.java deleted file mode 100644 index 638cbaffc..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/FSTTermDictionary.java +++ /dev/null @@ -1,299 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Comparator; - -import org.apache.lucene.index.BaseTermsEnum; -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.index.TermsEnum; -import 
org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.InPlaceMergeSorter; -import org.apache.lucene.util.IntsRefBuilder; -import org.apache.lucene.util.fst.BytesRefFSTEnum; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.PositiveIntOutputs; -import org.apache.lucene.util.fst.Util; -import org.apache.lucene.util.packed.PackedInts; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -public class FSTTermDictionary implements TermDictionary, Flushable { - private final FST fst; - - private final PackedInts.Reader termPointers; - private final ByteBlockPool termPool; - private final TermPointerEncoding termPointerEncoding; - private int numTerms; - - FSTTermDictionary(int numTerms, FST fst, - ByteBlockPool termPool, PackedInts.Reader termPointers, - TermPointerEncoding termPointerEncoding) { - this.numTerms = numTerms; - this.fst = fst; - this.termPool = termPool; - this.termPointers = termPointers; - this.termPointerEncoding = termPointerEncoding; - } - - @Override - public int getNumTerms() { - return numTerms; - } - - @Override - public int lookupTerm(BytesRef term) throws IOException { - if (fst == null) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum<>(fst); - - final BytesRefFSTEnum.InputOutput result = fstEnum.seekExact(term); - if (result != null && result.input.equals(term)) { - // -1 because 0 is not supported by the fst - return result.output.intValue() - 1; - } else { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - } - - static FSTTermDictionary buildFST( - final ByteBlockPool termPool, - int[] termPointers, - int numTerms, - final 
Comparator comp, - boolean supportTermTextLookup, - final TermPointerEncoding termPointerEncoding) throws IOException { - final IntsRefBuilder scratchIntsRef = new IntsRefBuilder(); - - final int[] compact = new int[numTerms]; - for (int i = 0; i < numTerms; i++) { - compact[i] = i; - } - - // first sort the terms - new InPlaceMergeSorter() { - private BytesRef scratch1 = new BytesRef(); - private BytesRef scratch2 = new BytesRef(); - - @Override - protected void swap(int i, int j) { - final int o = compact[i]; - compact[i] = compact[j]; - compact[j] = o; - } - - @Override - protected int compare(int i, int j) { - final int ord1 = compact[i]; - final int ord2 = compact[j]; - ByteTermUtils.setBytesRef(termPool, scratch1, - termPointerEncoding.getTextStart(termPointers[ord1])); - ByteTermUtils.setBytesRef(termPool, scratch2, - termPointerEncoding.getTextStart(termPointers[ord2])); - return comp.compare(scratch1, scratch2); - } - - }.sort(0, compact.length); - - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - - final org.apache.lucene.util.fst.Builder builder = - new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE1, outputs); - - final BytesRef term = new BytesRef(); - for (int termID : compact) { - ByteTermUtils.setBytesRef(termPool, term, - termPointerEncoding.getTextStart(termPointers[termID])); - // +1 because 0 is not supported by the fst - builder.add(Util.toIntsRef(term, scratchIntsRef), (long) termID + 1); - } - - if (supportTermTextLookup) { - PackedInts.Reader packedTermPointers = OptimizedMemoryIndex.getPackedInts(termPointers); - return new FSTTermDictionary( - numTerms, - builder.finish(), - termPool, - packedTermPointers, - termPointerEncoding); - } else { - return new FSTTermDictionary( - numTerms, - builder.finish(), - null, // termPool - null, // termPointers - termPointerEncoding); - } - } - - @Override - public boolean getTerm(int termID, BytesRef text, BytesRef termPayload) { - if (termPool == null) { - throw new 
UnsupportedOperationException( - "This dictionary does not support term lookup by termID"); - } else { - int termPointer = (int) termPointers.get(termID); - boolean hasTermPayload = termPointerEncoding.hasPayload(termPointer); - int textStart = termPointerEncoding.getTextStart(termPointer); - // setBytesRef sets the passed in BytesRef "text" to the term in the termPool. - // As a side effect it returns the offset of the next entry in the pool after the term, - // which may optionally be used if this term has a payload. - int termPayloadStart = ByteTermUtils.setBytesRef(termPool, text, textStart); - if (termPayload != null && hasTermPayload) { - ByteTermUtils.setBytesRef(termPool, termPayload, termPayloadStart); - } - - return hasTermPayload; - } - } - - @Override - public TermsEnum createTermsEnum(OptimizedMemoryIndex index) { - return new BaseTermsEnum() { - private final BytesRefFSTEnum fstEnum = fst != null ? new BytesRefFSTEnum<>(fst) : null; - private BytesRefFSTEnum.InputOutput current; - - @Override - public SeekStatus seekCeil(BytesRef term) - throws IOException { - if (fstEnum == null) { - return SeekStatus.END; - } - - current = fstEnum.seekCeil(term); - if (current != null && current.input.equals(term)) { - return SeekStatus.FOUND; - } else { - return SeekStatus.END; - } - } - - @Override - public boolean seekExact(BytesRef text) throws IOException { - current = fstEnum.seekExact(text); - return current != null; - } - - // In our case the ord is the termId. 
- @Override - public void seekExact(long ord) { - current = new BytesRefFSTEnum.InputOutput<>(); - current.input = null; - // +1 because 0 is not supported by the fst - current.output = ord + 1; - - if (termPool != null) { - BytesRef bytesRef = new BytesRef(); - int termId = (int) ord; - assert termId == ord; - FSTTermDictionary.this.getTerm(termId, bytesRef, null); - current.input = bytesRef; - } - } - - @Override - public BytesRef next() throws IOException { - current = fstEnum.next(); - if (current == null) { - return null; - } - return current.input; - } - - @Override - public BytesRef term() { - return current.input; - } - - // In our case the ord is the termId. - @Override - public long ord() { - // -1 because 0 is not supported by the fst - return current.output - 1; - } - - @Override - public int docFreq() { - return index.getDF((int) ord()); - } - - @Override - public long totalTermFreq() { - return docFreq(); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - int termID = (int) ord(); - int postingsPointer = index.getPostingListPointer(termID); - int numPostings = index.getNumPostings(termID); - return index.getPostingLists().postings(postingsPointer, numPostings, flags); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - return new SlowImpactsEnum(postings(null, flags)); - } - }; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String NUM_TERMS_PROP_NAME = "numTerms"; - private static final String SUPPORT_TERM_TEXT_LOOKUP_PROP_NAME = "supportTermTextLookup"; - private final TermPointerEncoding termPointerEncoding; - - public FlushHandler(TermPointerEncoding termPointerEncoding) { - super(); - this.termPointerEncoding = termPointerEncoding; - } - - public FlushHandler(FSTTermDictionary objectToFlush) { - 
super(objectToFlush); - this.termPointerEncoding = objectToFlush.termPointerEncoding; - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - FSTTermDictionary objectToFlush = getObjectToFlush(); - flushInfo.addIntProperty(NUM_TERMS_PROP_NAME, objectToFlush.getNumTerms()); - flushInfo.addBooleanProperty(SUPPORT_TERM_TEXT_LOOKUP_PROP_NAME, - objectToFlush.termPool != null); - if (objectToFlush.termPool != null) { - out.writePackedInts(objectToFlush.termPointers); - objectToFlush.termPool.getFlushHandler().flush(flushInfo.newSubProperties("termPool"), out); - } - objectToFlush.fst.save(out.getIndexOutput()); - } - - @Override - protected FSTTermDictionary doLoad(FlushInfo flushInfo, - DataDeserializer in) throws IOException { - int numTerms = flushInfo.getIntProperty(NUM_TERMS_PROP_NAME); - boolean supportTermTextLookup = - flushInfo.getBooleanProperty(SUPPORT_TERM_TEXT_LOOKUP_PROP_NAME); - PackedInts.Reader termPointers = null; - ByteBlockPool termPool = null; - if (supportTermTextLookup) { - termPointers = in.readPackedInts(); - termPool = (new ByteBlockPool.FlushHandler()) - .load(flushInfo.getSubProperties("termPool"), in); - } - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(); - return new FSTTermDictionary(numTerms, new FST<>(in.getIndexInput(), outputs), - termPool, termPointers, termPointerEncoding); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.docx new file mode 100644 index 000000000..1c1df0a2c Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.java 
b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.java deleted file mode 100644 index 7b18275d0..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsAndPositionsEnum.java +++ /dev/null @@ -1,156 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -/** - * Docs, frequencies, and positions enumerator for {@link HighDFPackedIntsPostingLists}. - */ -public class HighDFPackedIntsDocsAndPositionsEnum extends HighDFPackedIntsDocsEnum { - /** - * Pre-computed shifts, masks, and start int indices for {@link #positionListsReader}. - * These pre-computed values should be read-only and shared across all reader threads. - * - * Notice: - * - start int indices are NEEDED since there IS jumping within a slice in - * {@link #doAdditionalSkip()} and {@link #startCurrentDoc()}. - */ - private static final PackedLongsReaderPreComputedValues PRE_COMPUTED_VALUES = - new PackedLongsReaderPreComputedValues( - HighDFPackedIntsPostingLists.MAX_POSITION_BIT, - HighDFPackedIntsPostingLists.POSITION_SLICE_NUM_BITS_WITHOUT_HEADER, - HighDFPackedIntsPostingLists.POSITION_SLICE_SIZE_WITHOUT_HEADER, - true); - - /** - * Int block pool holding the positions for the read posting list. This is mainly used while - * reading slice headers in {@link #loadNextPositionSlice()}. - */ - private final IntBlockPool positionLists; - - /** Packed ints reader for positions. */ - private final IntBlockPoolPackedLongsReader positionListsReader; - - /** Total number of positions in the current position slice. */ - private int numPositionsInSliceTotal; - - /** - * Number of remaining positions for {@link #currentDocID}; this value is decremented every time - * {@link #nextPosition()} is called. - */ - private int numPositionsRemainingForCurrentDocID; - - /** - * Pointer to the first int, which contains the position slice header, of the next position slice. 
- * This value is used to track which slice will be loaded when {@link #loadNextPositionSlice()} is - * called. - */ - private int nextPositionSlicePointer; - - /** - * Create a docs and positions enumerator. - */ - public HighDFPackedIntsDocsAndPositionsEnum( - IntBlockPool skipLists, - IntBlockPool deltaFreqLists, - IntBlockPool positionLists, - int postingListPointer, - int numPostings, - boolean omitPositions) { - super(skipLists, deltaFreqLists, postingListPointer, numPostings, omitPositions); - - this.positionLists = positionLists; - this.positionListsReader = new IntBlockPoolPackedLongsReader( - positionLists, - PRE_COMPUTED_VALUES, - queryCostTracker, - QueryCostTracker.CostType.LOAD_OPTIMIZED_POSTING_BLOCK); - - // Load the first position slice. - this.nextPositionSlicePointer = skipListReader.getPositionCurrentSlicePointer(); - loadNextPositionSlice(); - } - - /** - * Prepare for current doc: - * - skipping over unread positions for the current doc. - * - reset remaining positions for current doc to {@link #currentFreq}. - * - * @see #nextDocNoDel() - */ - @Override - protected void startCurrentDoc() { - // Locate next position for current doc by skipping over unread positions from the previous doc. - if (numPositionsRemainingForCurrentDocID != 0) { - int numPositionsRemainingInSlice = - numPositionsInSliceTotal - positionListsReader.getPackedValueIndex(); - while (numPositionsRemainingInSlice <= numPositionsRemainingForCurrentDocID) { - numPositionsRemainingForCurrentDocID -= numPositionsRemainingInSlice; - nextPositionSlicePointer += HighDFPackedIntsPostingLists.SLICE_SIZE; - loadNextPositionSlice(); - numPositionsRemainingInSlice = numPositionsInSliceTotal; - } - - positionListsReader.setPackedValueIndex( - positionListsReader.getPackedValueIndex() + numPositionsRemainingForCurrentDocID); - } - - // Number of remaining positions for current doc is current freq. 
- numPositionsRemainingForCurrentDocID = getCurrentFreq(); - } - - /** - * Put positions reader to the start of next position slice and reset number of bits per packed - * value for next position slice. - */ - private void loadNextPositionSlice() { - final int header = positionLists.get(nextPositionSlicePointer); - final int bitsForPosition = HighDFPackedIntsPostingLists.getNumBitsForPosition(header); - numPositionsInSliceTotal = HighDFPackedIntsPostingLists.getNumPositionsInSlice(header); - - positionListsReader.jumpToInt( - nextPositionSlicePointer + HighDFPackedIntsPostingLists.POSITION_SLICE_HEADER_SIZE, - bitsForPosition); - } - - /** - * Return next position for current doc. - * @see org.apache.lucene.index.PostingsEnum#nextPosition() - */ - @Override - public int nextPosition() throws IOException { - // Return -1 immediately if all positions are used up for current doc. - if (numPositionsRemainingForCurrentDocID == 0) { - return -1; - } - - if (positionListsReader.getPackedValueIndex() < numPositionsInSliceTotal) { - // Read next position in current slice. - final int nextPosition = (int) positionListsReader.readPackedLong(); - numPositionsRemainingForCurrentDocID--; - return nextPosition; - } else { - // All positions in current slice is used up, load next slice. - nextPositionSlicePointer += HighDFPackedIntsPostingLists.SLICE_SIZE; - loadNextPositionSlice(); - return nextPosition(); - } - } - - /** - * Set {@link #positionListsReader} to the correct location and correct number of bits per packed - * value for the delta-freq slice on which this enum is landed after skipping. - * - * @see #skipTo(int) - */ - @Override - protected void doAdditionalSkip() { - nextPositionSlicePointer = skipListReader.getPositionCurrentSlicePointer(); - loadNextPositionSlice(); - - // Locate the exact position in slice. 
- final int skipListEntryEncodedMetadata = skipListReader.getEncodedMetadataCurrentSlice(); - positionListsReader.setPackedValueIndex( - HighDFPackedIntsPostingLists.getPositionOffsetInSlice(skipListEntryEncodedMetadata)); - numPositionsRemainingForCurrentDocID = 0; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.docx new file mode 100644 index 000000000..8cbb3aa16 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.java b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.java deleted file mode 100644 index e09a2ef2b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsDocsEnum.java +++ /dev/null @@ -1,222 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -/** - * Docs and frequencies enumerator for {@link HighDFPackedIntsPostingLists}. - */ -public class HighDFPackedIntsDocsEnum extends EarlybirdOptimizedPostingsEnum { - /** - * Pre-computed shifts, masks for {@link #deltaFreqListsReader}. - * These pre-computed values should be read-only and shared across all reader threads. - * - * Notice: - * - start int indices are NOT needed since there is not jumping within a slice. - */ - private static final PackedLongsReaderPreComputedValues PRE_COMPUTED_VALUES = - new PackedLongsReaderPreComputedValues( - HighDFPackedIntsPostingLists.MAX_DOC_ID_BIT - + HighDFPackedIntsPostingLists.MAX_FREQ_BIT, - HighDFPackedIntsPostingLists.NUM_BITS_PER_SLICE, - HighDFPackedIntsPostingLists.SLICE_SIZE, - false); - - /** Packed ints reader for delta-freq pairs. */ - private final IntBlockPoolPackedLongsReader deltaFreqListsReader; - - /** Skip list reader. 
*/ - protected final HighDFPackedIntsSkipListReader skipListReader; - - /** Number of remaining docs (delta-freq pairs) in a slice. */ - private int numDocsRemaining; - - /** - * Total number of docs (delta-freq pairs) in a slice. - * This value is set every time a slice is loaded in {@link #loadNextDeltaFreqSlice()}. - */ - private int numDocsInSliceTotal; - - /** - * Number of bits used for frequency in a delta-freq slice. - * This value is set every time a slice is loaded in {@link #loadNextDeltaFreqSlice()}. - */ - private int bitsForFreq; - - /** - * Frequency mask used to extract frequency from a delta-freq pair, in a delta-freq slice. - * This value is set every time a slice is loaded in {@link #loadNextDeltaFreqSlice()}. - */ - private int freqMask; - private boolean freqBitsIsZero; - - /** - * Sole constructor. - * - * @param skipLists skip lists int block pool - * @param deltaFreqLists delta-freq lists int block pool - * @param postingListPointer pointer to the posting list for which this enumerator is created - * @param numPostings number of postings in the posting list for which this enumerator is created - * @param omitPositions whether positions are omitted in the posting list of which this enumerator - * is created - */ - public HighDFPackedIntsDocsEnum( - IntBlockPool skipLists, - IntBlockPool deltaFreqLists, - int postingListPointer, - int numPostings, - boolean omitPositions) { - super(postingListPointer, numPostings); - - // Create skip list reader and get first skip entry. - this.skipListReader = new HighDFPackedIntsSkipListReader( - skipLists, postingListPointer, omitPositions); - this.skipListReader.getNextSkipEntry(); - - // Set number of remaining docs in this posting list. - this.numDocsRemaining = skipListReader.getNumDocsTotal(); - - // Create a delta-freq pair packed values reader. 
- this.deltaFreqListsReader = new IntBlockPoolPackedLongsReader( - deltaFreqLists, - PRE_COMPUTED_VALUES, - queryCostTracker, - QueryCostTracker.CostType.LOAD_OPTIMIZED_POSTING_BLOCK); - - loadNextDeltaFreqSlice(); - loadNextPosting(); - } - - /** - * Load next delta-freq slice, return false if all docs exhausted. - * Notice!! The caller of this method should make sure the current slice is all used up and - * {@link #numDocsRemaining} is updated accordingly. - * - * @return whether a slice is loaded. - * @see #loadNextPosting() - * @see #skipTo(int) - */ - private boolean loadNextDeltaFreqSlice() { - // Load nothing if no docs are remaining. - if (numDocsRemaining == 0) { - return false; - } - - final int encodedMetadata = skipListReader.getEncodedMetadataCurrentSlice(); - final int bitsForDelta = HighDFPackedIntsPostingLists.getNumBitsForDelta(encodedMetadata); - bitsForFreq = HighDFPackedIntsPostingLists.getNumBitsForFreq(encodedMetadata); - numDocsInSliceTotal = HighDFPackedIntsPostingLists.getNumDocsInSlice(encodedMetadata); - - freqMask = (1 << bitsForFreq) - 1; - freqBitsIsZero = bitsForFreq == 0; - - // Locate and reset the reader for this slice. - final int bitsPerPackedValue = bitsForDelta + bitsForFreq; - deltaFreqListsReader.jumpToInt( - skipListReader.getDeltaFreqCurrentSlicePointer(), bitsPerPackedValue); - return true; - } - - /** - * Load next delta-freq pair from the current slice and set the computed - * {@link #nextDocID} and {@link #nextFreq}. - */ - @Override - protected final void loadNextPosting() { - assert numDocsRemaining >= (numDocsInSliceTotal - deltaFreqListsReader.getPackedValueIndex()) - : "numDocsRemaining should be equal to or greater than number of docs remaining in slice"; - - if (deltaFreqListsReader.getPackedValueIndex() < numDocsInSliceTotal) { - // Current slice is not exhausted. 
- final long nextDeltaFreqPair = deltaFreqListsReader.readPackedLong(); - - /** - * Optimization: No need to do shifts and masks if number of bits for frequency is 0. - * Also, the stored frequency is the actual frequency - 1. - * @see - * HighDFPackedIntsPostingLists#copyPostingList(org.apache.lucene.index.PostingsEnum, int) - */ - if (freqBitsIsZero) { - nextFreq = 1; - nextDocID += (int) nextDeltaFreqPair; - } else { - nextFreq = (int) ((nextDeltaFreqPair & freqMask) + 1); - nextDocID += (int) (nextDeltaFreqPair >>> bitsForFreq); - } - - numDocsRemaining--; - } else { - // Current slice is exhausted, get next skip entry and load next slice. - skipListReader.getNextSkipEntry(); - if (loadNextDeltaFreqSlice()) { - // Next slice is loaded, load next posting again. - loadNextPosting(); - } else { - // All docs are exhausted, mark this enumerator as exhausted. - assert numDocsRemaining == 0; - nextDocID = NO_MORE_DOCS; - nextFreq = 0; - } - } - } - - /** - * Skip over slices to approach the given target as close as possible. - */ - @Override - protected final void skipTo(int target) { - assert target != NO_MORE_DOCS : "Should be handled in parent class advance method"; - - int numSlicesToSkip = 0; - int numDocsToSkip = 0; - int numDocsRemainingInSlice = numDocsInSliceTotal - deltaFreqListsReader.getPackedValueIndex(); - - // Skipping over slices. - while (skipListReader.peekPreviousDocIDNextSlice() < target) { - skipListReader.getNextSkipEntry(); - nextDocID = skipListReader.getPreviousDocIDCurrentSlice(); - numDocsToSkip += numDocsRemainingInSlice; - int header = skipListReader.getEncodedMetadataCurrentSlice(); - numDocsRemainingInSlice = HighDFPackedIntsPostingLists.getNumDocsInSlice(header); - - numSlicesToSkip++; - } - - // If skipped any slices, load the new slice. 
- if (numSlicesToSkip > 0) { - numDocsRemaining -= numDocsToSkip; - final boolean hasNextSlice = loadNextDeltaFreqSlice(); - assert hasNextSlice; - assert numDocsRemaining >= numDocsInSliceTotal && numDocsInSliceTotal > 0; - - // Do additional skip for the delta freq slice that was just loaded. - doAdditionalSkip(); - - loadNextPosting(); - } - } - - /** - * Subclass should override this method if want to do additional skip on its data structure. - */ - protected void doAdditionalSkip() { - // No-op in this class. - } - - /** - * Get the largest doc ID from {@link #skipListReader}. - */ - @Override - public int getLargestDocID() throws IOException { - return skipListReader.getLargestDocID(); - } - - /** - * Return {@link #numDocsRemaining} as a proxy of cost. - * - * @see org.apache.lucene.index.PostingsEnum#cost() - */ - @Override - public long cost() { - return numDocsRemaining; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.docx new file mode 100644 index 000000000..573402bdb Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.java b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.java deleted file mode 100644 index bf92d814f..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsPostingLists.java +++ /dev/null @@ -1,829 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import javax.annotation.Nullable; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.DocIdSetIterator; - -import com.twitter.search.common.metrics.SearchCounter; -import 
com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -/** - * An optimized posting lists implementation storing doc deltas, doc freqs, and positions as packed - * ints in a 64 ints slice backed by {@link IntBlockPool}. - * - * There are three inner data structures used to store values used by a posting lists instance: - * - * - Skip lists, used for fast {@link PostingsEnum#advance(int)}, are stored in {@link #skipLists} - * int block pool. - * - Doc deltas and freqs are stored in {@link #deltaFreqLists} int block pool. - * - Positions are stored in {@link #positionLists} int block pool. - * - * For detail layout and configuration, please refer to the Javadoc of {@link #skipLists}, - * {@link #deltaFreqLists} and {@link #positionLists}. - * - * This implementation designed for posting lists with a LARGE number of postings. - * - * Acknowledgement: the concepts of slice based packed ints encoding/decoding is borrowed - * from {@code HighDFCompressedPostinglists}, which will be deprecated due - * to not supporting positions that are greater than 255. - */ -public class HighDFPackedIntsPostingLists extends OptimizedPostingLists { - /** - * A counter used to track when positions enum is required and a posting lists instance is set - * to omit positions. - * - * @see #postings(int, int, int) - */ - private static final SearchCounter GETTING_POSITIONS_WITH_OMIT_POSITIONS = - SearchCounter.export( - "high_df_packed_ints_posting_list_getting_positions_with_omit_positions"); - - /** - * Information related to size of a slice. 
- */ - static final int SLICE_SIZE_BIT = 6; - static final int SLICE_SIZE = 1 << SLICE_SIZE_BIT; // 64 ints per block - static final int NUM_BITS_PER_SLICE = SLICE_SIZE * Integer.SIZE; // 2048 bits per block - - /** - * A skip list has ONE skip list header that contains 5 ints (4 ints if positions are omitted): - * - 1st int: number of skip entries in this skip list. - * - 2nd int: largest doc ID in this posting list. - * - 3rd int: number of docs in this posting list. - * - 4th int: pointer to the start of the delta-freq list of this posting list. - * - 5th int: (OPTIONAL) pointer to the start of the position list of this posting list. - */ - static final int SKIPLIST_HEADER_SIZE = 5; - static final int SKIPLIST_HEADER_SIZE_WITHOUT_POSITIONS = SKIPLIST_HEADER_SIZE - 1; - - /** - * A skip list has MANY skip entries. Each skip entry is for one slice in delta-freq list. - * There are 3 ints in every skip entry (2 ints if positions are omitted): - * - 1st int: last doc ID in previous slice (0 for the first slice), this is mainly used during - * skipping because deltas, not absolute doc IDs, are stored in a slice. - * - 2nd int: encoded metadata of the corresponding delta-freq slice. There are 4 piece of - * information from the LOWEST bits to HIGHEST bits of this int: - * 11 bits: number of docs (delta-freq pairs) in this slice. - * 5 bits: number of bits used to encode each freq. - * 5 bits: number of bits used to encode each delta. - * 11 bits: POSITION SLICE OFFSET: an index of number of positions; this is where the - * first position of the first doc (in this delta-freq slice) is in the - * position slice. The position slice is identified by the 3rd int below. - * These two piece information uniquely identified the location of the start - * position of this delta-freq slice. This value is always 0 if position is - * omitted. 
- * - 3rd int: (OPTIONAL) POSITION SLICE INDEX: an index of of number of slices; this value - * identifies the slice in which the first position of the first doc (in this - * delta-freq slice) exists. The exact location inside the position slice is identified - * by POSITION SLICE OFFSET that is stored in the 2nd int above. - * Notice: this is not the absolute address in the block pool, but instead a relative - * offset (in number of slices) on top of this term's first position slice. - * This value DOES NOT EXIST if position is omitted. - */ - static final int SKIPLIST_ENTRY_SIZE = 3; - static final int SKIPLIST_ENTRY_SIZE_WITHOUT_POSITIONS = SKIPLIST_ENTRY_SIZE - 1; - - /** - * Shifts and masks used to encode/decode metadata from the 2nd int of a skip list entry. - * @see #SKIPLIST_ENTRY_SIZE - * @see #encodeSkipListEntryMetadata(int, int, int, int) - * @see #getNumBitsForDelta(int) - * @see #getNumBitsForFreq(int) - * @see #getNumDocsInSlice(int) - * @see #getPositionOffsetInSlice(int) - */ - static final int SKIPLIST_ENTRY_POSITION_OFFSET_SHIFT = 21; - static final int SKIPLIST_ENTRY_NUM_BITS_DELTA_SHIFT = 16; - static final int SKIPLIST_ENTRY_NUM_BITS_FREQ_SHIFT = 11; - static final int SKIPLIST_ENTRY_POSITION_OFFSET_MASK = (1 << 11) - 1; - static final int SKIPLIST_ENTRY_NUM_BITS_DELTA_MASK = (1 << 5) - 1; - static final int SKIPLIST_ENTRY_NUM_BITS_FREQ_MASK = (1 << 5) - 1; - static final int SKIPLIST_ENTRY_NUM_DOCS_MASK = (1 << 11) - 1; - - /** - * Each position slice has a header that is the 1st int in this position slice. From LOWEST bits - * to HIGHEST bits, there are 2 pieces of information encoded in this single int: - * 11 bits: number of positions in this slice. - * 5 bits: number of bits used to encode each position. - */ - static final int POSITION_SLICE_HEADER_SIZE = 1; - - /** - * Information related to size of a position slice. The actual size is the same as - * {@link #SLICE_SIZE}, but there is 1 int used for position slice header. 
- */ - static final int POSITION_SLICE_SIZE_WITHOUT_HEADER = SLICE_SIZE - POSITION_SLICE_HEADER_SIZE; - static final int POSITION_SLICE_NUM_BITS_WITHOUT_HEADER = - POSITION_SLICE_SIZE_WITHOUT_HEADER * Integer.SIZE; - - /** - * Shifts and masks used to encode/decode metadata from the position slice header. - * @see #POSITION_SLICE_HEADER_SIZE - * @see #encodePositionEntryHeader(int, int) - * @see #getNumPositionsInSlice(int) - * @see #getNumBitsForPosition(int) - */ - static final int POSITION_SLICE_HEADER_BITS_POSITION_SHIFT = 11; - static final int POSITION_SLICE_HEADER_BITS_POSITION_MASK = (1 << 5) - 1; - static final int POSITION_SLICE_HEADER_NUM_POSITIONS_MASK = (1 << 11) - 1; - - /** - * Stores skip list for each posting list. - * - * A skip list consists of ONE skip list header and MANY skip list entries, and each skip entry - * corresponds to one delta-freq slice. Also, unlike {@link #deltaFreqLists} and - * {@link #positionLists}, values in skip lists int pool are NOT stored in unit of slices. - * - * Example: - * H: skip list header int - * E: skip list entry int - * ': int boundary - * |: header/entry boundary (also a boundary of int) - * - * <----- skip list A -----> <- skip list B -> - * |H'H'H'H'H|E'E|E'E|E'E|E'E|H'H'H'H'H|E'E|E'E| - */ - private final IntBlockPool skipLists; - - /** - * Stores delta-freq list for each posting list. - * - * A delta-freq list consists of MANY 64-int slices, and delta-freq pairs are stored compactly - * with a fixed number of bits within a single slice. Each slice has a corresponding skip list - * entry in {@link #skipLists} storing metadata about this slice. - * - * Example: - * |: slice boundary - * - * <----------------- delta-freq list A -----------------> <--- delta-freq list B ---> - * |64 ints slice|64 ints slice|64 ints slice|64 ints slice|64 ints slice|64 ints slice| - */ - private final IntBlockPool deltaFreqLists; - - /** - * Stores position list for each posting list. 
- * - * A position list consists of MANY 64 ints slices, and positions are stored compactly with a - * fixed number of bits within a single slice. The first int in each slice is used as a header to - * store the metadata about this position slice. - * - * Example: - * H: position header int - * ': int boundary - * |: slice boundary - * - * <--------------- position list A ---------------> <---------- position list B ----------> - * |H'63 ints|H'63 ints|H'63 ints|H'63 ints|H'63 ints|H'63 ints|H'63 ints|H'63 ints|H'63 ints| - */ - private final IntBlockPool positionLists; - - /** - * Whether positions are omitted in this optimized posting lists. - */ - private final boolean omitPositions; - - /** - * Skip list header and entry size for this posting lists, could be different depends on whether - * position is omitted or not. - * - * @see #SKIPLIST_HEADER_SIZE - * @see #SKIPLIST_HEADER_SIZE_WITHOUT_POSITIONS - * @see #SKIPLIST_ENTRY_SIZE - * @see #SKIPLIST_ENTRY_SIZE_WITHOUT_POSITIONS - */ - private final int skipListHeaderSize; - private final int skiplistEntrySize; - - /** - * Buffer used in {@link #copyPostingList(PostingsEnum, int)} - * to queue up values needed for a slice. - * Loaded posting lists have them set as null. - */ - private final PostingsBufferQueue docFreqQueue; - private final PostingsBufferQueue positionQueue; - - /** - * Packed ints writer used to write into delta-freq int pool and position int pool. - * Loaded posting lists have them set as null. - */ - private final IntBlockPoolPackedLongsWriter deltaFreqListsWriter; - private final IntBlockPoolPackedLongsWriter positionListsWriter; - - /** - * Default constructor. - * - * @param omitPositions whether positions will be omitted in these posting lists. 
- */ - public HighDFPackedIntsPostingLists(boolean omitPositions) { - this( - new IntBlockPool("high_df_packed_ints_skip_lists"), - new IntBlockPool("high_df_packed_ints_delta_freq_lists"), - new IntBlockPool("high_df_packed_ints_position_lists"), - omitPositions, - new PostingsBufferQueue(NUM_BITS_PER_SLICE), - new PostingsBufferQueue(POSITION_SLICE_NUM_BITS_WITHOUT_HEADER)); - } - - /** - * Constructors used by loader. - * - * @param skipLists loaded int block pool represents skip lists - * @param deltaFreqLists loaded int block pool represents delta-freq lists - * @param positionLists loaded int block pool represents position lists - * @param omitPositions whether positions will be omitted in these posting lists - * @param docFreqQueue buffer used to queue up values used for a doc freq slice, null if loaded - * @param positionQueue buffer used to queue up values used for a position slice, null if loaded - * @see FlushHandler#doLoad(FlushInfo, DataDeserializer) - */ - private HighDFPackedIntsPostingLists( - IntBlockPool skipLists, - IntBlockPool deltaFreqLists, - IntBlockPool positionLists, - boolean omitPositions, - @Nullable PostingsBufferQueue docFreqQueue, - @Nullable PostingsBufferQueue positionQueue) { - this.skipLists = skipLists; - this.deltaFreqLists = deltaFreqLists; - this.positionLists = positionLists; - this.omitPositions = omitPositions; - - this.docFreqQueue = docFreqQueue; - this.positionQueue = positionQueue; - - // docFreqQueue is null if this postingLists is loaded, - // we don't need to create writer at that case. 
- if (docFreqQueue == null) { - assert positionQueue == null; - this.deltaFreqListsWriter = null; - this.positionListsWriter = null; - } else { - this.deltaFreqListsWriter = new IntBlockPoolPackedLongsWriter(deltaFreqLists); - this.positionListsWriter = new IntBlockPoolPackedLongsWriter(positionLists); - } - - if (omitPositions) { - skipListHeaderSize = SKIPLIST_HEADER_SIZE_WITHOUT_POSITIONS; - skiplistEntrySize = SKIPLIST_ENTRY_SIZE_WITHOUT_POSITIONS; - } else { - skipListHeaderSize = SKIPLIST_HEADER_SIZE; - skiplistEntrySize = SKIPLIST_ENTRY_SIZE; - } - } - - /** - * A simple wrapper around assorted states used when coping positions in a posting enum. - * @see #copyPostingList(PostingsEnum, int) - */ - private static class PositionsState { - /** Max position has been seen for the current position slice. */ - private int maxPosition = 0; - - /** Bits needed to encode/decode positions in the current position slice. */ - private int bitsNeededForPosition = 0; - - /** Total number of position slices created for current posting list. */ - private int numPositionsSlices = 0; - - /** - * Whenever a slice of doc/freq pairs is written, this will point to the first position - * associated with the first doc in the doc/freq slice. - */ - private int currentPositionsSliceIndex = 0; - private int currentPositionsSliceOffset = 0; - - /** - * Whenever a new document is processed, this points to the first position for this doc. - * This is used if this doc ends up being chosen as the first doc in a doc/freq slice. - */ - private int nextPositionsSliceIndex = 0; - private int nextPositionsSliceOffset = 0; - } - - /** - * Copies postings in the given postings enum into this posting lists instance. 
- * - * @param postingsEnum enumerator of the posting list that needs to be copied - * @param numPostings number of postings in the posting list that needs to be copied - * @return pointer to the copied posting list in this posting lists instance - */ - @Override - public int copyPostingList(PostingsEnum postingsEnum, int numPostings) throws IOException { - assert docFreqQueue.isEmpty() : "each new posting list should start with an empty queue"; - assert positionQueue.isEmpty() : "each new posting list should start with an empty queue"; - - final int skipListPointer = skipLists.length(); - final int deltaFreqListPointer = deltaFreqLists.length(); - final int positionListPointer = positionLists.length(); - assert isSliceStart(deltaFreqListPointer) : "each new posting list should start at a new slice"; - assert isSliceStart(positionListPointer) : "each new posting list should start at a new slice"; - - // Make room for skip list HEADER. - for (int i = 0; i < skipListHeaderSize; i++) { - skipLists.add(-1); - } - - int doc; - int prevDoc = 0; - int prevWrittenDoc = 0; - - int maxDelta = 0; - int maxFreq = 0; - - int bitsNeededForDelta = 0; - int bitsNeededForFreq = 0; - - // Keep tracking positions related info for this posting list. - PositionsState positionsState = new PositionsState(); - - int numDocs = 0; - int numDeltaFreqSlices = 0; - while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - numDocs++; - - int delta = doc - prevDoc; - assert delta <= MAX_DOC_ID; - - int newBitsForDelta = bitsNeededForDelta; - if (delta > maxDelta) { - maxDelta = delta; - newBitsForDelta = log(maxDelta, 2); - assert newBitsForDelta <= MAX_DOC_ID_BIT; - } - - /** - * Optimization: store freq - 1 since a freq must be positive. Save bits and improve decoding - * speed. At read side, the read frequency will plus 1. 
- * @see HighDFPackedIntsDocsEnum#loadNextPosting() - */ - int freq = postingsEnum.freq() - 1; - assert freq >= 0; - - int newBitsForFreq = bitsNeededForFreq; - if (freq > maxFreq) { - maxFreq = freq; - newBitsForFreq = log(maxFreq, 2); - assert newBitsForFreq <= MAX_FREQ_BIT; - } - - // Write positions for this doc if not omit positions. - if (!omitPositions) { - writePositionsForDoc(postingsEnum, positionsState); - } - - if ((newBitsForDelta + newBitsForFreq) * (docFreqQueue.size() + 1) > NUM_BITS_PER_SLICE) { - //The latest doc does not fit into this slice. - assert (bitsNeededForDelta + bitsNeededForFreq) * docFreqQueue.size() - <= NUM_BITS_PER_SLICE; - - prevWrittenDoc = writeDeltaFreqSlice( - bitsNeededForDelta, - bitsNeededForFreq, - positionsState, - prevWrittenDoc); - numDeltaFreqSlices++; - - maxDelta = delta; - maxFreq = freq; - bitsNeededForDelta = log(maxDelta, 2); - bitsNeededForFreq = log(maxFreq, 2); - } else { - bitsNeededForDelta = newBitsForDelta; - bitsNeededForFreq = newBitsForFreq; - } - - docFreqQueue.offer(doc, freq); - - prevDoc = doc; - } - - // Some positions may be left in the buffer queue. - if (!positionQueue.isEmpty()) { - writePositionSlice(positionsState.bitsNeededForPosition); - } - - // Some docs may be left in the buffer queue. - if (!docFreqQueue.isEmpty()) { - writeDeltaFreqSlice( - bitsNeededForDelta, - bitsNeededForFreq, - positionsState, - prevWrittenDoc); - numDeltaFreqSlices++; - } - - // Write skip list header. 
- int skipListHeaderPointer = skipListPointer; - final int numSkipListEntries = - (skipLists.length() - (skipListPointer + skipListHeaderSize)) / skiplistEntrySize; - assert numSkipListEntries == numDeltaFreqSlices - : "number of delta freq slices should be the same as number of skip list entries"; - skipLists.set(skipListHeaderPointer++, numSkipListEntries); - skipLists.set(skipListHeaderPointer++, prevDoc); - skipLists.set(skipListHeaderPointer++, numDocs); - skipLists.set(skipListHeaderPointer++, deltaFreqListPointer); - if (!omitPositions) { - skipLists.set(skipListHeaderPointer, positionListPointer); - } - - return skipListPointer; - } - - /** - * Write positions for current doc into {@link #positionLists}. - * - * @param postingsEnum postings enumerator containing the positions need to be written - * @param positionsState some states about {@link #positionLists} and {@link #positionQueue} - * @see #copyPostingList(PostingsEnum, int) - */ - private void writePositionsForDoc( - PostingsEnum postingsEnum, - PositionsState positionsState) throws IOException { - assert !omitPositions : "this method should not be called if positions are omitted"; - - for (int i = 0; i < postingsEnum.freq(); i++) { - int pos = postingsEnum.nextPosition(); - - int newBitsForPosition = positionsState.bitsNeededForPosition; - if (pos > positionsState.maxPosition) { - positionsState.maxPosition = pos; - newBitsForPosition = log(positionsState.maxPosition, 2); - assert newBitsForPosition <= MAX_POSITION_BIT; - } - - if (newBitsForPosition * (positionQueue.size() + 1) - > POSITION_SLICE_NUM_BITS_WITHOUT_HEADER - || positionQueue.isFull()) { - assert positionsState.bitsNeededForPosition * positionQueue.size() - <= POSITION_SLICE_NUM_BITS_WITHOUT_HEADER; - - writePositionSlice(positionsState.bitsNeededForPosition); - positionsState.numPositionsSlices++; - - positionsState.maxPosition = pos; - positionsState.bitsNeededForPosition = log(positionsState.maxPosition, 2); - } else { - 
positionsState.bitsNeededForPosition = newBitsForPosition; - } - - // Update first position pointer if this position is the first position of a doc - if (i == 0) { - positionsState.nextPositionsSliceIndex = positionsState.numPositionsSlices; - positionsState.nextPositionsSliceOffset = positionQueue.size(); - } - - // Stores a dummy doc -1 since doc is unused in position list. - positionQueue.offer(-1, pos); - } - } - - /** - * Write out all the buffered positions in {@link #positionQueue} into a position slice. - * - * @param bitsNeededForPosition number of bits used for each position in this position slice - */ - private void writePositionSlice(final int bitsNeededForPosition) { - assert !omitPositions; - assert 0 <= bitsNeededForPosition && bitsNeededForPosition <= MAX_POSITION_BIT; - - final int lengthBefore = positionLists.length(); - assert isSliceStart(lengthBefore); - - // First int in this slice stores number of bits needed for position - // and number of positions in this slice.. - positionLists.add(encodePositionEntryHeader(bitsNeededForPosition, positionQueue.size())); - - positionListsWriter.jumpToInt(positionLists.length(), bitsNeededForPosition); - while (!positionQueue.isEmpty()) { - int pos = PostingsBufferQueue.getSecondValue(positionQueue.poll()); - assert log(pos, 2) <= bitsNeededForPosition; - - positionListsWriter.writePackedInt(pos); - } - - // Fill up this slice in case it is only partially filled. - while (positionLists.length() < lengthBefore + SLICE_SIZE) { - positionLists.add(0); - } - - assert positionLists.length() - lengthBefore == SLICE_SIZE; - } - - /** - * Write out all the buffered docs and frequencies in {@link #docFreqQueue} into a delta-freq - * slice and update the skip list entry of this slice. 
- * - * @param bitsNeededForDelta number of bits used for each delta in this delta-freq slice - * @param bitsNeededForFreq number of bits used for each freq in this delta-freq slice - * @param positionsState some states about {@link #positionLists} and {@link #positionQueue} - * @param prevWrittenDoc last doc written in previous slice - * @return last doc written in this slice - */ - private int writeDeltaFreqSlice( - final int bitsNeededForDelta, - final int bitsNeededForFreq, - final PositionsState positionsState, - final int prevWrittenDoc) { - assert 0 <= bitsNeededForDelta && bitsNeededForDelta <= MAX_DOC_ID_BIT; - assert 0 <= bitsNeededForFreq && bitsNeededForFreq <= MAX_FREQ_BIT; - - final int lengthBefore = deltaFreqLists.length(); - assert isSliceStart(lengthBefore); - - writeSkipListEntry(prevWrittenDoc, bitsNeededForDelta, bitsNeededForFreq, positionsState); - - // Keep track of previous docID so that we compute the docID deltas. - int prevDoc = prevWrittenDoc; - - // A pair is stored as a packed value. - final int bitsPerPackedValue = bitsNeededForDelta + bitsNeededForFreq; - deltaFreqListsWriter.jumpToInt(deltaFreqLists.length(), bitsPerPackedValue); - while (!docFreqQueue.isEmpty()) { - long value = docFreqQueue.poll(); - int doc = PostingsBufferQueue.getDocID(value); - int delta = doc - prevDoc; - assert log(delta, 2) <= bitsNeededForDelta; - - int freq = PostingsBufferQueue.getSecondValue(value); - assert log(freq, 2) <= bitsNeededForFreq; - - // Cast the delta to long before left shift to avoid overflow. - final long deltaFreqPair = (((long) delta) << bitsNeededForFreq) + freq; - deltaFreqListsWriter.writePackedLong(deltaFreqPair); - prevDoc = doc; - } - - // Fill up this slice in case it is only partially filled. 
- while (deltaFreqLists.length() < lengthBefore + SLICE_SIZE) { - deltaFreqLists.add(0); - } - - positionsState.currentPositionsSliceIndex = positionsState.nextPositionsSliceIndex; - positionsState.currentPositionsSliceOffset = positionsState.nextPositionsSliceOffset; - - assert deltaFreqLists.length() - lengthBefore == SLICE_SIZE; - return prevDoc; - } - - /** - * Write the skip list entry for a delta-freq slice. - * - * @param prevWrittenDoc last doc written in previous slice - * @param bitsNeededForDelta number of bits used for each delta in this delta-freq slice - * @param bitsNeededForFreq number of bits used for each freq in this delta-freq slice - * @param positionsState some states about {@link #positionLists} and {@link #positionQueue} - * @see #writeDeltaFreqSlice(int, int, PositionsState, int) - * @see #SKIPLIST_ENTRY_SIZE - */ - private void writeSkipListEntry( - int prevWrittenDoc, - int bitsNeededForDelta, - int bitsNeededForFreq, - PositionsState positionsState) { - // 1st int: last written doc ID in previous slice - skipLists.add(prevWrittenDoc); - - // 2nd int: encoded metadata - skipLists.add( - encodeSkipListEntryMetadata( - positionsState.currentPositionsSliceOffset, - bitsNeededForDelta, - bitsNeededForFreq, - docFreqQueue.size())); - - // 3rd int: optional, position slice index - if (!omitPositions) { - skipLists.add(positionsState.currentPositionsSliceIndex); - } - } - - /** - * Create and return a docs enumerator or docs-positions enumerator based on input flag. - * - * @see org.apache.lucene.index.PostingsEnum - */ - @Override - public EarlybirdPostingsEnum postings( - int postingListPointer, int numPostings, int flags) throws IOException { - // Positions are omitted but position enumerator are requried. 
- if (omitPositions && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) { - GETTING_POSITIONS_WITH_OMIT_POSITIONS.increment(); - } - - if (!omitPositions && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS)) { - return new HighDFPackedIntsDocsAndPositionsEnum( - skipLists, - deltaFreqLists, - positionLists, - postingListPointer, - numPostings, - false); - } else { - return new HighDFPackedIntsDocsEnum( - skipLists, - deltaFreqLists, - postingListPointer, - numPostings, - omitPositions); - } - } - - /****************************************************** - * Skip list entry encoded data encoding and decoding * - ******************************************************/ - - /** - * Encode a skip list entry metadata, which is stored in the 2nd int of the skip list entry. - * - * @see #SKIPLIST_ENTRY_SIZE - */ - private static int encodeSkipListEntryMetadata( - int positionOffsetInSlice, int numBitsForDelta, int numBitsForFreq, int numDocsInSlice) { - assert 0 <= positionOffsetInSlice - && positionOffsetInSlice < POSITION_SLICE_NUM_BITS_WITHOUT_HEADER; - assert 0 <= numBitsForDelta && numBitsForDelta <= MAX_DOC_ID_BIT; - assert 0 <= numBitsForFreq && numBitsForFreq <= MAX_FREQ_BIT; - assert 0 < numDocsInSlice && numDocsInSlice <= NUM_BITS_PER_SLICE; - return (positionOffsetInSlice << SKIPLIST_ENTRY_POSITION_OFFSET_SHIFT) - + (numBitsForDelta << SKIPLIST_ENTRY_NUM_BITS_DELTA_SHIFT) - + (numBitsForFreq << SKIPLIST_ENTRY_NUM_BITS_FREQ_SHIFT) - // stores numDocsInSlice - 1 to avoid over flow since numDocsInSlice ranges in [1, 2048] - // and 11 bits are used to store number docs in slice - + (numDocsInSlice - 1); - } - - /** - * Decode POSITION_SLICE_OFFSET of the delta-freq slice having the given skip entry encoded data. 
- * - * @see #SKIPLIST_ENTRY_SIZE - */ - static int getPositionOffsetInSlice(int skipListEntryEncodedMetadata) { - return (skipListEntryEncodedMetadata >>> SKIPLIST_ENTRY_POSITION_OFFSET_SHIFT) - & SKIPLIST_ENTRY_POSITION_OFFSET_MASK; - } - - /** - * Decode number of bits used for delta in the slice having the given skip entry encoded data. - * - * @see #SKIPLIST_ENTRY_SIZE - */ - static int getNumBitsForDelta(int skipListEntryEncodedMetadata) { - return (skipListEntryEncodedMetadata >>> SKIPLIST_ENTRY_NUM_BITS_DELTA_SHIFT) - & SKIPLIST_ENTRY_NUM_BITS_DELTA_MASK; - } - - /** - * Decode number of bits used for freqs in the slice having the given skip entry encoded data. - * - * @see #SKIPLIST_ENTRY_SIZE - */ - static int getNumBitsForFreq(int skipListEntryEncodedMetadata) { - return (skipListEntryEncodedMetadata >>> SKIPLIST_ENTRY_NUM_BITS_FREQ_SHIFT) - & SKIPLIST_ENTRY_NUM_BITS_FREQ_MASK; - } - - /** - * Decode number of delta-freq pairs stored in the slice having the given skip entry encoded data. - * - * @see #SKIPLIST_ENTRY_SIZE - */ - static int getNumDocsInSlice(int skipListEntryEncodedMetadata) { - /** - * Add 1 to the decode value since the stored value is subtracted by 1. - * @see #encodeSkipListEntryMetadata(int, int, int, int) - */ - return (skipListEntryEncodedMetadata & SKIPLIST_ENTRY_NUM_DOCS_MASK) + 1; - } - - /***************************************************** - * Position slice entry header encoding and decoding * - *****************************************************/ - - /** - * Encode a position slice entry header. - * - * @param numBitsForPosition number of bits used to encode positions in this slice. - * @param numPositionsInSlice number of positions in this slice. - * @return an int as the encoded header. 
- * @see #POSITION_SLICE_HEADER_SIZE - */ - private static int encodePositionEntryHeader(int numBitsForPosition, int numPositionsInSlice) { - assert 0 <= numBitsForPosition && numBitsForPosition <= MAX_POSITION_BIT; - assert 0 < numPositionsInSlice && numPositionsInSlice <= POSITION_SLICE_NUM_BITS_WITHOUT_HEADER; - return (numBitsForPosition << POSITION_SLICE_HEADER_BITS_POSITION_SHIFT) + numPositionsInSlice; - } - - /** - * Decode number of bits used for position in the slice having the given header. - * - * @param positionEntryHeader entry header will be decoded. - * @see #POSITION_SLICE_HEADER_SIZE - */ - static int getNumBitsForPosition(int positionEntryHeader) { - return (positionEntryHeader >>> POSITION_SLICE_HEADER_BITS_POSITION_SHIFT) - & POSITION_SLICE_HEADER_BITS_POSITION_MASK; - } - - /** - * Decode number of positions stored in the slice having the given header. - * - * @param positionEntryHeader entry header will be decoded. - * @see #POSITION_SLICE_HEADER_SIZE - */ - static int getNumPositionsInSlice(int positionEntryHeader) { - return positionEntryHeader & POSITION_SLICE_HEADER_NUM_POSITIONS_MASK; - } - - /****************** - * Helper methods * - ******************/ - - /** - * Check if given pointer is pointing to the slice start. - * - * @param pointer the index will be checked. - */ - static boolean isSliceStart(int pointer) { - return pointer % HighDFPackedIntsPostingLists.SLICE_SIZE == 0; - } - - /** - * Ceil of log of x in the given base. - * - * @return x == 0 ? 
0 : Math.ceil(Math.log(x) / Math.log(base)) - */ - private static int log(int x, int base) { - assert base >= 2; - if (x == 0) { - return 0; - } - int ret = 1; - long n = base; // needs to be a long to avoid overflow - while (x >= n) { - n *= base; - ret++; - } - return ret; - } - - /********************** - * For flush and load * - **********************/ - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String OMIT_POSITIONS_PROP_NAME = "omitPositions"; - private static final String SKIP_LISTS_PROP_NAME = "skipLists"; - private static final String DELTA_FREQ_LISTS_PROP_NAME = "deltaFreqLists"; - private static final String POSITION_LISTS_PROP_NAME = "positionLists"; - - public FlushHandler() { - super(); - } - - public FlushHandler(HighDFPackedIntsPostingLists objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - HighDFPackedIntsPostingLists objectToFlush = getObjectToFlush(); - flushInfo.addBooleanProperty(OMIT_POSITIONS_PROP_NAME, objectToFlush.omitPositions); - objectToFlush.skipLists.getFlushHandler() - .flush(flushInfo.newSubProperties(SKIP_LISTS_PROP_NAME), out); - objectToFlush.deltaFreqLists.getFlushHandler() - .flush(flushInfo.newSubProperties(DELTA_FREQ_LISTS_PROP_NAME), out); - objectToFlush.positionLists.getFlushHandler() - .flush(flushInfo.newSubProperties(POSITION_LISTS_PROP_NAME), out); - } - - @Override - protected HighDFPackedIntsPostingLists doLoad( - FlushInfo flushInfo, DataDeserializer in) throws IOException { - IntBlockPool skipLists = (new IntBlockPool.FlushHandler()) - .load(flushInfo.getSubProperties(SKIP_LISTS_PROP_NAME), in); - IntBlockPool deltaFreqLists = (new IntBlockPool.FlushHandler()) - .load(flushInfo.getSubProperties(DELTA_FREQ_LISTS_PROP_NAME), in); - IntBlockPool 
positionLists = (new IntBlockPool.FlushHandler()) - .load(flushInfo.getSubProperties(POSITION_LISTS_PROP_NAME), in); - return new HighDFPackedIntsPostingLists( - skipLists, - deltaFreqLists, - positionLists, - flushInfo.getBooleanProperty(OMIT_POSITIONS_PROP_NAME), - null, - null); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.docx new file mode 100644 index 000000000..5b50bd82a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.java b/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.java deleted file mode 100644 index 7f6f04f47..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/HighDFPackedIntsSkipListReader.java +++ /dev/null @@ -1,200 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import org.apache.lucene.search.DocIdSetIterator; - -/** - * A skip list reader of a single term used {@link HighDFPackedIntsDocsEnum}. - * @see HighDFPackedIntsPostingLists - */ -class HighDFPackedIntsSkipListReader { - /** Skip lists int pool. */ - private final IntBlockPool skipLists; - - /** Whether positions are omitted in the posting list having the read skip list. */ - private final boolean omitPositions; - - /** - * Last doc in the previous slice relative to the current delta-freq slice. This value is 0 if - * the current slice is the first delta-freq slice. 
- */ - private int previousDocIDCurrentSlice; - - /** Encoded metadata of the current delta-freq slice.*/ - private int encodedMetadataCurrentSlice; - - /** - * Pointer to the first int (contains the position slice header) of the position slice that has - * the first position of the first doc in the current delta-freq slice. - */ - private int positionCurrentSliceIndex; - - /** Pointer to the first int in the current delta-freq slice. */ - private int deltaFreqCurrentSlicePointer; - - /** Data of next slice. */ - private int previousDocIDNextSlice; - private int encodedMetadataNextSlice; - private int positionNextSliceIndex; - private int deltaFreqNextSlicePointer; - - /** Used to load blocks and read ints from skip lists int pool. */ - private int[] currentSkipListBlock; - private int skipListBlockStart; - private int skipListBlockIndex; - - /** Number of remaining skip entries for the read skip list. */ - private int numSkipListEntriesRemaining; - - /** Largest doc ID in the posting list having the read skip list. */ - private final int largestDocID; - - /** Pointer to the first int in the first slice that stores positions for this term. */ - private final int positionListPointer; - - /** Total number of docs in the posting list having the read skip list. */ - private final int numDocsTotal; - - /** - * Create a skip list reader specified by the given skip list pointer in the given skip lists int - * pool. 
- * - * @param skipLists int pool where the read skip list exists - * @param skipListPointer pointer to the read skip list - * @param omitPositions whether positions are omitted in the positing list to which the read skip - * list belongs - */ - public HighDFPackedIntsSkipListReader( - final IntBlockPool skipLists, - final int skipListPointer, - final boolean omitPositions) { - this.skipLists = skipLists; - this.omitPositions = omitPositions; - - this.skipListBlockStart = IntBlockPool.getBlockStart(skipListPointer); - this.skipListBlockIndex = IntBlockPool.getOffsetInBlock(skipListPointer); - this.currentSkipListBlock = skipLists.getBlock(skipListBlockStart); - - // Read skip list header. - this.numSkipListEntriesRemaining = readNextValueFromSkipListBlock(); - this.largestDocID = readNextValueFromSkipListBlock(); - this.numDocsTotal = readNextValueFromSkipListBlock(); - int deltaFreqListPointer = readNextValueFromSkipListBlock(); - this.positionListPointer = omitPositions ? -1 : readNextValueFromSkipListBlock(); - - // Set it back by one slice for fetchNextSkipEntry() to advance correctly. - this.deltaFreqNextSlicePointer = deltaFreqListPointer - HighDFPackedIntsPostingLists.SLICE_SIZE; - fetchNextSkipEntry(); - } - - /** - * Load already fetched data in next skip entry into current data variables, and pre-fetch again. - */ - public void getNextSkipEntry() { - previousDocIDCurrentSlice = previousDocIDNextSlice; - encodedMetadataCurrentSlice = encodedMetadataNextSlice; - positionCurrentSliceIndex = positionNextSliceIndex; - deltaFreqCurrentSlicePointer = deltaFreqNextSlicePointer; - fetchNextSkipEntry(); - } - - /** - * Fetch data for next skip entry if skip list is not exhausted; otherwise, set docIDNextSlice - * to NO_MORE_DOCS. 
- */ - private void fetchNextSkipEntry() { - if (numSkipListEntriesRemaining == 0) { - previousDocIDNextSlice = DocIdSetIterator.NO_MORE_DOCS; - return; - } - - previousDocIDNextSlice = readNextValueFromSkipListBlock(); - encodedMetadataNextSlice = readNextValueFromSkipListBlock(); - if (!omitPositions) { - positionNextSliceIndex = readNextValueFromSkipListBlock(); - } - deltaFreqNextSlicePointer += HighDFPackedIntsPostingLists.SLICE_SIZE; - numSkipListEntriesRemaining--; - } - - /************************************** - * Getters of data in skip list entry * - **************************************/ - - /** - * In the context of a current slice, this is the docID of the last document in the previous - * slice (or 0 if the current slice is the first slice). - * - * @see HighDFPackedIntsPostingLists#SKIPLIST_ENTRY_SIZE - */ - public int getPreviousDocIDCurrentSlice() { - return previousDocIDCurrentSlice; - } - - /** - * Get the encoded metadata of the current delta-freq slice. - * - * @see HighDFPackedIntsPostingLists#SKIPLIST_ENTRY_SIZE - */ - public int getEncodedMetadataCurrentSlice() { - return encodedMetadataCurrentSlice; - } - - /** - * Get the pointer to the first int, WHICH CONTAINS THE POSITION SLICE HEADER, of the position - * slice that contains the first position of the first doc in the delta-freq slice that - * is corresponding to the current skip list entry. - * - * @see HighDFPackedIntsPostingLists#SKIPLIST_ENTRY_SIZE - */ - public int getPositionCurrentSlicePointer() { - assert !omitPositions; - return positionListPointer - + positionCurrentSliceIndex * HighDFPackedIntsPostingLists.SLICE_SIZE; - } - - /** - * Get the pointer to the first int in the current delta-freq slice. - */ - public int getDeltaFreqCurrentSlicePointer() { - return deltaFreqCurrentSlicePointer; - } - - /** - * In the context of next slice, get the last doc ID in the previous slice. This is used to skip - * over slices. 
- * - * @see HighDFPackedIntsDocsEnum#skipTo(int) - */ - public int peekPreviousDocIDNextSlice() { - return previousDocIDNextSlice; - } - - /*************************************** - * Getters of data in skip list header * - ***************************************/ - - public int getLargestDocID() { - return largestDocID; - } - - public int getNumDocsTotal() { - return numDocsTotal; - } - - /*************************************************** - * Methods helping loading int block and read ints * - ***************************************************/ - - private int readNextValueFromSkipListBlock() { - if (skipListBlockIndex == IntBlockPool.BLOCK_SIZE) { - loadSkipListBlock(); - } - return currentSkipListBlock[skipListBlockIndex++]; - } - - private void loadSkipListBlock() { - skipListBlockStart += IntBlockPool.BLOCK_SIZE; - currentSkipListBlock = skipLists.getBlock(skipListBlockStart); - skipListBlockIndex = 0; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.docx new file mode 100644 index 000000000..77cd4eb79 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.java b/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.java deleted file mode 100644 index dad877614..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/InMemoryFields.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - -import org.apache.lucene.index.Fields; -import org.apache.lucene.index.Terms; - -public class InMemoryFields extends Fields { - private final Map termsCache = new HashMap<>(); - private final Map perFields; - private final Map pointerIndex; - - /** - * 
Returns a new {@link Fields} instance for the provided {@link InvertedIndex}es. - */ - public InMemoryFields(Map perFields, - Map pointerIndex) { - this.perFields = perFields; - this.pointerIndex = pointerIndex; - } - - @Override - public Iterator iterator() { - return perFields.keySet().iterator(); - } - - @Override - public Terms terms(String field) { - InvertedIndex invertedIndex = perFields.get(field); - if (invertedIndex == null) { - return null; - } - - return termsCache.computeIfAbsent(invertedIndex, - index -> index.createTerms(pointerIndex.getOrDefault(invertedIndex, -1))); - } - - @Override - public int size() { - return perFields.size(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.docx new file mode 100644 index 000000000..800f771e9 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.java b/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.java deleted file mode 100644 index 3fc082a47..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/IndexOptimizer.java +++ /dev/null @@ -1,201 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.base.Schema; -import 
com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetUtil; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentData; -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.column.DocValuesManager; - -public final class IndexOptimizer { - private static final Logger LOG = LoggerFactory.getLogger(IndexOptimizer.class); - - private IndexOptimizer() { - } - - /** - * Optimizes this in-memory index segment. - */ - public static EarlybirdRealtimeIndexSegmentData optimize( - EarlybirdRealtimeIndexSegmentData source) throws IOException { - LOG.info("Starting index optimizing."); - - ConcurrentHashMap targetMap = new ConcurrentHashMap<>(); - LOG.info(String.format( - "Source PerFieldMap size is %d", source.getPerFieldMap().size())); - - LOG.info("Optimize doc id mapper."); - // Optimize the doc ID mapper first. - DocIDToTweetIDMapper originalTweetIdMapper = source.getDocIDToTweetIDMapper(); - DocIDToTweetIDMapper optimizedTweetIdMapper = originalTweetIdMapper.optimize(); - - TimeMapper optimizedTimeMapper = - source.getTimeMapper() != null - ? source.getTimeMapper().optimize(originalTweetIdMapper, optimizedTweetIdMapper) - : null; - - // Some fields have their terms rewritten to support the minimal perfect hash function we use - // (note that it's a minimal perfect hash function, not a minimal perfect hash _table_). - // The FacetCountingArray stores term IDs. This is a map from the facet field ID to a map from - // original term ID to the new, MPH term IDs. 
- Map termIDMapper = new HashMap<>(); - - LOG.info("Optimize inverted indexes."); - optimizeInvertedIndexes( - source, targetMap, originalTweetIdMapper, optimizedTweetIdMapper, termIDMapper); - - LOG.info("Rewrite and map ids in facet counting array."); - AbstractFacetCountingArray facetCountingArray = source.getFacetCountingArray().rewriteAndMapIDs( - termIDMapper, originalTweetIdMapper, optimizedTweetIdMapper); - - Map facetLabelProviders = - FacetUtil.getFacetLabelProviders(source.getSchema(), targetMap); - - LOG.info("Optimize doc values manager."); - DocValuesManager optimizedDocValuesManager = - source.getDocValuesManager().optimize(originalTweetIdMapper, optimizedTweetIdMapper); - - LOG.info("Optimize deleted docs."); - DeletedDocs optimizedDeletedDocs = - source.getDeletedDocs().optimize(originalTweetIdMapper, optimizedTweetIdMapper); - - final boolean isOptimized = true; - return new EarlybirdRealtimeIndexSegmentData( - source.getMaxSegmentSize(), - source.getTimeSliceID(), - source.getSchema(), - isOptimized, - optimizedTweetIdMapper.getNextDocID(Integer.MIN_VALUE), - targetMap, - facetCountingArray, - optimizedDocValuesManager, - facetLabelProviders, - source.getFacetIDMap(), - optimizedDeletedDocs, - optimizedTweetIdMapper, - optimizedTimeMapper, - source.getIndexExtensionsData()); - } - - private static void optimizeInvertedIndexes( - EarlybirdRealtimeIndexSegmentData source, - ConcurrentHashMap targetMap, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper, - Map termIDMapper - ) throws IOException { - for (Map.Entry entry : source.getPerFieldMap().entrySet()) { - String fieldName = entry.getKey(); - Preconditions.checkState(entry.getValue() instanceof InvertedRealtimeIndex); - InvertedRealtimeIndex sourceIndex = (InvertedRealtimeIndex) entry.getValue(); - EarlybirdFieldType fieldType = source.getSchema().getFieldInfo(fieldName).getFieldType(); - - InvertedIndex newIndex; - if (fieldType.becomesImmutable() && 
sourceIndex.getNumTerms() > 0) { - Schema.FieldInfo facetField = source.getSchema().getFacetFieldByFieldName(fieldName); - - newIndex = new OptimizedMemoryIndex( - fieldType, - fieldName, - sourceIndex, - termIDMapper, - source.getFacetIDMap().getFacetField(facetField), - originalTweetIdMapper, - optimizedTweetIdMapper); - } else { - newIndex = optimizeMutableIndex( - fieldType, - fieldName, - sourceIndex, - originalTweetIdMapper, - optimizedTweetIdMapper); - } - - targetMap.put(fieldName, newIndex); - } - } - - /** - * Optimize a mutable index. - */ - private static InvertedIndex optimizeMutableIndex( - EarlybirdFieldType fieldType, - String fieldName, - InvertedRealtimeIndex originalIndex, - DocIDToTweetIDMapper originalMapper, - DocIDToTweetIDMapper optimizedMapper - ) throws IOException { - Preconditions.checkState(!fieldType.isStorePerPositionPayloads()); - TermsEnum allTerms = originalIndex.createTermsEnum(originalIndex.getMaxPublishedPointer()); - - int numTerms = originalIndex.getNumTerms(); - - InvertedRealtimeIndex index = new InvertedRealtimeIndex( - fieldType, - TermPointerEncoding.DEFAULT_ENCODING, - fieldName); - index.setNumDocs(originalIndex.getNumDocs()); - - for (int termID = 0; termID < numTerms; termID++) { - allTerms.seekExact(termID); - PostingsEnum postingsEnum = new OptimizingPostingsEnumWrapper( - allTerms.postings(null), originalMapper, optimizedMapper); - - BytesRef termPayload = originalIndex.getLabelAccessor().getTermPayload(termID); - copyPostingList(index, postingsEnum, termID, allTerms.term(), termPayload); - } - return index; - } - - - /** - * Copies the given posting list into these posting lists. 
- * - * @param postingsEnum enumerator of the posting list that needs to be copied - */ - private static void copyPostingList( - InvertedRealtimeIndex index, - PostingsEnum postingsEnum, - int termID, - BytesRef term, - BytesRef termPayload - ) throws IOException { - int docId; - while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - index.incrementSumTermDocFreq(); - for (int i = 0; i < postingsEnum.freq(); i++) { - index.incrementSumTotalTermFreq(); - int position = postingsEnum.nextPosition(); - int newTermID = InvertedRealtimeIndexWriter.indexTerm( - index, - term, - docId, - position, - termPayload, - null, // We know that fields that remain mutable never have a posting payload. - TermPointerEncoding.DEFAULT_ENCODING); - - // Our term lookups are very slow, so we cache term dictionaries for some fields across many - // segments, so we must keep the term IDs the same while remapping. - Preconditions.checkState(newTermID == termID); - } - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.docx new file mode 100644 index 000000000..c0f051040 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.java b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.java deleted file mode 100644 index bf85c8765..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPool.java +++ /dev/null @@ -1,225 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Arrays; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import 
com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -// Modeled after TwitterCharBlockPool, with a lot of simplification. -public class IntBlockPool implements Flushable { - private static final SearchLongGauge INT_BLOCK_POOL_MAX_LENGTH = - SearchLongGauge.export("twitter_int_block_pool_max_size"); - private static final String STAT_PREFIX = "twitter_int_block_pool_size_"; - - private static final int BLOCK_SHIFT = 14; - public static final int BLOCK_SIZE = 1 << BLOCK_SHIFT; - private static final int BLOCK_MASK = BLOCK_SIZE - 1; - - // We can address up to 2^31 elements with an int. We use 1 << 14 bits for the block offset, - // so we can use the remaining 17 bits for the blocks index. Therefore the maximum number of - // addressable blocks is 1 << 17 or maxInt >> 14. - private static final int MAX_NUM_BLOCKS = Integer.MAX_VALUE >> BLOCK_SHIFT; - - // Initial value written into the blocks. - private final int initialValue; - - // Extra object with final array is necessary to guarantee visibility - // to other threads without synchronization / volatiles. See comment - // in TwitterCharBlockPool. - public static final class Pool { - public final int[][] blocks; - Pool(int[][] blocks) { - this.blocks = blocks; - - // Adjust max size if exceeded maximum value. - synchronized (INT_BLOCK_POOL_MAX_LENGTH) { - if (this.blocks != null) { - final long currentSize = (long) (this.blocks.length * BLOCK_SIZE); - if (currentSize > INT_BLOCK_POOL_MAX_LENGTH.get()) { - INT_BLOCK_POOL_MAX_LENGTH.set(currentSize); - } - } - } - } - } - public Pool pool; - - private int currBlockIndex; // Index into blocks array. - private int[] currBlock = null; - private int currBlockOffset; // Index into current block. 
- private final String poolName; - private final SearchLongGauge sizeGauge; - - public IntBlockPool(String poolName) { - this(0, poolName); - } - - public IntBlockPool(int initialValue, String poolName) { - // Start with room for 16 initial blocks (does not allocate these blocks). - this.pool = new Pool(new int[16][]); - this.initialValue = initialValue; - - // Start at the end of a previous, non-existent blocks. - this.currBlockIndex = -1; - this.currBlock = null; - this.currBlockOffset = BLOCK_SIZE; - this.poolName = poolName; - this.sizeGauge = createGauge(poolName, pool); - } - - // Constructor for FlushHandler. - protected IntBlockPool( - int currBlockIndex, - int currBlockOffset, - int[][]blocks, - String poolName) { - this.initialValue = 0; - this.pool = new Pool(blocks); - this.currBlockIndex = currBlockIndex; - this.currBlockOffset = currBlockOffset; - if (currBlockIndex >= 0) { - this.currBlock = this.pool.blocks[currBlockIndex]; - } - this.poolName = poolName; - this.sizeGauge = createGauge(poolName, pool); - } - - private static SearchLongGauge createGauge(String suffix, Pool pool) { - SearchLongGauge gauge = SearchLongGauge.export(STAT_PREFIX + suffix); - if (pool.blocks != null) { - gauge.set(pool.blocks.length * BLOCK_SIZE); - } - return gauge; - } - - /** - * Adds an int to the current block and returns it's overall index. - */ - public int add(int value) { - if (currBlockOffset == BLOCK_SIZE) { - newBlock(); - } - currBlock[currBlockOffset++] = value; - return (currBlockIndex << BLOCK_SHIFT) + currBlockOffset - 1; - } - - // Returns number of ints in this blocks - public int length() { - return currBlockOffset + currBlockIndex * BLOCK_SIZE; - } - - // Gets an int from the specified index. 
- public final int get(int index) { - return getBlock(index)[getOffsetInBlock(index)]; - } - - public static int getBlockStart(int index) { - return (index >>> BLOCK_SHIFT) * BLOCK_SIZE; - } - - public static int getOffsetInBlock(int index) { - return index & BLOCK_MASK; - } - - public final int[] getBlock(int index) { - final int blockIndex = index >>> BLOCK_SHIFT; - return pool.blocks[blockIndex]; - } - - // Sets an int value at the specified index. - public void set(int index, int value) { - final int blockIndex = index >>> BLOCK_SHIFT; - final int offset = index & BLOCK_MASK; - pool.blocks[blockIndex][offset] = value; - } - - /** - * Evaluates whether two instances of IntBlockPool are equal by value. It is - * slow because it has to check every element in the pool. - */ - @VisibleForTesting - public boolean verySlowEqualsForTests(IntBlockPool that) { - if (length() != that.length()) { - return false; - } - - for (int i = 0; i < length(); i++) { - if (get(i) != that.get(i)) { - return false; - } - } - - return true; - } - - private void newBlock() { - final int newBlockIndex = 1 + currBlockIndex; - if (newBlockIndex >= MAX_NUM_BLOCKS) { - throw new RuntimeException( - "Too many blocks, would overflow int index for blocks " + poolName); - } - if (newBlockIndex == pool.blocks.length) { - // Blocks array is too small to add a new block. Resize. 
- int[][] newBlocks = new int[pool.blocks.length * 2][]; - System.arraycopy(pool.blocks, 0, newBlocks, 0, pool.blocks.length); - pool = new Pool(newBlocks); - - sizeGauge.set(pool.blocks.length * BLOCK_SIZE); - } - - currBlock = pool.blocks[newBlockIndex] = allocateBlock(); - currBlockOffset = 0; - currBlockIndex = newBlockIndex; - } - - private int[] allocateBlock() { - int[] block = new int[BLOCK_SIZE]; - Arrays.fill(block, initialValue); - return block; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String CURRENT_BLOCK_INDEX_PROP_NAME = "currentBlockIndex"; - private static final String CURRENT_BLOCK_OFFSET_PROP_NAME = "currentBlockOffset"; - private static final String POOL_NAME = "poolName"; - - public FlushHandler() { - super(); - } - - public FlushHandler(IntBlockPool objToFlush) { - super(objToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - IntBlockPool pool = getObjectToFlush(); - flushInfo.addIntProperty(CURRENT_BLOCK_INDEX_PROP_NAME, pool.currBlockIndex); - flushInfo.addIntProperty(CURRENT_BLOCK_OFFSET_PROP_NAME, pool.currBlockOffset); - flushInfo.addStringProperty(POOL_NAME, pool.poolName); - out.writeIntArray2D(pool.pool.blocks, pool.currBlockIndex + 1); - } - - @Override - protected IntBlockPool doLoad(FlushInfo flushInfo, DataDeserializer in) throws IOException { - String poolName = flushInfo.getStringProperty(POOL_NAME); - return new IntBlockPool( - flushInfo.getIntProperty(CURRENT_BLOCK_INDEX_PROP_NAME), - flushInfo.getIntProperty(CURRENT_BLOCK_OFFSET_PROP_NAME), - in.readIntArray2D(), - poolName); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.docx 
new file mode 100644 index 000000000..d3dc72592 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.java b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.java deleted file mode 100644 index 5edf92f77..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsReader.java +++ /dev/null @@ -1,253 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import javax.annotation.Nullable; - -/** - * A packed ints reader reading packed values (int/long) written in {@link IntBlockPool}. - * @see IntBlockPoolPackedLongsWriter - * - * A standard usage would be : - * - set reader at an int block pool pointer and number of bits per packed value: - * {@link #jumpToInt(int, int)}} - * - read: {@link #readPackedLong()} - * - * Example usage: - * @see HighDFPackedIntsDocsEnum - * @see HighDFPackedIntsDocsAndPositionsEnum - */ -public final class IntBlockPoolPackedLongsReader { - /** - * Mask used to convert an int to a long. We cannot just cast because it will fill in the higher - * 32 bits with the sign bit, but we need the higher 32 bits to be 0 instead. - */ - private static final long LONG_MASK = 0xFFFFFFFFL; - - /** The int block pool from which packed ints will be read. */ - private final IntBlockPool intBlockPool; - - /** Pre-computed shifts, masks, and start int indices used to decode packed ints. */ - private final PackedLongsReaderPreComputedValues preComputedValues; - - /** - * The underlying {@link #intBlockPool} will be read block by blocks. The current read - * block will be identified by {@link #startPointerForCurrentBlock} and assigned to - * {@link #currentBlock}. {@link #indexInCurrentBlock} will be used access values from the - * {@link #currentBlock}. 
- */ - private int[] currentBlock; - private int indexInCurrentBlock; - private int startPointerForCurrentBlock = -1; - - /** - * Whether the decoded packed values are spanning more than 1 int. - * @see #readPackedLong() - */ - private boolean packedValueNeedsLong; - - /** - * Masks used to extract packed values. - * @see #readPackedLong() - */ - private long packedValueMask; - - /** PRE-COMPUTED: The index of the first int that has a specific packed values. */ - private int[] packedValueStartIndices; - - /** PRE-COMPUTED: The shifts and masks used to decode packed values. */ - private int[] packedValueLowBitsRightShift; - private int[] packedValueMiddleBitsLeftShift; - private int[] packedValueMiddleBitsMask; - private int[] packedValueHighBitsLeftShift; - private int[] packedValueHighBitsMask; - - /** Index of packed values. */ - private int packedValueIndex; - - /** - * The {@link #indexInCurrentBlock} and {@link #startPointerForCurrentBlock} of the first int - * that holds packed values. This two values together uniquely form a int block pool pointer - * --- {@link #packedValueStartBlockStart} + {@link #packedValueStartBlockIndex} --- that points - * to the first int that has pointer. - * - * @see #jumpToInt(int, int) - */ - private int packedValueStartBlockIndex; - private int packedValueStartBlockStart; - - /** Current int read from {@link #currentBlock}. */ - private int currentInt; - - /** - * If given, query cost will be tracked every time a int block is loaded. - * @see #loadNextBlock() - */ - private final QueryCostTracker queryCostTracker; - private final QueryCostTracker.CostType queryCostType; - - /** - * Default constructor. 
- * - * @param intBlockPool from which packed ints will be read - * @param preComputedValues pre-computed shifts, masks, and start int - * @param queryCostTracker optional, query cost tracker used while loading a new block - * @param queryCostType optional, query cost type will be tracked while loading a new block - */ - public IntBlockPoolPackedLongsReader( - IntBlockPool intBlockPool, - PackedLongsReaderPreComputedValues preComputedValues, - @Nullable QueryCostTracker queryCostTracker, - @Nullable QueryCostTracker.CostType queryCostType) { - this.intBlockPool = intBlockPool; - this.preComputedValues = preComputedValues; - - // For query cost tracking. - this.queryCostTracker = queryCostTracker; - this.queryCostType = queryCostType; - } - - /** - * Constructor with {@link #queryCostTracker} and {@link #queryCostType} set to null. - * - * @param intBlockPool from which packed ints will be read - * @param preComputedValues pre-computed shifts, masks, and start int - */ - public IntBlockPoolPackedLongsReader( - IntBlockPool intBlockPool, - PackedLongsReaderPreComputedValues preComputedValues) { - this(intBlockPool, preComputedValues, null, null); - } - - /** - * 1. Set the reader to starting reading at the given int block pool pointer. Correct block will - * be loaded if the given pointer points to the different block than {@link #currentBlock}. - * 2. Update shifts, masks, and start int indices based on given number of bits per packed value. - * 3. Reset packed value sequence start data. - * - * @param intBlockPoolPointer points to the int from which this reader will start reading - * @param bitsPerPackedValue number of bits per packed value. - */ - public void jumpToInt(int intBlockPoolPointer, int bitsPerPackedValue) { - assert bitsPerPackedValue <= Long.SIZE; - - // Update indexInCurrentBlock and load a different index if needed. 
- int newBlockStart = IntBlockPool.getBlockStart(intBlockPoolPointer); - indexInCurrentBlock = IntBlockPool.getOffsetInBlock(intBlockPoolPointer); - - if (startPointerForCurrentBlock != newBlockStart) { - startPointerForCurrentBlock = newBlockStart; - loadNextBlock(); - } - - // Re-set shifts, masks, and start int indices for the given number bits per packed value. - packedValueNeedsLong = bitsPerPackedValue > Integer.SIZE; - packedValueMask = - bitsPerPackedValue == Long.SIZE ? 0xFFFFFFFFFFFFFFFFL : (1L << bitsPerPackedValue) - 1; - packedValueStartIndices = preComputedValues.getStartIntIndices(bitsPerPackedValue); - packedValueLowBitsRightShift = preComputedValues.getLowBitsRightShift(bitsPerPackedValue); - packedValueMiddleBitsLeftShift = preComputedValues.getMiddleBitsLeftShift(bitsPerPackedValue); - packedValueMiddleBitsMask = preComputedValues.getMiddleBitsMask(bitsPerPackedValue); - packedValueHighBitsLeftShift = preComputedValues.getHighBitsLeftShift(bitsPerPackedValue); - packedValueHighBitsMask = preComputedValues.getHighBitsMask(bitsPerPackedValue); - - // Update packed values sequence start data. - packedValueIndex = 0; - packedValueStartBlockIndex = indexInCurrentBlock; - packedValueStartBlockStart = startPointerForCurrentBlock; - - // Load an int to prepare for readPackedLong. - loadInt(); - } - - /** - * Read next packed value as a long. - * - * Caller could cast the returned long to an int if needed. - * NOTICE! Be careful of overflow while casting a long to an int. - * - * @return next packed value in a long. 
- */ - public long readPackedLong() { - long packedValue; - - if (packedValueNeedsLong) { - packedValue = - (LONG_MASK & currentInt) - >>> packedValueLowBitsRightShift[packedValueIndex] & packedValueMask; - packedValue |= - (LONG_MASK & loadInt() - & packedValueMiddleBitsMask[packedValueIndex]) - << packedValueMiddleBitsLeftShift[packedValueIndex]; - if (packedValueHighBitsLeftShift[packedValueIndex] != 0) { - packedValue |= - (LONG_MASK & loadInt() - & packedValueHighBitsMask[packedValueIndex]) - << packedValueHighBitsLeftShift[packedValueIndex]; - } - } else { - packedValue = - currentInt >>> packedValueLowBitsRightShift[packedValueIndex] & packedValueMask; - if (packedValueMiddleBitsLeftShift[packedValueIndex] != 0) { - packedValue |= - (loadInt() - & packedValueMiddleBitsMask[packedValueIndex]) - << packedValueMiddleBitsLeftShift[packedValueIndex]; - } - } - - packedValueIndex++; - return packedValue; - } - - /** - * A simple getter of {@link #packedValueIndex}. - */ - public int getPackedValueIndex() { - return packedValueIndex; - } - - /** - * A setter of {@link #packedValueIndex}. This setter will also set the correct - * {@link #indexInCurrentBlock} based on {@link #packedValueStartIndices}. - */ - public void setPackedValueIndex(int packedValueIndex) { - this.packedValueIndex = packedValueIndex; - this.indexInCurrentBlock = - packedValueStartBlockIndex + packedValueStartIndices[packedValueIndex]; - this.startPointerForCurrentBlock = packedValueStartBlockStart; - loadInt(); - } - - /************************** - * Private Helper Methods * - **************************/ - - /** - * Load a new int block, specified by {@link #startPointerForCurrentBlock}, from - * {@link #intBlockPool}. If {@link #queryCostTracker} is given, query cost with type - * {@link #queryCostType} will be tracked as well. 
- */ - private void loadNextBlock() { - if (queryCostTracker != null) { - assert queryCostType != null; - queryCostTracker.track(queryCostType); - } - - currentBlock = intBlockPool.getBlock(startPointerForCurrentBlock); - } - - /** - * Load an int from {@link #currentBlock}. The loaded int will be returned as well. - * If the {@link #currentBlock} is used up, next block will be automatically loaded. - */ - private int loadInt() { - while (indexInCurrentBlock >= IntBlockPool.BLOCK_SIZE) { - startPointerForCurrentBlock += IntBlockPool.BLOCK_SIZE; - loadNextBlock(); - - indexInCurrentBlock = Math.max(indexInCurrentBlock - IntBlockPool.BLOCK_SIZE, 0); - } - - currentInt = currentBlock[indexInCurrentBlock++]; - return currentInt; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.docx new file mode 100644 index 000000000..f858ad7e5 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.java b/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.java deleted file mode 100644 index 320be6650..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/IntBlockPoolPackedLongsWriter.java +++ /dev/null @@ -1,166 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -/** - * A packed ints writer writing packed values (int/long) into {@link IntBlockPool}. 
- * @see IntBlockPoolPackedLongsReader - * - * A standard useage would be: - * - set writer at an int block pool pointer and number of bits per packed value: - * {@link #jumpToInt(int, int)} - * - write: {@link #writePackedInt(int)} or {@link #writePackedLong(long)} - * - * Example usage: - * @see HighDFPackedIntsPostingLists - */ -public final class IntBlockPoolPackedLongsWriter { - /** - * Mask used to convert an int to a long. We cannot just cast because it will fill in the higher - * 32 bits with the sign bit, but we need the higher 32 bits to be 0 instead. - */ - private static final long LONG_MASK = 0xFFFFFFFFL; - - /** The int block pool into which packed ints will be written. */ - private final IntBlockPool intBlockPool; - - /** The value in the current position in the int block pool. */ - private int currentIntValue = 0; - - /** Starting bit index of unused bits in {@link #currentIntValue}. */ - private int currentIntBitIndex = 0; - - /** Pointer of {@link #currentIntValue} in {@link #intBlockPool}. */ - private int currentIntPointer = -1; - - /** - * Number of bits per packed value that will be written with - * {@link #writePackedInt(int)} or {@link #writePackedLong(long)}. - */ - private int numBitsPerPackedValue = -1; - - /** - * Mask used to extract the lower {@link #numBitsPerPackedValue} in a given value. - */ - private long packedValueBitsMask = 0; - - /** - * Sole constructor. - * - * @param intBlockPool into which packed ints will be written - */ - public IntBlockPoolPackedLongsWriter(IntBlockPool intBlockPool) { - this.intBlockPool = intBlockPool; - } - - /** - * 1. Set this writer to start writing at the given int block pool pointer. - * 2. Set number of bits per packed value that will be write. - * 3. Re-set {@link #currentIntValue} and {@link #currentIntBitIndex} to 0. - * - * @param intBlockPoolPointer the position this writer should start writing packed values. This - * pointer must be less then or equal to he length of the block pool. 
- * Subsequent writes will {@link IntBlockPool#add(int)} to the - * end of the int block pool if the given pointer equals to the length. - * @param bitsPerPackedValue must be non-negative. - */ - public void jumpToInt(int intBlockPoolPointer, int bitsPerPackedValue) { - assert intBlockPoolPointer <= intBlockPool.length(); - assert bitsPerPackedValue >= 0; - - // Set the writer to start writing at the given int block pool pointer. - this.currentIntPointer = intBlockPoolPointer; - - // Set number of bits that will be write per packed value. - this.numBitsPerPackedValue = bitsPerPackedValue; - - // Compute the mask used to extract lower number of bitsPerPackedValue. - this.packedValueBitsMask = - bitsPerPackedValue == Long.SIZE ? -1L : (1L << bitsPerPackedValue) - 1; - - // Reset current int data to 0. - this.currentIntValue = 0; - this.currentIntBitIndex = 0; - } - - /** - * The given int value will be ZERO extended to a long and written using - * {@link #writePackedValueInternal(long)} (long)}. - * - * @see #LONG_MASK - */ - public void writePackedInt(final int value) { - assert numBitsPerPackedValue <= Integer.SIZE; - writePackedValueInternal(LONG_MASK & value); - } - - /** - * Write a long value. - * The given long value must bu UNABLE to fit in an int. - */ - public void writePackedLong(final long value) { - assert numBitsPerPackedValue <= Long.SIZE; - writePackedValueInternal(value); - } - - /************************* - * Private Helper Method * - *************************/ - - /** - * Write the given number of bits of the given value into this int pool as a packed int. - * - * @param value value will be written - */ - private void writePackedValueInternal(final long value) { - // Extract lower 'numBitsPerPackedValue' from the given value. 
- long val = value & packedValueBitsMask; - - assert val == value : String.format( - "given value %d needs more bits than specified %d", value, numBitsPerPackedValue); - - int numBitsWrittenCurIter; - int numBitsRemaining = numBitsPerPackedValue; - - // Each iteration of this while loop is writing part of the given value. - while (numBitsRemaining > 0) { - // Write into 'currentIntValue' int. - currentIntValue |= val << currentIntBitIndex; - - // Calculate number of bits have been written in this iteration, - // we either used up all the remaining bits in 'currentIntValue' or - // finished up writing the value, whichever is smaller. - numBitsWrittenCurIter = Math.min(Integer.SIZE - currentIntBitIndex, numBitsRemaining); - - // Number of bits remaining should be decremented. - numBitsRemaining -= numBitsWrittenCurIter; - - // Right shift the value to remove the bits have been written. - val >>>= numBitsWrittenCurIter; - - // Update bit index in current int. - currentIntBitIndex += numBitsWrittenCurIter; - assert currentIntBitIndex <= Integer.SIZE; - - flush(); - - // if 'currentIntValue' int is used up. - if (currentIntBitIndex == Integer.SIZE) { - currentIntPointer++; - - currentIntValue = 0; - currentIntBitIndex = 0; - } - } - } - - /** - * Flush the {@link #currentIntValue} int into the int pool if the any bits of the int are used. 
- */ - private void flush() { - if (currentIntPointer == intBlockPool.length()) { - intBlockPool.add(currentIntValue); - assert currentIntPointer + 1 == intBlockPool.length(); - } else { - intBlockPool.set(currentIntPointer, currentIntValue); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.docx new file mode 100644 index 000000000..6669aec50 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.java b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.java deleted file mode 100644 index 6e4b79250..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedIndex.java +++ /dev/null @@ -1,144 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * Inverted index for a single field. - * - * Example: The field is "hashtags", this index contains a mapping from all the hashtags - * that we've seen to a list of postings. - */ -public abstract class InvertedIndex implements FacetLabelProvider, Flushable { - protected final EarlybirdFieldType fieldType; - - public InvertedIndex(EarlybirdFieldType fieldType) { - this.fieldType = fieldType; - } - - public EarlybirdFieldType getFieldType() { - return fieldType; - } - - /** - * Get the internal doc id of the oldest doc that includes term. 
- * @param term the term to look for. - * @return The internal docid, or TERM_NOT_FOUND. - */ - public final int getLargestDocIDForTerm(BytesRef term) throws IOException { - final int termID = lookupTerm(term); - return getLargestDocIDForTerm(termID); - } - - /** - * Get the document frequency for this term. - * @param term the term to look for. - * @return The document frequency of this term in the index. - */ - public final int getDF(BytesRef term) throws IOException { - final int termID = lookupTerm(term); - if (termID == EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - return 0; - } - return getDF(termID); - } - - public boolean hasMaxPublishedPointer() { - return false; - } - - public int getMaxPublishedPointer() { - return -1; - } - - /** - * Create the Lucene magic Terms accessor. - * @param maxPublishedPointer used by the skip list to enable atomic document updates. - * @return a new Terms object. - */ - public abstract Terms createTerms(int maxPublishedPointer); - - /** - * Create the Lucene magic TermsEnum accessor. - * @param maxPublishedPointer used by the skip list to enable atomic document updates. - * @return a new TermsEnum object. - */ - public abstract TermsEnum createTermsEnum(int maxPublishedPointer); - - /** - * Returns the number of distinct terms in this inverted index. - * For example, if the indexed documents are: - * "i love chocolate and i love cakes" - * "i love cookies" - * - * then this method will return 6, because there are 6 distinct terms: - * i, love, chocolate, and, cakes, cookies - */ - public abstract int getNumTerms(); - - /** - * Returns the number of distinct documents in this index. - */ - public abstract int getNumDocs(); - - /** - * Returns the total number of postings in this inverted index. - * - * For example, if the indexed documents are: - * "i love chocolate and i love cakes" - * "i love cookies" - * - * then this method will return 10, because there's a total of 10 words in these 2 documents. 
- */ - public abstract int getSumTotalTermFreq(); - - /** - * Returns the sum of the number of documents for each term in this index. - * - * For example, if the indexed documents are: - * "i love chocolate and i love cakes" - * "i love cookies" - * - * then this method will return 8, because there are: - * 2 documents for term "i" (it doesn't matter that the first document has the term "i" twice) - * 2 documents for term "love" (same reason) - * 1 document for terms "chocolate", "and", "cakes", "cookies" - */ - public abstract int getSumTermDocFreq(); - - /** - * Lookup a term. - * @param term the term to lookup. - * @return the term ID for this term. - */ - public abstract int lookupTerm(BytesRef term) throws IOException; - - /** - * Get the text for a given termID. - * @param termID the term id - * @param text a BytesRef that will be modified to contain the text of this termid. - */ - public abstract void getTerm(int termID, BytesRef text); - - /** - * Get the internal doc id of the oldest doc that includes this term. - * @param termID The termID of the term. - * @return The internal docid, or TERM_NOT_FOUND. - */ - public abstract int getLargestDocIDForTerm(int termID) throws IOException; - - /** - * Get the document frequency for a given termID - * @param termID the term id - * @return the document frequency of this term in this index. 
- */ - public abstract int getDF(int termID); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.docx new file mode 100644 index 000000000..2d6431599 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.java b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.java deleted file mode 100644 index 381d4c6b5..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndex.java +++ /dev/null @@ -1,558 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Comparator; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.StringHelper; - -import com.twitter.search.common.hashtable.HashTable; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.util.hash.KeysSource; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -public class InvertedRealtimeIndex extends InvertedIndex { - public static final int FIXED_HASH_SEED = 0; - - public final class TermHashTable extends HashTable { - - private final TermPointerEncoding termPointerEncoding; - - public TermHashTable(int size, TermPointerEncoding termPointerEncoding) { - super(size); - 
this.termPointerEncoding = termPointerEncoding; - } - - public TermHashTable(int[] termsHash, TermPointerEncoding termPointerEncoding) { - super(termsHash); - this.termPointerEncoding = termPointerEncoding; - } - - @Override - public boolean matchItem(BytesRef term, int candidateTermID) { - return ByteTermUtils.postingEquals( - getTermPool(), - termPointerEncoding.getTextStart(termsArray.termPointers[candidateTermID]), term); - } - - @Override - public int hashCodeForItem(int itemID) { - return ByteTermUtils.hashCode( - getTermPool(), termPointerEncoding.getTextStart(termsArray.termPointers[itemID])); - } - - /* - * Use a fixed hash seed to compute the hash code for the given item. This is necessary because - * we want the TermHashTable to be consistent for lookups in indexes that have been flushed and - * loaded across restarts and redeploys. - * - * Note: previously we used item.hashcode(), however that hash function relies on the seed value - * StringHelper.GOOD_FAST_HASH_SEED, which is initialized to System.currentTimeMillis() when the - * JVM process starts up. - */ - public long lookupItem(BytesRef item) { - int itemHashCode = StringHelper.murmurhash3_x86_32(item, FIXED_HASH_SEED); - - return super.lookupItem(item, itemHashCode); - } - } - - - /** - * Skip list comparator used by {@link #termsSkipList}. The key would be the bytesRef of the term, - * and the value would be the termID of a term. - * - * Notice this comparator is keeping states, - * so different threads CANNOT share the same comparator. - */ - public static final class TermsSkipListComparator implements SkipListComparator { - private static final Comparator BYTES_REF_COMPARATOR = Comparator.naturalOrder(); - - private static final int SENTINEL_VALUE = HashTable.EMPTY_SLOT; - - // Initializing two BytesRef to use for later comparisons. - // Notice different threads cannot share the same comparator. 
- private final BytesRef bytesRef1 = new BytesRef(); - private final BytesRef bytesRef2 = new BytesRef(); - - /** - * We have to pass each part of the index in since during load process, the comparator - * needs to be build before the index. - */ - private final InvertedRealtimeIndex invertedIndex; - - public TermsSkipListComparator(InvertedRealtimeIndex invertedIndex) { - this.invertedIndex = invertedIndex; - } - - @Override - public int compareKeyWithValue(BytesRef key, int targetValue, int targetPosition) { - // No key could represent SENTINEL_VALUE and SENTINEL_VALUE is greatest. - if (targetValue == SENTINEL_VALUE) { - return -1; - } else { - getTerm(targetValue, bytesRef1); - return BYTES_REF_COMPARATOR.compare(key, bytesRef1); - } - } - - @Override - public int compareValues(int v1, int v2) { - // SENTINEL_VALUE is greatest. - if (v1 != SENTINEL_VALUE && v2 != SENTINEL_VALUE) { - getTerm(v1, bytesRef1); - getTerm(v2, bytesRef2); - return BYTES_REF_COMPARATOR.compare(bytesRef1, bytesRef2); - } else if (v1 == SENTINEL_VALUE && v2 == SENTINEL_VALUE) { - return 0; - } else if (v1 == SENTINEL_VALUE) { - return 1; - } else { - return -1; - } - } - - @Override - public int getSentinelValue() { - return SENTINEL_VALUE; - } - - /** - * Get the term specified by the termID. 
- * This method should be the same as {@link InvertedRealtimeIndex#getTerm} - */ - private void getTerm(int termID, BytesRef text) { - invertedIndex.getTerm(termID, text); - } - } - - private static final int HASHMAP_SIZE = 64 * 1024; - - private SkipListContainer termsSkipList; - - private final TermPointerEncoding termPointerEncoding; - private final ByteBlockPool termPool; - private final SkipListPostingList postingList; - - private int numTerms; - private int numDocs; - private int sumTotalTermFreq; - private int sumTermDocFreq; - private int maxPosition; - - private volatile TermHashTable hashTable; - private TermsArray termsArray; - - /** - * Creates a new in-memory real-time inverted index for the given field. - */ - public InvertedRealtimeIndex(EarlybirdFieldType fieldType, - TermPointerEncoding termPointerEncoding, - String fieldName) { - super(fieldType); - this.termPool = new ByteBlockPool(); - - this.termPointerEncoding = termPointerEncoding; - this.hashTable = new TermHashTable(HASHMAP_SIZE, termPointerEncoding); - - this.postingList = new SkipListPostingList( - fieldType.hasPositions() - ? SkipListContainer.HasPositions.YES - : SkipListContainer.HasPositions.NO, - fieldType.isStorePerPositionPayloads() - ? SkipListContainer.HasPayloads.YES - : SkipListContainer.HasPayloads.NO, - fieldName); - - this.termsArray = new TermsArray( - HASHMAP_SIZE, fieldType.isStoreFacetOffensiveCounters()); - - // Create termsSkipList to maintain order if field is support ordered terms. - if (fieldType.isSupportOrderedTerms()) { - // Terms skip list does not support position. 
- this.termsSkipList = new SkipListContainer<>( - new TermsSkipListComparator(this), - SkipListContainer.HasPositions.NO, - SkipListContainer.HasPayloads.NO, - "terms"); - this.termsSkipList.newSkipList(); - } else { - this.termsSkipList = null; - } - } - - void setTermsSkipList(SkipListContainer termsSkipList) { - this.termsSkipList = termsSkipList; - } - - SkipListContainer getTermsSkipList() { - return termsSkipList; - } - - private InvertedRealtimeIndex( - EarlybirdFieldType fieldType, - int numTerms, - int numDocs, - int sumTermDocFreq, - int sumTotalTermFreq, - int maxPosition, - int[] termsHash, - TermsArray termsArray, - ByteBlockPool termPool, - TermPointerEncoding termPointerEncoding, - SkipListPostingList postingList) { - super(fieldType); - this.numTerms = numTerms; - this.numDocs = numDocs; - this.sumTermDocFreq = sumTermDocFreq; - this.sumTotalTermFreq = sumTotalTermFreq; - this.maxPosition = maxPosition; - this.termsArray = termsArray; - this.termPool = termPool; - this.termPointerEncoding = termPointerEncoding; - this.hashTable = new TermHashTable(termsHash, termPointerEncoding); - this.postingList = postingList; - } - - void insertToTermsSkipList(BytesRef termBytesRef, int termID) { - if (termsSkipList != null) { - // Use the comparator passed in while building the skip list since we only have one writer. 
- termsSkipList.insert(termBytesRef, termID, SkipListContainer.FIRST_LIST_HEAD); - } - } - - @Override - public int getNumTerms() { - return numTerms; - } - - @Override - public int getNumDocs() { - return numDocs; - } - - @Override - public int getSumTotalTermFreq() { - return sumTotalTermFreq; - } - - @Override - public int getSumTermDocFreq() { - return sumTermDocFreq; - } - - @Override - public Terms createTerms(int maxPublishedPointer) { - return new RealtimeIndexTerms(this, maxPublishedPointer); - } - - @Override - public TermsEnum createTermsEnum(int maxPublishedPointer) { - // Use SkipListInMemoryTermsEnum if termsSkipList is not null, which indicates field required - // ordered term. - if (termsSkipList == null) { - return new RealtimeIndexTerms.InMemoryTermsEnum(this, maxPublishedPointer); - } else { - return new RealtimeIndexTerms.SkipListInMemoryTermsEnum(this, maxPublishedPointer); - } - } - - int getPostingListPointer(int termID) { - return termsArray.getPostingsPointer(termID); - } - - @Override - public int getLargestDocIDForTerm(int termID) { - if (termID == EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - return TermsArray.INVALID; - } else { - return postingList.getDocIDFromPosting(termsArray.largestPostings[termID]); - } - } - - @Override - public int getDF(int termID) { - if (termID == HashTable.EMPTY_SLOT) { - return 0; - } else { - return this.postingList.getDF(termID, termsArray); - } - } - - @Override - public int getMaxPublishedPointer() { - return this.postingList.getMaxPublishedPointer(); - } - - @Override - public int lookupTerm(BytesRef term) { - return HashTable.decodeItemId(hashTable.lookupItem(term)); - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - final TermsArray termsArrayCopy = this.termsArray; - - return new FacetLabelAccessor() { - @Override protected boolean seek(long termID) { - if (termID == HashTable.EMPTY_SLOT) { - return false; - } - int termPointer = termsArrayCopy.termPointers[(int) termID]; - 
hasTermPayload = termPointerEncoding.hasPayload(termPointer); - int textStart = termPointerEncoding.getTextStart(termPointer); - int termPayloadStart = ByteTermUtils.setBytesRef(termPool, termRef, textStart); - if (hasTermPayload) { - ByteTermUtils.setBytesRef(termPool, termPayload, termPayloadStart); - } - offensiveCount = termsArrayCopy.offensiveCounters != null - ? termsArrayCopy.offensiveCounters[(int) termID] : 0; - - return true; - } - }; - } - - @Override - public boolean hasMaxPublishedPointer() { - return true; - } - - @Override - public void getTerm(int termID, BytesRef text) { - getTerm(termID, text, termsArray, termPointerEncoding, termPool); - } - - /** - * Extract to helper method so the logic can be shared with - * {@link TermsSkipListComparator#getTerm} - */ - private static void getTerm(int termID, BytesRef text, - TermsArray termsArray, - TermPointerEncoding termPointerEncoding, - ByteBlockPool termPool) { - int textStart = termPointerEncoding.getTextStart(termsArray.termPointers[termID]); - ByteTermUtils.setBytesRef(termPool, text, textStart); - } - - /** - * Called when postings hash is too small (> 50% occupied). - */ - void rehashPostings(int newSize) { - TermHashTable newTable = new TermHashTable(newSize, termPointerEncoding); - hashTable.rehash(newTable); - hashTable = newTable; - } - - /** - * Returns per-term array containing the number of documents indexed with that term that were - * considered to be offensive. - */ - @Nullable - int[] getOffensiveCounters() { - return this.termsArray.offensiveCounters; - } - - /** - * Returns access to all the terms in this index as a {@link KeysSource}. 
- */ - public KeysSource getKeysSource() { - final int localNumTerms = this.numTerms; - final TermsArray termsArrayCopy = this.termsArray; - - return new KeysSource() { - private int termID = 0; - private BytesRef text = new BytesRef(); - - @Override - public int getNumberOfKeys() { - return localNumTerms; - } - - /** Must not be called more often than getNumberOfKeys() before rewind() is called */ - @Override - public BytesRef nextKey() { - Preconditions.checkState(termID < localNumTerms); - int textStart = termPointerEncoding.getTextStart(termsArrayCopy.termPointers[termID]); - ByteTermUtils.setBytesRef(termPool, text, textStart); - termID++; - return text; - } - - @Override - public void rewind() { - termID = 0; - } - }; - } - - /** - * Returns byte pool containing term text for all terms in this index. - */ - public ByteBlockPool getTermPool() { - return this.termPool; - } - - /** - * Returns per-term array containing pointers to where the text of each term is stored in the - * byte pool returned by {@link #getTermPool()}. - */ - public int[] getTermPointers() { - return this.termsArray.termPointers; - } - - /** - * Returns the hash table used to look up terms in this index. 
- */ - InvertedRealtimeIndex.TermHashTable getHashTable() { - return hashTable; - } - - - TermsArray getTermsArray() { - return termsArray; - } - - TermsArray growTermsArray() { - termsArray = termsArray.grow(); - return termsArray; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - TermPointerEncoding getTermPointerEncoding() { - return termPointerEncoding; - } - - SkipListPostingList getPostingList() { - return postingList; - } - - void incrementNumTerms() { - numTerms++; - } - - void incrementSumTotalTermFreq() { - sumTotalTermFreq++; - } - - public void incrementSumTermDocFreq() { - sumTermDocFreq++; - } - - public void incrementNumDocs() { - numDocs++; - } - - void setNumDocs(int numDocs) { - this.numDocs = numDocs; - } - - void adjustMaxPosition(int position) { - if (position > maxPosition) { - maxPosition = position; - } - } - - int getMaxPosition() { - return maxPosition; - } - - public static class FlushHandler extends Flushable.Handler { - private static final String NUM_DOCS_PROP_NAME = "numDocs"; - private static final String SUM_TOTAL_TERM_FREQ_PROP_NAME = "sumTotalTermFreq"; - private static final String SUM_TERM_DOC_FREQ_PROP_NAME = "sumTermDocFreq"; - private static final String NUM_TERMS_PROP_NAME = "numTerms"; - private static final String POSTING_LIST_PROP_NAME = "postingList"; - private static final String TERMS_SKIP_LIST_PROP_NAME = "termsSkipList"; - private static final String MAX_POSITION = "maxPosition"; - - protected final EarlybirdFieldType fieldType; - protected final TermPointerEncoding termPointerEncoding; - - public FlushHandler(EarlybirdFieldType fieldType, - TermPointerEncoding termPointerEncoding) { - this.fieldType = fieldType; - this.termPointerEncoding = termPointerEncoding; - } - - public FlushHandler(InvertedRealtimeIndex objectToFlush) { - super(objectToFlush); - this.fieldType = objectToFlush.fieldType; - this.termPointerEncoding = 
objectToFlush.getTermPointerEncoding(); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - InvertedRealtimeIndex objectToFlush = getObjectToFlush(); - flushInfo.addIntProperty(NUM_TERMS_PROP_NAME, objectToFlush.getNumTerms()); - flushInfo.addIntProperty(NUM_DOCS_PROP_NAME, objectToFlush.numDocs); - flushInfo.addIntProperty(SUM_TERM_DOC_FREQ_PROP_NAME, objectToFlush.sumTermDocFreq); - flushInfo.addIntProperty(SUM_TOTAL_TERM_FREQ_PROP_NAME, objectToFlush.sumTotalTermFreq); - flushInfo.addIntProperty(MAX_POSITION, objectToFlush.maxPosition); - - out.writeIntArray(objectToFlush.hashTable.slots()); - objectToFlush.termsArray.getFlushHandler() - .flush(flushInfo.newSubProperties("termsArray"), out); - objectToFlush.getTermPool().getFlushHandler() - .flush(flushInfo.newSubProperties("termPool"), out); - objectToFlush.getPostingList().getFlushHandler() - .flush(flushInfo.newSubProperties(POSTING_LIST_PROP_NAME), out); - - if (fieldType.isSupportOrderedTerms()) { - Preconditions.checkNotNull(objectToFlush.termsSkipList); - - objectToFlush.termsSkipList.getFlushHandler() - .flush(flushInfo.newSubProperties(TERMS_SKIP_LIST_PROP_NAME), out); - } - } - - @Override - protected InvertedRealtimeIndex doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - int[] termsHash = in.readIntArray(); - TermsArray termsArray = (new TermsArray.FlushHandler()) - .load(flushInfo.getSubProperties("termsArray"), in); - ByteBlockPool termPool = (new ByteBlockPool.FlushHandler()) - .load(flushInfo.getSubProperties("termPool"), in); - SkipListPostingList postingList = (new SkipListPostingList.FlushHandler()) - .load(flushInfo.getSubProperties(POSTING_LIST_PROP_NAME), in); - - InvertedRealtimeIndex index = new InvertedRealtimeIndex( - fieldType, - flushInfo.getIntProperty(NUM_TERMS_PROP_NAME), - flushInfo.getIntProperty(NUM_DOCS_PROP_NAME), - flushInfo.getIntProperty(SUM_TERM_DOC_FREQ_PROP_NAME), - 
flushInfo.getIntProperty(SUM_TOTAL_TERM_FREQ_PROP_NAME), - flushInfo.getIntProperty(MAX_POSITION), - termsHash, - termsArray, - termPool, - termPointerEncoding, - postingList); - - if (fieldType.isSupportOrderedTerms()) { - SkipListComparator comparator = new TermsSkipListComparator(index); - index.setTermsSkipList((new SkipListContainer.FlushHandler<>(comparator)) - .load(flushInfo.getSubProperties(TERMS_SKIP_LIST_PROP_NAME), in)); - } - - return index; - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.docx new file mode 100644 index 000000000..508aebd37 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.java b/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.java deleted file mode 100644 index fbea007a2..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/InvertedRealtimeIndexWriter.java +++ /dev/null @@ -1,163 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.hashtable.HashTable; -import com.twitter.search.common.util.analysis.TermPayloadAttribute; -import com.twitter.search.core.earlybird.facets.FacetCountingArrayWriter; -import com.twitter.search.core.earlybird.facets.FacetIDMap.FacetField; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter; - -public class InvertedRealtimeIndexWriter - implements 
EarlybirdRealtimeIndexSegmentWriter.InvertedDocConsumer { - private final InvertedRealtimeIndex invertedIndex; - private final FacetCountingArrayWriter facetArray; - private final FacetField facetField; - - private TermToBytesRefAttribute termAtt; - private TermPayloadAttribute termPayloadAtt; - private PayloadAttribute payloadAtt; - private boolean currentDocIsOffensive; - - /** - * Creates a new writer for writing to an inverted in-memory real-time index. - */ - public InvertedRealtimeIndexWriter( - InvertedRealtimeIndex index, - FacetField facetField, - FacetCountingArrayWriter facetArray) { - super(); - this.invertedIndex = index; - this.facetArray = facetArray; - this.facetField = facetField; - } - - @Override - public void start(AttributeSource attributeSource, boolean docIsOffensive) { - termAtt = attributeSource.addAttribute(TermToBytesRefAttribute.class); - termPayloadAtt = attributeSource.addAttribute(TermPayloadAttribute.class); - payloadAtt = attributeSource.addAttribute(PayloadAttribute.class); - currentDocIsOffensive = docIsOffensive; - } - - /** - * Adds a posting to the provided inverted index. - * - * @param termBytesRef is a payload that is stored with the term. It is only stored once for each - * term. - * @param postingPayload is a byte payload that will be stored separately for every posting. - * @return term id of the added posting. 
- */ - public static int indexTerm(InvertedRealtimeIndex invertedIndex, BytesRef termBytesRef, - int docID, int position, BytesRef termPayload, - BytesRef postingPayload, TermPointerEncoding termPointerEncoding) { - - InvertedRealtimeIndex.TermHashTable hashTable = invertedIndex.getHashTable(); - BaseByteBlockPool termPool = invertedIndex.getTermPool(); - - TermsArray termsArray = invertedIndex.getTermsArray(); - - long hashTableInfoForBytesRef = hashTable.lookupItem(termBytesRef); - int termID = HashTable.decodeItemId(hashTableInfoForBytesRef); - int hashTableSlot = HashTable.decodeHashPosition(hashTableInfoForBytesRef); - - invertedIndex.adjustMaxPosition(position); - - if (termID == HashTable.EMPTY_SLOT) { - // First time we are seeing this token since we last flushed the hash. - // the LSB in textStart denotes whether this term has a term payload - int textStart = ByteTermUtils.copyToTermPool(termPool, termBytesRef); - boolean hasTermPayload = termPayload != null; - int termPointer = termPointerEncoding.encodeTermPointer(textStart, hasTermPayload); - - if (hasTermPayload) { - ByteTermUtils.copyToTermPool(termPool, termPayload); - } - - termID = invertedIndex.getNumTerms(); - invertedIndex.incrementNumTerms(); - if (termID >= termsArray.getSize()) { - termsArray = invertedIndex.growTermsArray(); - } - - termsArray.termPointers[termID] = termPointer; - - Preconditions.checkState(hashTable.slots()[hashTableSlot] == HashTable.EMPTY_SLOT); - hashTable.setSlot(hashTableSlot, termID); - - if (invertedIndex.getNumTerms() * 2 >= hashTable.numSlots()) { - invertedIndex.rehashPostings(2 * hashTable.numSlots()); - } - - // Insert termID into termsSkipList. - invertedIndex.insertToTermsSkipList(termBytesRef, termID); - } - - invertedIndex.incrementSumTotalTermFreq(); - invertedIndex.getPostingList() - .appendPosting(termID, termsArray, docID, position, postingPayload); - - return termID; - } - - /** - * Delete a posting that was inserted out of order. 
- * - * This function needs work before it is used in production: - * - It should take an isDocOffensive parameter so we can decrement the offensive - * document count for the term. - * - It doesn't allow the same concurrency guarantees that the other posting methods do. - */ - public static void deletePosting( - InvertedRealtimeIndex invertedIndex, BytesRef termBytesRef, int docID) { - - long hashTableInfoForBytesRef = invertedIndex.getHashTable().lookupItem(termBytesRef); - int termID = HashTable.decodeItemId(hashTableInfoForBytesRef); - - if (termID != HashTable.EMPTY_SLOT) { - // Have seen this term before, and the field that supports deletes. - invertedIndex.getPostingList().deletePosting(termID, invertedIndex.getTermsArray(), docID); - } - } - - @Override - public void add(int docID, int position) { - final BytesRef payload; - if (payloadAtt == null) { - payload = null; - } else { - payload = payloadAtt.getPayload(); - } - - BytesRef termPayload = termPayloadAtt.getTermPayload(); - - int termID = indexTerm(invertedIndex, termAtt.getBytesRef(), - docID, position, termPayload, payload, - invertedIndex.getTermPointerEncoding()); - - if (termID == -1) { - return; - } - - TermsArray termsArray = invertedIndex.getTermsArray(); - - if (currentDocIsOffensive && termsArray.offensiveCounters != null) { - termsArray.offensiveCounters[termID]++; - } - - if (facetField != null) { - facetArray.addFacet(docID, facetField.getFacetId(), termID); - } - } - - @Override - public void finish() { - payloadAtt = null; - termPayloadAtt = null; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.docx new file mode 100644 index 000000000..4a43cc70a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.docx differ diff --git 
a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.java b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.java deleted file mode 100644 index 8c1963a70..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingLists.java +++ /dev/null @@ -1,255 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.packed.PackedInts; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -/** - * A posting list intended for low-df terms, terms that have a small number of postings. - * - * The postings (docs and positions) are stored in PackedInts, packed based on the largest docId - * and position across all low-df terms in a field. - * - * All docIds are packed together in their own PackedInts, and all positions are stored together - * in their own PackedInts. - * - A docId is stored for every single posting, that is if a doc has a frequency of N, it will be - * stored N times. - * - For fields that omitPositions, positions are not stored at all. 
- * - * Example: - * Postings in the form (docId, position): - * (1, 0), (1, 1), (2, 1), (2, 3), (2, 5), (4, 0), (5, 0) - * Will be stored as: - * packedDocIds: [1, 1, 2, 2, 2, 4, 5] - * packedPositions: [0, 1, 1, 3, 5, 0, 0] - */ -public class LowDFPackedIntsPostingLists extends OptimizedPostingLists { - private static final SearchCounter GETTING_POSITIONS_WITH_OMIT_POSITIONS = - SearchCounter.export("low_df_packed_ints_posting_list_getting_positions_with_omit_positions"); - - /** - * Internal class for hiding PackedInts Readers and Writers. A Mutable instance of PackedInts is - * only required when we're optimizing a new index. - * For the read side, we only need a PackedInts.Reader. - * For loaded indexes, we also only need a PackedInts.Reader. - */ - private static final class PackedIntsWrapper { - // Will be null if we are operating on a loaded in read-only index. - @Nullable - private final PackedInts.Mutable mutablePackedInts; - private final PackedInts.Reader readerPackedInts; - - private PackedIntsWrapper(PackedInts.Mutable mutablePackedInts) { - this.mutablePackedInts = Preconditions.checkNotNull(mutablePackedInts); - this.readerPackedInts = mutablePackedInts; - } - - private PackedIntsWrapper(PackedInts.Reader readerPackedInts) { - this.mutablePackedInts = null; - this.readerPackedInts = readerPackedInts; - } - - public int size() { - return readerPackedInts.size(); - } - - public PackedInts.Reader getReader() { - return readerPackedInts; - } - - public void set(int index, long value) { - this.mutablePackedInts.set(index, value); - } - } - - private final PackedIntsWrapper packedDocIds; - /** - * Will be null for fields that omitPositions. - */ - @Nullable - private final PackedIntsWrapper packedPositions; - private final boolean omitPositions; - private final int totalPostingsAcrossTerms; - private final int maxPosition; - private int currentPackedIntsPosition; - - /** - * Creates a new LowDFPackedIntsPostingLists. 
- * @param omitPositions whether positions should be omitted or not. - * @param totalPostingsAcrossTerms how many postings across all terms this field has. - * @param maxPosition the largest position used in all the postings for this field. - */ - public LowDFPackedIntsPostingLists( - boolean omitPositions, - int totalPostingsAcrossTerms, - int maxPosition) { - this( - new PackedIntsWrapper(PackedInts.getMutable( - totalPostingsAcrossTerms, - PackedInts.bitsRequired(MAX_DOC_ID), - PackedInts.DEFAULT)), - omitPositions - ? null - : new PackedIntsWrapper(PackedInts.getMutable( - totalPostingsAcrossTerms, - PackedInts.bitsRequired(maxPosition), - PackedInts.DEFAULT)), - omitPositions, - totalPostingsAcrossTerms, - maxPosition); - } - - private LowDFPackedIntsPostingLists( - PackedIntsWrapper packedDocIds, - @Nullable - PackedIntsWrapper packedPositions, - boolean omitPositions, - int totalPostingsAcrossTerms, - int maxPosition) { - this.packedDocIds = packedDocIds; - this.packedPositions = packedPositions; - this.omitPositions = omitPositions; - this.totalPostingsAcrossTerms = totalPostingsAcrossTerms; - this.maxPosition = maxPosition; - this.currentPackedIntsPosition = 0; - } - - @Override - public int copyPostingList(PostingsEnum postingsEnum, int numPostings) throws IOException { - int pointer = currentPackedIntsPosition; - - int docId; - - while ((docId = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) { - assert docId <= MAX_DOC_ID; - int freq = postingsEnum.freq(); - assert freq <= numPostings; - - for (int i = 0; i < freq; i++) { - packedDocIds.set(currentPackedIntsPosition, docId); - if (packedPositions != null) { - int position = postingsEnum.nextPosition(); - assert position <= maxPosition; - packedPositions.set(currentPackedIntsPosition, position); - } - currentPackedIntsPosition++; - } - } - - return pointer; - } - - @Override - public EarlybirdPostingsEnum postings( - int postingListPointer, - int numPostings, - int flags) throws IOException { - 
- if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) && !omitPositions) { - assert packedPositions != null; - return new LowDFPackedIntsPostingsEnum( - packedDocIds.getReader(), - packedPositions.getReader(), - postingListPointer, - numPostings); - } else { - if (PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS) && omitPositions) { - GETTING_POSITIONS_WITH_OMIT_POSITIONS.increment(); - } - - return new LowDFPackedIntsPostingsEnum( - packedDocIds.getReader(), - null, // no positions - postingListPointer, - numPostings); - } - } - - @VisibleForTesting - int getPackedIntsSize() { - return packedDocIds.size(); - } - - @VisibleForTesting - int getMaxPosition() { - return maxPosition; - } - - @VisibleForTesting - boolean isOmitPositions() { - return omitPositions; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - static class FlushHandler extends Flushable.Handler { - private static final String OMIT_POSITIONS_PROP_NAME = "omitPositions"; - private static final String TOTAL_POSTINGS_PROP_NAME = "totalPostingsAcrossTerms"; - private static final String MAX_POSITION_PROP_NAME = "maxPosition"; - - public FlushHandler() { - super(); - } - - public FlushHandler(LowDFPackedIntsPostingLists objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - LowDFPackedIntsPostingLists objectToFlush = getObjectToFlush(); - - flushInfo.addBooleanProperty(OMIT_POSITIONS_PROP_NAME, objectToFlush.omitPositions); - flushInfo.addIntProperty(TOTAL_POSTINGS_PROP_NAME, objectToFlush.totalPostingsAcrossTerms); - flushInfo.addIntProperty(MAX_POSITION_PROP_NAME, objectToFlush.maxPosition); - - out.writePackedInts(objectToFlush.packedDocIds.getReader()); - - if (!objectToFlush.omitPositions) { - assert objectToFlush.packedPositions != null; - out.writePackedInts(objectToFlush.packedPositions.getReader()); 
- } - } - - @Override - protected LowDFPackedIntsPostingLists doLoad( - FlushInfo flushInfo, - DataDeserializer in) throws IOException { - - boolean omitPositions = flushInfo.getBooleanProperty(OMIT_POSITIONS_PROP_NAME); - int totalPostingsAcrossTerms = flushInfo.getIntProperty(TOTAL_POSTINGS_PROP_NAME); - int maxPosition = flushInfo.getIntProperty(MAX_POSITION_PROP_NAME); - - PackedIntsWrapper packedDocIds = new PackedIntsWrapper(in.readPackedInts()); - - PackedIntsWrapper packedPositions = null; - if (!omitPositions) { - packedPositions = new PackedIntsWrapper(in.readPackedInts()); - } - - return new LowDFPackedIntsPostingLists( - packedDocIds, - packedPositions, - omitPositions, - totalPostingsAcrossTerms, - maxPosition); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.docx new file mode 100644 index 000000000..74ae4199e Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.java b/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.java deleted file mode 100644 index cb1c54c05..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/LowDFPackedIntsPostingsEnum.java +++ /dev/null @@ -1,112 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import javax.annotation.Nullable; - -import org.apache.lucene.util.packed.PackedInts; - -/** - * A PostingsEnum for iterating over LowDFPackedIntsPostingLists. - * - * Can be used with positions and without positions. 
- */ -public class LowDFPackedIntsPostingsEnum extends EarlybirdOptimizedPostingsEnum { - private static final int SKIP_INTERVAL = 128; - - private final PackedInts.Reader packedDocIds; - @Nullable - private final PackedInts.Reader packedPositions; - private final int lastPostingPointer; - private final int largestDocID; - private int currentPositionPointer; - - /** Pointer to the next posting that will be loaded. */ - private int nextPostingPointer; - - /** - * Creates a new PostingsEnum for all postings in a given term. - */ - public LowDFPackedIntsPostingsEnum( - PackedInts.Reader packedDocIds, - @Nullable - PackedInts.Reader packedPositions, - int postingListPointer, - int numPostings) { - super(postingListPointer, numPostings); - - this.packedDocIds = packedDocIds; - this.packedPositions = packedPositions; - this.nextPostingPointer = postingListPointer; - - this.lastPostingPointer = postingListPointer + numPostings - 1; - this.largestDocID = (int) packedDocIds.get(lastPostingPointer); - - loadNextPosting(); - - // Treat each term as a single block load. - queryCostTracker.track(QueryCostTracker.CostType.LOAD_OPTIMIZED_POSTING_BLOCK); - } - - @Override - protected void loadNextPosting() { - if (nextPostingPointer <= lastPostingPointer) { - nextDocID = (int) packedDocIds.get(nextPostingPointer); - nextFreq = 1; - } else { - // all postings fully processed - nextDocID = NO_MORE_DOCS; - nextFreq = 0; - } - nextPostingPointer++; - } - - @Override - protected void startCurrentDoc() { - if (packedPositions != null) { - /** - * Remember where we were at the beginning of this doc, so that we can iterate over the - * positions for this doc if needed. - * Adjust by `- 1 - getCurrentFreq()` because we already advanced beyond the last posting in - * the previous loadNextPosting() calls. 
- * @see #nextDocNoDel() - */ - currentPositionPointer = nextPostingPointer - 1 - getCurrentFreq(); - } - } - - @Override - protected void skipTo(int target) { - assert target != NO_MORE_DOCS : "Should be handled in parent class advance method"; - - // now we know there must be a doc in this block that we can return - int skipIndex = nextPostingPointer + SKIP_INTERVAL; - while (skipIndex <= lastPostingPointer && target > packedDocIds.get(skipIndex)) { - nextPostingPointer = skipIndex; - skipIndex += SKIP_INTERVAL; - } - } - - @Override - public int nextPosition() throws IOException { - if (packedPositions == null) { - return -1; - } else if (currentPositionPointer < packedPositions.size()) { - return (int) packedPositions.get(currentPositionPointer++); - } else { - return -1; - } - } - - @Override - public int getLargestDocID() throws IOException { - return largestDocID; - } - - @Override - public long cost() { - // cost would be -1 if this enum is exhausted. - final int cost = lastPostingPointer - nextPostingPointer + 1; - return cost < 0 ? 
0 : cost; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.docx new file mode 100644 index 000000000..3d840d42f Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.java b/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.java deleted file mode 100644 index dd76ee10e..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/MPHTermDictionary.java +++ /dev/null @@ -1,190 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.index.BaseTermsEnum; -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.packed.PackedInts; - -import com.twitter.search.common.util.hash.BDZAlgorithm; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -public class MPHTermDictionary implements TermDictionary, Flushable { - private final BDZAlgorithm termsHashFunction; - private final PackedInts.Reader termPointers; - private final ByteBlockPool termPool; - private final TermPointerEncoding termPointerEncoding; - private final int numTerms; - - MPHTermDictionary(int numTerms, BDZAlgorithm termsHashFunction, - PackedInts.Reader termPointers, ByteBlockPool termPool, - TermPointerEncoding termPointerEncoding) { - 
this.numTerms = numTerms; - this.termsHashFunction = termsHashFunction; - this.termPointers = termPointers; - this.termPool = termPool; - this.termPointerEncoding = termPointerEncoding; - } - - @Override - public int getNumTerms() { - return numTerms; - } - - @Override - public int lookupTerm(BytesRef term) { - int termID = termsHashFunction.lookup(term); - if (termID >= getNumTerms() || termID < 0) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - - if (ByteTermUtils.postingEquals(termPool, termPointerEncoding - .getTextStart((int) termPointers.get(termID)), term)) { - return termID; - } else { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } - } - - @Override - public boolean getTerm(int termID, BytesRef text, BytesRef termPayload) { - int termPointer = (int) termPointers.get(termID); - boolean hasTermPayload = termPointerEncoding.hasPayload(termPointer); - int textStart = termPointerEncoding.getTextStart(termPointer); - // setBytesRef sets the passed in BytesRef "text" to the term in the termPool. - // As a side effect it returns the offset of the next entry in the pool after the term, - // which may optionally be used if this term has a payload. 
- int termPayloadStart = ByteTermUtils.setBytesRef(termPool, text, textStart); - if (termPayload != null && hasTermPayload) { - ByteTermUtils.setBytesRef(termPool, termPayload, termPayloadStart); - } - - return hasTermPayload; - } - - @Override - public TermsEnum createTermsEnum(OptimizedMemoryIndex index) { - return new MPHTermsEnum(index); - } - - public static class MPHTermsEnum extends BaseTermsEnum { - private int termID; - private final BytesRef bytesRef = new BytesRef(); - private final OptimizedMemoryIndex index; - - MPHTermsEnum(OptimizedMemoryIndex index) { - this.index = index; - } - - @Override - public int docFreq() { - return index.getDF(termID); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { - int postingsPointer = index.getPostingListPointer(termID); - int numPostings = index.getNumPostings(termID); - return index.getPostingLists().postings(postingsPointer, numPostings, flags); - } - - @Override - public ImpactsEnum impacts(int flags) throws IOException { - return new SlowImpactsEnum(postings(null, flags)); - } - - @Override - public SeekStatus seekCeil(BytesRef text) throws IOException { - termID = index.lookupTerm(text); - - if (termID == -1) { - return SeekStatus.END; - } else { - return SeekStatus.FOUND; - } - } - - @Override - public BytesRef next() { - return null; - } - - @Override - public long ord() { - return termID; - } - - @Override - public void seekExact(long ord) { - if (ord < index.getNumTerms()) { - termID = (int) ord; - index.getTerm(termID, bytesRef, null); - } - } - - @Override - public BytesRef term() { - return bytesRef; - } - - @Override - public long totalTermFreq() { - return docFreq(); - } - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - static final String NUM_TERMS_PROP_NAME = "numTerms"; - private final TermPointerEncoding 
termPointerEncoding; - - public FlushHandler(TermPointerEncoding termPointerEncoding) { - super(); - this.termPointerEncoding = termPointerEncoding; - } - - public FlushHandler(MPHTermDictionary objectToFlush) { - super(objectToFlush); - this.termPointerEncoding = objectToFlush.termPointerEncoding; - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - MPHTermDictionary objectToFlush = getObjectToFlush(); - flushInfo.addIntProperty(NUM_TERMS_PROP_NAME, objectToFlush.getNumTerms()); - - out.writePackedInts(objectToFlush.termPointers); - objectToFlush.termPool.getFlushHandler().flush(flushInfo.newSubProperties("termPool"), out); - objectToFlush.termsHashFunction.getFlushHandler() - .flush(flushInfo.newSubProperties("termsHashFunction"), out); - } - - @Override - protected MPHTermDictionary doLoad(FlushInfo flushInfo, - DataDeserializer in) throws IOException { - int numTerms = flushInfo.getIntProperty(NUM_TERMS_PROP_NAME); - PackedInts.Reader termPointers = in.readPackedInts(); - ByteBlockPool termPool = (new ByteBlockPool.FlushHandler()).load( - flushInfo.getSubProperties("termPool"), in); - BDZAlgorithm termsHashFunction = (new BDZAlgorithm.FlushHandler()).load( - flushInfo.getSubProperties("termsHashFunction"), in); - - return new MPHTermDictionary(numTerms, termsHashFunction, termPointers, - termPool, termPointerEncoding); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.docx new file mode 100644 index 000000000..9eb270682 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.java b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.java deleted file mode 100644 index 12ff47365..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiPostingLists.java +++ /dev/null @@ -1,135 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.lucene.index.PostingsEnum; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -public class MultiPostingLists extends OptimizedPostingLists { - - @VisibleForTesting - public static final int DEFAULT_DF_THRESHOLD = 1000; - - private final OptimizedPostingLists lowDF; - private final OptimizedPostingLists highDF; - - private final int dfThreshold; - - /** - * Given the number of postings in each term (in this field), sum up the number of postings in - * the low df fields. - * @param numPostingsPerTerm number of postings in each term in this field. - * @param dfThreshold the low/high df threshold. - */ - private static int numPostingsInLowDfTerms(int[] numPostingsPerTerm, int dfThreshold) { - int sumOfAllPostings = 0; - for (int numPostingsInATerm : numPostingsPerTerm) { - if (numPostingsInATerm < dfThreshold) { - sumOfAllPostings += numPostingsInATerm; - } - } - return sumOfAllPostings; - } - - /** - * Creates a new posting list delegating to either lowDF or highDF posting list. - * @param omitPositions whether positions should be omitted or not. - * @param numPostingsPerTerm number of postings in each term in this field. - * @param maxPosition the largest position used in all the postings for this field. 
- */ - public MultiPostingLists( - boolean omitPositions, - int[] numPostingsPerTerm, - int maxPosition) { - this( - new LowDFPackedIntsPostingLists( - omitPositions, - numPostingsInLowDfTerms(numPostingsPerTerm, DEFAULT_DF_THRESHOLD), - maxPosition), - new HighDFPackedIntsPostingLists(omitPositions), - DEFAULT_DF_THRESHOLD); - } - - private MultiPostingLists( - OptimizedPostingLists lowDF, - OptimizedPostingLists highDF, - int dfThreshold) { - this.lowDF = lowDF; - this.highDF = highDF; - this.dfThreshold = dfThreshold; - } - - @Override - public int copyPostingList(PostingsEnum postingsEnum, int numPostings) - throws IOException { - return numPostings < dfThreshold - ? lowDF.copyPostingList(postingsEnum, numPostings) - : highDF.copyPostingList(postingsEnum, numPostings); - } - - @Override - public EarlybirdPostingsEnum postings(int postingsPointer, int numPostings, int flags) - throws IOException { - return numPostings < dfThreshold - ? lowDF.postings(postingsPointer, numPostings, flags) - : highDF.postings(postingsPointer, numPostings, flags); - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - @VisibleForTesting - OptimizedPostingLists getLowDfPostingsList() { - return lowDF; - } - - @VisibleForTesting - OptimizedPostingLists getHighDfPostingsList() { - return highDF; - } - - public static class FlushHandler extends Flushable.Handler { - private static final String DF_THRESHOLD_PROP_NAME = "dfThresHold"; - - public FlushHandler() { - super(); - } - - public FlushHandler(MultiPostingLists objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - MultiPostingLists objectToFlush = getObjectToFlush(); - flushInfo.addIntProperty(DF_THRESHOLD_PROP_NAME, objectToFlush.dfThreshold); - objectToFlush.lowDF.getFlushHandler().flush( - flushInfo.newSubProperties("lowDFPostinglists"), out); - 
objectToFlush.highDF.getFlushHandler().flush( - flushInfo.newSubProperties("highDFPostinglists"), out); - } - - @Override - protected MultiPostingLists doLoad(FlushInfo flushInfo, - DataDeserializer in) throws IOException { - OptimizedPostingLists lowDF = new LowDFPackedIntsPostingLists.FlushHandler() - .load(flushInfo.getSubProperties("lowDFPostinglists"), in); - OptimizedPostingLists highDF = new HighDFPackedIntsPostingLists.FlushHandler() - .load(flushInfo.getSubProperties("highDFPostinglists"), in); - return new MultiPostingLists( - lowDF, - highDF, - flushInfo.getIntProperty(DF_THRESHOLD_PROP_NAME)); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.docx new file mode 100644 index 000000000..b9dfc4cf3 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.java b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.java deleted file mode 100644 index 8b7dee75d..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionary.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import com.google.common.collect.ImmutableList; - -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * A term dictionary that's backed by multiple underlying segments/indexes. For a given term, will - * be able to return the termId for each of the underlying indexes. - */ -public interface MultiSegmentTermDictionary { - - /** - * Lookup a term in this multi segment term dictionary, and return the term ids for that term on - * all of the managed segments. 
- * - * @return An array containing a termId for each segment that this term dictionary is backed by. - * The order of segments will match the order returned by {@link #getSegmentIndexes()}. - * - * For each segment, the term id will be returned, or - * {@link EarlybirdIndexSegmentAtomicReader#TERM_NOT_FOUND} if that segment does not have the - * given term. - */ - int[] lookupTermIds(BytesRef term); - - /** - * A convenience method for checking whether a specific index/segment is backed by this term - * dictionary. Returning true here is equivalent to returning: - *

-   * getSegmentIndexes().contains(invertedIndex);
-   * 
- */ - default boolean supportSegmentIndex(InvertedIndex invertedIndex) { - return getSegmentIndexes().contains(invertedIndex); - } - - /** - * The list of indexes that this term dictionary is backed by. The order of indexes here will - * be consistent with the order of termIds returned by {@link #lookupTermIds(BytesRef)}. - */ - ImmutableList getSegmentIndexes(); - - /** - * Returns the number of terms in this term dictionary. - * - * If the term "foo" appears in segment A and in segment B, it will be counted once. To get the - * total number of terms across all managed segments, see {@link #getNumTermEntries()}. - */ - int getNumTerms(); - - /** - * Returns the total number of terms in this term dictionary across all managed segments. - * - * If the term "foo" appears in segment A and in segment B, it will have 2 entries in this term - * dictionary. - */ - int getNumTermEntries(); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.docx new file mode 100644 index 000000000..afec8a903 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.java b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.java deleted file mode 100644 index 56efa3754..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithFastutil.java +++ /dev/null @@ -1,161 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.OptionalInt; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import 
com.google.common.base.Stopwatch; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Maps; - -import org.apache.lucene.util.BytesRef; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.util.LogFormatUtil; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -import it.unimi.dsi.fastutil.ints.IntArrayList; - -/** - * This implementation took MultiSegmentTermDictionaryWithMap and replaced some of the - * data structures with fastutil equivalents and it also uses a more memory efficient way to - * store the precomputed data. - * - * This implementation has a requirement that each term per field needs to be present at - * most once per document, since we only have space to index 2^24 terms and we have 2^23 - * documents as of now in realtime earlybirds. - * - * See UserIdMultiSegmentQuery class comment for more information on how this is used. - */ -public class MultiSegmentTermDictionaryWithFastutil implements MultiSegmentTermDictionary { - private static final Logger LOG = LoggerFactory.getLogger( - MultiSegmentTermDictionaryWithFastutil.class); - - @VisibleForTesting - public static final SearchTimerStats TERM_DICTIONARY_CREATION_STATS = - SearchTimerStats.export("multi_segment_term_dictionary_with_fastutil_creation", - TimeUnit.MILLISECONDS, false); - - private static final int MAX_TERM_ID_BITS = 24; - private static final int TERM_ID_MASK = (1 << MAX_TERM_ID_BITS) - 1; // First 24 bits. - private static final int MAX_SEGMENT_SIZE = 1 << (MAX_TERM_ID_BITS - 1); - - private final ImmutableList indexes; - - // For each term, a list of (index id, term id) packed into an integer. - // The integer contains: - // byte 0: index (segment id). Since we have ~20 segments, this fits into a byte. - // bytes [1-3]: term id. 
The terms we're building this dictionary for are user ids - // associated with a tweet - from_user_id and in_reply_to_user_id. Since we have - // at most 2**23 tweets in realtime, we'll have at most 2**23 unique terms per - // segments. The term ids post optimization are consecutive numbers, so they will - // fit in 24 bits. We don't use the term dictionary in archive, which has more - // tweets per segment. - // - // To verify the maximum amount of tweets in a segment, see max_segment_size in - // earlybird-config.yml. - private final HashMap termsMap; - private final int numTerms; - private final int numTermEntries; - - int encodeIndexAndTermId(int indexId, int termId) { - // Push the index id to the left and use the other 24 bits for the term id. - return (indexId << MAX_TERM_ID_BITS) | termId; - } - - void decodeIndexAndTermId(int[] arr, int packed) { - arr[packed >> MAX_TERM_ID_BITS] = packed & TERM_ID_MASK; - } - - - /** - * Creates a new multi-segment term dictionary backed by a regular java map. - */ - public MultiSegmentTermDictionaryWithFastutil( - String field, - List indexes) { - - this.indexes = ImmutableList.copyOf(indexes); - - // Pre-size the map with estimate of max number of terms. It should be at least that big. - OptionalInt optionalMax = indexes.stream().mapToInt(OptimizedMemoryIndex::getNumTerms).max(); - int maxNumTerms = optionalMax.orElse(0); - this.termsMap = Maps.newHashMapWithExpectedSize(maxNumTerms); - - LOG.info("About to merge {} indexes for field {}, estimated {} terms", - indexes.size(), field, LogFormatUtil.formatInt(maxNumTerms)); - Stopwatch stopwatch = Stopwatch.createStarted(); - - BytesRef termBytesRef = new BytesRef(); - - for (int indexId = 0; indexId < indexes.size(); indexId++) { - // The inverted index for this field. 
- OptimizedMemoryIndex index = indexes.get(indexId); - - int indexNumTerms = index.getNumTerms(); - - if (indexNumTerms > MAX_SEGMENT_SIZE) { - throw new IllegalStateException("too many terms: " + indexNumTerms); - } - - for (int termId = 0; termId < indexNumTerms; termId++) { - index.getTerm(termId, termBytesRef); - - IntArrayList indexTerms = termsMap.get(termBytesRef); - if (indexTerms == null) { - BytesRef term = BytesRef.deepCopyOf(termBytesRef); - - indexTerms = new IntArrayList(); - termsMap.put(term, indexTerms); - } - - indexTerms.add(encodeIndexAndTermId(indexId, termId)); - } - } - - this.numTerms = termsMap.size(); - this.numTermEntries = indexes.stream().mapToInt(OptimizedMemoryIndex::getNumTerms).sum(); - - TERM_DICTIONARY_CREATION_STATS.timerIncrement(stopwatch.elapsed(TimeUnit.MILLISECONDS)); - LOG.info("Done merging {} segments for field {} in {} - " - + "num terms: {}, num term entries: {}.", - indexes.size(), field, stopwatch, - LogFormatUtil.formatInt(this.numTerms), - LogFormatUtil.formatInt(this.numTermEntries)); - } - - @Override - public int[] lookupTermIds(BytesRef term) { - int[] termIds = new int[indexes.size()]; - Arrays.fill(termIds, EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND); - - IntArrayList indexTerms = termsMap.get(term); - if (indexTerms != null) { - for (int i = 0; i < indexTerms.size(); i++) { - decodeIndexAndTermId(termIds, indexTerms.getInt(i)); - } - } - - return termIds; - } - - @Override - public ImmutableList getSegmentIndexes() { - return indexes; - } - - @Override - public int getNumTerms() { - return this.numTerms; - } - - @Override - public int getNumTermEntries() { - return this.numTermEntries; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.docx new file mode 100644 index 000000000..2fa3912e8 Binary files /dev/null and 
b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.java b/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.java deleted file mode 100644 index 74b7103bc..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/MultiSegmentTermDictionaryWithMap.java +++ /dev/null @@ -1,134 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.util.Arrays; -import java.util.HashMap; -import java.util.List; -import java.util.OptionalInt; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.lucene.util.BytesRef; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.util.LogFormatUtil; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * A rather simple implementation of a MultiSegmentTermDictionary that just keeps all terms in a - * java hash map, and all the termIds for a term in a java list. - * - * An alternate implementation could have an MPH for the map, and a IntBlockPool for storing - * the term ids. - * - * See UserIdMultiSegmentQuery class comment for more information on how this is used. 
- */ -public class MultiSegmentTermDictionaryWithMap implements MultiSegmentTermDictionary { - private static final Logger LOG = LoggerFactory.getLogger( - MultiSegmentTermDictionaryWithMap.class); - - @VisibleForTesting - public static final SearchTimerStats TERM_DICTIONARY_CREATION_STATS = - SearchTimerStats.export("multi_segment_term_dictionary_with_map_creation", - TimeUnit.MILLISECONDS, false); - - private final ImmutableList indexes; - private final HashMap> termsMap; - private final int numTerms; - private final int numTermEntries; - - private static class IndexTerm { - private int indexId; - private final int termId; - - public IndexTerm(int indexId, int termId) { - this.indexId = indexId; - this.termId = termId; - } - } - - /** - * Creates a new multi-segment term dictionary backed by a regular java map. - */ - public MultiSegmentTermDictionaryWithMap( - String field, - List indexes) { - - this.indexes = ImmutableList.copyOf(indexes); - - // Pre-size the map with estimate of max number of terms. It should be at least that big. - OptionalInt optionalMax = indexes.stream().mapToInt(OptimizedMemoryIndex::getNumTerms).max(); - int maxNumTerms = optionalMax.orElse(0); - this.termsMap = Maps.newHashMapWithExpectedSize(maxNumTerms); - - LOG.info("About to merge {} indexes for field {}, estimated {} terms", - indexes.size(), field, LogFormatUtil.formatInt(maxNumTerms)); - long start = System.currentTimeMillis(); - - BytesRef termText = new BytesRef(); - long copiedBytes = 0; - for (int indexId = 0; indexId < indexes.size(); indexId++) { - // The inverted index for this field. - OptimizedMemoryIndex index = indexes.get(indexId); - - int indexNumTerms = index.getNumTerms(); - for (int termId = 0; termId < indexNumTerms; termId++) { - index.getTerm(termId, termText); - - // This copies the underlying array to a new array. 
- BytesRef term = BytesRef.deepCopyOf(termText); - copiedBytes += term.length; - - List indexTerms = termsMap.computeIfAbsent(term, k -> Lists.newArrayList()); - - indexTerms.add(new IndexTerm(indexId, termId)); - } - } - - this.numTerms = termsMap.size(); - this.numTermEntries = indexes.stream().mapToInt(OptimizedMemoryIndex::getNumTerms).sum(); - - long elapsed = System.currentTimeMillis() - start; - TERM_DICTIONARY_CREATION_STATS.timerIncrement(elapsed); - LOG.info("Done merging {} indexes for field {} in {}ms - " - + "num terms: {}, num term entries: {}, copied bytes: {}", - indexes.size(), field, elapsed, - LogFormatUtil.formatInt(this.numTerms), LogFormatUtil.formatInt(this.numTermEntries), - LogFormatUtil.formatInt(copiedBytes)); - } - - @Override - public int[] lookupTermIds(BytesRef term) { - int[] termIds = new int[indexes.size()]; - Arrays.fill(termIds, EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND); - - List indexTerms = termsMap.get(term); - if (indexTerms != null) { - for (IndexTerm indexTerm : indexTerms) { - termIds[indexTerm.indexId] = indexTerm.termId; - } - } - - return termIds; - } - - @Override - public ImmutableList getSegmentIndexes() { - return indexes; - } - - @Override - public int getNumTerms() { - return this.numTerms; - } - - @Override - public int getNumTermEntries() { - return this.numTermEntries; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.docx new file mode 100644 index 000000000..52820a516 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.java b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.java deleted file mode 100644 index 10aa7849d..000000000 --- 
a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedIndexTerms.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; - -public class OptimizedIndexTerms extends Terms { - private final OptimizedMemoryIndex index; - - public OptimizedIndexTerms(OptimizedMemoryIndex index) { - this.index = index; - } - - @Override - public long size() { - return index.getNumTerms(); - } - - @Override - public TermsEnum iterator() { - return index.createTermsEnum(index.getMaxPublishedPointer()); - } - - @Override - public long getSumTotalTermFreq() { - return index.getSumTotalTermFreq(); - } - - @Override - public long getSumDocFreq() { - return index.getSumTermDocFreq(); - } - - @Override - public int getDocCount() { - return index.getNumDocs(); - } - - @Override - public boolean hasFreqs() { - return false; - } - - @Override - public boolean hasOffsets() { - return false; - } - - @Override - public boolean hasPositions() { - return true; - } - - @Override - public boolean hasPayloads() { - return false; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.docx new file mode 100644 index 000000000..fdb0077d9 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.java b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.java deleted file mode 100644 index 3298b80e8..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedMemoryIndex.java +++ /dev/null @@ -1,434 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Comparator; -import 
java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.packed.PackedInts; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.util.hash.BDZAlgorithm; -import com.twitter.search.common.util.hash.BDZAlgorithm.MPHFNotFoundException; -import com.twitter.search.common.util.hash.KeysSource; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.facets.FacetIDMap.FacetField; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -public class OptimizedMemoryIndex extends InvertedIndex implements Flushable { - private static final Logger LOG = LoggerFactory.getLogger(OptimizedMemoryIndex.class); - private static final Comparator BYTES_REF_COMPARATOR = Comparator.naturalOrder(); - - private static final SearchCounter MPH_NOT_FOUND_COUNT = - SearchCounter.export("twitter_optimized_index_mph_not_found_count"); - - private final PackedInts.Reader numPostings; - private final PackedInts.Reader postingListPointers; - private final PackedInts.Reader offensiveCounters; - private final MultiPostingLists postingLists; - - private final TermDictionary dictionary; - - private final int numDocs; - private final int sumTotalTermFreq; - private final int 
sumTermDocFreq; - - private OptimizedMemoryIndex(EarlybirdFieldType fieldType, - int numDocs, - int sumTermDocFreq, - int sumTotalTermFreq, - PackedInts.Reader numPostings, - PackedInts.Reader postingListPointers, - PackedInts.Reader offensiveCounters, - MultiPostingLists postingLists, - TermDictionary dictionary) { - super(fieldType); - this.numDocs = numDocs; - this.sumTermDocFreq = sumTermDocFreq; - this.sumTotalTermFreq = sumTotalTermFreq; - this.numPostings = numPostings; - this.postingListPointers = postingListPointers; - this.offensiveCounters = offensiveCounters; - this.postingLists = postingLists; - this.dictionary = dictionary; - } - - public OptimizedMemoryIndex( - EarlybirdFieldType fieldType, - String field, - InvertedRealtimeIndex source, - Map termIDMapper, - FacetField facetField, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(fieldType); - - numDocs = source.getNumDocs(); - sumTermDocFreq = source.getSumTermDocFreq(); - sumTotalTermFreq = source.getSumTotalTermFreq(); - - Preconditions.checkNotNull(originalTweetIdMapper, "The segment must have a tweet ID mapper."); - Preconditions.checkNotNull(optimizedTweetIdMapper, - "The optimized tweet ID mapper cannot be null."); - - // We rely on the fact that new terms always have a greater term ID. We ignore all terms that - // are equal to or greater than numTerms, as they may be incompletely applied. If new terms are - // added while optimizing, they will be re-added when we re-apply updates. 
- final KeysSource termsIterator = source.getKeysSource(); - int numTerms = termsIterator.getNumberOfKeys(); - int maxPublishedPointer = source.getMaxPublishedPointer(); - - int[] tempPostingListPointers = new int[numTerms]; - - BDZAlgorithm termsHashFunction = null; - - final boolean supportTermTextLookup = facetField != null || fieldType.isSupportTermTextLookup(); - if (supportTermTextLookup) { - try { - termsHashFunction = new BDZAlgorithm(termsIterator); - } catch (MPHFNotFoundException e) { - // we couldn't find a mphf for this field - // no problem, this can happen for very small fields - // - just use the fst in that case - LOG.warn("Unable to build MPH for field: {}", field); - MPH_NOT_FOUND_COUNT.increment(); - } - } - - // Make sure to only call the expensive computeNumPostings() once. - int[] numPostingsSource = computeNumPostings(source, numTerms, maxPublishedPointer); - - // The BDZ Algorithm returns a function from bytesref to term ID. However, these term IDs are - // different than the original term IDs (it's a hash function, not a hash _table_), so we have - // to remap the term IDs to match the ones generated by BDZ. We track that using the termIDMap. - int[] termIDMap = null; - - if (termsHashFunction != null) { - termsIterator.rewind(); - termIDMap = BDZAlgorithm.createIdMap(termsHashFunction, termsIterator); - if (facetField != null) { - termIDMapper.put(facetField.getFacetId(), termIDMap); - } - - PackedInts.Reader termPointers = getPackedInts(source.getTermPointers(), termIDMap); - this.numPostings = getPackedInts(numPostingsSource, termIDMap); - this.offensiveCounters = source.getOffensiveCounters() == null ? 
null - : getPackedInts(source.getOffensiveCounters(), termIDMap); - - this.dictionary = new MPHTermDictionary( - numTerms, - termsHashFunction, - termPointers, - source.getTermPool(), - TermPointerEncoding.DEFAULT_ENCODING); - } else { - this.dictionary = FSTTermDictionary.buildFST( - source.getTermPool(), - source.getTermPointers(), - numTerms, - BYTES_REF_COMPARATOR, - supportTermTextLookup, - TermPointerEncoding.DEFAULT_ENCODING); - - this.numPostings = getPackedInts(numPostingsSource); - this.offensiveCounters = source.getOffensiveCounters() == null ? null - : getPackedInts(source.getOffensiveCounters()); - } - - TermsEnum allTerms = source.createTermsEnum(maxPublishedPointer); - - this.postingLists = new MultiPostingLists( - !fieldType.hasPositions(), - numPostingsSource, - source.getMaxPosition()); - - for (int termID = 0; termID < numTerms; termID++) { - allTerms.seekExact(termID); - PostingsEnum postingsEnum = new OptimizingPostingsEnumWrapper( - allTerms.postings(null), originalTweetIdMapper, optimizedTweetIdMapper); - int mappedTermID = termIDMap != null ? 
termIDMap[termID] : termID; - tempPostingListPointers[mappedTermID] = - postingLists.copyPostingList(postingsEnum, numPostingsSource[termID]); - } - - this.postingListPointers = getPackedInts(tempPostingListPointers); - } - - private static int[] map(int[] source, int[] map) { - int[] target = new int[map.length]; - for (int i = 0; i < map.length; i++) { - target[map[i]] = source[i]; - } - return target; - } - - static PackedInts.Reader getPackedInts(int[] values) { - return getPackedInts(values, null); - } - - private static PackedInts.Reader getPackedInts(int[] values, int[] map) { - int[] mappedValues = values; - if (map != null) { - mappedValues = map(mappedValues, map); - } - - // first determine max value - long maxValue = Long.MIN_VALUE; - for (int value : mappedValues) { - if (value > maxValue) { - maxValue = value; - } - } - - PackedInts.Mutable packed = - PackedInts.getMutable(mappedValues.length, PackedInts.bitsRequired(maxValue), - PackedInts.DEFAULT); - for (int i = 0; i < mappedValues.length; i++) { - packed.set(i, mappedValues[i]); - } - - return packed; - } - - /** - * Returns per-term array containing the number of posting in this index for each term. - * This call is extremely slow. 
- */ - private static int[] computeNumPostings( - InvertedRealtimeIndex source, - int numTerms, - int maxPublishedPointer - ) throws IOException { - int[] numPostings = new int[numTerms]; - TermsEnum allTerms = source.createTermsEnum(maxPublishedPointer); - - for (int termID = 0; termID < numTerms; termID++) { - allTerms.seekExact(termID); - PostingsEnum docsEnum = allTerms.postings(null); - while (docsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - numPostings[termID] += docsEnum.freq(); - } - } - - return numPostings; - } - - @Override - public int getNumDocs() { - return numDocs; - } - - @Override - public int getSumTotalTermFreq() { - return sumTotalTermFreq; - } - - @Override - public int getSumTermDocFreq() { - return sumTermDocFreq; - } - - public OptimizedPostingLists getPostingLists() { - Preconditions.checkState(hasPostingLists()); - return postingLists; - } - - int getPostingListPointer(int termID) { - Preconditions.checkState(hasPostingLists()); - return (int) postingListPointers.get(termID); - } - - int getNumPostings(int termID) { - Preconditions.checkState(hasPostingLists()); - return (int) numPostings.get(termID); - } - - public boolean getTerm(int termID, BytesRef text, BytesRef termPayload) { - return dictionary.getTerm(termID, text, termPayload); - } - - @Override - public FacetLabelAccessor getLabelAccessor() { - return new FacetLabelAccessor() { - @Override - protected boolean seek(long termID) { - if (termID != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - hasTermPayload = getTerm((int) termID, termRef, termPayload); - offensiveCount = offensiveCounters != null - ? 
(int) offensiveCounters.get((int) termID) : 0; - return true; - } else { - return false; - } - } - }; - } - - @Override - public Terms createTerms(int maxPublishedPointer) { - return new OptimizedIndexTerms(this); - } - - @Override - public TermsEnum createTermsEnum(int maxPublishedPointer) { - return dictionary.createTermsEnum(this); - } - - @Override - public int lookupTerm(BytesRef term) throws IOException { - return dictionary.lookupTerm(term); - } - - @Override - public int getLargestDocIDForTerm(int termID) throws IOException { - Preconditions.checkState(hasPostingLists()); - if (termID == EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - return EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND; - } else { - return postingLists.getLargestDocID((int) postingListPointers.get(termID), - (int) numPostings.get(termID)); - } - } - - @Override - public int getDF(int termID) { - return (int) numPostings.get(termID); - } - - @Override - public int getNumTerms() { - return dictionary.getNumTerms(); - } - - @Override - public void getTerm(int termID, BytesRef text) { - dictionary.getTerm(termID, text, null); - } - - @VisibleForTesting TermDictionary getTermDictionary() { - return dictionary; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public boolean hasPostingLists() { - return postingListPointers != null - && postingLists != null - && numPostings != null; - } - - @VisibleForTesting - OptimizedPostingLists getOptimizedPostingLists() { - return postingLists; - } - - public static class FlushHandler extends Flushable.Handler { - private static final String NUM_DOCS_PROP_NAME = "numDocs"; - private static final String SUM_TOTAL_TERM_FREQ_PROP_NAME = "sumTotalTermFreq"; - private static final String SUM_TERM_DOC_FREQ_PROP_NAME = "sumTermDocFreq"; - private static final String USE_MIN_PERFECT_HASH_PROP_NAME = "useMinimumPerfectHashFunction"; - private static final String SKIP_POSTING_LIST_PROP_NAME = 
"skipPostingLists"; - private static final String HAS_OFFENSIVE_COUNTERS_PROP_NAME = "hasOffensiveCounters"; - public static final String IS_OPTIMIZED_PROP_NAME = "isOptimized"; - - private final EarlybirdFieldType fieldType; - - public FlushHandler(EarlybirdFieldType fieldType) { - super(); - this.fieldType = fieldType; - } - - public FlushHandler(OptimizedMemoryIndex objectToFlush) { - super(objectToFlush); - fieldType = objectToFlush.fieldType; - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - long startTime = getClock().nowMillis(); - OptimizedMemoryIndex objectToFlush = getObjectToFlush(); - boolean useHashFunction = objectToFlush.dictionary instanceof MPHTermDictionary; - boolean skipPostingLists = !objectToFlush.hasPostingLists(); - - flushInfo.addIntProperty(NUM_DOCS_PROP_NAME, objectToFlush.numDocs); - flushInfo.addIntProperty(SUM_TERM_DOC_FREQ_PROP_NAME, objectToFlush.sumTermDocFreq); - flushInfo.addIntProperty(SUM_TOTAL_TERM_FREQ_PROP_NAME, objectToFlush.sumTotalTermFreq); - flushInfo.addBooleanProperty(USE_MIN_PERFECT_HASH_PROP_NAME, useHashFunction); - flushInfo.addBooleanProperty(SKIP_POSTING_LIST_PROP_NAME, skipPostingLists); - flushInfo.addBooleanProperty(HAS_OFFENSIVE_COUNTERS_PROP_NAME, - objectToFlush.offensiveCounters != null); - flushInfo.addBooleanProperty(IS_OPTIMIZED_PROP_NAME, true); - - if (!skipPostingLists) { - out.writePackedInts(objectToFlush.postingListPointers); - out.writePackedInts(objectToFlush.numPostings); - } - if (objectToFlush.offensiveCounters != null) { - out.writePackedInts(objectToFlush.offensiveCounters); - } - - if (!skipPostingLists) { - objectToFlush.postingLists.getFlushHandler().flush( - flushInfo.newSubProperties("postingLists"), out); - } - objectToFlush.dictionary.getFlushHandler().flush(flushInfo.newSubProperties("dictionary"), - out); - getFlushTimerStats().timerIncrement(getClock().nowMillis() - startTime); - } - - @Override - protected 
OptimizedMemoryIndex doLoad( - FlushInfo flushInfo, DataDeserializer in) throws IOException { - long startTime = getClock().nowMillis(); - boolean useHashFunction = flushInfo.getBooleanProperty(USE_MIN_PERFECT_HASH_PROP_NAME); - boolean skipPostingLists = flushInfo.getBooleanProperty(SKIP_POSTING_LIST_PROP_NAME); - - PackedInts.Reader postingListPointers = skipPostingLists ? null : in.readPackedInts(); - PackedInts.Reader numPostings = skipPostingLists ? null : in.readPackedInts(); - PackedInts.Reader offensiveCounters = - flushInfo.getBooleanProperty(HAS_OFFENSIVE_COUNTERS_PROP_NAME) - ? in.readPackedInts() : null; - - MultiPostingLists postingLists = skipPostingLists ? null - : (new MultiPostingLists.FlushHandler()) - .load(flushInfo.getSubProperties("postingLists"), in); - - TermDictionary dictionary; - if (useHashFunction) { - dictionary = (new MPHTermDictionary.FlushHandler(TermPointerEncoding.DEFAULT_ENCODING)) - .load(flushInfo.getSubProperties("dictionary"), in); - } else { - dictionary = (new FSTTermDictionary.FlushHandler(TermPointerEncoding.DEFAULT_ENCODING)) - .load(flushInfo.getSubProperties("dictionary"), in); - } - getLoadTimerStats().timerIncrement(getClock().nowMillis() - startTime); - - return new OptimizedMemoryIndex(fieldType, - flushInfo.getIntProperty(NUM_DOCS_PROP_NAME), - flushInfo.getIntProperty(SUM_TERM_DOC_FREQ_PROP_NAME), - flushInfo.getIntProperty(SUM_TOTAL_TERM_FREQ_PROP_NAME), - numPostings, - postingListPointers, - offensiveCounters, - postingLists, - dictionary); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.docx new file mode 100644 index 000000000..1a94b0e24 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.java 
b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.java deleted file mode 100644 index 3bec3505b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizedPostingLists.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; - -import org.apache.lucene.index.PostingsEnum; - -import com.twitter.search.common.util.io.flushable.Flushable; - -public abstract class OptimizedPostingLists implements Flushable { - static final int MAX_DOC_ID_BIT = 24; - static final int MAX_DOC_ID = (1 << MAX_DOC_ID_BIT) - 1; - - static final int MAX_POSITION_BIT = 31; - - static final int MAX_FREQ_BIT = 31; - - /** - * Copies the given posting list into these posting lists. - * - * @param postingsEnum enumerator of the posting list that needs to be copied - * @param numPostings number of postings in the posting list that needs to be copied - * @return position index of the head of the copied posting list in these posting lists instance - */ - public abstract int copyPostingList(PostingsEnum postingsEnum, int numPostings) - throws IOException; - - /** - * Create and return a postings doc enumerator or doc-position enumerator based on input flag. - * - * @see org.apache.lucene.index.PostingsEnum - */ - public abstract EarlybirdPostingsEnum postings(int postingListPointer, int numPostings, int flags) - throws IOException; - - /** - * Returns the largest docID contained in the posting list pointed by {@code postingListPointer}. 
- */ - public final int getLargestDocID(int postingListPointer, int numPostings) throws IOException { - return postings(postingListPointer, numPostings, PostingsEnum.NONE).getLargestDocID(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.docx new file mode 100644 index 000000000..7821d3615 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.java b/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.java deleted file mode 100644 index a06637c1b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/OptimizingPostingsEnumWrapper.java +++ /dev/null @@ -1,128 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -/** - * A PostingsEnum that maps doc IDs in one DocIDToTweetIDMapper instance to doc IDs in another - * DocIDToTweetIDMapper. - * - * Unoptimized segments can use any DocIDToTweetIDMapper they want, which means that there are no - * guarantees on the distribution of the doc IDs in this mapper. However, optimized segments must - * use an OptimizedTweetIDMapper: we want to assign sequential doc IDs and use delta encondings in - * order to save space. 
So when an Earlybird segment needs to be optimized, we might need to convert - * the doc ID space of the unoptimized tweet ID mapper to the doc ID space of the optimized mapper. - * However, once we do this, the doc IDs stored in the posting lists in that segment will no longer - * be valid, unless we remap them too. So the goal of this class is to provide a way to do that. - * - * When we want to optimize a posting list, we need to traverse it and pack it. This class provides - * a wrapper around the original posting list that does the doc ID remapping at traversal time. - */ -public class OptimizingPostingsEnumWrapper extends PostingsEnum { - private final List docIds = Lists.newArrayList(); - private final Map> positions = Maps.newHashMap(); - - private int docIdIndex = -1; - private int positionIndex = -1; - - public OptimizingPostingsEnumWrapper(PostingsEnum source, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper newTweetIdMapper) throws IOException { - int docId; - while ((docId = source.nextDoc()) != NO_MORE_DOCS) { - long tweetId = originalTweetIdMapper.getTweetID(docId); - int newDocId = newTweetIdMapper.getDocID(tweetId); - Preconditions.checkState(newDocId != DocIDToTweetIDMapper.ID_NOT_FOUND, - "Did not find a mapping in the new tweet ID mapper for tweet ID %s, doc ID %s", - tweetId, docId); - - docIds.add(newDocId); - List docPositions = Lists.newArrayListWithCapacity(source.freq()); - positions.put(newDocId, docPositions); - for (int i = 0; i < source.freq(); ++i) { - docPositions.add(source.nextPosition()); - } - } - Collections.sort(docIds); - } - - @Override - public int nextDoc() { - ++docIdIndex; - if (docIdIndex >= docIds.size()) { - return NO_MORE_DOCS; - } - - positionIndex = -1; - return docIds.get(docIdIndex); - } - - @Override - public int freq() { - Preconditions.checkState(docIdIndex >= 0, "freq() called before nextDoc()."); - Preconditions.checkState(docIdIndex < docIds.size(), - "freq() called after nextDoc() 
returned NO_MORE_DOCS."); - return positions.get(docIds.get(docIdIndex)).size(); - } - - @Override - public int nextPosition() { - Preconditions.checkState(docIdIndex >= 0, "nextPosition() called before nextDoc()."); - Preconditions.checkState(docIdIndex < docIds.size(), - "nextPosition() called after nextDoc() returned NO_MORE_DOCS."); - - ++positionIndex; - Preconditions.checkState(positionIndex < positions.get(docIds.get(docIdIndex)).size(), - "nextPosition() called more than freq() times."); - return positions.get(docIds.get(docIdIndex)).get(positionIndex); - } - - // All other methods are not supported. - - @Override - public int advance(int target) { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.advance() is not supported."); - } - - @Override - public long cost() { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.cost() is not supported."); - } - - @Override - public int docID() { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.docID() is not supported."); - } - - @Override - public int endOffset() { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.endOffset() is not supported."); - } - - @Override - public BytesRef getPayload() { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.getPayload() is not supported."); - } - - @Override - public int startOffset() { - throw new UnsupportedOperationException( - "OptimizingPostingsEnumWrapper.startOffset() is not supported."); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.docx new file mode 100644 index 000000000..90729041d Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.docx differ diff --git 
a/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.java b/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.java deleted file mode 100644 index 3ea8d3480..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/PackedLongsReaderPreComputedValues.java +++ /dev/null @@ -1,202 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -/** - * Pre-computed shifts, mask, and start int indices used by - * {@link IntBlockPoolPackedLongsReader} to decode packed values from - * {@link IntBlockPool}. - * - * The purpose of this class is for decoding efficiency and speed. This class is thread-safe since - * all its usages are read-only. - * - * Packed ints are stored from LOWEST bits for HIGHEST bits in an int. - * - * Here are 3 different situations when a packed value spans 1, 2, and 3 ints: - * - * - A packed value spans 1 int: - * [High Bits ................................. Low Bits] - * int[n] = [possible_other_data|packed_value|possible_other_data] - * - * To decode, 1 shift right and 1 mask are needed: - * * shift - {@link #allLowBitsRightShift} - * * mask - dynamically computed based on bitsPerValue (in decoded slice). - * - * - A packed value spans 2 ints: - * The data is stored as: - * [High Bits .................. Low Bits] - * int[n] = [low_bits_of_packed_value | other_data] - * int[n+1] = [other_data| high_bits_of_packed_value] - * - * To decode, 1 shift right, 1 shift left, and 2 masks are needed: - * * 1 shift right {@link #allLowBitsRightShift} and 1 mask (computed on the fly) to compute - * low_bits_of_packed_value - * * 1 mask {@link #allMiddleBitsMask} and 1 shift left {@link #allMiddleBitsLeftShift} to - * compute high_bits_of_packed_value - * * 1 OR to combine `high_bits_of_packed_value | low_bits_of_packed_value` - * - * - A packed value spans 3 ints: - * The data is stored as: - * [High Bits .................. 
Low Bits] - * int[n] = [low_bits_of_packed_value | other_data] - * int[n+1] = [ ... middle_bits_of_packed_value ... ] - * int[n+2] = [other_data| high_bits_of_packed_value] - * - * To decode, 1 shift right, 2 shift left, and 3 masks are needed: - * * 1 shift right {@link #allLowBitsRightShift} and 1 mask (computed on the fly) to compute - * low_bits_of_packed_value - * * 1 shift left {@link #allMiddleBitsLeftShift} and 1 mask {@link #allMiddleBitsMask} to - * compute middle_bits_of_data - * * 1 shift left {@link #allHighBitsLeftShift} and 1 mask {@link #allHighBitsMask} to compute - * high_bits_of_data - * * 1 OR to combine `low_bits_of_data | middle_bits_of_data | high_bits_of_data` - * - * Example usage: - * @see HighDFPackedIntsDocsEnum - * @see HighDFPackedIntsDocsAndPositionsEnum - */ -public final class PackedLongsReaderPreComputedValues { - private final int[][] allLowBitsRightShift; - private final int[][] allMiddleBitsLeftShift; - private final int[][] allMiddleBitsMask; - private final int[][] allHighBitsLeftShift; - private final int[][] allHighBitsMask; - - /** - * 2D int arrays containing pre-computed start int indices; the 2 dimensions are - * int[numBitsPerPackedValue][packedValueIndex]. - * - * For a given number bits per packed value and a given packed value index, this is the first - * int in the subsequent of ints that contains the packed value with the given packed value index. - */ - private final int[][] allStartIntIndices; - - /** - * Sole constructor. 
- * - * @param maxBitsPerValue max possible number of bits of packed values that will be decoded - * @param maxNumValues max number of values are encoded back to back - * @param maxNumInts max number of ints are used to store packed values - * @param needStartIntIndex for optimization: whether start int indices are needed - */ - PackedLongsReaderPreComputedValues( - int maxBitsPerValue, - int maxNumValues, - int maxNumInts, - boolean needStartIntIndex) { - assert maxBitsPerValue <= Long.SIZE; - - if (needStartIntIndex) { - this.allStartIntIndices = new int[maxBitsPerValue + 1][maxNumValues]; - } else { - this.allStartIntIndices = null; - } - - this.allLowBitsRightShift = new int[maxBitsPerValue + 1][maxNumValues]; - this.allMiddleBitsLeftShift = new int[maxBitsPerValue + 1][maxNumValues]; - this.allMiddleBitsMask = new int[maxBitsPerValue + 1][maxNumValues]; - - // Packed value could use up 2 ints. - if (maxBitsPerValue > Integer.SIZE) { - this.allHighBitsLeftShift = new int[maxBitsPerValue + 1][maxNumValues]; - this.allHighBitsMask = new int[maxBitsPerValue + 1][maxNumValues]; - } else { - this.allHighBitsLeftShift = null; - this.allHighBitsMask = null; - } - - compute(maxBitsPerValue, maxNumValues, maxNumInts); - } - - /** - * Compute masks, shifts and start indices. - */ - private void compute(int maxBitsPerValue, int maxNumValues, int maxNumInts) { - // For each possible bits per packed value. - for (int bitsPerPackedValue = 0; bitsPerPackedValue <= maxBitsPerValue; bitsPerPackedValue++) { - int[] startIntIndices = - allStartIntIndices != null ? allStartIntIndices[bitsPerPackedValue] : null; - int[] lowBitsRightShift = - allLowBitsRightShift[bitsPerPackedValue]; - int[] middleBitsLeftShift = - allMiddleBitsLeftShift[bitsPerPackedValue]; - int[] middleBitsMask = - allMiddleBitsMask[bitsPerPackedValue]; - int[] highBitsLeftShift = - allHighBitsLeftShift != null ? allHighBitsLeftShift[bitsPerPackedValue] : null; - int[] highBitsMask = - allHighBitsMask != null ? 
allHighBitsMask[bitsPerPackedValue] : null; - - int shift = 0; - int currentIntIndex = 0; - int bitsRead; - int bitsRemaining; - - // For each packed value. - for (int packedValueIndex = 0; packedValueIndex < maxNumValues; packedValueIndex++) { - if (startIntIndices != null) { - startIntIndices[packedValueIndex] = currentIntIndex; - } - // Packed value spans to the 1st int. - lowBitsRightShift[packedValueIndex] = shift; - bitsRead = Integer.SIZE - shift; - bitsRemaining = bitsPerPackedValue - bitsRead; - - if (bitsRemaining >= 0) { - // Packed value spans to the 2nd int. - currentIntIndex++; - if (currentIntIndex == maxNumInts) { - break; - } - middleBitsLeftShift[packedValueIndex] = bitsRead; - middleBitsMask[packedValueIndex] = - bitsRemaining >= Integer.SIZE ? 0xFFFFFFFF : (1 << bitsRemaining) - 1; - - // Packed value spans to the 3rd int. - bitsRead += Integer.SIZE; - bitsRemaining -= Integer.SIZE; - if (bitsRemaining >= 0) { - currentIntIndex++; - if (currentIntIndex == maxNumInts) { - break; - } - assert highBitsLeftShift != null; - assert highBitsMask != null; - highBitsLeftShift[packedValueIndex] = bitsRead; - highBitsMask[packedValueIndex] = - bitsRemaining >= Integer.SIZE ? 0xFFFFFFFF : (1 << bitsRemaining) - 1; - } - } - - shift += bitsPerPackedValue; - shift = shift % Integer.SIZE; - } - } - } - - /******************************************************************** - * Getters of Pre-computed Values: returns should NEVER be modified * - ********************************************************************/ - - int[] getStartIntIndices(int numBitsPerValue) { - return allStartIntIndices == null ? 
null : allStartIntIndices[numBitsPerValue]; - } - - int[] getLowBitsRightShift(int numBitsPerValue) { - return allLowBitsRightShift[numBitsPerValue]; - } - - int[] getMiddleBitsLeftShift(int numBitsPerValue) { - return allMiddleBitsLeftShift[numBitsPerValue]; - } - - int[] getMiddleBitsMask(int numBitsPerValue) { - return allMiddleBitsMask[numBitsPerValue]; - } - - int[] getHighBitsLeftShift(int numBitsPerValue) { - return allHighBitsLeftShift == null ? null : allHighBitsLeftShift[numBitsPerValue]; - } - - int[] getHighBitsMask(int numBitsPerValue) { - return allHighBitsMask == null ? null : allHighBitsMask[numBitsPerValue]; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.docx new file mode 100644 index 000000000..a3d273b6a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.java b/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.java deleted file mode 100644 index f7addebbb..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/PayloadUtil.java +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import org.apache.lucene.util.BytesRef; - -/** - * Utilities for encoding and decoding BytesRefs into ints. The encoding is: - * [0..n] n bytes big-endian decoded into integers. - * n: number of bytes. - * - * Example: - * encode([DE, AD, BE, EF, AB]) => [0xDEADBEEF, 0xAB000000, 5] - * - * It's necessary to store the length at the end instead of the start so that we can know how far to - * jump backward from a skiplist entry. We can't store it after the skip list entry because there - * can be a variable number of pointers after the skip list entry. 
- * - * An example skip list entry, with labels on the following line: - * [0xDEADBEEF, 12, 654, 0x877, 0x78879] - * [ payload, position, docID, level0Pointer, level1Pointer] - */ -public final class PayloadUtil { - private PayloadUtil() { - } - - public static final int[] EMPTY_PAYLOAD = new int[]{0}; - - /** - * Encodes a {@link BytesRef} into an int array (to be inserted into a - * {@link IntBlockPool}. The encoder considers the input to be big-endian encoded ints. - */ - public static int[] encodePayload(BytesRef payload) { - if (payload == null) { - return EMPTY_PAYLOAD; - } - - int intsInPayload = intsForBytes(payload.length); - - int[] arr = new int[1 + intsInPayload]; - - for (int i = 0; i < intsInPayload; i++) { - int n = 0; - for (int j = 0; j < 4; j++) { - int index = i * 4 + j; - int b; - if (index < payload.length) { - // mask off the top bits in case b is negative. - b = payload.bytes[index] & 0xFF; - } else { - b = 0; - } - n = n << 8 | b; - } - - arr[i] = n; - } - - arr[intsInPayload] = payload.length; - - return arr; - } - - /** - * Decodes a {@link IntBlockPool} and position into a {@link BytesRef}. The ints are - * converted into big-endian encoded bytes. 
- */ - public static BytesRef decodePayload( - IntBlockPool b, - int pointer) { - int length = b.get(pointer); - BytesRef bytesRef = new BytesRef(length); - bytesRef.length = length; - - int numInts = intsForBytes(length); - - for (int i = 0; i < numInts; i++) { - int n = b.get(pointer - numInts + i); - for (int j = 0; j < 4; j++) { - int byteIndex = 4 * i + j; - if (byteIndex < length) { - bytesRef.bytes[byteIndex] = (byte) (n >> 8 * (3 - byteIndex % 4)); - } - } - } - - return bytesRef; - } - - private static int intsForBytes(int byteCount) { - return (byteCount + 3) / 4; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.docx new file mode 100644 index 000000000..54bc31366 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.java b/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.java deleted file mode 100644 index 51ffbbe0c..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/PostingsBufferQueue.java +++ /dev/null @@ -1,155 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.util.NoSuchElementException; - -import com.google.common.annotations.VisibleForTesting; - -/** - * A posting buffer used by {@link HighDFPackedIntsPostingLists} while copying over posting list. - */ -final class PostingsBufferQueue { - /** - * Mask used to convert an int to a long. We cannot just cast because doing so will fill in the - * higher 32 bits with the sign bit, but we need the higher 32 bits to be 0 instead. - */ - static final long LONG_MASK = (1L << 32) - 1; - - /** - * A circular FIFO long queue used internally to store posting. 
- * @see #postingsQueue - */ - @VisibleForTesting - static final class Queue { - private final long[] queue; - private int head = 0; - private int tail = 0; - private int size; - - Queue(int maxSize) { - this.queue = new long[maxSize < 2 ? 2 : maxSize]; - } - - boolean isEmpty() { - return size() == 0; - } - - boolean isFull() { - return size() == queue.length; - } - - void offer(long value) { - if (size() == queue.length) { - throw new IllegalStateException("Queue is full"); - } - queue[tail] = value; - tail = (tail + 1) % queue.length; - size++; - } - - long poll() { - if (isEmpty()) { - throw new NoSuchElementException("Queue is empty."); - } - long value = queue[head]; - head = (head + 1) % queue.length; - size--; - return value; - } - - int size() { - return size; - } - } - - /** - * Internal posting queue. - */ - private final Queue postingsQueue; - - /** - * Constructor with max size. - * - * @param maxSize max size of this buffer. - */ - PostingsBufferQueue(int maxSize) { - this.postingsQueue = new Queue(maxSize); - } - - /** - * Check if the buffer is empty. - * - * @return If this buffer is empty - */ - boolean isEmpty() { - return postingsQueue.isEmpty(); - } - - /** - * Check if the buffer is full. - * - * @return If this buffer is full - */ - boolean isFull() { - return postingsQueue.isFull(); - } - - /** - * Get the current size of this buffer. - * - * @return Current size of this buffer - */ - int size() { - return postingsQueue.size(); - } - - /** - * Store a posting with docID and a second value that could be freq, position, or any additional - * info. This method will encode the offered doc ID and second value with - * {@link #encodePosting(int, int)}. - * - * @param docID doc ID of the posting - * @param secondValue an additional value of the posting - */ - void offer(int docID, int secondValue) { - postingsQueue.offer(encodePosting(docID, secondValue)); - } - - /** - * Remove and return the earliest inserted posting, this is a FIFO queue. 
- * - * @return the earliest inserted posting. - */ - long poll() { - return postingsQueue.poll(); - } - - /** - * Encode a doc ID and a second value, both are ints, into a long. The higher 32 bits store the - * doc ID and lower 32 bits store the second value. - * - * @param docID an int specifying doc ID of the posting - * @param secondValue an int specifying the second value of the posting - * @return an encoded long represent the posting - */ - private static long encodePosting(int docID, int secondValue) { - return ((LONG_MASK & docID) << 32) | (LONG_MASK & secondValue); - } - - /** - * Decode doc ID from the given posting. - * @param posting a given posting encoded with {@link #encodePosting(int, int)} - * @return the doc ID of the given posting. - */ - static int getDocID(long posting) { - return (int) (posting >> 32); - } - - /** - * Decode the second value from the given posting. - * @param posting a given posting encoded with {@link #encodePosting(int, int)} - * @return the second value of the given posting. 
- */ - static int getSecondValue(long posting) { - return (int) posting; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.docx new file mode 100644 index 000000000..b5c74e5f7 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.java b/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.java deleted file mode 100644 index 5918cd93b..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/QueryCostTracker.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import org.apache.lucene.util.CloseableThreadLocal; - -import com.twitter.search.common.search.QueryCostProvider; - -public class QueryCostTracker implements QueryCostProvider { - public static enum CostType { - // For the realtime segment we track how many posting list blocks - // are accessed during the lifetime of one query. 
- LOAD_REALTIME_POSTING_BLOCK(1), - - // Number of optimized posting list blocks - LOAD_OPTIMIZED_POSTING_BLOCK(1); - - private final double cost; - - private CostType(double cost) { - this.cost = cost; - } - } - - private static final CloseableThreadLocal TRACKERS - = new CloseableThreadLocal() { - @Override protected QueryCostTracker initialValue() { - return new QueryCostTracker(); - } - }; - - public static QueryCostTracker getTracker() { - return TRACKERS.get(); - } - - private double totalCost; - - public void track(CostType costType) { - totalCost += costType.cost; - } - - public void reset() { - totalCost = 0; - } - - @Override - public double getTotalCost() { - return totalCost; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.docx new file mode 100644 index 000000000..5dd87c5a3 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.java b/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.java deleted file mode 100644 index 7f6c60b97..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/RealtimeIndexTerms.java +++ /dev/null @@ -1,365 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.util.Iterator; -import java.util.TreeSet; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.BaseTermsEnum; -import org.apache.lucene.index.ImpactsEnum; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SlowImpactsEnum; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.hashtable.HashTable; -import com.twitter.search.common.metrics.SearchCounter; -import 
com.twitter.search.common.util.hash.KeysSource; - -public class RealtimeIndexTerms extends Terms { - // Calling InMemoryTermsEnum.next() creates a full copy of the entire term dictionary, and can - // be quite expensive. We don't expect these calls to happen, and they shpould not happen on the - // regular read path. We stat them here just in case to see if there is any unexpected usage. - private static final SearchCounter TERMS_ENUM_NEXT_CALLS = - SearchCounter.export("in_memory_terms_enum_next_calls"); - private static final SearchCounter TERMS_ENUM_CREATE_TERM_SET = - SearchCounter.export("in_memory_terms_enum_next_create_term_set"); - private static final SearchCounter TERMS_ENUM_CREATE_TERM_SET_SIZE = - SearchCounter.export("in_memory_terms_enum_next_create_term_set_size"); - - private final InvertedRealtimeIndex index; - private final int maxPublishedPointer; - - public RealtimeIndexTerms(InvertedRealtimeIndex index, int maxPublishedPointer) { - this.index = index; - this.maxPublishedPointer = maxPublishedPointer; - } - - @Override - public long size() { - return index.getNumTerms(); - } - - @Override - public TermsEnum iterator() { - return index.createTermsEnum(maxPublishedPointer); - } - - /** - * This TermsEnum use a tree set to support {@link TermsEnum#next()} method. However, this is not - * efficient enough to support realtime operation. {@link TermsEnum#seekCeil} is not fully - * supported in this termEnum. 
- */ - public static class InMemoryTermsEnum extends BaseTermsEnum { - private final InvertedRealtimeIndex index; - private final int maxPublishedPointer; - private int termID = -1; - private BytesRef bytesRef = new BytesRef(); - private Iterator termIter; - private TreeSet termSet; - - public InMemoryTermsEnum(InvertedRealtimeIndex index, int maxPublishedPointer) { - this.index = index; - this.maxPublishedPointer = maxPublishedPointer; - termIter = null; - } - - @Override - public int docFreq() { - return index.getDF(termID); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) { - int postingsPointer = index.getPostingListPointer(termID); - return index.getPostingList().postings(postingsPointer, docFreq(), maxPublishedPointer); - } - - @Override - public ImpactsEnum impacts(int flags) { - return new SlowImpactsEnum(postings(null, flags)); - } - - @Override - public SeekStatus seekCeil(BytesRef text) { - // Nullify termIter. - termIter = null; - - termID = index.lookupTerm(text); - - if (termID == -1) { - return SeekStatus.END; - } else { - index.getTerm(termID, bytesRef); - return SeekStatus.FOUND; - } - } - - @Override - public BytesRef next() { - TERMS_ENUM_NEXT_CALLS.increment(); - if (termSet == null) { - termSet = new TreeSet<>(); - KeysSource keysource = index.getKeysSource(); - keysource.rewind(); - int numTerms = keysource.getNumberOfKeys(); - for (int i = 0; i < numTerms; ++i) { - BytesRef ref = keysource.nextKey(); - // we need to clone the ref since the keysource is reusing the returned BytesRef - // instance and we are storing it - termSet.add(ref.clone()); - } - TERMS_ENUM_CREATE_TERM_SET.increment(); - TERMS_ENUM_CREATE_TERM_SET_SIZE.add(numTerms); - } - - // Construct termIter from the subset. 
- if (termIter == null) { - termIter = termSet.tailSet(bytesRef, true).iterator(); - } - - if (termIter.hasNext()) { - bytesRef = termIter.next(); - termID = index.lookupTerm(bytesRef); - } else { - termID = -1; - bytesRef = null; - } - return bytesRef; - } - - @Override - public long ord() { - return termID; - } - - @Override - public void seekExact(long ord) { - // Nullify termIter. - termIter = null; - - if (ord < index.getNumTerms()) { - termID = (int) ord; - index.getTerm(termID, bytesRef); - } - } - - @Override - public BytesRef term() { - return bytesRef; - } - - @Override - public long totalTermFreq() { - return docFreq(); - } - } - - /** - * This TermsEnum use a {@link SkipListContainer} backed termsSkipList provided by - * {@link InvertedRealtimeIndex} to supported ordered terms operations like - * {@link TermsEnum#next()} and {@link TermsEnum#seekCeil}. - */ - public static class SkipListInMemoryTermsEnum extends BaseTermsEnum { - private final InvertedRealtimeIndex index; - - private int termID = -1; - private BytesRef bytesRef = new BytesRef(); - private int nextTermIDPointer; - - /** - * {@link #nextTermIDPointer} is used to record pointer to next termsID to accelerate - * {@link #next}. However, {@link #seekCeil} and {@link #seekExact} may jump to an arbitrary - * term so the {@link #nextTermIDPointer} may not be correct, and this flag is used to check if - * this happens. If this flag is false, {@link #correctNextTermIDPointer} should be called to - * correct the value. - */ - private boolean isNextTermIDPointerCorrect; - - private final SkipListContainer termsSkipList; - private final InvertedRealtimeIndex.TermsSkipListComparator termsSkipListComparator; - private final int maxPublishedPointer; - - /** - * Creates a new {@link TermsEnum} for a skip list-based sorted real-time term dictionary. 
- */ - public SkipListInMemoryTermsEnum(InvertedRealtimeIndex index, int maxPublishedPointer) { - Preconditions.checkNotNull(index.getTermsSkipList()); - - this.index = index; - this.termsSkipList = index.getTermsSkipList(); - - // Each Terms Enum shall have their own comparators to be thread safe. - this.termsSkipListComparator = - new InvertedRealtimeIndex.TermsSkipListComparator(index); - this.nextTermIDPointer = - termsSkipList.getNextPointer(SkipListContainer.FIRST_LIST_HEAD); - this.isNextTermIDPointerCorrect = true; - this.maxPublishedPointer = maxPublishedPointer; - } - - @Override - public int docFreq() { - return index.getDF(termID); - } - - @Override - public PostingsEnum postings(PostingsEnum reuse, int flags) { - int postingsPointer = index.getPostingListPointer(termID); - return index.getPostingList().postings(postingsPointer, docFreq(), maxPublishedPointer); - } - - @Override - public ImpactsEnum impacts(int flags) { - return new SlowImpactsEnum(postings(null, flags)); - } - - @Override - public SeekStatus seekCeil(BytesRef text) { - // Next term pointer is not correct anymore since seek ceil - // will jump to an arbitrary term. - isNextTermIDPointerCorrect = false; - - // Doing precise lookup first. - termID = index.lookupTerm(text); - - // Doing ceil lookup if not found, otherwise we are good. - if (termID == -1) { - return seekCeilWithSkipList(text); - } else { - index.getTerm(termID, bytesRef); - return SeekStatus.FOUND; - } - } - - /** - * Doing ceil terms search with terms skip list. - */ - private SeekStatus seekCeilWithSkipList(BytesRef text) { - int termIDPointer = termsSkipList.searchCeil(text, - SkipListContainer.FIRST_LIST_HEAD, - termsSkipListComparator, - null); - - // End reached but still cannot found a ceil term. 
- if (termIDPointer == SkipListContainer.FIRST_LIST_HEAD) { - termID = HashTable.EMPTY_SLOT; - return SeekStatus.END; - } - - termID = termsSkipList.getValue(termIDPointer); - - // Set next termID pointer and is correct flag. - nextTermIDPointer = termsSkipList.getNextPointer(termIDPointer); - isNextTermIDPointerCorrect = true; - - // Found a ceil term but not the precise match. - index.getTerm(termID, bytesRef); - return SeekStatus.NOT_FOUND; - } - - /** - * {@link #nextTermIDPointer} is used to record the pointer to next termID. This method is used - * to correct {@link #nextTermIDPointer} to correct value after {@link #seekCeil} or - * {@link #seekExact} dropped current term to arbitrary point. - */ - private void correctNextTermIDPointer() { - final int curTermIDPointer = termsSkipList.search( - bytesRef, - SkipListContainer.FIRST_LIST_HEAD, - termsSkipListComparator, - null); - // Must be able to find the exact term. - assert termID == HashTable.EMPTY_SLOT - || termID == termsSkipList.getValue(curTermIDPointer); - - nextTermIDPointer = termsSkipList.getNextPointer(curTermIDPointer); - isNextTermIDPointerCorrect = true; - } - - @Override - public BytesRef next() { - // Correct nextTermIDPointer first if not correct due to seekExact or seekCeil. - if (!isNextTermIDPointerCorrect) { - correctNextTermIDPointer(); - } - - // Skip list is exhausted. - if (nextTermIDPointer == SkipListContainer.FIRST_LIST_HEAD) { - termID = HashTable.EMPTY_SLOT; - return null; - } - - termID = termsSkipList.getValue(nextTermIDPointer); - - index.getTerm(termID, bytesRef); - - // Set next termID Pointer. 
- nextTermIDPointer = termsSkipList.getNextPointer(nextTermIDPointer); - return bytesRef; - } - - @Override - public long ord() { - return termID; - } - - @Override - public void seekExact(long ord) { - if (ord < index.getNumTerms()) { - termID = (int) ord; - index.getTerm(termID, bytesRef); - - // Next term pointer is not correct anymore since seek exact - // just jump to an arbitrary term. - isNextTermIDPointerCorrect = false; - } - } - - @Override - public BytesRef term() { - return bytesRef; - } - - @Override - public long totalTermFreq() { - return docFreq(); - } - } - - @Override - public long getSumTotalTermFreq() { - return index.getSumTotalTermFreq(); - } - - @Override - public long getSumDocFreq() { - return index.getSumTermDocFreq(); - } - - @Override - public int getDocCount() { - return index.getNumDocs(); - } - - @Override - public boolean hasFreqs() { - return true; - } - - @Override - public boolean hasOffsets() { - return false; - } - - @Override - public boolean hasPositions() { - return true; - } - - @Override - public boolean hasPayloads() { - return true; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.docx new file mode 100644 index 000000000..22868473a Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.java b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.java deleted file mode 100644 index 3c23de5d1..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListComparator.java +++ /dev/null @@ -1,43 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -/** - * Comparator interface for {@link SkipListContainer}, - * see sample implementation {@link SkipListIntegerComparator}. 
- * - * Notice: less/equal/greater here refer to the order precedence, instead of numerical value. - */ -public interface SkipListComparator { - - /** - * Determine the order between the given key and the key of the given targetValue. - * Notice, usually key of a value could be derived from the value along. - * - * Implementation of this method should consider sentinel value, see {@link #getSentinelValue()}. - * - * Can include position data (primarily for text posting lists). Position should be ignored if - * the skip list was constructed without positions enabled. - * - * @return negative, zero, or positive to indicate if first value is - * less than, equal to, or greater than the second value, respectively. - */ - int compareKeyWithValue(K key, int targetValue, int targetPosition); - - /** - * Determine the order of two given values based on their keys. - * Notice, usually key of a value could be derived from the value along. - * - * Implementation of this method should consider sentinel value, see {@link #getSentinelValue()}. - * - * @return negative, zero, or positive to indicate if first value is - * less than, equal to, or greater than the second value, respectively. - */ - int compareValues(int v1, int v2); - - /** - * Return a sentinel value, sentinel value should be considered by this comparator - * as an ADVISORY GREATEST value, which should NOT be actually inserted into the skip list. - * - * @return the sentinel value. 
- */ - int getSentinelValue(); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.docx new file mode 100644 index 000000000..342d53968 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.java b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.java deleted file mode 100644 index da4d1d001..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListContainer.java +++ /dev/null @@ -1,739 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import java.util.Random; - -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -import static com.twitter.search.core.earlybird.index.inverted.PayloadUtil.EMPTY_PAYLOAD; - -/** - * This is a skip list container implementation backed by {@link IntBlockPool}. - * - * Skip list is a data structure similar to linked list, but with a hierarchy of lists - * each skipping over fewer elements, and the bottom hierarchy does NOT skip any elements. - * @see Skip List Wikipedia - * - * This implementation is lock free and thread safe with ONE writer thread and MULTIPLE reader - * threads. - * - * This implementation could contain one or more skip lists, and they are all backed by - * the same {@link IntBlockPool}. - * - * Values are actually stored as integers; however search key is implemented as a generic type. 
- * Inserts of values that already exist are stored as subsequent elements. This is used to support - * positions and term frequency. - * - * Also reserve the integer after value to store next ordinal pointer information. We avoid storing - * pointers to the next element in the tower by allocating them contiguously. To descend the tower, - * we just increment the pointer. - * - * This skip list can also store positions as integers. It allocates them before it allocates the - * value (the value is a doc ID if we are using positions). This means that we can access the - * position by simply decrementing the value pointer. - * - * To understand how the skip list works, first understand how insert works, then the rest will be - * more comprehendable. - * - * A skip list will be implemented in a circle linked way: - * - the list head node will have the sentinel value, which is the advisory greatest value - * provided by comparator. - * - Real first value will be pointed by the list head node. - * - Real last value will point to the list head. - * - * Constraints: - * - Does NOT support negative value. - * - * Simple Viz: - * - * Empty list with max tower height 5. S = Sentinel value, I = Initial value. - * | s| 0| 0| 0| 0| 0| i| i| i| i| i| i| i| i| i| i| - * - * One possible situation after inserting 4, 6, 5. - * | s| 6| 6| 9| 0| 0| 4|13|13| 6| 0| 0| 0| 5| 9| 9| - */ -public class SkipListContainer implements Flushable { - /** - * The list head of first skip list in the container, this is for convenient usage, - * so application use only one skip list does not need to keep track of the list head. - */ - static final int FIRST_LIST_HEAD = 0; - - /** - * Initial value used when initialize int block pool. Notice -1 is not used here in order to give - * application more freedom because -1 is a special value when doing bit manipulations. - */ - static final int INITIAL_VALUE = -2; - - /** - * Maximum tower height of this skip list and chance to grow tower by level. 
- * - * Notice these two values could affect the memory usage and the performance. - * Ideally they should be calculated based on the potential size of the skip list. - * - * Given n is the number of elements in the skip list, the memory usage is in O(n). - * - * More precisely, - * - * the memory is mainly used for the following data: - * - * header_tower = O(maxTowerHeight + 1) - * value = O(n) - * next_pointers = O(n * (1 - growTowerChance^(maxTowerHeight + 1)) / (1 - growTowerChance)) - * - * thus, the total memory usage is in O(header_tower + value + next_pointers). - * - * Default value for maximum tower height and grow tower chance, these two numbers are chosen - * arbitrarily now. - */ - @VisibleForTesting - public static final int MAX_TOWER_HEIGHT = 10; - private static final float GROW_TOWER_CHANCE = 0.2f; - - public enum HasPositions { - YES, - NO - } - - public enum HasPayloads { - YES, - NO - } - - static final int INVALID_POSITION = -3; - - /** Memory barrier. */ - private volatile int maxPoolPointer; - - /** Actual storage data structure. */ - private final IntBlockPool blockPool; - - /** - * Default comparator used to determine the order between two given values or between one key and - * another value. - * - * Notice this comparator is shared by all threads using this skip list, so it is not thread safe - * if it is maintaining some states. However, {@link #search}, {@link #insert}, and - * {@link #searchCeil} support passed in comparator as a parameter, which should be thread safe if - * managed by the caller properly. - */ - private final SkipListComparator defaultComparator; - - /** Random generator used to decide if to grow tower by one level or not. */ - private final Random random = new Random(); - - /** - * Used by writer thread to record last pointers at each level. Notice it is ok to have it as an - * instance field because we would only have one writer thread. 
- */ - private final int[] lastPointers; - - /** - * Whether the skip list contains positions. Used for text fields. - */ - private final HasPositions hasPositions; - - private final HasPayloads hasPayloads; - - /** - * Creates a new probabilistic skip list, using the provided comparator to compare keys - * of type K. - * - * @param comparator a comparator used to compare integer values. - */ - public SkipListContainer( - SkipListComparator comparator, - HasPositions hasPositions, - HasPayloads hasPayloads, - String name - ) { - this(comparator, new IntBlockPool(INITIAL_VALUE, name), hasPositions, hasPayloads); - } - - /** - * Base constructor, also used by flush handler. - */ - private SkipListContainer( - SkipListComparator comparator, - IntBlockPool blockPool, - HasPositions hasPositions, - HasPayloads hasPayloads) { - // Sentinel value specified by the comparator cannot equal to INITIAL_VALUE. - Preconditions.checkArgument(comparator.getSentinelValue() != INITIAL_VALUE); - - this.defaultComparator = comparator; - this.lastPointers = new int[MAX_TOWER_HEIGHT]; - this.blockPool = blockPool; - this.hasPositions = hasPositions; - this.hasPayloads = hasPayloads; - } - - /** - * Search for the index of the greatest value which has key less than or equal to the given key. - * - * This is more like a floor search function. See {@link #searchCeil} for ceil search. - * - * @param key target key will be searched. - * @param skipListHead index of the header tower of the skip list will be searched. - * @param comparator comparator used for comparison when traversing through the skip list. - * @param searchFinger {@link SkipListSearchFinger} to accelerate search speed, - * notice the search finger must be before the key. - * @return the index of the greatest value which is less than or equal to given value, - * will return skipListHead if given value has no greater or equal values. 
- */ - public int search( - K key, - int skipListHead, - SkipListComparator comparator, - @Nullable SkipListSearchFinger searchFinger) { - assert comparator != null; - // Start at the header tower. - int currentPointer = skipListHead; - - // Instantiate nextPointer and nextValue outside of the for loop so we can use the value - // directly after for loop. - int nextPointer = getForwardPointer(currentPointer, MAX_TOWER_HEIGHT - 1); - int nextValue = getValue(nextPointer); - - // Top down traversal. - for (int currentLevel = MAX_TOWER_HEIGHT - 1; currentLevel >= 0; currentLevel--) { - nextPointer = getForwardPointer(currentPointer, currentLevel); - nextValue = getValue(nextPointer); - - // Jump to search finger at current level. - if (searchFinger != null) { - final int fingerPointer = searchFinger.getPointer(currentLevel); - assert searchFinger.isInitialPointer(fingerPointer) - || comparator.compareKeyWithValue(key, getValue(fingerPointer), INVALID_POSITION) >= 0; - - if (!searchFinger.isInitialPointer(fingerPointer) - && comparator.compareValues(getValue(fingerPointer), nextValue) >= 0) { - currentPointer = fingerPointer; - nextPointer = getForwardPointer(currentPointer, currentLevel); - nextValue = getValue(nextPointer); - } - } - - // Move forward. - while (comparator.compareKeyWithValue(key, nextValue, INVALID_POSITION) > 0) { - currentPointer = nextPointer; - - nextPointer = getForwardPointer(currentPointer, currentLevel); - nextValue = getValue(nextPointer); - } - - // Advance search finger. 
- if (searchFinger != null && currentPointer != skipListHead) { - final int currentValue = getValue(currentPointer); - final int fingerPointer = searchFinger.getPointer(currentLevel); - - if (searchFinger.isInitialPointer(fingerPointer) - || comparator.compareValues(currentValue, getValue(fingerPointer)) > 0) { - searchFinger.setPointer(currentLevel, currentPointer); - } - } - } - - // Return next pointer if next value matches searched value; otherwise return currentPointer. - return comparator.compareKeyWithValue(key, nextValue, INVALID_POSITION) == 0 - ? nextPointer : currentPointer; - } - - /** - * Perform search with {@link #defaultComparator}. - * Notice {@link #defaultComparator} is not thread safe if it is keeping some states. - */ - public int search(K key, int skipListHead, @Nullable SkipListSearchFinger searchFinger) { - return search(key, skipListHead, this.defaultComparator, searchFinger); - } - - /** - * Ceil search on given {@param key}. - * - * @param key target key will be searched. - * @param skipListHead index of the header tower of the skip list will be searched. - * @param comparator comparator used for comparison when traversing through the skip list. - * @param searchFinger {@link SkipListSearchFinger} to accelerate search speed. - * @return index of the smallest value with key greater or equal to the given key. - */ - public int searchCeil( - K key, - int skipListHead, - SkipListComparator comparator, - @Nullable SkipListSearchFinger searchFinger) { - assert comparator != null; - - // Perform regular search. - final int foundPointer = search(key, skipListHead, comparator, searchFinger); - - // Return foundPointer if it is not the list head and the pointed value has key equal to the - // given key; otherwise, return next pointer. 
- if (foundPointer != skipListHead - && comparator.compareKeyWithValue(key, getValue(foundPointer), INVALID_POSITION) == 0) { - return foundPointer; - } else { - return getNextPointer(foundPointer); - } - } - - /** - * Perform searchCeil with {@link #defaultComparator}. - * Notice {@link #defaultComparator} is not thread safe if it is keeping some states. - */ - public int searchCeil( - K key, int skipListHead, @Nullable SkipListSearchFinger searchFinger) { - return searchCeil(key, skipListHead, this.defaultComparator, searchFinger); - } - - /** - * Insert a new value into the skip list. - * - * Notice inserting supports duplicate keys and duplicate values. - * - * Duplicate keys with different values or positions will be inserted consecutively. - * Duplciate keys with identical values will be ignored, and the duplicate will not be stored in - * the posting list. - * - * @param key is the key of the given value. - * @param value is the value will be inserted, cannot be {@link #getSentinelValue()}. - * @param skipListHead index of the header tower of the skip list will accept the new value. - * @param comparator comparator used for comparison when traversing through the skip list. - * @return whether this value exists in the posting list. Note that this will return true even - * if it is a new position. - */ - public boolean insert(K key, int value, int position, int[] payload, int skipListHead, - SkipListComparator comparator) { - Preconditions.checkArgument(comparator != null); - Preconditions.checkArgument(value != getSentinelValue()); - - // Start at the header tower. - int currentPointer = skipListHead; - - // Initialize lastPointers. - for (int i = 0; i < MAX_TOWER_HEIGHT; i++) { - this.lastPointers[i] = INITIAL_VALUE; - } - int nextPointer = INITIAL_VALUE; - - // Top down traversal. 
- for (int currentLevel = MAX_TOWER_HEIGHT - 1; currentLevel >= 0; currentLevel--) { - nextPointer = getForwardPointer(currentPointer, currentLevel); - int nextValue = getValue(nextPointer); - - int nextPosition = getPosition(nextPointer); - while (comparator.compareKeyWithValue(key, nextValue, nextPosition) > 0) { - currentPointer = nextPointer; - - nextPointer = getForwardPointer(currentPointer, currentLevel); - nextValue = getValue(nextPointer); - nextPosition = getPosition(nextPointer); - } - - // Store last pointers. - lastPointers[currentLevel] = currentPointer; - } - - // we use isDuplicateValue to determine if a value already exists in a posting list (even if it - // is a new position). We need to check both current pointer and next pointer in case this is - // the largest position we have seen for this value in this skip list. In that case, nextPointer - // will point to a larger value, but we want to check the smaller one to see if it is the same - // value. For example, if we have [(1, 2), (2, 4)] and we want to insert (1, 3), then - // nextPointer will point to (2, 4), but we want to check the doc ID of (1, 2) to see if it has - // the same document ID. - boolean isDuplicateValue = getValue(currentPointer) == value || getValue(nextPointer) == value; - - if (comparator.compareKeyWithValue(key, getValue(nextPointer), getPosition(nextPointer)) != 0) { - if (hasPayloads == HasPayloads.YES) { - Preconditions.checkNotNull(payload); - // If this skip list has payloads, we store the payload immediately before the document ID - // and position (iff the position exists) in the block pool. We store payloads before - // positions because they are variable length, and reading past them would require knowing - // the size of the payload. We don't store payloads after the doc ID because we have a - // variable number of pointers after the doc ID, and we would have no idea where the - // pointers stop and the payload starts. 
- for (int n : payload) { - this.blockPool.add(n); - } - } - - if (hasPositions == HasPositions.YES) { - // If this skip list has positions, we store the position before the document ID in the - // block pool. - this.blockPool.add(position); - } - - // Insert value. - final int insertedPointer = this.blockPool.add(value); - - // Insert outgoing pointers. - final int height = getRandomTowerHeight(); - for (int currentLevel = 0; currentLevel < height; currentLevel++) { - this.blockPool.add(getForwardPointer(lastPointers[currentLevel], currentLevel)); - } - - this.sync(); - - // Update incoming pointers. - for (int currentLevel = 0; currentLevel < height; currentLevel++) { - setForwardPointer(lastPointers[currentLevel], currentLevel, insertedPointer); - } - - this.sync(); - } - - return isDuplicateValue; - } - - /** - * Delete a given key from skip list - * - * @param key the key of the given value - * @param skipListHead index of the header tower of the skip list will accept the new value - * @param comparator comparator used for comparison when traversing through the skip list - * @return smallest value in the container. Returns {@link #INITIAL_VALUE} if the - * key does not exist. - */ - public int delete(K key, int skipListHead, SkipListComparator comparator) { - boolean foundKey = false; - - for (int currentLevel = MAX_TOWER_HEIGHT - 1; currentLevel >= 0; currentLevel--) { - int currentPointer = skipListHead; - int nextValue = getValue(getForwardPointer(currentPointer, currentLevel)); - - // First we skip over all the nodes that are smaller than our key. - while (comparator.compareKeyWithValue(key, nextValue, INVALID_POSITION) > 0) { - currentPointer = getForwardPointer(currentPointer, currentLevel); - nextValue = getValue(getForwardPointer(currentPointer, currentLevel)); - } - - Preconditions.checkState(currentPointer != INITIAL_VALUE); - - // If we don't find the node at this level that's OK, keep searching on a lower one. 
- if (comparator.compareKeyWithValue(key, nextValue, INVALID_POSITION) != 0) { - continue; - } - - // We found an element to delete. - foundKey = true; - - // Otherwise, save the current pointer. Right now, current pointer points to the first element - // that has the same value as key. - int savedPointer = currentPointer; - - currentPointer = getForwardPointer(currentPointer, currentLevel); - // Then, walk over every element that is equal to the key. - while (comparator.compareKeyWithValue(key, getValue(currentPointer), INVALID_POSITION) == 0) { - currentPointer = getForwardPointer(currentPointer, currentLevel); - } - - // update the saved pointer to point to the first non-equal element of the skip list. - setForwardPointer(savedPointer, currentLevel, currentPointer); - } - - // Something has changed, need to sync up here. - if (foundKey) { - this.sync(); - // return smallest value, might be used as first postings later - return getSmallestValue(skipListHead); - } - - return INITIAL_VALUE; - } - - /** - * Perform insert with {@link #defaultComparator}. - * Notice {@link #defaultComparator} is not thread safe if it is keeping some states. - */ - public boolean insert(K key, int value, int skipListHead) { - return insert(key, value, INVALID_POSITION, EMPTY_PAYLOAD, skipListHead, - this.defaultComparator); - } - - public boolean insert(K key, int value, int position, int[] payload, int skipListHead) { - return insert(key, value, position, payload, skipListHead, this.defaultComparator); - } - - /** - * Perform delete with {@link #defaultComparator}. - * Notice {@link #defaultComparator} is not thread safe if it is keeping some states. - */ - public int delete(K key, int skipListHead) { - return delete(key, skipListHead, this.defaultComparator); - } - - /** - * Get the pointer of next value pointed by the given pointer. - * - * @param pointer reference to the current value. - * @return pointer of next value. 
- */ - public int getNextPointer(int pointer) { - return getForwardPointer(pointer, 0); - } - - /** - * Get the value pointed by a pointer, this is a dereference process. - * - * @param pointer is an array index on this.blockPool. - * @return value pointed pointed by the pointer. - */ - public int getValue(int pointer) { - int value = blockPool.get(pointer); - - // Visibility race - if (value == INITIAL_VALUE) { - // Volatile read to cross the memory barrier again. - final boolean isSafe = isPointerSafe(pointer); - assert isSafe; - - // Re-read the pointer again - value = blockPool.get(pointer); - } - - return value; - } - - public int getSmallestValue(int skipListHeader) { - return getValue(getForwardPointer(skipListHeader, 0)); - } - - /** - * Builder of a forward search finger with header tower index. - * - * @return a new {@link SkipListSearchFinger} object. - */ - public SkipListSearchFinger buildSearchFinger() { - return new SkipListSearchFinger(MAX_TOWER_HEIGHT); - } - - /** - * Added another skip list into the int pool. - * - * @return index of the header tower of the newly created skip list. - */ - public int newSkipList() { - // Virtual value of header. - final int sentinelValue = getSentinelValue(); - if (hasPositions == HasPositions.YES) { - this.blockPool.add(INVALID_POSITION); - } - final int skipListHead = this.blockPool.add(sentinelValue); - - // Build header tower, initially point all the pointers to - // itself since no value has been inserted. - for (int i = 0; i < MAX_TOWER_HEIGHT; i++) { - this.blockPool.add(skipListHead); - } - - this.sync(); - - return skipListHead; - } - - /** - * Check if the block pool has been initiated by {@link #newSkipList}. - */ - public boolean isEmpty() { - return this.blockPool.length() == 0; - } - - /** - * Write to the volatile variable to cross memory barrier. maxPoolPointer is the memory barrier - * for new appends. 
- */ - private void sync() { - this.maxPoolPointer = this.blockPool.length(); - } - - /** - * Read from volatile variable to cross memory barrier. - * - * @param pointer is an block pool index. - * @return boolean indicate if given pointer is within the range of max pool pointer. - */ - private boolean isPointerSafe(int pointer) { - return pointer <= this.maxPoolPointer; - } - - /** - * Get the position associated with the doc ID pointed to by pointer. - * @param pointer aka doc ID pointer. - * @return The value of the position for that doc ID. Returns INVALID_POSITION if the skip list - * does not have positions, or if there is no position for that pointer. - */ - public int getPosition(int pointer) { - if (hasPositions == HasPositions.NO) { - return INVALID_POSITION; - } - // if this skip list has positions, the position will always be inserted into the block pool - // immediately before the doc ID. - return getValue(pointer - 1); - } - - /** - * Get the payload pointer from a normal pointer (e.g. one returned from the {@link this#search} - * method). - */ - public int getPayloadPointer(int pointer) { - Preconditions.checkState(hasPayloads == HasPayloads.YES, - "getPayloadPointer() should only be called on a skip list that supports payloads."); - - // if this skip list has payloads, the payload will always be inserted into the block pool - // before the doc ID, and before the position if there is a position. - int positionOffset = hasPositions == HasPositions.YES ? 1 : 0; - - return pointer - 1 - positionOffset; - } - - - int getPoolSize() { - return this.blockPool.length(); - } - - - IntBlockPool getBlockPool() { - return blockPool; - } - - public HasPayloads getHasPayloads() { - return hasPayloads; - } - - /****************** - * Helper Methods * - ******************/ - - /** - * Get the next forward pointer on a given level. - * - * @param pointer is an array index on this.blockPool, might be SENTINEL_VALUE. 
- * @param level indicates the level of the forward pointer will be acquired. It is zero indexed. - * @return next forward pointer on the given level, might be SENTINEL_VALUE. - */ - private int getForwardPointer(int pointer, int level) { - final int pointerIndex = pointer + level + 1; - - int forwardPointer = blockPool.get(pointerIndex); - - // Visibility race - if (forwardPointer == INITIAL_VALUE) { - // Volatile read to cross the memory barrier again. - final boolean isSafe = isPointerSafe(pointerIndex); - assert isSafe; - - // Re-read the pointer again - forwardPointer = blockPool.get(pointerIndex); - } - - return forwardPointer; - } - - /** - * Set the next forward pointer on a given level. - * - * @param pointer points to the value, of which the pointer value will be updated. - * @param level indicates the level of the forward pointer will be set. It is zero indexed. - * @param target the value fo the target pointer which will be set. - */ - private void setForwardPointer(int pointer, int level, int target) { - // Update header tower if given pointer points to headerTower. - setPointer(pointer + level + 1, target); - } - - /** - * Set the value pointed by pointer - * @param pointer point to the actual position in the pool - * @param target the value we are going to set - */ - private void setPointer(int pointer, int target) { - blockPool.set(pointer, target); - } - - /** - * Getter of the sentinel value used by this skip list. The sentinel value should be provided - * by the comparator. - * - * @return sentinel value used by this skip list. - */ - int getSentinelValue() { - return defaultComparator.getSentinelValue(); - } - - /** - * Return a height h in range [1, maxTowerHeight], each number with chance - * growTowerChance ^ (h - 1). - * - * @return a integer indicating height. 
- */ - private int getRandomTowerHeight() { - int height = 1; - while (height < MAX_TOWER_HEIGHT && random.nextFloat() < GROW_TOWER_CHANCE) { - height++; - } - return height; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler<>(this); - } - - public static class FlushHandler extends Flushable.Handler> { - private final SkipListComparator comparator; - private static final String BLOCK_POOL_PROP_NAME = "blockPool"; - private static final String HAS_POSITIONS_PROP_NAME = "hasPositions"; - private static final String HAS_PAYLOADS_PROP_NAME = "hasPayloads"; - - public FlushHandler(SkipListContainer objectToFlush) { - super(objectToFlush); - this.comparator = objectToFlush.defaultComparator; - } - - public FlushHandler(SkipListComparator comparator) { - this.comparator = comparator; - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - long startTime = getClock().nowMillis(); - SkipListContainer objectToFlush = getObjectToFlush(); - flushInfo.addBooleanProperty(HAS_POSITIONS_PROP_NAME, - objectToFlush.hasPositions == HasPositions.YES); - flushInfo.addBooleanProperty(HAS_PAYLOADS_PROP_NAME, - objectToFlush.hasPayloads == HasPayloads.YES); - - objectToFlush.blockPool.getFlushHandler() - .flush(flushInfo.newSubProperties(BLOCK_POOL_PROP_NAME), out); - getFlushTimerStats().timerIncrement(getClock().nowMillis() - startTime); - } - - @Override - protected SkipListContainer doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - long startTime = getClock().nowMillis(); - IntBlockPool blockPool = (new IntBlockPool.FlushHandler()).load( - flushInfo.getSubProperties(BLOCK_POOL_PROP_NAME), in); - getLoadTimerStats().timerIncrement(getClock().nowMillis() - startTime); - - HasPositions hasPositions = flushInfo.getBooleanProperty(HAS_POSITIONS_PROP_NAME) - ? 
HasPositions.YES : HasPositions.NO; - HasPayloads hasPayloads = flushInfo.getBooleanProperty(HAS_PAYLOADS_PROP_NAME) - ? HasPayloads.YES : HasPayloads.NO; - - return new SkipListContainer<>( - this.comparator, - blockPool, - hasPositions, - hasPayloads); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.docx new file mode 100644 index 000000000..8819de973 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.java b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.java deleted file mode 100644 index 6acc19542..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListIntegerComparator.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -/** - * Example implementation of {@link SkipListComparator} with Order-Theoretic Properties. - * - * Notice: - * Re-using key object is highly suggested! - * Normally the generic type should be a mutable object so it can be reused by the reader/writer. 
- */ -public class SkipListIntegerComparator implements SkipListComparator { - - @Override - public int compareKeyWithValue(Integer key, int targetValue, int targetPosition) { - return key - targetValue; - } - - @Override - public int compareValues(int v1, int v2) { - return v1 - v2; - } - - @Override - public int getSentinelValue() { - return Integer.MAX_VALUE; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.docx new file mode 100644 index 000000000..679544d92 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.java b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.java deleted file mode 100644 index 498321beb..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingList.java +++ /dev/null @@ -1,232 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import java.io.IOException; -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; - -import static com.twitter.search.core.earlybird.index.inverted.SkipListContainer.HasPayloads; -import static com.twitter.search.core.earlybird.index.inverted.SkipListContainer.HasPositions; -import static com.twitter.search.core.earlybird.index.inverted.SkipListContainer.INVALID_POSITION; -import static 
com.twitter.search.core.earlybird.index.inverted.TermsArray.INVALID; - -/** - * A skip list implementation of real time posting list. Supports out of order updates. - */ -public class SkipListPostingList implements Flushable { - /** Underlying skip list. */ - private final SkipListContainer skipListContainer; - - /** Key used when inserting into the skip list. */ - private final Key key = new Key(); - - public SkipListPostingList( - HasPositions hasPositions, - HasPayloads hasPayloads, - String field) { - this.skipListContainer = new SkipListContainer<>( - new DocIDComparator(), - hasPositions, - hasPayloads, - field); - } - - /** Used by {@link SkipListPostingList.FlushHandler} */ - private SkipListPostingList(SkipListContainer skipListContainer) { - this.skipListContainer = skipListContainer; - } - - /** - * Appends a posting to the posting list for a term. - */ - public void appendPosting( - int termID, - TermsArray termsArray, - int docID, - int position, - @Nullable BytesRef payload) { - termsArray.getLargestPostings()[termID] = Math.max( - termsArray.getLargestPostings()[termID], - docID); - - // Append to an existing skip list. - // Notice, header tower index is stored at the last postings pointer spot. - int postingsPointer = termsArray.getPostingsPointer(termID); - if (postingsPointer == INVALID) { - // Create a new skip list and add the first posting. - postingsPointer = skipListContainer.newSkipList(); - } - - boolean havePostingForThisDoc = insertPosting(docID, position, payload, postingsPointer); - - // If this is a new document ID, we need to update the document frequency for this term - if (!havePostingForThisDoc) { - termsArray.getDocumentFrequency()[termID]++; - } - - termsArray.updatePostingsPointer(termID, postingsPointer); - } - - /** - * Deletes the given doc ID from the posting list for the term. 
- */ - public void deletePosting(int termID, TermsArray postingsArray, int docID) { - int docFreq = postingsArray.getDocumentFrequency()[termID]; - if (docFreq == 0) { - return; - } - - int postingsPointer = postingsArray.getPostingsPointer(termID); - // skipListContainer is not empty, try to delete docId from it. - int smallestDoc = deletePosting(docID, postingsPointer); - if (smallestDoc == SkipListContainer.INITIAL_VALUE) { - // Key does not exist. - return; - } - - postingsArray.getDocumentFrequency()[termID]--; - } - - /** - * Insert posting into an existing skip list. - * - * @param docID docID of the this posting. - * @param skipListHead header tower index of the skip list - * in which the posting will be inserted. - * @return whether we have already inserted this document ID into this term list. - */ - private boolean insertPosting(int docID, int position, BytesRef termPayload, int skipListHead) { - int[] payload = PayloadUtil.encodePayload(termPayload); - return skipListContainer.insert(key.withDocAndPosition(docID, position), docID, position, - payload, skipListHead); - } - - private int deletePosting(int docID, int skipListHead) { - return skipListContainer.delete(key.withDocAndPosition(docID, INVALID_POSITION), skipListHead); - } - - /** Return a term docs enumerator with position flag on. */ - public PostingsEnum postings( - int postingPointer, - int docFreq, - int maxPublishedPointer) { - return new SkipListPostingsEnum( - postingPointer, docFreq, maxPublishedPointer, skipListContainer); - } - - /** - * Get the number of documents (AKA document frequency or DF) for the given term. - */ - public int getDF(int termID, TermsArray postingsArray) { - int[] documentFrequency = postingsArray.getDocumentFrequency(); - Preconditions.checkArgument(termID < documentFrequency.length); - - return documentFrequency[termID]; - } - - public int getDocIDFromPosting(int posting) { - // Posting is simply the whole doc ID. 
- return posting; - } - - public int getMaxPublishedPointer() { - return skipListContainer.getPoolSize(); - } - - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String SKIP_LIST_PROP_NAME = "skipList"; - - public FlushHandler(SkipListPostingList objectToFlush) { - super(objectToFlush); - } - - public FlushHandler() { - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - SkipListPostingList objectToFlush = getObjectToFlush(); - - objectToFlush.skipListContainer.getFlushHandler() - .flush(flushInfo.newSubProperties(SKIP_LIST_PROP_NAME), out); - } - - @Override - protected SkipListPostingList doLoad( - FlushInfo flushInfo, DataDeserializer in) throws IOException { - SkipListComparator comparator = new DocIDComparator(); - SkipListContainer.FlushHandler flushHandler = - new SkipListContainer.FlushHandler<>(comparator); - SkipListContainer skipList = - flushHandler.load(flushInfo.getSubProperties(SKIP_LIST_PROP_NAME), in); - return new SkipListPostingList(skipList); - } - } - - /** - * Key used to in {@link SkipListContainer} by {@link SkipListPostingList}. - */ - public static class Key { - private int docID; - private int position; - - public int getDocID() { - return docID; - } - - public int getPosition() { - return position; - } - - public Key withDocAndPosition(int withDocID, int withPosition) { - this.docID = withDocID; - this.position = withPosition; - return this; - } - } - - /** - * Comparator for docID and position. - */ - public static class DocIDComparator implements SkipListComparator { - private static final int SENTINEL_VALUE = DocIdSetIterator.NO_MORE_DOCS; - - @Override - public int compareKeyWithValue(Key key, int targetDocID, int targetPosition) { - // No key could represent sentinel value and sentinel value is the largest. 
- int docCompare = key.getDocID() - targetDocID; - if (docCompare == 0 && targetPosition != INVALID_POSITION) { - return key.getPosition() - targetPosition; - } else { - return docCompare; - } - } - - @Override - public int compareValues(int docID1, int docID2) { - // Sentinel value is the largest. - return docID1 - docID2; - } - - @Override - public int getSentinelValue() { - return SENTINEL_VALUE; - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.docx new file mode 100644 index 000000000..aaf77c108 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.java b/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.java deleted file mode 100644 index 908ae1c87..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/SkipListPostingsEnum.java +++ /dev/null @@ -1,255 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentData; - -import static com.twitter.search.core.earlybird.index.inverted.SkipListContainer.INVALID_POSITION; - -/** - * TermDocs enumerator used by {@link SkipListPostingList}. - */ -public class SkipListPostingsEnum extends PostingsEnum { - /** Initialize cur doc ID and frequency. */ - private int curDoc = TermsArray.INVALID; - private int curFreq = 0; - - private final int postingPointer; - - private final int cost; - - /** - * maxPublishedPointer exists to prevent us from returning documents that are partially indexed. 
- * These pointers are safe to follow, but the documents should not be returned. See - * {@link EarlybirdRealtimeIndexSegmentData#getSyncData()} ()}. - */ - private final int maxPublishedPointer; - - /** Skip list info and search key */ - private final SkipListContainer skiplist; - private final SkipListPostingList.Key key = new SkipListPostingList.Key(); - - /** - * Pointer/posting/docID of next posting in the skip list. - * Notice the next here is relative to last posting with curDoc ID. - */ - private int nextPostingPointer; - private int nextPostingDocID; - - /** - * We save the positionPointer because we must walk the posting list to obtain term frequency - * before we can start iterating through document positions. To do that walk, we increment - * postingsPointer until it points to the first posting for the next doc, so postingsPointer is no - * longer what we want to use as the start of the position list. The position pointer starts out - * pointing to the first posting with that doc ID value. There can be duplicate doc ID values with - * different positions. To find subsequent positions, we simply walk the posting list using this - * pointer. - */ - private int positionPointer = -1; - - /** - * The payloadPointer should only be called after calling nextPosition, as it points to a payload - * for each position. It is not updated unless nextPosition is called. - */ - private int payloadPointer = -1; - - /** Search finger used in advance method. */ - private final SkipListSearchFinger advanceSearchFinger; - - /** - * A new {@link PostingsEnum} for a real-time skip list-based posting list. 
- */ - public SkipListPostingsEnum( - int postingPointer, - int docFreq, - int maxPublishedPointer, - SkipListContainer skiplist) { - this.postingPointer = postingPointer; - this.skiplist = skiplist; - this.advanceSearchFinger = this.skiplist.buildSearchFinger(); - this.maxPublishedPointer = maxPublishedPointer; - this.nextPostingPointer = postingPointer; - - // WARNING: - // docFreq is approximate and may not be the true document frequency of the posting list. - this.cost = docFreq; - - if (postingPointer != -1) { - // Because the posting pointer is not negative 1, we know it's valid. - readNextPosting(); - } - - advanceSearchFinger.reset(); - } - - @Override - public final int nextDoc() { - // Notice if skip list is exhausted nextPostingPointer will point back to postingPointer since - // skip list is circle linked. - if (nextPostingPointer == postingPointer) { - // Skip list is exhausted. - curDoc = NO_MORE_DOCS; - curFreq = 0; - } else { - // Skip list is not exhausted. - curDoc = nextPostingDocID; - curFreq = 1; - positionPointer = nextPostingPointer; - - // Keep reading all the posting with the same doc ID. - // Notice: - // - posting with the same doc ID will be stored consecutively - // since the skip list is sorted. - // - if skip list is exhausted, nextPostingPointer will become postingPointer - // since skip list is circle linked. - readNextPosting(); - while (nextPostingPointer != postingPointer && nextPostingDocID == curDoc) { - curFreq++; - readNextPosting(); - } - } - - // Returned updated curDoc. - return curDoc; - } - - /** - * Moves the enumerator forward by one element, then reads the information at that position. - * */ - private void readNextPosting() { - // Move search finger forward at lowest level. - advanceSearchFinger.setPointer(0, nextPostingPointer); - - // Read next posting pointer. 
- nextPostingPointer = skiplist.getNextPointer(nextPostingPointer); - - // Read the new posting positioned under nextPostingPointer into the nextPostingDocID. - readNextPostingInfo(); - } - - private boolean isPointerPublished(int pointer) { - return pointer <= maxPublishedPointer; - } - - /** Read next posting and doc id encoded in next posting. */ - private void readNextPostingInfo() { - // We need to skip over every pointer that has not been published to this Enum, otherwise the - // searcher will see unpublished documents. We also end termination if we reach - // nextPostingPointer == postingPointer, because that means we have reached the end of the - // skiplist. - while (!isPointerPublished(nextPostingPointer) && nextPostingPointer != postingPointer) { - // Move search finger forward at lowest level. - advanceSearchFinger.setPointer(0, nextPostingPointer); - - // Read next posting pointer. - nextPostingPointer = skiplist.getNextPointer(nextPostingPointer); - } - - // Notice if skip list is exhausted, nextPostingPointer will be postingPointer - // since skip list is circle linked. - if (nextPostingPointer != postingPointer) { - nextPostingDocID = skiplist.getValue(nextPostingPointer); - } else { - nextPostingDocID = NO_MORE_DOCS; - } - } - - /** - * Jump to the target, then use {@link #nextDoc()} to collect nextDoc info. - * Notice target might be smaller than curDoc or smallestDocID. - */ - @Override - public final int advance(int target) { - if (target == NO_MORE_DOCS) { - // Exhaust the posting list, so that future calls to docID() always return NO_MORE_DOCS. - nextPostingPointer = postingPointer; - } - - if (nextPostingPointer == postingPointer) { - // Call nextDoc to ensure that all values are updated and we don't have to duplicate that - // here. - return nextDoc(); - } - - // Jump to target if target is bigger. - if (target >= curDoc && target >= nextPostingDocID) { - jumpToTarget(target); - } - - // Retrieve next doc. 
- return nextDoc(); - } - - /** - * Set the next posting pointer (and info) to the first posting - * with doc ID equal to or larger than the target. - * - * Notice this method does not set curDoc or curFreq. - */ - private void jumpToTarget(int target) { - // Do a ceil search. - nextPostingPointer = skiplist.searchCeil( - key.withDocAndPosition(target, INVALID_POSITION), postingPointer, advanceSearchFinger); - - // Read next posting information. - readNextPostingInfo(); - } - - @Override - public int nextPosition() { - // If doc ID is equal to no more docs than we are past the end of the posting list. If doc ID - // is invalid, then we have not called nextDoc yet, and we should not return a real position. - // If the position pointer is past the current doc ID, then we should not return a position - // until nextDoc is called again (we don't want to return positions for a different doc). - if (docID() == NO_MORE_DOCS - || docID() == TermsArray.INVALID - || skiplist.getValue(positionPointer) != docID()) { - return INVALID_POSITION; - } - payloadPointer = positionPointer; - int position = skiplist.getPosition(positionPointer); - do { - positionPointer = skiplist.getNextPointer(positionPointer); - } while (!isPointerPublished(positionPointer) && positionPointer != postingPointer); - return position; - } - - @Override - public BytesRef getPayload() { - if (skiplist.getHasPayloads() == SkipListContainer.HasPayloads.NO) { - return null; - } - - int pointer = skiplist.getPayloadPointer(this.payloadPointer); - Preconditions.checkState(pointer > 0); - return PayloadUtil.decodePayload(skiplist.getBlockPool(), pointer); - } - - @Override - public int startOffset() { - return -1; - } - - @Override - public int endOffset() { - return -1; - } - - @Override - public final int docID() { - return curDoc; - } - - @Override - public final int freq() { - return curFreq; - } - - @Override - public long cost() { - return cost; - } -} diff --git 
/**
 * A forward search finger used, optionally, by {@link SkipListContainer#search}.
 *
 * A search finger is a pointer to the result returned the last time a search was performed
 * ("finger search"). Using one on a skip list reduces search time from log(n) to log(k), where
 * n is the length of the skip list and k is the distance between the last searched key and the
 * current one.
 */
public class SkipListSearchFinger {
  // Marker stored at every level before any real pointer has been recorded.
  public static final int INITIAL_POINTER = Integer.MIN_VALUE;

  // One remembered pointer per skip list level.
  private final int[] lastPointers;

  /**
   * Creates a new search finger with one slot per possible tower level, all initialized to
   * INITIAL_POINTER.
   */
  public SkipListSearchFinger(int maxTowerHeight) {
    lastPointers = new int[maxTowerHeight];
    reset();
  }

  /** Forgets all remembered pointers, restoring every level to INITIAL_POINTER. */
  public void reset() {
    for (int level = 0; level < lastPointers.length; level++) {
      lastPointers[level] = INITIAL_POINTER;
    }
  }

  public int getPointer(int level) {
    return lastPointers[level];
  }

  public void setPointer(int level, int pointer) {
    lastPointers[level] = pointer;
  }

  public boolean isInitialPointer(int pointer) {
    return pointer == INITIAL_POINTER;
  }
}
- */ - int getNumTerms(); - - /** - * Create a TermsEnum object over this TermDictionary for a given index. - * @param index - */ - TermsEnum createTermsEnum(OptimizedMemoryIndex index); - - /** - * Lookup a term in this dictionary. - * @param term the term to lookup. - * @return the term id for this term, or TERM_NOT_FOUND - * @throws IOException - */ - int lookupTerm(BytesRef term) throws IOException; - - /** - * Get the term for given id and possibly its payload. - * @param termID the term that we want to get. - * @param text MUST be non-null. It will be filled with the term. - * @param termPayload if non-null, it will be filled with the payload if the term has any. - * @return Returns true, iff this term has a term payload. - */ - boolean getTerm(int termID, BytesRef text, BytesRef termPayload); -} diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.docx b/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.docx new file mode 100644 index 000000000..bc9af624d Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.java b/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.java deleted file mode 100644 index 22927f6b0..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/inverted/TermPointerEncoding.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.twitter.search.core.earlybird.index.inverted; - -/** - * Encodes and decodes term pointers. - */ -public abstract class TermPointerEncoding { - /** - * Returns the start of the text stored in a {@link BaseByteBlockPool} of the given term. - */ - public abstract int getTextStart(int termPointer); - - /** - * Returns true, if the given term stores a per-term payload. 
/**
 * Encodes and decodes term pointers. A pointer packs a text-start offset and a has-payload flag
 * into a single int.
 */
public abstract class TermPointerEncoding {
  /**
   * Returns the start of the text stored in a {@link BaseByteBlockPool} of the given term.
   */
  public abstract int getTextStart(int termPointer);

  /**
   * Returns true if the given term stores a per-term payload.
   */
  public abstract boolean hasPayload(int termPointer);

  /**
   * Encodes and returns a pointer for a term stored at the given textStart in a
   * {@link BaseByteBlockPool}.
   */
  public abstract int encodeTermPointer(int textStart, boolean hasPayload);

  /**
   * Default encoding: the low bit carries the payload flag, the remaining bits the text start.
   */
  public static final TermPointerEncoding DEFAULT_ENCODING = new TermPointerEncoding() {
    @Override
    public int getTextStart(int termPointer) {
      // Drop the flag bit; unsigned shift keeps large encoded offsets non-negative.
      return termPointer >>> 1;
    }

    @Override
    public boolean hasPayload(int termPointer) {
      return (termPointer & 1) == 1;
    }

    @Override
    public int encodeTermPointer(int textStart, boolean hasPayload) {
      return (textStart << 1) | (hasPayload ? 1 : 0);
    }
  };
}
The writer must ensure that all updates are - * visible to readers with an external memory barrier. - */ -public class TermsArray implements Flushable { - private static final int BYTES_PER_POSTING = 5 * Integer.BYTES; - public static final int INVALID = -1; - - private final int size; - - public final int[] termPointers; - private final int[] postingsPointers; - - // Derived data. Not atomic and not reliable. - public final int[] largestPostings; - public final int[] documentFrequency; - public final int[] offensiveCounters; - - TermsArray(int size, boolean useOffensiveCounters) { - this.size = size; - - termPointers = new int[size]; - postingsPointers = new int[size]; - - largestPostings = new int[size]; - documentFrequency = new int[size]; - - if (useOffensiveCounters) { - offensiveCounters = new int[size]; - } else { - offensiveCounters = null; - } - - Arrays.fill(postingsPointers, INVALID); - Arrays.fill(largestPostings, INVALID); - } - - private TermsArray(TermsArray oldArray, int newSize) { - this(newSize, oldArray.offensiveCounters != null); - copyFrom(oldArray); - } - - private TermsArray( - int size, - int[] termPointers, - int[] postingsPointers, - int[] largestPostings, - int[] documentFrequency, - int[] offensiveCounters) { - this.size = size; - - this.termPointers = termPointers; - this.postingsPointers = postingsPointers; - - this.largestPostings = largestPostings; - this.documentFrequency = documentFrequency; - this.offensiveCounters = offensiveCounters; - } - - TermsArray grow() { - int newSize = ArrayUtil.oversize(size + 1, BYTES_PER_POSTING); - return new TermsArray(this, newSize); - } - - - private void copyFrom(TermsArray from) { - copy(from.termPointers, termPointers); - copy(from.postingsPointers, postingsPointers); - - copy(from.largestPostings, largestPostings); - copy(from.documentFrequency, documentFrequency); - - if (from.offensiveCounters != null) { - copy(from.offensiveCounters, offensiveCounters); - } - } - - private void copy(int[] 
from, int[] to) { - System.arraycopy(from, 0, to, 0, from.length); - } - - /** - * Returns the size of this array. - */ - public int getSize() { - return size; - } - - /** - * Write side operation for updating the pointer to the last posting for a given term. - */ - public void updatePostingsPointer(int termID, int newPointer) { - postingsPointers[termID] = newPointer; - } - - /** - * The returned pointer is guaranteed to be memory safe to follow to its target. The data - * structure it points to will be consistent and safe to traverse. The posting list may contain - * doc IDs that the current reader should not see, and the reader should skip over these doc IDs - * to ensure that the readers provide an immutable view of the doc IDs in a posting list. - */ - public int getPostingsPointer(int termID) { - return postingsPointers[termID]; - } - - public int[] getDocumentFrequency() { - return documentFrequency; - } - - /** - * Gets the array containing the first posting for each indexed term. 
- */ - public int[] getLargestPostings() { - return largestPostings; - } - - @SuppressWarnings("unchecked") - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String SIZE_PROP_NAME = "size"; - private static final String HAS_OFFENSIVE_COUNTERS_PROP_NAME = "hasOffensiveCounters"; - - public FlushHandler(TermsArray objectToFlush) { - super(objectToFlush); - } - - public FlushHandler() { - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - TermsArray objectToFlush = getObjectToFlush(); - flushInfo.addIntProperty(SIZE_PROP_NAME, objectToFlush.size); - boolean hasOffensiveCounters = objectToFlush.offensiveCounters != null; - flushInfo.addBooleanProperty(HAS_OFFENSIVE_COUNTERS_PROP_NAME, hasOffensiveCounters); - - out.writeIntArray(objectToFlush.termPointers); - out.writeIntArray(objectToFlush.postingsPointers); - - out.writeIntArray(objectToFlush.largestPostings); - out.writeIntArray(objectToFlush.documentFrequency); - - if (hasOffensiveCounters) { - out.writeIntArray(objectToFlush.offensiveCounters); - } - } - - @Override - protected TermsArray doLoad( - FlushInfo flushInfo, DataDeserializer in) throws IOException { - int size = flushInfo.getIntProperty(SIZE_PROP_NAME); - boolean hasOffensiveCounters = flushInfo.getBooleanProperty(HAS_OFFENSIVE_COUNTERS_PROP_NAME); - - int[] termPointers = in.readIntArray(); - int[] postingsPointers = in.readIntArray(); - - int[] largestPostings = in.readIntArray(); - int[] documentFrequency = in.readIntArray(); - - int[] offensiveCounters = hasOffensiveCounters ? 
in.readIntArray() : null; - - return new TermsArray( - size, - termPointers, - postingsPointers, - largestPostings, - documentFrequency, - offensiveCounters); - } - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.docx b/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.docx new file mode 100644 index 000000000..eb87f3ad7 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.java b/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.java deleted file mode 100644 index b5ab9ae26..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/util/AllDocsIterator.java +++ /dev/null @@ -1,82 +0,0 @@ -package com.twitter.search.core.earlybird.index.util; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentAtomicReader; - -/** - * Used to iterate through all of the documents in an Earlybird segment. This is necessary so that - * we can ensure all of the documents we are reading have been published to the readers. If we used - * the doc ID mapper to iterate through documents, it would return documents that have been only - * partially added to the index, and could return bogus search results (SEARCH-27711). 
- */ -public class AllDocsIterator extends DocIdSetIterator { - public static final String ALL_DOCS_TERM = "__all_docs"; - - private final DocIdSetIterator delegate; - - public AllDocsIterator(LeafReader reader) throws IOException { - delegate = buildDISI(reader); - } - - private static DocIdSetIterator buildDISI(LeafReader reader) throws IOException { - if (!isRealtimeUnoptimizedSegment(reader)) { - return all(reader.maxDoc()); - } - - Terms terms = - reader.terms(EarlybirdFieldConstants.EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName()); - if (terms == null) { - return all(reader.maxDoc()); - } - - TermsEnum termsEnum = terms.iterator(); - boolean hasTerm = termsEnum.seekExact(new BytesRef(ALL_DOCS_TERM)); - if (hasTerm) { - return termsEnum.postings(null); - } - - return empty(); - } - - @Override - public int docID() { - return delegate.docID(); - } - - @Override - public int nextDoc() throws IOException { - return delegate.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - return delegate.advance(target); - } - - @Override - public long cost() { - return delegate.cost(); - } - - /** - * Returns whether this is a realtime segment in the realtime index that is still unoptimized and - * mutable. 
- */ - private static boolean isRealtimeUnoptimizedSegment(LeafReader reader) { - if (reader instanceof EarlybirdRealtimeIndexSegmentAtomicReader) { - EarlybirdRealtimeIndexSegmentAtomicReader realtimeReader = - (EarlybirdRealtimeIndexSegmentAtomicReader) reader; - return !realtimeReader.getSegmentData().isOptimized(); - } - - return false; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.docx b/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.docx new file mode 100644 index 000000000..e2bf4e6cc Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.java b/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.java deleted file mode 100644 index accee2156..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/util/RangeDISI.java +++ /dev/null @@ -1,50 +0,0 @@ -package com.twitter.search.core.earlybird.index.util; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.search.DocIdSetIterator; - -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public class RangeDISI extends DocIdSetIterator { - private final int start; - private final int end; - private final AllDocsIterator delegate; - - private int currentDocId = -1; - - public RangeDISI(LeafReader reader, int start, int end) throws IOException { - this.delegate = new AllDocsIterator(reader); - this.start = start; - if (end == DocIDToTweetIDMapper.ID_NOT_FOUND) { - this.end = Integer.MAX_VALUE; - } else { - this.end = end; - } - } - - @Override - public int docID() { - return currentDocId; - } - - @Override - public int nextDoc() throws IOException { - return advance(currentDocId + 1); - } - - @Override - public int advance(int target) throws IOException { - currentDocId = delegate.advance(Math.max(target, start)); - if (currentDocId > end) { - 
currentDocId = NO_MORE_DOCS; - } - return currentDocId; - } - - @Override - public long cost() { - return delegate.cost(); - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.docx b/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.docx new file mode 100644 index 000000000..0680ca1a3 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.java b/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.java deleted file mode 100644 index 934355fc9..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/util/RangeFilterDISI.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.core.earlybird.index.util; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.search.DocIdSetIterator; - -/** - * A doc id set iterator that iterates over a filtered set of ids from firstId inclusive to lastId - * inclusive. 
- */ -public class RangeFilterDISI extends DocIdSetIterator { - private final RangeDISI delegate; - - public RangeFilterDISI(LeafReader reader) throws IOException { - this(reader, 0, reader.maxDoc() - 1); - } - - public RangeFilterDISI(LeafReader reader, int smallestDocID, int largestDocID) - throws IOException { - this.delegate = new RangeDISI(reader, smallestDocID, largestDocID); - } - - @Override - public int docID() { - return delegate.docID(); - } - - @Override - public int nextDoc() throws IOException { - delegate.nextDoc(); - return nextValidDoc(); - } - - @Override - public int advance(int target) throws IOException { - delegate.advance(target); - return nextValidDoc(); - } - - private int nextValidDoc() throws IOException { - int doc = delegate.docID(); - while (doc != NO_MORE_DOCS && !shouldReturnDoc()) { - doc = delegate.nextDoc(); - } - return doc; - } - - @Override - public long cost() { - return delegate.cost(); - } - - // Override this method to add additional filters. Should return true if the current doc is OK. 
- protected boolean shouldReturnDoc() throws IOException { - return true; - } -} diff --git a/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.docx b/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.docx new file mode 100644 index 000000000..251e00756 Binary files /dev/null and b/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.docx differ diff --git a/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.java b/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.java deleted file mode 100644 index c17565784..000000000 --- a/src/java/com/twitter/search/core/earlybird/index/util/SearchSortUtils.java +++ /dev/null @@ -1,42 +0,0 @@ -package com.twitter.search.core.earlybird.index.util; - -import com.google.common.base.Preconditions; - -public abstract class SearchSortUtils { - public interface Comparator { - /** - * Compares the item represented by the given index with the provided value. - */ - int compare(int index, T value); - } - - /** - * Performs a binary search using the given comparator, and returns the index of the item that - * was found. If foundLow is true, the greatest item that's lower than the provided key - * is returned. Otherwise, the lowest item that's greater than the provided key is returned. - */ - public static int binarySearch(Comparator comparator, final int begin, final int end, - final T key, boolean findLow) { - int low = begin; - int high = end; - Preconditions.checkState(comparator.compare(low, key) <= comparator.compare(high, key)); - while (low <= high) { - int mid = (low + high) >>> 1; - int result = comparator.compare(mid, key); - if (result < 0) { - low = mid + 1; - } else if (result > 0) { - high = mid - 1; - } else { - return mid; - } // key found - } - - assert low > high; - if (findLow) { - return high < begin ? begin : high; - } else { - return low > end ? 
end : low; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/BUILD b/src/java/com/twitter/search/earlybird/BUILD deleted file mode 100644 index 457cfe12a..000000000 --- a/src/java/com/twitter/search/earlybird/BUILD +++ /dev/null @@ -1,222 +0,0 @@ -COMMON_SOURCES = ["common/**/*.java"] - -CONFIG_SOURCES = ["config/**/*.java"] - -TOOLS_SOURCES = ["tools/**/*.java"] - -INDEX_SOURCES = ["index/facets/**/*.java"] - -SEGMENT_BUILDER_SOURCES = ["archive/segmentbuilder/**/*.java"] - -java_library( - name = "earlybird-lib", - sources = ["**/*.java"] + exclude_globs(COMMON_SOURCES + CONFIG_SOURCES + TOOLS_SOURCES + SEGMENT_BUILDER_SOURCES + INDEX_SOURCES), - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/gson", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/twitter/distributedlog:distributedlog-core", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/commons-codec", - "3rdparty/jvm/commons-httpclient", - "3rdparty/jvm/commons-io", - "3rdparty/jvm/commons-lang", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/io/netty:netty4-tcnative-boringssl-static", - "3rdparty/jvm/it/unimi/dsi:fastutil", - "3rdparty/jvm/javax/servlet:servlet-api", - "3rdparty/jvm/net/java/dev/jets3t", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider", - "3rdparty/jvm/org/apache/commons:commons-lang3", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/httpcomponents:httpclient", - "3rdparty/jvm/org/apache/kafka:kafka-clients", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "3rdparty/jvm/org/apache/lucene:lucene-queryparser", - 
"3rdparty/jvm/org/apache/lucene:lucene-spatial-extras", - "3rdparty/jvm/org/apache/lucene:lucene-test-framework", - "3rdparty/jvm/org/apache/thrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/json", - "3rdparty/jvm/org/slf4j:slf4j-api", - "3rdparty/jvm/org/tensorflow", - "3rdparty/jvm/org/tensorflow:tensorflow-hadoop", - "3rdparty/jvm/org/yaml:snakeyaml", - "cuad/projects/ner/thrift/src/main/thrift:thrift-java", - "decider/src/main/scala", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/client", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/server", - "finagle-internal/slo/src/main/scala/com/twitter/finagle/slo", - "finagle/finagle-base-http", - "finagle/finagle-core/src/main", - "finagle/finagle-http", - "finagle/finagle-serversets/src/main/scala", - "finagle/finagle-stats/src/main/scala", - "finagle/finagle-thrift/src/main/java", - "finagle/finagle-thrift/src/main/scala", - "finagle/finagle-thriftmux/src/main/scala", - "finagle/finagle-zipkin-core/src/main/scala", - "finagle/finagle-zipkin-scribe/src/main/scala", - "kafka/finagle-kafka/finatra-kafka/src/main/scala", - "periscope/api-proxy-thrift/thrift/src/main/thrift:thrift-java", - "servo/decider", - "snowflake/src/main/scala/com/twitter/snowflake/id", - "src/antlr/com/twitter/search/queryparser/antlr:queryparser-antlr", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/collections", - "src/java/com/twitter/common/net:dynamic-host-set", - "src/java/com/twitter/common/quantity", - "src/java/com/twitter/common/text/language:locale-util", - "src/java/com/twitter/common/text/token", - "src/java/com/twitter/common/text/util:token-util", - "src/java/com/twitter/common/util", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common/zookeeper:client", - "src/java/com/twitter/common/zookeeper:group", - 
"src/java/com/twitter/common/zookeeper:server-set", - "src/java/com/twitter/common_internal/bloomfilter", - "src/java/com/twitter/common_internal/collections", - "src/java/com/twitter/common_internal/text:text-penguin7", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/common_internal/zookeeper", - "src/java/com/twitter/ml/api:api-base", - "src/java/com/twitter/search/common/aurora", - "src/java/com/twitter/search/common/concurrent", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/dark", - "src/java/com/twitter/search/common/database", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/encoding/docvalues", - "src/java/com/twitter/search/common/encoding/features", - "src/java/com/twitter/search/common/features", - "src/java/com/twitter/search/common/file", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/partitioning/zookeeper", - "src/java/com/twitter/search/common/query", - "src/java/com/twitter/search/common/relevance:feature-update-reader", - "src/java/com/twitter/search/common/relevance:scorers", - "src/java/com/twitter/search/common/relevance:text", - "src/java/com/twitter/search/common/relevance/features", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/search", - "src/java/com/twitter/search/common/search/termination", - "src/java/com/twitter/search/common/util:closeresourceutil", - "src/java/com/twitter/search/common/util:finagleutil", - "src/java/com/twitter/search/common/util:gcutil", - "src/java/com/twitter/search/common/util:kerberos", - 
"src/java/com/twitter/search/common/util:log_format_util", - "src/java/com/twitter/search/common/util:longintconverter", - "src/java/com/twitter/search/common/util:platform_stats_exporter", - "src/java/com/twitter/search/common/util:rule_based_converter", - "src/java/com/twitter/search/common/util/analysis", - "src/java/com/twitter/search/common/util/date", - "src/java/com/twitter/search/common/util/earlybird", - "src/java/com/twitter/search/common/util/hash", - "src/java/com/twitter/search/common/util/io", - "src/java/com/twitter/search/common/util/io:dl-reader-writer", - "src/java/com/twitter/search/common/util/io:flushable", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/common/util/io/kafka", - "src/java/com/twitter/search/common/util/lang", - "src/java/com/twitter/search/common/util/ml/models_manager", - "src/java/com/twitter/search/common/util/ml/prediction_engine", - "src/java/com/twitter/search/common/util/ml/tensorflow_engine", - "src/java/com/twitter/search/common/util/spatial", - "src/java/com/twitter/search/common/util/text", - "src/java/com/twitter/search/common/util/text/regex", - "src/java/com/twitter/search/common/util/thrift:text-protocol", - "src/java/com/twitter/search/common/util/thrift:thrift-utils", - "src/java/com/twitter/search/common/util/url", - "src/java/com/twitter/search/common/util/zktrylock", - "src/java/com/twitter/search/common/util/zookeeper", - "src/java/com/twitter/search/core/earlybird", - "src/java/com/twitter/search/earlybird/common", - "src/java/com/twitter/search/earlybird/common/config", - "src/java/com/twitter/search/earlybird/common/userupdates", - "src/java/com/twitter/search/earlybird/config", - "src/java/com/twitter/search/earlybird/index/facets", - "src/java/com/twitter/search/ingester/pipeline/strato_fetchers", - "src/java/com/twitter/search/modeling/common", - "src/java/com/twitter/search/modeling/tweet_ranking", - "src/java/com/twitter/search/queryparser", - 
"src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/java/com/twitter/search/queryparser/query/search:search-query-nodes", - "src/resources/com/twitter/search/earlybird/com/twitter", - "src/resources/com/twitter/search/earlybird/ml", - "src/thrift/com/twitter/search:common", - "src/thrift/com/twitter/search:earlybird-java", - "src/thrift/com/twitter/search/common:features-java", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/search/common:query-java", - "src/thrift/com/twitter/service/spiderduck/gen:metadata-store-java", - "src/thrift/com/twitter/tweetypie:events-java", - "src/thrift/org/apache/aurora/gen:api", - "stitch/stitch-core/src/main/scala/com/twitter/stitch", - "strato/src/main/scala/com/twitter/strato/catalog", - "strato/src/main/scala/com/twitter/strato/client", - "strato/src/main/scala/com/twitter/strato/data", - "strato/src/main/scala/com/twitter/strato/thrift", - "tensorflow/tfcompute-java/src/main/java/com/twitter/tfcompute_java", - "thrift-web-forms/src/main/java/com/twitter/thriftwebforms", - "thrift-web-forms/src/main/scala/com/twitter/thriftwebforms", - "twitter-server-internal", - "twitter-server/server/src/main/scala", - "ubs/common/src/main/thrift/com/twitter/ubs:broadcast-thrift-java", - "ubs/common/src/main/thrift/com/twitter/ubs:events-java", - "util-internal/util-eval/src/main/scala", - "util/util-app", - "util/util-core:scala", - "util/util-function", - "util/util-lint", - "util/util-slf4j-api/src/main/scala", - "util/util-stats/src/main/scala", - ], -) - -jvm_binary( - name = "earlybird-binary", - basename = "earlybird", - main = "com.twitter.search.earlybird.EarlybirdMain", - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - ":earlybird-lib", - "loglens/loglens-log4j", - ], -) - -java_library( - name = "tools", - sources = TOOLS_SOURCES, - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - ":earlybird-lib", - 
"3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/guava:guava-testlib", - "3rdparty/jvm/commons-codec", - "3rdparty/jvm/commons-httpclient", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/junit", - "3rdparty/jvm/net/java/dev/jets3t", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "src/thrift/com/twitter/search:earlybird-java", - ], -) diff --git a/src/java/com/twitter/search/earlybird/BUILD.docx b/src/java/com/twitter/search/earlybird/BUILD.docx new file mode 100644 index 000000000..d6dcd68ca Binary files /dev/null and b/src/java/com/twitter/search/earlybird/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/CONFIG.docx b/src/java/com/twitter/search/earlybird/CONFIG.docx new file mode 100644 index 000000000..3ff7275ac Binary files /dev/null and b/src/java/com/twitter/search/earlybird/CONFIG.docx differ diff --git a/src/java/com/twitter/search/earlybird/CONFIG.ini b/src/java/com/twitter/search/earlybird/CONFIG.ini deleted file mode 100644 index 6d4d06376..000000000 --- a/src/java/com/twitter/search/earlybird/CONFIG.ini +++ /dev/null @@ -1,7 +0,0 @@ -; See http://go/CONFIG.ini - -[jira] -project: SEARCH - -[kite] -project: earlybird diff --git a/src/java/com/twitter/search/earlybird/Earlybird.docx b/src/java/com/twitter/search/earlybird/Earlybird.docx new file mode 100644 index 000000000..ed38fb576 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/Earlybird.docx differ diff --git a/src/java/com/twitter/search/earlybird/Earlybird.java b/src/java/com/twitter/search/earlybird/Earlybird.java deleted file mode 100644 index 54dc33f16..000000000 --- a/src/java/com/twitter/search/earlybird/Earlybird.java +++ /dev/null @@ -1,267 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.File; -import java.io.IOException; -import java.net.InetAddress; -import java.net.UnknownHostException; -import 
java.util.Arrays; -import java.util.Map; -import java.util.function.Predicate; -import java.util.stream.Collectors; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.app.Flag; -import com.twitter.app.Flaggable; -import com.twitter.finagle.Http; -import com.twitter.finagle.http.HttpMuxer; -import com.twitter.search.common.aurora.AuroraInstanceKey; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.config.LoggerConfiguration; -import com.twitter.search.common.constants.SearchThriftWebFormsAccess; -import com.twitter.search.common.metrics.BuildInfoStats; -import com.twitter.search.common.util.Kerberos; -import com.twitter.search.common.util.PlatformStatsExporter; -import com.twitter.search.earlybird.admin.EarlybirdAdminManager; -import com.twitter.search.earlybird.admin.EarlybirdHealthHandler; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.exception.UncaughtExceptionHandler; -import com.twitter.search.earlybird.factory.EarlybirdServerFactory; -import com.twitter.search.earlybird.factory.EarlybirdWireModule; -import com.twitter.search.earlybird.thrift.EarlybirdService; -import com.twitter.search.earlybird.util.EarlybirdDecider; -import com.twitter.server.handler.DeciderHandler$; -import com.twitter.server.AbstractTwitterServer; -import com.twitter.thriftwebforms.DisplaySettingsConfig; -import com.twitter.thriftwebforms.MethodOptionsAccessConfig; -import com.twitter.thriftwebforms.ThriftClientSettingsConfig; -import com.twitter.thriftwebforms.ThriftMethodSettingsConfig; -import com.twitter.thriftwebforms.ThriftServiceSettings; -import com.twitter.thriftwebforms.ThriftWebFormsSettings; -import 
com.twitter.thriftwebforms.TwitterServerThriftWebForms; -import com.twitter.util.Await; -import com.twitter.util.TimeoutException; - -public class Earlybird extends AbstractTwitterServer { - private static final Logger LOG = LoggerFactory.getLogger(Earlybird.class); - - // Flags defined here need to be processed before setting override values to EarlybirdConfig. - - private final Flag configFile = flag().create( - "config_file", - new File("earlybird-search.yml"), - "specify config file", - Flaggable.ofFile() - ); - - private final Flag logDir = flag().create( - "earlybird_log_dir", - "", - "override log dir from config file", - Flaggable.ofString() - ); - - private final Map> flagMap = Arrays.stream(EarlybirdProperty.values()) - .collect(Collectors.toMap( - property -> property.name(), - property -> property.createFlag(flag()))); - - private final UncaughtExceptionHandler uncaughtExceptionHandler = - new UncaughtExceptionHandler(); - - private EarlybirdServer earlybirdServer; - private EarlybirdAdminManager earlybirdAdminManager; - - public Earlybird() { - // Default health handler is added inside Lifecycle trait. To override that we need to set it - // in the constructor since HttpAdminServer is started before Earlybird.preMain() is called. - HttpMuxer.addHandler("/health", new EarlybirdHealthHandler()); - } - - /** - * Needs to be called from preMain and not from onInit() as flags / args parsing happens after - * onInit() is called. - */ - @VisibleForTesting - void configureFromFlagsAndSetupLogging() { - // Makes sure the EarlybirdStats is injected with a variable repository. 
- EarlybirdConfig.init(configFile.getWithDefault().get().getName()); - - if (logDir.isDefined()) { - EarlybirdConfig.overrideLogDir(logDir.get().get()); - } - new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), - EarlybirdConfig.getLogDir()).configure(); - - String instanceKey = System.getProperty("aurora.instanceKey"); - if (instanceKey != null) { - EarlybirdConfig.setAuroraInstanceKey(AuroraInstanceKey.fromInstanceKey(instanceKey)); - LOG.info("Earlybird is running on Aurora"); - checkRequiredProperties(EarlybirdProperty::isRequiredOnAurora, "Aurora"); - } else { - LOG.info("Earlybird is running on dedicated hardware"); - checkRequiredProperties(EarlybirdProperty::isRequiredOnDedicated, "dedicated hardware"); - } - LOG.info("Config environment: {}", Config.getEnvironment()); - - if (adminPort().isDefined() && adminPort().get().isDefined()) { - int adminPort = adminPort().get().get().getPort(); - LOG.info("Admin port is {}", adminPort); - EarlybirdConfig.setAdminPort(adminPort); - } - - EarlybirdConfig.setOverrideValues( - flagMap.values().stream() - .filter(Flag::isDefined) - .collect(Collectors.toMap(Flag::name, flag -> flag.get().get()))); - } - - private void checkRequiredProperties( - Predicate propertyPredicate, String location) { - Arrays.stream(EarlybirdProperty.values()) - .filter(propertyPredicate) - .map(property -> flagMap.get(property.name())) - .forEach(flag -> - Preconditions.checkState(flag.isDefined(), - "-%s is required on %s", flag.name(), location)); - } - - private void logEarlybirdInfo() { - try { - LOG.info("Hostname: {}", InetAddress.getLocalHost().getHostName()); - } catch (UnknownHostException e) { - LOG.info("Unable to be get local host: {}", e.getMessage()); - } - LOG.info("Earlybird info [Name: {}, Zone: {}, Env: {}]", - EarlybirdProperty.EARLYBIRD_NAME.get(), - EarlybirdProperty.ZONE.get(), - EarlybirdProperty.ENV.get()); - LOG.info("Earlybird scrubgen from Aurora: {}]", - EarlybirdProperty.EARLYBIRD_SCRUB_GEN.get()); - 
LOG.info("Find final partition config by searching the log for \"Partition config info\""); - } - - private EarlybirdServer makeEarlybirdServer() { - EarlybirdWireModule earlybirdWireModule = new EarlybirdWireModule(); - EarlybirdServerFactory earlybirdFactory = new EarlybirdServerFactory(); - try { - return earlybirdFactory.makeEarlybirdServer(earlybirdWireModule); - } catch (IOException e) { - LOG.error("Exception while constructing EarlybirdServer.", e); - throw new RuntimeException(e); - } - } - - private void setupThriftWebForms() { - TwitterServerThriftWebForms.addAdminRoutes(this, TwitterServerThriftWebForms.apply( - ThriftWebFormsSettings.apply( - DisplaySettingsConfig.DEFAULT, - ThriftServiceSettings.apply( - EarlybirdService.ServiceIface.class.getSimpleName(), - EarlybirdConfig.getThriftPort()), - ThriftClientSettingsConfig.makeCompactRequired( - EarlybirdProperty.getServiceIdentifier()), - ThriftMethodSettingsConfig.access( - MethodOptionsAccessConfig.byLdapGroup( - SearchThriftWebFormsAccess.READ_LDAP_GROUP))), - scala.reflect.ClassTag$.MODULE$.apply(EarlybirdService.ServiceIface.class))); - } - - private void setupDeciderWebForms() { - addAdminRoute( - DeciderHandler$.MODULE$.route( - "earlybird", - EarlybirdDecider.getMutableDecisionMaker(), - EarlybirdDecider.getDecider())); - } - - @Override - public Http.Server configureAdminHttpServer(Http.Server server) { - return server.withMonitor(uncaughtExceptionHandler); - } - - @Override - public void preMain() { - configureFromFlagsAndSetupLogging(); - logEarlybirdInfo(); - LOG.info("Starting preMain()"); - - BuildInfoStats.export(); - PlatformStatsExporter.exportPlatformStats(); - - // Use our own exception handler to monitor all unhandled exceptions. 
- Thread.setDefaultUncaughtExceptionHandler((thread, e) -> { - LOG.error("Invoked default uncaught exception handler."); - uncaughtExceptionHandler.handle(e); - }); - LOG.info("Registered unhandled exception monitor."); - - Kerberos.kinit( - EarlybirdConfig.getString("kerberos_user", ""), - EarlybirdConfig.getString("kerberos_keytab_path", "") - ); - - LOG.info("Creating earlybird server."); - earlybirdServer = makeEarlybirdServer(); - - uncaughtExceptionHandler.setShutdownHook(() -> { - earlybirdServer.shutdown(); - this.close(); - }); - - earlybirdAdminManager = EarlybirdAdminManager.create(earlybirdServer); - earlybirdAdminManager.start(); - LOG.info("Started admin interface."); - - setupThriftWebForms(); - setupDeciderWebForms(); - - LOG.info("Opened thrift serving form."); - - LOG.info("preMain() complete."); - } - - @Override - public void main() throws InterruptedException, TimeoutException, EarlybirdStartupException { - innerMain(); - } - - /** - * Setting up an innerMain() so that tests can mock out the contents of main without interfering - * with reflection being done in App.scala looking for a method named "main". - */ - @VisibleForTesting - void innerMain() throws TimeoutException, InterruptedException, EarlybirdStartupException { - LOG.info("Starting main()."); - - // If this method throws, TwitterServer will catch the exception and call close, so we don't - // catch it here. 
- try { - earlybirdServer.start(); - } catch (Throwable throwable) { - LOG.error("Exception while starting:", throwable); - throw throwable; - } - - Await.ready(adminHttpServer()); - LOG.info("main() complete."); - } - - @Override - public void onExit() { - LOG.info("Starting onExit()"); - earlybirdServer.shutdown(); - try { - earlybirdAdminManager.doShutdown(); - } catch (InterruptedException e) { - LOG.warn("earlybirdAdminManager shutdown was interrupted with " + e); - } - LOG.info("onExit() complete."); - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.docx b/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.docx new file mode 100644 index 000000000..312299647 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.java b/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.java deleted file mode 100644 index 0fdfbc1d5..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdCPUQualityFactor.java +++ /dev/null @@ -1,181 +0,0 @@ -package com.twitter.search.earlybird; - -import com.google.common.annotations.VisibleForTesting; -import com.sun.management.OperatingSystemMXBean; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchStatsReceiver; - -/** - * Manages the quality factor for an Earlybird based on CPU usage. 
- */ -public class EarlybirdCPUQualityFactor implements QualityFactor { - public static final String ENABLE_QUALITY_FACTOR_DECIDER = "enable_quality_factor"; - public static final String OVERRIDE_QUALITY_FACTOR_DECIDER = "override_quality_factor"; - - @VisibleForTesting - protected static final double CPU_USAGE_THRESHOLD = 0.8; - @VisibleForTesting - protected static final double MAX_QF_INCREMENT = 0.5; - @VisibleForTesting - protected static final double MAX_QF_DECREMENT = 0.1; - @VisibleForTesting - protected static final double MAX_CPU_USAGE = 1.0; - - private static final Logger QUALITY_FACTOR_LOG = - LoggerFactory.getLogger(EarlybirdCPUQualityFactor.class); - private static final Logger EARLYBIRD_LOG = LoggerFactory.getLogger(Earlybird.class); - - /** - * Tracks the real, underlying CPU QF value, regardless of the decider enabling - * it. - */ - @VisibleForTesting - protected static final String UNDERLYING_CPU_QF_GUAGE = "underlying_cpu_quality_factor"; - - /** - * Reports the QF actually used to degrade Earlybirds. - */ - @VisibleForTesting - protected static final String CPU_QF_GUAGE = "cpu_quality_factor"; - - private static final int SAMPLING_WINDOW_MILLIS = 60 * 1000; // one minute - - - private double qualityFactor = 1; - private double previousQualityFactor = 1; - - private final SearchDecider decider; - private final OperatingSystemMXBean operatingSystemMXBean; - - public EarlybirdCPUQualityFactor( - Decider decider, - OperatingSystemMXBean operatingSystemMXBean, - SearchStatsReceiver searchStatsReceiver) { - this.decider = new SearchDecider(decider); - this.operatingSystemMXBean = operatingSystemMXBean; - - searchStatsReceiver.getCustomGauge(UNDERLYING_CPU_QF_GUAGE, () -> qualityFactor); - searchStatsReceiver.getCustomGauge(CPU_QF_GUAGE, this::get); - } - - /** - * Updates the current quality factor based on CPU usage. 
- */ - @VisibleForTesting - protected void update() { - previousQualityFactor = qualityFactor; - - double cpuUsage = operatingSystemMXBean.getSystemCpuLoad(); - - if (cpuUsage < CPU_USAGE_THRESHOLD) { - double increment = - ((CPU_USAGE_THRESHOLD - cpuUsage) / CPU_USAGE_THRESHOLD) * MAX_QF_INCREMENT; - qualityFactor = Math.min(1, qualityFactor + increment); - } else { - double decrement = - ((cpuUsage - CPU_USAGE_THRESHOLD) / (MAX_CPU_USAGE - CPU_USAGE_THRESHOLD)) - * MAX_QF_DECREMENT; - qualityFactor = Math.max(0, qualityFactor - decrement); - } - - if (!qualityFactorChanged()) { - return; - } - - QUALITY_FACTOR_LOG.info( - String.format("CPU: %.2f Quality Factor: %.2f", cpuUsage, qualityFactor)); - - if (!enabled()) { - return; - } - - if (degradationBegan()) { - EARLYBIRD_LOG.info("Service degradation began."); - } - - if (degradationEnded()) { - EARLYBIRD_LOG.info("Service degradation ended."); - } - } - - @Override - public double get() { - if (!enabled()) { - return 1; - } - - if (isOverridden()) { - return override(); - } - - return qualityFactor; - } - - @Override - public void startUpdates() { - new Thread(() -> { - while (true) { - update(); - try { - Thread.sleep(SAMPLING_WINDOW_MILLIS); - } catch (InterruptedException e) { - QUALITY_FACTOR_LOG.warn( - "Quality factoring thread interrupted during sleep between updates", e); - } - } - }).start(); - } - - /** - * Returns true if quality factoring is enabled by the decider. - * @return - */ - private boolean enabled() { - return decider != null && decider.isAvailable(ENABLE_QUALITY_FACTOR_DECIDER); - } - - /** - * Returns true if a decider has overridden the quality factor. - * @return - */ - private boolean isOverridden() { - return decider != null && decider.getAvailability(OVERRIDE_QUALITY_FACTOR_DECIDER) < 10000.0; - } - - /** - * Returns the override decider value. - * @return - */ - private double override() { - return decider == null ? 
1 : decider.getAvailability(OVERRIDE_QUALITY_FACTOR_DECIDER) / 10000.0; - } - - /** - * Returns true if the quality factor has changed since the last update. - * @return - */ - private boolean qualityFactorChanged() { - return Math.abs(qualityFactor - previousQualityFactor) > 0.01; - } - - /** - * Returns true if we've entered a degraded state. - * @return - */ - private boolean degradationBegan() { - return Math.abs(previousQualityFactor - 1.0) < 0.01 && qualityFactor < previousQualityFactor; - } - - /** - * Returns true if we've left the degraded state. - * @return - */ - private boolean degradationEnded() { - return Math.abs(qualityFactor - 1.0) < 0.01 && previousQualityFactor < qualityFactor; - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.docx b/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.docx new file mode 100644 index 000000000..b1fbcd1b8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.java b/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.java deleted file mode 100644 index c0d2fea2e..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdDarkProxy.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.twitter.search.earlybird; - -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Lists; - -import org.apache.thrift.protocol.TCompactProtocol; - -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.builder.ClientBuilder; -import com.twitter.finagle.builder.ClientConfig.Yes; -import com.twitter.finagle.mtls.client.MtlsThriftMuxClient; -import com.twitter.finagle.stats.StatsReceiver; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.finagle.zipkin.thrift.ZipkinTracer; -import com.twitter.search.common.dark.DarkProxy; -import 
com.twitter.search.common.dark.ResolverProxy; -import com.twitter.search.common.dark.ServerSetResolver; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.util.thrift.BytesToThriftFilter; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.util.Duration; - -public class EarlybirdDarkProxy { - private static final String WARM_UP_DECIDER_KEY_PREFIX = "warmup_"; - - private static final int DARK_REQUESTS_TOTAL_REQUEST_TIMEOUT_MS = - EarlybirdConfig.getInt("dark_requests_total_request_timeout_ms", 800); - private static final int DARK_REQUESTS_INDIVIDUAL_REQUEST_TIMEOUT_MS = - EarlybirdConfig.getInt("dark_requests_individual_request_timeout_ms", 800); - private static final int DARK_REQUESTS_CONNECT_TIMEOUT_MS = - EarlybirdConfig.getInt("dark_requests_connect_timeout_ms", 500); - private static final int DARK_REQUESTS_NUM_RETRIES = - EarlybirdConfig.getInt("dark_requests_num_retries", 1); - private static final String DARK_REQUESTS_FINAGLE_CLIENT_ID = - EarlybirdConfig.getString("dark_requests_finagle_client_id", "earlybird_warmup"); - - private final DarkProxy darkProxy; - - public EarlybirdDarkProxy(SearchDecider searchDecider, - StatsReceiver statsReceiver, - EarlybirdServerSetManager earlybirdServerSetManager, - EarlybirdWarmUpManager earlybirdWarmUpManager, - String clusterName) { - darkProxy = newDarkProxy(searchDecider, - statsReceiver, - earlybirdServerSetManager, - earlybirdWarmUpManager, - clusterName); - } - - public DarkProxy getDarkProxy() { - return darkProxy; - } - - @VisibleForTesting - protected DarkProxy newDarkProxy( - SearchDecider searchDecider, - StatsReceiver statsReceiver, - EarlybirdServerSetManager earlybirdServerSetManager, - final EarlybirdWarmUpManager earlybirdWarmUpManager, - String clusterName) { - ResolverProxy resolverProxy = new ResolverProxy(); - ServerSetResolver.SelfServerSetResolver 
selfServerSetResolver = - new ServerSetResolver.SelfServerSetResolver( - earlybirdServerSetManager.getServerSetIdentifier(), resolverProxy); - selfServerSetResolver.init(); - - final String clusterNameForDeciderKey = clusterName.toLowerCase().replaceAll("-", "_"); - final String warmUpServerSetIdentifier = earlybirdWarmUpManager.getServerSetIdentifier(); - DarkProxy newDarkProxy = new DarkProxy( - selfServerSetResolver, - newClientBuilder(statsReceiver), - resolverProxy, - searchDecider, - Lists.newArrayList(warmUpServerSetIdentifier), - new BytesToThriftFilter(), - statsReceiver) { - @Override - protected String getServicePathDeciderKey(String servicePath) { - if (warmUpServerSetIdentifier.equals(servicePath)) { - return WARM_UP_DECIDER_KEY_PREFIX + clusterNameForDeciderKey; - } - - return clusterNameForDeciderKey; - } - }; - - newDarkProxy.init(); - return newDarkProxy; - } - - private ClientBuilder newClientBuilder( - StatsReceiver statsReceiver) { - return ClientBuilder.get() - .daemon(true) - .timeout(Duration.apply(DARK_REQUESTS_TOTAL_REQUEST_TIMEOUT_MS, TimeUnit.MILLISECONDS)) - .requestTimeout( - Duration.apply(DARK_REQUESTS_INDIVIDUAL_REQUEST_TIMEOUT_MS, TimeUnit.MILLISECONDS)) - .tcpConnectTimeout(Duration.apply(DARK_REQUESTS_CONNECT_TIMEOUT_MS, TimeUnit.MILLISECONDS)) - .retries(DARK_REQUESTS_NUM_RETRIES) - .reportTo(statsReceiver) - .tracer(ZipkinTracer.mk(statsReceiver)) - .stack(new MtlsThriftMuxClient( - ThriftMux.client()) - .withMutualTls(EarlybirdProperty.getServiceIdentifier()) - .withProtocolFactory(new TCompactProtocol.Factory()) - .withClientId(new ClientId(DARK_REQUESTS_FINAGLE_CLIENT_ID))); - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.docx b/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.docx new file mode 100644 index 000000000..00d93b966 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.java b/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.java deleted file mode 100644 index c84083475..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdFinagleServerManager.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.search.earlybird; - -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.search.common.dark.DarkProxy; -import com.twitter.search.earlybird.thrift.EarlybirdService; -import com.twitter.util.Duration; - -/** - * Manages a finagle server underneath, which can be recreated. - * - * This class is not thread-safe. It is up to the concrete implementations and their callers to - * correctly synchronize calls to these methods (for example, to make sure that there is no race - * condition if startProductionFinagleServer() and stopProductionFinagleServer() are called - * concurrently from two different threads). - */ -public interface EarlybirdFinagleServerManager { - /** - * Determines if the warm up finagle server is currently running - */ - boolean isWarmUpServerRunning(); - - /** - * Starts up the warm up finagle server on the given port. - */ - void startWarmUpFinagleServer( - EarlybirdService.ServiceIface serviceIface, - String serviceName, - int port); - - /** - * Stops the warm up finagle server, after waiting for at most the given amount of time. - */ - void stopWarmUpFinagleServer(Duration serverCloseWaitTime) throws InterruptedException; - - /** - * Determines if the production finagle server is currently running. - */ - boolean isProductionServerRunning(); - - /** - * Starts up the production finagle server on the given port. - */ - void startProductionFinagleServer( - DarkProxy darkProxy, - EarlybirdService.ServiceIface serviceIface, - String serviceName, - int port); - - /** - * Stops the production finagle server after waiting for at most the given amount of time. 
- */ - void stopProductionFinagleServer(Duration serverCloseWaitTime) throws InterruptedException; -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.docx b/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.docx new file mode 100644 index 000000000..0c28b9e4b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.java b/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.java deleted file mode 100644 index 180b058f7..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdFuturePoolManager.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.earlybird; - -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.ThreadFactory; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -import scala.Function0; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.util.concurrent.ThreadFactoryBuilder; - -import com.twitter.search.common.concurrent.ThreadPoolExecutorStats; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.util.ExecutorServiceFuturePool; -import com.twitter.util.Future; -import com.twitter.util.FuturePool; - -/** - * A future pool that delegates all calls to an underlying futurePool, which can be recreated. 
- */ -public class EarlybirdFuturePoolManager implements FuturePool { - private volatile ExecutorServiceFuturePool pool = null; - - private final String threadName; - private final ThreadPoolExecutorStats threadPoolExecutorStats; - - public EarlybirdFuturePoolManager(String threadName) { - this.threadName = threadName; - this.threadPoolExecutorStats = new ThreadPoolExecutorStats(threadName); - } - - final synchronized void createUnderlyingFuturePool(int threadCount) { - Preconditions.checkState(pool == null, "Cannot create a new pool before stopping the old one"); - - ExecutorService executorService = - createExecutorService(threadCount, getMaxQueueSize()); - if (executorService instanceof ThreadPoolExecutor) { - threadPoolExecutorStats.setUnderlyingExecutorForStats((ThreadPoolExecutor) executorService); - } - - pool = new ExecutorServiceFuturePool(executorService); - } - - final synchronized void stopUnderlyingFuturePool(long timeout, TimeUnit timeunit) - throws InterruptedException { - Preconditions.checkNotNull(pool); - pool.executor().shutdown(); - pool.executor().awaitTermination(timeout, timeunit); - pool = null; - } - - boolean isPoolReady() { - return pool != null; - } - - @Override - public final Future apply(Function0 f) { - return Preconditions.checkNotNull(pool).apply(f); - } - - @VisibleForTesting - protected ExecutorService createExecutorService(int threadCount, int maxQueueSize) { - if (maxQueueSize <= 0) { - return Executors.newFixedThreadPool(threadCount, createThreadFactory(threadName)); - } - - SearchRateCounter rejectedTaskCounter = - SearchRateCounter.export(threadName + "_rejected_task_count"); - return new ThreadPoolExecutor( - threadCount, threadCount, 0, TimeUnit.MILLISECONDS, - new ArrayBlockingQueue<>(maxQueueSize), - createThreadFactory(threadName), - (runnable, executor) -> { - rejectedTaskCounter.increment(); - throw new RejectedExecutionException(threadName + " queue is full"); - }); - } - - @VisibleForTesting - protected int 
getMaxQueueSize() { - return EarlybirdProperty.MAX_QUEUE_SIZE.get(0); - } - - @VisibleForTesting - static ThreadFactory createThreadFactory(String threadName) { - return new ThreadFactoryBuilder() - .setNameFormat(threadName + "-%d") - .setDaemon(true) - .build(); - } - - @Override - public int poolSize() { - return Preconditions.checkNotNull(pool).poolSize(); - } - - @Override - public int numActiveTasks() { - return Preconditions.checkNotNull(pool).numActiveTasks(); - } - - @Override - public long numCompletedTasks() { - return Preconditions.checkNotNull(pool).numCompletedTasks(); - } - - -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.docx b/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.docx new file mode 100644 index 000000000..f16713c7f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.java b/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.java deleted file mode 100644 index b5928c651..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdIndexConfig.java +++ /dev/null @@ -1,190 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Predicate; -import com.google.common.base.Predicates; - -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.store.Directory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.schema.base.Schema.SchemaValidationException; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdSchemaCreateTool; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import 
com.twitter.search.common.util.CloseResourceUtil; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory; -import com.twitter.search.earlybird.document.ThriftIndexingEventUpdateFactory; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentSyncInfo; -import com.twitter.search.earlybird.partition.UserPartitionUtil; - -/** - * Collection of required indexing entities that differ in the various Earlybird clusters. - */ -public abstract class EarlybirdIndexConfig { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdIndexConfig.class); - - private final EarlybirdCluster cluster; - private final DynamicSchema schema; - private final Decider decider; - private final SearchIndexingMetricSet searchIndexingMetricSet; - protected final CriticalExceptionHandler criticalExceptionHandler; - - /** - * Creates a new index config using an applicable schema built for the provided cluster. 
- */ - protected EarlybirdIndexConfig( - EarlybirdCluster cluster, Decider decider, SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - this(cluster, buildSchema(cluster), decider, searchIndexingMetricSet, - criticalExceptionHandler); - } - - @VisibleForTesting - protected EarlybirdIndexConfig( - EarlybirdCluster cluster, - DynamicSchema schema, - Decider decider, - SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - this.cluster = cluster; - this.schema = schema; - this.decider = decider; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.criticalExceptionHandler = criticalExceptionHandler; - LOG.info("This Earlybird uses index config: " + this.getClass().getSimpleName()); - } - - private static DynamicSchema buildSchema(EarlybirdCluster cluster) { - try { - return EarlybirdSchemaCreateTool.buildSchema(cluster); - } catch (SchemaValidationException e) { - throw new RuntimeException(e); - } - } - - /** - * Creates the appropriate document factory for this earlybird. - */ - public final DocumentFactory createDocumentFactory() { - return new ThriftIndexingEventDocumentFactory( - getSchema(), getCluster(), decider, searchIndexingMetricSet, - criticalExceptionHandler); - } - - /** - * Creates a document factory for ThriftIndexingEvents that are updates to the index. - */ - public final DocumentFactory createUpdateFactory() { - return new ThriftIndexingEventUpdateFactory( - getSchema(), getCluster(), decider, criticalExceptionHandler); - } - - /** - * Return the EarlybirdCluster enum identifying the cluster this config is for. - */ - public final EarlybirdCluster getCluster() { - return cluster; - } - - /** - * Return the default filter for UserUpdatesTable - for the archive cluster keep - * users that belong to the current partition. 
- */ - public final Predicate getUserTableFilter(PartitionConfig partitionConfig) { - if (EarlybirdCluster.isArchive(getCluster())) { - return UserPartitionUtil.filterUsersByPartitionPredicate(partitionConfig); - } - - return Predicates.alwaysTrue(); - } - - /** - * Creates a new Lucene {@link Directory} to be used for indexing documents. - */ - public abstract Directory newLuceneDirectory(SegmentSyncInfo segmentSyncInfo) throws IOException; - - /** - * Creates a new Lucene IndexWriterConfig that can be used for creating a segment writer for a - * new segment. - */ - public abstract IndexWriterConfig newIndexWriterConfig(); - - /** - * Creates a new SegmentData object to add documents to. - */ - public abstract EarlybirdIndexSegmentData newSegmentData( - int maxSegmentSize, - long timeSliceID, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory); - - /** - * Loads a flushed index for the given segment. - */ - public abstract EarlybirdIndexSegmentData loadSegmentData( - FlushInfo flushInfo, - DataDeserializer dataInputStream, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory) throws IOException; - - /** - * Creates a new segment optimizer for the given segment data. - */ - public abstract EarlybirdIndexSegmentData optimize( - EarlybirdIndexSegmentData earlybirdIndexSegmentData) throws IOException; - - /** - * Whether the index is stored on disk or not. If an index is not on disk, it is presumed to be - * in memory. - */ - public abstract boolean isIndexStoredOnDisk(); - - /** - * Whether documents are search in LIFO ordering (RT mode), or default (Lucene) FIFO ordering - */ - public final boolean isUsingLIFODocumentOrdering() { - return !isIndexStoredOnDisk(); - } - - /** - * Whether this index supports out-of-order indexing - */ - public abstract boolean supportOutOfOrderIndexing(); - - /** - * Returns a CloseResourceUtil used for closing resources. 
- */ - public abstract CloseResourceUtil getResourceCloser(); - - /** - * Returns the schema for this index configuration. - */ - public final DynamicSchema getSchema() { - return schema; - } - - /** - * Returns the decider used by this EarlybirdIndexConfig instance. - */ - public Decider getDecider() { - return decider; - } - - public SearchIndexingMetricSet getSearchIndexingMetricSet() { - return searchIndexingMetricSet; - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdMain.docx b/src/java/com/twitter/search/earlybird/EarlybirdMain.docx new file mode 100644 index 000000000..d390d0a0f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdMain.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdMain.java b/src/java/com/twitter/search/earlybird/EarlybirdMain.java deleted file mode 100644 index 809d1b7c9..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdMain.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.search.earlybird; - -public final class EarlybirdMain { - private EarlybirdMain() { - } - - public static void main(String[] args) { - new Earlybird().main(args); - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.docx b/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.docx new file mode 100644 index 000000000..7f7d2f39b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.java b/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.java deleted file mode 100644 index 3bdfa78a9..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdProductionFinagleServerManager.java +++ /dev/null @@ -1,151 +0,0 @@ -package com.twitter.search.earlybird; - -import java.net.InetSocketAddress; -import 
java.util.concurrent.atomic.AtomicReference; - -import org.apache.thrift.protocol.TCompactProtocol; -import org.apache.thrift.protocol.TProtocolFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.finagle.ListeningServer; -import com.twitter.finagle.Service; -import com.twitter.finagle.SslException; -import com.twitter.finagle.ThriftMux; -import com.twitter.finagle.mtls.server.MtlsThriftMuxServer; -import com.twitter.finagle.mux.transport.OpportunisticTls; -import com.twitter.finagle.stats.MetricsStatsReceiver; -import com.twitter.finagle.thrift.ThriftClientRequest; -import com.twitter.finagle.util.ExitGuard; -import com.twitter.finagle.zipkin.thrift.ZipkinTracer; -import com.twitter.search.common.dark.DarkProxy; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.EarlybirdFinagleServerMonitor; -import com.twitter.search.earlybird.thrift.EarlybirdService; -import com.twitter.server.filter.AdmissionControl; -import com.twitter.server.filter.cpuAdmissionControl; -import com.twitter.util.Await; -import com.twitter.util.Duration; -import com.twitter.util.TimeoutException; - -public class EarlybirdProductionFinagleServerManager implements EarlybirdFinagleServerManager { - private static final Logger LOG = - LoggerFactory.getLogger(EarlybirdProductionFinagleServerManager.class); - - private final AtomicReference warmUpFinagleServer = new AtomicReference<>(); - private final AtomicReference productionFinagleServer = new AtomicReference<>(); - private final EarlybirdFinagleServerMonitor unhandledExceptionMonitor; - - public EarlybirdProductionFinagleServerManager( - CriticalExceptionHandler criticalExceptionHandler) { - this.unhandledExceptionMonitor = - new EarlybirdFinagleServerMonitor(criticalExceptionHandler); - } - - @Override - public boolean isWarmUpServerRunning() { - return 
warmUpFinagleServer.get() != null; - } - - @Override - public void startWarmUpFinagleServer(EarlybirdService.ServiceIface serviceIface, - String serviceName, - int port) { - TProtocolFactory protocolFactory = new TCompactProtocol.Factory(); - startFinagleServer(warmUpFinagleServer, "warmup", - new EarlybirdService.Service(serviceIface, protocolFactory), - protocolFactory, serviceName, port); - } - - @Override - public void stopWarmUpFinagleServer(Duration serverCloseWaitTime) throws InterruptedException { - stopFinagleServer(warmUpFinagleServer, serverCloseWaitTime, "Warm up"); - } - - @Override - public boolean isProductionServerRunning() { - return productionFinagleServer.get() != null; - } - - @Override - public void startProductionFinagleServer(DarkProxy darkProxy, - EarlybirdService.ServiceIface serviceIface, - String serviceName, - int port) { - TProtocolFactory protocolFactory = new TCompactProtocol.Factory(); - startFinagleServer(productionFinagleServer, "production", - darkProxy.toFilter().andThen(new EarlybirdService.Service(serviceIface, protocolFactory)), - protocolFactory, serviceName, port); - } - - @Override - public void stopProductionFinagleServer(Duration serverCloseWaitTime) - throws InterruptedException { - stopFinagleServer(productionFinagleServer, serverCloseWaitTime, "Production"); - } - - private void startFinagleServer(AtomicReference target, String serverDescription, - Service service, TProtocolFactory protocolFactory, String serviceName, - int port) { - target.set(getServer(service, serviceName, port, protocolFactory)); - LOG.info("Started EarlybirdServer " + serverDescription + " finagle server on port " + port); - } - - private ListeningServer getServer( - Service service, String serviceName, int port, - TProtocolFactory protocolFactory) { - MetricsStatsReceiver statsReceiver = new MetricsStatsReceiver(); - ThriftMux.Server server = new MtlsThriftMuxServer(ThriftMux.server()) - .withMutualTls(EarlybirdProperty.getServiceIdentifier()) - 
.withServiceClass(EarlybirdService.class) - .withOpportunisticTls(OpportunisticTls.Required()) - .withLabel(serviceName) - .withStatsReceiver(statsReceiver) - .withTracer(ZipkinTracer.mk(statsReceiver)) - .withMonitor(unhandledExceptionMonitor) - .withProtocolFactory(protocolFactory); - - if (cpuAdmissionControl.isDefined()) { - LOG.info("cpuAdmissionControl flag is set, replacing AuroraThrottlingAdmissionFilter" - + " with LinuxCpuAdmissionFilter"); - server = server - .configured(AdmissionControl.auroraThrottling().off().mk()) - .configured(AdmissionControl.linuxCpu().useGlobalFlag().mk()); - } - - return server.serve(new InetSocketAddress(port), service); - } - - private void stopFinagleServer(AtomicReference finagleServer, - Duration serverCloseWaitTime, - String serverDescription) throws InterruptedException { - try { - LOG.info("Waiting for " + serverDescription + " finagle server to close. " - + "Current time is " + System.currentTimeMillis()); - Await.result(finagleServer.get().close(), serverCloseWaitTime); - LOG.info("Stopped " + serverDescription + " finagle server. Current time is " - + System.currentTimeMillis()); - finagleServer.set(null); - } catch (TimeoutException e) { - LOG.warn(serverDescription + " finagle server did not shutdown cleanly.", e); - } catch (SslException e) { - // Closing the Thrift port seems to throw an SSLException (SSLEngine closed already). - // See SEARCH-29449. Log the exception and reset finagleServer, so that future calls to - // startProductionFinagleServer() succeed. - LOG.warn("Got a SSLException while trying to close the Thrift port.", e); - finagleServer.set(null); - } catch (InterruptedException e) { - // If we catch an InterruptedException here, it means that we're probably shutting down. - // We should propagate this exception, and rely on EarlybirdServer.stopThriftService() - // to do the right thing. 
- throw e; - } catch (Exception e) { - LOG.error(e.getMessage(), e); - } finally { - // If the finagle server does not close cleanly, this line prints details about - // the ExitGuards. - LOG.info(serverDescription + " server ExitGuard explanation: " + ExitGuard.explainGuards()); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdSearcher.docx b/src/java/com/twitter/search/earlybird/EarlybirdSearcher.docx new file mode 100644 index 000000000..6342fc959 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdSearcher.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdSearcher.java b/src/java/com/twitter/search/earlybird/EarlybirdSearcher.java deleted file mode 100644 index 386c0bcb6..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdSearcher.java +++ /dev/null @@ -1,1918 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; -import javax.annotation.Nonnull; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Joiner; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.ImmutableSet; -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringUtils; -import org.apache.lucene.index.Term; -import org.apache.lucene.queryparser.classic.ParseException; -import org.apache.lucene.queryparser.classic.QueryParser; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.thrift.TException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; 
-import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.features.thrift.ThriftSearchFeatureSchema; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.query.MappableField; -import com.twitter.search.common.query.QueryHitAttributeHelper; -import com.twitter.search.common.query.thriftjava.CollectorParams; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; -import com.twitter.search.common.query.thriftjava.EarlyTerminationInfo; -import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams; -import com.twitter.search.common.ranking.thriftjava.ThriftScoringFunctionType; -import com.twitter.search.common.results.thriftjava.FieldHitList; -import com.twitter.search.common.schema.SchemaUtil; -import com.twitter.search.common.schema.SearchWhitespaceAnalyzer; -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.common.search.TwitterEarlyTerminationCollector; -import com.twitter.search.common.search.termination.QueryTimeoutFactory; -import com.twitter.search.common.util.earlybird.EarlybirdResponseUtil; -import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager; -import com.twitter.search.common.util.thrift.ThriftUtils; -import com.twitter.search.core.earlybird.facets.FacetCountState; -import 
com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.ClientException; -import com.twitter.search.earlybird.exception.TransientException; -import com.twitter.search.earlybird.index.facets.FacetSkipList; -import com.twitter.search.earlybird.ml.ScoringModelsManager; -import com.twitter.search.earlybird.partition.AudioSpaceTable; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.querycache.QueryCacheConversionRules; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.queryparser.DetectFieldAnnotationVisitor; -import com.twitter.search.earlybird.queryparser.EarlybirdLuceneQueryVisitor; -import com.twitter.search.earlybird.queryparser.HighFrequencyTermPairRewriteVisitor; -import com.twitter.search.earlybird.queryparser.LuceneRelevanceQueryVisitor; -import com.twitter.search.earlybird.queryparser.ProtectedOperatorQueryRewriter; -import com.twitter.search.earlybird.search.AbstractResultsCollector; -import com.twitter.search.earlybird.search.AntiGamingFilter; -import com.twitter.search.earlybird.search.queries.BadUserRepFilter; -import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher; -import com.twitter.search.earlybird.search.EarlybirdMultiSegmentSearcher; -import com.twitter.search.earlybird.search.queries.MatchAllDocsQuery; -import com.twitter.search.earlybird.search.queries.RequiredStatusIDsFilter; -import com.twitter.search.earlybird.search.SearchRequestInfo; -import com.twitter.search.earlybird.search.SearchResultsCollector; -import com.twitter.search.earlybird.search.SearchResultsInfo; -import 
com.twitter.search.earlybird.search.SimpleSearchResults; -import com.twitter.search.earlybird.search.SocialFilter; -import com.twitter.search.earlybird.search.SocialSearchResultsCollector; -import com.twitter.search.earlybird.search.queries.UserFlagsExcludeFilter; -import com.twitter.search.earlybird.search.queries.UserIdMultiSegmentQuery; -import com.twitter.search.earlybird.search.facets.EntityAnnotationCollector; -import com.twitter.search.earlybird.search.facets.ExpandedUrlCollector; -import com.twitter.search.earlybird.search.facets.ExplainFacetResultsCollector; -import com.twitter.search.earlybird.search.facets.FacetRankingModule; -import com.twitter.search.earlybird.search.facets.FacetResultsCollector; -import com.twitter.search.earlybird.search.facets.FacetSearchRequestInfo; -import com.twitter.search.earlybird.search.facets.NamedEntityCollector; -import com.twitter.search.earlybird.search.facets.SpaceFacetCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsRequestInfo; -import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo; -import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults; -import com.twitter.search.earlybird.search.relevance.collectors.AbstractRelevanceCollector; -import com.twitter.search.earlybird.search.relevance.collectors.BatchRelevanceTopCollector; -import com.twitter.search.earlybird.search.relevance.collectors.RelevanceAllCollector; -import com.twitter.search.earlybird.search.relevance.collectors.RelevanceTopCollector; -import com.twitter.search.earlybird.search.relevance.scoring.RelevanceQuery; -import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction; -import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider; -import com.twitter.search.earlybird.search.relevance.scoring.TensorflowBasedScoringFunction; -import 
com.twitter.search.earlybird.stats.EarlybirdRPCStats; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.EarlybirdDebugInfo; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftFacetRequest; -import com.twitter.search.earlybird.thrift.ThriftFacetResults; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRankingMode; -import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTermRequest; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; -import com.twitter.search.earlybird.util.EarlybirdSearchResultUtil; -import com.twitter.search.queryparser.parser.SerializedQueryParser; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.QueryNodeUtils; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.search.SearchOperator; 
-import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import com.twitter.search.queryparser.util.IdTimeRanges; -import com.twitter.search.queryparser.visitors.ConversionVisitor; -import com.twitter.search.queryparser.visitors.DetectPositiveOperatorVisitor; -import com.twitter.search.queryparser.visitors.NamedDisjunctionVisitor; -import com.twitter.search.queryparser.visitors.ProximityGroupRewriteVisitor; -import com.twitter.search.queryparser.visitors.StripAnnotationsVisitor; - -import static com.twitter.search.queryparser.query.search.SearchOperator.Type.UNTIL_TIME; - -/** - * This class provides the basic search() method: - * - converts the thrift request object into what lucene expects. - * - gets the segment. - * - handles all errors, and prepares the response in case of error. - * - * We have one instance of this class per search received. - */ -public class EarlybirdSearcher { - public enum QueryMode { - // Please think before adding more query modes: can this be implemented in a general way? 
- RECENCY(new EarlybirdRPCStats("search_recency")), - FACETS(new EarlybirdRPCStats("search_facets")), - TERM_STATS(new EarlybirdRPCStats("search_termstats")), - RELEVANCE(new EarlybirdRPCStats("search_relevance")), - TOP_TWEETS(new EarlybirdRPCStats("search_toptweets")); - - private final EarlybirdRPCStats requestStats; - - QueryMode(EarlybirdRPCStats requestStats) { - this.requestStats = requestStats; - } - - public EarlybirdRPCStats getRequestStats() { - return requestStats; - } - } - - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdSearcher.class); - private static final String MATCH_ALL_SERIALIZED_QUERY = "(* )"; - /** - * generic field annotations can be mapped to a concrete field in the index using this mapping - * via {@link com.twitter.search.queryparser.query.annotation.Annotation.Type#MAPPABLE_FIELD} - */ - private static final Map MAPPABLE_FIELD_MAP = - ImmutableMap.of( - MappableField.URL, - EarlybirdFieldConstant.RESOLVED_LINKS_TEXT_FIELD.getFieldName()); - - private static final String ALLOW_QUERY_SPECIFIC_SIGNAL_DECIDER_KEY - = "allow_query_specific_score_adjustments"; - - @VisibleForTesting - public static final String ALLOW_AUTHOR_SPECIFIC_SIGNAL_DECIDER_KEY - = "allow_author_specific_score_adjustments"; - - private static final String USE_MULTI_TERM_DISJUNCTION_FOR_LIKED_BY_USER_IDS_DECIDER_KEY - = "use_multi_term_disjunction_for_liked_by_user_ids"; - - private static final String ALLOW_CAMELCASE_USERNAME_FIELD_WEIGHT_OVERRIDE_DECIDER_KEY_PREFIX - = "allow_camelcase_username_field_weight_override_in_"; - - private static final String ALLOW_TOKENIZED_DISPLAY_NAME_FIELD_WEIGHT_OVERRIDE_DECIDER_KEY_PREFIX - = "allow_tokenized_display_name_field_weight_override_in_"; - - private static final boolean ALLOW_QUERY_SPECIFIC_SIGNAL_CONFIG - = EarlybirdConfig.getBool("allow_query_specific_score_adjustments", false); - - private static final boolean ALLOW_AUTHOR_SPECIFIC_SIGNAL_CONFIG - = 
EarlybirdConfig.getBool("allow_author_specific_score_adjustments", false); - - public static final int DEFAULT_NUM_FACET_RESULTS = 100; - - private final ImmutableSchemaInterface schemaSnapshot; - private final EarlybirdCluster cluster; - - private final Clock clock; - private final Decider decider; - - // The actual request thrift. - private final EarlybirdRequest request; - - // searchQuery from inside the request. - private final ThriftSearchQuery searchQuery; - - // CollectorParams from inside the searchQuery; - private final CollectorParams collectorParams; - - // Parsed query (parsed from serialized query string in request). - private com.twitter.search.queryparser.query.Query parsedQuery; - private boolean parsedQueryAllowNullcast; - private IdTimeRanges idTimeRanges; - - // Lucene version of the above. This is what we will actually be executing. - private org.apache.lucene.search.Query luceneQuery; - - // Used for queries where we want to collect per-field hit attribution - @Nullable - private QueryHitAttributeHelper hitAttributeHelper; - - // Debugging info can be appended to this buffer. - private final StringBuilder messageBuffer = new StringBuilder(1024); - private final EarlybirdDebugInfo debugInfo = new EarlybirdDebugInfo(); - - // The segment we are searching, or null for the multi-searcher. - private Segment segment = null; - - // True iff we are searching all segments (multi-searcher). 
- private final boolean searchAllSegments; - - // Tracking termination criteria for this query - private final TerminationTracker terminationTracker; - - private EarlybirdLuceneSearcher searcher = null; - - private final SegmentManager segmentManager; - private final QueryCacheManager queryCacheManager; - private final ScoringModelsManager scoringModelsManager; - private final TensorflowModelsManager tensorflowModelsManager; - - private AntiGamingFilter antiGamingFilter = null; - - private final boolean searchHighFrequencyTermPairs = - EarlybirdConfig.getBool("search_high_frequency_term_pairs", false); - - // How long to allow post-termination when enforcing query timeout - private final int enforceQueryTimeoutBufferMillis = - EarlybirdConfig.getInt("enforce_query_timeout_buffer_millis", 50); - - private EarlybirdRPCStats requestStats; - - private QueryTimeoutFactory queryTimeoutFactory; - - // Exported stats - private final EarlybirdSearcherStats searcherStats; - - @VisibleForTesting - public static final SearchCounter FIELD_WEIGHT_OVERRIDE_MAP_NON_NULL_COUNT = - SearchCounter.export("field_weight_override_map_non_null_count"); - @VisibleForTesting - public static final SearchCounter DROPPED_CAMELCASE_USERNAME_FIELD_WEIGHT_OVERRIDE = - SearchCounter.export("dropped_camelcase_username_field_weight_override"); - @VisibleForTesting - public static final SearchCounter DROPPED_TOKENIZED_DISPLAY_NAME_FIELD_WEIGHT_OVERRIDE = - SearchCounter.export("dropped_tokenized_display_name_field_weight_override"); - - private static final SearchCounter RESPONSE_HAS_NO_THRIFT_SEARCH_RESULTS = - SearchCounter.export("tweets_earlybird_searcher_response_has_no_thrift_search_results"); - private static final SearchCounter CLIENT_HAS_FEATURE_SCHEMA_COUNTER = - SearchCounter.export("tweets_earlybird_searcher_client_has_feature_schema"); - private static final SearchCounter CLIENT_DOESNT_HAVE_FEATURE_SCHEMA_COUNTER = - 
SearchCounter.export("tweet_earlybird_searcher_client_doesnt_have_feature_schema"); - private static final SearchCounter COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_NOT_SET_COUNTER = - SearchCounter.export("collector_params_max_hits_to_process_not_set"); - private static final SearchCounter POSITIVE_PROTECTED_OPERATOR_DETECTED_COUNTER = - SearchCounter.export("positive_protected_operator_detected_counter"); - - // Query mode we are executing. - private final QueryMode queryMode; - - // facetRequest from inside the request (or null). - private final ThriftFacetRequest facetRequest; - - // termStatisticsRequest from inside the request (or null). - private final ThriftTermStatisticsRequest termStatisticsRequest; - - // Results fields filled in during searchInternal(). - private ThriftSearchResults searchResults = null; - private ThriftFacetResults facetResults = null; - private ThriftTermStatisticsResults termStatisticsResults = null; - private EarlyTerminationInfo earlyTerminationInfo = null; - - // Partition config used to fill in debugging info. - // If null, no debug info is written into results. 
- @Nullable - private final PartitionConfig partitionConfig; - - private final MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - - private final QualityFactor qualityFactor; - - private Set queriedFields; - private final AudioSpaceTable audioSpaceTable; - - public EarlybirdSearcher( - EarlybirdRequest request, - SegmentManager segmentManager, - AudioSpaceTable audioSpaceTable, - QueryCacheManager queryCacheManager, - ImmutableSchemaInterface schema, - EarlybirdCluster cluster, - @Nullable PartitionConfig partitionConfig, - Decider decider, - EarlybirdSearcherStats searcherStats, - ScoringModelsManager scoringModelsManager, - TensorflowModelsManager tensorflowModelsManager, - Clock clock, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - QueryTimeoutFactory queryTimeoutFactory, - QualityFactor qualityFactor) { - this.queryMode = getQueryMode(request); - this.schemaSnapshot = schema.getSchemaSnapshot(); - // set the request stats as early as possible, so that we can track errors that happen - // early on in query processing. - this.requestStats = queryMode.getRequestStats(); - this.facetRequest = request.isSetFacetRequest() ? request.getFacetRequest() : null; - this.termStatisticsRequest = request.isSetTermStatisticsRequest() - ? 
request.getTermStatisticsRequest() : null; - this.partitionConfig = partitionConfig; - this.searcherStats = searcherStats; - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.clock = clock; - this.decider = decider; - this.request = request; - this.segmentManager = segmentManager; - this.queryCacheManager = queryCacheManager; - this.cluster = cluster; - this.scoringModelsManager = scoringModelsManager; - this.tensorflowModelsManager = tensorflowModelsManager; - this.audioSpaceTable = audioSpaceTable; - // Note: we're deferring the validation/nullchecks until validateRequest() - // for more contained exception handling - this.searchQuery = request.getSearchQuery(); - this.collectorParams = this.searchQuery == null ? null : this.searchQuery.getCollectorParams(); - // Search all segments if searchSegmentId is unset. - this.searchAllSegments = !request.isSetSearchSegmentId(); - if (this.collectorParams == null - || !this.collectorParams.isSetTerminationParams()) { - this.terminationTracker = new TerminationTracker(clock); - } else if (request.isSetClientRequestTimeMs()) { - this.terminationTracker = new TerminationTracker(collectorParams.getTerminationParams(), - request.getClientRequestTimeMs(), clock, - getPostTerminationOverheadMillis(collectorParams.getTerminationParams())); - } else { - this.terminationTracker = new TerminationTracker( - collectorParams.getTerminationParams(), clock, - getPostTerminationOverheadMillis(collectorParams.getTerminationParams())); - } - this.queryTimeoutFactory = queryTimeoutFactory; - this.qualityFactor = qualityFactor; - } - - private int getPostTerminationOverheadMillis(CollectorTerminationParams terminationParams) { - // If enforcing timeouts, set the post-termination buffer to the smaller of the timeout or the - // configured buffer. This ensures that timeout >= buffer, and a request with a smaller timeout - // should just time out immediately (because timeout == buffer). 
- return (terminationParams.isEnforceQueryTimeout() && terminationParams.getTimeoutMs() > 0) - ? Math.min(enforceQueryTimeoutBufferMillis, terminationParams.getTimeoutMs()) : 0; - } - - // Appends a debug string to the buffer. - private void appendMessage(String message) { - messageBuffer.append(message).append("\n"); - } - - /** - * Processes an Earlybird search request. - * @return the earlybird response for this search request. - */ - public EarlybirdResponse search() { - try { - debugInfo.setHost(DatabaseConfig.getLocalHostname()); - - // Throws transient exception for invalid requests. - validateRequest(); - - // Throws client exception for bad queries, - parseEarlybirdRequest(); - - // Modify the Lucene query if necessary. - luceneQuery = postLuceneQueryProcess(luceneQuery); - - // Might return PARTITION_NOT_FOUND or PARTITION_DISABLED. - EarlybirdResponseCode code = initSearcher(); - if (code != EarlybirdResponseCode.SUCCESS) { - return respondError(code); - } - - return searchInternal(); - - } catch (TransientException e) { - LOG.error(String.format("Transient exception in search() for EarlybirdRequest:\n%s", request), - e); - appendMessage(e.getMessage()); - return respondError(EarlybirdResponseCode.TRANSIENT_ERROR); - } catch (ClientException e) { - LOG.warn(String.format("Client exception in search() %s for EarlybirdRequest:\n %s", - e, request)); - appendMessage(e.getMessage()); - return respondError(EarlybirdResponseCode.CLIENT_ERROR); - } catch (Exception e) { - LOG.warn(String.format("Uncaught exception in search() for EarlybirdRequest:\n%s", request), - e); - appendMessage(e.getMessage()); - return respondError(EarlybirdResponseCode.TRANSIENT_ERROR); - } catch (AssertionError e) { - LOG.warn(String.format("Assertion error in search() for EarlybirdRequest:\n%s", request), e); - appendMessage(e.getMessage()); - return respondError(EarlybirdResponseCode.TRANSIENT_ERROR); - } catch (Error e) { - // SEARCH-33166: If we got here, it means what was thrown 
was not an Exception, or anything - // we know how to handle. Log the Error for diagnostic purposes and propagate it. - LOG.error("Re-throwing uncaught error", e); - throw e; - } - } - - public EarlybirdRPCStats getRequestStats() { - return requestStats; - } - - /** - * Wraps the given query with the provided filter queries. - * - * @param query the query to wrap with filters. - * @param filters the filters to wrap the query with. - * @return a BooleanQuery wrapped with filters - */ - public static Query wrapFilters(Query query, Query... filters) { - boolean filtersEmpty = filters == null || filters.length == 0; - - if (!filtersEmpty) { - filtersEmpty = true; - for (Query f : filters) { - if (f != null) { - filtersEmpty = false; - break; - } - } - } - - if (filtersEmpty) { - if (query == null) { - return new MatchAllDocsQuery(); - } else { - return query; - } - } - - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - if (query != null) { - bqBuilder.add(query, Occur.MUST); - } - for (Query f : filters) { - if (f != null) { - bqBuilder.add(f, Occur.FILTER); - } - } - return bqBuilder.build(); - } - - // Examine all fields in the request for sanity. - private void validateRequest() throws TransientException, ClientException { - // First try thrift's internal validate. Should always succeed. - try { - request.validate(); - } catch (TException e) { - throw new TransientException(e.getMessage(), e); - } - - if (searchQuery == null) { - throw new TransientException("No ThriftSearchQuery specified"); - } - - if (collectorParams == null) { - throw new TransientException("No CollectorParams specified"); - } - - validateTermStatsRequest(); - - if (!searchAllSegments) { - if (request.getSearchSegmentId() <= 0) { - String msg = "Bad time slice ID: " + request.getSearchSegmentId(); - throw new TransientException(msg); - } - - // Initialize the segment. 
- SegmentInfo segmentInfo = this.segmentManager.getSegmentInfo(request.getSearchSegmentId()); - segment = segmentInfo != null ? segmentInfo.getSegment() : null; - } - - if (collectorParams.getNumResultsToReturn() < 0) { - String msg = "Invalid numResults: " + collectorParams.getNumResultsToReturn(); - throw new TransientException(msg); - } - - if (searchQuery.getNamedDisjunctionMapSize() > 0 && searchQuery.isSetLuceneQuery()) { - throw new ClientException("namedMultiTermDisjunctionMap does not support with luceneQuery"); - } - } - - private void validateTermStatsRequest() throws ClientException { - // Validate the field names and values for all ThriftTermRequests. - if (request.isSetTermStatisticsRequest() - && request.getTermStatisticsRequest().isSetTermRequests()) { - for (ThriftTermRequest termRequest : request.getTermStatisticsRequest().getTermRequests()) { - // If termRequest.fieldName is not set, it defaults to 'text', which is a string field, - // so we don't need to check the term. - if (termRequest.isSetFieldName()) { - String fieldName = termRequest.getFieldName(); - Schema.FieldInfo facetFieldInfo = schemaSnapshot.getFacetFieldByFacetName(fieldName); - if (facetFieldInfo != null) { - // Facet fields are string fields, so we don't need to check the term. - continue; - } - - Schema.FieldInfo fieldInfo = schemaSnapshot.getFieldInfo(fieldName); - if (fieldInfo == null) { - throw new ClientException("Field " + fieldName + " is not present in the schema."); - } - - try { - SchemaUtil.toBytesRef(fieldInfo, termRequest.getTerm()); - } catch (UnsupportedOperationException e) { - throw new ClientException("Term " + termRequest.getTerm() + " is not compatible with " - + "the type of field " + fieldName); - } - } - } - } - } - - private void setQueriesInDebugInfo( - com.twitter.search.queryparser.query.Query parsedQ, - org.apache.lucene.search.Query luceneQ) { - debugInfo.setParsedQuery(parsedQ == null ? 
null : parsedQ.serialize()); - debugInfo.setLuceneQuery(luceneQ == null ? null : luceneQ.toString()); - } - - /** - * Takes the EarlybirdRequest that came into the service and after various parsing and processing - * steps ultimately produces a Lucene query. - */ - private void parseEarlybirdRequest() throws ClientException { - SerializedQueryParser parser = new SerializedQueryParser(EarlybirdConfig.getPenguinVersion()); - - try { - // if the deprecated iterativeQueries field is set, return an error to the client - // indicating that support for it has been removed. - if (searchQuery.isSetDeprecated_iterativeQueries()) { - throw new ClientException("Invalid request: iterativeQueries feature has been removed"); - } - - // we parse the actual query from the user, if any - luceneQuery = null; - parsedQuery = null; // this will be set by parseQueryHelper() - - if (searchQuery.getLikedByUserIDFilter64Size() > 0 - && searchQuery.isSetLuceneQuery()) { - throw new ClientException("likedByUserIDFilter64 does not support with luceneQuery"); - } - - if (!StringUtils.isBlank(request.getSearchQuery().getSerializedQuery())) { - searcherStats.thriftQueryWithSerializedQuery.increment(); - luceneQuery = parseSerializedQuery(searchQuery.getSerializedQuery(), parser, true); - } else if (!StringUtils.isBlank(request.getSearchQuery().getLuceneQuery())) { - searcherStats.thriftQueryWithLuceneQuery.increment(); - luceneQuery = parseLuceneQuery(searchQuery.getLuceneQuery()); - LOG.info("lucene query: {}", searchQuery.getLuceneQuery()); - if (luceneQuery != null) { - LOG.info("Using lucene query directly from the request: " + luceneQuery.toString()); - } - } else { - searcherStats.thriftQueryWithoutTextQuery.increment(); - luceneQuery = parseSerializedQuery( - MATCH_ALL_SERIALIZED_QUERY, - parser, - queryMode != QueryMode.TERM_STATS); - } - } catch (QueryParserException | BooleanQuery.TooManyClauses e) { - LOG.info("Exception parsing query during search", e); - 
appendMessage(e.getMessage()); - throw new ClientException(e); - } - } - - /** - * Parses a serialized query and creates a Lucene query out of it. - * - * To see how serialized queries look like, go to go/searchsyntax. - */ - private Query parseSerializedQuery( - String serializedQuery, - SerializedQueryParser parser, - boolean shouldAdjustQueryBasedOnRequestParameters) throws QueryParserException { - // Parse the serialized query. - parsedQuery = parser.parse(serializedQuery); - if (parsedQuery == null) { - return null; - } - - // rewrite query if positive 'protected' operator is detected - if (parsedQuery.accept(new DetectPositiveOperatorVisitor(SearchOperatorConstants.PROTECTED))) { - POSITIVE_PROTECTED_OPERATOR_DETECTED_COUNTER.increment(); - ProtectedOperatorQueryRewriter rewriter = new ProtectedOperatorQueryRewriter(); - parsedQuery = rewriter.rewrite( - parsedQuery, - request.followedUserIds, - segmentManager.getUserTable()); - } - - ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions(); - if (shouldAdjustQueryBasedOnRequestParameters) { - // If likedByUserIDFilter64 is set, combine it with query - // Note: we deal with likedByUserIDFilter64 here instead of in postLuceneQueryProcess as we - // want annotate query with ranks. - if (searchQuery.isSetLikedByUserIDFilter64() - && searchQuery.getLikedByUserIDFilter64Size() > 0) { - parsedQuery = combineWithLikedByUserIdFilter64( - parsedQuery, searchQuery.getLikedByUserIDFilter64()); - } - - // If namedListMap field is set, replace the named lists in the serialized query. - if (searchQuery.getNamedDisjunctionMapSize() > 0) { - parsedQuery = parsedQuery.accept( - new NamedDisjunctionVisitor(searchQuery.getNamedDisjunctionMap())); - } - - if (searchQuery.isSetRelevanceOptions() - && searchQuery.getRelevanceOptions().isCollectFieldHitAttributions()) { - // NOTE: Before we do any modifications to the serialized query tree, annotate the query - // nodes with their node rank in the original query. 
- this.hitAttributeHelper = - QueryHitAttributeHelper.from(parsedQuery, schemaSnapshot); - parsedQuery = hitAttributeHelper.getAnnotatedQuery(); - } - - // Currently antisocial/nullcast tweets are dropped when we build index, but some tweets may - // become antisocial with realtime updates. For consistency, we should always filter out - // antisocial/nullcast tweets if the user is not explicitly including it. - final boolean allowAntisocial = - parsedQuery.accept(new DetectPositiveOperatorVisitor(SearchOperatorConstants.ANTISOCIAL)); - if (!allowAntisocial) { - parsedQuery = QueryNodeUtils.appendAsConjunction( - parsedQuery, - QueryCacheConversionRules.CACHED_EXCLUDE_ANTISOCIAL); - } - parsedQueryAllowNullcast = - parsedQuery.accept(new DetectPositiveOperatorVisitor(SearchOperatorConstants.NULLCAST)); - if (!parsedQueryAllowNullcast) { - parsedQuery = QueryNodeUtils.appendAsConjunction( - parsedQuery, new SearchOperator("filter", SearchOperatorConstants.NULLCAST).negate()); - } - - // Strip all annotations from the filters that will be converted to query cache filters. - // See SEARCH-15552. - parsedQuery = parsedQuery.accept( - new StripAnnotationsVisitor(QueryCacheConversionRules.STRIP_ANNOTATIONS_QUERIES)); - - // Convert certain filters into cached filters, also consolidate them. - parsedQuery = parsedQuery.accept( - new ConversionVisitor(QueryCacheConversionRules.DEFAULT_RULES)); - - // add proximity if needed - if (options != null - && options.isProximityScoring() - && searchQuery.getRankingMode() != ThriftSearchRankingMode.RECENCY) { - parsedQuery = parsedQuery.accept(new ProximityGroupRewriteVisitor()).simplify(); - } - } - - if (request.isSkipVeryRecentTweets()) { - parsedQuery = restrictQueryToFullyIndexedTweets(parsedQuery); - } - - parsedQuery = parsedQuery.simplify(); - debugInfo.setParsedQuery(parsedQuery.serialize()); - - // Extract top-level since-id for pagination optimizations. 
- idTimeRanges = IdTimeRanges.fromQuery(parsedQuery); - - // Does any final processing specific to EarlybirdSearch class. - parsedQuery = preLuceneQueryProcess(parsedQuery); - - // Convert to a lucene query. - EarlybirdLuceneQueryVisitor luceneVisitor = getLuceneVisitor( - options == null ? null : options.getFieldWeightMapOverride()); - - if (options != null) { - luceneVisitor - .setProximityPhraseWeight((float) options.getProximityPhraseWeight()) - .setProximityPhraseSlop(options.getProximityPhraseSlop()); - } - - // Propagate hit attribute helper to the lucene visitor if it has been setup. - luceneVisitor.setFieldHitAttributeHelper(this.hitAttributeHelper); - - org.apache.lucene.search.Query query = parsedQuery.accept(luceneVisitor); - if (query != null) { - debugInfo.setLuceneQuery(query.toString()); - } - - queriedFields = luceneVisitor.getQueriedFields(); - - return query; - } - - private Query parseLuceneQuery(String query) { - QueryParser parser = new QueryParser( - EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - new SearchWhitespaceAnalyzer()); - parser.setSplitOnWhitespace(true); - try { - return parser.parse(query); - } catch (ParseException e) { - LOG.error("Cannot parse raw lucene query: " + query, e); - } catch (NullPointerException e) { - LOG.error("NullPointerException while parsing raw lucene query: " + query - + ", probably your grammar is wrong.\n", e); - } - return null; - } - - private com.twitter.search.queryparser.query.Query combineWithLikedByUserIdFilter64( - com.twitter.search.queryparser.query.Query query, - List ids) throws QueryParserException { - return QueryNodeUtils.appendAsConjunction(query, getLikedByUserIdQuery(ids)); - } - - /** - * initSearcher initializes the segmentSearcher, and returns SUCCESS if OK - * or some other response code it not OK. 
*/
  private EarlybirdResponseCode initSearcher() throws IOException {
    searcher = null;
    // Dispatch on the request scope: all segments vs. the single requested segment.
    if (searchAllSegments) {
      return initMultiSegmentSearcher();
    } else {
      return initSingleSegmentSearcher();
    }
  }

  /**
   * Initializes {@code searcher} for a single-segment search.
   *
   * @return SUCCESS when a searcher was constructed; PARTITION_NOT_FOUND when the requested
   *         segment is missing; PERSISTENT_ERROR when the searcher could not be built; or the
   *         non-SUCCESS code from {@code segmentManager.checkSegment} when the segment is
   *         disabled/dropped. Failures are logged and appended to the debug message buffer.
   */
  private EarlybirdResponseCode initSingleSegmentSearcher() throws IOException {
    if (segment == null) {
      String message = "Segment not found for time slice: " + request.getSearchSegmentId();
      LOG.warn(message);
      appendMessage(message);
      return EarlybirdResponseCode.PARTITION_NOT_FOUND;
    }

    EarlybirdResponseCode code = this.segmentManager.checkSegment(segment);
    if (code != EarlybirdResponseCode.SUCCESS) {
      String message = "Segment " + segment + " either disabled or dropped";
      LOG.warn(message);
      appendMessage(message);
      return code;
    }

    searcher = segmentManager.getSearcher(segment, schemaSnapshot);
    if (searcher == null) {
      String message = "Could not construct searcher for segment " + segment;
      LOG.error(message);
      appendMessage(message);
      return EarlybirdResponseCode.PERSISTENT_ERROR;
    } else {
      appendMessage("Searching segment: " + segment);
      return EarlybirdResponseCode.SUCCESS;
    }
  }

  /**
   * Initializes {@code searcher} with a multi-segment searcher spanning all segments.
   * Always returns SUCCESS (the multi-searcher is assumed constructible).
   */
  private EarlybirdResponseCode initMultiSegmentSearcher() throws IOException {
    EarlybirdMultiSegmentSearcher multiSearcher =
        segmentManager.getMultiSearcher(schemaSnapshot);
    searcher = multiSearcher;
    Preconditions.checkNotNull(searcher);

    // Set a top level since id to skip entire segments when possible.
    multiSearcher.setIdTimeRanges(idTimeRanges);
    return EarlybirdResponseCode.SUCCESS;
  }

  /**
   * Conjoins an until_time operator onto the query so that very recent (not yet fully indexed)
   * tweets are excluded. Returns the query unchanged when the decider-derived cutoff is 0
   * (i.e. the restriction is disabled).
   */
  private com.twitter.search.queryparser.query.Query
      restrictQueryToFullyIndexedTweets(com.twitter.search.queryparser.query.Query query) {
    long untilTimeSeconds =
        RecentTweetRestriction.recentTweetsUntilTime(decider, (int) (clock.nowMillis() / 1000));
    if (untilTimeSeconds == 0) {
      return query;
    }

    SearchOperator timeLimit = new SearchOperator(UNTIL_TIME, untilTimeSeconds);
    return new Conjunction(query, timeLimit);
  }

  /**
   * Creates a response with the given code, optionally attaching the accumulated debug info
   * and the message buffer (prefixed with the local hostname) as the debug string.
   */
  private EarlybirdResponse newResponse(EarlybirdResponseCode code, boolean setDebugInfo) {
    EarlybirdResponse response = new EarlybirdResponse();
    response.setResponseCode(code);
    if (setDebugInfo) {
      response.setDebugInfo(debugInfo);
      if (messageBuffer.length() > 0) {
        response.setDebugString(DatabaseConfig.getLocalHostname()
            + ":\n" + messageBuffer.toString());
      }
    }
    return response;
  }

  /** Builds an error response for the given code, always including debug info. */
  private EarlybirdResponse respondError(EarlybirdResponseCode code) {
    appendMessage("Responding with error code " + code);
    // Always respond with an error message, even when request.debug is false
    return newResponse(code, true);
  }

  @VisibleForTesting
  public TerminationTracker getTerminationTracker() {
    return terminationTracker;
  }

  /** Copies the collector's debug info into the response debug info when the request asks for it. */
  public void maybeSetCollectorDebugInfo(TwitterEarlyTerminationCollector collector) {
    if (request.isSetDebugOptions() && request.getDebugOptions().isIncludeCollectorDebugInfo()) {
      debugInfo.setCollectorDebugInfo(collector.getDebugInfo());
    }
  }

  // NOTE(review): raw List — generics appear stripped in this copy; presumably
  // List<String>. TODO confirm against the original source.
  public void setTermStatisticsDebugInfo(List termStatisticsDebugInfo) {
    debugInfo.setTermStatisticsDebugInfo(termStatisticsDebugInfo);
  }

  /**
   * Dispatches the request to the per-query-mode processing method and wraps the collected
   * results into a success response.
   *
   * @throws TransientException on an unknown query mode or an IOException from a searcher.
   * @throws ClientException when a query-mode handler rejects the request.
   */
  private EarlybirdResponse searchInternal() throws TransientException, ClientException {
    searchResults = new ThriftSearchResults();

    SearchResultsInfo searchResultsInfo;
    try {
      switch (queryMode) {
        case RECENCY:
          searchResultsInfo = processRealtimeQuery();
          break;
        case RELEVANCE:
          // Relevance search and Model-based search differ only on the scoring function used.
          SearchTimer timer = searcherStats.createTimer();
          timer.start();
          searchResultsInfo = processRelevanceQuery();
          timer.stop();
          searcherStats.recordRelevanceStats(timer, request);
          break;
        case FACETS:
          searchResultsInfo = processFacetsQuery();
          break;
        case TERM_STATS:
          searchResultsInfo = processTermStatsQuery();
          break;
        case TOP_TWEETS:
          searchResultsInfo = processTopTweetsQuery();
          break;
        default:
          throw new TransientException("Unknown query mode " + queryMode);
      }

      return respondSuccess(searchResults, facetResults, termStatisticsResults,
          earlyTerminationInfo, searchResultsInfo);
    } catch (IOException e) {
      throw new TransientException(e.getMessage(), e);
    }
  }

  /**
   * Helper method to process facets query.
   * Side effects: may rewrap the shared {@code luceneQuery}, assigns {@code facetResults},
   * {@code earlyTerminationInfo} and possibly {@code antiGamingFilter}.
   */
  private SearchResultsInfo processFacetsQuery() throws ClientException, IOException {
    // figure out which fields we need to count
    FacetCountState facetCountState = newFacetCountState();

    // Additionally wrap our query into a skip list boolean query for faster counting.
    if (!facetRequest.isUsingQueryCache()) {
      // Only if all fields to be counted use skip lists, then we can add a required clause
      // that filters out all results that do not contain those fields
      boolean cannotAddRequiredClause = facetCountState.hasFieldToCountWithoutSkipList();
      final Query facetSkipListFilter =
          cannotAddRequiredClause ? null : FacetSkipList.getSkipListQuery(facetCountState);
      final Query antisocialFilter = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(
          segmentManager.getUserTable(), true, true, false);
      luceneQuery = wrapFilters(luceneQuery,
          facetSkipListFilter,
          antisocialFilter);
    }

    facetResults = new ThriftFacetResults(new HashMap<>());

    FacetSearchRequestInfo searchRequestInfo =
        new FacetSearchRequestInfo(searchQuery, facetRequest.getFacetRankingOptions(),
            luceneQuery, facetCountState, terminationTracker);
    searchRequestInfo.setIdTimeRanges(idTimeRanges);
    if (searchQuery.getMaxHitsPerUser() > 0) {
      antiGamingFilter = new AntiGamingFilter(
          searchQuery.getMaxHitsPerUser(),
          searchQuery.getMaxTweepcredForAntiGaming(),
          luceneQuery);
    }

    // debugMode > 2 switches to the collector variant that records per-result explanations.
    AbstractResultsCollector<
        FacetSearchRequestInfo, EarlybirdLuceneSearcher.FacetSearchResults> collector;
    if (request.getDebugMode() > 2) {
      collector = new ExplainFacetResultsCollector(schemaSnapshot,
          searchRequestInfo, antiGamingFilter, searcherStats, clock, request.debugMode);
    } else {
      collector = new FacetResultsCollector(schemaSnapshot,
          searchRequestInfo, antiGamingFilter, searcherStats, clock, request.debugMode);
    }

    setQueriesInDebugInfo(parsedQuery, searchRequestInfo.getLuceneQuery());
    searcher.search(searchRequestInfo.getLuceneQuery(), collector);
    EarlybirdLuceneSearcher.FacetSearchResults hits = collector.getResults();

    EarlybirdSearchResultUtil.setResultStatistics(searchResults, hits);
    earlyTerminationInfo = EarlybirdSearchResultUtil.prepareEarlyTerminationInfo(hits);
    // NOTE(review): raw Set — presumably Set<Long> of user ids; confirm.
    Set userIDWhitelist =
        antiGamingFilter != null ? antiGamingFilter.getUserIDWhitelist() : null;
    prepareFacetResults(facetResults, hits, facetCountState, userIDWhitelist,
        request.getDebugMode());
    facetResults.setUserIDWhitelist(userIDWhitelist);

    maybeSetCollectorDebugInfo(collector);

    if (collector instanceof ExplainFacetResultsCollector) {
      ((ExplainFacetResultsCollector) collector).setExplanations(facetResults);
    }

    return hits;
  }

  /**
   * Helper method to process term-stats query.
   * Side effects: assigns {@code earlyTerminationInfo} and, when the searcher returned any
   * term results, {@code termStatisticsResults}.
   */
  private SearchResultsInfo processTermStatsQuery() throws IOException {
    // first extract the terms that we need to count
    TermStatisticsRequestInfo searchRequestInfo =
        new TermStatisticsRequestInfo(searchQuery, luceneQuery, termStatisticsRequest,
            terminationTracker);
    searchRequestInfo.setIdTimeRanges(idTimeRanges);
    setQueriesInDebugInfo(parsedQuery, searchRequestInfo.getLuceneQuery());
    TermStatisticsCollector.TermStatisticsSearchResults hits =
        searcher.collectTermStatistics(searchRequestInfo, this, request.getDebugMode());
    EarlybirdSearchResultUtil.setResultStatistics(searchResults, hits);
    earlyTerminationInfo = EarlybirdSearchResultUtil.prepareEarlyTerminationInfo(hits);
    if (hits.results != null) {
      termStatisticsResults = new ThriftTermStatisticsResults();
      prepareTermStatisticsResults(termStatisticsResults, hits, request.getDebugMode());
    }

    return hits;
  }

  /**
   * Helper method to process realtime (RECENCY) query.
   *
   * @throws ClientException when a social filter type is requested without the trusted and
   *         direct-follow filters it requires.
   */
  private SearchResultsInfo processRealtimeQuery() throws IOException, ClientException {
    // Disable maxHitsToProcess.
    if (!collectorParams.isSetTerminationParams()) {
      collectorParams.setTerminationParams(new CollectorTerminationParams());
      // -1 disables the hit-processing cap for realtime queries.
      collectorParams.getTerminationParams().setMaxHitsToProcess(-1);
      COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_NOT_SET_COUNTER.increment();
    }

    SearchRequestInfo searchRequestInfo = new SearchRequestInfo(
        searchQuery, luceneQuery, terminationTracker);
    searchRequestInfo.setIdTimeRanges(idTimeRanges);
    searchRequestInfo.setHitAttributeHelper(hitAttributeHelper);
    searchRequestInfo.setTimestamp(getQueryTimestamp(searchQuery));

    AbstractResultsCollector collector;
    if (searchQuery.isSetSocialFilterType()) {
      // A social filter type is meaningless without both underlying filters.
      if (!searchRequestInfo.getSearchQuery().isSetDirectFollowFilter()
          || !searchRequestInfo.getSearchQuery().isSetTrustedFilter()) {
        searcherStats.unsetFiltersForSocialFilterTypeQuery.increment();
        throw new ClientException(
            "SocialFilterType specified without a TrustedFilter or DirectFollowFilter");
      }
      SocialFilter socialFilter = new SocialFilter(
          searchQuery.getSocialFilterType(),
          searchRequestInfo.getSearchQuery().getSearcherId(),
          searchRequestInfo.getSearchQuery().getTrustedFilter(),
          searchRequestInfo.getSearchQuery().getDirectFollowFilter());
      collector = new SocialSearchResultsCollector(
          schemaSnapshot,
          searchRequestInfo,
          socialFilter,
          searcherStats,
          cluster,
          segmentManager.getUserTable(),
          request.getDebugMode());
    } else {
      collector = new SearchResultsCollector(
          schemaSnapshot,
          searchRequestInfo,
          clock,
          searcherStats,
          cluster,
          segmentManager.getUserTable(),
          request.getDebugMode());
    }

    setQueriesInDebugInfo(parsedQuery, luceneQuery);
    searcher.search(luceneQuery, collector);

    SimpleSearchResults hits = collector.getResults();

    EarlybirdSearchResultUtil.setResultStatistics(searchResults, hits);
    earlyTerminationInfo = EarlybirdSearchResultUtil.prepareEarlyTerminationInfo(hits);
    // Partition config is only exposed in the results when debug output was requested.
    EarlybirdSearchResultUtil.prepareResultsArray(
        searchResults.getResults(), hits,
        request.debugMode > 0 ? partitionConfig : null);
    searchResults.setHitCounts(collector.getHitCountMap());

    maybeSetCollectorDebugInfo(collector);

    addResultPayloads();

    return hits;
  }

  /**
   * Helper method to process relevance query.
   * Chooses a collector (Tensorflow batch top / all / top), runs the search with the scoring
   * function built from the ranking params, and fills {@code searchResults}.
   *
   * @throws ClientException when Tensorflow scoring is combined with the RelevanceAllCollector.
   */
  private SearchResultsInfo processRelevanceQuery() throws IOException, ClientException {
    if (!searchQuery.isSetRelevanceOptions()) {
      LOG.warn("Relevance query with no relevance options!");
      searchQuery.setRelevanceOptions(new ThriftSearchRelevanceOptions());
    }

    // Note: today the assumption is that if you specify hasSpecifiedTweets,
    // you really do want all tweets scored and returned.
    final boolean hasSpecifiedTweets = searchQuery.getSearchStatusIdsSize() > 0;
    if (hasSpecifiedTweets) {
      collectorParams.setNumResultsToReturn(searchQuery.getSearchStatusIdsSize());
    }
    // If we have explicit user ids, we will want to look at all results from those users, and will
    // not need to use the AntiGamingFilter.
    final boolean hasSpecifiedFromUserIds = searchQuery.getFromUserIDFilter64Size() > 0;

    createRelevanceAntiGamingFilter(hasSpecifiedTweets, hasSpecifiedFromUserIds);

    if (searchQuery.getRelevanceOptions().isSetRankingParams()) {
      ThriftRankingParams rankingParams = searchQuery.getRelevanceOptions().getRankingParams();

      // The score adjustment signals that are passed in the request are disabled for the archive
      // cluster or when the features are decidered off. If the request provides those fields,
      // we unset them since checking the hashmap when scoring can cause a slight bump in
      // latency.
      //
      // Verify that the signal query specific scores for tweets signal is enabled
      if (rankingParams.isSetQuerySpecificScoreAdjustments()) {
        if (ALLOW_QUERY_SPECIFIC_SIGNAL_CONFIG
            && DeciderUtil.isAvailableForRandomRecipient(
                decider, ALLOW_QUERY_SPECIFIC_SIGNAL_DECIDER_KEY)) {
          searcherStats.querySpecificSignalQueriesUsed.increment();
          searcherStats.querySpecificSignalMapTotalSize.add(
              rankingParams.getQuerySpecificScoreAdjustmentsSize());
        } else {
          searchQuery.getRelevanceOptions().getRankingParams().unsetQuerySpecificScoreAdjustments();
          searcherStats.querySpecificSignalQueriesErased.increment();
        }
      }

      // Verify that the signal author specific scores signal is enabled
      if (rankingParams.isSetAuthorSpecificScoreAdjustments()) {
        if (ALLOW_AUTHOR_SPECIFIC_SIGNAL_CONFIG
            && DeciderUtil.isAvailableForRandomRecipient(
                decider, ALLOW_AUTHOR_SPECIFIC_SIGNAL_DECIDER_KEY)) {
          searcherStats.authorSpecificSignalQueriesUsed.increment();
          searcherStats.authorSpecificSignalMapTotalSize.add(
              rankingParams.getAuthorSpecificScoreAdjustmentsSize());
        } else {
          searchQuery.getRelevanceOptions().getRankingParams()
              .unsetAuthorSpecificScoreAdjustments();
          searcherStats.authorSpecificSignalQueriesErased.increment();
        }
      }
    }

    ScoringFunction scoringFunction =
        new ScoringFunctionProvider.DefaultScoringFunctionProvider(
            request, schemaSnapshot, searchQuery, antiGamingFilter,
            segmentManager.getUserTable(), hitAttributeHelper,
            parsedQuery, scoringModelsManager, tensorflowModelsManager)
            .getScoringFunction();
    scoringFunction.setDebugMode(request.getDebugMode());

    RelevanceQuery relevanceQuery = new RelevanceQuery(luceneQuery, scoringFunction);
    RelevanceSearchRequestInfo searchRequestInfo =
        new RelevanceSearchRequestInfo(
            searchQuery, relevanceQuery, terminationTracker, qualityFactor);
    searchRequestInfo.setIdTimeRanges(idTimeRanges);
    searchRequestInfo.setHitAttributeHelper(hitAttributeHelper);
    searchRequestInfo.setTimestamp(getQueryTimestamp(searchQuery));

    if (shouldUseTensorFlowCollector()
        && searchQuery.getRelevanceOptions().isUseRelevanceAllCollector()) {
      throw new ClientException("Tensorflow scoring does not work with the RelevanceAllCollector");
    }

    final AbstractRelevanceCollector collector;
    // First check if the Tensorflow results collector should be used, because the
    // TensorflowBasedScoringFunction only works with the BatchRelevanceTopCollector
    if (shouldUseTensorFlowCollector()) {
      // Collect top numResults.
      collector = new BatchRelevanceTopCollector(
          schemaSnapshot,
          searchRequestInfo,
          scoringFunction,
          searcherStats,
          cluster,
          segmentManager.getUserTable(),
          clock,
          request.getDebugMode());
    } else if (hasSpecifiedTweets
        || searchQuery.getRelevanceOptions().isUseRelevanceAllCollector()) {
      // Collect all.
      collector = new RelevanceAllCollector(
          schemaSnapshot,
          searchRequestInfo,
          scoringFunction,
          searcherStats,
          cluster,
          segmentManager.getUserTable(),
          clock,
          request.getDebugMode());
    } else {
      // Collect top numResults.
      collector = new RelevanceTopCollector(
          schemaSnapshot,
          searchRequestInfo,
          scoringFunction,
          searcherStats,
          cluster,
          segmentManager.getUserTable(),
          clock,
          request.getDebugMode());
    }

    // Make sure that the Tensorflow scoring function and the Tensorflow results collector are
    // always used together. If this fails it will result in a TRANSIENT_ERROR response.
    Preconditions.checkState((collector instanceof BatchRelevanceTopCollector)
        == (scoringFunction instanceof TensorflowBasedScoringFunction));

    setQueriesInDebugInfo(parsedQuery, searchRequestInfo.getLuceneQuery());
    searcher.search(searchRequestInfo.getLuceneQuery(), collector);

    RelevanceSearchResults hits = collector.getResults();
    EarlybirdSearchResultUtil.setResultStatistics(searchResults, hits);
    searchResults.setScoringTimeNanos(hits.getScoringTimeNanos());

    earlyTerminationInfo = EarlybirdSearchResultUtil.prepareEarlyTerminationInfo(hits);
    EarlybirdSearchResultUtil.setLanguageHistogram(searchResults, collector.getLanguageHistogram());
    EarlybirdSearchResultUtil.prepareRelevanceResultsArray(
        searchResults.getResults(),
        hits,
        antiGamingFilter != null ? antiGamingFilter.getUserIDWhitelist() : null,
        request.getDebugMode() > 0 ? partitionConfig : null);

    searchResults.setHitCounts(collector.getHitCountMap());
    searchResults.setRelevanceStats(hits.getRelevanceStats());

    maybeSetCollectorDebugInfo(collector);

    if (explanationsEnabled(request.getDebugMode())) {
      searcher.explainSearchResults(searchRequestInfo, hits, searchResults);
    }

    addResultPayloads();

    return hits;
  }

  /** Per-hit explanations are produced only at debug level 2 and above. */
  public static boolean explanationsEnabled(int debugLevel) {
    return debugLevel > 1;
  }

  /**
   * True when the Tensorflow models manager is enabled and the request's ranking params
   * explicitly ask for TENSORFLOW_BASED scoring.
   */
  private boolean shouldUseTensorFlowCollector() {
    return tensorflowModelsManager.isEnabled()
        && searchQuery.getRelevanceOptions().isSetRankingParams()
        && searchQuery.getRelevanceOptions().getRankingParams().isSetType()
        && searchQuery.getRelevanceOptions().getRankingParams().getType()
        == ThriftScoringFunctionType.TENSORFLOW_BASED;
  }

  /**
   * Optionally, if requested and needed, will create a new AntiGamingFilter. Otherwise, no
   * AntiGamingFilter will be used for this query.
   * @param hasSpecifiedTweets whether the request has searchStatusIds specified.
   * @param hasSpecifiedFromUserIds whether the request has fromUserIDFilter64 specified.
   */
  private void createRelevanceAntiGamingFilter(
      boolean hasSpecifiedTweets, boolean hasSpecifiedFromUserIds) {

    // Anti-gaming filter (turned off for specified tweets mode, or when you're explicitly asking
    // for specific users' tweets).
    if (searchQuery.getMaxHitsPerUser() > 0 && !hasSpecifiedTweets && !hasSpecifiedFromUserIds) {
      searcherStats.relevanceAntiGamingFilterUsed.increment();
      antiGamingFilter = new AntiGamingFilter(
          searchQuery.getMaxHitsPerUser(),
          searchQuery.getMaxTweepcredForAntiGaming(),
          luceneQuery);
    } else if (searchQuery.getMaxHitsPerUser() <= 0) {
      searcherStats.relevanceAntiGamingFilterNotRequested.increment();
    } else if (hasSpecifiedTweets && hasSpecifiedFromUserIds) {
      searcherStats.relevanceAntiGamingFilterSpecifiedTweetsAndFromUserIds.increment();
    } else if (hasSpecifiedTweets) {
      searcherStats.relevanceAntiGamingFilterSpecifiedTweets.increment();
    } else if (hasSpecifiedFromUserIds) {
      searcherStats.relevanceAntiGamingFilterSpecifiedFromUserIds.increment();
    }
  }

  /**
   * Check to make sure that there are no nullcast documents in results. If there exists nullcasts
   * in results, we should log error and increment counters correspondingly.
   */
  @VisibleForTesting
  public void logAndIncrementStatsIfNullcastInResults(ThriftSearchResults thriftSearchResults) {
    if (!thriftSearchResults.isSetResults()) {
      return;
    }

    Set unexpectedNullcastStatusIds =
        EarlybirdResponseUtil.findUnexpectedNullcastStatusIds(thriftSearchResults, request);

    if (!unexpectedNullcastStatusIds.isEmpty()) {
      searcherStats.nullcastUnexpectedQueries.increment();
      searcherStats.nullcastUnexpectedResults.add(unexpectedNullcastStatusIds.size());

      // Include the base64-encoded request in the log line for offline reproduction;
      // fall back to a placeholder string when serialization fails.
      String base64Request;
      try {
        base64Request = ThriftUtils.toBase64EncodedString(request);
      } catch (TException e) {
        base64Request = "Failed to parse base 64 request";
      }
      LOG.error(
          "Found unexpected nullcast tweets: {} | parsedQuery: {} | request: {} | response: {} | "
              + "request base 64: {}",
          Joiner.on(",").join(unexpectedNullcastStatusIds),
          parsedQuery.serialize(),
          request,
          thriftSearchResults,
          base64Request);
    }
  }

  /**
   * Fills facet-derived payloads (tweet URLs, named entities, entity annotations, spaces) into
   * {@code searchResults}, as requested by the result metadata options.
   */
  private void addResultPayloads() throws IOException {
    if (searchQuery.getResultMetadataOptions() != null) {
      if (searchQuery.getResultMetadataOptions().isGetTweetUrls()) {
        searcher.fillFacetResults(new ExpandedUrlCollector(), searchResults);
      }

      if (searchQuery.getResultMetadataOptions().isGetNamedEntities()) {
        searcher.fillFacetResults(new NamedEntityCollector(), searchResults);
      }

      if (searchQuery.getResultMetadataOptions().isGetEntityAnnotations()) {
        searcher.fillFacetResults(new EntityAnnotationCollector(), searchResults);
      }

      if (searchQuery.getResultMetadataOptions().isGetSpaces()) {
        searcher.fillFacetResults(new SpaceFacetCollector(audioSpaceTable), searchResults);
      }
    }
  }

  /**
   * Helper method to process top tweets query.
   */
  private SearchResultsInfo processTopTweetsQuery() throws IOException, ClientException {
    // set dummy relevance options if it's not available, but this shouldn't happen in prod
    if (!searchQuery.isSetRelevanceOptions()) {
      searchQuery.setRelevanceOptions(new ThriftSearchRelevanceOptions());
    }
    if (!searchQuery.getRelevanceOptions().isSetRankingParams()) {
      searchQuery.getRelevanceOptions().setRankingParams(
          // this is important, or it's gonna pick DefaultScoringFunction which pretty much
          // does nothing.
          new ThriftRankingParams().setType(ThriftScoringFunctionType.TOPTWEETS));
    }
    ScoringFunction scoringFunction = new ScoringFunctionProvider.DefaultScoringFunctionProvider(
        request, schemaSnapshot, searchQuery, null,
        segmentManager.getUserTable(), hitAttributeHelper, parsedQuery,
        scoringModelsManager, tensorflowModelsManager)
        .getScoringFunction();
    scoringFunction.setDebugMode(request.getDebugMode());

    RelevanceQuery relevanceQuery = new RelevanceQuery(luceneQuery, scoringFunction);
    RelevanceSearchRequestInfo searchRequestInfo =
        new RelevanceSearchRequestInfo(
            searchQuery, relevanceQuery, terminationTracker, qualityFactor);
    searchRequestInfo.setIdTimeRanges(idTimeRanges);
    searchRequestInfo.setTimestamp(getQueryTimestamp(searchQuery));

    final AbstractRelevanceCollector collector =
        new RelevanceTopCollector(
            schemaSnapshot,
            searchRequestInfo,
            scoringFunction,
            searcherStats,
            cluster,
            segmentManager.getUserTable(),
            clock,
            request.getDebugMode());

    setQueriesInDebugInfo(parsedQuery, searchRequestInfo.getLuceneQuery());
    searcher.search(searchRequestInfo.getLuceneQuery(), collector);

    RelevanceSearchResults hits = collector.getResults();
    EarlybirdSearchResultUtil.setResultStatistics(searchResults, hits);
    searchResults.setScoringTimeNanos(hits.getScoringTimeNanos());
    earlyTerminationInfo = EarlybirdSearchResultUtil.prepareEarlyTerminationInfo(hits);
    EarlybirdSearchResultUtil.setLanguageHistogram(
        searchResults,
        collector.getLanguageHistogram());
    EarlybirdSearchResultUtil.prepareRelevanceResultsArray(
        searchResults.getResults(),
        hits,
        null,
        request.getDebugMode() > 0 ? partitionConfig : null);

    searchResults.setHitCounts(collector.getHitCountMap());
    searchResults.setRelevanceStats(hits.getRelevanceStats());

    maybeSetCollectorDebugInfo(collector);

    if (explanationsEnabled(request.getDebugMode())
        && searchQuery.isSetRelevanceOptions()
        && searchQuery.getRelevanceOptions().isSetRankingParams()) {
      searcher.explainSearchResults(searchRequestInfo, hits, searchResults);
    }

    addResultPayloads();

    return hits;
  }

  /**
   * Builds the facet-count state from the facet request: which facet fields to count and how
   * many results to collect per field.
   *
   * @throws ClientException when the request names a facet field unknown to the schema.
   */
  private FacetCountState newFacetCountState() throws ClientException {
    int minNumFacetResults = DEFAULT_NUM_FACET_RESULTS;
    if (facetRequest.isSetFacetRankingOptions()
        && facetRequest.getFacetRankingOptions().isSetNumCandidatesFromEarlybird()) {
      minNumFacetResults = facetRequest.getFacetRankingOptions().getNumCandidatesFromEarlybird();
    }

    // figure out which fields we need to count
    FacetCountState facetCountState = new FacetCountState(schemaSnapshot, minNumFacetResults);

    // all categories if none!
    if (facetRequest.getFacetFields() == null || facetRequest.getFacetFields().isEmpty()) {
      for (Schema.FieldInfo facetField : schemaSnapshot.getFacetFields()) {
        facetCountState.addFacet(
            facetField.getFieldType().getFacetName(), DEFAULT_NUM_FACET_RESULTS);
      }
    } else {
      // NOTE(review): raw Iterator — presumably Iterator<ThriftFacetFieldRequest>
      // (generics look stripped in this copy); the unparameterized form would need a cast
      // on next(). TODO confirm against the original source.
      Iterator it = facetRequest.getFacetFieldsIterator();
      while (it.hasNext()) {
        ThriftFacetFieldRequest facetFieldRequest = it.next();
        Schema.FieldInfo facet = schemaSnapshot.getFacetFieldByFacetName(
            facetFieldRequest.getFieldName());
        if (facet != null) {
          facetCountState.addFacet(
              facet.getFieldType().getFacetName(), facetFieldRequest.getNumResults());
        } else {
          throw new ClientException("Unknown facet field: " + facetFieldRequest.getFieldName());
        }
      }
    }
    return facetCountState;
  }

  /**
   * Pre-Lucene-conversion processing of the queryparser query: optional high-frequency
   * term-pair rewrite, then simplification.
   */
  private com.twitter.search.queryparser.query.Query preLuceneQueryProcess(
      com.twitter.search.queryparser.query.Query twitterQuery) throws QueryParserException {

    com.twitter.search.queryparser.query.Query query = twitterQuery;
    if (searchHighFrequencyTermPairs && !includesCardField(searchQuery, query)) {
      // Process high frequency term pairs. Works best when query is as flat as possible.
      query = HighFrequencyTermPairRewriteVisitor.safeRewrite(
          query,
          DeciderUtil.isAvailableForRandomRecipient(
              decider, "enable_hf_term_pair_negative_disjunction_rewrite"));
    }
    return query.simplify();
  }

  /**
   * Post-conversion processing of the Lucene query: rejects blank no-id queries for modes that
   * require one, wraps the query with tweepcred / from-user-id filters, and for explicit
   * status-id requests restructures the query so ids drive retrieval and the original query
   * only drives scoring.
   *
   * @throws ClientException for blank queries in RECENCY/RELEVANCE/TOP_TWEETS mode, or when
   *         building the from-user-id disjunction fails to parse.
   */
  private Query postLuceneQueryProcess(final Query query) throws ClientException {
    if (StringUtils.isBlank(request.getSearchQuery().getSerializedQuery())
        && StringUtils.isBlank(request.getSearchQuery().getLuceneQuery())) {
      searcherStats.numRequestsWithBlankQuery.get(queryMode).increment();
      if (searchQuery.getSearchStatusIdsSize() == 0
          && searchQuery.getFromUserIDFilter64Size() == 0
          && searchQuery.getLikedByUserIDFilter64Size() == 0) {
        // No query or ids to search. This is only allowed in some modes.
        if (queryMode == QueryMode.RECENCY
            || queryMode == QueryMode.RELEVANCE
            || queryMode == QueryMode.TOP_TWEETS) {
          throw new ClientException(
              "No query or status ids for " + queryMode.toString().toLowerCase() + " query");
        }
      }
    }

    // Wrap the query as needed with additional query filters.
    // NOTE(review): raw List — presumably List<Query>; confirm.
    List filters = Lists.newArrayList();

    // Min tweep cred filter.
    if (searchQuery.isSetMinTweepCredFilter()) {
      searcherStats.addedFilterBadUserRep.increment();
      filters.add(BadUserRepFilter.getBadUserRepFilter(searchQuery.getMinTweepCredFilter()));
    }

    if (searchQuery.getFromUserIDFilter64Size() > 0) {
      this.queriedFields.add(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName());
      this.searcherStats.addedFilterFromUserIds.increment();
      try {
        filters.add(UserIdMultiSegmentQuery.createIdDisjunctionQuery(
            "from_user_id_filter",
            searchQuery.getFromUserIDFilter64(),
            EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(),
            schemaSnapshot,
            multiSegmentTermDictionaryManager,
            decider,
            cluster,
            Lists.newArrayList(),
            null,
            queryTimeoutFactory.createQueryTimeout(request, terminationTracker, clock)));
      } catch (QueryParserException e) {
        throw new ClientException(e);
      }
    }

    // Wrap the lucene query with these filters.
    Query wrappedQuery = wrapFilters(query, filters.toArray(new Query[filters.size()]));

    // If searchStatusIds is set, additionally modify the query to search exactly these
    // ids, using the luceneQuery only for scoring.
    if (searchQuery.getSearchStatusIdsSize() > 0) {
      this.searcherStats.addedFilterTweetIds.increment();

      final Query queryForScoring = wrappedQuery;
      final Query queryForRetrieval =
          RequiredStatusIDsFilter.getRequiredStatusIDsQuery(searchQuery.getSearchStatusIds());

      // MUST on the id filter restricts matches to those ids; SHOULD on the original query
      // keeps it contributing to the score without constraining retrieval.
      return new BooleanQuery.Builder()
          .add(queryForRetrieval, Occur.MUST)
          .add(queryForScoring, Occur.SHOULD)
          .build();
    }

    return wrappedQuery;
  }

  /**
   * Builds the liked-by-user-id query, choosing the multi_term_disjunction form or the
   * per-id disjunction form based on a decider.
   */
  private com.twitter.search.queryparser.query.Query getLikedByUserIdQuery(
      List ids) throws QueryParserException {
    if (DeciderUtil.isAvailableForRandomRecipient(
        decider, USE_MULTI_TERM_DISJUNCTION_FOR_LIKED_BY_USER_IDS_DECIDER_KEY)) {
      // rewrite LikedByUserIdFilter64 to a multi_term_disjunction query
      return createMultiTermDisjunctionQueryForLikedByUserIds(ids);
    } else {
      // rewrite LikedByUserIdFilter64 to a disjunction of multiple liked_by_user_ids query
      return createDisjunctionQueryForLikedByUserIds(ids);
    }
  }

  /**
   * Returns the Lucene query visitor that should be applied to the original request.
   *
   * @param fieldWeightMapOverride The per-field weight overrides.
   */
  @VisibleForTesting
  public EarlybirdLuceneQueryVisitor getLuceneVisitor(
      Map fieldWeightMapOverride) {
    String clusterName = cluster.getNameForStats();
    // Iff in relevance mode _and_ interpretSinceId is false, we turn off since_id
    // operator by using LuceneRelevanceQueryVisitor.
    if (searchQuery.getRankingMode() == ThriftSearchRankingMode.RELEVANCE
        && searchQuery.getRelevanceOptions() != null
        && !searchQuery.getRelevanceOptions().isInterpretSinceId()) {
      // hack! reset top level since id, which is the same thing LuceneRelevanceVisitor
      // is doing.
      idTimeRanges = null;
      return new LuceneRelevanceQueryVisitor(
          schemaSnapshot,
          queryCacheManager,
          segmentManager.getUserTable(),
          segmentManager.getUserScrubGeoMap(),
          terminationTracker,
          FieldWeightDefault.overrideFieldWeightMap(
              schemaSnapshot.getFieldWeightMap(),
              dropBadFieldWeightOverrides(fieldWeightMapOverride, decider, clusterName)),
          MAPPABLE_FIELD_MAP,
          multiSegmentTermDictionaryManager,
          decider,
          cluster,
          queryTimeoutFactory.createQueryTimeout(
              request, terminationTracker, clock));
    } else {
      return new EarlybirdLuceneQueryVisitor(
          schemaSnapshot,
          queryCacheManager,
          segmentManager.getUserTable(),
          segmentManager.getUserScrubGeoMap(),
          terminationTracker,
          FieldWeightDefault.overrideFieldWeightMap(
              schemaSnapshot.getFieldWeightMap(),
              dropBadFieldWeightOverrides(fieldWeightMapOverride, decider, clusterName)),
          MAPPABLE_FIELD_MAP,
          multiSegmentTermDictionaryManager,
          decider,
          cluster,
          queryTimeoutFactory.createQueryTimeout(
              request, terminationTracker, clock));
    }
  }

  /**
   * Ranks and copies facet hits into the response structure, fills per-facet metadata, and
   * (when an anti-gaming whitelist is present) marks whitelisted users as not-to-be-filtered.
   */
  private void prepareFacetResults(ThriftFacetResults thriftFacetResults,
      EarlybirdLuceneSearcher.FacetSearchResults hits,
      FacetCountState facetCountState,
      Set userIDWhitelist,
      byte debugMode) throws IOException {
    for (FacetRankingModule rankingModule : FacetRankingModule.REGISTERED_RANKING_MODULES) {
      rankingModule.prepareResults(hits, facetCountState);
    }

    // NOTE(review): raw Map — presumably Map<Term, ThriftFacetCount>; confirm.
    Map allFacetResults = new HashMap<>();

    // NOTE(review): this declaration looks garbled in this copy ("Iterator>");
    // presumably Iterator<FacetCountState.FacetFieldResults>. TODO confirm.
    Iterator> fieldResultsIterator =
        facetCountState.getFacetFieldResultsIterator();
    while (fieldResultsIterator.hasNext()) {

      FacetCountState.FacetFieldResults facetFieldResults =
          fieldResultsIterator.next();

      if (facetFieldResults.results == null) {
        // return empty resultset for this facet
        List emptyList = new ArrayList<>();
        facetFieldResults.results = new ThriftFacetFieldResults(emptyList, 0);
      }
      thriftFacetResults.putToFacetFields(facetFieldResults.facetName,
          facetFieldResults.results);

      Schema.FieldInfo field = schemaSnapshot.getFacetFieldByFacetName(
          facetFieldResults.facetName);

      for (ThriftFacetCount result : facetFieldResults.results.topFacets) {
        if (result.facetLabel != null) {
          allFacetResults.put(new Term(field.getName(), result.facetLabel), result);
        } else {
          LOG.warn("Null facetLabel, field: {}, result: {}", field.getName(), result);
        }
      }
    }

    searcher.fillFacetResultMetadata(allFacetResults, schemaSnapshot, debugMode);

    if (userIDWhitelist != null) {
      for (ThriftFacetCount facetCount : allFacetResults.values()) {
        ThriftFacetCountMetadata metadata = facetCount.getMetadata();
        if (metadata != null) {
          metadata.setDontFilterUser(userIDWhitelist.contains(metadata.getTwitterUserId()));
        }
      }
    }
  }

  /**
   * Copies term-statistics hits into the response structure and exports per-client rate
   * counters for the unset-min-complete-bin-id and min-searched-time-after-until-time cases.
   */
  private void prepareTermStatisticsResults(
      ThriftTermStatisticsResults termStatistics,
      TermStatisticsCollector.TermStatisticsSearchResults hits,
      byte debugMode) throws IOException {

    termStatistics.setBinIds(hits.binIds);
    termStatistics.setHistogramSettings(termStatisticsRequest.getHistogramSettings());
    termStatistics.setTermResults(hits.results);
    setTermStatisticsDebugInfo(hits.getTermStatisticsDebugInfo());

    // -1 signals that no complete bin was observed; track that per client.
    if (hits.lastCompleteBinId != -1) {
      termStatistics.setMinCompleteBinId(hits.lastCompleteBinId);
    } else {
      SearchRateCounter.export(String.format(
          "term_stats_%s_unset_min_complete_bin_id", request.getClientId())).increment();
    }

    if (idTimeRanges != null
        && idTimeRanges.getUntilTimeExclusive().isPresent()
        && hits.getMinSearchedTime() > idTimeRanges.getUntilTimeExclusive().get()) {
      SearchRateCounter.export(String.format(
          "term_stats_%s_min_searched_time_after_until_time", request.getClientId())).increment();
    }

    searcher.fillTermStatsMetadata(termStatistics, schemaSnapshot, debugMode);
  }

  /**
   * Assembles the SUCCESS response from the collected result structures, running the
   * unexpected-nullcast check when a parsed query (without an explicit nullcast allowance)
   * is available.
   */
  private EarlybirdResponse respondSuccess(
      ThriftSearchResults thriftSearchResults,
      ThriftFacetResults thriftFacetResults,
      ThriftTermStatisticsResults termStatisticResults,
      @Nonnull EarlyTerminationInfo earlyTerminationState,
      @Nonnull SearchResultsInfo searchResultsInfo) {

    Preconditions.checkNotNull(earlyTerminationState);
    Preconditions.checkNotNull(searchResultsInfo);

    exportEarlyTerminationStats(earlyTerminationState);

    EarlybirdResponse response =
        newResponse(EarlybirdResponseCode.SUCCESS, request.getDebugMode() > 0);
    response.setEarlyTerminationInfo(earlyTerminationState);
    response.setNumSearchedSegments(searchResultsInfo.getNumSearchedSegments());

    if (thriftSearchResults != null) {
      // Nullcast check is only used when parsed query is available: if there is no parsed query,
      // we would not add possible exclude nullcast filter.
      if (parsedQuery != null && !parsedQueryAllowNullcast) {
        logAndIncrementStatsIfNullcastInResults(thriftSearchResults);
      }
      response.setSearchResults(thriftSearchResults);
    } else {
      RESPONSE_HAS_NO_THRIFT_SEARCH_RESULTS.increment();
    }
    if (thriftFacetResults != null) {
      response.setFacetResults(thriftFacetResults);
    }
    if (termStatisticResults != null) {
      response.setTermStatisticsResults(termStatisticResults);
    }

    appendFeatureSchemaIfNeeded(response);

    appendLikedByUserIdsIfNeeded(response);

    return response;
  }

  /**
   * Exports per-client and per-client-per-request-type rate counters for the early
   * termination reason, when one is set.
   */
  private void exportEarlyTerminationStats(@Nonnull EarlyTerminationInfo earlyTerminationState) {
    if (earlyTerminationState.isSetEarlyTerminationReason()) {
      SearchRateCounter.export(String.format("early_termination_%s_%s",
          ClientIdUtil.formatClientId(request.getClientId()),
          earlyTerminationState.getEarlyTerminationReason())).increment();
      SearchRateCounter.export(String.format("early_termination_%s_%s",
          ClientIdUtil.formatClientIdAndRequestType(
              request.getClientId(), queryMode.name().toLowerCase()),
          earlyTerminationState.getEarlyTerminationReason())).increment();
    }
  }

  /**
   * Builds a rank -> userId map for liked_by_user_id queries that request hit attribution, and
   * appends the resulting map to the response.
- */ - private void appendLikedByUserIdsIfNeeded(EarlybirdResponse response) { - // Check if user asked for likedByUserIds list in response - ThriftSearchRelevanceOptions resultRelevanceOptions = - request.getSearchQuery().getRelevanceOptions(); - if ((resultRelevanceOptions == null) - || !resultRelevanceOptions.isCollectFieldHitAttributions()) { - return; - } - - // Make sure we have results in response and hit attribution helper is set up correctly - if (!response.isSetSearchResults() || hitAttributeHelper == null) { - return; - } - - // Get rank to node map - Map nodeToRankMap = - Preconditions.checkNotNull(hitAttributeHelper.getNodeToRankMap()); - - Map> expandedNodeToRankMap = - Preconditions.checkNotNull(hitAttributeHelper.getExpandedNodeToRankMap()); - - // Build a rank to id map - ImmutableMap.Builder builder = ImmutableMap.builder(); - for (com.twitter.search.queryparser.query.Query query : nodeToRankMap.keySet()) { - if (query instanceof SearchOperator) { - SearchOperator op = (SearchOperator) query; - if (expandedNodeToRankMap.containsKey(query)) { - // for multi_term_disjunction case - List ranks = expandedNodeToRankMap.get(op); - Preconditions.checkArgument(op.getNumOperands() == ranks.size() + 1); - for (int i = 0; i < ranks.size(); ++i) { - builder.put(ranks.get(i), Long.valueOf(op.getOperands().get(i + 1))); - } - } else if (op.getOperatorType() == SearchOperator.Type.LIKED_BY_USER_ID) { - // for liked_by_user_id case - Preconditions.checkArgument(op.getAnnotationOf(Annotation.Type.NODE_RANK).isPresent()); - builder.put( - (Integer) op.getAnnotationOf(Annotation.Type.NODE_RANK).get().getValue(), - Long.valueOf(op.getOperands().get(0))); - } - } - } - Map rankToIdMap = builder.build(); - - // Append liked_by_user_id filed into result - for (ThriftSearchResult result : response.getSearchResults().getResults()) { - if (result.isSetMetadata() - && result.getMetadata().isSetFieldHitAttribution() - && 
result.getMetadata().getFieldHitAttribution().isSetHitMap()) { - - List likedByUserIdList = Lists.newArrayList(); - - Map hitMap = - result.getMetadata().getFieldHitAttribution().getHitMap(); - // iterate hit attributions - for (int rank : hitMap.keySet()) { - if (rankToIdMap.containsKey(rank)) { - likedByUserIdList.add(rankToIdMap.get(rank)); - } - } - if (!result.getMetadata().isSetExtraMetadata()) { - result.getMetadata().setExtraMetadata(new ThriftSearchResultExtraMetadata()); - } - result.getMetadata().getExtraMetadata().setLikedByUserIds(likedByUserIdList); - } - } - } - - private void appendFeatureSchemaIfNeeded(EarlybirdResponse response) { - // Do not append the schema if the client didn't request it. - ThriftSearchResultMetadataOptions resultMetadataOptions = - request.getSearchQuery().getResultMetadataOptions(); - if ((resultMetadataOptions == null) || !resultMetadataOptions.isReturnSearchResultFeatures()) { - return; - } - - if (!response.isSetSearchResults()) { - return; - } - - ThriftSearchFeatureSchema featureSchema = schemaSnapshot.getSearchFeatureSchema(); - Preconditions.checkState( - featureSchema.isSetSchemaSpecifier(), - "The feature schema doesn't have a schema specifier set: {}", featureSchema); - - // If the client has this schema, we only need to return the schema version. - // If the client doesn't have this schema, we need to return the schema entries too. 
- if (resultMetadataOptions.isSetFeatureSchemasAvailableInClient() - && resultMetadataOptions.getFeatureSchemasAvailableInClient().contains( - featureSchema.getSchemaSpecifier())) { - CLIENT_HAS_FEATURE_SCHEMA_COUNTER.increment(); - ThriftSearchFeatureSchema responseFeatureSchema = new ThriftSearchFeatureSchema(); - responseFeatureSchema.setSchemaSpecifier(featureSchema.getSchemaSpecifier()); - response.getSearchResults().setFeatureSchema(responseFeatureSchema); - } else { - CLIENT_DOESNT_HAVE_FEATURE_SCHEMA_COUNTER.increment(); - Preconditions.checkState(featureSchema.isSetEntries(), - "Entries are not set in the feature schema: " + featureSchema); - response.getSearchResults().setFeatureSchema(featureSchema); - } - } - - private static long getQueryTimestamp(ThriftSearchQuery query) { - return query != null && query.isSetTimestampMsecs() ? query.getTimestampMsecs() : 0; - } - - private static boolean includesCardField(ThriftSearchQuery searchQuery, - com.twitter.search.queryparser.query.Query query) - throws QueryParserException { - - if (searchQuery.isSetRelevanceOptions()) { - ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions(); - if (options.isSetFieldWeightMapOverride() - && (options.getFieldWeightMapOverride().containsKey( - EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName()) - || options.getFieldWeightMapOverride() - .containsKey(EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName()))) { - - return true; - } - } - - return query.accept(new DetectFieldAnnotationVisitor(ImmutableSet.of( - EarlybirdFieldConstant.CARD_TITLE_FIELD.getFieldName(), - EarlybirdFieldConstant.CARD_DESCRIPTION_FIELD.getFieldName()))); - } - - private static QueryMode getQueryMode(EarlybirdRequest request) { - if (request.isSetFacetRequest()) { - return QueryMode.FACETS; - } else if (request.isSetTermStatisticsRequest()) { - return QueryMode.TERM_STATS; - } - - // Recency mode until we determine otherwise. 
- QueryMode queryMode = QueryMode.RECENCY; - ThriftSearchQuery searchQuery = request.getSearchQuery(); - if (searchQuery != null) { - switch (searchQuery.getRankingMode()) { - case RECENCY: - queryMode = QueryMode.RECENCY; - break; - case RELEVANCE: - queryMode = QueryMode.RELEVANCE; - break; - case TOPTWEETS: - queryMode = QueryMode.TOP_TWEETS; - break; - default: - break; - } - } - - if (searchQuery == null - || !searchQuery.isSetSerializedQuery() - || searchQuery.getSerializedQuery().isEmpty()) { - LOG.debug("Search query was empty, query mode was " + queryMode); - } - - return queryMode; - } - - private static ImmutableMap dropBadFieldWeightOverrides( - Map map, Decider decider, String clusterName) { - - if (map == null) { - return null; - } - - FIELD_WEIGHT_OVERRIDE_MAP_NON_NULL_COUNT.increment(); - ImmutableMap.Builder builder = ImmutableMap.builder(); - - for (Map.Entry entry : map.entrySet()) { - if (EarlybirdFieldConstant.CAMELCASE_USER_HANDLE_FIELD.getFieldName().equals(entry.getKey()) - && !isAllowedCamelcaseUsernameFieldWeightOverride(decider, clusterName)) { - DROPPED_CAMELCASE_USERNAME_FIELD_WEIGHT_OVERRIDE.increment(); - } else if (EarlybirdFieldConstant.TOKENIZED_USER_NAME_FIELD.getFieldName().equals( - entry.getKey()) - && !isAllowedTokenizedScreenNameFieldWeightOverride(decider, clusterName)) { - DROPPED_TOKENIZED_DISPLAY_NAME_FIELD_WEIGHT_OVERRIDE.increment(); - } else { - builder.put(entry.getKey(), entry.getValue()); - } - } - - return builder.build(); - } - - private static boolean isAllowedCamelcaseUsernameFieldWeightOverride( - Decider decider, String clusterName) { - return DeciderUtil.isAvailableForRandomRecipient(decider, - ALLOW_CAMELCASE_USERNAME_FIELD_WEIGHT_OVERRIDE_DECIDER_KEY_PREFIX + clusterName); - } - - private static boolean isAllowedTokenizedScreenNameFieldWeightOverride( - Decider decider, String clusterName) { - return DeciderUtil.isAvailableForRandomRecipient(decider, - 
ALLOW_TOKENIZED_DISPLAY_NAME_FIELD_WEIGHT_OVERRIDE_DECIDER_KEY_PREFIX + clusterName); - } - - private static com.twitter.search.queryparser.query.Query - createMultiTermDisjunctionQueryForLikedByUserIds(List ids) throws QueryParserException { - List operands = new ArrayList<>(ids.size() + 1); - operands.add(EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName()); - for (long id : ids) { - operands.add(String.valueOf(id)); - } - return new SearchOperator(SearchOperator.Type.MULTI_TERM_DISJUNCTION, operands) - .simplify(); - } - - private static com.twitter.search.queryparser.query.Query createDisjunctionQueryForLikedByUserIds( - List ids) throws QueryParserException { - return new Disjunction( - ids.stream() - .map(id -> new SearchOperator(SearchOperator.Type.LIKED_BY_USER_ID, id)) - .collect(Collectors.toList())) - .simplify(); - } - - public com.twitter.search.queryparser.query.Query getParsedQuery() { - return parsedQuery; - } - - /** - * Get the index fields that were queried after this searcher completed its job. 
- * @return - */ - public Set getQueriedFields() { - return queriedFields; - } - - public Query getLuceneQuery() { - return luceneQuery; - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdServer.docx b/src/java/com/twitter/search/earlybird/EarlybirdServer.docx new file mode 100644 index 000000000..48028d4fd Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdServer.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdServer.java b/src/java/com/twitter/search/earlybird/EarlybirdServer.java deleted file mode 100644 index 44d9dc1d8..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdServer.java +++ /dev/null @@ -1,1087 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.BufferedWriter; -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.concurrent.ArrayBlockingQueue; -import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.RejectedExecutionException; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; -import javax.annotation.concurrent.GuardedBy; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Charsets; -import com.google.common.base.Stopwatch; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.util.concurrent.AtomicLongMap; - -import org.apache.commons.codec.binary.Base64; -import org.apache.lucene.search.IndexSearcher; -import org.apache.thrift.TBase; -import org.apache.thrift.TException; -import 
org.apache.thrift.TSerializer; -import org.apache.zookeeper.KeeperException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.util.Clock; -import com.twitter.common.zookeeper.ServerSet.UpdateException; -import com.twitter.common.zookeeper.ZooKeeperClient; -import com.twitter.decider.Decider; -import com.twitter.finagle.Failure; -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.metrics.Timer; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.FlushVersion; -import com.twitter.search.common.search.termination.QueryTimeoutFactory; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.common.util.GCUtil; -import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager; -import com.twitter.search.common.util.zookeeper.ZooKeeperProxy; -import com.twitter.search.core.earlybird.index.inverted.QueryCostTracker; -import com.twitter.search.earlybird.admin.LastSearchesSummary; -import com.twitter.search.earlybird.admin.QueriedFieldsAndSchemaStats; -import com.twitter.search.earlybird.common.ClientIdUtil; -import com.twitter.search.earlybird.common.EarlybirdRequestLogger; -import com.twitter.search.earlybird.common.EarlybirdRequestPostLogger; -import com.twitter.search.earlybird.common.EarlybirdRequestPreLogger; -import com.twitter.search.earlybird.common.EarlybirdRequestUtil; 
-import com.twitter.search.earlybird.common.RequestResponsePair; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.exception.TransientException; -import com.twitter.search.earlybird.ml.ScoringModelsManager; -import com.twitter.search.earlybird.partition.AudioSpaceTable; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.EarlybirdStartup; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.PartitionManager; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.partition.SegmentVulture; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.stats.EarlybirdRPCStats; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.search.earlybird.thrift.EarlybirdServerStats; -import com.twitter.search.earlybird.thrift.EarlybirdService; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.search.earlybird.thrift.EarlybirdStatusResponse; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.util.OneTaskScheduledExecutorManager; -import com.twitter.search.earlybird.util.TermCountMonitor; -import com.twitter.search.earlybird.util.TweetCountMonitor; -import 
com.twitter.snowflake.id.SnowflakeId; -import com.twitter.util.Duration; -import com.twitter.util.Function; -import com.twitter.util.Function0; -import com.twitter.util.Future; - -public class EarlybirdServer implements EarlybirdService.ServiceIface, ServerSetMember { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdServer.class); - - private static final String EARLYBIRD_STARTUP = "earlybird startup"; - public static final String SERVICE_NAME = "Earlybird"; - - private static final boolean REGISTER_WITH_ZK_ON_STARTUP = - EarlybirdConfig.getBool("register_with_zk_on_startup", true); - private static final Duration SERVER_CLOSE_WAIT_TIME = Duration.apply(5L, TimeUnit.SECONDS); - - private static final Failure QUEUE_FULL_FAILURE = - Failure.rejected("Rejected due to full executor queue"); - - private final int port = EarlybirdConfig.getThriftPort(); - private final int warmUpPort = EarlybirdConfig.getWarmUpThriftPort(); - private final int numSearcherThreads = EarlybirdConfig.getSearcherThreads(); - - private final SearchStatsReceiver earlybirdServerStatsReceiver; - private final EarlybirdRPCStats searchStats = new EarlybirdRPCStats("search"); - private final EarlybirdSearcherStats tweetsSearcherStats; - - private static final String REQUESTS_RECEIVED_BY_FINAGLE_ID_COUNTER_NAME_PATTERN = - "requests_for_finagle_id_%s_all"; - private static final String REQUESTS_RECEIVED_BY_FINAGLE_ID_AND_CLIENT_ID_COUNTER_NAME_PATTERN = - "requests_for_finagle_id_%s_and_client_id_%s"; - private static final String RESPONSES_PER_CLIENT_ID_STAT_TEMPLATE = - "responses_for_client_id_%s_with_response_code_%s"; - - // Loading cache for per finagle-client-id stats. Storing them in a loading cache key-ed by - // finagle client id so we don't export the stat multiple times. 
- private final LoadingCache requestCountersByFinagleClientId = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchTimerStats load(String finagleClientId) { - return earlybirdServerStatsReceiver.getTimerStats( - String.format( - REQUESTS_RECEIVED_BY_FINAGLE_ID_COUNTER_NAME_PATTERN, - finagleClientId), TimeUnit.MICROSECONDS, false, true, false); - } - }); - - // Counters per client and response code. - private final LoadingCache responseByClientIdAndResponseCode = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchCounter load(String key) { - return earlybirdServerStatsReceiver.getCounter(key); - } - }); - - private final LoadingCache resultsAgeCounter = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchCounter load(String key) { - return earlybirdServerStatsReceiver.getCounter(key); - } - } - ); - - // Loading cache for per finagle client id and client id stats. These are stored separate - // from the other stats because they are key-ed by the pair of finagle client id and client id - // in order to make sure the stats are only exported once. - // In the key-pair the first element is the finagle client id while the second element is the - // client id. - private final LoadingCache, SearchRateCounter> - requestCountersByFinagleIdAndClientId = CacheBuilder.newBuilder().build( - new CacheLoader, SearchRateCounter>() { - @Override - public SearchRateCounter load(Pair clientKey) { - return earlybirdServerStatsReceiver.getRateCounter( - String.format( - REQUESTS_RECEIVED_BY_FINAGLE_ID_AND_CLIENT_ID_COUNTER_NAME_PATTERN, - clientKey.getFirst(), - clientKey.getSecond())); - } - }); - - // Loading cache for per-client-id latency stats. Stored in a loading cache here mainly because - // the tests assert the mock stats receiver that each stat is only exported once. 
- private final LoadingCache clientIdSearchStats = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchTimerStats load(String clientId) { - String formattedClientId = ClientIdUtil.formatClientId(clientId); - return earlybirdServerStatsReceiver.getTimerStats(formattedClientId, - TimeUnit.MICROSECONDS, false, true, true); - } - }); - - private final LoadingCache clientIdScoringPerQueryStats = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchTimerStats load(String clientId) { - String statName = - String.format("scoring_time_per_query_for_client_id_%s", clientId); - return earlybirdServerStatsReceiver.getTimerStats(statName, - TimeUnit.NANOSECONDS, false, true, false); - } - }); - - private final LoadingCache clientIdScoringPerHitStats = - CacheBuilder.newBuilder().build( - new CacheLoader() { - @Override - public SearchTimerStats load(String clientId) { - String statName = - String.format("scoring_time_per_hit_for_client_id_%s", clientId); - return earlybirdServerStatsReceiver.getTimerStats(statName, - TimeUnit.NANOSECONDS, false, true, false); - } - }); - - private final LoadingCache> clientIdScoringNumHitsProcessedStats = - CacheBuilder.newBuilder().build( - new CacheLoader>() { - @Override - public Percentile load(String clientId) { - String statName = - String.format("scoring_num_hits_processed_for_client_id_%s", clientId); - return PercentileUtil.createPercentile(statName); - } - }); - - private final LoadingCache> lastRequestPerClientId = - CacheBuilder.newBuilder().build( - new CacheLoader>() { - @Override - public AtomicReference load(String key) throws Exception { - return new AtomicReference<>(null); - } - }); - - - private final SearchTimerStats overallScoringTimePerQueryStats; - private final SearchTimerStats overallScoringTimePerHitStats; - private final Percentile overallScoringNumHitsProcessedStats; - - private final EarlybirdIndexConfig earlybirdIndexConfig; - private final 
DynamicPartitionConfig dynamicPartitionConfig; - private final SegmentManager segmentManager; - private final UpdateableEarlybirdStateManager stateManager; - private final AudioSpaceTable audioSpaceTable; - - private final SearchLongGauge startupTimeGauge; - - // Time spent in an internal thread pool queue, between the time we get the search request - // from finagle until it actually starts being executed. - private final SearchTimerStats internalQueueWaitTimeStats; - - // Tracking request that have exceeded their allocated timeout prior to us actually being able - // to start executing the search. - private final SearchCounter requestTimeoutExceededBeforeSearchCounter; - // Current number of running searcher threads. - private final SearchLongGauge numSearcherThreadsGauge; - private final QueryTimeoutFactory queryTimeoutFactory; - - private PartitionManager partitionManager; - private QueryCacheManager queryCacheManager; - - private final ScoringModelsManager scoringModelsManager; - - private final TensorflowModelsManager tensorflowModelsManager; - - private final EarlybirdRequestPreLogger requestPreLogger; - private final EarlybirdRequestPostLogger requestLogger; - - private final TweetCountMonitor tweetCountMonitor; - private final TermCountMonitor termCountMonitor; - - private final EarlybirdServerSetManager serverSetManager; - private final EarlybirdWarmUpManager warmUpManager; - private final MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - - private final Object shutdownLock = new Object(); - @GuardedBy("shutdownLock") - private final EarlybirdFuturePoolManager futurePoolManager; - @GuardedBy("shutdownLock") - private final EarlybirdFinagleServerManager finagleServerManager; - - // If a search request comes in with a client-side start time, and we see that based on that - // the timeout has expired, whether we should drop that query immediately. 
- private final boolean skipTimedOutRequests = - EarlybirdConfig.getBool("skip_timedout_requests", false); - - // client of szookeeper.local.twitter.com. - // This is used to perform distributed locking and layout reading etc. - private final ZooKeeperProxy sZooKeeperClient; - - private final Decider decider; - - private final Clock clock; - - private final List toClose = new ArrayList<>(); - - private final SearchIndexingMetricSet searchIndexingMetricSet; - - private final EarlybirdDarkProxy earlybirdDarkProxy; - - private final ImmutableMap responseCodeCounters; - private final SegmentSyncConfig segmentSyncConfig; - private final EarlybirdStartup earlybirdStartup; - private final QualityFactor qualityFactor; - - private boolean isShutdown = false; - private boolean isShuttingDown = false; - - private final AtomicLongMap queriedFieldsCounts = AtomicLongMap.create(); - - public EarlybirdServer(QueryCacheManager queryCacheManager, - ZooKeeperProxy sZkClient, - Decider decider, - EarlybirdIndexConfig earlybirdIndexConfig, - DynamicPartitionConfig dynamicPartitionConfig, - PartitionManager partitionManager, - SegmentManager segmentManager, - AudioSpaceTable audioSpaceTable, - TermCountMonitor termCountMonitor, - TweetCountMonitor tweetCountMonitor, - UpdateableEarlybirdStateManager earlybirdStateManager, - EarlybirdFuturePoolManager futurePoolManager, - EarlybirdFinagleServerManager finagleServerManager, - EarlybirdServerSetManager serverSetManager, - EarlybirdWarmUpManager warmUpManager, - SearchStatsReceiver earlybirdServerStatsReceiver, - EarlybirdSearcherStats tweetsSearcherStats, - ScoringModelsManager scoringModelsManager, - TensorflowModelsManager tensorflowModelsManager, - Clock clock, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - EarlybirdDarkProxy earlybirdDarkProxy, - SegmentSyncConfig segmentSyncConfig, - QueryTimeoutFactory queryTimeoutFactory, - EarlybirdStartup earlybirdStartup, - QualityFactor qualityFactor, - 
SearchIndexingMetricSet searchIndexingMetricSet) { - LOG.info("Creating EarlybirdServer"); - this.decider = decider; - this.clock = clock; - this.sZooKeeperClient = sZkClient; - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.dynamicPartitionConfig = dynamicPartitionConfig; - this.segmentManager = segmentManager; - this.queryCacheManager = queryCacheManager; - this.termCountMonitor = termCountMonitor; - this.tweetCountMonitor = tweetCountMonitor; - this.stateManager = earlybirdStateManager; - this.partitionManager = partitionManager; - this.futurePoolManager = futurePoolManager; - this.finagleServerManager = finagleServerManager; - this.serverSetManager = serverSetManager; - this.warmUpManager = warmUpManager; - this.earlybirdServerStatsReceiver = earlybirdServerStatsReceiver; - this.tweetsSearcherStats = tweetsSearcherStats; - this.scoringModelsManager = scoringModelsManager; - this.tensorflowModelsManager = tensorflowModelsManager; - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.earlybirdDarkProxy = earlybirdDarkProxy; - this.segmentSyncConfig = segmentSyncConfig; - this.queryTimeoutFactory = queryTimeoutFactory; - this.earlybirdStartup = earlybirdStartup; - this.qualityFactor = qualityFactor; - this.audioSpaceTable = audioSpaceTable; - - EarlybirdStatus.setStartTime(System.currentTimeMillis()); - - // Our initial status code is STARTING. 
- EarlybirdStatus.setStatus(EarlybirdStatusCode.STARTING); - EarlybirdStatus.THRIFT_SERVICE_STARTED.set(false); - - PartitionConfig partitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - earlybirdServerStatsReceiver.getLongGauge( - "search_cluster_" + partitionConfig.getClusterName()).set(1); - earlybirdServerStatsReceiver.getLongGauge( - "tier_name_" + partitionConfig.getTierName()).set(1); - - earlybirdServerStatsReceiver.getLongGauge("partition").set( - partitionConfig.getIndexingHashPartitionID()); - earlybirdServerStatsReceiver.getLongGauge("replica").set( - partitionConfig.getHostPositionWithinHashPartition()); - earlybirdServerStatsReceiver.getLongGauge("penguin_version").set( - EarlybirdConfig.getPenguinVersionByte()); - - earlybirdServerStatsReceiver.getLongGauge("flush_version").set( - FlushVersion.CURRENT_FLUSH_VERSION.ordinal()); - String buildGen = EarlybirdConfig.getString("offline_segment_build_gen", "unknown"); - earlybirdServerStatsReceiver.getLongGauge("build_gen_" + buildGen).set(1); - - this.startupTimeGauge = earlybirdServerStatsReceiver.getLongGauge("startup_time_millis"); - this.internalQueueWaitTimeStats = earlybirdServerStatsReceiver.getTimerStats( - "internal_queue_wait_time", TimeUnit.MILLISECONDS, false, true, false); - this.requestTimeoutExceededBeforeSearchCounter = earlybirdServerStatsReceiver.getCounter( - "request_timeout_exceeded_before_search"); - this.numSearcherThreadsGauge = - earlybirdServerStatsReceiver.getLongGauge("num_searcher_threads"); - this.overallScoringTimePerQueryStats = earlybirdServerStatsReceiver.getTimerStats( - "overall_scoring_time_per_query", TimeUnit.NANOSECONDS, false, true, false); - - // For most of our scoring functions the scoring_time_per_hit records the actual time to score a - // single hit. However, the tensorflow based scoring function uses batch scoring, so we do not - // know the actual time it takes to score a single hit. 
We are now including batch scoring time - // in all scoring time stats (SEARCH-26014), which means that the scoring_time_per_hit stat may - // be a bit misleading for tensorflow based queries. For these queries the scoring_time_per_hit - // represents the ratio between total_scoring_time and the number_of_hits, instead of the actual - // time to score a single hit. - this.overallScoringTimePerHitStats = earlybirdServerStatsReceiver.getTimerStats( - "overall_scoring_time_per_hit", TimeUnit.NANOSECONDS, false, true, false); - this.overallScoringNumHitsProcessedStats = PercentileUtil.createPercentile( - "overall_scoring_num_hits_processed"); - - ImmutableMap.Builder responseCodeCountersBuilder = - new ImmutableMap.Builder<>(); - for (EarlybirdResponseCode responseCode : EarlybirdResponseCode.values()) { - responseCodeCountersBuilder.put( - responseCode, - earlybirdServerStatsReceiver.getCounter( - "responses_with_response_code_" + responseCode.name().toLowerCase())); - } - responseCodeCounters = responseCodeCountersBuilder.build(); - - disableLuceneQueryCache(); - initManagers(); - - requestPreLogger = EarlybirdRequestPreLogger.buildForShard( - EarlybirdConfig.getInt("latency_warn_threshold", 100), decider); - requestLogger = EarlybirdRequestPostLogger.buildForShard( - EarlybirdConfig.getInt("latency_warn_threshold", 100), decider); - - this.qualityFactor.startUpdates(); - - LOG.info("Created EarlybirdServer"); - } - - public boolean isShutdown() { - return this.isShutdown; - } - - private void initManagers() { - LOG.info("Created EarlybirdIndexConfig: " + earlybirdIndexConfig.getClass().getSimpleName()); - - segmentManager.addUpdateListener(queryCacheManager); - } - - public PartitionManager getPartitionManager() { - return partitionManager; - } - - public QueryCacheManager getQueryCacheManager() { - return queryCacheManager; - } - - public SegmentManager getSegmentManager() { - return segmentManager; - } - - public MultiSegmentTermDictionaryManager 
getMultiSegmentTermDictionaryManager() { - return this.multiSegmentTermDictionaryManager; - } - - @VisibleForTesting - public int getPort() { - return port; - } - - private void disableLuceneQueryCache() { - // SEARCH-30046: Look into possibly re-enabling the query -> weight cache. - // We can't use this cache until we upgrade to Lucene 6.0.0, because we have queries with a - // boost of 0.0, and they don't play nicely with Lucene's LRUQueryCache.get() method. - // - // Lucene 6.0.0 changes how boosts are handled: "real" boosts should be wrapped into BoostQuery - // instances, and queries with a boost of 0.0 should be rewritten as "filters" - // (BooleanQuery.add(query, BooleanClause.Occur.FILTER)). So when we upgrade to Lucene 6.0.0 we - // will be forced to refactor how we handle our current queries with a boost of 0.0, which might - // allow us to re-enable this cache. - // - // Note that disabling this cache is not a regression: it should give us the behavior that we - // had with Lucene 5.2.1 (and it's unclear if this cache is useful at all). - // - // WARNING: The default 'DefaultQueryCache' maintains a static reference to the weight forever, - // causing a memory leak. Our weights hold references to an entire segment so the memory leak is - // significant. - IndexSearcher.setDefaultQueryCache(null); - } - - /** - * Starts the earlybird server. 
- */ - public void start() throws EarlybirdStartupException { - // Make sure this is at the top of the function before other parts of the system start running - new EarlybirdBlacklistHandler(Clock.SYSTEM_CLOCK, sZooKeeperClient) - .blockThenExitIfBlacklisted(); - - Stopwatch startupWatch = Stopwatch.createStarted(); - EarlybirdStatus.beginEvent(EARLYBIRD_STARTUP, searchIndexingMetricSet.startupInProgress); - - LOG.info("java.library.path is: " + System.getProperty("java.library.path")); - - PartitionConfig partitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - - SegmentVulture.removeUnusedSegments(partitionManager, partitionConfig, - earlybirdIndexConfig.getSchema().getMajorVersionNumber(), segmentSyncConfig); - - // Start the schema manager - schedule(stateManager); - - Closeable closeable = earlybirdStartup.start(); - toClose.add(closeable); - if (EarlybirdStatus.getStatusCode() == EarlybirdStatusCode.STOPPING) { - LOG.info("Server is shutdown. Exiting..."); - return; - } - - startupTimeGauge.set(startupWatch.elapsed(TimeUnit.MILLISECONDS)); - - EarlybirdStatus.endEvent(EARLYBIRD_STARTUP, searchIndexingMetricSet.startupInProgress); - - GCUtil.runGC(); // Attempt to force a full GC before joining the serverset - - try { - startThriftService(null, true); - } catch (InterruptedException e) { - LOG.info("Interrupted while starting thrift server, quitting earlybird"); - throw new EarlybirdStartupException("Interrupted while starting thrift server"); - } - - EarlybirdStatus.THRIFT_SERVICE_STARTED.set(true); - - // only once we're current, kick off daily tweet count monitors only for archive cluster - if (EarlybirdConfig.getInt(TweetCountMonitor.RUN_INTERVAL_MINUTES_CONFIG_NAME, -1) > 0) { - schedule(tweetCountMonitor); - } - - // only once we're current, kick off per-field term count monitors - if (EarlybirdConfig.getInt(TermCountMonitor.RUN_INTERVAL_MINUTES_CONFIG_NAME, -1) > 0) { - schedule(termCountMonitor); - } - - 
startupTimeGauge.set(startupWatch.elapsed(TimeUnit.MILLISECONDS)); - LOG.info("EarlybirdServer start up time: {}", startupWatch); - } - - /** - * Starts the thrift server if the server is not running. - * If searcherThreads is null, it uses the value specified by EarlybirdConfig. - */ - public void startThriftService(@Nullable Integer searcherThreads, boolean isStartingUp) - throws InterruptedException { - synchronized (shutdownLock) { - if (!finagleServerManager.isWarmUpServerRunning() - && !finagleServerManager.isProductionServerRunning()) { - int threadCount = searcherThreads != null - ? searcherThreads : this.numSearcherThreads; - LOG.info("Starting searcher pool with " + threadCount + " threads"); - futurePoolManager.createUnderlyingFuturePool(threadCount); - numSearcherThreadsGauge.set(threadCount); - - // If the server is not shutting down, go through the warm up stage. If the server is - // instructed to shut down during warm up, warmUpManager.warmUp() should return within a - // second, and should leave the warm up server set. We should still shut down the warm up - // Finagle server. - if (isStartingUp && (EarlybirdStatus.getStatusCode() != EarlybirdStatusCode.STOPPING)) { - LOG.info("Opening warmup thrift port..."); - finagleServerManager.startWarmUpFinagleServer(this, SERVICE_NAME, warmUpPort); - EarlybirdStatus.WARMUP_THRIFT_PORT_OPEN.set(true); - - try { - warmUpManager.warmUp(); - } catch (UpdateException e) { - LOG.warn("Could not join or leave the warm up server set.", e); - } finally { - finagleServerManager.stopWarmUpFinagleServer(SERVER_CLOSE_WAIT_TIME); - EarlybirdStatus.WARMUP_THRIFT_PORT_OPEN.set(false); - } - } - - // If the server is not shutting down, we can start the production Finagle server and join - // the production server set. 
- if (EarlybirdStatus.getStatusCode() != EarlybirdStatusCode.STOPPING) { - LOG.info("Opening production thrift port..."); - finagleServerManager.startProductionFinagleServer( - earlybirdDarkProxy.getDarkProxy(), this, SERVICE_NAME, port); - EarlybirdStatus.THRIFT_PORT_OPEN.set(true); - - if (REGISTER_WITH_ZK_ON_STARTUP) { - // After the earlybird starts up, register with ZooKeeper. - try { - joinServerSet("internal start-up"); - - // Join separate server set for ServiceProxy on Archive Earlybirds - if (!EarlybirdConfig.isAurora()) { - joinServerSetForServiceProxy(); - } - } catch (UpdateException e) { - throw new RuntimeException("Unable to join ServerSet during startup.", e); - } - } - } - } - } - } - - /** - * Stops the thrift server if the server is already running. - */ - public void stopThriftService(boolean shouldShutDown) { - synchronized (shutdownLock) { - try { - leaveServerSet(shouldShutDown ? "internal shutdown" : "admin stopThriftService"); - } catch (UpdateException e) { - LOG.warn("Leaving production ServerSet failed.", e); - } - - if (finagleServerManager.isProductionServerRunning()) { - try { - finagleServerManager.stopProductionFinagleServer(SERVER_CLOSE_WAIT_TIME); - futurePoolManager.stopUnderlyingFuturePool( - SERVER_CLOSE_WAIT_TIME.inSeconds(), TimeUnit.SECONDS); - numSearcherThreadsGauge.set(0); - } catch (InterruptedException e) { - LOG.error("Interrupted while stopping thrift service", e); - Thread.currentThread().interrupt(); - } - EarlybirdStatus.THRIFT_PORT_OPEN.set(false); - } - } - } - - /** - * Gets a string with information about the last request we've seen from each client. 
- */ - public Future getLastSearchesByClient(boolean includeResults) { - LastSearchesSummary summary = new LastSearchesSummary( - lastRequestPerClientId, clientIdSearchStats, includeResults); - return Future.value(summary.getSummary()); - } - - /** - * The following are all the Thrift RPC methods inherited from EarlybirdService.Iface - */ - - // Thrift getName RPC. - @Override - public Future getName() { - return Future.value(SERVICE_NAME); - } - - // Thrift getStatus RPC. - @Override - public Future getStatus() { - EarlybirdStatusResponse response = new EarlybirdStatusResponse(); - response.setCode(EarlybirdStatus.getStatusCode()); - response.setAliveSince(EarlybirdStatus.getStartTime()); - response.setMessage(EarlybirdStatus.getStatusMessage()); - return Future.value(response); - } - - public Future> getSegmentMetadata() { - return Future.value(segmentManager.getSegmentMetadata()); - } - - public Future getQueryCachesData() { - return Future.value(segmentManager.getQueryCachesData()); - } - - /** - * Get a text summary for which fields did we use in a schema. - */ - public Future getQueriedFieldsAndSchemaStats() { - ImmutableSchemaInterface schema = this.earlybirdIndexConfig.getSchema().getSchemaSnapshot(); - - QueriedFieldsAndSchemaStats summary = new QueriedFieldsAndSchemaStats(schema, - queriedFieldsCounts); - return Future.value(summary.getSummary()); - } - - /** - * Shuts down the earlybird server. 
- */ - public void shutdown() { - LOG.info("shutdown(): status set to STOPPING"); - EarlybirdStatus.setStatus(EarlybirdStatusCode.STOPPING); - try { - LOG.info("Stopping Finagle server."); - stopThriftService(true); - EarlybirdStatus.THRIFT_SERVICE_STARTED.set(false); - - if (queryCacheManager != null) { - queryCacheManager.shutdown(); - } else { - LOG.info("No queryCacheManager to shut down"); - } - - earlybirdIndexConfig.getResourceCloser().shutdownExecutor(); - - isShuttingDown = true; - LOG.info("Closing {} closeables.", toClose.size()); - for (Closeable closeable : toClose) { - closeable.close(); - } - } catch (InterruptedException | IOException e) { - EarlybirdStatus.setStatus(EarlybirdStatusCode.UNHEALTHY, e.getMessage()); - LOG.error("Interrupted during shutdown, status set to UNHEALTHY"); - } - LOG.info("Earlybird server stopped!"); - isShutdown = true; - } - - @Override - public Future search(final EarlybirdRequest request) { - final long requestReceivedTimeMillis = System.currentTimeMillis(); - // Record clock diff as early as possible. - EarlybirdRequestUtil.recordClientClockDiff(request); - - if (!futurePoolManager.isPoolReady()) { - return Future.exception(new TransientException("Earlybird not yet able to handle requests.")); - } - - return futurePoolManager.apply(new Function0() { - @Override - public EarlybirdResponse apply() { - return doSearch(request, requestReceivedTimeMillis); - } - }).rescue(Function.func( - // respond with Nack when the queue is full - t -> Future.exception((t instanceof RejectedExecutionException) ? 
QUEUE_FULL_FAILURE : t))); - } - - private EarlybirdResponse doSearch(EarlybirdRequest request, long requestReceivedTimeMillis) { - final long queueWaitTime = System.currentTimeMillis() - requestReceivedTimeMillis; - internalQueueWaitTimeStats.timerIncrement(queueWaitTime); - - // request restart time, not to be confused with startTime which is server restart time - Timer timer = new Timer(TimeUnit.MICROSECONDS); - - requestPreLogger.logRequest(request); - - String clientId = ClientIdUtil.getClientIdFromRequest(request); - String finagleClientId = FinagleUtil.getFinagleClientName(); - requestCountersByFinagleIdAndClientId.getUnchecked(new Pair<>(finagleClientId, clientId)) - .increment(); - - EarlybirdRequestUtil.checkAndSetCollectorParams(request); - - // If the thrift logger is busy logging, queue the thrift request for logging. - if (EarlybirdThriftRequestLoggingUtil.thriftLoggerBusy) { - EarlybirdThriftRequestLoggingUtil.REQUEST_BUFFER.offer(request); - } - - EarlybirdRequestUtil.logAndFixExcessiveValues(request); - - final EarlybirdSearcher searcher = new EarlybirdSearcher( - request, - segmentManager, - audioSpaceTable, - queryCacheManager, - earlybirdIndexConfig.getSchema().getSchemaSnapshot(), - earlybirdIndexConfig.getCluster(), - dynamicPartitionConfig.getCurrentPartitionConfig(), - decider, - tweetsSearcherStats, - scoringModelsManager, - tensorflowModelsManager, - clock, - multiSegmentTermDictionaryManager, - queryTimeoutFactory, - qualityFactor); - - QueryCostTracker queryCostTracker = QueryCostTracker.getTracker(); - EarlybirdResponse response = null; - try { - if (skipTimedOutRequests - && searcher.getTerminationTracker().getTimeoutEndTimeWithReservation() - <= clock.nowMillis()) { - requestTimeoutExceededBeforeSearchCounter.increment(); - response = new EarlybirdResponse(); - response.setResponseCode(EarlybirdResponseCode.SERVER_TIMEOUT_ERROR); - } else { - queryCostTracker.reset(); - response = searcher.search(); - } - } finally { - if (response == 
null) { - // This can only happen if we failed to catch an exception in the searcher. - LOG.error("Response was null: " + request.toString()); - response = new EarlybirdResponse(); - response.setResponseCode(EarlybirdResponseCode.TRANSIENT_ERROR); - } - - if (response.getSearchResults() == null) { - List emptyResultSet = Lists.newArrayList(); - response.setSearchResults(new ThriftSearchResults(emptyResultSet)); - } - - long reqLatency = timer.stop(); - response.setResponseTime(reqLatency / 1000); - response.setResponseTimeMicros(reqLatency); - response.getSearchResults().setQueryCost(queryCostTracker.getTotalCost()); - - requestLogger.logRequest(request, response, timer); - - int numResults = EarlybirdRequestLogger.numResultsForLog(response); - boolean success = response.getResponseCode() == EarlybirdResponseCode.SUCCESS; - boolean clientError = response.getResponseCode() == EarlybirdResponseCode.CLIENT_ERROR; - boolean earlyTerminated = (response.getSearchResults().isSetNumPartitionsEarlyTerminated() - && response.getSearchResults().getNumPartitionsEarlyTerminated() > 0) - || searcher.getTerminationTracker().isEarlyTerminated(); - // Update termination stats. - searcher.getTerminationTracker().getEarlyTerminationState().incrementCount(); - - searchStats.requestComplete(reqLatency, numResults, success, earlyTerminated, clientError); - if (searcher.getRequestStats() != null) { - searcher.getRequestStats().requestComplete(reqLatency, numResults, success, - earlyTerminated, clientError); - } - - getResponseCodeCounter(response.getResponseCode()).increment(); - // Adding this counter to make it easier to debug cases where we see a spike in - // bad client request errors but don't know where they're coming from. (The - // alternative is to ssh to a machine in the cluster and sample - // /var/log/earlybird/earlybird.failed_requests). - getClientIdResponseCodeCounter(clientId, response.getResponseCode()).increment(); - - // Export request latency as a stat. 
- clientIdSearchStats.getUnchecked(clientId).timerIncrement(reqLatency); - requestCountersByFinagleClientId.getUnchecked(finagleClientId).timerIncrement(reqLatency); - addEarlybirdServerStats(response, queueWaitTime); - // Export scoring stats for the request. - exportScoringTimeStats(response, clientId); - } - - Set queriedFields = searcher.getQueriedFields(); - if (queriedFields != null) { - for (String queriedField : queriedFields) { - queriedFieldsCounts.incrementAndGet(queriedField); - } - } - - // Increment counters for age of the returned results. - if (response.getSearchResults() != null && response.getSearchResults().getResults() != null) { - long currentTime = System.currentTimeMillis(); - for (ThriftSearchResult result : response.getSearchResults().getResults()) { - long tweetId = result.getId(); - if (SnowflakeId.isSnowflakeId(tweetId)) { - long ageMillis = Math.max(0L, - currentTime - SnowflakeId.unixTimeMillisFromId(tweetId)); - int ageDays = Duration.fromMilliseconds(ageMillis).inDays(); - - if (EarlybirdConfig.isRealtimeOrProtected()) { - String key = "result_age_in_days_" + ageDays; - resultsAgeCounter.getUnchecked(key).increment(); - } else { - int ageYears = ageDays / 365; - String key = "result_age_in_years_" + ageYears; - resultsAgeCounter.getUnchecked(key).increment(); - } - } - } - } - - try { - lastRequestPerClientId.get(clientId).set( - new RequestResponsePair(request, searcher.getParsedQuery(), - searcher.getLuceneQuery(), response)); - } catch (ExecutionException ex) { - // Not a big problem, we'll just notice that the admin page doesn't work, and it - // probably won't happen. 
- } - - - return response; - } - - private void exportScoringTimeStats(EarlybirdResponse response, String clientId) { - if (response.isSetSearchResults() - && response.getSearchResults().isSetScoringTimeNanos() - && response.getSearchResults().isSetNumHitsProcessed()) { - int numHitsProcessed = response.getSearchResults().getNumHitsProcessed(); - long scoringTimeNanos = response.getSearchResults().getScoringTimeNanos(); - - if (numHitsProcessed > 0) { - // Only compute and report scoring time per hit when we have hits. (i.e. we don't just want - // to report 0's for cases where there were no hits, and only want to report legit per-hit - // times. - long scoringTimePerHit = scoringTimeNanos / numHitsProcessed; - - this.clientIdScoringPerHitStats.getUnchecked(clientId).timerIncrement(scoringTimePerHit); - this.overallScoringTimePerHitStats.timerIncrement(scoringTimePerHit); - } - - this.clientIdScoringPerQueryStats.getUnchecked(clientId).timerIncrement(scoringTimeNanos); - this.overallScoringTimePerQueryStats.timerIncrement(scoringTimeNanos); - - // The num hits processed stats here are scoped only to queries that were actually scored. - // This would exclude queries like term stats (that would otherwise have huge num hits - // processed). 
- this.clientIdScoringNumHitsProcessedStats.getUnchecked(clientId).record(numHitsProcessed); - this.overallScoringNumHitsProcessedStats.record(numHitsProcessed); - } - } - - private void addEarlybirdServerStats(EarlybirdResponse response, long queueWaitTime) { - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - EarlybirdServerStats earlybirdServerStats = new EarlybirdServerStats(); - response.setEarlybirdServerStats(earlybirdServerStats); - earlybirdServerStats.setHostname(DatabaseConfig.getLocalHostname()); - earlybirdServerStats.setPartition(curPartitionConfig.getIndexingHashPartitionID()); - earlybirdServerStats.setTierName(curPartitionConfig.getTierName()); - earlybirdServerStats.setCurrentQps(searchStats.getRequestRate()); - earlybirdServerStats.setQueueTimeMillis(queueWaitTime); - earlybirdServerStats.setAverageQueueTimeMillis( - (long) (double) internalQueueWaitTimeStats.read()); - earlybirdServerStats.setAverageLatencyMicros(searchStats.getAverageLatency()); - } - - @Override - public void joinServerSet(String username) throws UpdateException { - serverSetManager.joinServerSet(username); - } - - - @Override - public int getNumberOfServerSetMembers() throws InterruptedException, - ZooKeeperClient.ZooKeeperConnectionException, KeeperException { - return serverSetManager.getNumberOfServerSetMembers(); - } - - @Override - public void leaveServerSet(String username) throws UpdateException { - serverSetManager.leaveServerSet(username); - } - - @Override - public void joinServerSetForServiceProxy() { - serverSetManager.joinServerSetForServiceProxy(); - } - - @VisibleForTesting - protected static class EarlybirdThriftRequestLoggingUtil { - private static final int DEFAULT_MAX_ENTRIES_TO_LOG = 50000; - private static final int DEFAULT_BUFFER_SIZE = 10000; - private static final int DEFAULT_LOGGING_SLEEP_MS = 100; - - @VisibleForTesting - protected static volatile boolean thriftLoggerBusy = false; - private static final 
ExecutorService LOGGING_EXECUTOR = Executors.newCachedThreadPool(); - - // Synchronized circular buffer used for buffering requests. - // If buffer is full, the oldest requests are replaced. This should not be a problem for - // logging purpose. - @VisibleForTesting - protected static final ArrayBlockingQueue REQUEST_BUFFER = - new ArrayBlockingQueue<>(DEFAULT_BUFFER_SIZE); - - - /** - * Create a separate thread to log thrift request to the given file. If a thread is already - * logging thrift requests, this does nothing and throws an IOException indicating that the - * logging thread is busy. - * - * @param logFile File to log to. - * @param maxEntriesToLog Number of entries to log. - * @param postLoggingHook Code to run after logging finishes. Only used for testing as of now. - */ - @VisibleForTesting - protected static synchronized void startThriftLogging(final File logFile, - final int maxEntriesToLog, - final Runnable postLoggingHook) - throws IOException { - if (thriftLoggerBusy) { - throw new IOException("Already busy logging thrift request. No action taken."); - } - - if (!logFile.canWrite()) { - throw new IOException("Unable to open log file for writing: " + logFile); - } - - final BufferedWriter thriftLogWriter = - Files.newBufferedWriter(logFile.toPath(), Charsets.UTF_8); - - // TSerializer used by the writer thread. 
- final TSerializer serializer = new TSerializer(); - - REQUEST_BUFFER.clear(); - thriftLoggerBusy = true; - LOG.info("Started to log thrift requests into file " + logFile.getAbsolutePath()); - LOGGING_EXECUTOR.submit(() -> { - try { - int count = 0; - while (count < maxEntriesToLog) { - if (REQUEST_BUFFER.isEmpty()) { - Thread.sleep(DEFAULT_LOGGING_SLEEP_MS); - continue; - } - - try { - EarlybirdRequest ebRequest = REQUEST_BUFFER.poll(); - String logLine = serializeThriftObject(ebRequest, serializer); - thriftLogWriter.write(logLine); - count++; - } catch (TException e) { - LOG.warn("Unable to serialize EarlybirdRequest for logging.", e); - } - } - return count; - } finally { - thriftLogWriter.close(); - thriftLoggerBusy = false; - LOG.info("Finished logging thrift requests into file " + logFile.getAbsolutePath()); - REQUEST_BUFFER.clear(); - if (postLoggingHook != null) { - postLoggingHook.run(); - } - } - }); - } - - /** - * Serialize a thrift object to a base 64 encoded string. - */ - private static String serializeThriftObject(TBase tObject, TSerializer serializer) - throws TException { - return new Base64().encodeToString(serializer.serialize(tObject)) + "\n"; - } - } - - /** - * Start to log thrift EarlybirdRequests. - * - * @param logFile Log file to write to. - * @param numRequestsToLog Number of requests to collect. Default value of 50000 used if - * 0 or negative numbers are pass in. - */ - public void startThriftLogging(File logFile, int numRequestsToLog) throws IOException { - int requestToLog = numRequestsToLog <= 0 - ? 
EarlybirdThriftRequestLoggingUtil.DEFAULT_MAX_ENTRIES_TO_LOG : numRequestsToLog; - EarlybirdThriftRequestLoggingUtil.startThriftLogging(logFile, requestToLog, null); - } - - @VisibleForTesting - @Override - public boolean isInServerSet() { - return serverSetManager.isInServerSet(); - } - - @VisibleForTesting - SearchCounter getResponseCodeCounter(EarlybirdResponseCode responseCode) { - return responseCodeCounters.get(responseCode); - } - - @VisibleForTesting - SearchCounter getClientIdResponseCodeCounter( - String clientId, EarlybirdResponseCode responseCode) { - String key = String.format(RESPONSES_PER_CLIENT_ID_STAT_TEMPLATE, - clientId, responseCode.name().toLowerCase()); - return responseByClientIdAndResponseCode.getUnchecked(key); - } - - public void setNoShutdownWhenNotInLayout(boolean noShutdown) { - stateManager.setNoShutdownWhenNotInLayout(noShutdown); - } - - private void schedule(OneTaskScheduledExecutorManager manager) { - if (!isShuttingDown) { - manager.schedule(); - toClose.add(manager); - } - } - - public DynamicSchema getSchema() { - return earlybirdIndexConfig.getSchema(); - } - - public AudioSpaceTable getAudioSpaceTable() { - return audioSpaceTable; - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.docx b/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.docx new file mode 100644 index 000000000..35c414b7a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.java b/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.java deleted file mode 100644 index cd490992c..000000000 --- a/src/java/com/twitter/search/earlybird/EarlybirdServerSetManager.java +++ /dev/null @@ -1,275 +0,0 @@ -package com.twitter.search.earlybird; - -import java.net.InetAddress; -import java.net.InetSocketAddress; -import java.util.concurrent.atomic.AtomicLong; - -import 
javax.annotation.concurrent.GuardedBy; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Maps; - -import org.apache.zookeeper.KeeperException; -import org.apache.zookeeper.Watcher; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.zookeeper.ServerSet; -import com.twitter.common.zookeeper.ZooKeeperClient; -import com.twitter.common_internal.zookeeper.TwitterServerSet; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.util.zookeeper.ZooKeeperProxy; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.config.TierConfig; -import com.twitter.search.earlybird.exception.AlreadyInServerSetUpdateException; -import com.twitter.search.earlybird.exception.NotInServerSetUpdateException; -import com.twitter.search.earlybird.partition.PartitionConfig; - -public class EarlybirdServerSetManager implements ServerSetMember { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdServerSetManager.class); - - // How many times this earlybird joined/left its partition's server set - @VisibleForTesting - protected final SearchCounter leaveServerSetCounter; - @VisibleForTesting - protected final SearchCounter joinServerSetCounter; - private final ZooKeeperProxy discoveryZKClient; - private final SearchLongGauge inServerSetGauge; - private final PartitionConfig partitionConfig; - private final int port; - private final String serverSetNamePrefix; - - @VisibleForTesting - protected final SearchLongGauge 
connectedToZooKeeper; - - private final Object endpointStatusLock = new Object(); - @GuardedBy("endpointStatusLock") - private ServerSet.EndpointStatus endpointStatus = null; - - private boolean inServerSetForServiceProxy = false; - - public EarlybirdServerSetManager( - SearchStatsReceiver searchStatsReceiver, - ZooKeeperProxy discoveryZKClient, - final PartitionConfig partitionConfig, - int port, - String serverSetNamePrefix) { - this.discoveryZKClient = discoveryZKClient; - this.partitionConfig = partitionConfig; - this.port = port; - this.serverSetNamePrefix = serverSetNamePrefix; - - // Export serverset related stats - Preconditions.checkNotNull(searchStatsReceiver); - this.joinServerSetCounter = searchStatsReceiver.getCounter( - serverSetNamePrefix + "join_server_set_count"); - this.leaveServerSetCounter = searchStatsReceiver.getCounter( - serverSetNamePrefix + "leave_server_set_count"); - - // Create a new stat based on the partition number for hosts-in-partition aggregation. - // The value of the stat is dependent on whether the server is in the serverset so that the - // aggregate stat reflects the number serving traffic instead of the live process count. 
- AtomicLong sharedInServerSetStatus = new AtomicLong(); - this.inServerSetGauge = searchStatsReceiver.getLongGauge( - serverSetNamePrefix + "is_in_server_set", sharedInServerSetStatus); - this.connectedToZooKeeper = searchStatsReceiver.getLongGauge( - serverSetNamePrefix + "connected_to_zookeeper"); - - searchStatsReceiver.getLongGauge( - serverSetNamePrefix + "member_of_partition_" + partitionConfig.getIndexingHashPartitionID(), - sharedInServerSetStatus); - - this.discoveryZKClient.registerExpirationHandler(() -> connectedToZooKeeper.set(0)); - - this.discoveryZKClient.register(event -> { - if (event.getType() == Watcher.Event.EventType.None - && event.getState() == Watcher.Event.KeeperState.SyncConnected) { - connectedToZooKeeper.set(1); - } - }); - } - - /** - * Join ServerSet and update endpointStatus. - * This will allow Earlybird consumers, e.g. Blender, to detect when an - * Earlybird goes online and offline. - * @param username - */ - @Override - public void joinServerSet(String username) throws ServerSet.UpdateException { - joinServerSetCounter.increment(); - - synchronized (endpointStatusLock) { - LOG.info("Joining {} ServerSet (instructed by: {}) ...", serverSetNamePrefix, username); - if (endpointStatus != null) { - LOG.warn("Already in ServerSet. Nothing done."); - throw new AlreadyInServerSetUpdateException("Already in ServerSet. 
Nothing done."); - } - - try { - TwitterServerSet.Service service = getServerSetService(); - - ServerSet serverSet = discoveryZKClient.createServerSet(service); - endpointStatus = serverSet.join( - new InetSocketAddress(InetAddress.getLocalHost().getHostName(), port), - Maps.newHashMap(), - partitionConfig.getHostPositionWithinHashPartition()); - - inServerSetGauge.set(1); - - String path = service.getPath(); - EarlybirdStatus.recordEarlybirdEvent("Joined " + serverSetNamePrefix + " ServerSet " + path - + " (instructed by: " + username + ")"); - LOG.info("Successfully joined {} ServerSet {} (instructed by: {})", - serverSetNamePrefix, path, username); - } catch (Exception e) { - endpointStatus = null; - String message = "Failed to join " + serverSetNamePrefix + " ServerSet of partition " - + partitionConfig.getIndexingHashPartitionID(); - LOG.error(message, e); - throw new ServerSet.UpdateException(message, e); - } - } - } - - /** - * Takes this Earlybird out of its registered ServerSet. - * - * @throws ServerSet.UpdateException if there was a problem leaving the ServerSet, - * or if this Earlybird is already not in a ServerSet. - * @param username - */ - @Override - public void leaveServerSet(String username) throws ServerSet.UpdateException { - leaveServerSetCounter.increment(); - synchronized (endpointStatusLock) { - LOG.info("Leaving {} ServerSet (instructed by: {}) ...", serverSetNamePrefix, username); - if (endpointStatus == null) { - String message = "Not in a ServerSet. Nothing done."; - LOG.warn(message); - throw new NotInServerSetUpdateException(message); - } - - endpointStatus.leave(); - endpointStatus = null; - inServerSetGauge.set(0); - EarlybirdStatus.recordEarlybirdEvent("Left " + serverSetNamePrefix - + " ServerSet (instructed by: " + username + ")"); - LOG.info("Successfully left {} ServerSet. 
(instructed by: {})", - serverSetNamePrefix, username); - } - } - - @Override - public int getNumberOfServerSetMembers() - throws InterruptedException, ZooKeeperClient.ZooKeeperConnectionException, KeeperException { - String path = getServerSetService().getPath(); - return discoveryZKClient.getNumberOfServerSetMembers(path); - } - - /** - * Determines if this earlybird is in the server set. - */ - @Override - public boolean isInServerSet() { - synchronized (endpointStatusLock) { - return endpointStatus != null; - } - } - - /** - * Returns the server set that this earlybird should join. - */ - public String getServerSetIdentifier() { - TwitterServerSet.Service service = getServerSetService(); - return String.format("/cluster/local/%s/%s/%s", - service.getRole(), - service.getEnv(), - service.getName()); - } - - private TwitterServerSet.Service getServerSetService() { - // If the tier name is 'all' then it treat it as an untiered EB cluster - // and do not add the tier component into the ZK path it registers under. 
- String tierZKPathComponent = ""; - if (!TierConfig.DEFAULT_TIER_NAME.equalsIgnoreCase(partitionConfig.getTierName())) { - tierZKPathComponent = "/" + partitionConfig.getTierName(); - } - if (EarlybirdConfig.isAurora()) { - // ROLE, EARYLBIRD_NAME, and ENV properties are required on Aurora, thus will be set here - return new TwitterServerSet.Service( - EarlybirdProperty.ROLE.get(), - EarlybirdProperty.ENV.get(), - getServerSetPath(EarlybirdProperty.EARLYBIRD_NAME.get() + tierZKPathComponent)); - } else { - return new TwitterServerSet.Service( - DatabaseConfig.getZooKeeperRole(), - Config.getEnvironment(), - getServerSetPath("earlybird" + tierZKPathComponent)); - } - } - - private String getServerSetPath(String earlybirdName) { - return String.format("%s%s/hash_partition_%d", serverSetNamePrefix, earlybirdName, - partitionConfig.getIndexingHashPartitionID()); - } - - /** - * Join ServerSet for ServiceProxy with a named admin port and with a zookeeper path that Service - * Proxy can translate to a domain name label that is less than 64 characters (due to the size - * limit for domain name labels described here: https://tools.ietf.org/html/rfc1035) - * This will allow us to access Earlybirds that are not on mesos via ServiceProxy. - */ - @Override - public void joinServerSetForServiceProxy() { - // This additional Zookeeper server set is only necessary for Archive Earlybirds which are - // running on bare metal hardware, so ensure that this method is never called for services - // on Aurora. 
- Preconditions.checkArgument(!EarlybirdConfig.isAurora(), - "Attempting to join server set for ServiceProxy on Earlybird running on Aurora"); - - LOG.info("Attempting to join ServerSet for ServiceProxy"); - try { - TwitterServerSet.Service service = getServerSetForServiceProxyOnArchive(); - - ServerSet serverSet = discoveryZKClient.createServerSet(service); - String hostName = InetAddress.getLocalHost().getHostName(); - int adminPort = EarlybirdConfig.getAdminPort(); - serverSet.join( - new InetSocketAddress(hostName, port), - ImmutableMap.of("admin", new InetSocketAddress(hostName, adminPort)), - partitionConfig.getHostPositionWithinHashPartition()); - - String path = service.getPath(); - LOG.info("Successfully joined ServerSet for ServiceProxy {}", path); - inServerSetForServiceProxy = true; - } catch (Exception e) { - String message = "Failed to join ServerSet for ServiceProxy of partition " - + partitionConfig.getIndexingHashPartitionID(); - LOG.warn(message, e); - } - } - - @VisibleForTesting - protected TwitterServerSet.Service getServerSetForServiceProxyOnArchive() { - String serverSetPath = String.format("proxy/%s/p_%d", - partitionConfig.getTierName(), - partitionConfig.getIndexingHashPartitionID()); - return new TwitterServerSet.Service( - DatabaseConfig.getZooKeeperRole(), - Config.getEnvironment(), - serverSetPath); - } - - @VisibleForTesting - protected boolean isInServerSetForServiceProxy() { - return inServerSetForServiceProxy; - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdStatus.docx b/src/java/com/twitter/search/earlybird/EarlybirdStatus.docx new file mode 100644 index 000000000..27175d301 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdStatus.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdStatus.java b/src/java/com/twitter/search/earlybird/EarlybirdStatus.java deleted file mode 100644 index 49ee768e7..000000000 --- 
a/src/java/com/twitter/search/earlybird/EarlybirdStatus.java +++ /dev/null @@ -1,204 +0,0 @@ -package com.twitter.search.earlybird; - -import java.text.SimpleDateFormat; -import java.util.Date; -import java.util.List; -import java.util.Optional; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; - -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.BuildInfo; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.util.Duration; - -/** - * High level status of an Earlybird server. SEARCH-28016 - */ -public final class EarlybirdStatus { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdStatus.class); - - private static final String BUILD_SHA = getBuildShaFromVars(); - - protected static long startTime; - protected static EarlybirdStatusCode statusCode; - protected static String statusMessage; - protected static final AtomicBoolean THRIFT_PORT_OPEN = new AtomicBoolean(false); - protected static final AtomicBoolean WARMUP_THRIFT_PORT_OPEN = new AtomicBoolean(false); - protected static final AtomicBoolean THRIFT_SERVICE_STARTED = new AtomicBoolean(false); - - private static final List EARLYBIRD_SERVER_EVENTS = Lists.newArrayList(); - private static class EarlybirdEvent { - private final String eventName; - private final long timestampMillis; - private final long timeSinceServerStartMillis; - private final long durationMillis; - - public EarlybirdEvent(String eventName, long timestampMillis) { - this(eventName, timestampMillis, -1); - } - - public EarlybirdEvent( - String eventName, - long timestampMillis, - long eventDurationMillis) { - this.eventName = eventName; - this.timestampMillis = timestampMillis; - this.timeSinceServerStartMillis = timestampMillis - startTime; - this.durationMillis = eventDurationMillis; - } - - public 
String getEventLogString() { - String result = String.format( - "%s %s", - new SimpleDateFormat("yyyy-MM-dd HH:mm:ss.SSS").format(new Date(timestampMillis)), - eventName); - - if (durationMillis > 0) { - result += String.format( - ", took: %s", Duration.apply(durationMillis, TimeUnit.MILLISECONDS).toString()); - } - - result += String.format( - ", time since server start: %s", - Duration.apply(timeSinceServerStartMillis, TimeUnit.MILLISECONDS).toString() - ); - - return result; - } - } - - private EarlybirdStatus() { - } - - public static synchronized void setStartTime(long time) { - startTime = time; - LOG.info("startTime set to " + time); - } - - public static synchronized void setStatus(EarlybirdStatusCode code) { - setStatus(code, null); - } - - public static synchronized void setStatus(EarlybirdStatusCode code, String message) { - statusCode = code; - statusMessage = message; - LOG.info("status set to " + code + (message != null ? " with message " + message : "")); - } - - public static synchronized long getStartTime() { - return startTime; - } - - public static synchronized boolean isStarting() { - return statusCode == EarlybirdStatusCode.STARTING; - } - - public static synchronized boolean hasStarted() { - return statusCode == EarlybirdStatusCode.CURRENT; - } - - public static boolean isThriftServiceStarted() { - return THRIFT_SERVICE_STARTED.get(); - } - - public static synchronized EarlybirdStatusCode getStatusCode() { - return statusCode; - } - - public static synchronized String getStatusMessage() { - return (statusMessage == null ? "" : statusMessage + ", ") - + "warmup thrift port is " + (WARMUP_THRIFT_PORT_OPEN.get() ? "OPEN" : "CLOSED") - + ", production thrift port is " + (THRIFT_PORT_OPEN.get() ? 
"OPEN" : "CLOSED"); - } - - public static synchronized void recordEarlybirdEvent(String eventName) { - long timeMillis = System.currentTimeMillis(); - EARLYBIRD_SERVER_EVENTS.add(new EarlybirdEvent(eventName, timeMillis)); - } - - private static String getBeginEventMessage(String eventName) { - return "[Begin Event] " + eventName; - } - - private static String getEndEventMessage(String eventName) { - return "[ End Event ] " + eventName; - } - - /** - * Records the beginning of the given event. - * - * @param eventName The event name. - * @param startupMetric The metric that will be used to keep track of the time for this event. - */ - public static synchronized void beginEvent(String eventName, - SearchIndexingMetricSet.StartupMetric startupMetric) { - long timeMillis = System.currentTimeMillis(); - String eventMessage = getBeginEventMessage(eventName); - LOG.info(eventMessage); - EARLYBIRD_SERVER_EVENTS.add(new EarlybirdEvent(eventMessage, timeMillis)); - - startupMetric.begin(); - } - - /** - * Records the end of the given event. - * - * @param eventName The event name. - * @param startupMetric The metric used to keep track of the time for this event. 
- */ - public static synchronized void endEvent(String eventName, - SearchIndexingMetricSet.StartupMetric startupMetric) { - long timeMillis = System.currentTimeMillis(); - - String beginEventMessage = getBeginEventMessage(eventName); - Optional beginEventOpt = EARLYBIRD_SERVER_EVENTS.stream() - .filter(event -> event.eventName.equals(beginEventMessage)) - .findFirst(); - - String eventMessage = getEndEventMessage(eventName); - LOG.info(eventMessage); - EarlybirdEvent endEvent = new EarlybirdEvent( - eventMessage, - timeMillis, - beginEventOpt.map(e -> timeMillis - e.timestampMillis).orElse(-1L)); - - EARLYBIRD_SERVER_EVENTS.add(endEvent); - - startupMetric.end(endEvent.durationMillis); - } - - public static synchronized void clearAllEvents() { - EARLYBIRD_SERVER_EVENTS.clear(); - } - - public static String getBuildSha() { - return BUILD_SHA; - } - - /** - * Returns the list of all earlybird events that happened since the server started. - */ - public static synchronized Iterable getEarlybirdEvents() { - List eventLog = Lists.newArrayListWithCapacity(EARLYBIRD_SERVER_EVENTS.size()); - for (EarlybirdEvent event : EARLYBIRD_SERVER_EVENTS) { - eventLog.add(event.getEventLogString()); - } - return eventLog; - } - - private static String getBuildShaFromVars() { - BuildInfo buildInfo = new BuildInfo(); - String buildSha = buildInfo.getProperties().getProperty(BuildInfo.Key.GIT_REVISION.value); - if (buildSha != null) { - return buildSha; - } else { - return "UNKNOWN"; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.docx b/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.docx new file mode 100644 index 000000000..7c28d0374 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.java b/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.java deleted file mode 100644 index 446bd9171..000000000 --- 
a/src/java/com/twitter/search/earlybird/EarlybirdWarmUpManager.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.search.earlybird; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.common.zookeeper.ServerSet; -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; - -public class EarlybirdWarmUpManager { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdWarmUpManager.class); - private static final String WARM_UP_ON_DURATION_DECIDER_KEY_PATTERN = - "%s_warm_up_duration_seconds"; - - private final EarlybirdServerSetManager earlybirdServerSetManager; - private final String clusterName; - private final SearchIndexingMetricSet.StartupMetric startUpInWarmUpMetric; - private final Decider decider; - private final Clock clock; - - public EarlybirdWarmUpManager(EarlybirdServerSetManager earlybirdServerSetManager, - PartitionConfig partitionConfig, - SearchIndexingMetricSet searchIndexingMetricSet, - Decider decider, - Clock clock) { - this.earlybirdServerSetManager = earlybirdServerSetManager; - this.clusterName = partitionConfig.getClusterName(); - this.startUpInWarmUpMetric = searchIndexingMetricSet.startupInWarmUp; - this.decider = decider; - this.clock = clock; - } - - public String getServerSetIdentifier() { - return earlybirdServerSetManager.getServerSetIdentifier(); - } - - /** - * Warms up the earlybird. The earlybird joins a special server set that gets production dark - * reads, and leaves this server set after a specified period of time. 
- */ - public void warmUp() throws InterruptedException, ServerSet.UpdateException { - int warmUpDurationSeconds = DeciderUtil.getAvailability( - decider, - String.format(WARM_UP_ON_DURATION_DECIDER_KEY_PATTERN, clusterName.replaceAll("-", "_"))); - if (warmUpDurationSeconds == 0) { - LOG.info(String.format("Warm up stage duration for cluster %s set to 0. Skipping.", - clusterName)); - return; - } - - earlybirdServerSetManager.joinServerSet("internal warm up"); - - // If doWarmUp() is interrupted, try to leave the server set, and propagate the - // InterruptedException. Otherwise, try to leave the server set, and propagate any exception - // that it might throw. - InterruptedException warmUpInterruptedException = null; - try { - doWarmUp(warmUpDurationSeconds); - } catch (InterruptedException e) { - warmUpInterruptedException = e; - throw e; - } finally { - if (warmUpInterruptedException != null) { - try { - earlybirdServerSetManager.leaveServerSet("internal warm up"); - } catch (Exception e) { - warmUpInterruptedException.addSuppressed(e); - } - } else { - earlybirdServerSetManager.leaveServerSet("internal warm up"); - } - } - } - - @VisibleForTesting - protected void doWarmUp(int warmUpDurationSeconds) throws InterruptedException { - long warmUpStartTimeMillis = clock.nowMillis(); - LOG.info(String.format("Warming up for %d seconds.", warmUpDurationSeconds)); - EarlybirdStatus.beginEvent("warm_up", startUpInWarmUpMetric); - - // Sleep for warmUpDurationSeconds seconds, but check if the server is going down every second. 
- int count = 0; - try { - while ((count++ < warmUpDurationSeconds) - && (EarlybirdStatus.getStatusCode() != EarlybirdStatusCode.STOPPING)) { - clock.waitFor(1000); - } - } finally { - LOG.info(String.format("Done warming up after %d milliseconds.", - clock.nowMillis() - warmUpStartTimeMillis)); - EarlybirdStatus.endEvent("warm_up", startUpInWarmUpMetric); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/QualityFactor.docx b/src/java/com/twitter/search/earlybird/QualityFactor.docx new file mode 100644 index 000000000..fab730256 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/QualityFactor.docx differ diff --git a/src/java/com/twitter/search/earlybird/QualityFactor.java b/src/java/com/twitter/search/earlybird/QualityFactor.java deleted file mode 100644 index 601426cdf..000000000 --- a/src/java/com/twitter/search/earlybird/QualityFactor.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.twitter.search.earlybird; - -/** - * Interface defining a quality factor. - */ -public interface QualityFactor { - /** - * Returns the current quality factor. - * @return The quality factor; a number between 0.0 and 1.0. - */ - double get(); - - /** - * Starts a thread to update the quality factor periodically. - */ - void startUpdates(); -} diff --git a/src/java/com/twitter/search/earlybird/README.docx b/src/java/com/twitter/search/earlybird/README.docx new file mode 100644 index 000000000..c7513c5d4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/README.docx differ diff --git a/src/java/com/twitter/search/earlybird/README.md b/src/java/com/twitter/search/earlybird/README.md deleted file mode 100644 index c26adedcf..000000000 --- a/src/java/com/twitter/search/earlybird/README.md +++ /dev/null @@ -1,83 +0,0 @@ -# Search Index (Earlybird) main classes - -> **TL;DR** Earlybird (Search Index) find tweets from people you follow, rank them, and serve them to Home. 
- -## What is Earlybird (Search Index) - -[Earlybird](http://notes.stephenholiday.com/Earlybird.pdf) is a **real-time search system** based on [Apache Lucene](https://lucene.apache.org/) to support the high volume of queries and content updates. The major use cases are Relevance Search (specifically, Text search) and Timeline In-network Tweet retrieval (or UserID based search). It is designed to enable the efficient indexing and querying of billions of tweets, and to provide low-latency search results, even with heavy query loads. - -## High-level architecture -We split our entire tweet search index into three clusters: a **realtime** cluster indexing all public tweets posted in about the last 7 days, a **protected** cluster indexing all protected tweets for the same timeframe; and an **archive** cluster indexing all tweets ever posted, up to about two days ago. - -Earlybird addresses the challenges of scaling real-time search by splitting each cluster across multiple **partitions**, each responsible for a portion of the index. The architecture uses a distributed *inverted index* that is sharded and replicated. This design allows for efficient index updates and query processing. - -The system also employs an incremental indexing approach, enabling it to process and index new tweets in real-time as they arrive. With single writer, multiple reader structure, Earlybird can handle a large number of real-time updates and queries concurrently while maintaining low query latency. The system can achieve high query throughput and low query latency while maintaining a high degree of index freshness. - -## Main Components - -**Partition Manager**: Responsible for managing the configuration of partitions, as well as the mapping between users and partitions. It also handles index loading and flushing. - -**Real-time Indexer**: Continuously reads from a kafka stream of incoming tweets and updates the index (tweet creation, tweet updates, user updates). 
It also supports tweet deletion events. - -**Query Engine**: Handles the execution of search queries against the distributed index. It employs various optimization techniques, such as term-based pruning and caching. - -**Document Preprocessor**: Converts raw tweets into a document representation suitable for indexing. It handles tokenization, normalization, and analysis of tweet text and metadata. See our ingestion pipeline `src/java/com/twitter/search/ingester` for more write-path processing. - -**Index Writer**: Writes tweet documents to the index and maintains the index structure, including **posting lists** and **term dictionaries**. - -**Segment Manager**: Manages index segments within a partition. It is responsible for merging, optimizing, and flushing index segments to disk, or flush to HDFS to snapshot live segments. - -**Searcher**: Executes queries against the index, using techniques like caching and parallel query execution to minimize query latency. It also incorporates scoring models and ranking algorithms to provide relevant search results. - -The most important two data structures for Earlybird (or Information Retrieval in general) including: - -* **Inverted Index** which stores a mapping between a Term to a list of Doc IDs. Essentially, we build a hash map: each key in the map is a distinct Term (e.g., `cat`, `dog`) in a tweet, and each value is the list of tweets (aka., Document) in which the word appears. We keep one inverted index per field (text, UserID, user name, links, etc.) -* **Postings List** which optimize the storage a the list of Doc IDs mentioned above. - -See more at: https://blog.twitter.com/engineering/en_us/topics/infrastructure/2016/omnisearch-index-formats - -## Advanced features - -Earlybird incorporates several advanced features such as facet search, which allows users to refine search results based on specific attributes such as user mentions, hashtags, and URLs. 
Furthermore, the system supports various ranking models, including machine learning-based scoring models, to provide relevant search results. - -## Directory Structure -The project consists of several packages and files, which can be summarized as follows: - -* At the root level, the primary focus is on the Earlybird server implementation and its associated classes. These include classes for search, CPU quality factors, server management, index config, main classes, server startup, etc. -* `archive/`: Directory deals with the management and configuration of archived data, specifically for Earlybird Index Configurations. It also contains a `segmentbuilder/` subdirectory, which includes classes for building and updating archive index segments. -* `common/`: Directory holds utility classes for logging, handling requests, and Thrift backend functionality. It also has two subdirectories: `config/` for Earlybird configuration and `userupdates/` for user-related data handling. -* `config/`: Directory is dedicated to managing tier configurations specifically for archive cluster, which relate to server and search query distribution. -* `document/`: Handles document creation and processing, including various factories and token stream writers. -* `exception/`: Contains custom exceptions and exception handling classes related to the system. -* `factory/`: Provides utilities and factories for configurations, Kafka consumers, and server instances. -* `index/`: Contains index-related classes, including in-memory time mappers, tweet ID mappers, and facets. -* `ml/`: Houses the `ScoringModelsManager` for managing machine learning models. -* `partition/`: Manages partitions and index segments, including index loaders, segment writers, and startup indexers. -* `querycache/`: Implements caching for queries and query results, including cache configuration and update tasks. 
-* `queryparser/`: Provides query parsing functionality, including files that cover query rewriters and lhigh-frequency term extraction. -* `search/`: Contains read path related classes, such as search request processing, result collectors, and facet collectors. -* `segment/`: Provides classes for managing segment data providers and data reader sets. -* `stats/`: Contains classes for tracking and reporting statistics related to the system. -* `tools/`: Houses utility classes for deserializing thrift requests. -* `util/`: Includes utility classes for various tasks, such as action logging, scheduled tasks, and JSON viewers. - -## Related Services - -* The Earlybirds sit behind Earlybird Root servers that fan out queries to them. See `src/java/com/twitter/search/earlybird_root/` -* The Earlybirds are powered by multiple ingestion pipelines. See `src/java/com/twitter/search/ingester/` -* Earlybird segments for the Archives are built offline by segment builders -* Also, Earlybird light ranking is defined in `timelines/data_processing/ad_hoc/earlybird_ranking` - and `src/python/twitter/deepbird/projects/timelines/scripts/models/earlybird`. 
-* Search common library/packages - -## References - -See more: - -* "Earlybird: Real-Time Search at Twitter" (http://notes.stephenholiday.com/Earlybird.pdf) -* "Reducing search indexing latency to one second" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2020/reducing-search-indexing-latency-to-one-second) -* "Omnisearch index formats" (https://blog.twitter.com/engineering/en_us/topics/infrastructure/2016/omnisearch-index-formats) - - - - diff --git a/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.docx b/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.docx new file mode 100644 index 000000000..7f5133548 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.java b/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.java deleted file mode 100644 index 5e95903f0..000000000 --- a/src/java/com/twitter/search/earlybird/RealtimeEarlybirdIndexConfig.java +++ /dev/null @@ -1,128 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.schema.SearchWhitespaceAnalyzer; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.CloseResourceUtil; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentData; -import 
com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.core.earlybird.index.inverted.IndexOptimizer; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.OptimizedTimeMapper; -import com.twitter.search.earlybird.index.OptimizedTweetIDMapper; -import com.twitter.search.earlybird.index.OutOfOrderRealtimeTweetIDMapper; -import com.twitter.search.earlybird.index.RealtimeTimeMapper; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentSyncInfo; - -/** - * Index config for the Real-Time in-memory Tweet cluster. - */ -public class RealtimeEarlybirdIndexConfig extends EarlybirdIndexConfig { - private final CloseResourceUtil resourceCloser = new CloseResourceUtil(); - - public RealtimeEarlybirdIndexConfig( - EarlybirdCluster cluster, Decider decider, SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - super(cluster, decider, searchIndexingMetricSet, criticalExceptionHandler); - } - - public RealtimeEarlybirdIndexConfig( - EarlybirdCluster cluster, DynamicSchema schema, Decider decider, - SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - super(cluster, schema, decider, searchIndexingMetricSet, criticalExceptionHandler); - } - - @Override - public Directory newLuceneDirectory(SegmentSyncInfo segmentSyncInfo) { - return new RAMDirectory(); - } - - @Override - public IndexWriterConfig newIndexWriterConfig() { - return new IndexWriterConfig(new SearchWhitespaceAnalyzer()) - .setSimilarity(IndexSearcher.getDefaultSimilarity()); - } - - @Override - public EarlybirdIndexSegmentData newSegmentData( - int maxSegmentSize, - long timeSliceID, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory) { - return new EarlybirdRealtimeIndexSegmentData( - maxSegmentSize, - 
timeSliceID, - getSchema(), - new OutOfOrderRealtimeTweetIDMapper(maxSegmentSize, timeSliceID), - new RealtimeTimeMapper(maxSegmentSize), - extensionsFactory); - } - - @Override - public EarlybirdIndexSegmentData loadSegmentData( - FlushInfo flushInfo, - DataDeserializer dataInputStream, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory) throws IOException { - EarlybirdRealtimeIndexSegmentData.InMemorySegmentDataFlushHandler flushHandler; - boolean isOptimized = flushInfo.getBooleanProperty( - EarlybirdIndexSegmentData.AbstractSegmentDataFlushHandler.IS_OPTIMIZED_PROP_NAME); - if (isOptimized) { - flushHandler = new EarlybirdRealtimeIndexSegmentData.InMemorySegmentDataFlushHandler( - getSchema(), - extensionsFactory, - new OptimizedTweetIDMapper.FlushHandler(), - new OptimizedTimeMapper.FlushHandler()); - } else { - flushHandler = new EarlybirdRealtimeIndexSegmentData.InMemorySegmentDataFlushHandler( - getSchema(), - extensionsFactory, - new OutOfOrderRealtimeTweetIDMapper.FlushHandler(), - new RealtimeTimeMapper.FlushHandler()); - } - - - return flushHandler.load(flushInfo, dataInputStream); - } - - @Override - public EarlybirdIndexSegmentData optimize( - EarlybirdIndexSegmentData earlybirdIndexSegmentData) throws IOException { - Preconditions.checkArgument( - earlybirdIndexSegmentData instanceof EarlybirdRealtimeIndexSegmentData, - "Expected EarlybirdRealtimeIndexSegmentData but got %s", - earlybirdIndexSegmentData.getClass()); - - return IndexOptimizer.optimize((EarlybirdRealtimeIndexSegmentData) earlybirdIndexSegmentData); - } - - @Override - public boolean isIndexStoredOnDisk() { - return false; - } - - @Override - public final CloseResourceUtil getResourceCloser() { - return resourceCloser; - } - - @Override - public boolean supportOutOfOrderIndexing() { - return true; - } -} diff --git a/src/java/com/twitter/search/earlybird/RecentTweetRestriction.docx b/src/java/com/twitter/search/earlybird/RecentTweetRestriction.docx new file mode 100644 
index 000000000..218de54d4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/RecentTweetRestriction.docx differ diff --git a/src/java/com/twitter/search/earlybird/RecentTweetRestriction.java b/src/java/com/twitter/search/earlybird/RecentTweetRestriction.java deleted file mode 100644 index 7c25bfb14..000000000 --- a/src/java/com/twitter/search/earlybird/RecentTweetRestriction.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.earlybird; - -import scala.Option; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.decider.Decider; - -public final class RecentTweetRestriction { - private static final String RECENT_TWEETS_THRESHOLD = "recent_tweets_threshold"; - private static final String QUERY_CACHE_UNTIL_TIME = "query_cache_until_time"; - - @VisibleForTesting - public static final int DEFAULT_RECENT_TWEET_SECONDS = 15; - - private RecentTweetRestriction() { - } - - /** - * Returns the point in time (in seconds past the unix epoch) before which all tweets will be - * completely indexed. This is required by some clients, because they rely on Earlybird monotonically - * indexing tweets by ID and that tweets are completely indexed when they see them. - * - * @param lastTime The time at which the most recent tweet was indexed, in seconds since the unix - * epoch. - */ - public static int recentTweetsUntilTime(Decider decider, int lastTime) { - return untilTimeSeconds(decider, lastTime, RECENT_TWEETS_THRESHOLD); - } - - /** - * Returns the point in time (in seconds past the unix epoch) before which all tweets will be - * completely indexed. This is required by some clients, because they rely on Earlybird monotonically - * indexing tweets by ID and that tweets are completely indexed when they see them. - * - * @param lastTime The time at which the most recent tweet was indexed, in seconds since the unix - * epoch. 
- */ - public static int queryCacheUntilTime(Decider decider, int lastTime) { - return untilTimeSeconds(decider, lastTime, QUERY_CACHE_UNTIL_TIME); - } - - private static int untilTimeSeconds(Decider decider, int lastTime, String deciderKey) { - int recentTweetSeconds = getRecentTweetSeconds(decider, deciderKey); - - if (recentTweetSeconds == 0) { - return 0; - } - - return lastTime - recentTweetSeconds; - } - - private static int getRecentTweetSeconds(Decider decider, String deciderKey) { - Option deciderValue = decider.getAvailability(deciderKey); - if (deciderValue.isDefined()) { - return (int) deciderValue.get(); - } - return DEFAULT_RECENT_TWEET_SECONDS; - } -} diff --git a/src/java/com/twitter/search/earlybird/ServerSetMember.docx b/src/java/com/twitter/search/earlybird/ServerSetMember.docx new file mode 100644 index 000000000..42be61be5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/ServerSetMember.docx differ diff --git a/src/java/com/twitter/search/earlybird/ServerSetMember.java b/src/java/com/twitter/search/earlybird/ServerSetMember.java deleted file mode 100644 index a287f6950..000000000 --- a/src/java/com/twitter/search/earlybird/ServerSetMember.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.twitter.search.earlybird; - -import org.apache.zookeeper.KeeperException; - -import com.twitter.common.zookeeper.ServerSet; -import com.twitter.common.zookeeper.ZooKeeperClient; - -/** - * Represents a server that can add and remove itself from a server set. - */ -public interface ServerSetMember { - /** - * Makes this server join its server set. - * - * @throws ServerSet.UpdateException - * @param requestSource - */ - void joinServerSet(String requestSource) throws ServerSet.UpdateException; - - /** - * Makes this server leave its server set. 
- * - * @throws ServerSet.UpdateException - * @param requestSource - */ - void leaveServerSet(String requestSource) throws ServerSet.UpdateException; - - /** - * Gets and returns the current number of members in this server's server set. - * - * @return number of members currently in this host's server set. - * @throws InterruptedException - * @throws ZooKeeperClient.ZooKeeperConnectionException - * @throws KeeperException - */ - int getNumberOfServerSetMembers() throws InterruptedException, - ZooKeeperClient.ZooKeeperConnectionException, KeeperException; - - /** - * Checks if this earlybird is in the server set. - * - * @return true if it is, false otherwise. - */ - boolean isInServerSet(); - - /** - * Should only be called for Archive Earlybirds. - * - * Join ServerSet for ServiceProxy with a named admin port and with a zookeeper path that Service - * Proxy can translate to a domain name label that is less than 64 characters (due to the size - * limit for domain name labels described here: https://tools.ietf.org/html/rfc1035) - * This will allow us to access Earlybirds that are not on mesos via ServiceProxy. 
- */ - void joinServerSetForServiceProxy(); -} diff --git a/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.docx b/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.docx new file mode 100644 index 000000000..80f19da98 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.java b/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.java deleted file mode 100644 index 24b1cbce3..000000000 --- a/src/java/com/twitter/search/earlybird/UpdateableEarlybirdStateManager.java +++ /dev/null @@ -1,437 +0,0 @@ -package com.twitter.search.earlybird; - -import java.io.File; -import java.io.IOException; -import java.util.Random; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Charsets; - -import org.apache.thrift.TException; -import org.apache.zookeeper.KeeperException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.common.zookeeper.ZooKeeperClient; -import com.twitter.search.common.aurora.AuroraSchedulerClient; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.file.LocalFile; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.schema.AnalyzerFactory; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.schema.ImmutableSchema; -import com.twitter.search.common.schema.base.Schema; -import 
com.twitter.search.common.schema.thriftjava.ThriftSchema; -import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager; -import com.twitter.search.common.util.thrift.ThriftUtils; -import com.twitter.search.common.util.zookeeper.ZooKeeperProxy; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.ml.ScoringModelsManager; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.PartitionConfigLoader; -import com.twitter.search.earlybird.partition.PartitionConfigLoadingException; -import com.twitter.search.earlybird.util.OneTaskScheduledExecutorManager; -import com.twitter.search.earlybird.util.PeriodicActionParams; -import com.twitter.search.earlybird.util.ShutdownWaitTimeParams; - -/** - * A class that keeps track of Earlybird state that may change while an Earlybird runs, and keeps - * that state up to date. Currently keeps track of the current Earlybird schema and partition - * configuration, and periodically updates them from Zookeeper. It also reloads periodically the - * scoring models from HDFS. 
- */ -public class UpdateableEarlybirdStateManager extends OneTaskScheduledExecutorManager { - private static final Logger LOG = LoggerFactory.getLogger(UpdateableEarlybirdStateManager.class); - public static final String SCHEMA_SUFFIX = ".schema.v"; - - private static final String THREAD_NAME_PATTERN = "state_update-%d"; - private static final boolean THREAD_IS_DAEMON = true; - private static final long EXECUTOR_SHUTDOWN_WAIT_SEC = 5; - - private static final String DEFAULT_ZK_SCHEMA_LOCATION = - "/twitter/search/production/earlybird/schema"; - private static final String DEFAULT_LOCAL_SCHEMA_LOCATION = - "/home/search/earlybird_schema_canary"; - private static final long DEFAULT_UPDATE_PERIOD_MILLIS = - TimeUnit.MINUTES.toMillis(30); - - private static final String SCHEMA_MAJOR_VERSION_NAME = - "schema_major_version"; - private static final String SCHEMA_MINOR_VERSION_NAME = - "schema_minor_version"; - private static final String LAST_SUCCESSFUL_SCHEMA_RELOAD_TIME_MILLIS_NAME = - "last_successful_schema_reload_timestamp_millis"; - @VisibleForTesting - static final String FAIL_TO_LOAD_SCHEMA_COUNT_NAME = - "fail_to_load_schema_count"; - @VisibleForTesting - static final String HOST_IS_CANARY_SCHEME = "host_is_canary_schema"; - @VisibleForTesting - static final String DID_NOT_FIND_SCHEMA_COUNT_NAME = - "did_not_find_schema_count"; - private static final String LAST_SUCCESSFUL_PARTITION_CONFIG_RELOAD_TIME_MILLIS_NAME = - "last_successful_partition_config_reload_timestamp_millis"; - @VisibleForTesting - static final String FAIL_TO_LOAD_PARTITION_CONFIG_COUNT_NAME = - "fail_to_load_partition_config_count"; - @VisibleForTesting - static final String HOST_IS_IN_LAYOUT_STAT_NAME = "host_is_in_layout"; - private static final String NOT_IN_LAYOUT_SHUT_DOWN_ATTEMPTED_NAME = - "not_in_layout_shut_down_attempted"; - - private static final String SHUT_DOWN_EARLYBIRD_WHEN_NOT_IN_LAYOUT_DECIDER_KEY = - "shut_down_earlybird_when_not_in_layout"; - - private static final String 
NO_SHUTDOWN_WHEN_NOT_IN_LAYOUT_NAME = - "no_shutdown_when_not_in_layout"; - - private final SearchLongGauge schemaMajorVersion; - private final SearchLongGauge schemaMinorVersion; - private final SearchLongGauge lastSuccessfulSchemaReloadTimeMillis; - private final SearchCounter failToLoadSchemaCount; - private final SearchLongGauge hostIsCanarySchema; - private final SearchCounter didNotFindSchemaCount; - private final SearchLongGauge lastSuccessfulPartitionConfigReloadTimeMillis; - private final SearchCounter failToLoadPartitionConfigCount; - private final SearchLongGauge hostIsInLayout; - private final SearchCounter notInLayoutShutDownAttemptedCount; - private final SearchLongGauge noShutdownWhenNotInLayoutGauge; - - private final EarlybirdIndexConfig indexConfig; - private final DynamicPartitionConfig partitionConfig; - private final String schemaLocationOnLocal; - private final String schemaLocationOnZK; - private final ZooKeeperProxy zkClient; - private final AuroraSchedulerClient schedulerClient; - private final ScoringModelsManager scoringModelsManager; - private final TensorflowModelsManager tensorflowModelsManager; - private final SearchDecider searchDecider; - private final AtomicLong noShutdownWhenNotInLayout; - private EarlybirdServer earlybirdServer; - private Clock clock; - - public UpdateableEarlybirdStateManager( - EarlybirdIndexConfig indexConfig, - DynamicPartitionConfig partitionConfig, - ZooKeeperProxy zooKeeperClient, - @Nullable AuroraSchedulerClient schedulerClient, - ScheduledExecutorServiceFactory executorServiceFactory, - ScoringModelsManager scoringModelsManager, - TensorflowModelsManager tensorflowModelsManager, - SearchStatsReceiver searchStatsReceiver, - SearchDecider searchDecider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - this( - indexConfig, - partitionConfig, - DEFAULT_LOCAL_SCHEMA_LOCATION, - DEFAULT_ZK_SCHEMA_LOCATION, - DEFAULT_UPDATE_PERIOD_MILLIS, - zooKeeperClient, - schedulerClient, - 
executorServiceFactory, - scoringModelsManager, - tensorflowModelsManager, - searchStatsReceiver, - searchDecider, - criticalExceptionHandler, - clock); - } - - protected UpdateableEarlybirdStateManager( - EarlybirdIndexConfig indexConfig, - DynamicPartitionConfig partitionConfig, - String schemaLocationOnLocal, - String schemaLocationOnZK, - long updatePeriodMillis, - ZooKeeperProxy zkClient, - @Nullable AuroraSchedulerClient schedulerClient, - ScheduledExecutorServiceFactory executorServiceFactory, - ScoringModelsManager scoringModelsManager, - TensorflowModelsManager tensorflowModelsManager, - SearchStatsReceiver searchStatsReceiver, - SearchDecider searchDecider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - super( - executorServiceFactory, - THREAD_NAME_PATTERN, - THREAD_IS_DAEMON, - PeriodicActionParams.withFixedDelay( - updatePeriodMillis, - TimeUnit.MILLISECONDS - ), - new ShutdownWaitTimeParams( - EXECUTOR_SHUTDOWN_WAIT_SEC, - TimeUnit.SECONDS - ), - searchStatsReceiver, - criticalExceptionHandler); - this.indexConfig = indexConfig; - this.partitionConfig = partitionConfig; - this.schemaLocationOnLocal = schemaLocationOnLocal; - this.schemaLocationOnZK = schemaLocationOnZK; - this.zkClient = zkClient; - this.schedulerClient = schedulerClient; - this.scoringModelsManager = scoringModelsManager; - this.searchDecider = searchDecider; - this.noShutdownWhenNotInLayout = new AtomicLong(0); - this.tensorflowModelsManager = tensorflowModelsManager; - this.clock = clock; - this.schemaMajorVersion = getSearchStatsReceiver().getLongGauge( - SCHEMA_MAJOR_VERSION_NAME); - this.schemaMinorVersion = getSearchStatsReceiver().getLongGauge( - SCHEMA_MINOR_VERSION_NAME); - this.lastSuccessfulSchemaReloadTimeMillis = getSearchStatsReceiver().getLongGauge( - LAST_SUCCESSFUL_SCHEMA_RELOAD_TIME_MILLIS_NAME); - this.failToLoadSchemaCount = getSearchStatsReceiver().getCounter( - FAIL_TO_LOAD_SCHEMA_COUNT_NAME); - this.hostIsCanarySchema = 
getSearchStatsReceiver().getLongGauge(HOST_IS_CANARY_SCHEME); - this.didNotFindSchemaCount = getSearchStatsReceiver().getCounter( - DID_NOT_FIND_SCHEMA_COUNT_NAME); - this.lastSuccessfulPartitionConfigReloadTimeMillis = getSearchStatsReceiver().getLongGauge( - LAST_SUCCESSFUL_PARTITION_CONFIG_RELOAD_TIME_MILLIS_NAME); - this.failToLoadPartitionConfigCount = getSearchStatsReceiver().getCounter( - FAIL_TO_LOAD_PARTITION_CONFIG_COUNT_NAME); - this.hostIsInLayout = getSearchStatsReceiver().getLongGauge( - HOST_IS_IN_LAYOUT_STAT_NAME); - this.notInLayoutShutDownAttemptedCount = getSearchStatsReceiver().getCounter( - NOT_IN_LAYOUT_SHUT_DOWN_ATTEMPTED_NAME); - this.noShutdownWhenNotInLayoutGauge = getSearchStatsReceiver().getLongGauge( - NO_SHUTDOWN_WHEN_NOT_IN_LAYOUT_NAME, noShutdownWhenNotInLayout); - - updateSchemaVersionStats(indexConfig.getSchema()); - } - - private void updateSchemaVersionStats(Schema schema) { - schemaMajorVersion.set(schema.getMajorVersionNumber()); - schemaMinorVersion.set(schema.getMinorVersionNumber()); - lastSuccessfulSchemaReloadTimeMillis.set(System.currentTimeMillis()); - lastSuccessfulPartitionConfigReloadTimeMillis.set(System.currentTimeMillis()); - hostIsInLayout.set(1); - } - - private void updateSchemaVersionWithThriftSchema(ThriftSchema thriftSchema) - throws Schema.SchemaValidationException, DynamicSchema.SchemaUpdateException { - - ImmutableSchema newSchema = new ImmutableSchema( - thriftSchema, new AnalyzerFactory(), indexConfig.getCluster().getNameForStats()); - indexConfig.getSchema().updateSchema(newSchema); - tensorflowModelsManager.updateFeatureSchemaIdToMlIdMap(newSchema.getSearchFeatureSchema()); - updateSchemaVersionStats(indexConfig.getSchema()); - LOG.info("Schema updated. New Schema is: \n" + ThriftUtils.toTextFormatSafe(thriftSchema)); - } - - protected void updateSchema(ZooKeeperProxy zkClientToUse) { - // There are 3 cases: - // 1. 
Try to locate local schema file to canary, it might fail either because file not exist or - // ineligible versions. - // 2. Canary local schema failed, lookup schema file from zookeeper. - // 3. Both local and zookeeper updates failed, we do not update schema. Either schema not exists - // in zookeeper, or this would happened after canary schema: we updated current schema but did - // not rollback after finished. - if (updateSchemaFromLocal()) { - LOG.info("Host is used for schema canary"); - hostIsCanarySchema.set(1); - } else if (updateSchemaFromZooKeeper(zkClientToUse)) { - // Host is using schema file from zookeeper - hostIsCanarySchema.set(0); - } else { - // Schema update failed. Please check schema file exists on zookeeper and make sure - // rollback after canary. Current version: {}.{} - return; - } - } - - private boolean updateSchemaFromLocal() { - ThriftSchema thriftSchema = - loadCanaryThriftSchemaFromLocal(getCanarySchemaFileOnLocal()); - if (thriftSchema == null) { - // It is expected to not find a local schema file. The schema file only exists when the host - // is used as canary for schema updates - return false; - } - return updateSchemaFromThriftSchema(thriftSchema); - } - - private boolean updateSchemaFromZooKeeper(ZooKeeperProxy zkClientToUse) { - ThriftSchema thriftSchema = loadThriftSchemaFromZooKeeper(zkClientToUse); - if (thriftSchema == null) { - // It is expected to usually not find a schema file on ZooKeeper; one is only uploaded if the - // schema changes after the package has been compiled. All the relevant error handling and - // logging is expected to be handled by loadThriftSchemaFromZooKeeper(). 
- failToLoadSchemaCount.increment(); - return false; - } - return updateSchemaFromThriftSchema(thriftSchema); - } - - private boolean updateSchemaFromThriftSchema(ThriftSchema thriftSchema) { - Schema currentSchema = indexConfig.getSchema(); - if (thriftSchema.getMajorVersionNumber() != currentSchema.getMajorVersionNumber()) { - LOG.warn( - "Major version updates are not allowed. Current major version {}, try to update to {}", - currentSchema.getMajorVersionNumber(), thriftSchema.getMajorVersionNumber()); - return false; - } - if (thriftSchema.getMinorVersionNumber() > currentSchema.getMinorVersionNumber()) { - try { - updateSchemaVersionWithThriftSchema(thriftSchema); - } catch (Schema.SchemaValidationException | DynamicSchema.SchemaUpdateException e) { - LOG.warn("Exception while updating schema: ", e); - return false; - } - return true; - } else if (thriftSchema.getMinorVersionNumber() == currentSchema.getMinorVersionNumber()) { - LOG.info("Schema version to update is same as current one: {}.{}", - currentSchema.getMajorVersionNumber(), currentSchema.getMinorVersionNumber()); - return true; - } else { - LOG.info("Found schema to update, but not eligible for dynamic update. 
" - + "Current Version: {}.{}; Schema Version for updates: {}.{}", - currentSchema.getMajorVersionNumber(), - currentSchema.getMinorVersionNumber(), - thriftSchema.getMajorVersionNumber(), - thriftSchema.getMinorVersionNumber()); - return false; - } - } - - void updatePartitionConfig(@Nullable AuroraSchedulerClient schedulerClientToUse) { - try { - if (schedulerClientToUse == null) { - NonPagingAssert.assertFailed("aurora_scheduler_client_is_null"); - throw new PartitionConfigLoadingException("AuroraSchedulerClient can not be null."); - } - - PartitionConfig newPartitionConfig = - PartitionConfigLoader.getPartitionInfoForMesosConfig(schedulerClientToUse); - partitionConfig.setCurrentPartitionConfig(newPartitionConfig); - lastSuccessfulPartitionConfigReloadTimeMillis.set(System.currentTimeMillis()); - hostIsInLayout.set(1); - } catch (PartitionConfigLoadingException e) { - // Do not change hostIsInLayout's value if we could not load the layout. - LOG.warn("Failed to load partition config from ZooKeeper.", e); - failToLoadPartitionConfigCount.increment(); - } - } - - @Nullable - private ThriftSchema loadCanaryThriftSchemaFromLocal(LocalFile schemaFile) { - String schemaString; - if (!schemaFile.getFile().exists()) { - return null; - } - try { - schemaString = schemaFile.getCharSource().read(); - } catch (IOException e) { - LOG.warn("Fail to read from local schema file."); - return null; - } - ThriftSchema thriftSchema = new ThriftSchema(); - try { - ThriftUtils.fromTextFormat(schemaString, thriftSchema); - return thriftSchema; - } catch (TException e) { - LOG.warn("Unable to deserialize ThriftSchema loaded locally from {}.\n{}", - schemaFile.getName(), e); - return null; - } - } - - @Nullable - private ThriftSchema loadThriftSchemaFromZooKeeper(ZooKeeperProxy zkClientToUse) { - String schemaPathOnZk = getFullSchemaPathOnZK(); - byte[] rawBytes; - try { - rawBytes = zkClientToUse.getData(schemaPathOnZk, false, null); - } catch (KeeperException.NoNodeException e) { - 
didNotFindSchemaCount.increment(); - return null; - } catch (KeeperException e) { - LOG.warn("Exception while loading schema from ZK at {}.\n{}", schemaPathOnZk, e); - return null; - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - LOG.warn("Interrupted while loading schema from ZK at {}.\n{}", schemaPathOnZk, e); - return null; - } catch (ZooKeeperClient.ZooKeeperConnectionException e) { - LOG.warn("Exception while loading schema from ZK at {}.\n{}", schemaPathOnZk, e); - return null; - } - if (rawBytes == null) { - LOG.warn("Got null schema from ZooKeeper at {}.", schemaPathOnZk); - return null; - } - String schemaString = new String(rawBytes, Charsets.UTF_8); - ThriftSchema thriftSchema = new ThriftSchema(); - try { - ThriftUtils.fromTextFormat(schemaString, thriftSchema); - return thriftSchema; - } catch (TException e) { - LOG.warn("Unable to deserialize ThriftSchema loaded from ZK at {}.\n{}", schemaPathOnZk, e); - return null; - } - } - - @VisibleForTesting - protected String getSchemaFileName() { - return indexConfig.getCluster().name().toLowerCase() - + UpdateableEarlybirdStateManager.SCHEMA_SUFFIX - + indexConfig.getSchema().getMajorVersionNumber(); - } - - @VisibleForTesting - protected String getFullSchemaPathOnZK() { - return String.format("%s/%s", schemaLocationOnZK, getSchemaFileName()); - } - - LocalFile getCanarySchemaFileOnLocal() { - String canarySchemaFilePath = - String.format("%s/%s", schemaLocationOnLocal, getSchemaFileName()); - return new LocalFile(new File(canarySchemaFilePath)); - } - - void setNoShutdownWhenNotInLayout(boolean noShutdown) { - noShutdownWhenNotInLayout.set(noShutdown ? 
1 : 0); - } - - @Override - protected void runOneIteration() { - updateSchema(zkClient); - updatePartitionConfig(schedulerClient); - - LOG.info("Reloading models."); - scoringModelsManager.reload(); - tensorflowModelsManager.run(); - - Random random = new Random(); - - try { - // We had an issue where HDFS operations were blocking, so reloading these models - // was finishing at the same time on each instance and after that every time an instance - // was reloading models, it was happening at the same time. This caused issues with HDFS - // load. We now place a "guard" waiting time after each reload so that the execution time - // on every instance is different and these calls can't easily sync to the same point in time. - int sleepSeconds = random.nextInt(30 * 60); - LOG.info("Sleeping for {} seconds", sleepSeconds); - clock.waitFor(sleepSeconds * 1000); - } catch (InterruptedException ex) { - LOG.info("Interrupted while sleeping"); - } - } - - public void setEarlybirdServer(EarlybirdServer earlybirdServer) { - this.earlybirdServer = earlybirdServer; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.docx new file mode 100644 index 000000000..dd564d3e1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.java b/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.java deleted file mode 100644 index b7709008b..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveEarlybirdIndexConfig.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.concurrent.ConcurrentHashMap; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.IndexWriterConfig; -import 
org.apache.lucene.index.KeepOnlyLastCommitDeletionPolicy; -import org.apache.lucene.index.LogByteSizeMergePolicy; -import org.apache.lucene.index.SerialMergeScheduler; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.SearchWhitespaceAnalyzer; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.CloseResourceUtil; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.EarlybirdLuceneIndexSegmentData; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; - -/** - * Base config for the top archive tweet clusters. - */ -public abstract class ArchiveEarlybirdIndexConfig extends EarlybirdIndexConfig { - - private final CloseResourceUtil resourceCloser = new CloseResourceUtil(); - - public ArchiveEarlybirdIndexConfig( - EarlybirdCluster cluster, Decider decider, SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - super(cluster, decider, searchIndexingMetricSet, criticalExceptionHandler); - } - - @Override - public IndexWriterConfig newIndexWriterConfig() { - return new IndexWriterConfig(new SearchWhitespaceAnalyzer()) - .setIndexDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy()) - .setMergeScheduler(new SerialMergeScheduler()) - .setMergePolicy(new LogByteSizeMergePolicy()) - .setRAMBufferSizeMB(IndexWriterConfig.DEFAULT_RAM_PER_THREAD_HARD_LIMIT_MB) - .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH) - .setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND); - } - - @Override - public CloseResourceUtil getResourceCloser() { - return resourceCloser; - } - - @Override - public EarlybirdIndexSegmentData optimize( - EarlybirdIndexSegmentData segmentData) throws IOException { - Preconditions.checkArgument( - 
segmentData instanceof EarlybirdLuceneIndexSegmentData, - "Expected EarlybirdLuceneIndexSegmentData but got %s", - segmentData.getClass()); - EarlybirdLuceneIndexSegmentData data = (EarlybirdLuceneIndexSegmentData) segmentData; - - return new EarlybirdLuceneIndexSegmentData( - data.getLuceneDirectory(), - data.getMaxSegmentSize(), - data.getTimeSliceID(), - data.getSchema(), - true, // isOptimized - data.getSyncData().getSmallestDocID(), - new ConcurrentHashMap<>(data.getPerFieldMap()), - data.getFacetCountingArray(), - data.getDocValuesManager(), - data.getDocIDToTweetIDMapper(), - data.getTimeMapper(), - data.getIndexExtensionsData()); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.docx new file mode 100644 index 000000000..72f11121d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.java b/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.java deleted file mode 100644 index 9fe7f3da2..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveHDFSUtils.java +++ /dev/null @@ -1,173 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.Calendar; -import java.util.Date; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.earlybird.partition.HdfsUtil; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; - - -public final class ArchiveHDFSUtils { - private static final Logger 
LOG = LoggerFactory.getLogger(ArchiveHDFSUtils.class); - - private static final Pattern SEGMENT_NAME_PATTERN = - Pattern.compile("_start_([0-9]+)_p_([0-9]+)_of_([0-9]+)_([0-9]{14}+)_"); - private static final int MATCHER_GROUP_END_DATE = 4; - - private ArchiveHDFSUtils() { - } - - /** - * Check if a given segment already has its indices built on hdfs. - * @return true if the indices exist on hdfs; otherwise, false. - */ - public static boolean hasSegmentIndicesOnHDFS(SegmentSyncConfig sync, SegmentInfo segment) { - LOG.info("checking segment on hdfs: " + segment - + " enabled: " + sync.isSegmentLoadFromHdfsEnabled()); - FileSystem fs = null; - try { - fs = HdfsUtil.getHdfsFileSystem(); - String hdfsBaseDirPrefix = segment.getSyncInfo() - .getHdfsSyncDirPrefix(); - FileStatus[] statuses = fs.globStatus(new Path(hdfsBaseDirPrefix)); - return statuses != null && statuses.length > 0; - } catch (IOException ex) { - LOG.error("Failed checking segment on hdfs: " + segment, ex); - return false; - } finally { - IOUtils.closeQuietly(fs); - } - } - - /** - * Delete the segment index directories on the HDFS. If 'deleteCurrentDir' is true, the - * index directory with the end date matching 'segment' will be deleted. If 'deleteOlderDirs', - * the index directories with the end date earlier than the the segment enddate will be deleted. 
- * - */ - public static void deleteHdfsSegmentDir(SegmentSyncConfig sync, SegmentInfo segment, - boolean deleteCurrentDir, boolean deleteOlderDirs) { - FileSystem fs = null; - try { - fs = HdfsUtil.getHdfsFileSystem(); - String hdfsFlushDir = segment.getSyncInfo().getHdfsFlushDir(); - String hdfsBaseDirPrefix = segment.getSyncInfo() - .getHdfsSyncDirPrefix(); - String endDateStr = extractEndDate(hdfsBaseDirPrefix); - if (endDateStr != null) { - hdfsBaseDirPrefix = hdfsBaseDirPrefix.replace(endDateStr, "*"); - } - String[] hdfsDirs = {segment.getSyncInfo().getHdfsTempFlushDir(), - hdfsBaseDirPrefix}; - for (String hdfsDir : hdfsDirs) { - FileStatus[] statuses = fs.globStatus(new Path(hdfsDir)); - if (statuses != null && statuses.length > 0) { - for (FileStatus status : statuses) { - if (status.getPath().toString().endsWith(hdfsFlushDir)) { - if (deleteCurrentDir) { - fs.delete(status.getPath(), true); - LOG.info("Deleted segment: " + status.getPath()); - } - } else { - if (deleteOlderDirs) { - fs.delete(status.getPath(), true); - LOG.info("Deleted segment: " + status.getPath()); - } - } - } - } - } - } catch (IOException e) { - LOG.error("Error delete Segment Dir :" + segment, e); - } finally { - IOUtils.closeQuietly(fs); - } - } - - /** - * Given a segment, check if there is any indices built on HDFS; if yes, return the end date - * of the index built on HDFS; otherwise, return null. 
- */ - public static Date getSegmentEndDateOnHdfs(SegmentSyncConfig sync, SegmentInfo segment) { - if (sync.isSegmentLoadFromHdfsEnabled()) { - LOG.info("About to check segment on hdfs: " + segment - + " enabled: " + sync.isSegmentLoadFromHdfsEnabled()); - - FileSystem fs = null; - try { - String hdfsBaseDirPrefix = segment.getSyncInfo() - .getHdfsSyncDirPrefix(); - String endDateStr = extractEndDate(hdfsBaseDirPrefix); - if (endDateStr == null) { - return null; - } - hdfsBaseDirPrefix = hdfsBaseDirPrefix.replace(endDateStr, "*"); - - fs = HdfsUtil.getHdfsFileSystem(); - FileStatus[] statuses = fs.globStatus(new Path(hdfsBaseDirPrefix)); - if (statuses != null && statuses.length > 0) { - Path hdfsSyncPath = statuses[statuses.length - 1].getPath(); - String hdfsSyncPathName = hdfsSyncPath.getName(); - endDateStr = extractEndDate(hdfsSyncPathName); - return Segment.getSegmentEndDate(endDateStr); - } - } catch (Exception ex) { - LOG.error("Failed getting segment from hdfs: " + segment, ex); - return null; - } finally { - IOUtils.closeQuietly(fs); - } - } - return null; - } - - private static String extractEndDate(String segmentDirPattern) { - Matcher matcher = SEGMENT_NAME_PATTERN.matcher(segmentDirPattern); - if (!matcher.find()) { - return null; - } - - try { - return matcher.group(MATCHER_GROUP_END_DATE); - } catch (IllegalStateException e) { - LOG.error("Match operation failed: " + segmentDirPattern, e); - return null; - } catch (IndexOutOfBoundsException e) { - LOG.error(" No group in the pattern with the given index : " + segmentDirPattern, e); - return null; - } - } - - /** - * Converts the given date to a path, using the given separator. For example, if the sate is - * January 5, 2019, and the separator is "/", this method will return "2019/01/05". 
- */ - public static String dateToPath(Date date, String separator) { - StringBuilder builder = new StringBuilder(); - Calendar cal = Calendar.getInstance(); - cal.setTime(date); - builder.append(cal.get(Calendar.YEAR)) - .append(separator) - .append(padding(cal.get(Calendar.MONTH) + 1, 2)) - .append(separator) - .append(padding(cal.get(Calendar.DAY_OF_MONTH), 2)); - return builder.toString(); - } - - private static String padding(int value, int len) { - return String.format("%0" + len + "d", value); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.docx new file mode 100644 index 000000000..70c40707d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.java b/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.java deleted file mode 100644 index ad6f3981c..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveOnDiskEarlybirdIndexConfig.java +++ /dev/null @@ -1,79 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.File; -import java.io.IOException; - -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.EarlybirdLuceneIndexSegmentData; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import 
com.twitter.search.earlybird.index.DocValuesBasedTimeMapper; -import com.twitter.search.earlybird.index.DocValuesBasedTweetIDMapper; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentSyncInfo; - -/** - * Index config for the on-disk Tweet clusters. - */ -public class ArchiveOnDiskEarlybirdIndexConfig extends ArchiveEarlybirdIndexConfig { - public ArchiveOnDiskEarlybirdIndexConfig( - Decider decider, SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - super(EarlybirdCluster.FULL_ARCHIVE, decider, searchIndexingMetricSet, - criticalExceptionHandler); - } - - @Override - public boolean isIndexStoredOnDisk() { - return true; - } - - @Override - public Directory newLuceneDirectory(SegmentSyncInfo segmentSyncInfo) throws IOException { - File dirPath = new File(segmentSyncInfo.getLocalLuceneSyncDir()); - return FSDirectory.open(dirPath.toPath()); - } - - @Override - public EarlybirdIndexSegmentData newSegmentData( - int maxSegmentSize, - long timeSliceID, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory) { - return new EarlybirdLuceneIndexSegmentData( - dir, - maxSegmentSize, - timeSliceID, - getSchema(), - new DocValuesBasedTweetIDMapper(), - new DocValuesBasedTimeMapper(), - extensionsFactory); - } - - @Override - public EarlybirdIndexSegmentData loadSegmentData( - FlushInfo flushInfo, - DataDeserializer dataInputStream, - Directory dir, - EarlybirdIndexExtensionsFactory extensionsFactory) throws IOException { - // IO Exception will be thrown if there's an error during load - return (new EarlybirdLuceneIndexSegmentData.OnDiskSegmentDataFlushHandler( - getSchema(), - dir, - extensionsFactory, - new DocValuesBasedTweetIDMapper.FlushHandler(), - new DocValuesBasedTimeMapper.FlushHandler())).load(flushInfo, dataInputStream); - } - - @Override - public boolean supportOutOfOrderIndexing() { - return false; - } -} diff --git 
a/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.docx new file mode 100644 index 000000000..6235bfe3a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.java b/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.java deleted file mode 100644 index 251ceba0b..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveSearchPartitionManager.java +++ /dev/null @@ -1,485 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.Date; -import java.util.List; -import java.util.concurrent.TimeUnit; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.util.GCUtil; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.ServerSetMember; -import com.twitter.search.earlybird.archive.ArchiveTimeSlicer.ArchiveTimeSlice; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.util.ScrubGenUtil; -import com.twitter.search.earlybird.document.TweetDocument; -import 
com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.CompleteSegmentManager; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.PartitionManager; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentHdfsFlusher; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentLoader; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentManager.Filter; -import com.twitter.search.earlybird.partition.SegmentManager.Order; -import com.twitter.search.earlybird.partition.SegmentOptimizer; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.partition.SegmentWarmer; -import com.twitter.search.earlybird.partition.SimpleSegmentIndexer; -import com.twitter.search.earlybird.partition.UserScrubGeoEventStreamIndexer; -import com.twitter.search.earlybird.partition.UserUpdatesStreamIndexer; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.segment.SegmentDataProvider; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdAction; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionInterface; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionLockFailed; - -public class ArchiveSearchPartitionManager extends PartitionManager { - private static final Logger LOG = - LoggerFactory.getLogger(ArchiveSearchPartitionManager.class); - - public static final String CONFIG_NAME = "archive"; - - private static final long ONE_DAY_MILLIS = 
TimeUnit.DAYS.toMillis(1); - - private final ArchiveTimeSlicer timeSlicer; - private final ArchiveSegmentDataProvider segmentDataProvider; - - private final UserUpdatesStreamIndexer userUpdatesStreamIndexer; - private final UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer; - - private final SegmentWarmer segmentWarmer; - private final EarlybirdIndexConfig earlybirdIndexConfig; - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final Clock clock; - private final SegmentSyncConfig segmentSyncConfig; - protected final SearchCounter gcAfterIndexing; - - // Used for coordinating daily updated across different replicas on the same hash partition, - // to run them one at a time, and minimize the impact on query latencies. - private final CoordinatedEarlybirdActionInterface coordinatedDailyUpdate; - - private final SearchIndexingMetricSet indexingMetricSet; - - // This is only used in tests where no coordination is needed. - @VisibleForTesting - public ArchiveSearchPartitionManager( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - QueryCacheManager queryCacheManager, - SegmentManager segmentManager, - DynamicPartitionConfig dynamicPartitionConfig, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - SearchStatsReceiver searchStatsReceiver, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig, - ScheduledExecutorServiceFactory executorServiceFactory, - ScheduledExecutorServiceFactory userUpdateIndexerScheduledExecutorFactory, - SearchIndexingMetricSet searchIndexingMetricSet, - SegmentSyncConfig syncConfig, - Clock clock, - CriticalExceptionHandler criticalExceptionHandler) - throws IOException { - this( - zooKeeperTryLockFactory, - queryCacheManager, - segmentManager, - dynamicPartitionConfig, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - searchStatsReceiver, - earlybirdIndexConfig, - null, - executorServiceFactory, - 
userUpdateIndexerScheduledExecutorFactory, - searchIndexingMetricSet, - syncConfig, - clock, - criticalExceptionHandler); - } - - public ArchiveSearchPartitionManager( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - QueryCacheManager queryCacheManager, - SegmentManager segmentManager, - DynamicPartitionConfig dynamicPartitionConfig, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - SearchStatsReceiver searchStatsReceiver, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig, - ServerSetMember serverSetMember, - ScheduledExecutorServiceFactory executorServiceFactory, - ScheduledExecutorServiceFactory userUpdateIndexerExecutorFactory, - SearchIndexingMetricSet searchIndexingMetricSet, - SegmentSyncConfig syncConfig, - Clock clock, - CriticalExceptionHandler criticalExceptionHandler) throws IOException { - super(queryCacheManager, segmentManager, dynamicPartitionConfig, executorServiceFactory, - searchIndexingMetricSet, searchStatsReceiver, criticalExceptionHandler); - - Preconditions.checkState(syncConfig.getScrubGen().isPresent()); - Date scrubGen = ScrubGenUtil.parseScrubGenToDate(syncConfig.getScrubGen().get()); - - this.zkTryLockFactory = zooKeeperTryLockFactory; - final DailyStatusBatches dailyStatusBatches = new DailyStatusBatches( - zkTryLockFactory, - scrubGen); - this.earlybirdIndexConfig = earlybirdIndexConfig; - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - - this.indexingMetricSet = searchIndexingMetricSet; - - this.timeSlicer = new ArchiveTimeSlicer( - EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, - curPartitionConfig.getTierStartDate(), curPartitionConfig.getTierEndDate(), - earlybirdIndexConfig); - this.segmentDataProvider = - new ArchiveSegmentDataProvider( - dynamicPartitionConfig, - timeSlicer, - this.earlybirdIndexConfig); - - this.userUpdatesStreamIndexer = userUpdatesStreamIndexer; - this.userScrubGeoEventStreamIndexer 
= userScrubGeoEventStreamIndexer; - - this.coordinatedDailyUpdate = new CoordinatedEarlybirdAction( - zkTryLockFactory, - "archive_daily_update", - dynamicPartitionConfig, - serverSetMember, - criticalExceptionHandler, - syncConfig); - - this.segmentWarmer = new SegmentWarmer(criticalExceptionHandler); - this.clock = clock; - this.segmentSyncConfig = syncConfig; - this.gcAfterIndexing = SearchCounter.export("gc_after_indexing"); - } - - @Override - public SegmentDataProvider getSegmentDataProvider() { - return segmentDataProvider; - } - - @Override - protected void startUp() throws Exception { - LOG.info("Using CompleteSegmentManager to index complete segments."); - - // deferring handling of multi-segment term dictionary for the archive. - // SEARCH-11952 - CompleteSegmentManager completeSegmentManager = new CompleteSegmentManager( - zkTryLockFactory, - segmentDataProvider, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - segmentManager, - null, - indexingMetricSet, - clock, - MultiSegmentTermDictionaryManager.NOOP_INSTANCE, - segmentSyncConfig, - criticalExceptionHandler); - - completeSegmentManager.indexUserEvents(); - completeSegmentManager.indexCompleteSegments( - () -> segmentManager.getSegmentInfos(Filter.NeedsIndexing, Order.OLD_TO_NEW)); - - // In the archive cluster, the current segment needs to be loaded too. 
- List allSegments = - Lists.newArrayList(segmentManager.getSegmentInfos(Filter.All, Order.OLD_TO_NEW)); - completeSegmentManager.loadCompleteSegments(allSegments); - - completeSegmentManager.buildMultiSegmentTermDictionary(); - - completeSegmentManager.warmSegments(allSegments); - - LOG.info("Starting to run UserUpdatesKafkaConsumer"); - new Thread(userUpdatesStreamIndexer::run, "userupdates-stream-indexer").start(); - - if (EarlybirdConfig.consumeUserScrubGeoEvents()) { - LOG.info("Starting to run UserScrubGeoEventKafkaConsumer"); - new Thread(userScrubGeoEventStreamIndexer::run, - "userScrubGeoEvent-stream-indexer").start(); - } - } - - private static List truncateSegmentList(List segmentList, - int maxNumSegments) { - // Maybe cut-off the beginning of the sorted list of IDs. - if (maxNumSegments > 0 && maxNumSegments < segmentList.size()) { - return segmentList.subList(segmentList.size() - maxNumSegments, segmentList.size()); - } else { - return segmentList; - } - } - - - @Override - protected void indexingLoop(boolean firstLoop) throws Exception { - if (firstLoop) { - EarlybirdStatus.beginEvent( - INDEX_CURRENT_SEGMENT, getSearchIndexingMetricSet().startupInCurrentSegment); - } - - List timeSlices = timeSlicer.getTimeSlicesInTierRange(); - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - timeSlices = truncateSegmentList(timeSlices, curPartitionConfig.getMaxEnabledLocalSegments()); - - for (final ArchiveTimeSlice timeSlice : timeSlices) { - // If any timeslice build failed, do not try to build timeslice after that to prevent - // possible holes between timeslices. 
- try { - if (!processArchiveTimeSlice(timeSlice)) { - LOG.warn("Building timeslice {} has failed, stopping future builds.", - timeSlice.getDescription()); - indexingMetricSet.archiveTimeSliceBuildFailedCounter.increment(); - return; - } - } catch (CoordinatedEarlybirdActionLockFailed e) { - // If the timeslice build failed because of lock coordination, we can wait for the next - // iteration to build again. - return; - } - } - - if (firstLoop) { - EarlybirdStatus.endEvent( - INDEX_CURRENT_SEGMENT, getSearchIndexingMetricSet().startupInCurrentSegment); - LOG.info("First indexing loop complete. Setting up query cache..."); - EarlybirdStatus.beginEvent( - SETUP_QUERY_CACHE, getSearchIndexingMetricSet().startupInQueryCacheUpdates); - } - setupQueryCacheIfNeeded(); - - if (EarlybirdStatus.isStarting() && queryCacheManager.allTasksRan()) { - LOG.info("Query cache setup complete. Becoming current now..."); - EarlybirdStatus.endEvent( - SETUP_QUERY_CACHE, getSearchIndexingMetricSet().startupInQueryCacheUpdates); - - becomeCurrent(); - EarlybirdStatus.recordEarlybirdEvent("Archive Earlybird is current"); - } - - updateIndexFreshnessStats(timeSlices); - } - - @VisibleForTesting - protected boolean processArchiveTimeSlice(final ArchiveTimeSlice timeSlice) - throws CoordinatedEarlybirdActionLockFailed, IOException { - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - long minStatusID = timeSlice.getMinStatusID(curPartitionConfig.getIndexingHashPartitionID()); - SegmentInfo segmentInfo = segmentManager.getSegmentInfo(minStatusID); - if (segmentInfo == null) { - return indexSegmentFromScratch(timeSlice); - } else if (existingSegmentNeedsUpdating(timeSlice, segmentInfo)) { - return indexNewDayAndAppendExistingSegment(timeSlice, segmentInfo); - } - return true; - } - - - @VisibleForTesting - SegmentInfo newSegmentInfo(ArchiveTimeSlice timeSlice) throws IOException { - return new SegmentInfo(segmentDataProvider.newArchiveSegment(timeSlice), 
- segmentManager.getEarlybirdSegmentFactory(), segmentSyncConfig); - } - - private boolean indexNewDayAndAppendExistingSegment(final ArchiveTimeSlice timeSlice, - SegmentInfo segmentInfo) - throws CoordinatedEarlybirdActionLockFailed, IOException { - - LOG.info("Updating segment: {}; new endDate will be {} segmentInfo: {}", - segmentInfo.getSegment().getTimeSliceID(), timeSlice.getEndDate(), segmentInfo); - - // Create another new SegmentInfo for indexing - final SegmentInfo newSegmentInfoForIndexing = newSegmentInfo(timeSlice); - // make a final reference of the old segment info to be passed into closure. - final SegmentInfo oldSegmentInfo = segmentInfo; - - // Sanity check: the old and new segment should not share the same lucene directory. - Preconditions.checkState( - !newSegmentInfoForIndexing.getSyncInfo().getLocalLuceneSyncDir().equals( - oldSegmentInfo.getSyncInfo().getLocalLuceneSyncDir())); - - Preconditions.checkState( - !newSegmentInfoForIndexing.getSyncInfo().getLocalSyncDir().equals( - oldSegmentInfo.getSyncInfo().getLocalSyncDir())); - - final ArchiveSegment oldSegment = (ArchiveSegment) segmentInfo.getSegment(); - - return indexSegment(newSegmentInfoForIndexing, oldSegmentInfo, input -> { - // we're updating the segment - only index days after the old end date, but only if - // we're in the on-disk archive, and we're sure that the previous days have already - // been indexed. 
- return !earlybirdIndexConfig.isIndexStoredOnDisk() - // First time around, and the segment has not been indexed and optimized yet, - // we will want to add all the days - || !oldSegmentInfo.isOptimized() - || oldSegmentInfo.getIndexSegment().getIndexStats().getStatusCount() == 0 - || !oldSegment.getDataEndDate().before(timeSlice.getEndDate()) - // Index any new days - || input.after(oldSegment.getDataEndDate()); - }); - } - - private boolean existingSegmentNeedsUpdating(ArchiveTimeSlice timeSlice, - SegmentInfo segmentInfo) { - return ((ArchiveSegment) segmentInfo.getSegment()) - .getDataEndDate().before(timeSlice.getEndDate()) - // First time around, the end date is the same as the timeSlice end date, but - // the segment has not been indexed and optimized yet - || (!segmentInfo.isOptimized() && !segmentInfo.wasIndexed()) - // If indexing failed, this index will not be marked as complete, and we will want - // to reindex - || !segmentInfo.isComplete(); - } - - private boolean indexSegmentFromScratch(ArchiveTimeSlice timeSlice) throws - CoordinatedEarlybirdActionLockFailed, IOException { - - SegmentInfo segmentInfo = newSegmentInfo(timeSlice); - LOG.info("Creating segment: " + segmentInfo.getSegment().getTimeSliceID() - + "; new endDate will be " + timeSlice.getEndDate() + " segmentInfo: " + segmentInfo); - - return indexSegment(segmentInfo, null, ArchiveSegment.MATCH_ALL_DATE_PREDICATE); - } - - private void updateIndexFreshnessStats(List timeSlices) { - if (!timeSlices.isEmpty()) { - ArchiveTimeSlice lastTimeslice = timeSlices.get(timeSlices.size() - 1); - - // Add ~24 hours to start of end date to estimate freshest tweet time. 
- indexingMetricSet.freshestTweetTimeMillis.set( - lastTimeslice.getEndDate().getTime() + ONE_DAY_MILLIS); - - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - long maxStatusId = lastTimeslice.getMaxStatusID( - curPartitionConfig.getIndexingHashPartitionID()); - if (maxStatusId > indexingMetricSet.highestStatusId.get()) { - indexingMetricSet.highestStatusId.set(maxStatusId); - } - } - } - - @Override - public void shutDownIndexing() { - LOG.info("Shutting down."); - userUpdatesStreamIndexer.close(); - userScrubGeoEventStreamIndexer.close(); - LOG.info("Closed User Event Kafka Consumers. Now Shutting down reader set."); - getSegmentDataProvider().getSegmentDataReaderSet().stopAll(); - } - - /** - * Attempts to index new days of data into the provided segment, indexing only the days that - * match the "dateFilter" predicate. - * @return true iff indexing succeeded, false otherwise. - */ - @VisibleForTesting - protected boolean indexSegment(final SegmentInfo segmentInfo, - @Nullable final SegmentInfo segmentToAppend, - final Predicate dateFilter) - throws CoordinatedEarlybirdActionLockFailed, IOException { - // Don't coordinate while we're starting up - if (!EarlybirdStatus.isStarting()) { - return coordinatedDailyUpdate.execute(segmentInfo.getSegmentName(), - isCoordinated -> innerIndexSegment(segmentInfo, segmentToAppend, dateFilter)); - } else { - return innerIndexSegment(segmentInfo, segmentToAppend, dateFilter); - } - } - - private boolean innerIndexSegment(SegmentInfo segmentInfo, - @Nullable SegmentInfo segmentToAppend, - Predicate dateFilter) - throws IOException { - - // First try to load the new day from HDFS / Local disk - if (new SegmentLoader(segmentSyncConfig, criticalExceptionHandler).load(segmentInfo)) { - LOG.info("Successful loaded segment for new day: " + segmentInfo); - segmentManager.putSegmentInfo(segmentInfo); - gcAfterIndexing.increment(); - GCUtil.runGC(); - return true; - } - - LOG.info("Failed to 
load segment for new day. Will index segment: " + segmentInfo); - RecordReader tweetReader = ((ArchiveSegment) segmentInfo.getSegment()) - .getStatusRecordReader(earlybirdIndexConfig.createDocumentFactory(), dateFilter); - try { - // Read and index the statuses - boolean success = newSimpleSegmentIndexer(tweetReader, segmentToAppend) - .indexSegment(segmentInfo); - if (!success) { - return false; - } - } finally { - tweetReader.stop(); - } - - if (!SegmentOptimizer.optimize(segmentInfo)) { - // We consider the whole indexing event as failed if we fail to optimize. - LOG.error("Failed to optimize segment: " + segmentInfo); - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - return false; - } - - if (!segmentWarmer.warmSegmentIfNecessary(segmentInfo)) { - // We consider the whole indexing event as failed if we failed to warm (because we open - // index readers in the warmer). - LOG.error("Failed to warm segment: " + segmentInfo); - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - return false; - } - - // Flush and upload segment to HDFS. If this fails, we just log a warning and return true. 
- boolean success = new SegmentHdfsFlusher(zkTryLockFactory, segmentSyncConfig) - .flushSegmentToDiskAndHDFS(segmentInfo); - if (!success) { - LOG.warn("Failed to flush segment to HDFS: " + segmentInfo); - } - - segmentManager.putSegmentInfo(segmentInfo); - gcAfterIndexing.increment(); - GCUtil.runGC(); - return true; - } - - @VisibleForTesting - protected SimpleSegmentIndexer newSimpleSegmentIndexer( - RecordReader tweetReader, SegmentInfo segmentToAppend) { - return new SimpleSegmentIndexer(tweetReader, indexingMetricSet, segmentToAppend); - } - - @Override - public boolean isCaughtUpForTests() { - return EarlybirdStatus.getStatusCode() == EarlybirdStatusCode.CURRENT; - } - - public CoordinatedEarlybirdActionInterface getCoordinatedOptimizer() { - return this.coordinatedDailyUpdate; - } - - public ArchiveTimeSlicer getTimeSlicer() { - return timeSlicer; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.docx new file mode 100644 index 000000000..6c1acebfb Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.java b/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.java deleted file mode 100644 index 9d18a48e4..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveSegment.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.Date; - -import com.google.common.base.Predicate; -import com.google.common.base.Predicates; - -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.partitioning.base.TimeSlice; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import 
com.twitter.search.earlybird.archive.ArchiveTimeSlicer.ArchiveTimeSlice; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; - -public class ArchiveSegment extends Segment { - private final ArchiveTimeSlice archiveTimeSlice; - - public static final Predicate MATCH_ALL_DATE_PREDICATE = input -> true; - - // Constructor used for indexing an archive segment - public ArchiveSegment(ArchiveTimeSlice archiveTimeSlice, - int hashPartitionID, - int maxSegmentSize) { - super(new TimeSlice(archiveTimeSlice.getMinStatusID(hashPartitionID), - maxSegmentSize, hashPartitionID, - archiveTimeSlice.getNumHashPartitions()), - archiveTimeSlice.getEndDate().getTime()); - this.archiveTimeSlice = archiveTimeSlice; - } - - /** - * Constructor used for loading a flushed segment. Only be used by SegmentBuilder; Earlybird - * does not use this. - */ - ArchiveSegment(long timeSliceId, - int maxSegmentSize, - int partitions, - int hashPartitionID, - Date dataEndDate) { - super(new TimeSlice(timeSliceId, maxSegmentSize, hashPartitionID, partitions), - dataEndDate.getTime()); - // No archive timeslice is needed for loading. - this.archiveTimeSlice = null; - } - - /** - * Returns the tweets reader for this segment. - * - * @param documentFactory The factory that converts ThriftDocuments to Lucene documents. - */ - public RecordReader getStatusRecordReader( - DocumentFactory documentFactory) throws IOException { - return getStatusRecordReader(documentFactory, Predicates.alwaysTrue()); - } - - /** - * Returns the tweets reader for this segment. - * - * @param documentFactory The factory that converts ThriftDocuments to Lucene documents. - * @param filter A predicate that filters tweets based on the date they were created on. 
- */ - public RecordReader getStatusRecordReader( - DocumentFactory documentFactory, - Predicate filter) throws IOException { - if (archiveTimeSlice != null) { - return archiveTimeSlice.getStatusReader(this, documentFactory, filter); - } else { - throw new IllegalStateException("ArchiveSegment has no associated ArchiveTimeslice." - + "This ArchiveSegment can only be used for loading flushed segments."); - } - } - - public Date getDataEndDate() { - return archiveTimeSlice == null - ? new Date(getDataEndDateInclusiveMillis()) : archiveTimeSlice.getEndDate(); - } - - public ArchiveTimeSlice getArchiveTimeSlice() { - return archiveTimeSlice; - } - - @Override - public String toString() { - return super.toString() + " " + archiveTimeSlice.getDescription(); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.docx new file mode 100644 index 000000000..8a8cd531d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.java b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.java deleted file mode 100644 index 07b44df5c..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentDataProvider.java +++ /dev/null @@ -1,84 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.List; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.archive.ArchiveTimeSlicer.ArchiveTimeSlice; -import 
com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.segment.EmptySegmentDataReaderSet; -import com.twitter.search.earlybird.segment.SegmentDataProvider; -import com.twitter.search.earlybird.segment.SegmentDataReaderSet; - -public class ArchiveSegmentDataProvider implements SegmentDataProvider { - private static final org.slf4j.Logger LOG = - org.slf4j.LoggerFactory.getLogger(ArchiveSegmentDataProvider.class); - - private DynamicPartitionConfig dynamicPartitionConfig; - private final ArchiveTimeSlicer timeSlicer; - - private final DocumentFactory documentFactory; - - private final SegmentDataReaderSet readerSet; - - public ArchiveSegmentDataProvider( - DynamicPartitionConfig dynamicPartitionConfig, - ArchiveTimeSlicer timeSlicer, - EarlybirdIndexConfig earlybirdIndexConfig) throws IOException { - this.dynamicPartitionConfig = dynamicPartitionConfig; - this.timeSlicer = timeSlicer; - this.readerSet = createSegmentDataReaderSet(); - this.documentFactory = earlybirdIndexConfig.createDocumentFactory(); - } - - @Override - public List newSegmentList() throws IOException { - List timeSlices = timeSlicer.getTimeSlicesInTierRange(); - if (timeSlices == null || timeSlices.isEmpty()) { - return Lists.newArrayList(); - } - List segments = Lists.newArrayListWithCapacity(timeSlices.size()); - for (ArchiveTimeSlice timeSlice : timeSlices) { - segments.add(newArchiveSegment(timeSlice)); - } - return segments; - } - - /** - * Creates a new Segment instance for the given timeslice. 
- */ - public ArchiveSegment newArchiveSegment(ArchiveTimeSlice archiveTimeSlice) { - return new ArchiveSegment( - archiveTimeSlice, - dynamicPartitionConfig.getCurrentPartitionConfig().getIndexingHashPartitionID(), - EarlybirdConfig.getMaxSegmentSize()); - } - - @Override - public SegmentDataReaderSet getSegmentDataReaderSet() { - return readerSet; - } - - private EmptySegmentDataReaderSet createSegmentDataReaderSet() throws IOException { - return new EmptySegmentDataReaderSet() { - - @Override - public RecordReader newDocumentReader(SegmentInfo segmentInfo) - throws IOException { - Segment segment = segmentInfo.getSegment(); - Preconditions.checkArgument(segment instanceof ArchiveSegment); - return ((ArchiveSegment) segment).getStatusRecordReader(documentFactory); - } - }; - } -}