694 lines
28 KiB
Java
694 lines
28 KiB
Java
package com.twitter.search.common.schema;
|
|
|
|
import java.util.Map;
|
|
import java.util.Set;
|
|
import javax.annotation.Nullable;
|
|
|
|
import com.google.common.base.Preconditions;
|
|
import com.google.common.collect.ImmutableList;
|
|
import com.google.common.collect.Sets;
|
|
|
|
import com.twitter.common.text.util.CharSequenceTermAttributeSerializer;
|
|
import com.twitter.common.text.util.PositionIncrementAttributeSerializer;
|
|
import com.twitter.common.text.util.TokenStreamSerializer;
|
|
import com.twitter.common.text.util.TokenTypeAttributeSerializer;
|
|
import com.twitter.search.common.schema.base.FeatureConfiguration;
|
|
import com.twitter.search.common.schema.base.FieldNameToIdMapping;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftCSFFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftCSFViewSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFacetFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFeatureUpdateConstraint;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFieldConfiguration;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftFixedLengthCSFSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftIndexOptions;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftIndexedFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftIndexedNumericFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftNumericType;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftSchema;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftSearchFieldSettings;
|
|
import com.twitter.search.common.schema.thriftjava.ThriftTokenStreamSerializer;
|
|
import com.twitter.search.common.util.analysis.CharTermAttributeSerializer;
|
|
import com.twitter.search.common.util.analysis.IntTermAttributeSerializer;
|
|
import com.twitter.search.common.util.analysis.LongTermAttributeSerializer;
|
|
import com.twitter.search.common.util.analysis.PayloadAttributeSerializer;
|
|
|
|
public class SchemaBuilder {
|
|
|
|
public static final String CSF_VIEW_NAME_SEPARATOR = ".";
|
|
protected final ThriftSchema schema = new ThriftSchema();
|
|
protected final FieldNameToIdMapping idMapping;
|
|
protected final int tokenStreamSerializerVersion;
|
|
|
|
// As of now, we do not allow two fields to share the same field name.
|
|
// This set is used to perform this check.
|
|
private final Set<String> fieldNameSet = Sets.newHashSet();
|
|
|
|
/**
 * Creates a builder that assembles a ThriftSchema incrementally.
 *
 * @param idMapping mapping used to resolve field names to field IDs
 * @param tokenStreamSerializerVersion token stream serializer version; only {@code VERSION_2}
 *     is accepted, and its ordinal is the value this builder stores
 * @throws IllegalArgumentException if the version is anything other than {@code VERSION_2}
 */
public SchemaBuilder(FieldNameToIdMapping idMapping,
    TokenStreamSerializer.Version tokenStreamSerializerVersion) {
  boolean supportedVersion =
      TokenStreamSerializer.Version.VERSION_2 == tokenStreamSerializerVersion;
  this.idMapping = idMapping;
  Preconditions.checkArgument(supportedVersion);
  this.tokenStreamSerializerVersion = tokenStreamSerializerVersion.ordinal();
}
|
|
|
|
/**
|
|
* Build ThriftSchema using settings accumulated so far.
|
|
*/
|
|
public final ThriftSchema build() {
|
|
return schema;
|
|
}
|
|
|
|
/**
|
|
* Uses fieldName also as facetName.
|
|
*/
|
|
public final SchemaBuilder withFacetConfigs(String fieldName,
|
|
boolean storeSkipList,
|
|
boolean storeOffensiveCounters,
|
|
boolean useCSFForFacetCounting) {
|
|
return withFacetConfigs(
|
|
fieldName,
|
|
fieldName,
|
|
storeSkipList,
|
|
storeOffensiveCounters,
|
|
useCSFForFacetCounting);
|
|
}
|
|
|
|
/**
|
|
* Add facet field configuration.
|
|
*/
|
|
public final SchemaBuilder withFacetConfigs(String fieldName,
|
|
String facetName,
|
|
boolean storeSkipList,
|
|
boolean storeOffensiveCounters,
|
|
boolean useCSFForFacetCounting) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFacetFieldSettings facetSettings = new ThriftFacetFieldSettings();
|
|
// As of now, all our facet names are the same as field names
|
|
facetSettings.setFacetName(facetName);
|
|
facetSettings.setStoreSkiplist(storeSkipList);
|
|
facetSettings.setStoreOffensiveCounters(storeOffensiveCounters);
|
|
facetSettings.setUseCSFForFacetCounting(useCSFForFacetCounting);
|
|
|
|
int fieldId = idMapping.getFieldID(fieldName);
|
|
ThriftFieldConfiguration fieldConfiguration = schema.getFieldConfigs().get(fieldId);
|
|
Preconditions.checkNotNull(fieldConfiguration,
|
|
"In Earlybird, a facet field must be indexed. "
|
|
+ "No ThriftIndexedFieldSettings found for field " + fieldName);
|
|
fieldConfiguration.getSettings().setFacetFieldSettings(facetSettings);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Configure the given field ID to be used for partitioning.
|
|
*/
|
|
public final SchemaBuilder withPartitionFieldId(int partitionFieldId) {
|
|
schema.setPartitionFieldId(partitionFieldId);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add a column stride field into schema.
|
|
*/
|
|
public final SchemaBuilder withColumnStrideField(String fieldName,
|
|
ThriftCSFType type,
|
|
int numValuesPerDoc,
|
|
boolean updatable,
|
|
boolean loadIntoRam) {
|
|
return withColumnStrideField(fieldName, type, numValuesPerDoc, updatable, loadIntoRam, null);
|
|
}
|
|
|
|
/**
|
|
* Add a column stride field into schema that is variable length.
|
|
*/
|
|
public final SchemaBuilder withBinaryColumnStrideField(String fieldName,
|
|
boolean loadIntoRam) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings();
|
|
csfFieldSettings.setCsfType(ThriftCSFType.BYTE)
|
|
.setVariableLength(true)
|
|
.setLoadIntoRAM(loadIntoRam);
|
|
|
|
ThriftFieldSettings fieldSettings =
|
|
new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings);
|
|
ThriftFieldConfiguration fieldConf =
|
|
new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings);
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add a column stride field into schema which has a default value.
|
|
*/
|
|
public final SchemaBuilder withColumnStrideField(String fieldName,
|
|
ThriftCSFType type,
|
|
int numValuesPerDoc,
|
|
boolean updatable,
|
|
boolean loadIntoRam,
|
|
Long defaultValue) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftCSFFieldSettings csfFieldSettings = new ThriftCSFFieldSettings();
|
|
csfFieldSettings.setCsfType(type)
|
|
.setVariableLength(false)
|
|
.setFixedLengthSettings(
|
|
new ThriftFixedLengthCSFSettings()
|
|
.setNumValuesPerDoc(numValuesPerDoc)
|
|
.setUpdateable(updatable))
|
|
.setLoadIntoRAM(loadIntoRam);
|
|
|
|
if (defaultValue != null) {
|
|
csfFieldSettings.setDefaultValue(defaultValue);
|
|
}
|
|
|
|
ThriftFieldSettings fieldSettings =
|
|
new ThriftFieldSettings().setCsfFieldSettings(csfFieldSettings);
|
|
ThriftFieldConfiguration fieldConf =
|
|
new ThriftFieldConfiguration(fieldName).setSettings(fieldSettings);
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add a CSF view into schema. A view is a portion of another CSF.
|
|
*/
|
|
public final SchemaBuilder withColumnStrideFieldView(
|
|
String fieldName,
|
|
ThriftCSFType csfType,
|
|
ThriftCSFType outputCSFType,
|
|
String baseFieldName,
|
|
int valueIndex,
|
|
int bitStartPosition,
|
|
int bitLength,
|
|
ThriftFeatureNormalizationType featureNormalizationType,
|
|
@Nullable Set<ThriftFeatureUpdateConstraint> constraints) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
|
|
int baseFieldConfigID = idMapping.getFieldID(baseFieldName);
|
|
|
|
ThriftCSFViewSettings csfViewSettings = new ThriftCSFViewSettings()
|
|
.setBaseFieldConfigId(baseFieldConfigID)
|
|
.setCsfType(csfType)
|
|
.setValueIndex(valueIndex)
|
|
.setBitStartPosition(bitStartPosition)
|
|
.setBitLength(bitLength);
|
|
if (outputCSFType != null) {
|
|
csfViewSettings.setOutputCSFType(outputCSFType);
|
|
}
|
|
if (featureNormalizationType != ThriftFeatureNormalizationType.NONE) {
|
|
csfViewSettings.setNormalizationType(featureNormalizationType);
|
|
}
|
|
if (constraints != null) {
|
|
csfViewSettings.setFeatureUpdateConstraints(constraints);
|
|
}
|
|
ThriftFieldSettings fieldSettings = new ThriftFieldSettings()
|
|
.setCsfViewSettings(csfViewSettings);
|
|
ThriftFieldConfiguration fieldConf = new ThriftFieldConfiguration(fieldName)
|
|
.setSettings(fieldSettings);
|
|
|
|
Map<Integer, ThriftFieldConfiguration> fieldConfigs = schema.getFieldConfigs();
|
|
verifyCSFViewSettings(fieldConfigs, fieldConf);
|
|
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), fieldConf);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Sanity checks for CSF view settings.
|
|
*/
|
|
public static void verifyCSFViewSettings(Map<Integer, ThriftFieldConfiguration> fieldConfigs,
|
|
ThriftFieldConfiguration fieldConf) {
|
|
Preconditions.checkNotNull(fieldConf.getSettings());
|
|
Preconditions.checkNotNull(fieldConf.getSettings().getCsfViewSettings());
|
|
ThriftCSFViewSettings csfViewSettings = fieldConf.getSettings().getCsfViewSettings();
|
|
|
|
if (fieldConfigs != null) {
|
|
ThriftFieldConfiguration baseFieldConfig = fieldConfigs.get(
|
|
csfViewSettings.getBaseFieldConfigId());
|
|
if (baseFieldConfig != null) {
|
|
String baseFieldName = baseFieldConfig.getFieldName();
|
|
String expectedViewNamePrefix = baseFieldName + CSF_VIEW_NAME_SEPARATOR;
|
|
if (fieldConf.getFieldName().startsWith(expectedViewNamePrefix)) {
|
|
ThriftFieldSettings baseFieldSettings = baseFieldConfig.getSettings();
|
|
ThriftCSFFieldSettings baseFieldCSFSettings = baseFieldSettings.getCsfFieldSettings();
|
|
|
|
if (baseFieldCSFSettings != null) {
|
|
if (!baseFieldCSFSettings.isVariableLength()
|
|
&& baseFieldCSFSettings.getFixedLengthSettings() != null) {
|
|
|
|
ThriftCSFType baseCSFType = baseFieldCSFSettings.getCsfType();
|
|
switch (baseCSFType) {
|
|
case BYTE:
|
|
checkCSFViewPositions(baseFieldCSFSettings, 8, csfViewSettings);
|
|
break;
|
|
case INT:
|
|
checkCSFViewPositions(baseFieldCSFSettings, 32, csfViewSettings);
|
|
break;
|
|
default:
|
|
throw new IllegalStateException("Base field: " + baseFieldName
|
|
+ " is of a non-supported CSFType: " + baseCSFType);
|
|
}
|
|
} else {
|
|
throw new IllegalStateException("Base field: " + baseFieldName
|
|
+ " must be a fixed-length CSF field");
|
|
}
|
|
} else {
|
|
throw new IllegalStateException("Base field: " + baseFieldName + " is not a CSF field");
|
|
}
|
|
} else {
|
|
throw new IllegalStateException("View field name for baseFieldConfigID: "
|
|
+ csfViewSettings.getBaseFieldConfigId() + " must start with: '"
|
|
+ expectedViewNamePrefix + "'");
|
|
}
|
|
} else {
|
|
throw new IllegalStateException("Can't add a view, no field defined for base fieldID: "
|
|
+ csfViewSettings.getBaseFieldConfigId());
|
|
}
|
|
} else {
|
|
throw new IllegalStateException("Can't add a view, no field configs defined.");
|
|
}
|
|
}
|
|
|
|
private static void checkCSFViewPositions(ThriftCSFFieldSettings baseFieldCSFSettings,
|
|
int bitsPerValue,
|
|
ThriftCSFViewSettings csfViewSettings) {
|
|
ThriftFixedLengthCSFSettings fixedLengthCSFSettings =
|
|
baseFieldCSFSettings.getFixedLengthSettings();
|
|
Preconditions.checkNotNull(fixedLengthCSFSettings);
|
|
|
|
int numValues = fixedLengthCSFSettings.getNumValuesPerDoc();
|
|
Preconditions.checkState(csfViewSettings.getValueIndex() >= 0,
|
|
"value index must be positive: " + csfViewSettings.getValueIndex());
|
|
Preconditions.checkState(csfViewSettings.getValueIndex() < numValues, "value index "
|
|
+ csfViewSettings.getValueIndex() + " must be less than numValues: " + numValues);
|
|
|
|
Preconditions.checkState(csfViewSettings.getBitStartPosition() >= 0,
|
|
"bitStartPosition must be positive: " + csfViewSettings.getBitStartPosition());
|
|
Preconditions.checkState(csfViewSettings.getBitStartPosition() < bitsPerValue,
|
|
"bitStartPosition " + csfViewSettings.getBitStartPosition()
|
|
+ " must be less than bitsPerValue " + bitsPerValue);
|
|
|
|
Preconditions.checkState(csfViewSettings.getBitLength() >= 1,
|
|
"bitLength must be positive: " + csfViewSettings.getBitLength());
|
|
|
|
Preconditions.checkState(
|
|
csfViewSettings.getBitStartPosition() + csfViewSettings.getBitLength() <= bitsPerValue,
|
|
String.format("bitStartPosition (%d) + bitLength (%d) must be less than bitsPerValue (%d)",
|
|
csfViewSettings.getBitStartPosition(), csfViewSettings.getBitLength(), bitsPerValue));
|
|
}
|
|
|
|
// No position; no freq; not pretokenized; not tokenized.
|
|
/**
|
|
* Norm is disabled as default. Like Lucene string field, or int/long fields.
|
|
*/
|
|
public final SchemaBuilder withIndexedNotTokenizedField(String fieldName) {
|
|
return withIndexedNotTokenizedField(fieldName, false);
|
|
}
|
|
|
|
/**
|
|
* Add an indexed but not tokenized field. This is similar to Lucene's StringField.
|
|
*/
|
|
public final SchemaBuilder withIndexedNotTokenizedField(String fieldName,
|
|
boolean supportOutOfOrderAppends) {
|
|
return withIndexedNotTokenizedField(fieldName, supportOutOfOrderAppends, true);
|
|
}
|
|
|
|
private final SchemaBuilder withIndexedNotTokenizedField(String fieldName,
|
|
boolean supportOutOfOrderAppends,
|
|
boolean omitNorms) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings settings = getNoPositionNoFreqSettings(supportOutOfOrderAppends);
|
|
settings.getIndexedFieldSettings().setOmitNorms(omitNorms);
|
|
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
|
|
.setSettings(settings);
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
|
|
return this;
|
|
}
|
|
|
|
|
|
/** Makes the given field searchable by default, with the given weight. */
|
|
public final SchemaBuilder withSearchFieldByDefault(
|
|
String fieldName, float textSearchableFieldWeight) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
|
|
ThriftFieldSettings settings =
|
|
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
|
|
settings.setSearchFieldSettings(
|
|
new ThriftSearchFieldSettings()
|
|
.setTextSearchableFieldWeight(textSearchableFieldWeight)
|
|
.setTextDefaultSearchable(true));
|
|
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Similar to Lucene's TextField. The string is analyzed using the default/override analyzer.
|
|
* @param fieldName
|
|
* @param addHfPairIfHfFieldsArePresent Add hfPair fields if they exists in the schema.
|
|
* For certain text fields, adding hfPair fields are usually preferred, but they may
|
|
* not exist in the schema, in which case the hfPair fields will not be added.
|
|
*/
|
|
public final SchemaBuilder withTextField(String fieldName,
|
|
boolean addHfPairIfHfFieldsArePresent) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName).setSettings(
|
|
getDefaultSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
|
|
|
|
if (addHfPairIfHfFieldsArePresent) {
|
|
// Add hfPair fields only if they exist in the schema for the cluster
|
|
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
|
|
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
|
|
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
|
|
}
|
|
|
|
config.getSettings().getIndexedFieldSettings().setTokenized(true);
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Marked the given field as having per position payload.
|
|
*/
|
|
public final SchemaBuilder withPerPositionPayload(String fieldName, int defaultPayloadLength) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings settings =
|
|
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
|
|
|
|
settings.getIndexedFieldSettings().setStorePerPositionPayloads(true);
|
|
settings.getIndexedFieldSettings().setDefaultPerPositionPayloadLength(defaultPayloadLength);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add field into schema that is pre-tokenized and does not have position.
|
|
* E.g. hashtags / stocks / card_domain
|
|
*/
|
|
public final SchemaBuilder withPretokenizedNoPositionField(String fieldName) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
|
|
.setSettings(getPretokenizedNoPositionFieldSetting());
|
|
// Add hfPair fields only if they exist in the schema for the cluster
|
|
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
|
|
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
|
|
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Mark the field to have ordered term dictionary.
|
|
* In Lucene, term dictionary is sorted. In Earlybird, term dictionary order is not
|
|
* guaranteed unless this is turned on.
|
|
*/
|
|
public final SchemaBuilder withOrderedTerms(String fieldName) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings settings =
|
|
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
|
|
|
|
settings.getIndexedFieldSettings().setSupportOrderedTerms(true);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Support lookup of term text by term id in the term dictionary.
|
|
*/
|
|
public final SchemaBuilder withTermTextLookup(String fieldName) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings settings =
|
|
schema.getFieldConfigs().get(idMapping.getFieldID(fieldName)).getSettings();
|
|
|
|
settings.getIndexedFieldSettings().setSupportTermTextLookup(true);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add a text field that is pre-tokenized, so not analyzed again in the index (e.g. Earlybird).
|
|
*
|
|
* Note that the token streams MUST be created using the attributes defined in
|
|
* {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}.
|
|
*/
|
|
public final SchemaBuilder withPretokenizedTextField(
|
|
String fieldName,
|
|
boolean addHfPairIfHfFieldsArePresent) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
|
|
.setSettings(getDefaultPretokenizedSettings(
|
|
ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
|
|
// Add hfPair fields only if they exist in the schema for the cluster
|
|
if (addHfPairIfHfFieldsArePresent) {
|
|
// Add hfPair fields only if they exist in the schema for the cluster
|
|
boolean hfPair = shouldIncludeField(ImmutableSchema.HF_TERM_PAIRS_FIELD)
|
|
&& shouldIncludeField(ImmutableSchema.HF_PHRASE_PAIRS_FIELD);
|
|
config.getSettings().getIndexedFieldSettings().setIndexHighFreqTermPairs(hfPair);
|
|
}
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Add a feature configuration
|
|
*/
|
|
public final SchemaBuilder withFeatureConfiguration(String baseFieldName, String viewName,
|
|
FeatureConfiguration featureConfiguration) {
|
|
return withColumnStrideFieldView(
|
|
viewName,
|
|
// Defaulting all encoded tweet features to int since the underlying encoded tweet features
|
|
// are ints.
|
|
ThriftCSFType.INT,
|
|
featureConfiguration.getOutputType(),
|
|
baseFieldName,
|
|
featureConfiguration.getValueIndex(),
|
|
featureConfiguration.getBitStartPosition(),
|
|
featureConfiguration.getBitLength(),
|
|
featureConfiguration.getFeatureNormalizationType(),
|
|
featureConfiguration.getUpdateConstraints()
|
|
);
|
|
}
|
|
|
|
/**
|
|
* Add a long field in schema. This field uses LongTermAttribute.
|
|
*/
|
|
private SchemaBuilder addLongTermField(String fieldName, boolean useSortableEncoding) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings longTermSettings = getEarlybirdNumericFieldSettings();
|
|
ThriftTokenStreamSerializer tokenStreamSerializer =
|
|
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
|
|
tokenStreamSerializer.setAttributeSerializerClassNames(
|
|
ImmutableList.<String>of(LongTermAttributeSerializer.class.getName()));
|
|
longTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(tokenStreamSerializer);
|
|
|
|
ThriftIndexedNumericFieldSettings numericFieldSettings =
|
|
new ThriftIndexedNumericFieldSettings(true);
|
|
numericFieldSettings.setNumericType(ThriftNumericType.LONG);
|
|
numericFieldSettings.setUseSortableEncoding(useSortableEncoding);
|
|
longTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings);
|
|
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName),
|
|
new ThriftFieldConfiguration(fieldName).setSettings(longTermSettings));
|
|
return this;
|
|
}
|
|
|
|
public final SchemaBuilder withSortableLongTermField(String fieldName) {
|
|
return addLongTermField(fieldName, true);
|
|
}
|
|
|
|
public final SchemaBuilder withLongTermField(String fieldName) {
|
|
return addLongTermField(fieldName, false);
|
|
}
|
|
|
|
/**
|
|
* Add an int field in schema. This field uses IntTermAttribute.
|
|
*/
|
|
public final SchemaBuilder withIntTermField(String fieldName) {
|
|
if (!shouldIncludeField(fieldName)) {
|
|
return this;
|
|
}
|
|
ThriftFieldSettings intTermSettings = getEarlybirdNumericFieldSettings();
|
|
ThriftTokenStreamSerializer attributeSerializer =
|
|
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
|
|
attributeSerializer.setAttributeSerializerClassNames(
|
|
ImmutableList.<String>of(IntTermAttributeSerializer.class.getName()));
|
|
intTermSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
|
|
|
|
ThriftIndexedNumericFieldSettings numericFieldSettings =
|
|
new ThriftIndexedNumericFieldSettings(true);
|
|
numericFieldSettings.setNumericType(ThriftNumericType.INT);
|
|
intTermSettings.getIndexedFieldSettings().setNumericFieldSettings(numericFieldSettings);
|
|
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName),
|
|
new ThriftFieldConfiguration(fieldName).setSettings(intTermSettings));
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Timeline and ExpertSearch uses
|
|
* {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer} to store weighted
|
|
* values.
|
|
*
|
|
* E.g. for the PRODUCED_LANGUAGES and CONSUMED_LANGUAGES fields, they contain not a single,
|
|
* value, but instead a list of values with a weight associated with each value.
|
|
*
|
|
* This method adds an indexed field that uses
|
|
* {@link com.twitter.search.common.util.analysis.PayloadWeightedTokenizer}.
|
|
*/
|
|
public final SchemaBuilder withCharTermPayloadWeightedField(String fieldName) {
|
|
ThriftFieldConfiguration config = new ThriftFieldConfiguration(fieldName)
|
|
.setSettings(getPayloadWeightedSettings(ThriftIndexOptions.DOCS_AND_FREQS_AND_POSITIONS));
|
|
putIntoFieldConfigs(idMapping.getFieldID(fieldName), config);
|
|
return this;
|
|
}
|
|
|
|
/**
|
|
* Set the version and description of this schema.
|
|
*/
|
|
public final SchemaBuilder withSchemaVersion(
|
|
int majorVersionNumber,
|
|
int minorVersionNumber,
|
|
String versionDesc,
|
|
boolean isOfficial) {
|
|
schema.setMajorVersionNumber(majorVersionNumber);
|
|
schema.setMinorVersionNumber(minorVersionNumber);
|
|
|
|
schema.setVersion(majorVersionNumber + ":" + versionDesc);
|
|
schema.setVersionIsOfficial(isOfficial);
|
|
|
|
return this;
|
|
}
|
|
|
|
public final SchemaBuilder withSchemaVersion(
|
|
int majorVersionNumber,
|
|
String versionDesc,
|
|
boolean isOfficial) {
|
|
return withSchemaVersion(majorVersionNumber, 0, versionDesc, isOfficial);
|
|
}
|
|
|
|
protected void putIntoFieldConfigs(int id, ThriftFieldConfiguration config) {
|
|
if (schema.getFieldConfigs() != null && schema.getFieldConfigs().containsKey(id)) {
|
|
throw new IllegalStateException("Already have a ThriftFieldConfiguration for field id " + id);
|
|
}
|
|
|
|
if (fieldNameSet.contains(config.getFieldName())) {
|
|
throw new IllegalStateException("Already have a ThriftFieldConfiguration for field "
|
|
+ config.getFieldName());
|
|
}
|
|
fieldNameSet.add(config.getFieldName());
|
|
schema.putToFieldConfigs(id, config);
|
|
}
|
|
|
|
// Default field settings. Most field settings are similar to this.
|
|
protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption) {
|
|
return getDefaultSettings(indexOption, false);
|
|
}
|
|
|
|
protected ThriftFieldSettings getDefaultSettings(ThriftIndexOptions indexOption,
|
|
boolean supportOutOfOrderAppends) {
|
|
ThriftFieldSettings fieldSettings = new ThriftFieldSettings();
|
|
ThriftIndexedFieldSettings indexedFieldSettings = new ThriftIndexedFieldSettings();
|
|
indexedFieldSettings
|
|
.setIndexed(true)
|
|
.setStored(false)
|
|
.setTokenized(false)
|
|
.setStoreTermVectors(false)
|
|
.setStoreTermVectorOffsets(false)
|
|
.setStoreTermVectorPayloads(false)
|
|
.setStoreTermVectorPositions(false)
|
|
.setSupportOutOfOrderAppends(supportOutOfOrderAppends)
|
|
.setIndexOptions(indexOption)
|
|
.setOmitNorms(true); // All Earlybird fields omit norms.
|
|
fieldSettings.setIndexedFieldSettings(indexedFieldSettings);
|
|
return fieldSettings;
|
|
}
|
|
|
|
/**
|
|
* Default field settings for fields that are pretokenized
|
|
*
|
|
* The fields that use these settings will need to be tokenized using a serializer with the
|
|
* attributes defined in {@link com.twitter.search.common.util.text.TweetTokenStreamSerializer}.
|
|
*/
|
|
protected final ThriftFieldSettings getDefaultPretokenizedSettings(
|
|
ThriftIndexOptions indexOption) {
|
|
ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption);
|
|
fieldSettings.getIndexedFieldSettings().setTokenized(true);
|
|
ThriftTokenStreamSerializer attributeSerializer =
|
|
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
|
|
attributeSerializer.setAttributeSerializerClassNames(
|
|
ImmutableList.<String>of(
|
|
CharSequenceTermAttributeSerializer.class.getName(),
|
|
PositionIncrementAttributeSerializer.class.getName(),
|
|
TokenTypeAttributeSerializer.class.getName()));
|
|
|
|
fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
|
|
return fieldSettings;
|
|
}
|
|
|
|
protected final ThriftFieldSettings getPretokenizedNoPositionFieldSetting() {
|
|
return getDefaultPretokenizedSettings(ThriftIndexOptions.DOCS_AND_FREQS);
|
|
}
|
|
|
|
protected final ThriftFieldSettings getNoPositionNoFreqSettings() {
|
|
return getNoPositionNoFreqSettings(false);
|
|
}
|
|
|
|
protected final ThriftFieldSettings getNoPositionNoFreqSettings(
|
|
boolean supportOutOfOrderAppends) {
|
|
return getDefaultSettings(ThriftIndexOptions.DOCS_ONLY, supportOutOfOrderAppends);
|
|
}
|
|
|
|
protected final ThriftFieldSettings getEarlybirdNumericFieldSettings() {
|
|
// Supposedly numeric fields are not tokenized.
|
|
// However, Earlybird uses SingleTokenTokenStream to handle int/long fields.
|
|
// So we need to set indexed to true for these fields.
|
|
ThriftFieldSettings settings = getNoPositionNoFreqSettings();
|
|
settings.getIndexedFieldSettings().setTokenized(true);
|
|
return settings;
|
|
}
|
|
|
|
private ThriftFieldSettings getPayloadWeightedSettings(ThriftIndexOptions indexOption) {
|
|
ThriftFieldSettings fieldSettings = getDefaultSettings(indexOption);
|
|
fieldSettings.getIndexedFieldSettings().setTokenized(true);
|
|
ThriftTokenStreamSerializer attributeSerializer =
|
|
new ThriftTokenStreamSerializer(tokenStreamSerializerVersion);
|
|
attributeSerializer.setAttributeSerializerClassNames(
|
|
ImmutableList.<String>of(CharTermAttributeSerializer.class.getName(),
|
|
PositionIncrementAttributeSerializer.class.getName(),
|
|
PayloadAttributeSerializer.class.getName()));
|
|
fieldSettings.getIndexedFieldSettings().setTokenStreamSerializer(attributeSerializer);
|
|
return fieldSettings;
|
|
}
|
|
|
|
protected boolean shouldIncludeField(String fieldName) {
|
|
return true;
|
|
}
|
|
}
|