the-algorithm/src/java/com/twitter/search/common/relevance/features/EarlybirdDocumentFeatures.java

233 lines
9.1 KiB
Java

package com.twitter.search.common.relevance.features;
import java.io.IOException;
import java.util.Map;
import java.util.function.Function;
import com.google.common.base.Preconditions;
import com.google.common.collect.Maps;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.NumericDocValues;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.schema.base.FeatureConfiguration;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.common.schema.thriftjava.ThriftFeatureNormalizationType;
/**
 * Reads per-document feature values from the {@link NumericDocValues} exposed by a single
 * {@link LeafReader}.
 *
 * <p>Usage: create one instance per LeafReader, position it on a document with
 * {@link #advance(int)}, then query feature values for that document.
 *
 * <p>Instances keep mutable, unsynchronized state (the current doc ID and a plain HashMap cache of
 * NumericDocValues), so a single instance must not be shared between threads without external
 * synchronization. The static counter registries, however, are shared by all instances and are
 * therefore backed by concurrent maps.
 */
public class EarlybirdDocumentFeatures {
  // Lazily exported counters, keyed by feature ID. These maps are static, i.e. shared by every
  // instance of this class, and are mutated from getSearchResultFeatures(). They must be
  // concurrent maps: a plain HashMap can be corrupted (or throw
  // ConcurrentModificationException) when computeIfAbsent() is called from several threads.
  private static final Map<Integer, SearchCounter> FEATURE_CONFIG_IS_NULL_MAP =
      Maps.newConcurrentMap();
  private static final Map<Integer, SearchCounter> FEATURE_OUTPUT_TYPE_IS_NULL_MAP =
      Maps.newConcurrentMap();
  private static final Map<Integer, SearchCounter> NO_SCHEMA_FIELD_FOR_FEATURE_MAP =
      Maps.newConcurrentMap();

  private static final String FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN =
      "null_feature_config_for_feature_id_%d";
  private static final String FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN =
      "null_output_type_for_feature_id_%d";
  private static final String NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN =
      "no_schema_field_for_feature_id_%d";

  private static final SearchCounter UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER =
      SearchCounter.export("unknown_feature_output_type");

  // Cache of the NumericDocValues looked up so far, keyed by field name. A null value is a valid
  // cache entry meaning "the reader has no doc values for this field".
  private final Map<String, NumericDocValues> numericDocValues = Maps.newHashMap();
  private final LeafReader leafReader;

  // The document this instance is currently positioned on; -1 until advance() is first called.
  private int docId = -1;

  /**
   * Creates a new EarlybirdDocumentFeatures instance that will return feature values based on the
   * NumericDocValues stored in the given LeafReader. The instance is not positioned on any
   * document until {@link #advance(int)} is called.
   *
   * @param leafReader The reader to load doc values from. Must not be null.
   * @throws NullPointerException If leafReader is null.
   */
  public EarlybirdDocumentFeatures(LeafReader leafReader) {
    this.leafReader = Preconditions.checkNotNull(leafReader);
  }

  /**
   * Advances this instance to the given doc ID. The new doc ID must be greater than or equal to the
   * current doc ID stored in this instance.
   *
   * @param target The doc ID to move to.
   * @throws IllegalArgumentException If target is negative, smaller than the current doc ID, or
   *     not smaller than {@code leafReader.maxDoc()}.
   */
  public void advance(int target) {
    Preconditions.checkArgument(
        target >= 0,
        "Target (%s) cannot be negative.",
        target);
    Preconditions.checkArgument(
        target >= docId,
        "Target (%s) smaller than current doc ID (%s).",
        target,
        docId);
    Preconditions.checkArgument(
        target < leafReader.maxDoc(),
        "Target (%s) cannot be greater than or equal to the max doc ID (%s).",
        target,
        leafReader.maxDoc());
    docId = target;
  }

  /**
   * Returns the raw feature value stored for the given field in the current document.
   *
   * <p>Callers are expected to have called {@link #advance(int)} first; otherwise the initial doc
   * ID of -1 is passed to the doc values iterator unchanged.
   *
   * @param field The field to read.
   * @return The stored value, or 0 if the reader has no doc values for this field or the current
   *     document has no value for it.
   * @throws IOException If the underlying doc values cannot be read.
   */
  public long getFeatureValue(EarlybirdFieldConstant field) throws IOException {
    // The index might not have a NumericDocValues instance for this feature.
    // This might happen if we dynamically update the feature schema, for example.
    //
    // Cache the NumericDocValues instances for all accessed features, even if they're null.
    // containsKey() (rather than a null check on get()) is what lets us cache the null result.
    String fieldName = field.getFieldName();
    NumericDocValues docValues;
    if (numericDocValues.containsKey(fieldName)) {
      docValues = numericDocValues.get(fieldName);
    } else {
      docValues = leafReader.getNumericDocValues(fieldName);
      numericDocValues.put(fieldName, docValues);
    }

    if (docValues == null || !docValues.advanceExact(docId)) {
      return 0L;
    }
    return docValues.longValue();
  }

  /**
   * Determines if the given flag is set, i.e. if the raw feature value for the field is non-zero.
   *
   * @param field The flag field to read.
   * @return true if the stored value is non-zero.
   * @throws IOException If the underlying doc values cannot be read.
   */
  public boolean isFlagSet(EarlybirdFieldConstant field) throws IOException {
    return getFeatureValue(field) != 0;
  }

  /**
   * Returns the unnormalized value for the given field: the raw stored value is passed through the
   * inverse of the normalizer selected by the field's normalization type. A field with no
   * normalization type is treated as {@code NONE} and its raw value is returned as is.
   *
   * @param field The field to read.
   * @return The unnormalized feature value.
   * @throws IOException If the underlying doc values cannot be read.
   * @throws IllegalArgumentException If the field's normalization type is not supported.
   */
  public double getUnnormalizedFeatureValue(EarlybirdFieldConstant field) throws IOException {
    long featureValue = getFeatureValue(field);
    ThriftFeatureNormalizationType normalizationType = field.getFeatureNormalizationType();
    if (normalizationType == null) {
      normalizationType = ThriftFeatureNormalizationType.NONE;
    }

    switch (normalizationType) {
      case NONE:
        return featureValue;
      case LEGACY_BYTE_NORMALIZER:
        return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormLowerBound((byte) featureValue);
      case LEGACY_BYTE_NORMALIZER_WITH_LOG2:
        return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormAndLog2((byte) featureValue);
      case SMART_INTEGER_NORMALIZER:
        return MutableFeatureNormalizers.SMART_INTEGER_NORMALIZER.unnormUpperBound(
            (byte) featureValue);
      case PREDICTION_SCORE_NORMALIZER:
        return IntNormalizers.PREDICTION_SCORE_NORMALIZER.denormalize((int) featureValue);
      default:
        throw new IllegalArgumentException(
            "Unsupported normalization type " + normalizationType + " for feature "
            + field.getFieldName());
    }
  }

  /**
   * Creates a ThriftSearchResultFeatures instance populated with values for all available features
   * that have a non-zero value set.
   *
   * @param schema The schema.
   * @return The collected features for the current document.
   * @throws IOException If the underlying doc values cannot be read.
   */
  public ThriftSearchResultFeatures getSearchResultFeatures(ImmutableSchemaInterface schema)
      throws IOException {
    return getSearchResultFeatures(schema, (featureId) -> true);
  }

  /**
   * Creates a ThriftSearchResultFeatures instance populated with values for all available features
   * that have a non-zero value set.
   *
   * <p>A feature is skipped (and a per-feature-ID counter is incremented) if it has no feature
   * configuration, no output type, or no corresponding EarlybirdFieldConstant. Features with an
   * unrecognized output type are skipped and counted in a single shared counter. BYTE features are
   * reported in the int values map and FLOAT features in the double values map.
   *
   * @param schema The schema.
   * @param shouldCollectFeatureId A predicate that determines which features should be collected.
   * @return The collected features for the current document.
   * @throws IOException If the underlying doc values cannot be read.
   */
  public ThriftSearchResultFeatures getSearchResultFeatures(
      ImmutableSchemaInterface schema,
      Function<Integer, Boolean> shouldCollectFeatureId) throws IOException {
    Map<Integer, Boolean> boolValues = Maps.newHashMap();
    Map<Integer, Double> doubleValues = Maps.newHashMap();
    Map<Integer, Integer> intValues = Maps.newHashMap();
    Map<Integer, Long> longValues = Maps.newHashMap();

    Map<Integer, FeatureConfiguration> idToFeatureConfigMap = schema.getFeatureIdToFeatureConfig();
    for (int featureId : schema.getSearchFeatureSchema().getEntries().keySet()) {
      if (!shouldCollectFeatureId.apply(featureId)) {
        continue;
      }

      FeatureConfiguration featureConfig = idToFeatureConfigMap.get(featureId);
      if (featureConfig == null) {
        incrementPerFeatureCounter(
            FEATURE_CONFIG_IS_NULL_MAP, FEATURE_CONFIG_IS_NULL_COUNTER_PATTERN, featureId);
        continue;
      }

      ThriftCSFType outputType = featureConfig.getOutputType();
      if (outputType == null) {
        incrementPerFeatureCounter(
            FEATURE_OUTPUT_TYPE_IS_NULL_MAP, FEATURE_OUTPUT_TYPE_IS_NULL_COUNTER_PATTERN, featureId);
        continue;
      }

      if (!EarlybirdFieldConstants.hasFieldConstant(featureId)) {
        // Should only happen for features that were dynamically added to the schema.
        incrementPerFeatureCounter(
            NO_SCHEMA_FIELD_FOR_FEATURE_MAP, NO_SCHEMA_FIELD_FOR_FEATURE_COUNTER_PATTERN, featureId);
        continue;
      }

      EarlybirdFieldConstant field = EarlybirdFieldConstants.getFieldConstant(featureId);
      switch (outputType) {
        case BOOLEAN:
          // Only set flags are reported; unset flags are simply absent from the map.
          if (isFlagSet(field)) {
            boolValues.put(featureId, true);
          }
          break;
        case BYTE:
          // It's unclear why we don't add this feature to a separate byteValues map...
          byte byteFeatureValue = (byte) getFeatureValue(field);
          if (byteFeatureValue != 0) {
            intValues.put(featureId, (int) byteFeatureValue);
          }
          break;
        case INT:
          int intFeatureValue = (int) getFeatureValue(field);
          if (intFeatureValue != 0) {
            intValues.put(featureId, intFeatureValue);
          }
          break;
        case LONG:
          long longFeatureValue = getFeatureValue(field);
          if (longFeatureValue != 0) {
            longValues.put(featureId, longFeatureValue);
          }
          break;
        case FLOAT:
          // It's unclear why we don't add this feature to a separate floatValues map...
          float floatFeatureValue = (float) getFeatureValue(field);
          if (floatFeatureValue != 0) {
            doubleValues.put(featureId, (double) floatFeatureValue);
          }
          break;
        case DOUBLE:
          double doubleFeatureValue = getUnnormalizedFeatureValue(field);
          if (doubleFeatureValue != 0) {
            doubleValues.put(featureId, doubleFeatureValue);
          }
          break;
        default:
          UNKNOWN_FEATURE_OUTPUT_TYPE_COUNTER.increment();
      }
    }

    return new ThriftSearchResultFeatures()
        .setBoolValues(boolValues)
        .setIntValues(intValues)
        .setLongValues(longValues)
        .setDoubleValues(doubleValues);
  }

  /**
   * Increments the counter registered for the given feature ID in the given registry, exporting
   * it first (under the name produced by formatting namePattern with the feature ID) if it does
   * not exist yet.
   */
  private static void incrementPerFeatureCounter(
      Map<Integer, SearchCounter> counters, String namePattern, int featureId) {
    counters.computeIfAbsent(
        featureId,
        (fId) -> SearchCounter.export(String.format(namePattern, fId))).increment();
  }
}