the-algorithm/src/java/com/twitter/search/earlybird/search/relevance/scoring/ScoringFunction.java

package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.List;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation;
import com.twitter.common.collections.Pair;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures;
import com.twitter.search.common.results.thriftjava.FieldHitAttribution;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
import com.twitter.search.queryparser.query.Query;
/**
* Defines a ranking function which computes the score of a document that matches a query.
*/
public abstract class ScoringFunction {
/**
* Returned by {@link #score(int, float)} to indicate that a hit should be scored below all other hits.
*
* We have some equality tests like:
* "if (score == ScoringFunction.SKIP_HIT) {...}" (DefaultScoringFunction#updateRelevanceStats)
* We might also have double to float casts.
*
* Such casts seem to work with the equality test, but there might be corner cases where casting
* this float value to a double (and back) does not work properly.
*
* If possible, we should choose a constant that is not in the valid score range. Then we can
* turn the float equality tests into Math.abs(...) < EPSILON tests.
*/
public static final float SKIP_HIT = -Float.MAX_VALUE;
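// Illustrative sketch (not part of this class): if SKIP_HIT were moved outside the valid score
// range as suggested above, the exact float equality tests could become tolerance tests. The
// SCORE_EPSILON constant and isSkipHit helper below are hypothetical names used only for
// illustration.
//
//   private static final float SCORE_EPSILON = 1e-6f;
//
//   private static boolean isSkipHit(float score) {
//     // Tolerant of float -> double -> float round trips, unlike "score == SKIP_HIT".
//     return Math.abs(score - SKIP_HIT) < SCORE_EPSILON;
//   }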
private final ImmutableSchemaInterface schema;
// The current doc ID and the reader for the current segment should be private, because we don't
// want sub-classes to incorrectly update them. The doc ID should only be updated by the score()
// and explain() methods, and the reader should only be updated by the setNextReader() method.
private int currentDocID = -1;
protected DocIDToTweetIDMapper tweetIDMapper = null;
protected TimeMapper timeMapper = null;
protected EarlybirdDocumentFeatures documentFeatures;
protected int debugMode = 0;
protected HitAttributeHelper hitAttributeHelper;
protected Query query;
protected FieldHitAttribution fieldHitAttribution;
public ScoringFunction(ImmutableSchemaInterface schema) {
this.schema = Preconditions.checkNotNull(schema);
}
protected ImmutableSchemaInterface getSchema() {
return schema;
}
/**
* Updates the reader that will be used to retrieve the tweet IDs and creation times associated
* with scored doc IDs, as well as the values for various CSFs. Should be called every time the
* searcher starts searching in a new segment.
*/
public void setNextReader(EarlybirdIndexSegmentAtomicReader reader) throws IOException {
tweetIDMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
timeMapper = reader.getSegmentData().getTimeMapper();
documentFeatures = new EarlybirdDocumentFeatures(reader);
initializeNextSegment(reader);
}
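// Illustrative call pattern (a sketch, not actual searcher code): setNextReader() is invoked once
// per segment, then score() is invoked once per matching hit in that segment. The segmentReader,
// docId and luceneScore variables below are placeholders.
//
//   scoringFunction.setNextReader(segmentReader);              // once per segment
//   float score = scoringFunction.score(docId, luceneScore);   // once per matching hit
//   if (score == ScoringFunction.SKIP_HIT) {
//     // This hit should be ranked below all other hits.
//   }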
public void setHitAttributeHelperAndQuery(HitAttributeHelper newHitAttributeHelper,
Query parsedQuery) {
this.hitAttributeHelper = newHitAttributeHelper;
this.query = parsedQuery;
}
public void setFieldHitAttribution(FieldHitAttribution fieldHitAttribution) {
this.fieldHitAttribution = fieldHitAttribution;
}
public void setDebugMode(int debugMode) {
this.debugMode = debugMode;
}
/**
* Allow scoring functions to perform more per-segment-specific setup.
*/
protected void initializeNextSegment(EarlybirdIndexSegmentAtomicReader reader)
throws IOException {
// Noop by default
}
// Updates the current document ID and advances all NumericDocValues to this doc ID.
private void setCurrentDocID(int currentDocID) throws IOException {
this.currentDocID = currentDocID;
documentFeatures.advance(currentDocID);
}
/**
* Returns the current doc ID stored in this scoring function.
*/
public int getCurrentDocID() {
return currentDocID;
}
/**
* Compute the score for the current hit. This is not expected to be thread safe.
*
* @param internalDocID internal ID of the matching hit
* @param luceneQueryScore the score that Lucene's text query computed for this hit
*/
public float score(int internalDocID, float luceneQueryScore) throws IOException {
setCurrentDocID(internalDocID);
return score(luceneQueryScore);
}
/**
* Compute the score for the current hit. This is not expected to be thread safe.
*
* @param luceneQueryScore the score that Lucene's text query computed for this hit
*/
protected abstract float score(float luceneQueryScore) throws IOException;
/** Returns an explanation for the given hit. */
public final Explanation explain(IndexReader reader, int internalDocID, float luceneScore)
throws IOException {
setNextReader((EarlybirdIndexSegmentAtomicReader) reader);
setCurrentDocID(internalDocID);
return doExplain(luceneScore);
}
/** Returns an explanation for the current document. */
protected abstract Explanation doExplain(float luceneScore) throws IOException;
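// A minimal, hypothetical subclass sketch showing how the scoring hooks fit together; it is not
// one of the real Earlybird scoring functions. score(float) and doExplain(float) are called with
// the current doc ID already positioned by score(int, float) / explain(...).
//
//   public class ConstantBoostScoringFunction extends ScoringFunction {
//     private final float boost;
//
//     public ConstantBoostScoringFunction(ImmutableSchemaInterface schema, float boost) {
//       super(schema);
//       this.boost = boost;
//     }
//
//     @Override
//     protected float score(float luceneQueryScore) throws IOException {
//       return luceneQueryScore * boost;
//     }
//
//     @Override
//     protected Explanation doExplain(float luceneScore) throws IOException {
//       return Explanation.match(luceneScore * boost, "lucene query score * constant boost");
//     }
//
//     @Override
//     public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
//       // Nothing to record in this sketch.
//     }
//   }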
/**
* Returns the scoring metadata for the current doc ID.
*/
public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options)
throws IOException {
ThriftSearchResultMetadata metadata = new ThriftSearchResultMetadata();
metadata.setResultType(ThriftSearchResultType.RELEVANCE);
metadata.setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
metadata.setLanguage(ThriftLanguage.findByValue(
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)));
metadata.setSignature(
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE));
metadata.setIsNullcast(documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG));
return metadata;
}
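// Hypothetical override sketch showing the intended extension pattern: a subclass reuses the base
// metadata built above and then sets fields it owns. The setScore() call and lastComputedScore
// field are assumptions made for illustration.
//
//   @Override
//   public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options)
//       throws IOException {
//     ThriftSearchResultMetadata metadata = super.getResultMetadata(options);
//     metadata.setScore(lastComputedScore);  // hypothetical field maintained by the subclass
//     return metadata;
//   }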
/**
* Updates the given ThriftSearchResultsRelevanceStats instance based on the scoring metadata for
* the current doc ID.
*/
public abstract void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats);
/**
* Score a list of hits. Not thread safe.
*/
public float[] batchScore(List<BatchHit> hits) throws IOException {
throw new UnsupportedOperationException("This operation (batchScore) is not implemented!");
}
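// A naive sketch of the expected batchScore() contract (presumably one score per hit, in input
// order); real implementations would do something smarter than scoring hits one at a time. The
// BatchHit accessors used below are assumptions made for illustration.
//
//   @Override
//   public float[] batchScore(List<BatchHit> hits) throws IOException {
//     float[] scores = new float[hits.size()];
//     for (int i = 0; i < hits.size(); i++) {
//       BatchHit hit = hits.get(i);
//       scores[i] = score(hit.getInternalDocID(), hit.getLuceneScore());  // hypothetical accessors
//     }
//     return scores;
//   }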
/**
* Collect the features and CSFs for the current document. Used for scoring and generating the
* returned metadata.
*/
public Pair<LinearScoringData, ThriftSearchResultFeatures> collectFeatures(
float luceneQueryScore) throws IOException {
throw new UnsupportedOperationException("This operation (collectFeatures) is not implemented!");
}
/**
* Implement this function to populate the result metadata based on the given scoring data.
* Otherwise, this is a no-op.
*
* Scoring functions that implement this should also implement getScoringData().
*/
public void populateResultMetadataBasedOnScoringData(
ThriftSearchResultMetadataOptions options,
ThriftSearchResultMetadata metadata,
LinearScoringData data) throws IOException {
// Make sure that the scoring data passed in is null, because getScoringDataForCurrentDocument()
// returns null by default. If a subclass overrides one of these two methods, it should
// override both.
Preconditions.checkState(data == null, "LinearScoringData should be null");
}
/**
* This should only be called at hit collection time because it relies on the internal doc id.
*
* Scoring functions that implement this should also implement the function
* populateResultMetadataBasedOnScoringData().
*/
public LinearScoringData getScoringDataForCurrentDocument() {
return null;
}
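// Hypothetical sketch of the paired override contract described above: a subclass that produces
// LinearScoringData overrides both methods, so data collected at hit time can later be copied
// into the returned metadata. The latestScoringData field is an assumption made for illustration.
//
//   @Override
//   public LinearScoringData getScoringDataForCurrentDocument() {
//     return latestScoringData;  // hypothetical field populated during score(float)
//   }
//
//   @Override
//   public void populateResultMetadataBasedOnScoringData(
//       ThriftSearchResultMetadataOptions options,
//       ThriftSearchResultMetadata metadata,
//       LinearScoringData data) throws IOException {
//     Preconditions.checkNotNull(data);
//     // Copy the relevant fields from data into metadata here.
//   }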
}