[docx] split commit for file 4400

Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Ari Archer 2024-01-23 19:15:55 +02:00
parent 8948d714f6
commit f37e76300b
GPG Key ID: A50D5B4B599AF8A2
400 changed files with 0 additions and 20641 deletions


@@ -1,32 +0,0 @@
package com.twitter.search.earlybird.search.facets;
import java.util.Iterator;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.core.earlybird.facets.FacetCountState;
import com.twitter.search.core.earlybird.facets.FacetCountState.FacetFieldResults;
import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;
public class SimpleCountRankingModule extends FacetRankingModule {
@Override
public void prepareResults(
EarlybirdLuceneSearcher.FacetSearchResults hits,
FacetCountState<ThriftFacetFieldResults> facetCountState) {
Iterator<FacetFieldResults<ThriftFacetFieldResults>> fieldResultsIterator =
facetCountState.getFacetFieldResultsIterator();
while (fieldResultsIterator.hasNext()) {
FacetFieldResults<ThriftFacetFieldResults> state = fieldResultsIterator.next();
if (!state.isFinished()) {
Schema.FieldInfo facetField =
facetCountState.getSchema().getFacetFieldByFacetName(state.facetName);
state.results = hits.getFacetResults(
facetField.getFieldType().getFacetName(), state.numResultsRequested);
if (state.results != null) {
state.numResultsFound = state.results.getTopFacetsSize();
}
}
}
}
}


@@ -1,47 +0,0 @@
package com.twitter.search.earlybird.search.facets;
import java.util.ArrayList;
import java.util.List;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;
import org.apache.commons.lang.StringUtils;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.partition.AudioSpaceTable;
import com.twitter.search.earlybird.thrift.AudioSpaceState;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResultAudioSpace;
public class SpaceFacetCollector extends AbstractFacetTermCollector {
private final List<ThriftSearchResultAudioSpace> spaces = new ArrayList<>();
private final AudioSpaceTable audioSpaceTable;
public SpaceFacetCollector(AudioSpaceTable audioSpaceTable) {
this.audioSpaceTable = audioSpaceTable;
}
@Override
public boolean collect(int docID, long termID, int fieldID) {
String spaceId = getTermFromFacet(termID, fieldID,
Sets.newHashSet(EarlybirdFieldConstant.SPACES_FACET));
if (StringUtils.isEmpty(spaceId)) {
return false;
}
spaces.add(new ThriftSearchResultAudioSpace(spaceId,
audioSpaceTable.isRunning(spaceId) ? AudioSpaceState.RUNNING
: AudioSpaceState.ENDED));
return true;
}
@Override
public void fillResultAndClear(ThriftSearchResult result) {
getExtraMetadata(result).setSpaces(ImmutableList.copyOf(spaces));
spaces.clear();
}
}


@@ -1,487 +0,0 @@
package com.twitter.search.earlybird.search.facets;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchResultsStats;
import com.twitter.search.common.schema.SchemaUtil;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher;
import com.twitter.search.earlybird.search.AbstractResultsCollector;
import com.twitter.search.earlybird.search.SearchResultsInfo;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftTermRequest;
import com.twitter.search.earlybird.thrift.ThriftTermResults;
public class TermStatisticsCollector extends AbstractResultsCollector
<TermStatisticsRequestInfo, TermStatisticsCollector.TermStatisticsSearchResults> {
private static final EarlyTerminationState TERMINATED_TERM_STATS_COUNTING_DONE =
new EarlyTerminationState("terminated_term_stats_counting_done", true);
// Stats for tracking histogram results.
private static final SearchResultsStats TERM_STATS_HISTOGRAM_REQUESTS_WITH_MOVED_BACK_BINS =
SearchResultsStats.export("term_statistics_collector_queries_with_moved_back_bins");
private static final SearchCounter TERM_STATS_SKIPPED_LARGER_OUT_OF_BOUNDS_HITS =
SearchCounter.export("term_statistics_collector_skipped_larger_out_of_bounds_hits");
@VisibleForTesting
static final class TermStatistics {
private final ThriftTermRequest termRequest;
private final Term term; // could be null, for count across all fields
private int termDF = 0;
private int termCount = 0;
private final int[] histogramBins;
// Per-segment information.
private PostingsEnum segmentDocsEnum; // could be null, for count across all fields
private boolean segmentDone;
@VisibleForTesting
TermStatistics(ThriftTermRequest termRequest, Term term, int numBins) {
this.termRequest = termRequest;
this.term = term;
this.histogramBins = new int[numBins];
}
/**
* Take the currently accumulated counts and "move them back" to make room for counts from more
* recent binIds.
*
* For example, if the oldFirstBinID was set to 10, and the histogramBins were {3, 4, 5, 6, 7},
* after this call with newFirstBinID set to 12, the histogramBins will be set
* to {5, 6, 7, 0, 0}.
*
* @param oldFirstBinID the binId of the firstBin that's been used up to now.
* @param newFirstBinID the new binId of the firstBin that will be used from now on.
* The newFirstBinID must be larger than the oldFirstBinID; this is enforced with a precondition check.
*/
@VisibleForTesting
void moveBackTermCounts(int oldFirstBinID, int newFirstBinID) {
Preconditions.checkState(oldFirstBinID < newFirstBinID);
// move counts back by this many bins
final int moveBackBy = newFirstBinID - oldFirstBinID;
this.termCount = 0;
for (int i = 0; i < histogramBins.length; i++) {
int oldCount = histogramBins[i];
histogramBins[i] = 0;
int newIndex = i - moveBackBy;
if (newIndex >= 0) {
histogramBins[newIndex] = oldCount;
this.termCount += oldCount;
}
}
}
@VisibleForTesting void countHit(int bin) {
termCount++;
histogramBins[bin]++;
}
@VisibleForTesting int getTermCount() {
return termCount;
}
@VisibleForTesting int[] getHistogramBins() {
return histogramBins;
}
}
private TermStatistics[] termStatistics;
// Histogram fields.
private int numBins;
private int binSize;
private int numTimesBinsWereMovedBack = 0;
private int numLargerOutOfBoundsBinsSkipped = 0;
private static final int SEEN_OUT_OF_RANGE_THRESHOLD = 10;
private int seenOutOfRange = 0;
// ID of the first bin - effectively time / binSize. This is calculated
// relative to the first collected in-order hit.
private int firstBinID = -1;
// List of per-segment debug information specifically useful for termstat request debugging.
private List<String> termStatisticsDebugInfo = new ArrayList<>();
/**
* Creates a new term stats collector.
*/
public TermStatisticsCollector(
ImmutableSchemaInterface schema,
TermStatisticsRequestInfo searchRequestInfo,
EarlybirdSearcherStats searcherStats,
Clock clock,
int requestDebugMode) {
super(schema, searchRequestInfo, clock, searcherStats, requestDebugMode);
// Set up the histogram bins.
if (searchRequestInfo.isReturnHistogram()) {
ThriftHistogramSettings histogramSettings = searchRequestInfo.getHistogramSettings();
this.numBins = histogramSettings.getNumBins();
binSize = TermStatisticsUtil.determineBinSize(histogramSettings);
} else {
this.numBins = 0;
this.binSize = 0;
}
// Set up the term statistics array.
List<ThriftTermRequest> termRequests = searchRequestInfo.getTermRequests();
if (termRequests == null) {
this.termStatistics = new TermStatistics[0];
return;
}
this.termStatistics = new TermStatistics[searchRequestInfo.getTermRequests().size()];
for (int i = 0; i < searchRequestInfo.getTermRequests().size(); i++) {
final ThriftTermRequest termRequest = searchRequestInfo.getTermRequests().get(i);
Term term = null;
String fieldName = termRequest.getFieldName();
if (!StringUtils.isBlank(fieldName)) {
// First check if it's a facet field.
Schema.FieldInfo facetField = schema.getFacetFieldByFacetName(termRequest.getFieldName());
if (facetField != null) {
term = new Term(facetField.getName(), termRequest.getTerm());
} else {
// EarlybirdSearcher.validateRequest() should've already checked that the field exists in
// the schema, and that the term can be converted to the type of this field. However, if
// that did not happen for some reason, an exception will be thrown here, which will be
// converted to a TRANSIENT_ERROR response code.
Schema.FieldInfo fieldInfo = schema.getFieldInfo(fieldName);
Preconditions.checkNotNull(
fieldInfo,
"Found a ThriftTermRequest for a field that's not in the schema: " + fieldName
+ ". This should've been caught by EarlybirdSearcher.validateRequest()!");
term = new Term(fieldName, SchemaUtil.toBytesRef(fieldInfo, termRequest.getTerm()));
}
} else {
// NOTE: if the fieldName is empty, this is a catch-all term request for the count across
// all fields. We'll just use a null term in the TermStatistics object.
}
termStatistics[i] = new TermStatistics(termRequest, term, numBins);
}
}
@Override
public void startSegment() throws IOException {
termStatisticsDebugInfo.add(
"Starting segment in timestamp range: [" + timeMapper.getFirstTime()
+ ", " + timeMapper.getLastTime() + "]");
for (TermStatistics termStats : termStatistics) {
termStats.segmentDone = true; // until we know it's false later.
TermsEnum termsEnum = null;
if (termStats.term != null) {
Terms terms = currTwitterReader.terms(termStats.term.field());
if (terms != null) {
termsEnum = terms.iterator();
if (termsEnum != null && termsEnum.seekExact(termStats.term.bytes())) {
termStats.termDF += termsEnum.docFreq(); // Only meaningful for matchAll queries.
termStats.segmentDocsEnum =
termsEnum.postings(termStats.segmentDocsEnum, PostingsEnum.FREQS);
termStats.segmentDone = termStats.segmentDocsEnum == null
|| termStats.segmentDocsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
} else {
// this term doesn't exist in this segment.
}
}
} else {
// Catch-all case
termStats.termDF += currTwitterReader.numDocs(); // Only meaningful for matchAll queries.
termStats.segmentDocsEnum = null;
termStats.segmentDone = false;
}
}
}
private int calculateBin(final int tweetTime) {
if (tweetTime == TimeMapper.ILLEGAL_TIME) {
return -1;
}
final int binID = Math.abs(tweetTime) / binSize;
final int expectedFirstBinId = binID - numBins + 1;
if (firstBinID == -1) {
firstBinID = expectedFirstBinId;
} else if (expectedFirstBinId > firstBinID) {
numTimesBinsWereMovedBack++;
final int oldOutOfOrderFirstBinID = firstBinID;
firstBinID = expectedFirstBinId;
// We got a more recent out of order bin, move previous counts back.
for (TermStatistics ts : termStatistics) {
ts.moveBackTermCounts(oldOutOfOrderFirstBinID, firstBinID);
}
}
final int binIndex = binID - firstBinID;
if (binIndex >= numBins) {
// In-order times should be decreasing, and out-of-order times seen after an in-order
// tweet should also be smaller than the first in-order tweet's time. We track these
// occurrences and export them as a stat.
numLargerOutOfBoundsBinsSkipped++;
return -1;
} else if (binIndex < 0) {
// Early termination criteria.
seenOutOfRange++;
} else {
// Reset the counter, since we want to see consecutive tweets that are out of our bin range,
// not single anomalies.
seenOutOfRange = 0;
}
return binIndex;
}
@Override
public void doCollect(long tweetID) throws IOException {
if (searchRequestInfo.isReturnHistogram()) {
final int tweetTime = timeMapper.getTime(curDocId);
final int binIndex = calculateBin(tweetTime);
if (binIndex >= 0) {
for (TermStatistics ts : termStatistics) {
if (!ts.segmentDone) {
countHist(ts, binIndex);
}
}
}
} else {
for (TermStatistics ts : termStatistics) {
if (!ts.segmentDone) {
countNoHist(ts);
}
}
}
}
@Override
public void skipSegment(EarlybirdSingleSegmentSearcher searcher) {
// Do nothing here.
// We skip the accounting that AbstractResultsCollector does for term stats
// requests, because it would otherwise throw off the bin ID calculation.
}
private boolean advance(TermStatistics ts) throws IOException {
PostingsEnum docsEnum = ts.segmentDocsEnum;
if (docsEnum.docID() < curDocId) {
if (docsEnum.advance(curDocId) == DocIdSetIterator.NO_MORE_DOCS) {
ts.segmentDone = true;
return false;
}
}
return docsEnum.docID() == curDocId;
}
private boolean countHist(TermStatistics ts, int bin) throws IOException {
if (ts.term != null && !advance(ts)) {
return false;
}
ts.countHit(bin);
return true;
}
private boolean countNoHist(TermStatistics ts) throws IOException {
if (ts.term != null && !advance(ts)) {
return false;
}
ts.termCount++;
return true;
}
@Override
public EarlyTerminationState innerShouldCollectMore() {
if (readyToTerminate()) {
return setEarlyTerminationState(TERMINATED_TERM_STATS_COUNTING_DONE);
}
return EarlyTerminationState.COLLECTING;
}
/**
* The termination logic is simple - we know what our earliest bin is and once we see a result
* that's before our earliest bin, we terminate.
*
* Our results come with increasing internal doc ids, which should correspond to decreasing
* timestamps. See SEARCH-27729, TWEETYPIE-7031.
*
* We early terminate after we have seen enough tweets that are outside of the bin
* range that we want to return. This way we're not terminating too early because of single tweets
* with wrong timestamps.
*/
@VisibleForTesting
boolean readyToTerminate() {
return this.seenOutOfRange >= SEEN_OUT_OF_RANGE_THRESHOLD;
}
@Override
public TermStatisticsSearchResults doGetResults() {
return new TermStatisticsSearchResults();
}
public final class TermStatisticsSearchResults extends SearchResultsInfo {
public final List<Integer> binIds;
public final Map<ThriftTermRequest, ThriftTermResults> results;
public final int lastCompleteBinId;
public final List<String> termStatisticsDebugInfo;
private TermStatisticsSearchResults() {
// Initialize term stat debug info
termStatisticsDebugInfo = TermStatisticsCollector.this.termStatisticsDebugInfo;
if (termStatistics.length > 0) {
results = new HashMap<>();
if (searchRequestInfo.isReturnHistogram()) {
binIds = new ArrayList<>(numBins);
int minSearchedTime = TermStatisticsCollector.this.getMinSearchedTime();
if (shouldCollectDetailedDebugInfo()) {
termStatisticsDebugInfo.add("minSearchedTime: " + minSearchedTime);
int maxSearchedTime = TermStatisticsCollector.this.getMaxSearchedTime();
termStatisticsDebugInfo.add("maxSearchedTime: " + maxSearchedTime);
}
int lastCompleteBin = -1;
computeFirstBinId(TermStatisticsCollector.this.isSetMinSearchedTime(), minSearchedTime);
trackHistogramResultStats();
// Example:
// minSearchTime = 53s
// binSize = 10
// firstBinId = 5
// numBins = 4
// binId = 5, 6, 7, 8
// binTimeStamp = 50s, 60s, 70s, 80s
for (int i = 0; i < numBins; i++) {
int binId = firstBinID + i;
int binTimeStamp = binId * binSize;
binIds.add(binId);
if (lastCompleteBin == -1 && binTimeStamp > minSearchedTime) {
lastCompleteBin = binId;
}
}
if (!getEarlyTerminationState().isTerminated()) {
// only if we didn't early terminate we can be sure to use the firstBinID as
// lastCompleteBinId
lastCompleteBinId = firstBinID;
if (shouldCollectDetailedDebugInfo()) {
termStatisticsDebugInfo.add("no early termination");
}
} else {
lastCompleteBinId = lastCompleteBin;
if (shouldCollectDetailedDebugInfo()) {
termStatisticsDebugInfo.add(
"early terminated for reason: " + getEarlyTerminationReason());
}
}
if (shouldCollectDetailedDebugInfo()) {
termStatisticsDebugInfo.add("lastCompleteBinId: " + lastCompleteBinId);
}
} else {
binIds = null;
lastCompleteBinId = -1;
}
for (TermStatistics ts : termStatistics) {
ThriftTermResults termResults = new ThriftTermResults().setTotalCount(ts.termCount);
if (searchRequestInfo.isReturnHistogram()) {
List<Integer> list = new ArrayList<>();
for (int count : ts.histogramBins) {
list.add(count);
}
termResults.setHistogramBins(list);
}
results.put(ts.termRequest, termResults);
}
} else {
binIds = null;
results = null;
lastCompleteBinId = -1;
}
}
@Override
public String toString() {
StringBuilder res = new StringBuilder();
res.append("TermStatisticsSearchResults(\n");
if (binIds != null) {
res.append(" binIds=").append(binIds).append("\n");
}
res.append(" lastCompleteBinId=").append(lastCompleteBinId).append("\n");
if (results != null) {
res.append(" results=").append(results).append("\n");
}
res.append(")");
return res.toString();
}
public List<String> getTermStatisticsDebugInfo() {
return termStatisticsDebugInfo;
}
}
/**
* Figure out what the actual firstBinId is for this query.
*/
private void computeFirstBinId(boolean isSetMinSearchedTime, int minSearchedTime) {
if (firstBinID == -1) {
if (!isSetMinSearchedTime) {
// This would only happen if we don't search any segments, which for now we have
// only seen happening if since_time or until_time don't intersect at all with
// the range of the served segments.
firstBinID = 0;
} else {
// Example:
// minSearchedTime = 54
// binSize = 10
// firstBinId = 5
firstBinID = minSearchedTime / binSize;
}
if (shouldCollectDetailedDebugInfo()) {
termStatisticsDebugInfo.add("firstBinId: " + firstBinID);
}
}
}
@VisibleForTesting
int getSeenOutOfRange() {
return seenOutOfRange;
}
private void trackHistogramResultStats() {
if (numLargerOutOfBoundsBinsSkipped > 0) {
TERM_STATS_SKIPPED_LARGER_OUT_OF_BOUNDS_HITS.increment();
}
if (numTimesBinsWereMovedBack > 0) {
TERM_STATS_HISTOGRAM_REQUESTS_WITH_MOVED_BACK_BINS.recordResults(numTimesBinsWereMovedBack);
}
}
}
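
The bin-shifting arithmetic in moveBackTermCounts() above is the subtlest part of this collector. A minimal standalone sketch (not part of this commit) that reproduces the example from its javadoc, with oldFirstBinID = 10 and newFirstBinID = 12:

// Hypothetical demo class; mirrors the moveBackTermCounts() loop above.
public final class MoveBackDemo {
  public static void main(String[] args) {
    int[] histogramBins = {3, 4, 5, 6, 7};
    int moveBackBy = 12 - 10; // newFirstBinID - oldFirstBinID
    int termCount = 0;
    for (int i = 0; i < histogramBins.length; i++) {
      int oldCount = histogramBins[i];
      histogramBins[i] = 0;
      int newIndex = i - moveBackBy; // counts slide toward index 0
      if (newIndex >= 0) {
        histogramBins[newIndex] = oldCount;
        termCount += oldCount;
      }
    }
    // Prints [5, 6, 7, 0, 0] termCount=18: the two oldest bins fell off the front.
    System.out.println(java.util.Arrays.toString(histogramBins) + " termCount=" + termCount);
  }
}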


@@ -1,94 +0,0 @@
package com.twitter.search.earlybird.search.facets;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;
import org.apache.lucene.search.Query;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.url.URLUtils;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.search.SearchRequestInfo;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftTermRequest;
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;
public class TermStatisticsRequestInfo extends SearchRequestInfo {
private static final Set<String> FACET_URL_FIELDS_TO_NORMALIZE = new ImmutableSet.Builder()
.add(EarlybirdFieldConstant.IMAGES_FACET)
.add(EarlybirdFieldConstant.VIDEOS_FACET)
.add(EarlybirdFieldConstant.NEWS_FACET)
.build();
protected final List<ThriftTermRequest> termRequests;
protected final ThriftHistogramSettings histogramSettings;
/**
* Creates a new TermStatisticsRequestInfo instance using the provided query.
*/
public TermStatisticsRequestInfo(ThriftSearchQuery searchQuery,
Query luceneQuery,
ThriftTermStatisticsRequest termStatsRequest,
TerminationTracker terminationTracker) {
super(searchQuery, luceneQuery, terminationTracker);
this.termRequests = termStatsRequest.isSetTermRequests()
? termStatsRequest.getTermRequests() : new LinkedList<>();
this.histogramSettings = termStatsRequest.getHistogramSettings();
if (termStatsRequest.isIncludeGlobalCounts()) {
// Add an empty request to indicate we need a global count across all fields.
termRequests.add(new ThriftTermRequest().setFieldName("").setTerm(""));
}
// We only normalize TEXT terms and URLs. All other terms, e.g. topics (named entities), are
// not normalized. Here the assumption is that the caller passes back the exact terms that
// the facet API returned.
for (ThriftTermRequest termReq : termRequests) {
if (termReq.getTerm().isEmpty()) {
continue; // the special catch-all term.
}
if (!termReq.isSetFieldName()
|| termReq.getFieldName().equals(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())) {
// normalize the TEXT term as it's normalized during ingestion
termReq.setTerm(NormalizerHelper.normalizeWithUnknownLocale(
termReq.getTerm(), EarlybirdConfig.getPenguinVersion()));
} else if (FACET_URL_FIELDS_TO_NORMALIZE.contains(termReq.getFieldName())) {
// remove the trailing slash from the URL path. This operation is idempotent,
// so either a spiderduck URL or a facet URL can be used here. The latter would just
// be normalized twice, which is fine.
termReq.setTerm(URLUtils.normalizePath(termReq.getTerm()));
}
}
}
@Override
protected int calculateMaxHitsToProcess(ThriftSearchQuery searchQuery) {
Preconditions.checkNotNull(searchQuery.getCollectorParams());
if (!searchQuery.getCollectorParams().isSetTerminationParams()
|| !searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
// Override the default value so that all hits are processed.
return Integer.MAX_VALUE;
} else {
return super.calculateMaxHitsToProcess(searchQuery);
}
}
public final List<ThriftTermRequest> getTermRequests() {
return this.termRequests;
}
public final ThriftHistogramSettings getHistogramSettings() {
return this.histogramSettings;
}
public final boolean isReturnHistogram() {
return this.histogramSettings != null;
}
}
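
The double-normalization safety claimed in the comment above rests on normalizePath being idempotent. A sketch of the assumed behavior (the real URLUtils.normalizePath may handle more than a trailing slash):

// Hypothetical stand-in for URLUtils.normalizePath; illustration only.
static String normalizePath(String url) {
  return url.endsWith("/") ? url.substring(0, url.length() - 1) : url;
}
// normalizePath("http://example.com/a/") -> "http://example.com/a"
// normalizePath("http://example.com/a")  -> "http://example.com/a" (a no-op, so
// normalizing an already-normalized facet URL a second time is harmless)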


@@ -1,41 +0,0 @@
package com.twitter.search.earlybird.search.facets;
import java.io.IOException;
import com.google.common.base.Preconditions;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.facets.CSFFacetCountIterator;
import com.twitter.search.core.earlybird.facets.FacetCountIterator;
import com.twitter.search.core.earlybird.facets.FacetCountIteratorFactory;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
/**
* Factory of {@link FacetCountIterator} instances for tweet search.
* It provides a special iterator for the retweets facet.
*/
public final class TweetSearchFacetCountIteratorFactory extends FacetCountIteratorFactory {
public static final TweetSearchFacetCountIteratorFactory FACTORY =
new TweetSearchFacetCountIteratorFactory();
private TweetSearchFacetCountIteratorFactory() {
}
@Override
public FacetCountIterator getFacetCountIterator(
EarlybirdIndexSegmentAtomicReader reader,
Schema.FieldInfo fieldInfo) throws IOException {
Preconditions.checkNotNull(reader);
Preconditions.checkNotNull(fieldInfo);
Preconditions.checkArgument(fieldInfo.getFieldType().isUseCSFForFacetCounting());
String facetName = fieldInfo.getFieldType().getFacetName();
if (EarlybirdFieldConstant.RETWEETS_FACET.equals(facetName)) {
return new RetweetFacetCountIterator(reader, fieldInfo);
} else {
return new CSFFacetCountIterator(reader, fieldInfo);
}
}
}


@@ -1,115 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
public final class BadUserRepFilter extends Query {
/**
* Creates a query that filters out results coming from users with bad reputation.
*
* @param minTweepCred The lowest acceptable user reputation.
* @return A query that filters out results from bad reputation users.
*/
public static Query getBadUserRepFilter(int minTweepCred) {
if (minTweepCred <= 0) {
return null;
}
return new BooleanQuery.Builder()
.add(new BadUserRepFilter(minTweepCred), BooleanClause.Occur.FILTER)
.build();
}
private final int minTweepCred;
private BadUserRepFilter(int minTweepCred) {
this.minTweepCred = minTweepCred;
}
@Override
public int hashCode() {
return minTweepCred;
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof BadUserRepFilter)) {
return false;
}
return minTweepCred == BadUserRepFilter.class.cast(obj).minTweepCred;
}
@Override
public String toString(String field) {
return "BadUserRepFilter:" + minTweepCred;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
return new AllDocsIterator(reader);
}
return new BadUserExcludeDocIdSetIterator(
(EarlybirdIndexSegmentAtomicReader) context.reader(), minTweepCred);
}
};
}
private static final class BadUserExcludeDocIdSetIterator extends RangeFilterDISI {
private final NumericDocValues userReputationDocValues;
private final int minTweepCred;
BadUserExcludeDocIdSetIterator(EarlybirdIndexSegmentAtomicReader indexReader,
int minTweepCred) throws IOException {
super(indexReader);
this.userReputationDocValues =
indexReader.getNumericDocValues(EarlybirdFieldConstant.USER_REPUTATION.getFieldName());
this.minTweepCred = minTweepCred;
}
@Override
public boolean shouldReturnDoc() throws IOException {
// We need this explicit casting to byte, because of how we encode and decode features in our
// encoded_tweet_features field. If a feature is an int (uses all 32 bits of the int), then
// encoding the feature and then decoding it preserves its original value. However, if the
// feature does not use the entire int (and especially if it uses bits somewhere in the middle
// of the int), then the feature value is assumed to be unsigned when it goes through this
// process of encoding and decoding. So a user rep of
// RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL (-128) will be correctly encoded as the
// binary value 10000000, but will be treated as an unsigned value when decoded, and therefore
// the decoded value will be 128.
//
// In retrospect, this seems like a really poor design decision. It seems like it would be
// better if all feature values were considered to be signed, even if most features can never
// have negative values. Unfortunately, making this change is not easy, because some features
// store normalized values, so we would also need to change the range of allowed values
// produced by those normalizers, as well as all code that depends on those values.
//
// So for now, just cast this value to a byte, to get the proper negative value.
return userReputationDocValues.advanceExact(docID())
&& ((byte) userReputationDocValues.longValue() >= minTweepCred);
}
}
}
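
The signed/unsigned round-trip described in the long comment above is easy to reproduce in isolation. A standalone sketch (the minTweepCred of 1 is chosen purely for illustration):

// Hypothetical demo; not part of this commit.
public final class ByteCastDemo {
  public static void main(String[] args) {
    long decoded = 128L;  // unsigned decode of the bit pattern 10000000
    int minTweepCred = 1; // illustrative threshold
    System.out.println(decoded >= minTweepCred); // true: the sentinel would wrongly pass
    byte signed = (byte) decoded;                // reinterpret the low 8 bits as signed
    System.out.println(signed);                  // -128, the unset-reputation sentinel
    System.out.println(signed >= minTweepCred);  // false: the cast restores correct filtering
  }
}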


@@ -1,87 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Objects;
import java.util.Set;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
/**
* CSFDisjunctionFilter provides an efficient mechanism to query for documents that have a
* long CSF equal to one of the provided values.
*/
public final class CSFDisjunctionFilter extends Query {
private final String csfField;
private final Set<Long> values;
public static Query getCSFDisjunctionFilter(String csfField, Set<Long> values) {
return new BooleanQuery.Builder()
.add(new CSFDisjunctionFilter(csfField, values), BooleanClause.Occur.FILTER)
.build();
}
private CSFDisjunctionFilter(String csfField, Set<Long> values) {
this.csfField = csfField;
this.values = values;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
return new CSFDisjunctionFilterDISI(context.reader(), csfField, values);
}
};
}
@Override
public int hashCode() {
return (csfField == null ? 0 : csfField.hashCode()) * 17
+ (values == null ? 0 : values.hashCode());
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof CSFDisjunctionFilter)) {
return false;
}
CSFDisjunctionFilter filter = CSFDisjunctionFilter.class.cast(obj);
return Objects.equals(csfField, filter.csfField) && Objects.equals(values, filter.values);
}
@Override
public String toString(String field) {
return "CSFDisjunctionFilter:" + csfField + ",count:" + values.size();
}
private static final class CSFDisjunctionFilterDISI extends RangeFilterDISI {
private final NumericDocValues docValues;
private final Set<Long> values;
private CSFDisjunctionFilterDISI(LeafReader reader, String csfField, Set<Long> values)
throws IOException {
super(reader);
this.values = values;
this.docValues = reader.getNumericDocValues(csfField);
}
@Override
protected boolean shouldReturnDoc() throws IOException {
return docValues.advanceExact(docID()) && values.contains(docValues.longValue());
}
}
}
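
A hypothetical usage sketch; the CSF name and ID values are invented for illustration:

// Keep only documents whose author-ID CSF equals one of the given values.
Query filter = CSFDisjunctionFilter.getCSFDisjunctionFilter(
    "from_user_id_csf", // assumed field name
    com.google.common.collect.ImmutableSet.of(12L, 34L, 56L));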


@@ -1,195 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Objects;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
/**
* Filters tweets according to the specified CSF field value.
* Note that min value is inclusive, and max value is exclusive.
*/
public final class DocValRangeFilter extends Query {
private final String csfField;
private final ThriftCSFType csfFieldType;
private final Number minValInclusive;
private final Number maxValExclusive;
/**
* Returns a query that filters hits based on the value of a CSF.
*
* @param csfField The CSF name.
* @param csfFieldType The CSF type.
* @param minVal The minimum acceptable value (inclusive).
* @param maxVal The maximum acceptable value (exclusive).
* @return A query that filters hits based on the value of a CSF.
*/
public static Query getDocValRangeQuery(String csfField, ThriftCSFType csfFieldType,
double minVal, double maxVal) {
return new BooleanQuery.Builder()
.add(new DocValRangeFilter(csfField, csfFieldType, minVal, maxVal),
BooleanClause.Occur.FILTER)
.build();
}
/**
* Returns a query that filters hits based on the value of a CSF.
*
* @param csfField The CSF name.
* @param csfFieldType The CSF type.
* @param minVal The minimum acceptable value (inclusive).
* @param maxVal The maximum acceptable value (exclusive).
* @return A query that filters hits based on the value of a CSF.
*/
public static Query getDocValRangeQuery(String csfField, ThriftCSFType csfFieldType,
long minVal, long maxVal) {
return new BooleanQuery.Builder()
.add(new DocValRangeFilter(csfField, csfFieldType, minVal, maxVal),
BooleanClause.Occur.FILTER)
.build();
}
private DocValRangeFilter(String csfField, ThriftCSFType csfFieldType,
double minVal, double maxVal) {
this.csfField = csfField;
this.csfFieldType = csfFieldType;
this.minValInclusive = new Float(minVal);
this.maxValExclusive = new Float(maxVal);
}
private DocValRangeFilter(String csfField, ThriftCSFType csfFieldType,
long minVal, long maxVal) {
this.csfField = csfField;
this.csfFieldType = csfFieldType;
this.minValInclusive = new Long(minVal);
this.maxValExclusive = new Long(maxVal);
}
@Override
public int hashCode() {
return (csfField == null ? 0 : csfField.hashCode()) * 29
+ (csfFieldType == null ? 0 : csfFieldType.hashCode()) * 17
+ minValInclusive.hashCode() * 7
+ maxValExclusive.hashCode();
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof DocValRangeFilter)) {
return false;
}
DocValRangeFilter filter = DocValRangeFilter.class.cast(obj);
return Objects.equals(csfField, filter.csfField)
&& (csfFieldType == filter.csfFieldType)
&& minValInclusive.equals(filter.minValInclusive)
&& maxValExclusive.equals(filter.maxValExclusive);
}
@Override
public String toString(String field) {
return "DocValRangeFilter:" + csfField
+ ",type:" + csfFieldType.toString()
+ ",min:" + this.minValInclusive.toString()
+ ",max:" + this.maxValExclusive.toString();
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
if (csfFieldType == null) {
return new AllDocsIterator(reader);
}
int smallestDoc = (reader instanceof EarlybirdIndexSegmentAtomicReader)
? ((EarlybirdIndexSegmentAtomicReader) reader).getSmallestDocID() : 0;
int largestDoc = reader.maxDoc() - 1;
return new CSFRangeDocIdSetIterator(reader, csfField, csfFieldType,
smallestDoc, largestDoc,
minValInclusive, maxValExclusive);
}
};
}
private static final class CSFRangeDocIdSetIterator extends RangeFilterDISI {
private final NumericDocValues numericDocValues;
private final ThriftCSFType csfType;
private final Number minValInclusive;
private final Number maxValExclusive;
public CSFRangeDocIdSetIterator(LeafReader reader,
String csfField,
ThriftCSFType csfType,
int smallestDocID,
int largestDocID,
Number minValInclusive,
Number maxValExclusive) throws IOException {
super(reader, smallestDocID, largestDocID);
this.numericDocValues = reader.getNumericDocValues(csfField);
this.csfType = csfType;
this.minValInclusive = minValInclusive;
this.maxValExclusive = maxValExclusive;
}
@Override
protected boolean shouldReturnDoc() throws IOException {
if (!numericDocValues.advanceExact(docID())) {
return false;
}
long val = numericDocValues.longValue();
switch (csfType) {
case DOUBLE:
double doubleVal = Double.longBitsToDouble(val);
return doubleVal >= minValInclusive.doubleValue()
&& doubleVal < maxValExclusive.doubleValue();
case FLOAT:
float floatVal = Float.intBitsToFloat((int) val);
return floatVal >= minValInclusive.doubleValue()
&& floatVal < maxValExclusive.doubleValue();
case LONG:
return val >= minValInclusive.longValue() && val < maxValExclusive.longValue();
case INT:
return val >= minValInclusive.longValue() && (int) val < maxValExclusive.longValue();
case BYTE:
return (byte) val >= minValInclusive.longValue()
&& (byte) val < maxValExclusive.longValue();
default:
return false;
}
}
}
//////////////////////////
// for unit tests only
//////////////////////////
@VisibleForTesting
public Number getMinValForTest() {
return minValInclusive;
}
@VisibleForTesting
public Number getMaxValForTest() {
return maxValExclusive;
}
}
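
The switch in shouldReturnDoc() above works because every CSF value arrives as a raw long, and the declared ThriftCSFType decides how those bits are reinterpreted. A small sketch of the JDK round-trips involved:

// Bit-level reinterpretation used by the DOUBLE and FLOAT cases above.
long doubleBits = Double.doubleToLongBits(2.5);
double d = Double.longBitsToDouble(doubleBits); // 2.5 again

int floatBits = Float.floatToIntBits(2.5f);
float f = Float.intBitsToFloat(floatBits);      // 2.5f again

// The LONG, INT, and BYTE cases simply narrow the raw long, as in the switch.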


@@ -1,113 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
public final class FeatureValueInAcceptListOrUnsetFilter extends Query {
private final String featureName;
private final Set<Long> idsAcceptList;
/**
* Creates a query that filters for hits that have the given feature unset, or that have the
* given feature set to a value in the given list of IDs.
*
* @param featureName The feature.
* @param ids A list of id values this filter will accept for the given feature.
* @return A query that keeps only the hits that have the given feature unset, or set to one
* of the values in the accept list.
*/
public static Query getFeatureValueInAcceptListOrUnsetFilter(String featureName, Set<Long> ids) {
return new BooleanQuery.Builder()
.add(new FeatureValueInAcceptListOrUnsetFilter(featureName, ids),
BooleanClause.Occur.FILTER)
.build();
}
@Override
public String toString(String s) {
return String.format("FeatureValueInAcceptListOrUnsetFilter(%s, AcceptList = (%s))",
featureName,
idsAcceptList);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof FeatureValueInAcceptListOrUnsetFilter)) {
return false;
}
FeatureValueInAcceptListOrUnsetFilter filter =
FeatureValueInAcceptListOrUnsetFilter.class.cast(obj);
return featureName.equals(filter.featureName) && idsAcceptList.equals(filter.idsAcceptList);
}
@Override
public int hashCode() {
return featureName.hashCode() * 7 + idsAcceptList.hashCode();
}
private FeatureValueInAcceptListOrUnsetFilter(String featureName, Set<Long> ids) {
this.featureName = Preconditions.checkNotNull(featureName);
this.idsAcceptList = Preconditions.checkNotNull(ids);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
return new FeatureValueInAcceptListOrUnsetDocIdSetIterator(
context.reader(), featureName, idsAcceptList);
}
};
}
private static final class FeatureValueInAcceptListOrUnsetDocIdSetIterator
extends RangeFilterDISI {
private final NumericDocValues featureDocValues;
private final Set<Long> idsAcceptList;
FeatureValueInAcceptListOrUnsetDocIdSetIterator(
LeafReader indexReader, String featureName, Set<Long> ids) throws IOException {
super(indexReader);
this.featureDocValues = indexReader.getNumericDocValues(featureName);
this.idsAcceptList = ids;
}
@Override
public boolean shouldReturnDoc() throws IOException {
// If featureDocValues is null, that means there were no documents indexed with the given
// field in the current segment.
//
// The advanceExact() method returns false if it cannot find the given docId in the
// NumericDocValues instance. So if advanceExact() returns false then we know the feature is
// unset.
// However, for realtime Earlybirds we have a custom implementation of NumericDocValues,
// ColumnStrideFieldDocValues, which will contain an entry for every indexed docId and use a
// value of 0 to indicate that a feature is unset.
//
// So to check if a feature is unset for a given docId, we first need to check if we can find
// the docId, and then we additionally need to check if the feature value is 0.
return featureDocValues == null
|| !featureDocValues.advanceExact(docID())
|| featureDocValues.longValue() == 0
|| idsAcceptList.contains(featureDocValues.longValue());
}
}
}
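
Flattening the comment above into early returns may make the three "unset or accepted" cases easier to follow; a restatement of ours, not an API of this class:

// Equivalent sketch of shouldReturnDoc() above (NumericDocValues and Set<Long>
// as in the surrounding class).
static boolean unsetOrAccepted(NumericDocValues dv, int docId, Set<Long> acceptList)
    throws IOException {
  if (dv == null) {
    return true;               // field never indexed in this segment
  }
  if (!dv.advanceExact(docId)) {
    return true;               // no value recorded for this doc
  }
  long value = dv.longValue();
  return value == 0            // realtime CSFs encode "unset" as 0
      || acceptList.contains(value);
}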


@@ -1,255 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Set;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
public class GeoTwoPhaseQuery extends Query {
private static final boolean ENABLE_GEO_EARLY_TERMINATION =
EarlybirdConfig.getBool("early_terminate_geo_searches", true);
private static final int GEO_TIMEOUT_OVERRIDE =
EarlybirdConfig.getInt("early_terminate_geo_searches_timeout_override", -1);
// How many geo searches are early terminated due to timeout.
private static final SearchCounter GEO_SEARCH_TIMEOUT_COUNT =
SearchCounter.export("geo_search_timeout_count");
private final SecondPhaseDocAccepter accepter;
private final TerminationTracker terminationTracker;
private final ConstantScoreQuery query;
public GeoTwoPhaseQuery(
Query query, SecondPhaseDocAccepter accepter, TerminationTracker terminationTracker) {
this.accepter = accepter;
this.terminationTracker = terminationTracker;
this.query = new ConstantScoreQuery(query);
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = query.getQuery().rewrite(reader);
if (rewritten != query.getQuery()) {
return new GeoTwoPhaseQuery(rewritten, accepter, terminationTracker);
}
return this;
}
@Override
public int hashCode() {
return query.hashCode();
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof GeoTwoPhaseQuery)) {
return false;
}
GeoTwoPhaseQuery that = (GeoTwoPhaseQuery) obj;
return query.equals(that.query)
&& accepter.equals(that.accepter)
&& terminationTracker.equals(that.terminationTracker);
}
@Override
public String toString(String field) {
return new StringBuilder("GeoTwoPhaseQuery(")
.append("Accepter(")
.append(accepter.toString())
.append(") Geohashes(")
.append(query.getQuery().toString(field))
.append("))")
.toString();
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
Weight innerWeight = query.createWeight(searcher, scoreMode, boost);
return new GeoTwoPhaseWeight(this, innerWeight, accepter, terminationTracker);
}
private static final class GeoTwoPhaseWeight extends Weight {
private final Weight innerWeight;
private final SecondPhaseDocAccepter accepter;
private final TerminationTracker terminationTracker;
private GeoTwoPhaseWeight(
Query query,
Weight innerWeight,
SecondPhaseDocAccepter accepter,
TerminationTracker terminationTracker) {
super(query);
this.innerWeight = innerWeight;
this.accepter = accepter;
this.terminationTracker = terminationTracker;
}
@Override
public void extractTerms(Set<Term> terms) {
innerWeight.extractTerms(terms);
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
return innerWeight.explain(context, doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Scorer innerScorer = innerWeight.scorer(context);
if (innerScorer == null) {
return null;
}
if (ENABLE_GEO_EARLY_TERMINATION
&& (terminationTracker == null || !terminationTracker.useLastSearchedDocIdOnTimeout())) {
innerScorer = new ConstantScoreScorer(
this,
0.0f,
ScoreMode.COMPLETE_NO_SCORES,
new TimedDocIdSetIterator(innerScorer.iterator(),
terminationTracker,
GEO_TIMEOUT_OVERRIDE,
GEO_SEARCH_TIMEOUT_COUNT));
}
accepter.initialize(context);
return new GeoTwoPhaseScorer(this, innerScorer, accepter);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return innerWeight.isCacheable(ctx);
}
}
private static final class GeoTwoPhaseScorer extends Scorer {
private final Scorer innerScorer;
private final SecondPhaseDocAccepter accepter;
private GeoTwoPhaseScorer(Weight weight, Scorer innerScorer, SecondPhaseDocAccepter accepter) {
super(weight);
this.innerScorer = innerScorer;
this.accepter = accepter;
}
@Override
public TwoPhaseIterator twoPhaseIterator() {
return new TwoPhaseIterator(innerScorer.iterator()) {
@Override
public boolean matches() throws IOException {
return checkDocExpensive(innerScorer.docID());
}
@Override
public float matchCost() {
return 0.0f;
}
};
}
@Override
public int docID() {
return iterator().docID();
}
@Override
public float score() throws IOException {
return innerScorer.score();
}
@Override
public DocIdSetIterator iterator() {
return new DocIdSetIterator() {
private int doNext(int startingDocId) throws IOException {
int docId = startingDocId;
while ((docId != NO_MORE_DOCS) && !checkDocExpensive(docId)) {
docId = innerScorer.iterator().nextDoc();
}
return docId;
}
@Override
public int docID() {
return innerScorer.iterator().docID();
}
@Override
public int nextDoc() throws IOException {
return doNext(innerScorer.iterator().nextDoc());
}
@Override
public int advance(int target) throws IOException {
return doNext(innerScorer.iterator().advance(target));
}
@Override
public long cost() {
return 2 * innerScorer.iterator().cost();
}
};
}
@Override
public float getMaxScore(int upTo) throws IOException {
return innerScorer.getMaxScore(upTo);
}
private boolean checkDocExpensive(int doc) throws IOException {
return accepter.accept(doc);
}
}
public abstract static class SecondPhaseDocAccepter {
/**
* Initializes this accepter with the given reader context.
*/
public abstract void initialize(LeafReaderContext context) throws IOException;
/**
* Determines if the given doc ID is accepted by this accepter.
*/
public abstract boolean accept(int doc) throws IOException;
/**
* Returns a string description for this SecondPhaseDocAccepter instance.
*/
public abstract String toString();
}
public static final SecondPhaseDocAccepter ALL_DOCS_ACCEPTER = new SecondPhaseDocAccepter() {
@Override
public void initialize(LeafReaderContext context) { }
@Override
public boolean accept(int doc) {
return true;
}
@Override
public String toString() {
return "AllDocsAccepter";
}
};
}
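
GeoTwoPhaseScorer above follows Lucene's standard two-phase shape: a cheap approximation iterator plus an expensive matches() check that runs only on approximated hits. A generic sketch of the pattern (approximationIterator and isDocInsideArea are placeholders for the geohash iterator and the SecondPhaseDocAccepter call):

TwoPhaseIterator twoPhase = new TwoPhaseIterator(approximationIterator) {
  @Override
  public boolean matches() throws IOException {
    // Called only for docs that already passed the cheap first phase.
    return isDocInsideArea(approximation.docID());
  }
  @Override
  public float matchCost() {
    return 100f; // rough per-doc cost estimate; Lucene uses it to order checks
  }
};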


@@ -1,44 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RamUsageEstimator;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
public final class MatchAllDocIdSet extends DocIdSet {
private final LeafReader reader;
public MatchAllDocIdSet(LeafReader reader) {
this.reader = reader;
}
@Override
public DocIdSetIterator iterator() throws IOException {
return new AllDocsIterator(reader);
}
@Override
public Bits bits() throws IOException {
return new Bits() {
@Override
public boolean get(int index) {
return true;
}
@Override
public int length() {
return reader.maxDoc();
}
};
}
@Override
public long ramBytesUsed() {
return RamUsageEstimator.shallowSizeOf(this);
}
}


@@ -1,91 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Set;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher;
/**
* A MatchAllDocsQuery implementation that does not assume that doc IDs are assigned sequentially.
* Instead, it wraps the EarlybirdIndexSegmentAtomicReader in a RangeFilterDISI, and uses
* this iterator to traverse only the valid doc IDs in this segment.
*
* Note that org.apache.lucene.search.MatchAllDocsQuery is final, so we cannot extend it.
*/
public class MatchAllDocsQuery extends Query {
private static class MatchAllDocsWeight extends Weight {
private final Weight luceneWeight;
public MatchAllDocsWeight(Query query, Weight luceneWeight) {
super(query);
this.luceneWeight = luceneWeight;
}
@Override
public void extractTerms(Set<Term> terms) {
luceneWeight.extractTerms(terms);
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
return luceneWeight.explain(context, doc);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Preconditions.checkState(context.reader() instanceof EarlybirdIndexSegmentAtomicReader,
"Expected an EarlybirdIndexSegmentAtomicReader, but got a "
+ context.reader().getClass().getName() + " instance.");
EarlybirdIndexSegmentAtomicReader reader =
(EarlybirdIndexSegmentAtomicReader) context.reader();
return new ConstantScoreScorer(
this, 1.0f, ScoreMode.COMPLETE_NO_SCORES, new RangeFilterDISI(reader));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return luceneWeight.isCacheable(ctx);
}
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
org.apache.lucene.search.MatchAllDocsQuery luceneMatchAllDocsQuery =
new org.apache.lucene.search.MatchAllDocsQuery();
Weight luceneWeight = luceneMatchAllDocsQuery.createWeight(searcher, scoreMode, boost);
if (!(searcher instanceof EarlybirdSingleSegmentSearcher)) {
return luceneWeight;
}
return new MatchAllDocsWeight(this, luceneWeight);
}
@Override
public int hashCode() {
return 0;
}
@Override
public boolean equals(Object obj) {
return obj instanceof MatchAllDocsQuery;
}
// Copied from org.apache.lucene.search.MatchAllDocsWeight
@Override
public String toString(String field) {
return "*:*";
}
}


@@ -1,131 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.search.IntArrayDocIdSetIterator;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.earlybird.index.TweetIDMapper;
public final class RequiredStatusIDsFilter extends Query {
private final Collection<Long> statusIDs;
public static Query getRequiredStatusIDsQuery(Collection<Long> statusIDs) {
return new BooleanQuery.Builder()
.add(new RequiredStatusIDsFilter(statusIDs), BooleanClause.Occur.FILTER)
.build();
}
private RequiredStatusIDsFilter(Collection<Long> statusIDs) {
this.statusIDs = Preconditions.checkNotNull(statusIDs);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader leafReader = context.reader();
if (!(leafReader instanceof EarlybirdIndexSegmentAtomicReader)) {
return DocIdSetIterator.empty();
}
EarlybirdIndexSegmentAtomicReader reader = (EarlybirdIndexSegmentAtomicReader) leafReader;
TweetIDMapper idMapper = (TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
int docIdsSize = 0;
int[] docIds = new int[statusIDs.size()];
for (long statusID : statusIDs) {
int docId = idMapper.getDocID(statusID);
if (docId >= 0) {
docIds[docIdsSize++] = docId;
}
}
Arrays.sort(docIds, 0, docIdsSize);
DocIdSetIterator statusesDISI =
new IntArrayDocIdSetIterator(Arrays.copyOf(docIds, docIdsSize));
DocIdSetIterator allDocsDISI = new AllDocsIterator(reader);
// We only want to return IDs for fully indexed documents. So we need to make sure that
// every doc ID we return exists in allDocsDISI. However, allDocsDISI has all documents in
// this segment, so driving by allDocsDISI would be very slow. So we want to drive by
// statusesDISI, and use allDocsDISI as a post-filter. What this comes down to is that we do
// not want to call allDocsDISI.nextDoc(); we only want to call allDocsDISI.advance(), and
// only on the doc IDs returned by statusesDISI.
return new DocIdSetIterator() {
@Override
public int docID() {
return statusesDISI.docID();
}
@Override
public int nextDoc() throws IOException {
statusesDISI.nextDoc();
return advanceToNextFullyIndexedDoc();
}
@Override
public int advance(int target) throws IOException {
statusesDISI.advance(target);
return advanceToNextFullyIndexedDoc();
}
private int advanceToNextFullyIndexedDoc() throws IOException {
while (docID() != DocIdSetIterator.NO_MORE_DOCS) {
// Check if the current doc is fully indexed.
// If it is, then we can return it. If it's not, then we need to keep searching.
int allDocsDocId = allDocsDISI.advance(docID());
if (allDocsDocId == docID()) {
break;
}
statusesDISI.advance(allDocsDocId);
}
return docID();
}
@Override
public long cost() {
return statusesDISI.cost();
}
};
}
};
}
@Override
public int hashCode() {
return statusIDs.hashCode();
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof RequiredStatusIDsFilter)) {
return false;
}
RequiredStatusIDsFilter filter = RequiredStatusIDsFilter.class.cast(obj);
return statusIDs.equals(filter.statusIDs);
}
@Override
public final String toString(String field) {
return String.format("RequiredStatusIDs[%s]", statusIDs);
}
}
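
The anonymous iterator above is the classic "drive by the sparse side" intersection its comment describes. A self-contained sketch of the same leapfrog pattern over two org.apache.lucene.search.DocIdSetIterator instances:

// Sketch only: intersect a sparse iterator with a dense one without ever
// calling nextDoc() on the dense side.
static int nextMatch(DocIdSetIterator sparse, DocIdSetIterator dense) throws IOException {
  int doc = sparse.nextDoc();
  while (doc != DocIdSetIterator.NO_MORE_DOCS) {
    int denseDoc = dense.advance(doc); // cheap forward skip in the dense set
    if (denseDoc == doc) {
      return doc;                      // present on both sides: a match
    }
    if (denseDoc == DocIdSetIterator.NO_MORE_DOCS) {
      return DocIdSetIterator.NO_MORE_DOCS;
    }
    doc = sparse.advance(denseDoc);    // leapfrog the sparse side forward
  }
  return doc;
}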


@@ -1,86 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
/**
* A version of a term query that we can use when we already know the term id (in cases where we
* previously looked it up) and have a TermsEnum to get the actual postings.
*
* This can be used for constant score queries, where only iterating over the postings is required.
*/
class SimpleTermQuery extends Query {
private final TermsEnum termsEnum;
private final long termId;
public SimpleTermQuery(TermsEnum termsEnum, long termId) {
this.termsEnum = termsEnum;
this.termId = termId;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
return new SimpleTermQueryWeight(scoreMode);
}
@Override
public int hashCode() {
return (termsEnum == null ? 0 : termsEnum.hashCode()) * 13 + (int) termId;
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof SimpleTermQuery)) {
return false;
}
SimpleTermQuery query = SimpleTermQuery.class.cast(obj);
return (termsEnum == null ? query.termsEnum == null : termsEnum.equals(query.termsEnum))
&& (termId == query.termId);
}
@Override
public String toString(String field) {
return "SimpleTermQuery(" + field + ":" + termId + ")";
}
private class SimpleTermQueryWeight extends ConstantScoreWeight {
private final ScoreMode scoreMode;
public SimpleTermQueryWeight(ScoreMode scoreMode) {
super(SimpleTermQuery.this, 1.0f);
this.scoreMode = scoreMode;
}
@Override
public String toString() {
return "weight(" + SimpleTermQuery.this + ")";
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
termsEnum.seekExact(termId);
PostingsEnum docs = termsEnum.postings(
null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE);
assert docs != null;
return new ConstantScoreScorer(this, 0, scoreMode, docs);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
}
}
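
A minimal usage sketch for same-package callers (UserIdMultiSegmentQuery below uses exactly this pattern): resolve the term once up front, then hand the positioned TermsEnum and its ordinal to SimpleTermQuery so the scorer can seekExact by ord instead of repeating the dictionary lookup. The leafReader variable, field name, and ID value are illustrative.

Terms terms = leafReader.terms("from_user_id"); // illustrative field name
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.seekExact(LongTermAttributeImpl.copyIntoNewBytesRef(12345L))) {
    Query query = new SimpleTermQuery(termsEnum, termsEnum.ord()); // reuse the resolved ord
    // ... add 'query' to a BooleanQuery as a constant-score clause
  }
}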

View File

@ -1,211 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.index.TweetIDMapper;
/**
* Filters tweet ids according to since_id and max_id parameter.
*
* Note that since_id is exclusive and max_id is inclusive.
*/
public final class SinceMaxIDFilter extends Query {
public static final long NO_FILTER = -1;
private final long sinceIdExclusive;
private final long maxIdInclusive;
public static Query getSinceMaxIDQuery(long sinceIdExclusive, long maxIdInclusive) {
return new BooleanQuery.Builder()
.add(new SinceMaxIDFilter(sinceIdExclusive, maxIdInclusive), BooleanClause.Occur.FILTER)
.build();
}
public static Query getSinceIDQuery(long sinceIdExclusive) {
return new BooleanQuery.Builder()
.add(new SinceMaxIDFilter(sinceIdExclusive, NO_FILTER), BooleanClause.Occur.FILTER)
.build();
}
public static Query getMaxIDQuery(long maxIdInclusive) {
return new BooleanQuery.Builder()
.add(new SinceMaxIDFilter(NO_FILTER, maxIdInclusive), BooleanClause.Occur.FILTER)
.build();
}
private SinceMaxIDFilter(long sinceIdExclusive, long maxIdInclusive) {
this.sinceIdExclusive = sinceIdExclusive;
this.maxIdInclusive = maxIdInclusive;
}
@Override
public int hashCode() {
return (int) (sinceIdExclusive * 13 + maxIdInclusive);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof SinceMaxIDFilter)) {
return false;
}
SinceMaxIDFilter filter = SinceMaxIDFilter.class.cast(obj);
return (sinceIdExclusive == filter.sinceIdExclusive)
&& (maxIdInclusive == filter.maxIdInclusive);
}
@Override
public String toString(String field) {
if (sinceIdExclusive != NO_FILTER && maxIdInclusive != NO_FILTER) {
return "SinceIdFilter:" + sinceIdExclusive + ",MaxIdFilter:" + maxIdInclusive;
} else if (maxIdInclusive != NO_FILTER) {
return "MaxIdFilter:" + maxIdInclusive;
} else {
return "SinceIdFilter:" + sinceIdExclusive;
}
}
/**
* Determines if this segment is at least partially covered by the given tweet ID range.
*/
public static boolean sinceMaxIDsInRange(
TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
// Check for since id out of range. Note that since this ID is exclusive,
// equality is out of range too.
if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMaxTweetID()) {
return false;
}
// Check for max id in range.
return maxIdInclusive == NO_FILTER || maxIdInclusive >= tweetIdMapper.getMinTweetID();
}
// Returns true if this segment is completely covered by these id filters.
private static boolean sinceMaxIdsCoverRange(
TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
// Check for since_id specified AND since_id newer than the first tweet.
if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMinTweetID()) {
return false;
}
// Check for max id in range.
return maxIdInclusive == NO_FILTER || maxIdInclusive > tweetIdMapper.getMaxTweetID();
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
return new AllDocsIterator(reader);
}
EarlybirdIndexSegmentAtomicReader twitterInMemoryIndexReader =
(EarlybirdIndexSegmentAtomicReader) reader;
TweetIDMapper tweetIdMapper =
(TweetIDMapper) twitterInMemoryIndexReader.getSegmentData().getDocIDToTweetIDMapper();
// Important to return a null DocIdSetIterator here, so the Scorer will skip searching
// this segment completely.
if (!sinceMaxIDsInRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
return null;
}
// Optimization: just return a match-all iterator when the whole segment is in range.
// This avoids having to do so many status id lookups.
if (sinceMaxIdsCoverRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
return new AllDocsIterator(reader);
}
return new SinceMaxIDDocIdSetIterator(
twitterInMemoryIndexReader, sinceIdExclusive, maxIdInclusive);
}
};
}
@VisibleForTesting
static class SinceMaxIDDocIdSetIterator extends RangeFilterDISI {
private final DocIDToTweetIDMapper docIdToTweetIdMapper;
private final long sinceIdExclusive;
private final long maxIdInclusive;
public SinceMaxIDDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
long sinceIdExclusive,
long maxIdInclusive) throws IOException {
super(reader,
findMaxIdDocID(reader, maxIdInclusive),
findSinceIdDocID(reader, sinceIdExclusive));
this.docIdToTweetIdMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
this.sinceIdExclusive = sinceIdExclusive; // sinceStatusId == NO_FILTER is OK, it's exclusive
this.maxIdInclusive = maxIdInclusive != NO_FILTER ? maxIdInclusive : Long.MAX_VALUE;
}
/**
* This is a necessary check when we have out of order tweets in the archive.
* When tweets are out of order, this guarantees that no false positive results are returned.
* I.e. we can still miss some tweets in the specified range, but we never incorrectly return
* anything that's not in the range.
*/
@Override
protected boolean shouldReturnDoc() {
final long statusID = docIdToTweetIdMapper.getTweetID(docID());
return statusID > sinceIdExclusive && statusID <= maxIdInclusive;
}
private static int findSinceIdDocID(
EarlybirdIndexSegmentAtomicReader reader, long sinceIdExclusive) throws IOException {
TweetIDMapper tweetIdMapper =
(TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
if (sinceIdExclusive != SinceMaxIDFilter.NO_FILTER) {
// We use this as an upper bound on the search, so we want to find the highest possible
// doc ID for this tweet ID.
boolean findMaxDocID = true;
return tweetIdMapper.findDocIdBound(
sinceIdExclusive,
findMaxDocID,
reader.getSmallestDocID(),
reader.maxDoc() - 1);
} else {
return DocIDToTweetIDMapper.ID_NOT_FOUND;
}
}
private static int findMaxIdDocID(
EarlybirdIndexSegmentAtomicReader reader, long maxIdInclusive) throws IOException {
TweetIDMapper tweetIdMapper =
(TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
if (maxIdInclusive != SinceMaxIDFilter.NO_FILTER) {
// We use this as a lower bound on the search, so we want to find the lowest possible
// doc ID for this tweet ID.
boolean findMaxDocID = false;
return tweetIdMapper.findDocIdBound(
maxIdInclusive,
findMaxDocID,
reader.getSmallestDocID(),
reader.maxDoc() - 1);
} else {
return DocIDToTweetIDMapper.ID_NOT_FOUND;
}
}
}
}
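
A minimal sketch of the three factory methods above; the tweet IDs are illustrative. As documented, since_id is exclusive and max_id is inclusive.

Query sinceAndMax = SinceMaxIDFilter.getSinceMaxIDQuery(1000L, 2000L); // 1000 < id <= 2000
Query sinceOnly = SinceMaxIDFilter.getSinceIDQuery(1000L);             // id > 1000
Query maxOnly = SinceMaxIDFilter.getMaxIDQuery(2000L);                 // id <= 2000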

View File

@ -1,137 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
// Filters tweets according to since time and until time (in seconds).
// Note that since time is inclusive, and until time is exclusive.
public final class SinceUntilFilter extends Query {
public static final int NO_FILTER = -1;
// These are both in seconds since the epoch.
private final int minTimeInclusive;
private final int maxTimeExclusive;
public static Query getSinceQuery(int sinceTimeSeconds) {
return new BooleanQuery.Builder()
.add(new SinceUntilFilter(sinceTimeSeconds, NO_FILTER), BooleanClause.Occur.FILTER)
.build();
}
public static Query getUntilQuery(int untilTimeSeconds) {
return new BooleanQuery.Builder()
.add(new SinceUntilFilter(NO_FILTER, untilTimeSeconds), BooleanClause.Occur.FILTER)
.build();
}
public static Query getSinceUntilQuery(int sinceTimeSeconds, int untilTimeSeconds) {
return new BooleanQuery.Builder()
.add(new SinceUntilFilter(sinceTimeSeconds, untilTimeSeconds), BooleanClause.Occur.FILTER)
.build();
}
private SinceUntilFilter(int sinceTime, int untilTime) {
this.minTimeInclusive = sinceTime != NO_FILTER ? sinceTime : 0;
this.maxTimeExclusive = untilTime != NO_FILTER ? untilTime : Integer.MAX_VALUE;
}
@Override
public int hashCode() {
return (int) (minTimeInclusive * 17 + maxTimeExclusive);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof SinceUntilFilter)) {
return false;
}
SinceUntilFilter filter = SinceUntilFilter.class.cast(obj);
return (minTimeInclusive == filter.minTimeInclusive)
&& (maxTimeExclusive == filter.maxTimeExclusive);
}
@Override
public String toString(String field) {
if (minTimeInclusive > 0 && maxTimeExclusive != Integer.MAX_VALUE) {
return "SinceFilter:" + this.minTimeInclusive + ",UntilFilter:" + maxTimeExclusive;
} else if (minTimeInclusive > 0) {
return "SinceFilter:" + this.minTimeInclusive;
} else {
return "UntilFilter:" + this.maxTimeExclusive;
}
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader indexReader = context.reader();
if (!(indexReader instanceof EarlybirdIndexSegmentAtomicReader)) {
return new AllDocsIterator(indexReader);
}
EarlybirdIndexSegmentAtomicReader reader = (EarlybirdIndexSegmentAtomicReader) indexReader;
TimeMapper timeMapper = reader.getSegmentData().getTimeMapper();
int smallestDocID = timeMapper.findFirstDocId(maxTimeExclusive, reader.getSmallestDocID());
int largestDoc = timeMapper.findFirstDocId(minTimeInclusive, reader.getSmallestDocID());
int smallestDoc = smallestDocID > 0 ? smallestDocID - 1 : 0;
return new SinceUntilDocIdSetIterator(
reader,
timeMapper,
smallestDoc,
largestDoc,
minTimeInclusive,
maxTimeExclusive);
}
};
}
// Returns true if this TimeMapper is at least partially covered by these time filters.
public static boolean sinceUntilTimesInRange(
TimeMapper timeMapper, int sinceTime, int untilTime) {
return (sinceTime == NO_FILTER || sinceTime <= timeMapper.getLastTime())
&& (untilTime == NO_FILTER || untilTime >= timeMapper.getFirstTime());
}
private static final class SinceUntilDocIdSetIterator extends RangeFilterDISI {
private final TimeMapper timeMapper;
private final int minTimeInclusive;
private final int maxTimeExclusive;
public SinceUntilDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
TimeMapper timeMapper,
int smallestDocID,
int largestDocID,
int minTimeInclusive,
int maxExclusive) throws IOException {
super(reader, smallestDocID, largestDocID);
this.timeMapper = timeMapper;
this.minTimeInclusive = minTimeInclusive;
this.maxTimeExclusive = maxExclusive;
}
@Override
protected boolean shouldReturnDoc() {
final int docTime = timeMapper.getTime(docID());
return docTime >= minTimeInclusive && docTime < maxTimeExclusive;
}
}
}
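
A minimal sketch of the factory methods above; the timestamps are illustrative values in seconds since the epoch. The since bound is inclusive and the until bound is exclusive.

Query window = SinceUntilFilter.getSinceUntilQuery(1609459200, 1612137600); // since <= t < until
Query onlySince = SinceUntilFilter.getSinceQuery(1609459200);               // t >= since
Query onlyUntil = SinceUntilFilter.getUntilQuery(1612137600);               // t < until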

View File

@ -1,29 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;
/**
* Work around an issue where IntTerms and LongTerms are not valid UTF-8,
* so calling toString on any TermQuery containing an IntTerm or a LongTerm may cause exceptions.
* This code should produce the same output as TermQuery.toString.
*/
public final class TermQueryWithSafeToString extends TermQuery {
private final String termValueForToString;
public TermQueryWithSafeToString(Term term, String termValueForToString) {
super(term);
this.termValueForToString = termValueForToString;
}
@Override
public String toString(String field) {
StringBuilder buffer = new StringBuilder();
if (!getTerm().field().equals(field)) {
buffer.append(getTerm().field());
buffer.append(":");
}
buffer.append(termValueForToString);
return buffer.toString();
}
}
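
A minimal sketch of the problem this class works around, assuming the LongTermAttributeImpl encoding used elsewhere in this commit: the raw 8-byte term is not valid UTF-8, so the default TermQuery.toString could throw, while the wrapper prints the caller-supplied readable value. The field name is illustrative.

BytesRef encoded = LongTermAttributeImpl.copyIntoNewBytesRef(12345L); // not valid UTF-8
Term term = new Term("from_user_id", encoded);
TermQuery safe = new TermQueryWithSafeToString(term, Long.toString(12345L));
System.out.println(safe.toString("")); // prints "from_user_id:12345" instead of throwing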

View File

@ -1,128 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.search.DocIdSetIterator;
import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
/**
* DocIdSetIterator whose nextDoc() and advance() will early terminate by returning NO_MORE_DOCS
* after the given deadline.
*/
public class TimedDocIdSetIterator extends DocIdSetIterator {
// check deadline every NEXT_CALL_TIMEOUT_CHECK_PERIOD calls to nextDoc()
@VisibleForTesting
protected static final int NEXT_CALL_TIMEOUT_CHECK_PERIOD =
EarlybirdConfig.getInt("timed_doc_id_set_next_doc_deadline_check_period", 1000);
// check deadline every ADVANCE_CALL_TIMEOUT_CHECK_PERIOD calls to advance()
private static final int ADVANCE_CALL_TIMEOUT_CHECK_PERIOD =
EarlybirdConfig.getInt("timed_doc_id_set_advance_deadline_check_period", 100);
private final Clock clock;
private final DocIdSetIterator innerIterator;
private final SearchCounter timeoutCountStat;
@Nullable
private final TerminationTracker terminationTracker;
private final long deadlineMillisFromEpoch;
private int docId = -1;
private int nextCounter = 0;
private int advanceCounter = 0;
public TimedDocIdSetIterator(DocIdSetIterator innerIterator,
@Nullable TerminationTracker terminationTracker,
final long timeoutOverride,
@Nullable SearchCounter timeoutCountStat) {
this(innerIterator, terminationTracker, timeoutOverride, timeoutCountStat, Clock.SYSTEM_CLOCK);
}
protected TimedDocIdSetIterator(DocIdSetIterator innerIterator,
@Nullable TerminationTracker terminationTracker,
final long timeoutOverride,
@Nullable SearchCounter timeoutCountStat,
Clock clock) {
this.clock = clock;
this.innerIterator = innerIterator;
this.timeoutCountStat = timeoutCountStat;
this.terminationTracker = terminationTracker;
if (terminationTracker == null) {
deadlineMillisFromEpoch = -1;
} else {
if (timeoutOverride > 0) {
deadlineMillisFromEpoch = terminationTracker.getClientStartTimeMillis() + timeoutOverride;
} else {
deadlineMillisFromEpoch = terminationTracker.getTimeoutEndTimeWithReservation();
}
}
}
@VisibleForTesting
protected TimedDocIdSetIterator(DocIdSetIterator innerIterator,
final long deadline,
@Nullable SearchCounter timeoutCountStat,
Clock clock) {
this.clock = clock;
this.innerIterator = innerIterator;
this.timeoutCountStat = timeoutCountStat;
this.terminationTracker = null;
this.deadlineMillisFromEpoch = deadline;
}
@Override
public int docID() {
return docId;
}
@Override
public int nextDoc() throws IOException {
if (++nextCounter % NEXT_CALL_TIMEOUT_CHECK_PERIOD == 0
&& clock.nowMillis() > deadlineMillisFromEpoch) {
if (timeoutCountStat != null) {
timeoutCountStat.increment();
}
if (terminationTracker != null) {
terminationTracker.setEarlyTerminationState(
EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
}
return docId = NO_MORE_DOCS;
}
return docId = innerIterator.nextDoc();
}
@Override
public int advance(int target) throws IOException {
if (++advanceCounter % ADVANCE_CALL_TIMEOUT_CHECK_PERIOD == 0
&& clock.nowMillis() > deadlineMillisFromEpoch) {
if (timeoutCountStat != null) {
timeoutCountStat.increment();
}
if (terminationTracker != null) {
terminationTracker.setEarlyTerminationState(
EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
}
return docId = NO_MORE_DOCS;
}
return docId = innerIterator.advance(target);
}
@Override
public long cost() {
return innerIterator.cost();
}
}
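
A minimal usage sketch: wrap a postings iterator so that a long scan gives up once the request deadline passes. The postings, terminationTracker, and timeoutCounter variables are illustrative; passing a timeout override of -1 falls through to the tracker's reserved deadline, per the constructor above.

DocIdSetIterator timed = new TimedDocIdSetIterator(
    postings,           // the iterator being guarded
    terminationTracker, // supplies the request deadline
    -1L,                // no per-query override; use the tracker's deadline
    timeoutCounter);    // incremented if the deadline fires
while (timed.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
  // ... collect hits; the loop ends early (NO_MORE_DOCS) once the deadline is exceeded
}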

View File

@ -1,128 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.common.userupdates.UserTable;
public final class UserFlagsExcludeFilter extends Query {
/**
* Returns a query that filters hits based on their author flags.
*
* @param excludeAntisocial Determines if the filter should exclude hits from antisocial users.
* @param excludeOffensive Determines if the filter should exclude hits from offensive users.
* @param excludeProtected Determines if the filter should exclude hits from protected users.
* @return A query that filters hits based on their author flags.
*/
public static Query getUserFlagsExcludeFilter(UserTable userTable,
boolean excludeAntisocial,
boolean excludeOffensive,
boolean excludeProtected) {
return new BooleanQuery.Builder()
.add(new UserFlagsExcludeFilter(
userTable, excludeAntisocial, excludeOffensive, excludeProtected),
BooleanClause.Occur.FILTER)
.build();
}
private final UserTable userTable;
private final boolean excludeAntisocial;
private final boolean excludeOffensive;
private final boolean excludeProtected;
private UserFlagsExcludeFilter(
UserTable userTable,
boolean excludeAntisocial,
boolean excludeOffensive,
boolean excludeProtected) {
this.userTable = userTable;
this.excludeAntisocial = excludeAntisocial;
this.excludeOffensive = excludeOffensive;
this.excludeProtected = excludeProtected;
}
@Override
public int hashCode() {
return (excludeAntisocial ? 13 : 0) + (excludeOffensive ? 1 : 0) + (excludeProtected ? 2 : 0);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof UserFlagsExcludeFilter)) {
return false;
}
UserFlagsExcludeFilter filter = UserFlagsExcludeFilter.class.cast(obj);
return (excludeAntisocial == filter.excludeAntisocial)
&& (excludeOffensive == filter.excludeOffensive)
&& (excludeProtected == filter.excludeProtected);
}
@Override
public String toString(String field) {
return "UserFlagsExcludeFilter";
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
LeafReader reader = context.reader();
if (userTable == null) {
return new AllDocsIterator(reader);
}
final int bits =
(excludeAntisocial ? UserTable.ANTISOCIAL_BIT : 0)
| (excludeOffensive ? UserTable.OFFENSIVE_BIT | UserTable.NSFW_BIT : 0)
| (excludeProtected ? UserTable.IS_PROTECTED_BIT : 0);
if (bits != 0) {
return new UserFlagsExcludeDocIdSetIterator(reader, userTable) {
@Override
protected boolean checkUserFlags(UserTable table, long userID) {
return !table.isSet(userID, bits);
}
};
}
return new AllDocsIterator(reader);
}
};
}
private abstract static class UserFlagsExcludeDocIdSetIterator extends RangeFilterDISI {
private final UserTable userTable;
private final NumericDocValues fromUserID;
public UserFlagsExcludeDocIdSetIterator(
LeafReader indexReader, UserTable table) throws IOException {
super(indexReader);
userTable = table;
fromUserID =
indexReader.getNumericDocValues(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName());
}
@Override
protected boolean shouldReturnDoc() throws IOException {
return fromUserID.advanceExact(docID())
&& checkUserFlags(userTable, fromUserID.longValue());
}
protected abstract boolean checkUserFlags(UserTable table, long userID);
}
}
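
A minimal sketch of the factory method above; the userTable variable is illustrative. Note from the bit mask in createWeight that excluding offensive users also filters the NSFW bit.

Query visibleOnly = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(
    userTable,
    true,   // exclude antisocial users
    true,   // exclude offensive users (also filters the NSFW bit)
    false); // keep protected users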

View File

@ -1,528 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;
import com.twitter.decider.Decider;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchTimer;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.query.IDDisjunctionQuery;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.search.termination.QueryTimeout;
import com.twitter.search.common.util.analysis.LongTermAttributeImpl;
import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData;
import com.twitter.search.core.earlybird.index.inverted.InvertedIndex;
import com.twitter.search.core.earlybird.index.inverted.MultiSegmentTermDictionary;
import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager;
import com.twitter.search.earlybird.queryparser.EarlybirdQueryHelper;
import com.twitter.search.queryparser.query.QueryParserException;
/**
* A variant of a multi-term ID disjunction query (similar to {@link UserIdMultiSegmentQuery}),
* that also uses a {@link MultiSegmentTermDictionary} where available, for more efficient
* term lookups for queries that span multiple segments.
*
* By default, an IDDisjunctionQuery (or Lucene's MultiTermQuery) does a term dictionary lookup
* for all of the terms in its disjunction, and it does so once for each segment (or AtomicReader)
* that the query searches.
* This means that when the term dictionary is large, and the term lookups are expensive, and when
* we are searching multiple segments, the query needs to make num_terms * num_segments expensive
* term dictionary lookups.
*
* With the help of a MultiSegmentTermDictionary, this multi-term disjunction query implementation
* only does one lookup for all of the segments managed by the MultiSegmentTermDictionary.
* If a segment is not supported by the MultiSegmentTermDictionary (e.g. if it's not optimized yet),
* a regular lookup in that segment's term dictionary will be performed.
*
* Usually, we will make 'num_terms' lookups in the current, un-optimized segment, and then if
* more segments need to be searched, we will make another 'num_terms' lookups, once for all of
* the remaining segments.
*
* When performing lookups in the MultiSegmentTermDictionary, for each supported segment, we save
* a list of termIds from that segment for all the searched terms that appear in that segment.
*
* For example, when running a UserIdMultiSegmentQuery with user ids: {1L, 2L, 3L} and
* segments: {1, 2}, where segment 1 has user ids {1L, 2L} indexed under termIds {100, 200},
* and segment 2 has user ids {1L, 2L, 3L} indexed under termIds {200, 300, 400}, we will build
* up the following map once:
* segment1 -> [100, 200]
* segment2 -> [200, 300, 400]
*/
public class UserIdMultiSegmentQuery extends Query {
@VisibleForTesting
public static final SearchTimerStats TERM_LOOKUP_STATS =
SearchTimerStats.export("multi_segment_query_term_lookup", TimeUnit.NANOSECONDS, false);
public static final SearchTimerStats QUERY_FROM_PRECOMPUTED =
SearchTimerStats.export("multi_segment_query_from_precomputed", TimeUnit.NANOSECONDS, false);
public static final SearchTimerStats QUERY_REGULAR =
SearchTimerStats.export("multi_segment_query_regular", TimeUnit.NANOSECONDS, false);
@VisibleForTesting
public static final SearchCounter USED_MULTI_SEGMENT_TERM_DICTIONARY_COUNT = SearchCounter.export(
"user_id_multi_segment_query_used_multi_segment_term_dictionary_count");
@VisibleForTesting
public static final SearchCounter USED_ORIGINAL_TERM_DICTIONARY_COUNT = SearchCounter.export(
"user_id_multi_segment_query_used_original_term_dictionary_count");
private static final SearchCounter NEW_QUERY_COUNT =
SearchCounter.export("user_id_multi_segment_new_query_count");
private static final SearchCounter OLD_QUERY_COUNT =
SearchCounter.export("user_id_multi_segment_old_query_count");
private static final HashMap<String, SearchCounter> QUERY_COUNT_BY_QUERY_NAME = new HashMap<>();
private static final HashMap<String, SearchCounter> QUERY_COUNT_BY_FIELD_NAME = new HashMap<>();
private static final String DECIDER_KEY_PREFIX = "use_multi_segment_id_disjunction_queries_in_";
/**
* Returns a new user ID disjunction query.
*
* @param ids The user IDs.
* @param field The field storing the user IDs.
* @param schemaSnapshot A snapshot of earlybird's schema.
* @param multiSegmentTermDictionaryManager The manager for the term dictionaries that span
* multiple segments.
* @param decider The decider.
* @param earlybirdCluster The earlybird cluster.
* @param ranks The hit attribution ranks to be assigned to every user ID.
* @param hitAttributeHelper The helper that tracks hit attributions.
* @param queryTimeout The timeout to be enforced on this query.
* @return A new user ID disjunction query.
*/
public static Query createIdDisjunctionQuery(
String queryName,
List<Long> ids,
String field,
ImmutableSchemaInterface schemaSnapshot,
MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager,
Decider decider,
EarlybirdCluster earlybirdCluster,
List<Integer> ranks,
@Nullable HitAttributeHelper hitAttributeHelper,
@Nullable QueryTimeout queryTimeout) throws QueryParserException {
QUERY_COUNT_BY_QUERY_NAME.computeIfAbsent(queryName, name ->
SearchCounter.export("multi_segment_query_name_" + name)).increment();
QUERY_COUNT_BY_FIELD_NAME.computeIfAbsent(field, name ->
SearchCounter.export("multi_segment_query_count_for_field_" + name)).increment();
if (DeciderUtil.isAvailableForRandomRecipient(decider, getDeciderName(earlybirdCluster))) {
NEW_QUERY_COUNT.increment();
MultiSegmentTermDictionary multiSegmentTermDictionary =
multiSegmentTermDictionaryManager.getMultiSegmentTermDictionary(field);
return new UserIdMultiSegmentQuery(
ids,
field,
schemaSnapshot,
multiSegmentTermDictionary,
ranks,
hitAttributeHelper,
queryTimeout);
} else {
OLD_QUERY_COUNT.increment();
return new IDDisjunctionQuery(ids, field, schemaSnapshot);
}
}
@VisibleForTesting
public static String getDeciderName(EarlybirdCluster earlybirdCluster) {
return DECIDER_KEY_PREFIX + earlybirdCluster.name().toLowerCase();
}
private final boolean useOrderPreservingEncoding;
private final HitAttributeHelper hitAttributeHelper;
private final QueryTimeout queryTimeout;
private final MultiSegmentTermDictionary multiSegmentTermDictionary;
private final Schema.FieldInfo fieldInfo;
private final String field;
private final List<Long> ids;
private final List<Integer> ranks;
// For each segment where we have a multi-segment term dictionary, this map will contain the
// termIds of all the terms that actually appear in that segment's index.
@Nullable
private Map<InvertedIndex, List<TermRankPair>> termIdsPerSegment;
// A wrapper class that associates a termId with its corresponding search operator rank, if one exists.
private final class TermRankPair {
private final int termId;
private final int rank;
TermRankPair(int termId, int rank) {
this.termId = termId;
this.rank = rank;
}
public int getTermId() {
return termId;
}
public int getRank() {
return rank;
}
}
@VisibleForTesting
public UserIdMultiSegmentQuery(
List<Long> ids,
String field,
ImmutableSchemaInterface schemaSnapshot,
MultiSegmentTermDictionary termDictionary,
List<Integer> ranks,
@Nullable HitAttributeHelper hitAttributeHelper,
@Nullable QueryTimeout queryTimeout) {
this.field = field;
this.ids = ids;
this.multiSegmentTermDictionary = termDictionary;
this.ranks = ranks;
this.hitAttributeHelper = hitAttributeHelper;
this.queryTimeout = queryTimeout;
// check ids and ranks have same size
Preconditions.checkArgument(ranks.size() == 0 || ranks.size() == ids.size());
// hitAttributeHelper is not null iff ranks is not empty
if (ranks.size() > 0) {
Preconditions.checkNotNull(hitAttributeHelper);
} else {
Preconditions.checkArgument(hitAttributeHelper == null);
}
if (!schemaSnapshot.hasField(field)) {
throw new IllegalStateException("Tried to search a field which does not exist in schema");
}
this.fieldInfo = Preconditions.checkNotNull(schemaSnapshot.getFieldInfo(field));
IndexedNumericFieldSettings numericFieldSettings =
fieldInfo.getFieldType().getNumericFieldSettings();
if (numericFieldSettings == null) {
throw new IllegalStateException("Id field is not numerical");
}
this.useOrderPreservingEncoding = numericFieldSettings.isUseSortableEncoding();
}
/**
* If it hasn't been built yet, build up the map containing termIds of all the terms being
* searched, for all of the segments that are managed by the multi-segment term dictionary.
*
* We only do this once, when we have to search the first segment that's supported by our
* multi-segment term dictionary.
*
* Flow here is to:
* 1. go through all the ids being queried.
* 2. for each id, get the termIds for that term in all of the segments in the term dictionary
* 3. for all of the segments that have that term, add the termId to that segment's list of
* term ids (in the 'termIdsPerSegment' map).
*/
private void createTermIdsPerSegment() {
if (termIdsPerSegment != null) {
// already created the map
return;
}
long start = System.nanoTime();
final BytesRef termRef = useOrderPreservingEncoding
? SortableLongTermAttributeImpl.newBytesRef()
: LongTermAttributeImpl.newBytesRef();
termIdsPerSegment = Maps.newHashMap();
List<? extends InvertedIndex> segmentIndexes = multiSegmentTermDictionary.getSegmentIndexes();
for (int idx = 0; idx < ids.size(); ++idx) {
long longTerm = ids.get(idx);
if (useOrderPreservingEncoding) {
SortableLongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
} else {
LongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
}
int[] termIds = multiSegmentTermDictionary.lookupTermIds(termRef);
Preconditions.checkState(segmentIndexes.size() == termIds.length,
"SegmentIndexes: %s, field: %s, termIds: %s",
segmentIndexes.size(), field, termIds.length);
for (int indexId = 0; indexId < termIds.length; indexId++) {
int termId = termIds[indexId];
if (termId != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) {
InvertedIndex fieldIndex = segmentIndexes.get(indexId);
List<TermRankPair> termIdsList = termIdsPerSegment.get(fieldIndex);
if (termIdsList == null) {
termIdsList = Lists.newArrayList();
termIdsPerSegment.put(fieldIndex, termIdsList);
}
termIdsList.add(new TermRankPair(
termId, ranks.size() > 0 ? ranks.get(idx) : -1));
}
}
}
long elapsed = System.nanoTime() - start;
TERM_LOOKUP_STATS.timerIncrement(elapsed);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new UserIdMultiSegmentQueryWeight(searcher, scoreMode, boost);
}
@Override
public int hashCode() {
return Arrays.hashCode(
new Object[] {useOrderPreservingEncoding, queryTimeout, field, ids, ranks});
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof UserIdMultiSegmentQuery)) {
return false;
}
UserIdMultiSegmentQuery query = UserIdMultiSegmentQuery.class.cast(obj);
return Arrays.equals(
new Object[] {useOrderPreservingEncoding, queryTimeout, field, ids, ranks},
new Object[] {query.useOrderPreservingEncoding,
query.queryTimeout,
query.field,
query.ids,
query.ranks});
}
@Override
public String toString(String fieldName) {
StringBuilder builder = new StringBuilder();
builder.append(getClass().getSimpleName()).append("[").append(fieldName).append(":");
for (Long id : this.ids) {
builder.append(id);
builder.append(",");
}
builder.setLength(builder.length() - 1);
builder.append("]");
return builder.toString();
}
private final class UserIdMultiSegmentQueryWeight extends ConstantScoreWeight {
private final IndexSearcher searcher;
private final ScoreMode scoreMode;
private UserIdMultiSegmentQueryWeight(
IndexSearcher searcher,
ScoreMode scoreMode,
float boost) {
super(UserIdMultiSegmentQuery.this, boost);
this.searcher = searcher;
this.scoreMode = scoreMode;
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
Weight weight = rewrite(context);
if (weight != null) {
return weight.scorer(context);
} else {
return null;
}
}
@Override
public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
Weight weight = rewrite(context);
if (weight != null) {
return weight.bulkScorer(context);
} else {
return null;
}
}
@Override
public void extractTerms(Set<Term> terms) {
terms.addAll(ids
.stream()
.map(id -> new Term(field, LongTermAttributeImpl.copyIntoNewBytesRef(id)))
.collect(Collectors.toSet()));
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return true;
}
private Weight rewrite(LeafReaderContext context) throws IOException {
final Terms terms = context.reader().terms(field);
if (terms == null) {
// field does not exist
return null;
}
final TermsEnum termsEnum = terms.iterator();
Preconditions.checkNotNull(termsEnum, "No termsEnum for field: %s", field);
BooleanQuery bq;
// See if the segment is supported by the multi-segment term dictionary. If so, build up
// the query using the termIds from the multi-segment term dictionary.
// If not (for the current segment), do the term lookups directly in the queried segment.
InvertedIndex fieldIndex = getFieldIndexFromMultiTermDictionary(context);
if (fieldIndex != null) {
createTermIdsPerSegment();
USED_MULTI_SEGMENT_TERM_DICTIONARY_COUNT.increment();
SearchTimer timer = QUERY_FROM_PRECOMPUTED.startNewTimer();
bq = addPrecomputedTermQueries(fieldIndex, termsEnum);
QUERY_FROM_PRECOMPUTED.stopTimerAndIncrement(timer);
} else {
USED_ORIGINAL_TERM_DICTIONARY_COUNT.increment();
// This segment is not supported by the multi-segment term dictionary. Lookup terms
// directly.
SearchTimer timer = QUERY_REGULAR.startNewTimer();
bq = addTermQueries(termsEnum);
QUERY_REGULAR.stopTimerAndIncrement(timer);
}
return searcher.rewrite(new ConstantScoreQuery(bq)).createWeight(
searcher, scoreMode, score());
}
/**
* If the multi-segment term dictionary supports this segment/LeafReader, then return the
* InvertedIndex representing this segment.
*
* If the segment being queried right now is not in the multi-segment term dictionary (e.g.
* if it's not optimized yet), return null.
*/
@Nullable
private InvertedIndex getFieldIndexFromMultiTermDictionary(LeafReaderContext context)
throws IOException {
if (multiSegmentTermDictionary == null) {
return null;
}
if (context.reader() instanceof EarlybirdIndexSegmentAtomicReader) {
EarlybirdIndexSegmentAtomicReader reader =
(EarlybirdIndexSegmentAtomicReader) context.reader();
EarlybirdIndexSegmentData segmentData = reader.getSegmentData();
InvertedIndex fieldIndex = segmentData.getFieldIndex(field);
if (multiSegmentTermDictionary.supportSegmentIndex(fieldIndex)) {
return fieldIndex;
}
}
return null;
}
private BooleanQuery addPrecomputedTermQueries(
InvertedIndex fieldIndex,
TermsEnum termsEnum) throws IOException {
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
int numClauses = 0;
List<TermRankPair> termRankPairs = termIdsPerSegment.get(fieldIndex);
if (termRankPairs != null) {
for (TermRankPair pair : termRankPairs) {
int termId = pair.getTermId();
if (numClauses >= BooleanQuery.getMaxClauseCount()) {
BooleanQuery saved = bqBuilder.build();
bqBuilder = new BooleanQuery.Builder();
bqBuilder.add(saved, BooleanClause.Occur.SHOULD);
numClauses = 1;
}
Query query;
if (pair.getRank() != -1) {
query = EarlybirdQueryHelper.maybeWrapWithHitAttributionCollector(
new SimpleTermQuery(termsEnum, termId),
pair.getRank(),
fieldInfo,
hitAttributeHelper);
} else {
query = new SimpleTermQuery(termsEnum, termId);
}
bqBuilder.add(EarlybirdQueryHelper.maybeWrapWithTimeout(query, queryTimeout),
BooleanClause.Occur.SHOULD);
++numClauses;
}
}
return bqBuilder.build();
}
private BooleanQuery addTermQueries(TermsEnum termsEnum) throws IOException {
final BytesRef termRef = useOrderPreservingEncoding
? SortableLongTermAttributeImpl.newBytesRef()
: LongTermAttributeImpl.newBytesRef();
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
int numClauses = 0;
for (int idx = 0; idx < ids.size(); ++idx) {
long longTerm = ids.get(idx);
if (useOrderPreservingEncoding) {
SortableLongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
} else {
LongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
}
if (termsEnum.seekExact(termRef)) {
if (numClauses >= BooleanQuery.getMaxClauseCount()) {
BooleanQuery saved = bqBuilder.build();
bqBuilder = new BooleanQuery.Builder();
bqBuilder.add(saved, BooleanClause.Occur.SHOULD);
numClauses = 1;
}
if (ranks.size() > 0) {
bqBuilder.add(EarlybirdQueryHelper.maybeWrapWithHitAttributionCollector(
new SimpleTermQuery(termsEnum, termsEnum.ord()),
ranks.get(idx),
fieldInfo,
hitAttributeHelper),
BooleanClause.Occur.SHOULD);
} else {
bqBuilder.add(new SimpleTermQuery(termsEnum, termsEnum.ord()),
BooleanClause.Occur.SHOULD);
}
++numClauses;
}
}
return bqBuilder.build();
}
}
}
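
The per-segment map described in the class javadoc can be reproduced as a standalone toy, with plain strings in place of InvertedIndex keys, bare termIds in place of TermRankPairs, and -1 standing in for TERM_NOT_FOUND. It mirrors the javadoc example exactly: ids {1L, 2L, 3L} across two segments.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

Map<String, List<Integer>> termIdsPerSegment = new HashMap<>();
String[] segments = {"segment1", "segment2"};
// lookup[idIdx][segmentIdx]: termId of the queried user ID in that segment, or -1 if absent.
int[][] lookup = {
    {100, 200}, // user id 1L
    {200, 300}, // user id 2L
    {-1, 400},  // user id 3L is not indexed in segment1
};
for (int[] termIds : lookup) {          // one multi-segment lookup per queried ID
  for (int s = 0; s < segments.length; s++) {
    if (termIds[s] != -1) {
      termIdsPerSegment.computeIfAbsent(segments[s], k -> new ArrayList<>()).add(termIds[s]);
    }
  }
}
// termIdsPerSegment is now {segment1=[100, 200], segment2=[200, 300, 400]}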

View File

@ -1,82 +0,0 @@
package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.FilteredQuery;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap;
import com.twitter.search.earlybird.index.TweetIDMapper;
/**
* Filter that can be used with searches over geo field postings lists in order to filter out tweets
* that have been geo scrubbed. Determines if a tweet has been geo scrubbed by comparing the
* tweet's id against the max scrubbed tweet id for that tweet's author, which is stored in the
* UserScrubGeoMap.
*
* See: go/realtime-geo-filtering
*/
public class UserScrubGeoFilter implements FilteredQuery.DocIdFilterFactory {
private UserScrubGeoMap userScrubGeoMap;
private final SearchRateCounter totalRequestsUsingFilterCounter =
SearchRateCounter.export("user_scrub_geo_filter_total_requests");
public static FilteredQuery.DocIdFilterFactory getDocIdFilterFactory(
UserScrubGeoMap userScrubGeoMap) {
return new UserScrubGeoFilter(userScrubGeoMap);
}
public UserScrubGeoFilter(UserScrubGeoMap userScrubGeoMap) {
this.userScrubGeoMap = userScrubGeoMap;
totalRequestsUsingFilterCounter.increment();
}
@Override
public FilteredQuery.DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException {
// To determine if a given doc has been geo scrubbed we need two pieces of information about the
// doc: the associated tweet id and the user id of the tweet's author. We can get the tweet id
// from the TweetIDMapper for the segment we are currently searching, and we can get the user id
// of the tweet's author by looking up the doc id in the NumericDocValues for the
// FROM_USER_ID_CSF.
//
// With this information we can check the UserScrubGeoMap to find out if the tweet has been
// geo scrubbed and filter it out accordingly.
final EarlybirdIndexSegmentAtomicReader currTwitterReader =
(EarlybirdIndexSegmentAtomicReader) context.reader();
final TweetIDMapper tweetIdMapper =
(TweetIDMapper) currTwitterReader.getSegmentData().getDocIDToTweetIDMapper();
final NumericDocValues fromUserIdDocValues = currTwitterReader.getNumericDocValues(
EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName());
return (docId) -> fromUserIdDocValues.advanceExact(docId)
&& !userScrubGeoMap.isTweetGeoScrubbed(
tweetIdMapper.getTweetID(docId), fromUserIdDocValues.longValue());
}
@Override
public String toString() {
return "UserScrubGeoFilter";
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof UserScrubGeoFilter)) {
return false;
}
UserScrubGeoFilter filter = UserScrubGeoFilter.class.cast(obj);
// filters are considered equal as long as they are using the same UserScrubGeoMap
return Objects.equals(userScrubGeoMap, filter.userScrubGeoMap);
}
@Override
public int hashCode() {
return userScrubGeoMap == null ? 0 : userScrubGeoMap.hashCode();
}
}
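
A minimal sketch of obtaining the filter via the factory above; the userScrubGeoMap variable is illustrative, and the wiring into a geo postings query is assumed from the FilteredQuery.DocIdFilterFactory interface this class implements.

FilteredQuery.DocIdFilterFactory factory =
    UserScrubGeoFilter.getDocIdFilterFactory(userScrubGeoMap);
// ... pass 'factory' to a FilteredQuery wrapping a geo-field postings query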

View File

@ -1,422 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import java.util.Arrays;
import java.util.List;
import com.google.common.collect.Lists;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
public class LinearScoringData {
public static final float NO_BOOST_VALUE = 1.0f;
// A signal value so we can tell if something is unset, also used in explanation.
public static final int UNSET_SIGNAL_VALUE = -999;
// This is somewhat arbitrary, and is here so that we have some limit on
// how many offline experimental features we support per query.
public static final int MAX_OFFLINE_EXPERIMENTAL_FIELDS = 5;
public enum SkipReason {
NOT_SKIPPED,
ANTIGAMING,
LOW_REPUTATION,
LOW_TEXT_SCORE,
LOW_RETWEET_COUNT,
LOW_FAV_COUNT,
SOCIAL_FILTER,
LOW_FINAL_SCORE
}
// When you add fields here, make sure you also update the clear() function.
public double luceneScore;
public double textScore;
// It is not clear why this has to be a double...
public double tokenAt140DividedByNumTokensBucket;
public double userRep;
public double parusScore;
public final double[] offlineExpFeatureValues = new double[MAX_OFFLINE_EXPERIMENTAL_FIELDS];
// v1 engagement counters
public double retweetCountPostLog2;
public double favCountPostLog2;
public double replyCountPostLog2;
public double embedsImpressionCount;
public double embedsUrlCount;
public double videoViewCount;
// v2 engagement counters (that have a v1 counter part)
public double retweetCountV2;
public double favCountV2;
public double replyCountV2;
public double embedsImpressionCountV2;
public double embedsUrlCountV2;
public double videoViewCountV2;
// pure v2 engagement counters; they started as v2 only
public double quotedCount;
public double weightedRetweetCount;
public double weightedReplyCount;
public double weightedFavCount;
public double weightedQuoteCount;
// card related properties
public boolean hasCard;
public byte cardType;
public boolean hasUrl;
public boolean isReply;
public boolean isRetweet;
public boolean isOffensive;
public boolean hasTrend;
public boolean isFromVerifiedAccount;
public boolean isFromBlueVerifiedAccount;
public boolean isUserSpam;
public boolean isUserNSFW;
public boolean isUserBot;
public boolean isUserAntiSocial;
public boolean hasVisibleLink;
public double luceneContrib;
public double reputationContrib;
public double textScoreContrib;
public double favContrib;
public double replyContrib;
public double multipleReplyContrib;
public double retweetContrib;
public double parusContrib;
public final double[] offlineExpFeatureContributions =
new double[MAX_OFFLINE_EXPERIMENTAL_FIELDS];
public double embedsImpressionContrib;
public double embedsUrlContrib;
public double videoViewContrib;
public double quotedContrib;
public double hasUrlContrib;
public double isReplyContrib;
public double isFollowRetweetContrib;
public double isTrustedRetweetContrib;
// Value passed in the request (ThriftRankingParams.querySpecificScoreAdjustments)
public double querySpecificScore;
// Value passed in the request (ThriftRankingParams.authorSpecificScoreAdjustments)
public double authorSpecificScore;
public double normalizedLuceneScore;
public int tweetLangId;
public double uiLangMult;
public double userLangMult;
public boolean hasDifferentLang;
public boolean hasEnglishTweetAndDifferentUILang;
public boolean hasEnglishUIAndDifferentTweetLang;
public int tweetAgeInSeconds;
public double ageDecayMult;
// Intermediate scores
public double scoreBeforeBoost;
public double scoreAfterBoost;
public double scoreFinal;
public double scoreReturned;
public SkipReason skipReason;
public boolean isTrusted;
public boolean isFollow;
public boolean spamUserDampApplied;
public boolean nsfwUserDampApplied;
public boolean botUserDampApplied;
public boolean trustedCircleBoostApplied;
public boolean directFollowBoostApplied;
public boolean outOfNetworkReplyPenaltyApplied;
public boolean hasMultipleHashtagsOrTrends;
public boolean tweetHasTrendsBoostApplied;
public boolean tweetFromVerifiedAccountBoostApplied;
public boolean tweetFromBlueVerifiedAccountBoostApplied;
public boolean hasCardBoostApplied;
public boolean cardDomainMatchBoostApplied;
public boolean cardAuthorMatchBoostApplied;
public boolean cardTitleMatchBoostApplied;
public boolean cardDescriptionMatchBoostApplied;
public List<String> hitFields;
public boolean hasNoTextHitDemotionApplied;
public boolean hasUrlOnlyHitDemotionApplied;
public boolean hasNameOnlyHitDemotionApplied;
public boolean hasSeparateTextAndNameHitDemotionApplied;
public boolean hasSeparateTextAndUrlHitDemotionApplied;
public long fromUserId;
// This is actually retweet status ID, or the ID of the original tweet being (natively) retweeted
public long sharedStatusId;
public long referenceAuthorId; // SEARCH-8564
public boolean isSelfTweet;
public boolean selfTweetBoostApplied;
public double selfTweetMult;
public boolean hasImageUrl;
public boolean hasVideoUrl;
public boolean hasMedialUrlBoostApplied;
public boolean hasNewsUrl;
public boolean hasNewsUrlBoostApplied;
public boolean hasConsumerVideo;
public boolean hasProVideo;
public boolean hasVine;
public boolean hasPeriscope;
public boolean hasNativeImage;
public boolean isNullcast;
public boolean hasQuote;
public boolean isSensitiveContent;
public boolean hasMultipleMediaFlag;
public boolean profileIsEggFlag;
public boolean isUserNewFlag;
public int numMentions;
public int numHashtags;
public int linkLanguage;
public int prevUserTweetEngagement;
public boolean isComposerSourceCamera;
// health model scores by HML
public double toxicityScore; // go/toxicity
public double pBlockScore; // go/pblock
public double pSpammyTweetScore; // go/pspammytweet
public double pReportedTweetScore; // go/preportedtweet
public double spammyTweetContentScore; // go/spammy-tweet-content
public double experimentalHealthModelScore1;
public double experimentalHealthModelScore2;
public double experimentalHealthModelScore3;
public double experimentalHealthModelScore4;
public LinearScoringData() {
hitFields = Lists.newArrayList();
clear();
}
// The following three counters were added later and were denormalized in the standard way.
// You can choose to apply scaling (for the legacy LinearScoringFunction) or
// skip it (for returning the raw value in metadata and displaying it in debug output).
public double getEmbedsImpressionCount(boolean scaleForScoring) {
return scaleForScoring ? logWith0(embedsImpressionCount) : embedsImpressionCount;
}
public double getEmbedsUrlCount(boolean scaleForScoring) {
return scaleForScoring ? logWith0(embedsUrlCount) : embedsUrlCount;
}
public double getVideoViewCount(boolean scaleForScoring) {
return scaleForScoring ? logWith0(videoViewCount) : videoViewCount;
}
private static double logWith0(double value) {
return value > 0 ? Math.log(value) : 0.0;
}
/**
* Returns a string description of all data stored in this instance.
*/
public String getPropertyExplanation() {
StringBuilder sb = new StringBuilder();
sb.append(hasCard ? "CARD " + SearchCardType.cardTypeFromByteValue(cardType) : "");
sb.append(hasUrl ? "URL " : "");
sb.append(isReply ? "REPLY " : "");
sb.append(isRetweet ? "RETWEET " : "");
sb.append(isOffensive ? "OFFENSIVE " : "");
sb.append(hasTrend ? "TREND " : "");
sb.append(hasMultipleHashtagsOrTrends ? "HASHTAG/TREND+ " : "");
sb.append(isFromVerifiedAccount ? "VERIFIED " : "");
sb.append(isFromBlueVerifiedAccount ? "BLUE_VERIFIED " : "");
sb.append(isUserSpam ? "SPAM " : "");
sb.append(isUserNSFW ? "NSFW " : "");
sb.append(isUserBot ? "BOT " : "");
sb.append(isUserAntiSocial ? "ANTISOCIAL " : "");
sb.append(isTrusted ? "TRUSTED " : "");
sb.append(isFollow ? "FOLLOW " : "");
sb.append(isSelfTweet ? "SELF " : "");
sb.append(hasImageUrl ? "IMAGE " : "");
sb.append(hasVideoUrl ? "VIDEO " : "");
sb.append(hasNewsUrl ? "NEWS " : "");
sb.append(isNullcast ? "NULLCAST" : "");
sb.append(hasQuote ? "QUOTE" : "");
sb.append(isComposerSourceCamera ? "Composer Source: CAMERA" : "");
sb.append(favCountPostLog2 > 0 ? "Faves:" + favCountPostLog2 + " " : "");
sb.append(retweetCountPostLog2 > 0 ? "Retweets:" + retweetCountPostLog2 + " " : "");
sb.append(replyCountPostLog2 > 0 ? "Replies:" + replyCountPostLog2 + " " : "");
sb.append(getEmbedsImpressionCount(false) > 0
? "Embedded Imps:" + getEmbedsImpressionCount(false) + " " : "");
sb.append(getEmbedsUrlCount(false) > 0
? "Embedded Urls:" + getEmbedsUrlCount(false) + " " : "");
sb.append(getVideoViewCount(false) > 0
? "Video views:" + getVideoViewCount(false) + " " : "");
sb.append(weightedRetweetCount > 0 ? "Weighted Retweets:"
+ ((int) weightedRetweetCount) + " " : "");
sb.append(weightedReplyCount > 0
? "Weighted Replies:" + ((int) weightedReplyCount) + " " : "");
sb.append(weightedFavCount > 0
? "Weighted Faves:" + ((int) weightedFavCount) + " " : "");
sb.append(weightedQuoteCount > 0
? "Weighted Quotes:" + ((int) weightedQuoteCount) + " " : "");
return sb.toString();
}
/**
* Resets all data stored in this instance.
*/
public void clear() {
luceneScore = UNSET_SIGNAL_VALUE;
textScore = UNSET_SIGNAL_VALUE;
tokenAt140DividedByNumTokensBucket = UNSET_SIGNAL_VALUE;
userRep = UNSET_SIGNAL_VALUE;
retweetCountPostLog2 = UNSET_SIGNAL_VALUE;
favCountPostLog2 = UNSET_SIGNAL_VALUE;
replyCountPostLog2 = UNSET_SIGNAL_VALUE;
parusScore = UNSET_SIGNAL_VALUE;
Arrays.fill(offlineExpFeatureValues, 0);
embedsImpressionCount = UNSET_SIGNAL_VALUE;
embedsUrlCount = UNSET_SIGNAL_VALUE;
videoViewCount = UNSET_SIGNAL_VALUE;
// v2 engagement, these each have a v1 counterpart
retweetCountV2 = UNSET_SIGNAL_VALUE;
favCountV2 = UNSET_SIGNAL_VALUE;
replyCountV2 = UNSET_SIGNAL_VALUE;
embedsImpressionCountV2 = UNSET_SIGNAL_VALUE;
embedsUrlCountV2 = UNSET_SIGNAL_VALUE;
videoViewCountV2 = UNSET_SIGNAL_VALUE;
// new engagement counters; they only have one version, with the v2 normalizer
quotedCount = UNSET_SIGNAL_VALUE;
weightedRetweetCount = UNSET_SIGNAL_VALUE;
weightedReplyCount = UNSET_SIGNAL_VALUE;
weightedFavCount = UNSET_SIGNAL_VALUE;
weightedQuoteCount = UNSET_SIGNAL_VALUE;
hasUrl = false;
isReply = false;
isRetweet = false;
isOffensive = false;
hasTrend = false;
isFromVerifiedAccount = false;
isFromBlueVerifiedAccount = false;
isUserSpam = false;
isUserNSFW = false;
isUserBot = false;
isUserAntiSocial = false;
hasVisibleLink = false;
isNullcast = false;
luceneContrib = UNSET_SIGNAL_VALUE;
reputationContrib = UNSET_SIGNAL_VALUE;
textScoreContrib = UNSET_SIGNAL_VALUE;
replyContrib = UNSET_SIGNAL_VALUE;
multipleReplyContrib = UNSET_SIGNAL_VALUE;
retweetContrib = UNSET_SIGNAL_VALUE;
favContrib = UNSET_SIGNAL_VALUE;
parusContrib = UNSET_SIGNAL_VALUE;
Arrays.fill(offlineExpFeatureContributions, 0);
embedsImpressionContrib = UNSET_SIGNAL_VALUE;
embedsUrlContrib = UNSET_SIGNAL_VALUE;
videoViewContrib = UNSET_SIGNAL_VALUE;
hasUrlContrib = UNSET_SIGNAL_VALUE;
isReplyContrib = UNSET_SIGNAL_VALUE;
querySpecificScore = UNSET_SIGNAL_VALUE;
authorSpecificScore = UNSET_SIGNAL_VALUE;
normalizedLuceneScore = NO_BOOST_VALUE;
tweetLangId = ThriftLanguage.UNKNOWN.getValue();
uiLangMult = NO_BOOST_VALUE;
userLangMult = NO_BOOST_VALUE;
hasDifferentLang = false;
hasEnglishTweetAndDifferentUILang = false;
hasEnglishUIAndDifferentTweetLang = false;
tweetAgeInSeconds = 0;
ageDecayMult = NO_BOOST_VALUE;
// Intermediate scores
scoreBeforeBoost = UNSET_SIGNAL_VALUE;
scoreAfterBoost = UNSET_SIGNAL_VALUE;
scoreFinal = UNSET_SIGNAL_VALUE;
scoreReturned = UNSET_SIGNAL_VALUE;
skipReason = SkipReason.NOT_SKIPPED;
isTrusted = false; // Set later
isFollow = false; // Set later
trustedCircleBoostApplied = false;
directFollowBoostApplied = false;
outOfNetworkReplyPenaltyApplied = false;
hasMultipleHashtagsOrTrends = false;
spamUserDampApplied = false;
nsfwUserDampApplied = false;
botUserDampApplied = false;
tweetHasTrendsBoostApplied = false;
tweetFromVerifiedAccountBoostApplied = false;
tweetFromBlueVerifiedAccountBoostApplied = false;
fromUserId = UNSET_SIGNAL_VALUE;
sharedStatusId = UNSET_SIGNAL_VALUE;
referenceAuthorId = UNSET_SIGNAL_VALUE;
isSelfTweet = false;
selfTweetBoostApplied = false;
selfTweetMult = NO_BOOST_VALUE;
hasImageUrl = false;
hasVideoUrl = false;
hasMedialUrlBoostApplied = false;
hasNewsUrl = false;
hasNewsUrlBoostApplied = false;
hasCard = false;
cardType = SearchCardType.UNKNOWN.getByteValue();
hasCardBoostApplied = false;
cardDomainMatchBoostApplied = false;
cardAuthorMatchBoostApplied = false;
cardTitleMatchBoostApplied = false;
cardDescriptionMatchBoostApplied = false;
hitFields.clear();
hasNoTextHitDemotionApplied = false;
hasUrlOnlyHitDemotionApplied = false;
hasNameOnlyHitDemotionApplied = false;
hasSeparateTextAndNameHitDemotionApplied = false;
hasSeparateTextAndUrlHitDemotionApplied = false;
hasConsumerVideo = false;
hasProVideo = false;
hasVine = false;
hasPeriscope = false;
hasNativeImage = false;
isSensitiveContent = false;
hasMultipleMediaFlag = false;
profileIsEggFlag = false;
numMentions = 0;
numHashtags = 0;
isUserNewFlag = false;
linkLanguage = 0;
prevUserTweetEngagement = 0;
isComposerSourceCamera = false;
// health model scores by HML
toxicityScore = UNSET_SIGNAL_VALUE;
pBlockScore = UNSET_SIGNAL_VALUE;
pSpammyTweetScore = UNSET_SIGNAL_VALUE;
pReportedTweetScore = UNSET_SIGNAL_VALUE;
spammyTweetContentScore = UNSET_SIGNAL_VALUE;
experimentalHealthModelScore1 = UNSET_SIGNAL_VALUE;
experimentalHealthModelScore2 = UNSET_SIGNAL_VALUE;
experimentalHealthModelScore3 = UNSET_SIGNAL_VALUE;
experimentalHealthModelScore4 = UNSET_SIGNAL_VALUE;
}
}
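
A minimal sketch of the scaled-versus-raw convention described in the comment above getEmbedsImpressionCount: pass true when the value feeds the legacy linear scoring formula, false when echoing the raw counter for metadata or debug output. The count is illustrative.

LinearScoringData data = new LinearScoringData();
data.videoViewCount = 1000.0;
double forScoring = data.getVideoViewCount(true);  // logWith0(1000.0) = ln(1000) ~= 6.9
double forDebug = data.getVideoViewCount(false);   // raw 1000.0 for metadata/debugging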

View File

@ -1,304 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import java.util.Arrays;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.ranking.thriftjava.ThriftAgeDecayRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftCardRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.util.lang.ThriftLanguageUtil;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSocialFilterType;
/*
* The class for all query-specific parameters, including the parameters from the relevanceOptions
* and values extracted from the request itself.
*/
public class LinearScoringParams {
public static final double DEFAULT_FEATURE_WEIGHT = 0;
public static final double DEFAULT_FEATURE_MIN_VAL = 0;
public static final double DEFAULT_NO_BOOST = 1.0;
@VisibleForTesting
static final SearchCounter NULL_USER_LANGS_KEY =
SearchCounter.export("linear_scoring_params_null_user_langs_key");
public final double luceneWeight;
public final double textScoreWeight;
public final double textScoreMinVal;
public final double retweetWeight;
public final double retweetMinVal;
public final double favWeight;
public final double favMinVal;
public final double replyWeight;
public final double multipleReplyWeight;
public final double multipleReplyMinVal;
public final double isReplyWeight;
public final double parusWeight;
public final double embedsImpressionWeight;
public final double embedsUrlWeight;
public final double videoViewWeight;
public final double quotedCountWeight;
public final double[] rankingOfflineExpWeights =
new double[LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS];
public final boolean applyBoosts;
// Store ranking params for cards in flat arrays instead of maps, for faster lookups.
public final double[] hasCardBoosts = new double[SearchCardType.values().length];
public final double[] cardDomainMatchBoosts = new double[SearchCardType.values().length];
public final double[] cardAuthorMatchBoosts = new double[SearchCardType.values().length];
public final double[] cardTitleMatchBoosts = new double[SearchCardType.values().length];
public final double[] cardDescriptionMatchBoosts = new double[SearchCardType.values().length];
public final double urlWeight;
public final double reputationWeight;
public final double reputationMinVal;
public final double followRetweetWeight;
public final double trustedRetweetWeight;
// Adjustments for specific tweets (tweetId -> score)
public final Map<Long, Double> querySpecificScoreAdjustments;
// Adjustments for tweets posted by specific authors (userId -> score)
public final Map<Long, Double> authorSpecificScoreAdjustments;
public final double offensiveDamping;
public final double spamUserDamping;
public final double nsfwUserDamping;
public final double botUserDamping;
public final double trustedCircleBoost;
public final double directFollowBoost;
public final double minScore;
public final boolean applyFiltersAlways;
public final boolean useLuceneScoreAsBoost;
public final double maxLuceneScoreBoost;
public final double langEnglishTweetDemote;
public final double langEnglishUIDemote;
public final double langDefaultDemote;
public final boolean useUserLanguageInfo;
public final double unknownLanguageBoost;
public final double outOfNetworkReplyPenalty;
public final boolean useAgeDecay;
public final double ageDecayHalflife;
public final double ageDecayBase;
public final double ageDecaySlope;
// hit attribute demotions
public final boolean enableHitDemotion;
public final double noTextHitDemotion;
public final double urlOnlyHitDemotion;
public final double nameOnlyHitDemotion;
public final double separateTextAndNameHitDemotion;
public final double separateTextAndUrlHitDemotion;
// trends related params
public final double tweetHasTrendBoost;
public final double multipleHashtagsOrTrendsDamping;
public final double tweetFromVerifiedAccountBoost;
public final double tweetFromBlueVerifiedAccountBoost;
public final ThriftSocialFilterType socialFilterType;
public final int uiLangId;
// Confidence that this user understands each language, indexed by ThriftLanguage value.
public final double[] userLangs = new double[ThriftLanguage.values().length];
public final long searcherId;
public final double selfTweetBoost;
public final double tweetHasMediaUrlBoost;
public final double tweetHasNewsUrlBoost;
// Whether we need, for replies, metadata identifying the tweet the reply is to.
public final boolean getInReplyToStatusId;
// Initialize from the ranking parameters.
public LinearScoringParams(ThriftSearchQuery searchQuery, ThriftRankingParams params) {
// weights
luceneWeight = params.isSetLuceneScoreParams()
? params.getLuceneScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
textScoreWeight = params.isSetTextScoreParams()
? params.getTextScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
retweetWeight = params.isSetRetweetCountParams()
? params.getRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
favWeight = params.isSetFavCountParams()
? params.getFavCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
replyWeight = params.isSetReplyCountParams()
? params.getReplyCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
multipleReplyWeight = params.isSetMultipleReplyCountParams()
? params.getMultipleReplyCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
parusWeight = params.isSetParusScoreParams()
? params.getParusScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
Byte featureTypeByte = (byte) i;
// Default weight is 0, so the contribution of an unset feature will be 0.
rankingOfflineExpWeights[i] = params.getOfflineExperimentalFeatureRankingParamsSize() > 0
&& params.getOfflineExperimentalFeatureRankingParams().containsKey(featureTypeByte)
? params.getOfflineExperimentalFeatureRankingParams().get(featureTypeByte).getWeight()
: DEFAULT_FEATURE_WEIGHT;
}
embedsImpressionWeight = params.isSetEmbedsImpressionCountParams()
? params.getEmbedsImpressionCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
embedsUrlWeight = params.isSetEmbedsUrlCountParams()
? params.getEmbedsUrlCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
videoViewWeight = params.isSetVideoViewCountParams()
? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
quotedCountWeight = params.isSetQuotedCountParams()
? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
applyBoosts = params.isApplyBoosts();
// configure card values
Arrays.fill(hasCardBoosts, DEFAULT_NO_BOOST);
Arrays.fill(cardAuthorMatchBoosts, DEFAULT_NO_BOOST);
Arrays.fill(cardDomainMatchBoosts, DEFAULT_NO_BOOST);
Arrays.fill(cardTitleMatchBoosts, DEFAULT_NO_BOOST);
Arrays.fill(cardDescriptionMatchBoosts, DEFAULT_NO_BOOST);
if (params.isSetCardRankingParams()) {
for (SearchCardType cardType : SearchCardType.values()) {
byte cardTypeIndex = cardType.getByteValue();
ThriftCardRankingParams rankingParams = params.getCardRankingParams().get(cardTypeIndex);
if (rankingParams != null) {
hasCardBoosts[cardTypeIndex] = rankingParams.getHasCardBoost();
cardAuthorMatchBoosts[cardTypeIndex] = rankingParams.getAuthorMatchBoost();
cardDomainMatchBoosts[cardTypeIndex] = rankingParams.getDomainMatchBoost();
cardTitleMatchBoosts[cardTypeIndex] = rankingParams.getTitleMatchBoost();
cardDescriptionMatchBoosts[cardTypeIndex] = rankingParams.getDescriptionMatchBoost();
}
}
}
urlWeight = params.isSetUrlParams()
? params.getUrlParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
reputationWeight = params.isSetReputationParams()
? params.getReputationParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
isReplyWeight = params.isSetIsReplyParams()
? params.getIsReplyParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
followRetweetWeight = params.isSetDirectFollowRetweetCountParams()
? params.getDirectFollowRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
trustedRetweetWeight = params.isSetTrustedCircleRetweetCountParams()
? params.getTrustedCircleRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
querySpecificScoreAdjustments = params.getQuerySpecificScoreAdjustments();
authorSpecificScoreAdjustments = params.getAuthorSpecificScoreAdjustments();
// min/max filters
textScoreMinVal = params.isSetTextScoreParams()
? params.getTextScoreParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
reputationMinVal = params.isSetReputationParams()
? params.getReputationParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
multipleReplyMinVal = params.isSetMultipleReplyCountParams()
? params.getMultipleReplyCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
retweetMinVal = params.isSetRetweetCountParams() && params.getRetweetCountParams().isSetMin()
? params.getRetweetCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
favMinVal = params.isSetFavCountParams() && params.getFavCountParams().isSetMin()
? params.getFavCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
// boosts
spamUserDamping = params.isSetSpamUserBoost() ? params.getSpamUserBoost() : 1.0;
nsfwUserDamping = params.isSetNsfwUserBoost() ? params.getNsfwUserBoost() : 1.0;
botUserDamping = params.isSetBotUserBoost() ? params.getBotUserBoost() : 1.0;
offensiveDamping = params.getOffensiveBoost();
trustedCircleBoost = params.getInTrustedCircleBoost();
directFollowBoost = params.getInDirectFollowBoost();
// language boosts
langEnglishTweetDemote = params.getLangEnglishTweetBoost();
langEnglishUIDemote = params.getLangEnglishUIBoost();
langDefaultDemote = params.getLangDefaultBoost();
useUserLanguageInfo = params.isUseUserLanguageInfo();
unknownLanguageBoost = params.getUnknownLanguageBoost();
// hit demotions
enableHitDemotion = params.isEnableHitDemotion();
noTextHitDemotion = params.getNoTextHitDemotion();
urlOnlyHitDemotion = params.getUrlOnlyHitDemotion();
nameOnlyHitDemotion = params.getNameOnlyHitDemotion();
separateTextAndNameHitDemotion = params.getSeparateTextAndNameHitDemotion();
separateTextAndUrlHitDemotion = params.getSeparateTextAndUrlHitDemotion();
outOfNetworkReplyPenalty = params.getOutOfNetworkReplyPenalty();
if (params.isSetAgeDecayParams()) {
// new age decay settings
ThriftAgeDecayRankingParams ageDecayParams = params.getAgeDecayParams();
ageDecaySlope = ageDecayParams.getSlope();
ageDecayHalflife = ageDecayParams.getHalflife();
ageDecayBase = ageDecayParams.getBase();
useAgeDecay = true;
} else if (params.isSetDeprecatedAgeDecayBase()
&& params.isSetDeprecatedAgeDecayHalflife()
&& params.isSetDeprecatedAgeDecaySlope()) {
ageDecaySlope = params.getDeprecatedAgeDecaySlope();
ageDecayHalflife = params.getDeprecatedAgeDecayHalflife();
ageDecayBase = params.getDeprecatedAgeDecayBase();
useAgeDecay = true;
} else {
ageDecaySlope = 0.0;
ageDecayHalflife = 0.0;
ageDecayBase = 0.0;
useAgeDecay = false;
}
// trends
tweetHasTrendBoost = params.getTweetHasTrendBoost();
multipleHashtagsOrTrendsDamping = params.getMultipleHashtagsOrTrendsBoost();
// verified accounts
tweetFromVerifiedAccountBoost = params.getTweetFromVerifiedAccountBoost();
tweetFromBlueVerifiedAccountBoost = params.getTweetFromBlueVerifiedAccountBoost();
// score filter
minScore = params.getMinScore();
applyFiltersAlways = params.isApplyFiltersAlways();
useLuceneScoreAsBoost = params.isUseLuceneScoreAsBoost();
maxLuceneScoreBoost = params.getMaxLuceneScoreBoost();
searcherId = searchQuery.isSetSearcherId() ? searchQuery.getSearcherId() : -1;
selfTweetBoost = params.getSelfTweetBoost();
socialFilterType = searchQuery.getSocialFilterType();
// The UI language and the confidences of the languages the user can understand.
if (!searchQuery.isSetUiLang() || searchQuery.getUiLang().isEmpty()) {
uiLangId = ThriftLanguage.UNKNOWN.getValue();
} else {
uiLangId = ThriftLanguageUtil.getThriftLanguageOf(searchQuery.getUiLang()).getValue();
}
if (searchQuery.getUserLangsSize() > 0) {
for (Map.Entry<ThriftLanguage, Double> lang : searchQuery.getUserLangs().entrySet()) {
ThriftLanguage thriftLanguage = lang.getKey();
// SEARCH-13441
if (thriftLanguage != null) {
userLangs[thriftLanguage.getValue()] = lang.getValue();
} else {
NULL_USER_LANGS_KEY.increment();
}
}
}
// For now, we use the same boost for both images and videos.
tweetHasMediaUrlBoost = params.getTweetHasImageUrlBoost();
tweetHasNewsUrlBoost = params.getTweetHasNewsUrlBoost();
getInReplyToStatusId =
searchQuery.isSetResultMetadataOptions()
&& searchQuery.getResultMetadataOptions().isSetGetInReplyToStatusId()
&& searchQuery.getResultMetadataOptions().isGetInReplyToStatusId();
}
}
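A design note on the card boosts above: the constructor flattens the per-card-type map from the Thrift params into plain arrays indexed by the card type's byte value, so the hot scoring path pays for an array read instead of a map lookup. A minimal sketch of the idea, with an invented enum standing in for SearchCardType:

import java.util.Arrays;
import java.util.Map;

public final class EnumIndexedBoosts {
  // Invented stand-in for SearchCardType: a small, dense byte id per value.
  enum CardType { UNKNOWN, PHOTO, PLAYER; byte id() { return (byte) ordinal(); } }

  public static void main(String[] args) {
    double[] boosts = new double[CardType.values().length];
    Arrays.fill(boosts, 1.0); // identity boost by default
    Map<Byte, Double> configured = Map.of(CardType.PHOTO.id(), 1.2); // from request params
    for (CardType t : CardType.values()) {
      Double b = configured.get(t.id());
      if (b != null) {
        boosts[t.id()] = b;
      }
    }
    // Hot path: a single array read instead of a map lookup per scored document.
    System.out.println(boosts[CardType.PHOTO.id()]); // 1.2
  }
}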

View File

@ -1,163 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import java.io.IOException;
import java.util.Objects;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.encoding.features.ByteNormalizer;
import com.twitter.search.common.encoding.features.ClampByteNormalizer;
import com.twitter.search.common.encoding.features.SingleBytePositiveFloatNormalizer;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.query.FilteredQuery;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
public final class MinFeatureValueFilter extends Query implements FilteredQuery.DocIdFilterFactory {
private final String featureName;
private final ByteNormalizer normalizer;
private final double minValue;
/**
* Creates a query that filters out all hits that have a value smaller than the given threshold
* for the given feature.
*
* @param featureName The feature.
* @param minValue The threshold for the feature values.
* @return A query that filters out all hits that have a value smaller than the given threshold
* for the given feature.
*/
public static Query getMinFeatureValueFilter(String featureName, double minValue) {
return new BooleanQuery.Builder()
.add(new MinFeatureValueFilter(featureName, minValue), BooleanClause.Occur.FILTER)
.build();
}
public static FilteredQuery.DocIdFilterFactory getDocIdFilterFactory(
String featureName, double minValue) {
return new MinFeatureValueFilter(featureName, minValue);
}
/**
* Returns the normalizer that should be used to normalize the values for the given feature.
*
* @param featureName The feature.
* @return The normalizer that should be used to normalize the values for the given feature.
*/
@VisibleForTesting
public static ByteNormalizer getMinFeatureValueNormalizer(String featureName) {
if (featureName.equals(EarlybirdFieldConstant.USER_REPUTATION.getFieldName())) {
return new ClampByteNormalizer(0, 100);
}
if (featureName.equals(EarlybirdFieldConstant.FAVORITE_COUNT.getFieldName())
|| featureName.equals(EarlybirdFieldConstant.PARUS_SCORE.getFieldName())
|| featureName.equals(EarlybirdFieldConstant.REPLY_COUNT.getFieldName())
|| featureName.equals(EarlybirdFieldConstant.RETWEET_COUNT.getFieldName())) {
return new SingleBytePositiveFloatNormalizer();
}
throw new IllegalArgumentException("Unknown normalization method for field " + featureName);
}
@Override
public int hashCode() {
// Probably doesn't make sense to include the normalizer here.
return (int) ((featureName == null ? 0 : featureName.hashCode() * 7) + minValue);
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof MinFeatureValueFilter)) {
return false;
}
// Probably doesn't make sense to include the normalizer here.
MinFeatureValueFilter filter = MinFeatureValueFilter.class.cast(obj);
return Objects.equals(featureName, filter.featureName) && (minValue == filter.minValue);
}
@Override
public String toString(String field) {
return String.format("MinFeatureValueFilter(%s, %f)", featureName, minValue);
}
private MinFeatureValueFilter(String featureName, double minValue) {
this.featureName = featureName;
this.normalizer = getMinFeatureValueNormalizer(featureName);
this.minValue = normalizer.normalize(minValue);
}
@Override
public FilteredQuery.DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException {
final NumericDocValues featureDocValues = context.reader().getNumericDocValues(featureName);
return (docId) -> featureDocValues.advanceExact(docId)
&& ((byte) featureDocValues.longValue() >= minValue);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
return new MinFeatureValueDocIdSetIterator(
context.reader(), featureName, minValue);
}
};
}
private static final class MinFeatureValueDocIdSetIterator extends RangeFilterDISI {
private final NumericDocValues featureDocValues;
private final double minValue;
MinFeatureValueDocIdSetIterator(LeafReader indexReader,
String featureName,
double minValue) throws IOException {
super(indexReader);
this.featureDocValues = indexReader.getNumericDocValues(featureName);
this.minValue = minValue;
}
@Override
public boolean shouldReturnDoc() throws IOException {
// We need this explicit casting to byte, because of how we encode and decode features in our
// encoded_tweet_features field. If a feature is an int (uses all 32 bits of the int), then
// encoding the feature and then decoding it preserves its original value. However, if the
// feature does not use the entire int (and especially if it uses bits somewhere in the middle
// of the int), then the feature value is assumed to be unsigned when it goes through this
// process of encoding and decoding. So a user rep of
// RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL (-128) will be correctly encoded as the
// binary value 10000000, but will be treated as an unsigned value when decoded, and therefore
// the decoded value will be 128.
//
// In retrospect, this seems like a really poor design decision. It seems like it would be
// better if all feature values were considered to be signed, even if most features can never
// have negative values. Unfortunately, making this change is not easy, because some features
// store normalized values, so we would also need to change the range of allowed values
// produced by those normalizers, as well as all code that depends on those values.
//
// So for now, just cast this value to a byte, to get the proper negative value.
return featureDocValues.advanceExact(docID())
&& ((byte) featureDocValues.longValue() >= minValue);
}
}
public double getMinValue() {
return minValue;
}
public ByteNormalizer getNormalizer() {
return normalizer;
}
}
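The long comment in shouldReturnDoc is worth a concrete illustration: a feature packed into eight bits round-trips through the doc values as an unsigned quantity, so a sentinel like -128 reads back as 128 unless it is cast back to byte. A standalone sketch of the effect, with no Lucene involved (the 0xFF mask mimics the unsigned decode):

public final class SignedByteDecode {
  public static void main(String[] args) {
    byte sentinel = -128;            // e.g. an "unset reputation" marker in 8 bits
    long decoded = sentinel & 0xFFL; // what an unsigned 8-bit decode produces
    System.out.println(decoded);        // 128: the sign was lost, so a
                                        // "value >= minValue" filter misbehaves
    System.out.println((byte) decoded); // -128: the cast restores the sign
  }
}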

View File

@ -1,104 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import java.util.Comparator;
import javax.annotation.Nullable;
import com.google.common.base.Preconditions;
import com.twitter.common_internal.collections.RandomAccessPriorityQueue;
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
import com.twitter.search.earlybird.search.Hit;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
public class RelevanceHit extends Hit
implements RandomAccessPriorityQueue.SignatureProvider<TweetIntegerShingleSignature> {
@Nullable
private TweetIntegerShingleSignature signature;
public RelevanceHit() {
super(Long.MAX_VALUE, Long.MAX_VALUE);
}
public RelevanceHit(long timeSliceID, long statusID,
TweetIntegerShingleSignature signature,
ThriftSearchResultMetadata metadata) {
super(timeSliceID, statusID);
update(timeSliceID, statusID, signature, metadata);
}
/**
* Updates the data for this relevance hit.
*
* @param timeSliceID The timeslice ID of the segment that the hit came from.
* @param statusID The hit's tweet ID.
* @param tweetSignature The tweet signature generated for this hit.
* @param metadata The metadata associated with this hit.
*/
public void update(long timeSliceID, long statusID, TweetIntegerShingleSignature tweetSignature,
ThriftSearchResultMetadata metadata) {
this.statusID = statusID;
this.timeSliceID = timeSliceID;
this.metadata = Preconditions.checkNotNull(metadata);
this.signature = Preconditions.checkNotNull(tweetSignature);
}
/**
* Returns the computed score for this hit.
*/
public float getScore() {
if (metadata != null) {
return (float) metadata.getScore();
} else {
return ScoringFunction.SKIP_HIT;
}
}
// We want the score as a double (and not cast to a float) for COMPARATOR_BY_SCORE and
// PQ_COMPARATOR_BY_SCORE so that the results returned from Earlybirds will be sorted based on the
// scores in the ThriftSearchResultMetadata objects (and will not lose precision by being cast to
// floats). Thus, the sorted order on Earlybirds and Earlybird Roots will be consistent.
private double getScoreDouble() {
if (metadata != null) {
return metadata.getScore();
} else {
return (double) ScoringFunction.SKIP_HIT;
}
}
@Override @Nullable
public TweetIntegerShingleSignature getSignature() {
return signature;
}
@Override
public String toString() {
return "RelevanceHit[tweetID=" + statusID + ",timeSliceID=" + timeSliceID
+ ",score=" + (metadata == null ? "null" : metadata.getScore())
+ ",signature=" + (signature == null ? "null" : signature) + "]";
}
public static final Comparator<RelevanceHit> COMPARATOR_BY_SCORE =
(d1, d2) -> {
// if two docs have the same score, then the first one (most recent) wins
if (d1.getScore() == d2.getScore()) {
return Long.compare(d2.getStatusID(), d1.getStatusID());
}
return Double.compare(d2.getScoreDouble(), d1.getScoreDouble());
};
public static final Comparator<RelevanceHit> PQ_COMPARATOR_BY_SCORE =
(d1, d2) -> {
// Reverse the order
return COMPARATOR_BY_SCORE.compare(d2, d1);
};
@Override
public void clear() {
timeSliceID = Long.MAX_VALUE;
statusID = Long.MAX_VALUE;
metadata = null;
signature = null;
}
}
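The comment above getScoreDouble makes a precision argument that is easy to demonstrate: two scores that differ only past float precision tie when cast to float but still order correctly as doubles. A small sketch with made-up score values:

public final class ScorePrecision {
  public static void main(String[] args) {
    double a = 0.123456789012345;
    double b = 0.123456789012999; // differs only beyond float precision
    System.out.println((float) a == (float) b); // true: the cast collapses them
    System.out.println(Double.compare(a, b));   // -1: doubles still order them
  }
}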

View File

@ -1,66 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import com.google.common.base.Preconditions;
import org.apache.lucene.search.Query;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.QualityFactor;
import com.twitter.search.earlybird.search.SearchRequestInfo;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
public class RelevanceSearchRequestInfo extends SearchRequestInfo {
private final ThriftSearchRelevanceOptions relevanceOptions;
public RelevanceSearchRequestInfo(
ThriftSearchQuery searchQuery, Query query,
TerminationTracker terminationTracker, QualityFactor qualityFactor) {
super(addResultMetadataOptionsIfUnset(searchQuery), query, terminationTracker, qualityFactor);
this.relevanceOptions = searchQuery.getRelevanceOptions();
}
private static ThriftSearchQuery addResultMetadataOptionsIfUnset(ThriftSearchQuery searchQuery) {
if (!searchQuery.isSetResultMetadataOptions()) {
searchQuery.setResultMetadataOptions(new ThriftSearchResultMetadataOptions());
}
return searchQuery;
}
@Override
protected int calculateMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
ThriftSearchRelevanceOptions searchRelevanceOptions = thriftSearchQuery.getRelevanceOptions();
// Don't use the value from the ThriftSearchQuery object if one is provided in the
// relevance options
int requestedMaxHitsToProcess = searchRelevanceOptions.isSetMaxHitsToProcess()
? searchRelevanceOptions.getMaxHitsToProcess()
: super.calculateMaxHitsToProcess(thriftSearchQuery);
return qualityFactorMaxHitsToProcess(getNumResultsRequested(), requestedMaxHitsToProcess);
}
public ThriftSearchRelevanceOptions getRelevanceOptions() {
return this.relevanceOptions;
}
/**
* Reduces maxHitsToProcess based on the quality factor, but never below numResults.
* @param numResults The number of results requested.
* @param maxHitsToProcess The unadjusted maximum number of hits to process.
* @return The reduced maxHitsToProcess.
*/
public int qualityFactorMaxHitsToProcess(int numResults, int maxHitsToProcess) {
Preconditions.checkNotNull(qualityFactor);
// Skip quality factoring when maxHitsToProcess is already below numResults.
if (numResults > maxHitsToProcess) {
return maxHitsToProcess;
}
double currentQualityFactor = qualityFactor.get();
return Math.max(numResults, (int) (currentQualityFactor * maxHitsToProcess));
}
}
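To make qualityFactorMaxHitsToProcess concrete, here is the same arithmetic inlined with a few worked values (the numbers are illustrative):

public final class QualityFactorSketch {
  static int qualityFactorMaxHitsToProcess(int numResults, int maxHitsToProcess, double qf) {
    if (numResults > maxHitsToProcess) {
      return maxHitsToProcess; // no lower bound to enforce, nothing to scale
    }
    return Math.max(numResults, (int) (qf * maxHitsToProcess));
  }

  public static void main(String[] args) {
    System.out.println(qualityFactorMaxHitsToProcess(20, 1000, 0.5));  // 500
    System.out.println(qualityFactorMaxHitsToProcess(20, 1000, 0.01)); // 20: floored at numResults
    System.out.println(qualityFactorMaxHitsToProcess(20, 10, 0.5));    // 10: left untouched
  }
}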

View File

@ -1,37 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import com.twitter.search.earlybird.search.Hit;
import com.twitter.search.earlybird.search.SimpleSearchResults;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
public class RelevanceSearchResults extends SimpleSearchResults {
public final ThriftSearchResultMetadata[] resultMetadata;
private ThriftSearchResultsRelevanceStats relevanceStats = null;
private long scoringTimeNanos = 0;
public RelevanceSearchResults(int size) {
super(size);
this.resultMetadata = new ThriftSearchResultMetadata[size];
}
public void setHit(Hit hit, int hitIndex) {
hits[hitIndex] = hit;
resultMetadata[hitIndex] = hit.getMetadata();
}
public void setRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
this.relevanceStats = relevanceStats;
}
public ThriftSearchResultsRelevanceStats getRelevanceStats() {
return relevanceStats;
}
public void setScoringTimeNanos(long scoringTimeNanos) {
this.scoringTimeNanos = scoringTimeNanos;
}
public long getScoringTimeNanos() {
return scoringTimeNanos;
}
}

View File

@ -1,138 +0,0 @@
package com.twitter.search.earlybird.search.relevance;
import java.io.IOException;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider.NamedScoringFunctionProvider;
/**
* This filter only accepts documents for which the provided
* {@link com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction}
* returns a score that is greater than or equal to the passed-in minScore and less than or
* equal to maxScore.
*/
public final class ScoreFilterQuery extends Query {
private static final float DEFAULT_LUCENE_SCORE = 1.0F;
private final float minScore;
private final float maxScore;
private final NamedScoringFunctionProvider scoringFunctionProvider;
private final ImmutableSchemaInterface schema;
/**
* Returns a score filter.
*
* @param schema The schema to use to extract the feature scores.
* @param scoringFunctionProvider The scoring function provider.
* @param minScore The minimum score threshold.
* @param maxScore The maximum score threshold.
* @return A score filter with the given configuration.
*/
public static Query getScoreFilterQuery(
ImmutableSchemaInterface schema,
NamedScoringFunctionProvider scoringFunctionProvider,
float minScore,
float maxScore) {
return new BooleanQuery.Builder()
.add(new ScoreFilterQuery(schema, scoringFunctionProvider, minScore, maxScore),
BooleanClause.Occur.FILTER)
.build();
}
private ScoreFilterQuery(ImmutableSchemaInterface schema,
NamedScoringFunctionProvider scoringFunctionProvider,
float minScore,
float maxScore) {
this.schema = schema;
this.scoringFunctionProvider = scoringFunctionProvider;
this.minScore = minScore;
this.maxScore = maxScore;
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
return new DefaultFilterWeight(this) {
@Override
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
ScoringFunction scoringFunction = scoringFunctionProvider.getScoringFunction();
scoringFunction.setNextReader((EarlybirdIndexSegmentAtomicReader) context.reader());
return new ScoreFilterDocIdSetIterator(
context.reader(), scoringFunction, minScore, maxScore);
}
};
}
private static final class ScoreFilterDocIdSetIterator extends RangeFilterDISI {
private final ScoringFunction scoringFunction;
private final float minScore;
private final float maxScore;
public ScoreFilterDocIdSetIterator(LeafReader indexReader, ScoringFunction scoringFunction,
float minScore, float maxScore) throws IOException {
super(indexReader);
this.scoringFunction = scoringFunction;
this.minScore = minScore;
this.maxScore = maxScore;
}
@Override
protected boolean shouldReturnDoc() throws IOException {
float score = scoringFunction.score(docID(), DEFAULT_LUCENE_SCORE);
return score >= minScore && score <= maxScore;
}
}
public float getMinScoreForTest() {
return minScore;
}
public float getMaxScoreForTest() {
return maxScore;
}
public ScoringFunctionProvider getScoringFunctionProviderForTest() {
return scoringFunctionProvider;
}
@Override
public int hashCode() {
return (int) (minScore * 29
+ maxScore * 17
+ (scoringFunctionProvider == null ? 0 : scoringFunctionProvider.hashCode()));
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof ScoreFilterQuery)) {
return false;
}
ScoreFilterQuery filter = ScoreFilterQuery.class.cast(obj);
return (minScore == filter.minScore)
&& (maxScore == filter.maxScore)
&& (scoringFunctionProvider == null
? filter.scoringFunctionProvider == null
: scoringFunctionProvider.equals(filter.scoringFunctionProvider));
}
@Override
public String toString(String field) {
return "SCORE_FILTER_QUERY[minScore=" + minScore + ",maxScore=" + maxScore + "]";
}
}
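As getScoreFilterQuery shows, the filter is meant to sit in a FILTER clause so it restricts matches without contributing to the Lucene score. A hedged wiring sketch (the score filter itself is taken as a parameter here, since building one needs a schema and a scoring function provider):

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

public final class ScoreFilterUsage {
  // Combine a scored text clause with a score-range filter built elsewhere,
  // e.g. via ScoreFilterQuery.getScoreFilterQuery(schema, provider, min, max).
  static Query restrictByScore(Query scoreFilter) {
    return new BooleanQuery.Builder()
        .add(new TermQuery(new Term("text", "earthquake")), BooleanClause.Occur.MUST)
        .add(scoreFilter, BooleanClause.Occur.FILTER) // restricts docs, adds no score
        .build();
  }
}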

View File

@ -1,147 +0,0 @@
package com.twitter.search.earlybird.search.relevance.collectors;
import java.io.IOException;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.facets.LanguageHistogram;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.AbstractResultsCollector;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/**
* AbstractRelevanceCollector is a results collector that collects RelevanceHit results
* which include more detailed information than a normal Hit.
*/
public abstract class AbstractRelevanceCollector
extends AbstractResultsCollector<RelevanceSearchRequestInfo, RelevanceSearchResults> {
protected final ScoringFunction scoringFunction;
private final ThriftSearchResultsRelevanceStats relevanceStats;
private final EarlybirdCluster cluster;
private final UserTable userTable;
// Per-language result counts.
private final LanguageHistogram languageHistogram = new LanguageHistogram();
// Accumulated time spent on relevance scoring across all collected hits, including batch scoring.
private long scoringTimeNanos = 0;
public AbstractRelevanceCollector(
ImmutableSchemaInterface schema,
RelevanceSearchRequestInfo searchRequestInfo,
ScoringFunction scoringFunction,
EarlybirdSearcherStats searcherStats,
EarlybirdCluster cluster,
UserTable userTable,
Clock clock,
int requestDebugMode) {
super(schema, searchRequestInfo, clock, searcherStats, requestDebugMode);
this.scoringFunction = scoringFunction;
this.relevanceStats = new ThriftSearchResultsRelevanceStats();
this.cluster = cluster;
this.userTable = userTable;
}
/**
* Subclasses must implement this method to actually collect a scored relevance hit.
*/
protected abstract void doCollectWithScore(long tweetID, float score) throws IOException;
@Override
public final void startSegment() throws IOException {
scoringFunction.setNextReader(currTwitterReader);
ThriftSearchResultMetadataOptions options =
searchRequestInfo.getSearchQuery().getResultMetadataOptions();
featuresRequested = options != null && options.isReturnSearchResultFeatures();
}
@Override
protected final void doCollect(long tweetID) throws IOException {
final long scoringStartNanos = getClock().nowNanos();
float luceneScore = scorer.score();
final float score = scoringFunction.score(curDocId, luceneScore);
final long scoringEndNanos = getClock().nowNanos();
addToOverallScoringTimeNanos(scoringStartNanos, scoringEndNanos);
scoringFunction.updateRelevanceStats(relevanceStats);
updateHitCounts(tweetID);
doCollectWithScore(tweetID, score);
}
protected final void addToOverallScoringTimeNanos(long scoringStartNanos, long scoringEndNanos) {
scoringTimeNanos += scoringEndNanos - scoringStartNanos;
}
protected final ThriftSearchResultMetadata collectMetadata() throws IOException {
ThriftSearchResultMetadataOptions options =
searchRequestInfo.getSearchQuery().getResultMetadataOptions();
Preconditions.checkNotNull(options);
ThriftSearchResultMetadata metadata =
Preconditions.checkNotNull(scoringFunction.getResultMetadata(options));
if (metadata.isSetLanguage()) {
languageHistogram.increment(metadata.getLanguage().getValue());
}
// Some additional metadata is not provided by the scoring function, but is
// obtained by accessing the reader directly.
if (currTwitterReader != null) {
fillResultGeoLocation(metadata);
if (searchRequestInfo.isCollectConversationId()) {
long conversationId =
documentFeatures.getFeatureValue(EarlybirdFieldConstant.CONVERSATION_ID_CSF);
if (conversationId != 0) {
ensureExtraMetadataIsSet(metadata);
metadata.getExtraMetadata().setConversationId(conversationId);
}
}
}
// Check and collect hit attribution data, if it's available.
fillHitAttributionMetadata(metadata);
long fromUserId = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
if (searchRequestInfo.isGetFromUserId()) {
metadata.setFromUserId(fromUserId);
}
collectExclusiveConversationAuthorId(metadata);
collectFacets(metadata);
collectFeatures(metadata);
collectIsProtected(metadata, cluster, userTable);
return metadata;
}
protected final ThriftSearchResultsRelevanceStats getRelevanceStats() {
return relevanceStats;
}
public final LanguageHistogram getLanguageHistogram() {
return languageHistogram;
}
@Override
protected final RelevanceSearchResults doGetResults() throws IOException {
final RelevanceSearchResults results = doGetRelevanceResults();
results.setScoringTimeNanos(scoringTimeNanos);
return results;
}
/**
* For subclasses to process and aggregate collected hits.
*/
protected abstract RelevanceSearchResults doGetRelevanceResults() throws IOException;
}
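One detail worth isolating from doCollect is the timing discipline: every scoring call is bracketed by clock reads and folded into a single running total, which doGetResults later attaches to the results. A minimal sketch of that pattern (using System.nanoTime directly; the real class injects a Clock for testability):

import java.util.function.Supplier;

public final class ScoringTimer {
  private long scoringTimeNanos = 0;

  float scoreAndTime(Supplier<Float> scorer) {
    long start = System.nanoTime();
    float score = scorer.get();
    scoringTimeNanos += System.nanoTime() - start; // accumulate across all hits
    return score;
  }

  public static void main(String[] args) {
    ScoringTimer timer = new ScoringTimer();
    timer.scoreAndTime(() -> 0.42f);
    System.out.println(timer.scoringTimeNanos + " ns spent scoring");
  }
}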

View File

@ -1,118 +0,0 @@
package com.twitter.search.earlybird.search.relevance.collectors;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import com.twitter.common.collections.Pair;
import com.twitter.common.util.Clock;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
import com.twitter.search.earlybird.search.relevance.scoring.BatchHit;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
/**
* BatchRelevanceTopCollector is similar to the `RelevanceTopCollector` in what it outputs:
* it collects the top numResults by score, filtering out duplicates
* and results with scores equal to Float.MIN_VALUE.
* The way it achieves that is different, though: it scores documents through the batch scoring
* function instead of scoring them one by one.
*/
public class BatchRelevanceTopCollector extends RelevanceTopCollector {
protected final List<BatchHit> hits;
public BatchRelevanceTopCollector(
ImmutableSchemaInterface schema,
RelevanceSearchRequestInfo searchRequestInfo,
ScoringFunction scoringFunction,
EarlybirdSearcherStats searcherStats,
EarlybirdCluster cluster,
UserTable userTable,
Clock clock,
int requestDebugMode) {
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
requestDebugMode);
this.hits = new ArrayList<>((int) getMaxHitsToProcess());
}
@Override
protected void doCollectWithScore(long tweetID, float score) throws IOException {
Pair<LinearScoringData, ThriftSearchResultFeatures> pair =
scoringFunction.collectFeatures(score);
ThriftSearchResultMetadata metadata = collectMetadata();
hits.add(new BatchHit(pair.getFirst(),
pair.getSecond(),
metadata,
tweetID,
currTimeSliceID));
}
@Override
public EarlyTerminationState innerShouldCollectMore() {
if (hits.size() >= getMaxHitsToProcess()) {
return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED);
}
return EarlyTerminationState.COLLECTING;
}
@Override
protected RelevanceSearchResults doGetRelevanceResults() throws IOException {
final long scoringStartNanos = getClock().nowNanos();
float[] scores = scoringFunction.batchScore(hits);
final long scoringEndNanos = getClock().nowNanos();
addToOverallScoringTimeNanos(scoringStartNanos, scoringEndNanos);
exportBatchScoringTime(scoringEndNanos - scoringStartNanos);
for (int i = 0; i < hits.size(); i++) {
BatchHit hit = hits.get(i);
ThriftSearchResultMetadata metadata = hit.getMetadata();
if (!metadata.isSetExtraMetadata()) {
metadata.setExtraMetadata(new ThriftSearchResultExtraMetadata());
}
metadata.getExtraMetadata().setFeatures(hit.getFeatures());
// Populate the ThriftSearchResultMetadata post batch scoring with information from the
// LinearScoringData, which now includes a score.
scoringFunction.populateResultMetadataBasedOnScoringData(
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
metadata,
hit.getScoringData());
collectWithScoreInternal(
hit.getTweetID(),
hit.getTimeSliceID(),
scores[i],
metadata
);
}
return getRelevanceResultsInternal();
}
private void exportBatchScoringTime(long scoringTimeNanos) {
ThriftSearchRelevanceOptions relevanceOptions = searchRequestInfo.getRelevanceOptions();
if (relevanceOptions.isSetRankingParams()
&& relevanceOptions.getRankingParams().isSetSelectedTensorflowModel()) {
String model = relevanceOptions.getRankingParams().getSelectedTensorflowModel();
SearchTimerStats batchScoringPerModelTimer = SearchTimerStats.export(
String.format("batch_scoring_time_for_model_%s", model),
TimeUnit.NANOSECONDS,
false,
true);
batchScoringPerModelTimer.timerIncrement(scoringTimeNanos);
}
}
}
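The shape of the collector above is worth spelling out: hits are buffered unscored in doCollectWithScore, scored in one batch call in doGetRelevanceResults, and only then pushed through the same per-hit collection path as the streaming collector. A stripped-down sketch of that flow, with a placeholder in place of the real model call:

import java.util.ArrayList;
import java.util.List;

public final class BatchScoreSketch {
  record Hit(long tweetId) {}

  // Placeholder for ScoringFunction.batchScore: one call scores every buffered hit.
  static float[] batchScore(List<Hit> hits) {
    float[] scores = new float[hits.size()];
    for (int i = 0; i < scores.length; i++) {
      scores[i] = 1.0f / (1 + i);
    }
    return scores;
  }

  public static void main(String[] args) {
    List<Hit> buffered = new ArrayList<>();
    buffered.add(new Hit(1L)); // collection phase: buffer, don't score
    buffered.add(new Hit(2L));
    float[] scores = batchScore(buffered); // scoring phase: one call for all hits
    for (int i = 0; i < buffered.size(); i++) {
      System.out.println(buffered.get(i).tweetId() + " -> " + scores[i]);
    }
  }
}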

View File

@ -1,70 +0,0 @@
package com.twitter.search.earlybird.search.relevance.collectors;
import java.io.IOException;
import java.util.List;
import com.google.common.collect.Lists;
import com.twitter.common.util.Clock;
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.relevance.RelevanceHit;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
/**
* RelevanceAllCollector is a results collector that collects all results sorted by score,
* including signature-duplicates and results skipped by the scoring function.
*/
public class RelevanceAllCollector extends AbstractRelevanceCollector {
// All results.
protected final List<RelevanceHit> results;
public RelevanceAllCollector(
ImmutableSchemaInterface schema,
RelevanceSearchRequestInfo searchRequestInfo,
ScoringFunction scoringFunction,
EarlybirdSearcherStats searcherStats,
EarlybirdCluster cluster,
UserTable userTable,
Clock clock,
int requestDebugMode) {
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
requestDebugMode);
this.results = Lists.newArrayList();
}
@Override
protected void doCollectWithScore(long tweetID, float score) throws IOException {
ThriftSearchResultMetadata metadata = collectMetadata();
scoringFunction.populateResultMetadataBasedOnScoringData(
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
metadata,
scoringFunction.getScoringDataForCurrentDocument());
results.add(new RelevanceHit(
currTimeSliceID,
tweetID,
TweetIntegerShingleSignature.deserialize(metadata.getSignature()),
metadata));
}
@Override
protected RelevanceSearchResults doGetRelevanceResults() {
final int numResults = results.size();
RelevanceSearchResults searchResults = new RelevanceSearchResults(numResults);
// Insert hits in decreasing order by score.
results.sort(RelevanceHit.COMPARATOR_BY_SCORE);
for (int i = 0; i < numResults; i++) {
searchResults.setHit(results.get(i), i);
}
searchResults.setRelevanceStats(getRelevanceStats());
searchResults.setNumHits(numResults);
return searchResults;
}
}

View File

@ -1,167 +0,0 @@
package com.twitter.search.earlybird.search.relevance.collectors;
import java.io.IOException;
import com.google.common.base.Preconditions;
import com.twitter.common.util.Clock;
import com.twitter.common_internal.collections.RandomAccessPriorityQueue;
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.relevance.RelevanceHit;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/**
* RelevanceTopCollector is a results collector that collects the top numResults by
* score, filtering out duplicates.
*/
public class RelevanceTopCollector extends AbstractRelevanceCollector {
// Search results are collected in a min-heap.
protected final RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature> minQueue;
// Number of hits actually added to the min queue after dupe filtering and skipping.
// Less than or equal to numHitsProcessed.
protected int numHitsCollected;
// The 'top' of the min heap, i.e., the lowest-scored document in the heap.
private RelevanceHit pqTop;
private float lowestScore = ScoringFunction.SKIP_HIT;
private final boolean isFilterDupes;
public RelevanceTopCollector(
ImmutableSchemaInterface schema,
RelevanceSearchRequestInfo searchRequestInfo,
ScoringFunction scoringFunction,
EarlybirdSearcherStats searcherStats,
EarlybirdCluster cluster,
UserTable userTable,
Clock clock,
int requestDebugMode) {
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
requestDebugMode);
this.minQueue = new RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature>(
searchRequestInfo.getNumResultsRequested(), RelevanceHit.PQ_COMPARATOR_BY_SCORE) {
@Override
protected RelevanceHit getSentinelObject() {
return new RelevanceHit(); // default relevance constructor would create a hit with the
// lowest score possible.
}
};
this.pqTop = minQueue.top();
this.isFilterDupes = getSearchRequestInfo().getRelevanceOptions().isFilterDups();
}
protected void collectWithScoreInternal(
long tweetID,
long timeSliceID,
float score,
ThriftSearchResultMetadata metadata) {
// This collector cannot handle these scores:
assert !Float.isNaN(score);
if (score <= lowestScore) {
// Since docs are returned in-order (i.e., increasing doc Id), a document
// with equal score to pqTop.score cannot compete since HitQueue favors
// documents with lower doc Ids. Therefore reject those docs too.
// IMPORTANT: docs skipped by the scoring function will have scores set
// to ScoringFunction.SKIP_HIT, meaning they will not be collected.
return;
}
boolean dupFound = false;
Preconditions.checkState(metadata.isSetSignature(),
"The signature should be set at metadata collection time, but it is null. "
+ "Tweet id = %s, metadata = %s",
tweetID,
metadata);
int signatureInt = metadata.getSignature();
final TweetIntegerShingleSignature signature =
TweetIntegerShingleSignature.deserialize(signatureInt);
if (isFilterDupes) {
// update duplicate if any
if (signatureInt != TweetIntegerShingleSignature.DEFAULT_NO_SIGNATURE) {
dupFound = minQueue.incrementElement(
signature,
element -> {
if (score > element.getScore()) {
element.update(timeSliceID, tweetID, signature, metadata);
}
}
);
}
}
if (!dupFound) {
numHitsCollected++;
// if we didn't find a duplicate element to update then we add it now as a new element to the
// pq
pqTop = minQueue.updateTop(top -> top.update(timeSliceID, tweetID, signature, metadata));
lowestScore = pqTop.getScore();
}
}
@Override
protected void doCollectWithScore(final long tweetID, final float score) throws IOException {
ThriftSearchResultMetadata metadata = collectMetadata();
scoringFunction.populateResultMetadataBasedOnScoringData(
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
metadata,
scoringFunction.getScoringDataForCurrentDocument());
collectWithScoreInternal(tweetID, currTimeSliceID, score, metadata);
}
@Override
public EarlyTerminationState innerShouldCollectMore() {
// Note that numHitsCollected here might be less than num results collected in the
// TwitterEarlyTerminationCollector, if we hit dups or there are very low scores.
if (numHitsCollected >= getMaxHitsToProcess()) {
return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED);
}
return EarlyTerminationState.COLLECTING;
}
@Override
protected RelevanceSearchResults doGetRelevanceResults() throws IOException {
return getRelevanceResultsInternal();
}
protected RelevanceSearchResults getRelevanceResultsInternal() {
return resultsFromQueue(minQueue, getSearchRequestInfo().getNumResultsRequested(),
getRelevanceStats());
}
private static RelevanceSearchResults resultsFromQueue(
RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature> pq,
int desiredNumResults,
ThriftSearchResultsRelevanceStats relevanceStats) {
// Trim first, in case we didn't fill the queue, so no sentinel values end up in the results.
int numResults = pq.trim();
if (numResults > desiredNumResults) {
for (int i = 0; i < numResults - desiredNumResults; i++) {
pq.pop();
}
numResults = desiredNumResults;
}
RelevanceSearchResults results = new RelevanceSearchResults(numResults);
// insert hits in decreasing order by score
for (int i = numResults - 1; i >= 0; i--) {
RelevanceHit hit = pq.pop();
results.setHit(hit, i);
}
results.setRelevanceStats(relevanceStats);
results.setNumHits(numResults);
return results;
}
}
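The collector above combines two ideas: a fixed-size min-heap for top-K selection (reject anything at or below the current heap minimum) and signature-keyed dedup, where a colliding hit replaces its duplicate only if it scores higher. A sketch of the same logic using standard collections (the real RandomAccessPriorityQueue updates duplicates in place, which a plain PriorityQueue cannot):

import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.PriorityQueue;

public final class TopKDedup {
  record Hit(int signature, double score) {}

  public static void main(String[] args) {
    int k = 2;
    Map<Integer, Hit> bySignature = new HashMap<>();
    PriorityQueue<Hit> minHeap = new PriorityQueue<>(Comparator.comparingDouble(Hit::score));
    for (Hit h : new Hit[] {new Hit(7, 0.9), new Hit(7, 0.4), new Hit(3, 0.8), new Hit(5, 0.95)}) {
      Hit dup = bySignature.get(h.signature());
      if (dup != null) {
        if (h.score() > dup.score()) { // keep the better-scoring duplicate
          minHeap.remove(dup);
          minHeap.add(h);
          bySignature.put(h.signature(), h);
        }
        continue;
      }
      if (minHeap.size() < k) {
        minHeap.add(h);
        bySignature.put(h.signature(), h);
      } else if (h.score() > minHeap.peek().score()) {
        bySignature.remove(minHeap.poll().signature()); // evict the current minimum
        minHeap.add(h);
        bySignature.put(h.signature(), h);
      }
    }
    while (!minHeap.isEmpty()) {
      System.out.println(minHeap.poll()); // ascending: scores 0.9 then 0.95
    }
  }
}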

View File

@ -1,47 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
public class BatchHit {
private final LinearScoringData scoringData;
private final ThriftSearchResultFeatures features;
private final ThriftSearchResultMetadata metadata;
private final long tweetID;
private final long timeSliceID;
public BatchHit(
LinearScoringData scoringData,
ThriftSearchResultFeatures features,
ThriftSearchResultMetadata metadata,
long tweetID,
long timeSliceID
) {
this.scoringData = scoringData;
this.features = features;
this.metadata = metadata;
this.tweetID = tweetID;
this.timeSliceID = timeSliceID;
}
public LinearScoringData getScoringData() {
return scoringData;
}
public ThriftSearchResultFeatures getFeatures() {
return features;
}
public ThriftSearchResultMetadata getMetadata() {
return metadata;
}
public long getTweetID() {
return tweetID;
}
public long getTimeSliceID() {
return timeSliceID;
}
}

View File

@ -1,37 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/*
* A sample scorer that doesn't do any real scoring: it simply passes through the Lucene query score.
*/
public class DefaultScoringFunction extends ScoringFunction {
private float score;
public DefaultScoringFunction(ImmutableSchemaInterface schema) {
super(schema);
}
@Override
protected float score(float luceneQueryScore) {
score = luceneQueryScore;
return luceneQueryScore;
}
@Override
protected Explanation doExplain(float luceneScore) {
// just an example - this scoring function will go away soon
return Explanation.match(luceneScore, "luceneScore=" + luceneScore);
}
@Override
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
relevanceStats.setNumScored(relevanceStats.getNumScored() + 1);
if (score == ScoringFunction.SKIP_HIT) {
relevanceStats.setNumSkipped(relevanceStats.getNumSkipped() + 1);
}
}
}

View File

@ -1,98 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import com.twitter.search.common.util.ml.prediction_engine.BaseLegacyScoreAccumulator;
import com.twitter.search.common.util.ml.prediction_engine.LightweightLinearModel;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.modeling.tweet_ranking.TweetScoringFeatures;
/**
* Legacy score accumulator in Earlybird with specific features added.
* This class is created to avoid adding LinearScoringData as a dependency to search's common ML
* library.
*
* @deprecated This class is retired; we suggest switching to SchemaBasedScoreAccumulator.
*/
@Deprecated
public class LegacyScoreAccumulator extends BaseLegacyScoreAccumulator<LinearScoringData> {
/**
* Constructs with a model and LinearScoringData
*/
LegacyScoreAccumulator(LightweightLinearModel model) {
super(model);
}
/**
* Updates the accumulator score with features; after this call the score should already
* be computed.
*
* @deprecated This function is retired; we suggest switching to updateScoresWithFeatures in
* SchemaBasedScoreAccumulator.
*/
@Override
@Deprecated
protected void updateScoreWithFeatures(LinearScoringData data) {
addContinuousFeature(TweetScoringFeatures.LUCENE_SCORE, data.luceneScore);
addContinuousFeature(TweetScoringFeatures.TEXT_SCORE, data.textScore);
addContinuousFeature(TweetScoringFeatures.TWEET_AGE_IN_SECONDS, data.tweetAgeInSeconds);
addContinuousFeature(TweetScoringFeatures.REPLY_COUNT, data.replyCountPostLog2);
addContinuousFeature(TweetScoringFeatures.RETWEET_COUNT, data.retweetCountPostLog2);
addContinuousFeature(TweetScoringFeatures.FAV_COUNT, data.favCountPostLog2);
addContinuousFeature(TweetScoringFeatures.REPLY_COUNT_V2, data.replyCountV2);
addContinuousFeature(TweetScoringFeatures.RETWEET_COUNT_V2, data.retweetCountV2);
addContinuousFeature(TweetScoringFeatures.FAV_COUNT_V2, data.favCountV2);
addContinuousFeature(TweetScoringFeatures.EMBEDS_IMPRESSION_COUNT,
data.getEmbedsImpressionCount(false));
addContinuousFeature(TweetScoringFeatures.EMBEDS_URL_COUNT, data.getEmbedsUrlCount(false));
addContinuousFeature(TweetScoringFeatures.VIDEO_VIEW_COUNT, data.getVideoViewCount(false));
addContinuousFeature(TweetScoringFeatures.QUOTED_COUNT, data.quotedCount);
addContinuousFeature(TweetScoringFeatures.WEIGHTED_RETWEET_COUNT, data.weightedRetweetCount);
addContinuousFeature(TweetScoringFeatures.WEIGHTED_REPLY_COUNT, data.weightedReplyCount);
addContinuousFeature(TweetScoringFeatures.WEIGHTED_FAV_COUNT, data.weightedFavCount);
addContinuousFeature(TweetScoringFeatures.WEIGHTED_QUOTE_COUNT, data.weightedQuoteCount);
addBinaryFeature(TweetScoringFeatures.HAS_URL, data.hasUrl);
addBinaryFeature(TweetScoringFeatures.HAS_CARD, data.hasCard);
addBinaryFeature(TweetScoringFeatures.HAS_VINE, data.hasVine);
addBinaryFeature(TweetScoringFeatures.HAS_PERISCOPE, data.hasPeriscope);
addBinaryFeature(TweetScoringFeatures.HAS_NATIVE_IMAGE, data.hasNativeImage);
addBinaryFeature(TweetScoringFeatures.HAS_IMAGE_URL, data.hasImageUrl);
addBinaryFeature(TweetScoringFeatures.HAS_NEWS_URL, data.hasNewsUrl);
addBinaryFeature(TweetScoringFeatures.HAS_VIDEO_URL, data.hasVideoUrl);
addBinaryFeature(TweetScoringFeatures.HAS_CONSUMER_VIDEO, data.hasConsumerVideo);
addBinaryFeature(TweetScoringFeatures.HAS_PRO_VIDEO, data.hasProVideo);
addBinaryFeature(TweetScoringFeatures.HAS_QUOTE, data.hasQuote);
addBinaryFeature(TweetScoringFeatures.HAS_TREND, data.hasTrend);
addBinaryFeature(TweetScoringFeatures.HAS_MULTIPLE_HASHTAGS_OR_TRENDS,
data.hasMultipleHashtagsOrTrends);
addBinaryFeature(TweetScoringFeatures.IS_OFFENSIVE, data.isOffensive);
addBinaryFeature(TweetScoringFeatures.IS_REPLY, data.isReply);
addBinaryFeature(TweetScoringFeatures.IS_RETWEET, data.isRetweet);
addBinaryFeature(TweetScoringFeatures.IS_SELF_TWEET, data.isSelfTweet);
addBinaryFeature(TweetScoringFeatures.IS_FOLLOW_RETWEET, data.isRetweet && data.isFollow);
addBinaryFeature(TweetScoringFeatures.IS_TRUSTED_RETWEET, data.isRetweet && data.isTrusted);
addContinuousFeature(TweetScoringFeatures.QUERY_SPECIFIC_SCORE, data.querySpecificScore);
addContinuousFeature(TweetScoringFeatures.AUTHOR_SPECIFIC_SCORE, data.authorSpecificScore);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_FOLLOW, data.isFollow);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_TRUSTED, data.isTrusted);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_VERIFIED, data.isFromVerifiedAccount);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_NSFW, data.isUserNSFW);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_SPAM, data.isUserSpam);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_BOT, data.isUserBot);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_ANTISOCIAL, data.isUserAntiSocial);
addContinuousFeature(TweetScoringFeatures.AUTHOR_REPUTATION, data.userRep);
addContinuousFeature(TweetScoringFeatures.SEARCHER_LANG_SCORE, data.userLangMult);
addBinaryFeature(TweetScoringFeatures.HAS_DIFFERENT_LANG, data.hasDifferentLang);
addBinaryFeature(TweetScoringFeatures.HAS_ENGLISH_TWEET_AND_DIFFERENT_UI_LANG,
data.hasEnglishTweetAndDifferentUILang);
addBinaryFeature(TweetScoringFeatures.HAS_ENGLISH_UI_AND_DIFFERENT_TWEET_LANG,
data.hasEnglishUIAndDifferentTweetLang);
addBinaryFeature(TweetScoringFeatures.IS_SENSITIVE_CONTENT, data.isSensitiveContent);
addBinaryFeature(TweetScoringFeatures.HAS_MULTIPLE_MEDIA, data.hasMultipleMediaFlag);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_PROFILE_EGG, data.profileIsEggFlag);
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_NEW, data.isUserNewFlag);
addContinuousFeature(TweetScoringFeatures.MENTIONS_COUNT, data.numMentions);
addContinuousFeature(TweetScoringFeatures.HASHTAGS_COUNT, data.numHashtags);
addContinuousFeature(TweetScoringFeatures.LINK_LANGUAGE_ID, data.linkLanguage);
addContinuousFeature(TweetScoringFeatures.LANGUAGE_ID, data.tweetLangId);
addBinaryFeature(TweetScoringFeatures.HAS_VISIBLE_LINK, data.hasVisibleLink);
}
}

View File

@ -1,237 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.List;
import com.google.common.collect.Lists;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.search.relevance.LinearScoringParams;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
/**
* Scoring function that uses the weights and boosts provided in the scoring parameters from the
* request.
*/
public class LinearScoringFunction extends FeatureBasedScoringFunction {
private static final double BASE_SCORE = 0.0001;
public LinearScoringFunction(
ImmutableSchemaInterface schema,
ThriftSearchQuery searchQuery,
AntiGamingFilter antiGamingFilter,
ThriftSearchResultType searchResultType,
UserTable userTable) throws IOException {
super("LinearScoringFunction", schema, searchQuery, antiGamingFilter, searchResultType,
userTable);
}
@Override
protected double computeScore(LinearScoringData data, boolean forExplanation) throws IOException {
double score = BASE_SCORE;
data.luceneContrib = params.useLuceneScoreAsBoost
? 0.0 : params.luceneWeight * data.luceneScore;
data.reputationContrib = params.reputationWeight * data.userRep;
data.textScoreContrib = params.textScoreWeight * data.textScore;
data.parusContrib = params.parusWeight * data.parusScore;
// Contributions from engagement counters. Note that we pass "true" to all getters, which means
// all values are scaled down for scoring; in raw form they are unbounded.
data.retweetContrib = params.retweetWeight * data.retweetCountPostLog2;
data.favContrib = params.favWeight * data.favCountPostLog2;
data.replyContrib = params.replyWeight * data.replyCountPostLog2;
data.embedsImpressionContrib =
params.embedsImpressionWeight * data.getEmbedsImpressionCount(true);
data.embedsUrlContrib =
params.embedsUrlWeight * data.getEmbedsUrlCount(true);
data.videoViewContrib =
params.videoViewWeight * data.getVideoViewCount(true);
data.quotedContrib =
params.quotedCountWeight * data.quotedCount;
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
data.offlineExpFeatureContributions[i] =
params.rankingOfflineExpWeights[i] * data.offlineExpFeatureValues[i];
}
data.hasUrlContrib = params.urlWeight * (data.hasUrl ? 1.0 : 0.0);
data.isReplyContrib = params.isReplyWeight * (data.isReply ? 1.0 : 0.0);
data.isFollowRetweetContrib =
params.followRetweetWeight * (data.isRetweet && data.isFollow ? 1.0 : 0.0);
data.isTrustedRetweetContrib =
params.trustedRetweetWeight * (data.isRetweet && data.isTrusted ? 1.0 : 0.0);
double replyCountOriginal = getUnscaledReplyCountFeatureValue();
data.multipleReplyContrib = params.multipleReplyWeight
* (replyCountOriginal < params.multipleReplyMinVal ? 0.0 : replyCountOriginal);
// We use the query-specific score directly as the contribution below, as it doesn't need a
// weight for the contribution computation.
score += data.luceneContrib
+ data.reputationContrib
+ data.textScoreContrib
+ data.replyContrib
+ data.multipleReplyContrib
+ data.retweetContrib
+ data.favContrib
+ data.parusContrib
+ data.embedsImpressionContrib
+ data.embedsUrlContrib
+ data.videoViewContrib
+ data.quotedContrib
+ data.hasUrlContrib
+ data.isReplyContrib
+ data.isFollowRetweetContrib
+ data.isTrustedRetweetContrib
+ data.querySpecificScore
+ data.authorSpecificScore;
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
score += data.offlineExpFeatureContributions[i];
}
return score;
}
/**
* Generates the explanation for the linear score.
*/
@Override
protected void generateExplanationForScoring(
LinearScoringData scoringData, boolean isHit, List<Explanation> details) throws IOException {
// 1. Linear components
final List<Explanation> linearDetails = Lists.newArrayList();
addLinearElementExplanation(
linearDetails, "[LuceneQueryScore]",
params.luceneWeight, scoringData.luceneScore, scoringData.luceneContrib);
if (scoringData.hasCard) {
if (scoringData.cardAuthorMatchBoostApplied) {
linearDetails.add(Explanation.match(
(float) params.cardAuthorMatchBoosts[scoringData.cardType],
"[x] card author match boost"));
}
if (scoringData.cardDescriptionMatchBoostApplied) {
linearDetails.add(Explanation.match(
(float) params.cardDescriptionMatchBoosts[scoringData.cardType],
"[x] card description match boost"));
}
if (scoringData.cardDomainMatchBoostApplied) {
linearDetails.add(Explanation.match(
(float) params.cardDomainMatchBoosts[scoringData.cardType],
"[x] card domain match boost"));
}
if (scoringData.cardTitleMatchBoostApplied) {
linearDetails.add(Explanation.match(
(float) params.cardTitleMatchBoosts[scoringData.cardType],
"[x] card title match boost"));
}
}
addLinearElementExplanation(
linearDetails, "reputation",
params.reputationWeight, scoringData.userRep, scoringData.reputationContrib);
addLinearElementExplanation(
linearDetails, "text score",
params.textScoreWeight, scoringData.textScore, scoringData.textScoreContrib);
addLinearElementExplanation(
linearDetails, "reply count (log2)",
params.replyWeight, scoringData.replyCountPostLog2, scoringData.replyContrib);
addLinearElementExplanation(
linearDetails, "multi reply",
params.multipleReplyWeight,
getUnscaledReplyCountFeatureValue() > params.multipleReplyMinVal ? 1 : 0,
scoringData.multipleReplyContrib);
addLinearElementExplanation(
linearDetails, "retweet count (log2)",
params.retweetWeight, scoringData.retweetCountPostLog2, scoringData.retweetContrib);
addLinearElementExplanation(
linearDetails, "fav count (log2)",
params.favWeight, scoringData.favCountPostLog2, scoringData.favContrib);
addLinearElementExplanation(
linearDetails, "parus score",
params.parusWeight, scoringData.parusScore, scoringData.parusContrib);
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
if (params.rankingOfflineExpWeights[i] != LinearScoringParams.DEFAULT_FEATURE_WEIGHT) {
addLinearElementExplanation(linearDetails,
"ranking exp score offline experimental #" + i,
params.rankingOfflineExpWeights[i], scoringData.offlineExpFeatureValues[i],
scoringData.offlineExpFeatureContributions[i]);
}
}
addLinearElementExplanation(linearDetails,
"embedded tweet impression count",
params.embedsImpressionWeight, scoringData.getEmbedsImpressionCount(false),
scoringData.embedsImpressionContrib);
addLinearElementExplanation(linearDetails,
"embedded tweet url count",
params.embedsUrlWeight, scoringData.getEmbedsUrlCount(false),
scoringData.embedsUrlContrib);
addLinearElementExplanation(linearDetails,
"video view count",
params.videoViewWeight, scoringData.getVideoViewCount(false),
scoringData.videoViewContrib);
addLinearElementExplanation(linearDetails,
"quoted count",
params.quotedCountWeight, scoringData.quotedCount, scoringData.quotedContrib);
addLinearElementExplanation(
linearDetails, "has url", params.urlWeight, scoringData.hasUrl ? 1.0 : 0.0,
scoringData.hasUrlContrib);
addLinearElementExplanation(
linearDetails, "is reply", params.isReplyWeight,
scoringData.isReply ? 1.0 : 0.0, scoringData.isReplyContrib);
addLinearElementExplanation(
linearDetails, "is follow retweet", params.followRetweetWeight,
scoringData.isRetweet && scoringData.isFollow ? 1.0 : 0.0,
scoringData.isFollowRetweetContrib);
addLinearElementExplanation(
linearDetails, "is trusted retweet", params.trustedRetweetWeight,
scoringData.isRetweet && scoringData.isTrusted ? 1.0 : 0.0,
scoringData.isTrustedRetweetContrib);
if (scoringData.querySpecificScore != 0.0) {
linearDetails.add(Explanation.match((float) scoringData.querySpecificScore,
"[+] query specific score adjustment"));
}
if (scoringData.authorSpecificScore != 0.0) {
linearDetails.add(Explanation.match((float) scoringData.authorSpecificScore,
"[+] author specific score adjustment"));
}
Explanation linearCombo = isHit
? Explanation.match((float) scoringData.scoreBeforeBoost,
"(MATCH) Linear components, sum of:", linearDetails)
: Explanation.noMatch("Linear components, sum of:", linearDetails);
details.add(linearCombo);
}
private void addLinearElementExplanation(List<Explanation> explanation,
String name,
double weight,
double componentValue,
double contrib) {
if (contrib == 0.0) {
return;
}
explanation.add(
Explanation.match((float) contrib,
String.format("[+] %s=%.3f weight=%.3f", name, componentValue, weight)));
}
private double getUnscaledReplyCountFeatureValue() throws IOException {
byte featureValue = (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormLowerBound(featureValue);
}
}
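
A minimal, self-contained sketch of the weighted-sum pattern that computeScore() above follows. The weights and feature values here are made up; only BASE_SCORE echoes the class:

public final class LinearComboSketch {
  private static final double BASE_SCORE = 0.0001;

  public static void main(String[] args) {
    double[] weights = {0.5, 2.0, 1.0};   // e.g. luceneWeight, reputationWeight, favWeight
    double[] features = {0.8, 0.37, 3.0}; // e.g. luceneScore, userRep, favCountPostLog2
    double score = BASE_SCORE;
    for (int i = 0; i < weights.length; i++) {
      score += weights[i] * features[i];  // each contribution is weight * feature value
    }
    System.out.println(score);            // 0.5*0.8 + 2.0*0.37 + 1.0*3.0 + 0.0001 = 4.1401
  }
}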

View File

@ -1,151 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.List;
import java.util.Map;
import com.google.common.base.Optional;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.util.ml.prediction_engine.LightweightLinearModel;
import com.twitter.search.common.util.ml.prediction_engine.SchemaBasedScoreAccumulator;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.exception.ClientException;
import com.twitter.search.earlybird.ml.ScoringModelsManager;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
/**
* Scoring function that uses the scoring models specified from the request.
*/
public class ModelBasedScoringFunction extends FeatureBasedScoringFunction {
private final SelectedModel[] selectedModels;
private final boolean useLogitScore;
private final boolean isSchemaBased;
private static final SearchCounter NUM_LEGACY_MODELS =
SearchCounter.export("scoring_function_num_legacy_models");
private static final SearchCounter NUM_SCHEMA_BASED_MODELS =
SearchCounter.export("scoring_function_num_schema_based_models");
private static final SearchCounter MIXED_MODEL_TYPES =
SearchCounter.export("scoring_function_mixed_model_types");
public ModelBasedScoringFunction(
ImmutableSchemaInterface schema,
ThriftSearchQuery searchQuery,
AntiGamingFilter antiGamingFilter,
ThriftSearchResultType searchResultType,
UserTable userTable,
ScoringModelsManager scoringModelsManager
) throws IOException, ClientException {
super("ModelBasedScoringFunction", schema, searchQuery, antiGamingFilter, searchResultType,
userTable);
ThriftRankingParams rankingParams = searchQuery.getRelevanceOptions().getRankingParams();
Preconditions.checkNotNull(rankingParams);
if (rankingParams.getSelectedModelsSize() <= 0) {
throw new ClientException("Scoring type is MODEL_BASED but no models were selected");
}
Map<String, Double> models = rankingParams.getSelectedModels();
selectedModels = new SelectedModel[models.size()];
int numSchemaBased = 0;
int i = 0;
for (Map.Entry<String, Double> nameAndWeight : models.entrySet()) {
Optional<LightweightLinearModel> model =
scoringModelsManager.getModel(nameAndWeight.getKey());
if (!model.isPresent()) {
throw new ClientException(String.format(
"Scoring function is MODEL_BASED. Selected model '%s' not found",
nameAndWeight.getKey()));
}
selectedModels[i] =
new SelectedModel(nameAndWeight.getKey(), nameAndWeight.getValue(), model.get());
if (selectedModels[i].model.isSchemaBased()) {
++numSchemaBased;
NUM_SCHEMA_BASED_MODELS.increment();
} else {
NUM_LEGACY_MODELS.increment();
}
++i;
}
// We should either see all models schema-based, or none of them. If that is not the case, we
// bump a counter and reject the request with a ClientException.
if (numSchemaBased > 0 && numSchemaBased != selectedModels.length) {
MIXED_MODEL_TYPES.increment();
throw new ClientException(
"You cannot mix schema-based and non-schema-based models in the same request, "
+ "models are: " + models.keySet());
}
isSchemaBased = selectedModels[0].model.isSchemaBased();
useLogitScore = rankingParams.isUseLogitScore();
}
@Override
protected double computeScore(LinearScoringData data, boolean forExplanation) throws IOException {
ThriftSearchResultFeatures features =
isSchemaBased ? createFeaturesForDocument(data, false).getFeatures() : null;
double score = 0;
for (SelectedModel selectedModel : selectedModels) {
double modelScore = isSchemaBased
? new SchemaBasedScoreAccumulator(selectedModel.model).scoreWith(features, useLogitScore)
: new LegacyScoreAccumulator(selectedModel.model).scoreWith(data, useLogitScore);
score += selectedModel.weight * modelScore;
}
return score;
}
@Override
protected void generateExplanationForScoring(
LinearScoringData scoringData, boolean isHit, List<Explanation> details) throws IOException {
boolean schemaBased = selectedModels[0].model.isSchemaBased();
ThriftSearchResultFeatures features =
schemaBased ? createFeaturesForDocument(scoringData, false).getFeatures() : null;
// 1. Model-based score
final List<Explanation> modelExplanations = Lists.newArrayList();
float finalScore = 0;
for (SelectedModel selectedModel : selectedModels) {
double modelScore = schemaBased
? new SchemaBasedScoreAccumulator(selectedModel.model).scoreWith(features, useLogitScore)
: new LegacyScoreAccumulator(selectedModel.model).scoreWith(scoringData, useLogitScore);
float weightedScore = (float) (selectedModel.weight * modelScore);
details.add(Explanation.match(
weightedScore, String.format("model=%s score=%.6f weight=%.3f useLogitScore=%s",
selectedModel.name, modelScore, selectedModel.weight, useLogitScore)));
finalScore += weightedScore;
}
details.add(Explanation.match(
finalScore, String.format("Total model-based score (hit=%s)", isHit), modelExplanations));
}
private static final class SelectedModel {
public final String name;
public final double weight;
public final LightweightLinearModel model;
private SelectedModel(String name, double weight, LightweightLinearModel model) {
this.name = name;
this.weight = weight;
this.model = model;
}
}
}
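
computeScore() above is a weighted ensemble: each selected model produces a score, and the request-supplied weight scales it. A hedged sketch with hypothetical model scores and weights:

public final class EnsembleSketch {
  public static void main(String[] args) {
    double[] modelScores = {0.62, 0.48};  // hypothetical per-model outputs
    double[] modelWeights = {0.7, 0.3};   // request-supplied weights
    double score = 0;
    for (int i = 0; i < modelScores.length; i++) {
      score += modelWeights[i] * modelScores[i];
    }
    System.out.println(score);            // 0.7*0.62 + 0.3*0.48 = 0.578
  }
}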

View File

@ -1,164 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.Objects;
import java.util.Set;
import javax.annotation.Nullable;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.results.thriftjava.FieldHitAttribution;
/**
* A wrapper for a Lucene query which first computes Lucene's query score
* and then delegates to a {@link ScoringFunction} for final score computation.
*/
public class RelevanceQuery extends Query {
private static final Logger LOG = LoggerFactory.getLogger(RelevanceQuery.class.getName());
protected final Query luceneQuery;
protected final ScoringFunction scoringFunction;
// True when the lucene query's score should be ignored for debug explanations.
protected final boolean ignoreLuceneQueryScoreExplanation;
public RelevanceQuery(Query luceneQuery, ScoringFunction scoringFunction) {
this(luceneQuery, scoringFunction, false);
}
public RelevanceQuery(Query luceneQuery,
ScoringFunction scoringFunction,
boolean ignoreLuceneQueryScoreExplanation) {
this.luceneQuery = luceneQuery;
this.scoringFunction = scoringFunction;
this.ignoreLuceneQueryScoreExplanation = ignoreLuceneQueryScoreExplanation;
}
public ScoringFunction getScoringFunction() {
return scoringFunction;
}
public Query getLuceneQuery() {
return luceneQuery;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
Query rewritten = luceneQuery.rewrite(reader);
if (rewritten == luceneQuery) {
return this;
}
return new RelevanceQuery(rewritten, scoringFunction, ignoreLuceneQueryScoreExplanation);
}
@Override
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
throws IOException {
Weight luceneWeight = luceneQuery.createWeight(searcher, scoreMode, boost);
if (luceneWeight == null) {
return null;
}
return new RelevanceWeight(searcher, luceneWeight);
}
public class RelevanceWeight extends Weight {
private final Weight luceneWeight;
public RelevanceWeight(IndexSearcher searcher, Weight luceneWeight) {
super(RelevanceQuery.this);
this.luceneWeight = luceneWeight;
}
@Override
public void extractTerms(Set<Term> terms) {
this.luceneWeight.extractTerms(terms);
}
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
return explain(context, doc, null);
}
/**
* Returns an explanation of the scoring for the given document.
*
* @param context The context of the reader that returned this document.
* @param doc The document.
* @param fieldHitAttribution Per-hit field attribution information.
* @return An explanation of the scoring for the given document.
*/
public Explanation explain(LeafReaderContext context, int doc,
@Nullable FieldHitAttribution fieldHitAttribution) throws IOException {
Explanation luceneExplanation = Explanation.noMatch("LuceneQuery explain skipped");
if (!ignoreLuceneQueryScoreExplanation) {
// get Lucene score
try {
luceneExplanation = luceneWeight.explain(context, doc);
} catch (Exception e) {
// We sometimes see exceptions resulting from term queries that do not store
// utf8-text, which TermQuery.toString() assumes. Catch here and allow at least
// scoring function explanations to be returned.
LOG.error("Exception in explain", e);
luceneExplanation = Explanation.noMatch("LuceneQuery explain failed");
}
}
Explanation scoringFunctionExplanation;
scoringFunction.setFieldHitAttribution(fieldHitAttribution);
scoringFunctionExplanation = scoringFunction.explain(
context.reader(), doc, luceneExplanation.getValue().floatValue());
// just add a wrapper for a better structure of the final explanation
Explanation luceneExplanationWrapper = Explanation.match(
luceneExplanation.getValue(), "LuceneQuery", luceneExplanation);
return Explanation.match(scoringFunctionExplanation.getValue(), "RelevanceQuery",
scoringFunctionExplanation, luceneExplanationWrapper);
}
@Override
public Scorer scorer(LeafReaderContext context) throws IOException {
return luceneWeight.scorer(context);
}
@Override
public boolean isCacheable(LeafReaderContext ctx) {
return luceneWeight.isCacheable(ctx);
}
}
@Override
public int hashCode() {
return (luceneQuery == null ? 0 : luceneQuery.hashCode())
+ (scoringFunction == null ? 0 : scoringFunction.hashCode()) * 13;
}
@Override
public boolean equals(Object obj) {
if (!(obj instanceof RelevanceQuery)) {
return false;
}
RelevanceQuery query = (RelevanceQuery) obj;
return Objects.equals(luceneQuery, query.luceneQuery)
&& Objects.equals(scoringFunction, query.scoringFunction);
}
@Override
public String toString(String field) {
return "RelevanceQuery[q=" + luceneQuery.toString(field) + "]";
}
}
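
Usage-wise, RelevanceQuery wraps any Lucene query so that Lucene's per-hit score is handed to the ScoringFunction. A hedged sketch; the indexSearcher and scoringFunction variables and the field name are assumptions, not from this file:

Query lucene = new TermQuery(new Term("text", "earlybird"));
Query wrapped = new RelevanceQuery(lucene, scoringFunction);
// Each hit's Lucene score is passed to scoringFunction.score(docId, luceneScore).
TopDocs hits = indexSearcher.search(wrapped, 10);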

View File

@ -1,165 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/**
* A toptweets query cache index selection scoring function that is based purely on retweet counts.
* The goal of this scoring function is to deprecate the itweet score in its entirety.
*
* Once all legacy itweet scores are drained from the existing earlybird index and the new parus
* score takes over the itweet score position, this class will be deprecated and replaced by a
* new scoring function based on the parus score.
*
* This scoring function is only used in the query cache for marking top tweets in the
* background. When searched, those tweets are still ranked with the linear or model-based
* scoring function.
*
*/
public class RetweetBasedTopTweetsScoringFunction extends ScoringFunction {
private static final double DEFAULT_RECENCY_SCORE_FRACTION = 0.1;
private static final double DEFAULT_SIGMOID_ALPHA = 0.008;
private static final int DEFAULT_RECENCY_CENTER_MINUTES = 1080;
// if you update the default cut off, make sure you update the query cache filter in
// querycache.yml
//
// We know that currently each time slice of each partition has about 10K entries (unique
// tweets) in the toptweets query cache. Looking at retweet updates, each time slice of each
// partition has about 650K unique tweets that received a retweet. To create a roughly similar
// number of entries in the query cache, we need the top 2% of such tweets, which sets the min
// retweet count to 4.
// In this linear scoring function, we rescale the retweet count from an input range of [0, 20]
// to the [0, 1] range. Given the recency factor's weight of 0.1, that puts the minimal retweet
// score threshold at 4/20 * 0.9 = 0.18.
// Testing on prod showed much higher volume due to the generous max value of 20 (the highest we
// have seen is 14), so the threshold was adjusted to 0.21, which gave us similar volume.
private static final double DEFAULT_CUT_OFF_SCORE = 0.21;
// Normalize retweet counts from [0, 20] range to [0, 1] range
private static final double MAX_RETWEET_COUNT = 20.0;
private static final double MIN_USER_REPUTATION = 40.0; // matches itweet system threshold
/**
* The scores for the retweet based top tweets have to be in the [0, 1] interval. So we can't use
* SKIP_HIT as the lowest possible score, and instead have to use Float.MIN_VALUE.
*
* It's OK to use different values for these constants, because they do not interfere with each
* other. This constant is only used in RetweetBasedTopTweetsScoringFunction, which is only used
* to filter the hits for the [score_filter retweets minScore maxScore] operator. So the scores
* returned by RetweetBasedTopTweetsScoringFunction.score() do not have any impact on the final
* hit score.
*
* See EarlybirdLuceneQueryVisitor.visitScoredFilterOperator() and ScoreFilterQuery for more details.
*/
private static final float RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE = Float.MIN_VALUE;
private final double recencyScoreFraction;
private final double sigmoidAlpha;
private final double cutOffScore;
private final int recencyCenterMinutes;
private final double maxRecency;
private final int currentTimeSeconds;
private ThriftSearchResultMetadata metadata = null;
private double score;
private double retweetCount;
public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema) {
this(schema, DEFAULT_RECENCY_SCORE_FRACTION,
DEFAULT_SIGMOID_ALPHA,
DEFAULT_CUT_OFF_SCORE,
DEFAULT_RECENCY_CENTER_MINUTES);
}
/**
* Creates a no-decay scoring function (used by the top archive).
* Otherwise the same as the default constructor.
* @param nodecay If true, no decay is applied (alpha is set to 0.0).
*/
public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema, boolean nodecay) {
this(schema, DEFAULT_RECENCY_SCORE_FRACTION,
nodecay ? 0.0 : DEFAULT_SIGMOID_ALPHA,
DEFAULT_CUT_OFF_SCORE,
DEFAULT_RECENCY_CENTER_MINUTES);
}
public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema,
double recencyScoreFraction, double sigmoidAlpha,
double cutOffScore, int recencyCenterMinutes) {
super(schema);
this.recencyScoreFraction = recencyScoreFraction;
this.sigmoidAlpha = sigmoidAlpha;
this.cutOffScore = cutOffScore;
this.recencyCenterMinutes = recencyCenterMinutes;
this.maxRecency = computeSigmoid(0);
this.currentTimeSeconds = (int) (System.currentTimeMillis() / 1000);
}
@Override
protected float score(float luceneQueryScore) throws IOException {
// Reset the data for each tweet!!!
metadata = null;
if (documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)
|| (documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION)
< MIN_USER_REPUTATION)) {
score = RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE;
} else {
// Note that here we want the post log2 value, as the MAX_RETWEET_COUNT was actually
// set up for that.
retweetCount = MutableFeatureNormalizers.BYTE_NORMALIZER.unnormAndLog2(
(byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT));
final double recencyScore = computeTopTweetRecencyScore();
score = (retweetCount / MAX_RETWEET_COUNT) * (1 - recencyScoreFraction)
+ recencyScoreFraction * recencyScore;
if (score < this.cutOffScore) {
score = RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE;
}
}
return (float) score;
}
private double computeSigmoid(double x) {
return 1.0f / (1.0f + Math.exp(sigmoidAlpha * (x - recencyCenterMinutes)));
}
private double computeTopTweetRecencyScore() {
double diffMinutes =
Math.max(0, currentTimeSeconds - timeMapper.getTime(getCurrentDocID())) / 60.0;
return computeSigmoid(diffMinutes) / maxRecency;
}
@Override
protected Explanation doExplain(float luceneScore) {
return null;
}
@Override
public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
if (metadata == null) {
metadata = new ThriftSearchResultMetadata()
.setResultType(ThriftSearchResultType.POPULAR)
.setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
metadata.setRetweetCount((int) retweetCount);
metadata.setScore(score);
}
return metadata;
}
@Override
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
}
}
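
Plugging the comment's own numbers into the score formula above makes the cut-off arithmetic concrete; a small sketch using the class constants:

public final class CutOffSketch {
  public static void main(String[] args) {
    double recencyScoreFraction = 0.1;  // DEFAULT_RECENCY_SCORE_FRACTION
    double maxRetweetCount = 20.0;      // MAX_RETWEET_COUNT
    double retweetCount = 4.0;          // the minimum count the comment wants to keep
    double retweetPart = (retweetCount / maxRetweetCount) * (1 - recencyScoreFraction);
    System.out.println(retweetPart);    // ~0.18: the pre-recency lower bound, later raised to 0.21
  }
}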

View File

@ -1,213 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.util.List;
import com.google.common.base.Preconditions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation;
import com.twitter.common.collections.Pair;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures;
import com.twitter.search.common.results.thriftjava.FieldHitAttribution;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
import com.twitter.search.queryparser.query.Query;
/**
* Defines a ranking function which computes the score of a document that matches a query.
*/
public abstract class ScoringFunction {
/**
* Returned by {@link #score(int, float)} to indicate that a hit should be scored below all others.
*
* We have some equality tests like:
* "if (score == ScoringFunction.SKIP_HIT) {...}" (DefaultScoringFunction#updateRelevanceStats)
* We might also have double to float casts.
*
* Such casts seem to work with the equality test, but there might be corner cases where casting
* this float value to a double (and back) does not work properly.
*
* If possible, we should choose a constant that is not in the valid score range. Then we can
* turn the float equality tests into Math.abs(...) < EPSILON tests.
*/
public static final float SKIP_HIT = -Float.MAX_VALUE;
private final ImmutableSchemaInterface schema;
// The current doc ID and the reader for the current segment should be private, because we don't
// want sub-classes to incorrectly update them. The doc ID should only be updated by the score()
// and explain() methods, and the reader should only be updated by the setNextReader() method.
private int currentDocID = -1;
protected DocIDToTweetIDMapper tweetIDMapper = null;
protected TimeMapper timeMapper = null;
protected EarlybirdDocumentFeatures documentFeatures;
protected int debugMode = 0;
protected HitAttributeHelper hitAttributeHelper;
protected Query query;
protected FieldHitAttribution fieldHitAttribution;
public ScoringFunction(ImmutableSchemaInterface schema) {
this.schema = Preconditions.checkNotNull(schema);
}
protected ImmutableSchemaInterface getSchema() {
return schema;
}
/**
* Updates the reader that will be used to retrieve the tweet IDs and creation times associated
* with scored doc IDs, as well as the values for various CSFs. Should be called every time the
* searcher starts searching in a new segment.
*/
public void setNextReader(EarlybirdIndexSegmentAtomicReader reader) throws IOException {
tweetIDMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
timeMapper = reader.getSegmentData().getTimeMapper();
documentFeatures = new EarlybirdDocumentFeatures(reader);
initializeNextSegment(reader);
}
public void setHitAttributeHelperAndQuery(HitAttributeHelper newHitAttributeHelper,
Query parsedQuery) {
this.hitAttributeHelper = newHitAttributeHelper;
this.query = parsedQuery;
}
public void setFieldHitAttribution(FieldHitAttribution fieldHitAttribution) {
this.fieldHitAttribution = fieldHitAttribution;
}
public void setDebugMode(int debugMode) {
this.debugMode = debugMode;
}
/**
* Allow scoring functions to perform more per-segment-specific setup.
*/
protected void initializeNextSegment(EarlybirdIndexSegmentAtomicReader reader)
throws IOException {
// Noop by default
}
// Updates the current document ID and advances all NumericDocValues to this doc ID.
private void setCurrentDocID(int currentDocID) throws IOException {
this.currentDocID = currentDocID;
documentFeatures.advance(currentDocID);
}
/**
* Returns the current doc ID stored in this scoring function.
*/
public int getCurrentDocID() {
return currentDocID;
}
/**
* Compute the score for the current hit. This is not expected to be thread safe.
*
* @param internalDocID internal id of the matching hit
* @param luceneQueryScore the score that lucene's text query computed for this hit
*/
public float score(int internalDocID, float luceneQueryScore) throws IOException {
setCurrentDocID(internalDocID);
return score(luceneQueryScore);
}
/**
* Compute the score for the current hit. This is not expected to be thread safe.
*
* @param luceneQueryScore the score that lucene's text query computed for this hit
*/
protected abstract float score(float luceneQueryScore) throws IOException;
/** Returns an explanation for the given hit. */
public final Explanation explain(IndexReader reader, int internalDocID, float luceneScore)
throws IOException {
setNextReader((EarlybirdIndexSegmentAtomicReader) reader);
setCurrentDocID(internalDocID);
return doExplain(luceneScore);
}
/** Returns an explanation for the current document. */
protected abstract Explanation doExplain(float luceneScore) throws IOException;
/**
* Returns the scoring metadata for the current doc ID.
*/
public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options)
throws IOException {
ThriftSearchResultMetadata metadata = new ThriftSearchResultMetadata();
metadata.setResultType(ThriftSearchResultType.RELEVANCE);
metadata.setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
metadata.setLanguage(ThriftLanguage.findByValue(
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)));
metadata.setSignature(
(int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE));
metadata.setIsNullcast(documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG));
return metadata;
}
/**
* Updates the given ThriftSearchResultsRelevanceStats instance based on the scoring metadata for
* the current doc ID.
*/
public abstract void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats);
/**
* Score a list of hits. Not thread safe.
*/
public float[] batchScore(List<BatchHit> hits) throws IOException {
throw new UnsupportedOperationException("This operation (batchScore) is not implemented!");
}
/**
* Collect the features and CSFs for the current document. Used for scoring and generating the
* returned metadata.
*/
public Pair<LinearScoringData, ThriftSearchResultFeatures> collectFeatures(
float luceneQueryScore) throws IOException {
throw new UnsupportedOperationException("This operation (collectFeatures) is not implemented!");
}
/**
* Implement this function to populate the result metadata based on the given scoring data.
* Otherwise, this is a no-op.
*
* Scoring functions that implement this should also implement getScoringData().
*/
public void populateResultMetadataBasedOnScoringData(
ThriftSearchResultMetadataOptions options,
ThriftSearchResultMetadata metadata,
LinearScoringData data) throws IOException {
// Make sure that the scoring data passed in is null because getScoringDataForCurrentDocument()
// returns null by default and if a subclass overrides one of these two methods, it should
// override both.
Preconditions.checkState(data == null, "LinearScoringData should be null");
}
/**
* This should only be called at hit collection time because it relies on the internal doc id.
*
* Scoring functions that implement this should also implement the function
* populateResultMetadataBasedOnScoringData().
*/
public LinearScoringData getScoringDataForCurrentDocument() {
return null;
}
}
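
A minimal hypothetical subclass illustrating the contract above: the searcher calls setNextReader() once per segment and score(internalDocID, luceneQueryScore) once per hit; only the three abstract methods need implementing:

import java.io.IOException;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

public class ConstantBoostScoringFunction extends ScoringFunction {
  public ConstantBoostScoringFunction(ImmutableSchemaInterface schema) {
    super(schema);
  }

  @Override
  protected float score(float luceneQueryScore) throws IOException {
    return 2.0f * luceneQueryScore; // trivially double Lucene's text score
  }

  @Override
  protected Explanation doExplain(float luceneScore) {
    return Explanation.match(2.0f * luceneScore, "2x Lucene score");
  }

  @Override
  public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
    // Nothing to track in this sketch.
  }
}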

View File

@ -1,216 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftScoringFunctionType;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.exception.ClientException;
import com.twitter.search.earlybird.ml.ScoringModelsManager;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.queryparser.query.Query;
/**
* Returns a scoring function for a particular experiment ID.
*
* Can be used for A/B testing of different scoring formulas.
*/
public abstract class ScoringFunctionProvider {
private static final Logger LOG = LoggerFactory.getLogger(ScoringFunctionProvider.class);
/**
* Returns the scoring function.
*/
public abstract ScoringFunction getScoringFunction() throws IOException, ClientException;
public static final String RETWEETS_SCORER_NAME = "retweets";
public static final String NO_SPAM_SCORER_NAME = "no_spam";
public static final String TEST_SCORER_NAME = "test";
// Whether to avoid time decay when scoring top tweets.
// Top archive does not need time decay.
private static final boolean TOP_TWEET_WITH_DECAY =
EarlybirdConfig.getBool("top_tweet_scoring_with_decay", true);
/**
* Abstract class that can be used for ScoringFunctions that don't throw a ClientException.
*
* It does throw an IOException, but it doesn't throw a ClientException, so the name can be a
* bit misleading.
*/
public abstract static class NamedScoringFunctionProvider extends ScoringFunctionProvider {
/**
* Returns the scoring function.
*/
public abstract ScoringFunction getScoringFunction() throws IOException;
}
/**
* Returns the scoring function provider with the given name, or null if no such provider exists.
*/
public static NamedScoringFunctionProvider getScoringFunctionProviderByName(
String name, final ImmutableSchemaInterface schema) {
if (name.equals(NO_SPAM_SCORER_NAME)) {
return new NamedScoringFunctionProvider() {
@Override
public ScoringFunction getScoringFunction() throws IOException {
return new SpamVectorScoringFunction(schema);
}
};
} else if (name.equals(RETWEETS_SCORER_NAME)) {
return new NamedScoringFunctionProvider() {
@Override
public ScoringFunction getScoringFunction() throws IOException {
// Production top tweet actually uses this.
if (TOP_TWEET_WITH_DECAY) {
return new RetweetBasedTopTweetsScoringFunction(schema);
} else {
return new RetweetBasedTopTweetsScoringFunction(schema, true);
}
}
};
} else if (name.equals(TEST_SCORER_NAME)) {
return new NamedScoringFunctionProvider() {
@Override
public ScoringFunction getScoringFunction() throws IOException {
return new TestScoringFunction(schema);
}
};
}
return null;
}
/**
* Returns default scoring functions for the different scoring function types,
* and provides fallback behavior if a model-based scoring function is unavailable.
*/
public static class DefaultScoringFunctionProvider extends ScoringFunctionProvider {
private final EarlybirdRequest request;
private final ImmutableSchemaInterface schema;
private final ThriftSearchQuery searchQuery;
private final AntiGamingFilter antiGamingFilter;
private final UserTable userTable;
private final HitAttributeHelper hitAttributeHelper;
private final Query parsedQuery;
private final ScoringModelsManager scoringModelsManager;
private final TensorflowModelsManager tensorflowModelsManager;
private static final SearchCounter MODEL_BASED_SCORING_FUNCTION_CREATED =
SearchCounter.export("model_based_scoring_function_created");
private static final SearchCounter MODEL_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION =
SearchCounter.export("model_based_fallback_to_linear_scoring_function");
private static final SearchCounter TENSORFLOW_BASED_SCORING_FUNCTION_CREATED =
SearchCounter.export("tensorflow_based_scoring_function_created");
private static final SearchCounter TENSORFLOW_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION =
SearchCounter.export("tensorflow_fallback_to_linear_function_scoring_function");
public DefaultScoringFunctionProvider(
final EarlybirdRequest request,
final ImmutableSchemaInterface schema,
final ThriftSearchQuery searchQuery,
final AntiGamingFilter antiGamingFilter,
final UserTable userTable,
final HitAttributeHelper hitAttributeHelper,
final Query parsedQuery,
final ScoringModelsManager scoringModelsManager,
final TensorflowModelsManager tensorflowModelsManager) {
this.request = request;
this.schema = schema;
this.searchQuery = searchQuery;
this.antiGamingFilter = antiGamingFilter;
this.userTable = userTable;
this.hitAttributeHelper = hitAttributeHelper;
this.parsedQuery = parsedQuery;
this.scoringModelsManager = scoringModelsManager;
this.tensorflowModelsManager = tensorflowModelsManager;
}
@Override
public ScoringFunction getScoringFunction() throws IOException, ClientException {
if (searchQuery.isSetRelevanceOptions()
&& searchQuery.getRelevanceOptions().isSetRankingParams()) {
ThriftRankingParams params = searchQuery.getRelevanceOptions().getRankingParams();
ThriftScoringFunctionType type = params.isSetType()
? params.getType() : ThriftScoringFunctionType.LINEAR; // default type
switch (type) {
case LINEAR:
return createLinear();
case MODEL_BASED:
if (scoringModelsManager.isEnabled()) {
MODEL_BASED_SCORING_FUNCTION_CREATED.increment();
return createModelBased();
} else {
// From ScoringModelsManager.NO_OP_MANAGER. Fall back to LinearScoringFunction
MODEL_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION.increment();
return createLinear();
}
case TENSORFLOW_BASED:
if (tensorflowModelsManager.isEnabled()) {
TENSORFLOW_BASED_SCORING_FUNCTION_CREATED.increment();
return createTensorflowBased();
} else {
// Fallback to linear scoring if tf manager is disabled
TENSORFLOW_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION.increment();
return createLinear();
}
case TOPTWEETS:
return createTopTweets();
default:
throw new IllegalArgumentException("Unknown scoring type " + type + " in " + searchQuery);
}
} else {
LOG.error("No relevance options provided, query = " + searchQuery);
return new DefaultScoringFunction(schema);
}
}
private ScoringFunction createLinear() throws IOException {
LinearScoringFunction scoringFunction = new LinearScoringFunction(
schema, searchQuery, antiGamingFilter, ThriftSearchResultType.RELEVANCE,
userTable);
scoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);
return scoringFunction;
}
/**
* For the model-based scoring function, a ClientException will be thrown if the client selects
* a model unknown to the scoring models manager.
* {@link com.twitter.search.earlybird.search.relevance.scoring.ModelBasedScoringFunction}
*/
private ScoringFunction createModelBased() throws IOException, ClientException {
ModelBasedScoringFunction scoringFunction = new ModelBasedScoringFunction(
schema, searchQuery, antiGamingFilter, ThriftSearchResultType.RELEVANCE, userTable,
scoringModelsManager);
scoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);
return scoringFunction;
}
private ScoringFunction createTopTweets() throws IOException {
return new LinearScoringFunction(
schema, searchQuery, antiGamingFilter, ThriftSearchResultType.POPULAR, userTable);
}
private TensorflowBasedScoringFunction createTensorflowBased()
throws IOException, ClientException {
TensorflowBasedScoringFunction tfScoringFunction = new TensorflowBasedScoringFunction(
request, schema, searchQuery, antiGamingFilter,
ThriftSearchResultType.RELEVANCE, userTable, tensorflowModelsManager);
tfScoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);
return tfScoringFunction;
}
}
}
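
A hedged usage sketch of the named-provider lookup above; the schema variable is an assumption, and getScoringFunction() can throw an IOException:

ScoringFunctionProvider.NamedScoringFunctionProvider provider =
    ScoringFunctionProvider.getScoringFunctionProviderByName(
        ScoringFunctionProvider.RETWEETS_SCORER_NAME, schema);
if (provider != null) {
  ScoringFunction scoringFunction = provider.getScoringFunction();
}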

View File

@ -1,85 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.relevance.features.RelevanceSignalConstants;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
public class SpamVectorScoringFunction extends ScoringFunction {
private static final int MIN_TWEEPCRED_WITH_LINK =
EarlybirdConfig.getInt("min_tweepcred_with_non_whitelisted_link", 25);
// The engagement threshold that prevents us from filtering users with low tweepcred.
private static final int ENGAGEMENTS_NO_FILTER = 1;
@VisibleForTesting
static final float NOT_SPAM_SCORE = 0.5f;
@VisibleForTesting
static final float SPAM_SCORE = -0.5f;
public SpamVectorScoringFunction(ImmutableSchemaInterface schema) {
super(schema);
}
@Override
protected float score(float luceneQueryScore) throws IOException {
if (documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) {
return NOT_SPAM_SCORE;
}
int tweepCredThreshold = 0;
if (documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_LINK_FLAG)
&& !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG)
&& !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG)
&& !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG)) {
// Contains a non-media non-news link, definite spam vector.
tweepCredThreshold = MIN_TWEEPCRED_WITH_LINK;
}
int tweepcred = (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION);
// For new users, tweepcred is set to a sentinel value of -128, specified in
// src/thrift/com/twitter/search/common/indexing/status.thrift
if (tweepcred >= tweepCredThreshold
|| tweepcred == (int) RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
return NOT_SPAM_SCORE;
}
double retweetCount =
documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
double replyCount =
documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
double favoriteCount =
documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);
// If the tweet has enough engagements, do not mark it as spam.
if (retweetCount + replyCount + favoriteCount >= ENGAGEMENTS_NO_FILTER) {
return NOT_SPAM_SCORE;
}
return SPAM_SCORE;
}
@Override
protected Explanation doExplain(float luceneScore) {
return null;
}
@Override
public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
return null;
}
@Override
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
}
}
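
The spam decision above reduces to a short chain of checks; a hedged restatement with hypothetical inputs and the class constants inlined:

public final class SpamVectorSketch {
  public static void main(String[] args) {
    boolean verified = false;
    boolean nonMediaNonNewsLink = true; // HAS_LINK_FLAG set, none of the media/news flags
    int tweepcred = 10;                 // below MIN_TWEEPCRED_WITH_LINK (25)
    double engagements = 0;             // retweets + replies + favorites
    int threshold = nonMediaNonNewsLink ? 25 : 0;
    float score = (verified
        || tweepcred >= threshold
        || tweepcred == -128            // the unset-reputation sentinel for new users
        || engagements >= 1) ? 0.5f : -0.5f;
    System.out.println(score);          // -0.5: low tweepcred, spammy link, no engagements
  }
}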

View File

@ -1,87 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
// Ideally, this part should live somewhere in the Cortex common code. Today, it is not
// possible to create a `SparseTensor` that relies only on ByteBuffers.
public class SparseTensor {
private ByteBuffer sparseIndices;
private ByteBuffer sparseValues;
private ByteBuffer sparseShape;
private int numDocs;
private final long[] sparseShapeShapeDimension = new long[] {2L};
private final long inputBitSize = 1L << 63; // long literal: "1 << 63" would overflow as an int shift
private long numRecordsSeen = 0;
private final long numFeatures;
private int numValuesSeen;
public SparseTensor(int numDocs, int numFeatures) {
this.numDocs = numDocs;
this.numFeatures = (long) numFeatures;
this.sparseValues =
ByteBuffer
.allocate(numFeatures * numDocs * Float.BYTES)
.order(ByteOrder.LITTLE_ENDIAN);
this.sparseIndices =
ByteBuffer
.allocate(2 * numFeatures * numDocs * Long.BYTES)
.order(ByteOrder.LITTLE_ENDIAN);
this.sparseShape =
ByteBuffer
.allocate(2 * Long.BYTES)
.order(ByteOrder.LITTLE_ENDIAN);
}
public void incNumRecordsSeen() {
numRecordsSeen++;
}
/**
* Adds the given value to this tensor.
*/
public void addValue(long featureId, float value) {
sparseValues.putFloat(value);
sparseIndices.putLong(numRecordsSeen);
sparseIndices.putLong(featureId);
numValuesSeen++;
}
public ByteBuffer getSparseValues() {
sparseValues.limit(numValuesSeen * Float.BYTES);
sparseValues.rewind();
return sparseValues;
}
public long[] getSparseValuesShape() {
return new long[] {numValuesSeen};
}
public long[] getSparseIndicesShape() {
return new long[] {numValuesSeen, 2L};
}
public long[] getSparseShapeShape() {
return sparseShapeShapeDimension;
}
public ByteBuffer getSparseIndices() {
sparseIndices.limit(2 * numValuesSeen * Long.BYTES);
sparseIndices.rewind();
return sparseIndices;
}
/**
* Returns the sparse shape for this tensor.
*/
public ByteBuffer getSparseShape() {
sparseShape.putLong(numRecordsSeen);
sparseShape.putLong(inputBitSize);
sparseShape.rewind();
return sparseShape;
}
}
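
A short sketch of how the COO layout above fills in: addValue() appends a (record, featureId) index pair plus the value, and incNumRecordsSeen() advances to the next document:

public final class SparseTensorSketch {
  public static void main(String[] args) {
    SparseTensor t = new SparseTensor(2, 4); // two docs, up to four features each
    t.addValue(0, 1.5f);   // doc 0, feature id 0
    t.addValue(3, 0.25f);  // doc 0, feature id 3
    t.incNumRecordsSeen(); // advance to doc 1
    t.addValue(1, 7.0f);   // doc 1, feature id 1
    t.incNumRecordsSeen();
    // getSparseIndices() now holds the pairs (0,0), (0,3), (1,1);
    // getSparseValues() holds 1.5, 0.25, 7.0 in the same order.
  }
}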

View File

@ -1,339 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;
import org.apache.lucene.search.Explanation;
import org.tensorflow.Tensor;
import com.twitter.common.collections.Pair;
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
import com.twitter.search.common.features.EarlybirdRankingDerivedFeature;
import com.twitter.search.common.features.FeatureHandler;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager;
import com.twitter.search.earlybird.EarlybirdSearcher;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.exception.ClientException;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.modeling.common.TweetFeaturesUtils;
import com.twitter.tfcompute_java.TFModelRunner;
/**
* TensorflowBasedScoringFunction relies on a TF model for scoring tweets.
* Only the `batchScore` part is implemented.
*/
public class TensorflowBasedScoringFunction extends FeatureBasedScoringFunction {
private final TFModelRunner tfModelRunner;
// See https://stackoverflow.com/questions/37849322/how-to-understand-the-term-tensor-in-tensorflow
// for more information on this notation. In short, a TF graph is made of TF operations and
// doesn't have a first-order notion of tensors. The notation <operation>:<index> maps to the
// <index>-th output of the <operation> contained in the TF graph.
private static final String INPUT_VALUES = "input_sparse_tensor_values:0";
private static final String INPUT_INDICES = "input_sparse_tensor_indices:0";
private static final String INPUT_SHAPE = "input_sparse_tensor_shape:0";
private static final String OUTPUT_NODE = "output_scores:0";
private final Map<Integer, Long> featureSchemaIdToMlApiId;
private final Map<Long, Float> tweetIdToScoreMap = new HashMap<>();
private final EarlybirdRequest request;
public TensorflowBasedScoringFunction(
EarlybirdRequest request,
ImmutableSchemaInterface schema,
ThriftSearchQuery searchQuery,
AntiGamingFilter antiGamingFilter,
ThriftSearchResultType searchResultType,
UserTable userTable,
TensorflowModelsManager tensorflowModelsManager
) throws IOException, ClientException {
super(
"TensorflowBasedScoringFunction",
schema,
searchQuery,
antiGamingFilter,
searchResultType,
userTable
);
this.request = request;
String modelName = searchQuery.getRelevanceOptions().getRankingParams().selectedTensorflowModel;
this.featureSchemaIdToMlApiId = tensorflowModelsManager.getFeatureSchemaIdToMlApiId();
if (modelName == null) {
throw new ClientException("Scoring type is TENSORFLOW_BASED but no model was selected");
} else if (!tensorflowModelsManager.getModel(modelName).isPresent()) {
throw new ClientException(
"Scoring type is TENSORFLOW_BASED. Model "
+ modelName
+ " is not present."
);
}
if (searchQuery.getRelevanceOptions().getRankingParams().isEnableHitDemotion()) {
throw new ClientException(
"Hit attribute demotion is not supported with TENSORFLOW_BASED scoring type");
}
tfModelRunner = tensorflowModelsManager.getModel(modelName).get();
}
/**
* Single item scoring just returns the lucene score to be used during the batching phase.
*/
@Override
protected float score(float luceneQueryScore) {
return luceneQueryScore;
}
@Override
public Pair<LinearScoringData, ThriftSearchResultFeatures> collectFeatures(
float luceneQueryScore) throws IOException {
LinearScoringData linearScoringData = updateLinearScoringData(luceneQueryScore);
ThriftSearchResultFeatures features =
createFeaturesForDocument(linearScoringData, true).getFeatures();
return new Pair<>(linearScoringData, features);
}
@Override
protected FeatureHandler createFeaturesForDocument(
LinearScoringData linearScoringData,
boolean ignoreDefaultValues) throws IOException {
return super.createFeaturesForDocument(linearScoringData,
ignoreDefaultValues)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TREND_CLICK,
request.querySource == ThriftQuerySource.TREND_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TYPED_QUERY,
request.querySource == ThriftQuerySource.TYPED_QUERY)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TYPEAHEAD_CLICK,
request.querySource == ThriftQuerySource.TYPEAHEAD_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_HASHTAG_CLICK,
request.querySource == ThriftQuerySource.HASHTAG_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_RECENT_SEARCH_CLICK,
request.querySource == ThriftQuerySource.RECENT_SEARCH_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_PROFILE_CLICK,
request.querySource == ThriftQuerySource.PROFILE_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_API_CALL,
request.querySource == ThriftQuerySource.API_CALL)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_PROMOTED_TREND_CLICK,
request.querySource == ThriftQuerySource.PROMOTED_TREND_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SAVED_SEARCH_CLICK,
request.querySource == ThriftQuerySource.SAVED_SEARCH_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_CASHTAG_CLICK,
request.querySource == ThriftQuerySource.CASHTAG_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SPELLING_EXPANSION_REVERT_CLICK,
request.querySource == ThriftQuerySource.SPELLING_EXPANSION_REVERT_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SPELLING_SUGGESTION_CLICK,
request.querySource == ThriftQuerySource.SPELLING_SUGGESTION_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_LOGGED_OUT_HOME_TREND_CLICK,
request.querySource == ThriftQuerySource.LOGGED_OUT_HOME_TREND_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_RELATED_QUERY_CLICK,
request.querySource == ThriftQuerySource.RELATED_QUERY_CLICK)
.addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_AUTO_SPELL_CORRECT_REVERT_CLICK,
request.querySource == ThriftQuerySource.AUTO_SPELL_CORRECT_REVERT_CLICK);
}
/**
* Return scores computed in batchScore() if forExplanation is true.
*/
@Override
protected double computeScore(LinearScoringData data, boolean forExplanation) {
Preconditions.checkState(forExplanation,
"forExplanation is false. computeScore() should only be used for explanation creation");
return tweetIdToScoreMap.get(tweetIDMapper.getTweetID(getCurrentDocID()));
}
@Override
protected void generateExplanationForScoring(
LinearScoringData scoringData, boolean isHit, List<Explanation> details) {
}
@VisibleForTesting
SparseTensor createInputTensor(ThriftSearchResultFeatures[] featuresForDocs) {
// Moving this outside of the request path would reduce the allocation cost and make the
// `ByteBuffer`s long-lived - we would need one per thread.
SparseTensor sparseTensor =
new SparseTensor(featuresForDocs.length, featureSchemaIdToMlApiId.size());
for (ThriftSearchResultFeatures features : featuresForDocs) {
updateSparseTensor(sparseTensor, features);
}
return sparseTensor;
}
private void addSchemaBooleanFeatures(SparseTensor sparseTensor,
Map<Integer, Boolean> booleanMap) {
if (booleanMap == null || booleanMap.isEmpty()) {
return;
}
for (Map.Entry<Integer, Boolean> entry : booleanMap.entrySet()) {
Preconditions.checkState(featureSchemaIdToMlApiId.containsKey(entry.getKey()));
sparseTensor.addValue(
featureSchemaIdToMlApiId.get(entry.getKey()), entry.getValue() ? 1f : 0f);
}
}
private void addSchemaContinuousFeatures(SparseTensor sparseTensor,
Map<Integer, ? extends Number> valueMap) {
if (valueMap == null || valueMap.isEmpty()) {
return;
}
for (Map.Entry<Integer, ? extends Number> entry : valueMap.entrySet()) {
Integer id = entry.getKey();
// SEARCH-26795
if (!TweetFeaturesUtils.isFeatureDiscrete(id)) {
Preconditions.checkState(featureSchemaIdToMlApiId.containsKey(id));
sparseTensor.addValue(
featureSchemaIdToMlApiId.get(id), entry.getValue().floatValue());
}
}
}
private void updateSparseTensor(SparseTensor sparseTensor, ThriftSearchResultFeatures features) {
addSchemaBooleanFeatures(sparseTensor, features.getBoolValues());
addSchemaContinuousFeatures(sparseTensor, features.getIntValues());
addSchemaContinuousFeatures(sparseTensor, features.getLongValues());
addSchemaContinuousFeatures(sparseTensor, features.getDoubleValues());
sparseTensor.incNumRecordsSeen();
}
private float[] batchScoreInternal(ThriftSearchResultFeatures[] featuresForDocs) {
int nbDocs = featuresForDocs.length;
float[] backingArrayResults = new float[nbDocs];
SparseTensor sparseTensor = createInputTensor(featuresForDocs);
Tensor<?> sparseValues =
Tensor.create(
Float.class,
sparseTensor.getSparseValuesShape(),
sparseTensor.getSparseValues());
Tensor<?> sparseIndices =
Tensor.create(
Long.class,
sparseTensor.getSparseIndicesShape(),
sparseTensor.getSparseIndices());
Tensor<?> sparseShape =
Tensor.create(
Long.class,
sparseTensor.getSparseShapeShape(),
sparseTensor.getSparseShape());
Map<String, Tensor<?>> inputMap = ImmutableMap.of(
INPUT_VALUES, sparseValues,
INPUT_INDICES, sparseIndices,
INPUT_SHAPE, sparseShape
);
List<String> output = ImmutableList.of(OUTPUT_NODE);
Map<String, Tensor<?>> outputs = tfModelRunner.run(
inputMap,
output,
ImmutableList.of()
);
Tensor<?> outputTensor = outputs.get(OUTPUT_NODE);
try {
FloatBuffer finalResultBuffer =
FloatBuffer.wrap(backingArrayResults, 0, nbDocs);
outputTensor.writeTo(finalResultBuffer);
} finally {
// Close tensors to avoid memory leaks
sparseValues.close();
sparseIndices.close();
sparseShape.close();
if (outputTensor != null) {
outputTensor.close();
}
}
return backingArrayResults;
}
/**
* Compute the score for a list of hits. Not thread safe.
* @return Array of scores
*/
@Override
public float[] batchScore(List<BatchHit> hits) throws IOException {
ThriftSearchResultFeatures[] featuresForDocs = new ThriftSearchResultFeatures[hits.size()];
for (int i = 0; i < hits.size(); i++) {
// This is a gigantic allocation, but the models are trained to depend on unset values having
// a default.
BatchHit hit = hits.get(i);
ThriftSearchResultFeatures features = hit.getFeatures().deepCopy();
// Adjust features of a hit based on overrides provided by relevance options. Should mostly
// be used for debugging purposes.
adjustHitScoringFeatures(hit, features);
setDefaultFeatureValues(features);
featuresForDocs[i] = features;
}
float[] scores = batchScoreInternal(featuresForDocs);
float[] finalScores = new float[hits.size()];
for (int i = 0; i < hits.size(); i++) {
LinearScoringData data = hits.get(i).getScoringData();
if (data.skipReason != null && data.skipReason != LinearScoringData.SkipReason.NOT_SKIPPED) {
// If the hit should be skipped, overwrite the score with SKIP_HIT
scores[i] = SKIP_HIT;
}
// If explanations are enabled, record the score in the map; computeScore() reads it back later.
if (EarlybirdSearcher.explanationsEnabled(debugMode)) {
tweetIdToScoreMap.put(hits.get(i).getTweetID(), scores[i]);
}
finalScores[i] = postScoreComputation(
data,
scores[i],
false, // cannot get the hit attribution info for this hit at this point in time
null);
}
return finalScores;
}
private void adjustHitScoringFeatures(BatchHit hit, ThriftSearchResultFeatures features) {
if (request.isSetSearchQuery() && request.getSearchQuery().isSetRelevanceOptions()) {
ThriftSearchRelevanceOptions relevanceOptions =
request.getSearchQuery().getRelevanceOptions();
if (relevanceOptions.isSetPerTweetFeaturesOverride()
&& relevanceOptions.getPerTweetFeaturesOverride().containsKey(hit.getTweetID())) {
overrideFeatureValues(
features,
relevanceOptions.getPerTweetFeaturesOverride().get(hit.getTweetID()));
}
if (relevanceOptions.isSetPerUserFeaturesOverride()
&& relevanceOptions.getPerUserFeaturesOverride().containsKey(
hit.getScoringData().fromUserId)) {
overrideFeatureValues(
features,
relevanceOptions.getPerUserFeaturesOverride().get(hit.getScoringData().fromUserId));
}
if (relevanceOptions.isSetGlobalFeaturesOverride()) {
overrideFeatureValues(
features, relevanceOptions.getGlobalFeaturesOverride());
}
}
}
}
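createInputTensor() and batchScoreInternal() above hand the model one batch-wide sparse tensor in coordinate (COO) form: a flat values array, a parallel array of (record, feature) index pairs, and a dense shape. A minimal standalone sketch of that layout, with made-up feature IDs and values (the real SparseTensor class manages growable buffers and the featureSchemaIdToMlApiId remapping):

import java.util.Arrays;

// Standalone illustration of the COO sparse layout fed to the TF model runner.
// All IDs and values below are invented for the example.
public final class SparseCooSketch {
  public static void main(String[] args) {
    long[] denseShape = {2L, 5L};                       // 2 records, 5 features wide
    long[][] indices = {{0L, 1L}, {0L, 4L}, {1L, 2L}};  // (record, mlApiId) pairs
    float[] values = {1.0f, 0.25f, 7.0f};               // one value per index pair

    // These three parallel pieces are what INPUT_VALUES / INPUT_INDICES /
    // INPUT_SHAPE carry into tfModelRunner.run() above.
    System.out.println("shape   = " + Arrays.toString(denseShape));
    System.out.println("indices = " + Arrays.deepToString(indices));
    System.out.println("values  = " + Arrays.toString(values));
  }
}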

View File

@ -1,52 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;
import org.apache.lucene.search.Explanation;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
/**
* A dummy scoring function for tests; the score is always tweetId / 10000.0.
* Since the score_filter: operator requires all scores to be in [0, 1], don't use any tweet ID
* larger than 10000 in your test if you want to combine the two.
*/
public class TestScoringFunction extends ScoringFunction {
private ThriftSearchResultMetadata metadata = null;
private float score;
public TestScoringFunction(ImmutableSchemaInterface schema) {
super(schema);
}
@Override
protected float score(float luceneQueryScore) {
long tweetId = tweetIDMapper.getTweetID(getCurrentDocID());
this.score = (float) (tweetId / 10000.0);
System.out.printf("score for tweet %10d is %6.3f%n", tweetId, score);
return this.score;
}
@Override
protected Explanation doExplain(float luceneScore) {
return null;
}
@Override
public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
if (metadata == null) {
metadata = new ThriftSearchResultMetadata()
.setResultType(ThriftSearchResultType.RELEVANCE)
.setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
metadata.setScore(score);
}
return metadata;
}
@Override
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
}
}
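Since the class comment ties the toy score to the score_filter: operator's [0, 1] requirement, here is a quick standalone check of where the tweetId / 10000.0 formula leaves that range (no Earlybird dependencies; the class name is invented):

// Standalone check of the tweetId / 10000.0 toy score and its [0, 1] bound.
public final class TestScoreBoundCheck {
  static float toyScore(long tweetId) {
    return (float) (tweetId / 10000.0);
  }

  public static void main(String[] args) {
    for (long tweetId : new long[] {1L, 9_999L, 10_000L, 10_001L}) {
      float score = toyScore(tweetId);
      System.out.printf("tweet %6d -> %6.4f (in [0, 1]: %b)%n",
          tweetId, score, score >= 0f && score <= 1f);
    }
  }
}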

View File

@ -1,62 +0,0 @@
package com.twitter.search.earlybird.segment;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;
import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.base.Segment;
import com.twitter.search.common.util.io.dl.DLReaderWriterFactory;
import com.twitter.search.common.util.io.dl.SegmentDLUtil;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
/**
* An implementation of SegmentDataProvider using DistributedLog.
*/
public class DLSegmentDataProvider implements SegmentDataProvider {
private final int hashPartitionID;
private final DLReaderWriterFactory dlFactory;
private final SegmentDataReaderSet readerSet;
public DLSegmentDataProvider(
int hashPartitionID,
EarlybirdIndexConfig earlybirdIndexConfig,
DLReaderWriterFactory dlReaderWriterFactory) throws IOException {
this(hashPartitionID, earlybirdIndexConfig, dlReaderWriterFactory,
Clock.SYSTEM_CLOCK);
}
public DLSegmentDataProvider(
int hashPartitionID,
EarlybirdIndexConfig earlybirdIndexConfig,
DLReaderWriterFactory dlReaderWriterFactory,
Clock clock) throws IOException {
this.hashPartitionID = hashPartitionID;
this.dlFactory = dlReaderWriterFactory;
this.readerSet = new DLSegmentDataReaderSet(
dlFactory,
earlybirdIndexConfig,
clock);
}
@Override
public SegmentDataReaderSet getSegmentDataReaderSet() {
return readerSet;
}
@Override
public List<Segment> newSegmentList() throws IOException {
Set<String> segmentNames = SegmentDLUtil.getSegmentNames(dlFactory, null, hashPartitionID);
List<Segment> segmentList = new ArrayList<>(segmentNames.size());
for (String segmentName : segmentNames) {
Segment segment = Segment.fromSegmentName(segmentName, EarlybirdConfig.getMaxSegmentSize());
segmentList.add(segment);
}
// Sort the segments by ID.
Collections.sort(segmentList);
return segmentList;
}
}
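newSegmentList() above is a fetch-map-sort pipeline: read segment names from DL, build one Segment per name, and sort by ID. A hedged generic sketch of the same shape; SegmentLike is a hypothetical stand-in for Segment, and parsing the ID from the name's trailing digits is an assumption about the naming convention:

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Set;

// Fetch-map-sort sketch mirroring newSegmentList(). SegmentLike stands in for
// the real Segment class; the ID-from-name parse is an assumed convention.
public final class SegmentListSketch {
  record SegmentLike(String name, long id) {}

  static List<SegmentLike> newSegmentList(Set<String> segmentNames) {
    List<SegmentLike> segments = new ArrayList<>(segmentNames.size());
    for (String name : segmentNames) {
      long id = Long.parseLong(name.replaceAll("\\D+", ""));
      segments.add(new SegmentLike(name, id));
    }
    segments.sort(Comparator.comparingLong(SegmentLike::id));  // sort by ID
    return segments;
  }

  public static void main(String[] args) {
    System.out.println(newSegmentList(Set.of("segment_20", "segment_3")));
  }
}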

View File

@ -1,237 +0,0 @@
package com.twitter.search.earlybird.segment;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;
import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.twitter.common.util.Clock;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchRequestStats;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.ReaderWithStatsFactory;
import com.twitter.search.common.util.io.TransformingRecordReader;
import com.twitter.search.common.util.io.dl.DLMultiStreamReader;
import com.twitter.search.common.util.io.dl.DLReaderWriterFactory;
import com.twitter.search.common.util.io.dl.DLTimestampedReaderFactory;
import com.twitter.search.common.util.io.dl.SegmentDLUtil;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReaderFactory;
import com.twitter.search.common.util.thrift.ThriftUtils;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.SegmentInfo;
public class DLSegmentDataReaderSet implements SegmentDataReaderSet {
private static final Logger LOG = LoggerFactory.getLogger(DLSegmentDataReaderSet.class);
public static final SearchRequestStats STATUS_DL_READ_STATS =
SearchRequestStats.export("status_dlreader", TimeUnit.MICROSECONDS, false);
private static final SearchRequestStats UPDATE_EVENT_DL_READ_STATS =
SearchRequestStats.export("update_events_dlreader", TimeUnit.MICROSECONDS, false);
// The number of tweets not indexed because they failed deserialization.
private static final SearchCounter STATUS_SKIPPED_DUE_TO_FAILED_DESERIALIZATION_COUNTER =
SearchCounter.export("statuses_skipped_due_to_failed_deserialization");
@VisibleForTesting
public static final int FRESH_READ_THRESHOLD = (int) TimeUnit.MINUTES.toMillis(1);
private final int documentReadFreshnessThreshold =
EarlybirdConfig.getInt("documents_reader_freshness_threshold_millis", 10000);
private final int updateReadFreshnessThreshold =
EarlybirdConfig.getInt("updates_freshness_threshold_millis", FRESH_READ_THRESHOLD);
private final int dlReaderVersion = EarlybirdConfig.getInt("dl_reader_version");
private final DLReaderWriterFactory dlFactory;
private final RecordReaderFactory<byte[]> dlUpdateEventsFactory;
private final EarlybirdIndexConfig indexConfig;
private final Clock clock;
private RecordReader<TweetDocument> documentReader;
// RecordReaders for update events that span all live segments.
private final RecordReader<ThriftVersionedEvents> updateEventsReader;
private final DLMultiStreamReader updateEventsMultiReader;
private final Map<Long, RecordReader<ThriftVersionedEvents>> updateEventReaders = new HashMap<>();
DLSegmentDataReaderSet(
DLReaderWriterFactory dlFactory,
final EarlybirdIndexConfig indexConfig,
Clock clock) throws IOException {
this.dlFactory = dlFactory;
this.indexConfig = indexConfig;
this.clock = clock;
this.dlUpdateEventsFactory = new ReaderWithStatsFactory(
new DLTimestampedReaderFactory(dlFactory, clock, updateReadFreshnessThreshold),
UPDATE_EVENT_DL_READ_STATS);
this.updateEventsMultiReader =
new DLMultiStreamReader("update_events", dlUpdateEventsFactory, true, clock);
this.updateEventsReader =
new TransformingRecordReader<>(updateEventsMultiReader, record ->
(record != null) ? deserializeTVE(record.getBytes()) : null);
SearchCustomGauge.export("open_dl_update_events_streams", updateEventReaders::size);
}
private ThriftVersionedEvents deserializeTVE(byte[] bytes) {
ThriftVersionedEvents event = new ThriftVersionedEvents();
try {
ThriftUtils.fromCompactBinaryFormat(bytes, event);
return event;
} catch (TException e) {
LOG.error("error deserializing TVE", e);
return null;
}
}
@Override
public void attachDocumentReaders(SegmentInfo segmentInfo) throws IOException {
// Close any document reader that was left open earlier.
if (documentReader != null) {
LOG.warn("Previous documentReader not closed: {}", documentReader);
completeSegmentDocs(segmentInfo);
}
documentReader = newDocumentReader(segmentInfo);
}
@Override
public void attachUpdateReaders(SegmentInfo segmentInfo) throws IOException {
if (updateEventsMultiReader == null) {
return;
}
String segmentName = segmentInfo.getSegmentName();
if (getUpdateEventsReaderForSegment(segmentInfo) != null) {
LOG.info("Update events reader for segment {} is already attached.", segmentName);
return;
}
long updateEventStreamOffsetTimestamp = segmentInfo.getUpdatesStreamOffsetTimestamp();
LOG.info("Attaching update events reader for segment {} with timestamp: {}.",
segmentName, updateEventStreamOffsetTimestamp);
String topic = SegmentDLUtil.getDLTopicForUpdateEvents(segmentName, dlReaderVersion);
RecordReader<byte[]> recordReader =
dlUpdateEventsFactory.newRecordReaderForTimestamp(topic, updateEventStreamOffsetTimestamp);
updateEventsMultiReader.addRecordReader(recordReader, topic);
updateEventReaders.put(segmentInfo.getTimeSliceID(),
new TransformingRecordReader<>(recordReader, this::deserializeTVE));
}
@Override
public void stopAll() {
if (documentReader != null) {
documentReader.close();
}
if (updateEventsReader != null) {
updateEventsReader.close();
}
try {
dlFactory.close();
} catch (IOException e) {
LOG.error("Exception while closing DL factory", e);
}
}
@Override
public void completeSegmentDocs(SegmentInfo segmentInfo) {
if (documentReader != null) {
documentReader.close();
documentReader = null;
}
}
@Override
public void stopSegmentUpdates(SegmentInfo segmentInfo) {
if (updateEventsMultiReader != null) {
updateEventsMultiReader.removeStream(
SegmentDLUtil.getDLTopicForUpdateEvents(segmentInfo.getSegmentName(), dlReaderVersion));
updateEventReaders.remove(segmentInfo.getTimeSliceID());
}
}
@Override
public RecordReader<TweetDocument> newDocumentReader(SegmentInfo segmentInfo) throws IOException {
String topic = SegmentDLUtil.getDLTopicForTweets(segmentInfo.getSegmentName(),
EarlybirdConfig.getPenguinVersion(), dlReaderVersion);
final long timeSliceId = segmentInfo.getTimeSliceID();
final DocumentFactory<ThriftIndexingEvent> docFactory = indexConfig.createDocumentFactory();
// Create the underlying DLRecordReader wrapped with the tweet reader stats.
RecordReader<byte[]> dlReader = new ReaderWithStatsFactory(
new DLTimestampedReaderFactory(
dlFactory,
clock,
documentReadFreshnessThreshold),
STATUS_DL_READ_STATS)
.newRecordReader(topic);
// Create the wrapped reader which transforms serialized byte[] to TweetDocument.
return new TransformingRecordReader<>(
dlReader,
new Function<byte[], TweetDocument>() {
@Override
public TweetDocument apply(byte[] input) {
ThriftIndexingEvent event = new ThriftIndexingEvent();
try {
ThriftUtils.fromCompactBinaryFormat(input, event);
} catch (TException e) {
LOG.error("Could not deserialize status document", e);
STATUS_SKIPPED_DUE_TO_FAILED_DESERIALIZATION_COUNTER.increment();
return null;
}
Preconditions.checkNotNull(event.getDocument());
return new TweetDocument(
docFactory.getStatusId(event),
timeSliceId,
EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
docFactory.newDocument(event));
}
});
}
@Override
public RecordReader<TweetDocument> getDocumentReader() {
return documentReader;
}
@Override
public RecordReader<ThriftVersionedEvents> getUpdateEventsReader() {
return updateEventsReader;
}
@Override
public RecordReader<ThriftVersionedEvents> getUpdateEventsReaderForSegment(
SegmentInfo segmentInfo) {
return updateEventReaders.get(segmentInfo.getTimeSliceID());
}
@Override
public Optional<Long> getUpdateEventsStreamOffsetForSegment(SegmentInfo segmentInfo) {
String topic =
SegmentDLUtil.getDLTopicForUpdateEvents(segmentInfo.getSegmentName(), dlReaderVersion);
return updateEventsMultiReader.getUnderlyingOffsetForSegmentWithTopic(topic);
}
@Override
public boolean allCaughtUp() {
return ((getDocumentReader() == null) || getDocumentReader().isCaughtUp())
&& ((getUpdateEventsReader() == null) || getUpdateEventsReader().isCaughtUp());
}
}
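The updateEventsReader wiring above uses one adapter pattern throughout: wrap a raw byte[] reader and apply a per-record transform, with null signaling "skip" (as deserializeTVE() does when Thrift deserialization fails). A hedged generic sketch of that shape, using Iterator in place of the real RecordReader interface, which this diff does not show:

import java.util.Iterator;
import java.util.function.Function;

// Adapter sketch of the TransformingRecordReader idea: transform each record
// from a source reader, letting the transform return null to mean "skip".
// Iterator is a stand-in for the real RecordReader interface.
public final class TransformingReaderSketch<S, T> implements Iterator<T> {
  private final Iterator<S> source;
  private final Function<S, T> transform;

  public TransformingReaderSketch(Iterator<S> source, Function<S, T> transform) {
    this.source = source;
    this.transform = transform;
  }

  @Override
  public boolean hasNext() {
    return source.hasNext();
  }

  @Override
  public T next() {
    S raw = source.next();
    return raw != null ? transform.apply(raw) : null;
  }
}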

View File

@ -1,72 +0,0 @@
package com.twitter.search.earlybird.segment;
import java.util.Optional;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.util.io.EmptyRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.SegmentInfo;
/**
* A SegmentDataReaderSet that returns no data. Uses a DocumentReader that is
* always caught up, but never gets exhausted.
* It can be used to bring up an earlybird against a static set of segments;
* it will not incorporate any new updates.
*/
public class EmptySegmentDataReaderSet implements SegmentDataReaderSet {
public static final EmptySegmentDataReaderSet INSTANCE = new EmptySegmentDataReaderSet();
@Override
public void attachDocumentReaders(SegmentInfo segmentInfo) {
}
@Override
public void attachUpdateReaders(SegmentInfo segmentInfo) {
}
@Override
public void completeSegmentDocs(SegmentInfo segmentInfo) {
}
@Override
public void stopSegmentUpdates(SegmentInfo segmentInfo) {
}
@Override
public void stopAll() {
}
@Override
public boolean allCaughtUp() {
// ALWAYS CAUGHT UP
return true;
}
@Override
public RecordReader<TweetDocument> newDocumentReader(SegmentInfo segmentInfo)
throws Exception {
return null;
}
@Override
public RecordReader<TweetDocument> getDocumentReader() {
return new EmptyRecordReader<>();
}
@Override
public RecordReader<ThriftVersionedEvents> getUpdateEventsReader() {
return null;
}
@Override
public RecordReader<ThriftVersionedEvents> getUpdateEventsReaderForSegment(
SegmentInfo segmentInfo) {
return null;
}
@Override
public Optional<Long> getUpdateEventsStreamOffsetForSegment(SegmentInfo segmentInfo) {
return Optional.of(0L);
}
}
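EmptySegmentDataReaderSet is a null-object implementation, and the class comment's "always caught up, but never gets exhausted" reader is its key piece. A hedged sketch of that contract; the method names are assumptions, since the real RecordReader interface is not part of this diff:

// Null-object reader: never yields data, never finishes, always caught up.
// read()/isCaughtUp()/isExhausted() are assumed names, not the real API.
public final class AlwaysCaughtUpReader<T> {
  public T read() {
    return null;  // no data, ever
  }

  public boolean isCaughtUp() {
    return true;  // always caught up, so indexing can proceed
  }

  public boolean isExhausted() {
    return false;  // never exhausted, so callers keep polling
  }
}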

View File

@ -1,14 +0,0 @@
package com.twitter.search.earlybird.segment;
/**
* SegmentDataProvider provides information about available segments for indexing. This interface
* abstracts away the actual source of the segment data. It might be a MySQL database, a mock
* object, or a directory of flat files. It also provides access to the segmentInfoMap itself, which
* contains information about the indexing state of Segments.
*/
public interface SegmentDataProvider extends SegmentProvider {
/**
* Returns the set of segment data record readers.
*/
SegmentDataReaderSet getSegmentDataReaderSet();
}
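Because the interface adds a single accessor on top of SegmentProvider, test doubles stay small. A hedged in-memory stub, assuming SegmentProvider contributes newSegmentList() (which is what DLSegmentDataProvider overrides above); the stub's name is invented:

import java.util.Collections;
import java.util.List;

import com.twitter.search.common.partitioning.base.Segment;

// Hypothetical in-memory SegmentDataProvider for tests. Assumes the
// SegmentProvider super-interface declares newSegmentList().
public class StaticSegmentDataProvider implements SegmentDataProvider {
  private final List<Segment> segments;
  private final SegmentDataReaderSet readerSet;

  public StaticSegmentDataProvider(List<Segment> segments, SegmentDataReaderSet readerSet) {
    this.segments = segments;
    this.readerSet = readerSet;
  }

  @Override
  public List<Segment> newSegmentList() {
    return Collections.unmodifiableList(segments);
  }

  @Override
  public SegmentDataReaderSet getSegmentDataReaderSet() {
    return readerSet;
  }
}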

Some files were not shown because too many files have changed in this diff.