mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 00:25:11 +01:00)

commit f37e76300b (parent 8948d714f6)
[docx] split commit for file 4400
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Binary file not shown.
@@ -1,32 +0,0 @@
package com.twitter.search.earlybird.search.facets;

import java.util.Iterator;

import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.core.earlybird.facets.FacetCountState;
import com.twitter.search.core.earlybird.facets.FacetCountState.FacetFieldResults;
import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults;

public class SimpleCountRankingModule extends FacetRankingModule {

  @Override
  public void prepareResults(
      EarlybirdLuceneSearcher.FacetSearchResults hits,
      FacetCountState<ThriftFacetFieldResults> facetCountState) {
    Iterator<FacetFieldResults<ThriftFacetFieldResults>> fieldResultsIterator =
        facetCountState.getFacetFieldResultsIterator();
    while (fieldResultsIterator.hasNext()) {
      FacetFieldResults<ThriftFacetFieldResults> state = fieldResultsIterator.next();
      if (!state.isFinished()) {
        Schema.FieldInfo facetField =
            facetCountState.getSchema().getFacetFieldByFacetName(state.facetName);
        state.results = hits.getFacetResults(
            facetField.getFieldType().getFacetName(), state.numResultsRequested);
        if (state.results != null) {
          state.numResultsFound = state.results.getTopFacetsSize();
        }
      }
    }
  }
}
Binary file not shown.
@@ -1,47 +0,0 @@
package com.twitter.search.earlybird.search.facets;

import java.util.ArrayList;
import java.util.List;

import com.google.common.collect.ImmutableList;
import com.google.common.collect.Sets;

import org.apache.commons.lang.StringUtils;

import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.partition.AudioSpaceTable;
import com.twitter.search.earlybird.thrift.AudioSpaceState;
import com.twitter.search.earlybird.thrift.ThriftSearchResult;
import com.twitter.search.earlybird.thrift.ThriftSearchResultAudioSpace;

public class SpaceFacetCollector extends AbstractFacetTermCollector {
  private final List<ThriftSearchResultAudioSpace> spaces = new ArrayList<>();

  private final AudioSpaceTable audioSpaceTable;

  public SpaceFacetCollector(AudioSpaceTable audioSpaceTable) {
    this.audioSpaceTable = audioSpaceTable;
  }

  @Override
  public boolean collect(int docID, long termID, int fieldID) {

    String spaceId = getTermFromFacet(termID, fieldID,
        Sets.newHashSet(EarlybirdFieldConstant.SPACES_FACET));
    if (StringUtils.isEmpty(spaceId)) {
      return false;
    }

    spaces.add(new ThriftSearchResultAudioSpace(spaceId,
        audioSpaceTable.isRunning(spaceId) ? AudioSpaceState.RUNNING
            : AudioSpaceState.ENDED));

    return true;
  }

  @Override
  public void fillResultAndClear(ThriftSearchResult result) {
    getExtraMetadata(result).setSpaces(ImmutableList.copyOf(spaces));
    spaces.clear();
  }
}
Binary file not shown.
@@ -1,487 +0,0 @@
package com.twitter.search.earlybird.search.facets;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.commons.lang.StringUtils;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchResultsStats;
import com.twitter.search.common.schema.SchemaUtil;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher;
import com.twitter.search.earlybird.search.AbstractResultsCollector;
import com.twitter.search.earlybird.search.SearchResultsInfo;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftTermRequest;
import com.twitter.search.earlybird.thrift.ThriftTermResults;

public class TermStatisticsCollector extends AbstractResultsCollector
    <TermStatisticsRequestInfo, TermStatisticsCollector.TermStatisticsSearchResults> {
  private static final EarlyTerminationState TERMINATED_TERM_STATS_COUNTING_DONE =
      new EarlyTerminationState("terminated_term_stats_counting_done", true);

  // Stats for tracking histogram results.
  private static final SearchResultsStats TERM_STATS_HISTOGRAM_REQUESTS_WITH_MOVED_BACK_BINS =
      SearchResultsStats.export("term_statistics_collector_queries_with_moved_back_bins");
  private static final SearchCounter TERM_STATS_SKIPPED_LARGER_OUT_OF_BOUNDS_HITS =
      SearchCounter.export("term_statistics_collector_skipped_larger_out_of_bounds_hits");

  @VisibleForTesting
  static final class TermStatistics {
    private final ThriftTermRequest termRequest;
    private final Term term; // could be null, for count across all fields
    private int termDF = 0;
    private int termCount = 0;
    private final int[] histogramBins;

    // Per-segment information.
    private PostingsEnum segmentDocsEnum; // could be null, for count across all fields
    private boolean segmentDone;

    @VisibleForTesting
    TermStatistics(ThriftTermRequest termRequest, Term term, int numBins) {
      this.termRequest = termRequest;
      this.term = term;
      this.histogramBins = new int[numBins];
    }

    /**
     * Take the currently accumulated counts and "move them back" to make room for counts from more
     * recent binIds.
     *
     * For example, if the oldFirstBinID was set to 10, and the histogramBins were {3, 4, 5, 6, 7},
     * after this call with newFirstBinID set to 12, the histogramBins will be set
     * to {5, 6, 7, 0, 0}.
     *
     * @param oldFirstBinID the binId of the firstBin that's been used up to now.
     * @param newFirstBinID the new binId of the firstBin that will be used from now on.
     * The newFirstBinID is presumed to be larger than the oldFirstBinID, and is asserted.
     */
    @VisibleForTesting
    void moveBackTermCounts(int oldFirstBinID, int newFirstBinID) {
      Preconditions.checkState(oldFirstBinID < newFirstBinID);
      // move counts back by this many bins
      final int moveBackBy = newFirstBinID - oldFirstBinID;

      this.termCount = 0;
      for (int i = 0; i < histogramBins.length; i++) {
        int oldCount = histogramBins[i];
        histogramBins[i] = 0;
        int newIndex = i - moveBackBy;
        if (newIndex >= 0) {
          histogramBins[newIndex] = oldCount;
          this.termCount += oldCount;
        }
      }
    }

    @VisibleForTesting void countHit(int bin) {
      termCount++;
      histogramBins[bin]++;
    }

    @VisibleForTesting int getTermCount() {
      return termCount;
    }

    @VisibleForTesting int[] getHistogramBins() {
      return histogramBins;
    }
  }

  private TermStatistics[] termStatistics;

  // Histogram fields.
  private int numBins;
  private int binSize;

  private int numTimesBinsWereMovedBack = 0;
  private int numLargerOutOfBoundsBinsSkipped = 0;

  private static final int SEEN_OUT_OF_RANGE_THRESHOLD = 10;

  private int seenOutOfRange = 0;

  // ID of the first bin - effectively time / binSize. This is calculated
  // relative to the first collected in-order hit.
  private int firstBinID = -1;
  // List of per-segment debug information specifically useful for termstat request debugging.
  private List<String> termStatisticsDebugInfo = new ArrayList<>();

  /**
   * Creates a new term stats collector.
   */
  public TermStatisticsCollector(
      ImmutableSchemaInterface schema,
      TermStatisticsRequestInfo searchRequestInfo,
      EarlybirdSearcherStats searcherStats,
      Clock clock,
      int requestDebugMode) {
    super(schema, searchRequestInfo, clock, searcherStats, requestDebugMode);

    // Set up the histogram bins.
    if (searchRequestInfo.isReturnHistogram()) {
      ThriftHistogramSettings histogramSettings = searchRequestInfo.getHistogramSettings();
      this.numBins = histogramSettings.getNumBins();
      binSize = TermStatisticsUtil.determineBinSize(histogramSettings);
    } else {
      this.numBins = 0;
      this.binSize = 0;
    }

    // Set up the term statistics array.
    List<ThriftTermRequest> termRequests = searchRequestInfo.getTermRequests();
    if (termRequests == null) {
      this.termStatistics = new TermStatistics[0];
      return;
    }

    this.termStatistics = new TermStatistics[searchRequestInfo.getTermRequests().size()];
    for (int i = 0; i < searchRequestInfo.getTermRequests().size(); i++) {
      final ThriftTermRequest termRequest = searchRequestInfo.getTermRequests().get(i);

      Term term = null;
      String fieldName = termRequest.getFieldName();
      if (!StringUtils.isBlank(fieldName)) {
        // First check if it's a facet field.
        Schema.FieldInfo facetField = schema.getFacetFieldByFacetName(termRequest.getFieldName());
        if (facetField != null) {
          term = new Term(facetField.getName(), termRequest.getTerm());
        } else {
          // EarlybirdSearcher.validateRequest() should've already checked that the field exists in
          // the schema, and that the term can be converted to the type of this field. However, if
          // that did not happen for some reason, an exception will be thrown here, which will be
          // converted to a TRANSIENT_ERROR response code.
          Schema.FieldInfo fieldInfo = schema.getFieldInfo(fieldName);
          Preconditions.checkNotNull(
              fieldInfo,
              "Found a ThriftTermRequest for a field that's not in the schema: " + fieldName
                  + ". This should've been caught by EarlybirdSearcher.validateRequest()!");
          term = new Term(fieldName, SchemaUtil.toBytesRef(fieldInfo, termRequest.getTerm()));
        }
      } else {
        // NOTE: if the fieldName is empty, this is a catch-all term request for the count across
        // all fields. We'll just use a null term in the TermStatistics object.
      }

      termStatistics[i] = new TermStatistics(termRequest, term, numBins);
    }
  }

  @Override
  public void startSegment() throws IOException {
    termStatisticsDebugInfo.add(
        "Starting segment in timestamp range: [" + timeMapper.getFirstTime()
            + ", " + timeMapper.getLastTime() + "]");
    for (TermStatistics termStats : termStatistics) {
      termStats.segmentDone = true; // until we know it's false later.
      TermsEnum termsEnum = null;
      if (termStats.term != null) {
        Terms terms = currTwitterReader.terms(termStats.term.field());
        if (terms != null) {
          termsEnum = terms.iterator();
          if (termsEnum != null && termsEnum.seekExact(termStats.term.bytes())) {
            termStats.termDF += termsEnum.docFreq(); // Only meaningful for matchAll queries.
            termStats.segmentDocsEnum =
                termsEnum.postings(termStats.segmentDocsEnum, PostingsEnum.FREQS);
            termStats.segmentDone = termStats.segmentDocsEnum == null
                || termStats.segmentDocsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
          } else {
            // this term doesn't exist in this segment.
          }
        }
      } else {
        // Catch-all case
        termStats.termDF += currTwitterReader.numDocs(); // Only meaningful for matchAll queries.
        termStats.segmentDocsEnum = null;
        termStats.segmentDone = false;
      }
    }
  }

  private int calculateBin(final int tweetTime) {
    if (tweetTime == TimeMapper.ILLEGAL_TIME) {
      return -1;
    }

    final int binID = Math.abs(tweetTime) / binSize;
    final int expectedFirstBinId = binID - numBins + 1;

    if (firstBinID == -1) {
      firstBinID = expectedFirstBinId;
    } else if (expectedFirstBinId > firstBinID) {
      numTimesBinsWereMovedBack++;
      final int oldOutOfOrderFirstBinID = firstBinID;
      firstBinID = expectedFirstBinId;
      // We got a more recent out of order bin, move previous counts back.
      for (TermStatistics ts : termStatistics) {
        ts.moveBackTermCounts(oldOutOfOrderFirstBinID, firstBinID);
      }
    }

    final int binIndex = binID - firstBinID;
    if (binIndex >= numBins) {
      // In-order times should be decreasing,
      // and out of order times seen after an in-order tweet should also be smaller than the
      // first in-order tweet's time. Will track these and export as a stat.
      numLargerOutOfBoundsBinsSkipped++;
      return -1;
    } else if (binIndex < 0) {
      // Early termination criteria.
      seenOutOfRange++;
    } else {
      // Reset the counter, since we want to see consecutive tweets that are out of our bin range
      // not single anomalies.
      seenOutOfRange = 0;
    }

    return binIndex;
  }

  @Override
  public void doCollect(long tweetID) throws IOException {
    if (searchRequestInfo.isReturnHistogram()) {
      final int tweetTime = timeMapper.getTime(curDocId);
      final int binIndex = calculateBin(tweetTime);
      if (binIndex >= 0) {
        for (TermStatistics ts : termStatistics) {
          if (!ts.segmentDone) {
            countHist(ts, binIndex);
          }
        }
      }
    } else {
      for (TermStatistics ts : termStatistics) {
        if (!ts.segmentDone) {
          countNoHist(ts);
        }
      }
    }
  }

  @Override
  public void skipSegment(EarlybirdSingleSegmentSearcher searcher) {
    // Do nothing here.
    // We don't do accounting that's done in AbstractResultsCollector for Term Stats
    // requests because otherwise the bin ID calculation will be confused.
  }

  private boolean advance(TermStatistics ts) throws IOException {
    PostingsEnum docsEnum = ts.segmentDocsEnum;
    if (docsEnum.docID() < curDocId) {
      if (docsEnum.advance(curDocId) == DocIdSetIterator.NO_MORE_DOCS) {
        ts.segmentDone = true;
        return false;
      }
    }
    return docsEnum.docID() == curDocId;
  }

  private boolean countHist(TermStatistics ts, int bin) throws IOException {
    if (ts.term != null && !advance(ts)) {
      return false;
    }
    ts.countHit(bin);
    return true;
  }

  private boolean countNoHist(TermStatistics ts) throws IOException {
    if (ts.term != null && !advance(ts)) {
      return false;
    }
    ts.termCount++;
    return true;
  }

  @Override
  public EarlyTerminationState innerShouldCollectMore() {
    if (readyToTerminate()) {
      return setEarlyTerminationState(TERMINATED_TERM_STATS_COUNTING_DONE);
    }
    return EarlyTerminationState.COLLECTING;
  }

  /**
   * The termination logic is simple - we know what our earliest bin is and once we see a result
   * that's before our earliest bin, we terminate.
   *
   * Our results come with increasing internal doc ids, which should correspond to decreasing
   * timestamps. See SEARCH-27729, TWEETYPIE-7031.
   *
   * We early terminate after we have seen enough tweets that are outside of the bin
   * range that we want to return. This way we're not terminating too early because of single tweets
   * with wrong timestamps.
   */
  @VisibleForTesting
  boolean readyToTerminate() {
    return this.seenOutOfRange >= SEEN_OUT_OF_RANGE_THRESHOLD;
  }

  @Override
  public TermStatisticsSearchResults doGetResults() {
    return new TermStatisticsSearchResults();
  }

  public final class TermStatisticsSearchResults extends SearchResultsInfo {
    public final List<Integer> binIds;
    public final Map<ThriftTermRequest, ThriftTermResults> results;
    public final int lastCompleteBinId;
    public final List<String> termStatisticsDebugInfo;

    private TermStatisticsSearchResults() {
      // Initialize term stat debug info
      termStatisticsDebugInfo = TermStatisticsCollector.this.termStatisticsDebugInfo;

      if (termStatistics.length > 0) {
        results = new HashMap<>();

        if (searchRequestInfo.isReturnHistogram()) {
          binIds = new ArrayList<>(numBins);
          int minSearchedTime = TermStatisticsCollector.this.getMinSearchedTime();

          if (shouldCollectDetailedDebugInfo()) {
            termStatisticsDebugInfo.add("minSearchedTime: " + minSearchedTime);
            int maxSearchedTime = TermStatisticsCollector.this.getMaxSearchedTime();
            termStatisticsDebugInfo.add("maxSearchedTime: " + maxSearchedTime);
          }

          int lastCompleteBin = -1;

          computeFirstBinId(TermStatisticsCollector.this.isSetMinSearchedTime(), minSearchedTime);
          trackHistogramResultStats();

          // Example:
          // minSearchTime = 53s
          // binSize = 10
          // firstBinId = 5
          // numBins = 4
          // binId = 5, 6, 7, 8
          // binTimeStamp = 50s, 60s, 70s, 80s
          for (int i = 0; i < numBins; i++) {
            int binId = firstBinID + i;
            int binTimeStamp = binId * binSize;
            binIds.add(binId);
            if (lastCompleteBin == -1 && binTimeStamp > minSearchedTime) {
              lastCompleteBin = binId;
            }
          }

          if (!getEarlyTerminationState().isTerminated()) {
            // only if we didn't early terminate we can be sure to use the firstBinID as
            // lastCompleteBinId
            lastCompleteBinId = firstBinID;
            if (shouldCollectDetailedDebugInfo()) {
              termStatisticsDebugInfo.add("no early termination");
            }
          } else {
            lastCompleteBinId = lastCompleteBin;
            if (shouldCollectDetailedDebugInfo()) {
              termStatisticsDebugInfo.add(
                  "early terminated for reason: " + getEarlyTerminationReason());
            }
          }
          if (shouldCollectDetailedDebugInfo()) {
            termStatisticsDebugInfo.add("lastCompleteBinId: " + lastCompleteBinId);
          }
        } else {
          binIds = null;
          lastCompleteBinId = -1;
        }

        for (TermStatistics ts : termStatistics) {
          ThriftTermResults termResults = new ThriftTermResults().setTotalCount(ts.termCount);

          if (searchRequestInfo.isReturnHistogram()) {
            List<Integer> list = new ArrayList<>();
            for (int count : ts.histogramBins) {
              list.add(count);
            }
            termResults.setHistogramBins(list);
          }

          results.put(ts.termRequest, termResults);
        }
      } else {
        binIds = null;
        results = null;
        lastCompleteBinId = -1;
      }
    }

    @Override
    public String toString() {
      StringBuilder res = new StringBuilder();
      res.append("TermStatisticsSearchResults(\n");
      if (binIds != null) {
        res.append(" binIds=").append(binIds).append("\n");
      }
      res.append(" lastCompleteBinId=").append(lastCompleteBinId).append("\n");
      if (results != null) {
        res.append(" results=").append(results).append("\n");
      }
      res.append(")");
      return res.toString();
    }

    public List<String> getTermStatisticsDebugInfo() {
      return termStatisticsDebugInfo;
    }
  }

  /**
   * Figure out what the actual firstBinId is for this query.
   */
  private void computeFirstBinId(boolean isSetMinSearchedTime, int minSearchedTime) {
    if (firstBinID == -1) {
      if (!isSetMinSearchedTime) {
        // This would only happen if we don't search any segments, which for now we have
        // only seen happening if since_time or until_time don't intersect at all with
        // the range of the served segments.
        firstBinID = 0;
      } else {
        // Example:
        // minSearchedTime = 54
        // binSize = 10
        // firstBinId = 5
        firstBinID = minSearchedTime / binSize;
      }

      if (shouldCollectDetailedDebugInfo()) {
        termStatisticsDebugInfo.add("firstBinId: " + firstBinID);
      }
    }
  }

  @VisibleForTesting
  int getSeenOutOfRange() {
    return seenOutOfRange;
  }

  private void trackHistogramResultStats() {
    if (numLargerOutOfBoundsBinsSkipped > 0) {
      TERM_STATS_SKIPPED_LARGER_OUT_OF_BOUNDS_HITS.increment();
    }

    if (numTimesBinsWereMovedBack > 0) {
      TERM_STATS_HISTOGRAM_REQUESTS_WITH_MOVED_BACK_BINS.recordResults(numTimesBinsWereMovedBack);
    }
  }
}
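The bin-shifting behaviour documented on moveBackTermCounts() above can be exercised in isolation. The following is a hypothetical, test-style sketch (not part of the removed file); it only assumes same-package access to the package-private TermStatistics helper and a no-arg ThriftTermRequest constructor.

package com.twitter.search.earlybird.search.facets;

import java.util.Arrays;

import com.twitter.search.earlybird.thrift.ThriftTermRequest;

// Hypothetical sketch, not part of this commit.
final class MoveBackTermCountsSketch {
  public static void main(String[] args) {
    // Five bins whose first binId is 10, filled to {3, 4, 5, 6, 7}.
    TermStatisticsCollector.TermStatistics stats =
        new TermStatisticsCollector.TermStatistics(new ThriftTermRequest(), null, 5);
    int[] initial = {3, 4, 5, 6, 7};
    for (int bin = 0; bin < initial.length; bin++) {
      for (int c = 0; c < initial[bin]; c++) {
        stats.countHit(bin);
      }
    }

    // A more recent hit implies the first binId is now 12, so every count shifts back by 2.
    stats.moveBackTermCounts(10, 12);

    // Prints [5, 6, 7, 0, 0] and 18 (= 5 + 6 + 7): the two oldest bins fell off the front.
    System.out.println(Arrays.toString(stats.getHistogramBins()));
    System.out.println(stats.getTermCount());
  }
}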
Binary file not shown.
@@ -1,94 +0,0 @@
package com.twitter.search.earlybird.search.facets;

import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableSet;

import org.apache.lucene.search.Query;

import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.common.util.text.NormalizerHelper;
import com.twitter.search.common.util.url.URLUtils;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.search.SearchRequestInfo;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftTermRequest;
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;

public class TermStatisticsRequestInfo extends SearchRequestInfo {
  private static final Set<String> FACET_URL_FIELDS_TO_NORMALIZE = new ImmutableSet.Builder()
      .add(EarlybirdFieldConstant.IMAGES_FACET)
      .add(EarlybirdFieldConstant.VIDEOS_FACET)
      .add(EarlybirdFieldConstant.NEWS_FACET)
      .build();

  protected final List<ThriftTermRequest> termRequests;
  protected final ThriftHistogramSettings histogramSettings;

  /**
   * Creates a new TermStatisticsRequestInfo instance using the provided query.
   */
  public TermStatisticsRequestInfo(ThriftSearchQuery searchQuery,
                                   Query luceneQuery,
                                   ThriftTermStatisticsRequest termStatsRequest,
                                   TerminationTracker terminationTracker) {
    super(searchQuery, luceneQuery, terminationTracker);
    this.termRequests = termStatsRequest.isSetTermRequests()
        ? termStatsRequest.getTermRequests() : new LinkedList<>();
    this.histogramSettings = termStatsRequest.getHistogramSettings();
    if (termStatsRequest.isIncludeGlobalCounts()) {
      // Add an empty request to indicate we need a global count across all fields.
      termRequests.add(new ThriftTermRequest().setFieldName("").setTerm(""));
    }

    // We only normalize TEXT terms and urls. All other terms, e.g. topics (named entities) are
    // not normalized. Here the assumption is that the caller passes the exact terms back that
    // the facet API returned
    for (ThriftTermRequest termReq : termRequests) {
      if (termReq.getTerm().isEmpty()) {
        continue; // the special catch-all term.
      }

      if (!termReq.isSetFieldName()
          || termReq.getFieldName().equals(EarlybirdFieldConstant.TEXT_FIELD.getFieldName())) {
        // normalize the TEXT term as it's normalized during ingestion
        termReq.setTerm(NormalizerHelper.normalizeWithUnknownLocale(
            termReq.getTerm(), EarlybirdConfig.getPenguinVersion()));
      } else if (FACET_URL_FIELDS_TO_NORMALIZE.contains(termReq.getFieldName())) {
        // remove the trailing slash from the URL path. This operation is idempotent,
        // so either a spiderduck URL or a facet URL can be used here. The latter would just
        // be normalized twice, which is fine.
        termReq.setTerm(URLUtils.normalizePath(termReq.getTerm()));
      }
    }
  }

  @Override
  protected int calculateMaxHitsToProcess(ThriftSearchQuery searchQuery) {
    Preconditions.checkNotNull(searchQuery.getCollectorParams());
    if (!searchQuery.getCollectorParams().isSetTerminationParams()
        || !searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
      // Override the default value to all hits.
      return Integer.MAX_VALUE;
    } else {
      return super.calculateMaxHitsToProcess(searchQuery);
    }
  }

  public final List<ThriftTermRequest> getTermRequests() {
    return this.termRequests;
  }

  public final ThriftHistogramSettings getHistogramSettings() {
    return this.histogramSettings;
  }

  public final boolean isReturnHistogram() {
    return this.histogramSettings != null;
  }
}
Binary file not shown.
@@ -1,41 +0,0 @@
package com.twitter.search.earlybird.search.facets;

import java.io.IOException;

import com.google.common.base.Preconditions;

import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.facets.CSFFacetCountIterator;
import com.twitter.search.core.earlybird.facets.FacetCountIterator;
import com.twitter.search.core.earlybird.facets.FacetCountIteratorFactory;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;

/**
 * Factory of {@link FacetCountIterator} instances for tweet search.
 * It provides a special iterator for the retweets facet.
 */
public final class TweetSearchFacetCountIteratorFactory extends FacetCountIteratorFactory {
  public static final TweetSearchFacetCountIteratorFactory FACTORY =
      new TweetSearchFacetCountIteratorFactory();

  private TweetSearchFacetCountIteratorFactory() {
  }

  @Override
  public FacetCountIterator getFacetCountIterator(
      EarlybirdIndexSegmentAtomicReader reader,
      Schema.FieldInfo fieldInfo) throws IOException {
    Preconditions.checkNotNull(reader);
    Preconditions.checkNotNull(fieldInfo);
    Preconditions.checkArgument(fieldInfo.getFieldType().isUseCSFForFacetCounting());

    String facetName = fieldInfo.getFieldType().getFacetName();

    if (EarlybirdFieldConstant.RETWEETS_FACET.equals(facetName)) {
      return new RetweetFacetCountIterator(reader, fieldInfo);
    } else {
      return new CSFFacetCountIterator(reader, fieldInfo);
    }
  }
}
Binary file not shown.
@@ -1,115 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

public final class BadUserRepFilter extends Query {
  /**
   * Creates a query that filters out results coming from users with bad reputation.
   *
   * @param minTweepCred The lowest acceptable user reputation.
   * @return A query that filters out results from bad reputation users.
   */
  public static Query getBadUserRepFilter(int minTweepCred) {
    if (minTweepCred <= 0) {
      return null;
    }

    return new BooleanQuery.Builder()
        .add(new BadUserRepFilter(minTweepCred), BooleanClause.Occur.FILTER)
        .build();
  }

  private final int minTweepCred;

  private BadUserRepFilter(int minTweepCred) {
    this.minTweepCred = minTweepCred;
  }

  @Override
  public int hashCode() {
    return minTweepCred;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof BadUserRepFilter)) {
      return false;
    }

    return minTweepCred == BadUserRepFilter.class.cast(obj).minTweepCred;
  }

  @Override
  public String toString(String field) {
    return "BadUserRepFilter:" + minTweepCred;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
          return new AllDocsIterator(reader);
        }

        return new BadUserExcludeDocIdSetIterator(
            (EarlybirdIndexSegmentAtomicReader) context.reader(), minTweepCred);
      }
    };
  }

  private static final class BadUserExcludeDocIdSetIterator extends RangeFilterDISI {
    private final NumericDocValues userReputationDocValues;
    private final int minTweepCred;

    BadUserExcludeDocIdSetIterator(EarlybirdIndexSegmentAtomicReader indexReader,
                                   int minTweepCred) throws IOException {
      super(indexReader);
      this.userReputationDocValues =
          indexReader.getNumericDocValues(EarlybirdFieldConstant.USER_REPUTATION.getFieldName());
      this.minTweepCred = minTweepCred;
    }

    @Override
    public boolean shouldReturnDoc() throws IOException {
      // We need this explicit casting to byte, because of how we encode and decode features in our
      // encoded_tweet_features field. If a feature is an int (uses all 32 bits of the int), then
      // encoding the feature and then decoding it preserves its original value. However, if the
      // feature does not use the entire int (and especially if it uses bits somewhere in the middle
      // of the int), then the feature value is assumed to be unsigned when it goes through this
      // process of encoding and decoding. So a user rep of
      // RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL (-128) will be correctly encoded as the
      // binary value 10000000, but will be treated as an unsigned value when decoded, and therefore
      // the decoded value will be 128.
      //
      // In retrospect, this seems like a really poor design decision. It seems like it would be
      // better if all feature values were considered to be signed, even if most features can never
      // have negative values. Unfortunately, making this change is not easy, because some features
      // store normalized values, so we would also need to change the range of allowed values
      // produced by those normalizers, as well as all code that depends on those values.
      //
      // So for now, just cast this value to a byte, to get the proper negative value.
      return userReputationDocValues.advanceExact(docID())
          && ((byte) userReputationDocValues.longValue() >= minTweepCred);
    }
  }
}
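The signed/unsigned decoding pitfall described in shouldReturnDoc() above can be reproduced on its own. The following is a hypothetical, illustrative-only sketch (not part of the removed file); the threshold 20 is an arbitrary example value.

// Hypothetical sketch, not part of this commit.
public final class ByteCastSketch {
  public static void main(String[] args) {
    // A stored reputation of -128 (the unset sentinel) comes back from the doc values
    // as the unsigned value 128, because the feature occupies only 8 bits of the int.
    long decoded = 128L;

    // Comparing the raw decoded value lets the "unset" doc slip past the filter...
    System.out.println(decoded >= 20);          // true (wrong)

    // ...while casting to byte first restores the intended negative value.
    System.out.println((byte) decoded);         // -128
    System.out.println((byte) decoded >= 20);   // false (correct)
  }
}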
Binary file not shown.
@@ -1,87 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Objects;
import java.util.Set;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

/**
 * CSFDisjunctionFilter provides an efficient mechanism to query for documents that have a
 * long CSF equal to one of the provided values.
 */
public final class CSFDisjunctionFilter extends Query {
  private final String csfField;
  private final Set<Long> values;

  public static Query getCSFDisjunctionFilter(String csfField, Set<Long> values) {
    return new BooleanQuery.Builder()
        .add(new CSFDisjunctionFilter(csfField, values), BooleanClause.Occur.FILTER)
        .build();
  }

  private CSFDisjunctionFilter(String csfField, Set<Long> values) {
    this.csfField = csfField;
    this.values = values;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        return new CSFDisjunctionFilterDISI(context.reader(), csfField, values);
      }
    };
  }

  @Override
  public int hashCode() {
    return (csfField == null ? 0 : csfField.hashCode()) * 17
        + (values == null ? 0 : values.hashCode());
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof CSFDisjunctionFilter)) {
      return false;
    }

    CSFDisjunctionFilter filter = CSFDisjunctionFilter.class.cast(obj);
    return Objects.equals(csfField, filter.csfField) && Objects.equals(values, filter.values);
  }

  @Override
  public String toString(String field) {
    return "CSFDisjunctionFilter:" + csfField + ",count:" + values.size();
  }

  private static final class CSFDisjunctionFilterDISI extends RangeFilterDISI {
    private final NumericDocValues docValues;
    private final Set<Long> values;

    private CSFDisjunctionFilterDISI(LeafReader reader, String csfField, Set<Long> values)
        throws IOException {
      super(reader);
      this.values = values;
      this.docValues = reader.getNumericDocValues(csfField);
    }

    @Override
    protected boolean shouldReturnDoc() throws IOException {
      return docValues.advanceExact(docID()) && values.contains(docValues.longValue());
    }
  }
}
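A hypothetical usage sketch for the factory method above (not part of the removed file; the "from_user_id_csf" field name and the ids are assumptions chosen for illustration):

package com.twitter.search.earlybird.search.queries;

import java.util.Set;

import com.google.common.collect.ImmutableSet;

import org.apache.lucene.search.Query;

// Hypothetical sketch, not part of this commit.
final class CSFDisjunctionFilterSketch {
  public static void main(String[] args) {
    // Keep only documents whose long CSF matches one of a handful of author ids.
    Set<Long> authorIds = ImmutableSet.of(12L, 34L, 56L);
    Query authorFilter =
        CSFDisjunctionFilter.getCSFDisjunctionFilter("from_user_id_csf", authorIds);

    // The factory wraps the filter in a single BooleanQuery FILTER clause, so it
    // constrains matches without contributing to the score.
    System.out.println(authorFilter);
  }
}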
Binary file not shown.
@@ -1,195 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Objects;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.thriftjava.ThriftCSFType;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

/**
 * Filters tweets according to the specified CSF field value.
 * Note that min value is inclusive, and max value is exclusive.
 */
public final class DocValRangeFilter extends Query {
  private final String csfField;
  private final ThriftCSFType csfFieldType;
  private final Number minValInclusive;
  private final Number maxValExclusive;

  /**
   * Returns a query that filters hits based on the value of a CSF.
   *
   * @param csfField The CSF name.
   * @param csfFieldType The CSF type.
   * @param minVal The minimum acceptable value (inclusive).
   * @param maxVal The maximum acceptable value (exclusive).
   * @return A query that filters hits based on the value of a CSF.
   */
  public static Query getDocValRangeQuery(String csfField, ThriftCSFType csfFieldType,
                                          double minVal, double maxVal) {
    return new BooleanQuery.Builder()
        .add(new DocValRangeFilter(csfField, csfFieldType, minVal, maxVal),
            BooleanClause.Occur.FILTER)
        .build();
  }

  /**
   * Returns a query that filters hits based on the value of a CSF.
   *
   * @param csfField The CSF name.
   * @param csfFieldType The CSF type.
   * @param minVal The minimum acceptable value (inclusive).
   * @param maxVal The maximum acceptable value (exclusive).
   * @return A query that filters hits based on the value of a CSF.
   */
  public static Query getDocValRangeQuery(String csfField, ThriftCSFType csfFieldType,
                                          long minVal, long maxVal) {
    return new BooleanQuery.Builder()
        .add(new DocValRangeFilter(csfField, csfFieldType, minVal, maxVal),
            BooleanClause.Occur.FILTER)
        .build();
  }

  private DocValRangeFilter(String csfField, ThriftCSFType csfFieldType,
                            double minVal, double maxVal) {
    this.csfField = csfField;
    this.csfFieldType = csfFieldType;
    this.minValInclusive = new Float(minVal);
    this.maxValExclusive = new Float(maxVal);
  }

  private DocValRangeFilter(String csfField, ThriftCSFType csfFieldType,
                            long minVal, long maxVal) {
    this.csfField = csfField;
    this.csfFieldType = csfFieldType;
    this.minValInclusive = new Long(minVal);
    this.maxValExclusive = new Long(maxVal);
  }

  @Override
  public int hashCode() {
    return (csfField == null ? 0 : csfField.hashCode()) * 29
        + (csfFieldType == null ? 0 : csfFieldType.hashCode()) * 17
        + minValInclusive.hashCode() * 7
        + maxValExclusive.hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof DocValRangeFilter)) {
      return false;
    }

    DocValRangeFilter filter = DocValRangeFilter.class.cast(obj);
    return Objects.equals(csfField, filter.csfField)
        && (csfFieldType == filter.csfFieldType)
        && minValInclusive.equals(filter.minValInclusive)
        && maxValExclusive.equals(filter.maxValExclusive);
  }

  @Override
  public String toString(String field) {
    return "DocValRangeFilter:" + csfField
        + ",type:" + csfFieldType.toString()
        + ",min:" + this.minValInclusive.toString()
        + ",max:" + this.maxValExclusive.toString();
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        if (csfFieldType == null) {
          return new AllDocsIterator(reader);
        }

        int smallestDoc = (reader instanceof EarlybirdIndexSegmentAtomicReader)
            ? ((EarlybirdIndexSegmentAtomicReader) reader).getSmallestDocID() : 0;
        int largestDoc = reader.maxDoc() - 1;
        return new CSFRangeDocIdSetIterator(reader, csfField, csfFieldType,
            smallestDoc, largestDoc,
            minValInclusive, maxValExclusive);
      }
    };
  }

  private static final class CSFRangeDocIdSetIterator extends RangeFilterDISI {
    private final NumericDocValues numericDocValues;
    private final ThriftCSFType csfType;
    private final Number minValInclusive;
    private final Number maxValExclusive;

    public CSFRangeDocIdSetIterator(LeafReader reader,
                                    String csfField,
                                    ThriftCSFType csfType,
                                    int smallestDocID,
                                    int largestDocID,
                                    Number minValInclusive,
                                    Number maxValExclusive) throws IOException {
      super(reader, smallestDocID, largestDocID);
      this.numericDocValues = reader.getNumericDocValues(csfField);
      this.csfType = csfType;
      this.minValInclusive = minValInclusive;
      this.maxValExclusive = maxValExclusive;
    }

    @Override
    protected boolean shouldReturnDoc() throws IOException {
      if (!numericDocValues.advanceExact(docID())) {
        return false;
      }

      long val = numericDocValues.longValue();
      switch (csfType) {
        case DOUBLE:
          double doubleVal = Double.longBitsToDouble(val);
          return doubleVal >= minValInclusive.doubleValue()
              && doubleVal < maxValExclusive.doubleValue();
        case FLOAT:
          float floatVal = Float.intBitsToFloat((int) val);
          return floatVal >= minValInclusive.doubleValue()
              && floatVal < maxValExclusive.doubleValue();
        case LONG:
          return val >= minValInclusive.longValue() && val < maxValExclusive.longValue();
        case INT:
          return val >= minValInclusive.longValue() && (int) val < maxValExclusive.longValue();
        case BYTE:
          return (byte) val >= minValInclusive.longValue()
              && (byte) val < maxValExclusive.longValue();
        default:
          return false;
      }
    }
  }

  //////////////////////////
  // for unit tests only
  //////////////////////////
  @VisibleForTesting
  public Number getMinValForTest() {
    return minValInclusive;
  }

  @VisibleForTesting
  public Number getMaxValForTest() {
    return maxValExclusive;
  }
}
Binary file not shown.
@@ -1,113 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Set;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

public final class FeatureValueInAcceptListOrUnsetFilter extends Query {

  private final String featureName;
  private final Set<Long> idsAcceptList;

  /**
   * Creates a query that filters for hits that have the given feature unset, or that have the
   * given feature set to a value in the given list of IDs.
   *
   * @param featureName The feature.
   * @param ids A list of id values this filter will accept for the given feature.
   * @return A query that filters out all hits that have the given feature set.
   */
  public static Query getFeatureValueInAcceptListOrUnsetFilter(String featureName, Set<Long> ids) {
    return new BooleanQuery.Builder()
        .add(new FeatureValueInAcceptListOrUnsetFilter(featureName, ids),
            BooleanClause.Occur.FILTER)
        .build();
  }

  @Override
  public String toString(String s) {
    return String.format("FeatureValueInAcceptListOrUnsetFilter(%s, AcceptList = (%s))",
        featureName,
        idsAcceptList);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof FeatureValueInAcceptListOrUnsetFilter)) {
      return false;
    }

    FeatureValueInAcceptListOrUnsetFilter filter =
        FeatureValueInAcceptListOrUnsetFilter.class.cast(obj);
    return featureName.equals(filter.featureName) && idsAcceptList.equals(filter.idsAcceptList);
  }

  @Override
  public int hashCode() {
    return featureName.hashCode() * 7 + idsAcceptList.hashCode();
  }

  private FeatureValueInAcceptListOrUnsetFilter(String featureName, Set<Long> ids) {
    this.featureName = Preconditions.checkNotNull(featureName);
    this.idsAcceptList = Preconditions.checkNotNull(ids);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        return new FeatureValueInAcceptListOrUnsetDocIdSetIterator(
            context.reader(), featureName, idsAcceptList);
      }
    };
  }

  private static final class FeatureValueInAcceptListOrUnsetDocIdSetIterator
      extends RangeFilterDISI {
    private final NumericDocValues featureDocValues;
    private final Set<Long> idsAcceptList;

    FeatureValueInAcceptListOrUnsetDocIdSetIterator(
        LeafReader indexReader, String featureName, Set<Long> ids) throws IOException {
      super(indexReader);
      this.featureDocValues = indexReader.getNumericDocValues(featureName);
      this.idsAcceptList = ids;
    }

    @Override
    public boolean shouldReturnDoc() throws IOException {
      // If featureDocValues is null, that means there were no documents indexed with the given
      // field in the current segment.
      //
      // The advanceExact() method returns false if it cannot find the given docId in the
      // NumericDocValues instance. So if advanceExact() returns false then we know the feature is
      // unset.
      // However, for realtime Earlybirds we have a custom implementation of NumericDocValues,
      // ColumnStrideFieldDocValues, which will contain an entry for every indexed docId and use a
      // value of 0 to indicate that a feature is unset.
      //
      // So to check if a feature is unset for a given docId, we first need to check if we can find
      // the docId, and then we additionally need to check if the feature value is 0.
      return featureDocValues == null
          || !featureDocValues.advanceExact(docID())
          || featureDocValues.longValue() == 0
          || idsAcceptList.contains(featureDocValues.longValue());
    }
  }
}
Binary file not shown.
@ -1,255 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.queries;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Set;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
|
||||
import org.apache.lucene.index.LeafReaderContext;
|
||||
import org.apache.lucene.index.Term;
|
||||
import org.apache.lucene.search.ConstantScoreQuery;
|
||||
import org.apache.lucene.search.ConstantScoreScorer;
|
||||
import org.apache.lucene.search.DocIdSetIterator;
|
||||
import org.apache.lucene.search.Explanation;
|
||||
import org.apache.lucene.search.IndexSearcher;
|
||||
import org.apache.lucene.search.Query;
|
||||
import org.apache.lucene.search.Scorer;
|
||||
import org.apache.lucene.search.ScoreMode;
|
||||
import org.apache.lucene.search.TwoPhaseIterator;
|
||||
import org.apache.lucene.search.Weight;
|
||||
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.search.TerminationTracker;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
|
||||
|
||||
public class GeoTwoPhaseQuery extends Query {
|
||||
private static final boolean ENABLE_GEO_EARLY_TERMINATION =
|
||||
EarlybirdConfig.getBool("early_terminate_geo_searches", true);
|
||||
|
||||
private static final int GEO_TIMEOUT_OVERRIDE =
|
||||
EarlybirdConfig.getInt("early_terminate_geo_searches_timeout_override", -1);
|
||||
|
||||
// How many geo searches are early terminated due to timeout.
|
||||
private static final SearchCounter GEO_SEARCH_TIMEOUT_COUNT =
|
||||
SearchCounter.export("geo_search_timeout_count");
|
||||
|
||||
private final SecondPhaseDocAccepter accepter;
|
||||
private final TerminationTracker terminationTracker;
|
||||
private final ConstantScoreQuery query;
|
||||
|
||||
public GeoTwoPhaseQuery(
|
||||
Query query, SecondPhaseDocAccepter accepter, TerminationTracker terminationTracker) {
|
||||
this.accepter = accepter;
|
||||
this.terminationTracker = terminationTracker;
|
||||
|
||||
this.query = new ConstantScoreQuery(query);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Query rewrite(IndexReader reader) throws IOException {
|
||||
Query rewritten = query.getQuery().rewrite(reader);
|
||||
if (rewritten != query.getQuery()) {
|
||||
return new GeoTwoPhaseQuery(rewritten, accepter, terminationTracker);
|
||||
}
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return query.hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (!(obj instanceof GeoTwoPhaseQuery)) {
|
||||
return false;
|
||||
}
|
||||
GeoTwoPhaseQuery that = (GeoTwoPhaseQuery) obj;
|
||||
return query.equals(that.query)
|
||||
&& accepter.equals(that.accepter)
|
||||
&& terminationTracker.equals(that.terminationTracker);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return new StringBuilder("GeoTwoPhaseQuery(")
|
||||
.append("Accepter(")
|
||||
.append(accepter.toString())
|
||||
.append(") Geohashes(")
|
||||
.append(query.getQuery().toString(field))
|
||||
.append("))")
|
||||
.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
||||
throws IOException {
|
||||
Weight innerWeight = query.createWeight(searcher, scoreMode, boost);
|
||||
return new GeoTwoPhaseWeight(this, innerWeight, accepter, terminationTracker);
|
||||
}
|
||||
|
||||
private static final class GeoTwoPhaseWeight extends Weight {
|
||||
private final Weight innerWeight;
|
||||
private final SecondPhaseDocAccepter accepter;
|
||||
private final TerminationTracker terminationTracker;
|
||||
|
||||
private GeoTwoPhaseWeight(
|
||||
Query query,
|
||||
Weight innerWeight,
|
||||
SecondPhaseDocAccepter accepter,
|
||||
TerminationTracker terminationTracker) {
|
||||
super(query);
|
||||
this.innerWeight = innerWeight;
|
||||
this.accepter = accepter;
|
||||
this.terminationTracker = terminationTracker;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void extractTerms(Set<Term> terms) {
|
||||
      innerWeight.extractTerms(terms);
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      return innerWeight.explain(context, doc);
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      Scorer innerScorer = innerWeight.scorer(context);
      if (innerScorer == null) {
        return null;
      }
      if (ENABLE_GEO_EARLY_TERMINATION
          && (terminationTracker == null || !terminationTracker.useLastSearchedDocIdOnTimeout())) {
        innerScorer = new ConstantScoreScorer(
            this,
            0.0f,
            ScoreMode.COMPLETE_NO_SCORES,
            new TimedDocIdSetIterator(innerScorer.iterator(),
                terminationTracker,
                GEO_TIMEOUT_OVERRIDE,
                GEO_SEARCH_TIMEOUT_COUNT));
      }

      accepter.initialize(context);
      return new GeoTwoPhaseScorer(this, innerScorer, accepter);
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return innerWeight.isCacheable(ctx);
    }
  }

  private static final class GeoTwoPhaseScorer extends Scorer {
    private final Scorer innerScorer;
    private final SecondPhaseDocAccepter accepter;

    private GeoTwoPhaseScorer(Weight weight, Scorer innerScorer, SecondPhaseDocAccepter accepter) {
      super(weight);
      this.innerScorer = innerScorer;
      this.accepter = accepter;
    }

    @Override
    public TwoPhaseIterator twoPhaseIterator() {
      return new TwoPhaseIterator(innerScorer.iterator()) {
        @Override
        public boolean matches() throws IOException {
          return checkDocExpensive(innerScorer.docID());
        }

        @Override
        public float matchCost() {
          return 0.0f;
        }
      };
    }

    @Override
    public int docID() {
      return iterator().docID();
    }

    @Override
    public float score() throws IOException {
      return innerScorer.score();
    }

    @Override
    public DocIdSetIterator iterator() {
      return new DocIdSetIterator() {
        private int doNext(int startingDocId) throws IOException {
          int docId = startingDocId;
          while ((docId != NO_MORE_DOCS) && !checkDocExpensive(docId)) {
            docId = innerScorer.iterator().nextDoc();
          }
          return docId;
        }

        @Override
        public int docID() {
          return innerScorer.iterator().docID();
        }

        @Override
        public int nextDoc() throws IOException {
          return doNext(innerScorer.iterator().nextDoc());
        }

        @Override
        public int advance(int target) throws IOException {
          return doNext(innerScorer.iterator().advance(target));
        }

        @Override
        public long cost() {
          return 2 * innerScorer.iterator().cost();
        }
      };
    }

    @Override
    public float getMaxScore(int upTo) throws IOException {
      return innerScorer.getMaxScore(upTo);
    }

    private boolean checkDocExpensive(int doc) throws IOException {
      return accepter.accept(doc);
    }
  }

  public abstract static class SecondPhaseDocAccepter {
    /**
     * Initializes this accepter with the given reader context.
     */
    public abstract void initialize(LeafReaderContext context) throws IOException;

    /**
     * Determines if the given doc ID is accepted by this accepter.
     */
    public abstract boolean accept(int doc) throws IOException;

    /**
     * Returns a string description for this SecondPhaseDocAccepter instance.
     */
    public abstract String toString();
  }

  public static final SecondPhaseDocAccepter ALL_DOCS_ACCEPTER = new SecondPhaseDocAccepter() {
    @Override
    public void initialize(LeafReaderContext context) { }

    @Override
    public boolean accept(int doc) {
      return true;
    }

    @Override
    public String toString() {
      return "AllDocsAccepter";
    }
  };
}
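The GeoTwoPhaseScorer above follows Lucene's two-phase iteration pattern: a cheap approximation iterator proposes candidate doc IDs, and the expensive per-document check (here the SecondPhaseDocAccepter) only runs on those candidates. Below is a minimal, self-contained sketch of that pattern in plain Java; the names (TwoPhaseSketch, nextMatch) are illustrative and not part of the Earlybird codebase.

import java.util.Iterator;
import java.util.List;
import java.util.function.IntPredicate;

public final class TwoPhaseSketch {
  /** Walks cheap candidates and applies the expensive check lazily, mirroring matches(). */
  static int nextMatch(Iterator<Integer> candidates, IntPredicate expensiveCheck) {
    while (candidates.hasNext()) {
      int docId = candidates.next();     // phase one: cheap approximation
      if (expensiveCheck.test(docId)) {  // phase two: costly per-doc accept check
        return docId;
      }
    }
    return Integer.MAX_VALUE;            // stand-in for DocIdSetIterator.NO_MORE_DOCS
  }

  public static void main(String[] args) {
    List<Integer> candidates = List.of(3, 7, 11, 42);
    // Pretend the expensive check only accepts even doc IDs.
    System.out.println(nextMatch(candidates.iterator(), docId -> docId % 2 == 0)); // prints 42
  }
}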
Binary file not shown.
@ -1,44 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.RamUsageEstimator;

import com.twitter.search.core.earlybird.index.util.AllDocsIterator;

public final class MatchAllDocIdSet extends DocIdSet {
  private final LeafReader reader;

  public MatchAllDocIdSet(LeafReader reader) {
    this.reader = reader;
  }

  @Override
  public DocIdSetIterator iterator() throws IOException {
    return new AllDocsIterator(reader);
  }

  @Override
  public Bits bits() throws IOException {
    return new Bits() {
      @Override
      public boolean get(int index) {
        return true;
      }

      @Override
      public int length() {
        return reader.maxDoc();
      }
    };
  }

  @Override
  public long ramBytesUsed() {
    return RamUsageEstimator.shallowSizeOf(this);
  }
}
Binary file not shown.
@ -1,91 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Set;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher;

/**
 * A MatchAllDocsQuery implementation that does not assume that doc IDs are assigned sequentially.
 * Instead, it wraps the EarlybirdIndexSegmentAtomicReader into a RangeFilterDISI, and uses
 * this iterator to traverse only the valid doc IDs in this segment.
 *
 * Note that org.apache.lucene.index.MatchAllDocsQuery is final, so we cannot extend it.
 */
public class MatchAllDocsQuery extends Query {
  private static class MatchAllDocsWeight extends Weight {
    private final Weight luceneWeight;

    public MatchAllDocsWeight(Query query, Weight luceneWeight) {
      super(query);
      this.luceneWeight = luceneWeight;
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      luceneWeight.extractTerms(terms);
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      return luceneWeight.explain(context, doc);
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      Preconditions.checkState(context.reader() instanceof EarlybirdIndexSegmentAtomicReader,
          "Expected an EarlybirdIndexSegmentAtomicReader, but got a "
              + context.reader().getClass().getName() + " instance.");
      EarlybirdIndexSegmentAtomicReader reader =
          (EarlybirdIndexSegmentAtomicReader) context.reader();
      return new ConstantScoreScorer(
          this, 1.0f, ScoreMode.COMPLETE_NO_SCORES, new RangeFilterDISI(reader));
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return luceneWeight.isCacheable(ctx);
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    org.apache.lucene.search.MatchAllDocsQuery luceneMatchAllDocsQuery =
        new org.apache.lucene.search.MatchAllDocsQuery();
    Weight luceneWeight = luceneMatchAllDocsQuery.createWeight(searcher, scoreMode, boost);
    if (!(searcher instanceof EarlybirdSingleSegmentSearcher)) {
      return luceneWeight;
    }
    return new MatchAllDocsWeight(this, luceneWeight);
  }

  @Override
  public int hashCode() {
    return 0;
  }

  @Override
  public boolean equals(Object obj) {
    return obj instanceof MatchAllDocsQuery;
  }

  // Copied from org.apache.lucene.search.MatchAllDocsWeight
  @Override
  public String toString(String field) {
    return "*:*";
  }
}
Binary file not shown.
@ -1,131 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.search.IntArrayDocIdSetIterator;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.earlybird.index.TweetIDMapper;

public final class RequiredStatusIDsFilter extends Query {
  private final Collection<Long> statusIDs;

  public static Query getRequiredStatusIDsQuery(Collection<Long> statusIDs) {
    return new BooleanQuery.Builder()
        .add(new RequiredStatusIDsFilter(statusIDs), BooleanClause.Occur.FILTER)
        .build();
  }

  private RequiredStatusIDsFilter(Collection<Long> statusIDs) {
    this.statusIDs = Preconditions.checkNotNull(statusIDs);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader leafReader = context.reader();
        if (!(leafReader instanceof EarlybirdIndexSegmentAtomicReader)) {
          return DocIdSetIterator.empty();
        }

        EarlybirdIndexSegmentAtomicReader reader = (EarlybirdIndexSegmentAtomicReader) leafReader;
        TweetIDMapper idMapper = (TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();

        int docIdsSize = 0;
        int[] docIds = new int[statusIDs.size()];
        for (long statusID : statusIDs) {
          int docId = idMapper.getDocID(statusID);
          if (docId >= 0) {
            docIds[docIdsSize++] = docId;
          }
        }

        Arrays.sort(docIds, 0, docIdsSize);
        DocIdSetIterator statusesDISI =
            new IntArrayDocIdSetIterator(Arrays.copyOf(docIds, docIdsSize));
        DocIdSetIterator allDocsDISI = new AllDocsIterator(reader);

        // We only want to return IDs for fully indexed documents. So we need to make sure that
        // every doc ID we return exists in allDocsDISI. However, allDocsDISI has all documents in
        // this segment, so driving by allDocsDISI would be very slow. So we want to drive by
        // statusesDISI, and use allDocsDISI as a post-filter. What this comes down to is that we do
        // not want to call allDocsDISI.nextDoc(); we only want to call allDocsDISI.advance(), and
        // only on the doc IDs returned by statusesDISI.
        return new DocIdSetIterator() {
          @Override
          public int docID() {
            return statusesDISI.docID();
          }

          @Override
          public int nextDoc() throws IOException {
            statusesDISI.nextDoc();
            return advanceToNextFullyIndexedDoc();
          }

          @Override
          public int advance(int target) throws IOException {
            statusesDISI.advance(target);
            return advanceToNextFullyIndexedDoc();
          }

          private int advanceToNextFullyIndexedDoc() throws IOException {
            while (docID() != DocIdSetIterator.NO_MORE_DOCS) {
              // Check if the current doc is fully indexed.
              // If it is, then we can return it. If it's not, then we need to keep searching.
              int allDocsDocId = allDocsDISI.advance(docID());
              if (allDocsDocId == docID()) {
                break;
              }

              statusesDISI.advance(allDocsDocId);
            }
            return docID();
          }

          @Override
          public long cost() {
            return statusesDISI.cost();
          }
        };
      }
    };
  }

  @Override
  public int hashCode() {
    return statusIDs.hashCode();
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof RequiredStatusIDsFilter)) {
      return false;
    }

    RequiredStatusIDsFilter filter = RequiredStatusIDsFilter.class.cast(obj);
    return statusIDs.equals(filter.statusIDs);
  }

  @Override
  public final String toString(String field) {
    return String.format("RequiredStatusIDs[%s]", statusIDs);
  }
}
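The comment in getDocIdSetIterator above describes driving by the sparse statusesDISI and touching allDocsDISI only through advance(). Here is a rough, self-contained sketch of that intersection pattern over two sorted int arrays; the names are illustrative and not taken from the codebase.

import java.util.Arrays;

public final class DrivenIntersectionSketch {
  /** Keeps only the requested doc IDs that also appear in the (much larger) sorted base set. */
  static int[] intersect(int[] requestedSorted, int[] baseSorted) {
    int[] out = new int[requestedSorted.length];
    int size = 0;
    int basePos = 0;
    for (int docId : requestedSorted) {          // drive by the small, sparse set
      while (basePos < baseSorted.length && baseSorted[basePos] < docId) {
        basePos++;                               // "advance" the dense set, never scan it fully
      }
      if (basePos < baseSorted.length && baseSorted[basePos] == docId) {
        out[size++] = docId;
      }
    }
    return Arrays.copyOf(out, size);
  }

  public static void main(String[] args) {
    int[] requested = {5, 9, 20};
    int[] allDocs = {1, 2, 3, 5, 8, 9, 13, 21};
    System.out.println(Arrays.toString(intersect(requested, allDocs))); // [5, 9]
  }
}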
Binary file not shown.
@ -1,86 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.ConstantScoreScorer;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

/**
 * A version of a term query that we can use when we already know the term id (in case where we
 * previously looked it up), and have a TermsEnum to get the actual postings.
 *
 * This can be used for constant score queries, where only iterating on the postings is required.
 */
class SimpleTermQuery extends Query {
  private final TermsEnum termsEnum;
  private final long termId;

  public SimpleTermQuery(TermsEnum termsEnum, long termId) {
    this.termsEnum = termsEnum;
    this.termId = termId;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new SimpleTermQueryWeight(scoreMode);
  }

  @Override
  public int hashCode() {
    return (termsEnum == null ? 0 : termsEnum.hashCode()) * 13 + (int) termId;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof SimpleTermQuery)) {
      return false;
    }

    SimpleTermQuery query = SimpleTermQuery.class.cast(obj);
    return (termsEnum == null ? query.termsEnum == null : termsEnum.equals(query.termsEnum))
        && (termId == query.termId);
  }

  @Override
  public String toString(String field) {
    return "SimpleTermQuery(" + field + ":" + termId + ")";
  }

  private class SimpleTermQueryWeight extends ConstantScoreWeight {
    private final ScoreMode scoreMode;

    public SimpleTermQueryWeight(ScoreMode scoreMode) {
      super(SimpleTermQuery.this, 1.0f);
      this.scoreMode = scoreMode;
    }

    @Override
    public String toString() {
      return "weight(" + SimpleTermQuery.this + ")";
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      termsEnum.seekExact(termId);

      PostingsEnum docs = termsEnum.postings(
          null, scoreMode.needsScores() ? PostingsEnum.FREQS : PostingsEnum.NONE);
      assert docs != null;
      return new ConstantScoreScorer(this, 0, scoreMode, docs);
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return true;
    }
  }
}
Binary file not shown.
@ -1,211 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.index.TweetIDMapper;

/**
 * Filters tweet ids according to the since_id and max_id parameters.
 *
 * Note that since_id is exclusive and max_id is inclusive.
 */
public final class SinceMaxIDFilter extends Query {
  public static final long NO_FILTER = -1;

  private final long sinceIdExclusive;
  private final long maxIdInclusive;

  public static Query getSinceMaxIDQuery(long sinceIdExclusive, long maxIdInclusive) {
    return new BooleanQuery.Builder()
        .add(new SinceMaxIDFilter(sinceIdExclusive, maxIdInclusive), BooleanClause.Occur.FILTER)
        .build();
  }

  public static Query getSinceIDQuery(long sinceIdExclusive) {
    return new BooleanQuery.Builder()
        .add(new SinceMaxIDFilter(sinceIdExclusive, NO_FILTER), BooleanClause.Occur.FILTER)
        .build();
  }

  public static Query getMaxIDQuery(long maxIdInclusive) {
    return new BooleanQuery.Builder()
        .add(new SinceMaxIDFilter(NO_FILTER, maxIdInclusive), BooleanClause.Occur.FILTER)
        .build();
  }

  private SinceMaxIDFilter(long sinceIdExclusive, long maxIdInclusive) {
    this.sinceIdExclusive = sinceIdExclusive;
    this.maxIdInclusive = maxIdInclusive;
  }

  @Override
  public int hashCode() {
    return (int) (sinceIdExclusive * 13 + maxIdInclusive);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof SinceMaxIDFilter)) {
      return false;
    }

    SinceMaxIDFilter filter = SinceMaxIDFilter.class.cast(obj);
    return (sinceIdExclusive == filter.sinceIdExclusive)
        && (maxIdInclusive == filter.maxIdInclusive);
  }

  @Override
  public String toString(String field) {
    if (sinceIdExclusive != NO_FILTER && maxIdInclusive != NO_FILTER) {
      return "SinceIdFilter:" + sinceIdExclusive + ",MaxIdFilter:" + maxIdInclusive;
    } else if (maxIdInclusive != NO_FILTER) {
      return "MaxIdFilter:" + maxIdInclusive;
    } else {
      return "SinceIdFilter:" + sinceIdExclusive;
    }
  }

  /**
   * Determines if this segment is at least partially covered by the given tweet ID range.
   */
  public static boolean sinceMaxIDsInRange(
      TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
    // Check for since id out of range. Note that since this ID is exclusive,
    // equality is out of range too.
    if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMaxTweetID()) {
      return false;
    }

    // Check for max id in range.
    return maxIdInclusive == NO_FILTER || maxIdInclusive >= tweetIdMapper.getMinTweetID();
  }

  // Returns true if this segment is completely covered by these id filters.
  private static boolean sinceMaxIdsCoverRange(
      TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
    // Check for since_id specified AND since_id newer than the first tweet.
    if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMinTweetID()) {
      return false;
    }

    // Check for max id in range.
    return maxIdInclusive == NO_FILTER || maxIdInclusive > tweetIdMapper.getMaxTweetID();
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
          return new AllDocsIterator(reader);
        }

        EarlybirdIndexSegmentAtomicReader twitterInMemoryIndexReader =
            (EarlybirdIndexSegmentAtomicReader) reader;
        TweetIDMapper tweetIdMapper =
            (TweetIDMapper) twitterInMemoryIndexReader.getSegmentData().getDocIDToTweetIDMapper();

        // Important to return a null DocIdSetIterator here, so the Scorer will skip searching
        // this segment completely.
        if (!sinceMaxIDsInRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
          return null;
        }

        // Optimization: just return a match-all iterator when the whole segment is in range.
        // This avoids having to do so many status id lookups.
        if (sinceMaxIdsCoverRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
          return new AllDocsIterator(reader);
        }

        return new SinceMaxIDDocIdSetIterator(
            twitterInMemoryIndexReader, sinceIdExclusive, maxIdInclusive);
      }
    };
  }

  @VisibleForTesting
  static class SinceMaxIDDocIdSetIterator extends RangeFilterDISI {
    private final DocIDToTweetIDMapper docIdToTweetIdMapper;
    private final long sinceIdExclusive;
    private final long maxIdInclusive;

    public SinceMaxIDDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
                                      long sinceIdExclusive,
                                      long maxIdInclusive) throws IOException {
      super(reader,
          findMaxIdDocID(reader, maxIdInclusive),
          findSinceIdDocID(reader, sinceIdExclusive));
      this.docIdToTweetIdMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
      this.sinceIdExclusive = sinceIdExclusive; // sinceStatusId == NO_FILTER is OK, it's exclusive
      this.maxIdInclusive = maxIdInclusive != NO_FILTER ? maxIdInclusive : Long.MAX_VALUE;
    }

    /**
     * This is a necessary check when we have out of order tweets in the archive.
     * When tweets are out of order, this guarantees that no false positive results are returned.
     * I.e. we can still miss some tweets in the specified range, but we never incorrectly return
     * anything that's not in the range.
     */
    @Override
    protected boolean shouldReturnDoc() {
      final long statusID = docIdToTweetIdMapper.getTweetID(docID());
      return statusID > sinceIdExclusive && statusID <= maxIdInclusive;
    }

    private static int findSinceIdDocID(
        EarlybirdIndexSegmentAtomicReader reader, long sinceIdExclusive) throws IOException {
      TweetIDMapper tweetIdMapper =
          (TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
      if (sinceIdExclusive != SinceMaxIDFilter.NO_FILTER) {
        // We use this as an upper bound on the search, so we want to find the highest possible
        // doc ID for this tweet ID.
        boolean findMaxDocID = true;
        return tweetIdMapper.findDocIdBound(
            sinceIdExclusive,
            findMaxDocID,
            reader.getSmallestDocID(),
            reader.maxDoc() - 1);
      } else {
        return DocIDToTweetIDMapper.ID_NOT_FOUND;
      }
    }

    private static int findMaxIdDocID(
        EarlybirdIndexSegmentAtomicReader reader, long maxIdInclusive) throws IOException {
      TweetIDMapper tweetIdMapper =
          (TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
      if (maxIdInclusive != SinceMaxIDFilter.NO_FILTER) {
        // We use this as a lower bound on the search, so we want to find the lowest possible
        // doc ID for this tweet ID.
        boolean findMaxDocID = false;
        return tweetIdMapper.findDocIdBound(
            maxIdInclusive,
            findMaxDocID,
            reader.getSmallestDocID(),
            reader.maxDoc() - 1);
      } else {
        return DocIDToTweetIDMapper.ID_NOT_FOUND;
      }
    }
  }
}
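As the Javadoc above states, since_id is exclusive, max_id is inclusive, and -1 means "no bound". A tiny self-contained sketch of that range test, assuming nothing beyond the semantics stated in the filter (the class and method names here are illustrative only):

public final class SinceMaxIdSketch {
  static final long NO_FILTER = -1;

  /** since_id is exclusive, max_id is inclusive; NO_FILTER disables either bound. */
  static boolean inRange(long tweetId, long sinceIdExclusive, long maxIdInclusive) {
    boolean aboveSince = sinceIdExclusive == NO_FILTER || tweetId > sinceIdExclusive;
    boolean belowMax = maxIdInclusive == NO_FILTER || tweetId <= maxIdInclusive;
    return aboveSince && belowMax;
  }

  public static void main(String[] args) {
    System.out.println(inRange(100, 100, 200));       // false: since_id is exclusive
    System.out.println(inRange(200, 100, 200));       // true: max_id is inclusive
    System.out.println(inRange(999, 100, NO_FILTER)); // true: no upper bound
  }
}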
Binary file not shown.
@ -1,137 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

// Filters tweets according to since time and until time (in seconds).
// Note that since time is inclusive, and until time is exclusive.
public final class SinceUntilFilter extends Query {
  public static final int NO_FILTER = -1;

  // These are both in seconds since the epoch.
  private final int minTimeInclusive;
  private final int maxTimeExclusive;

  public static Query getSinceQuery(int sinceTimeSeconds) {
    return new BooleanQuery.Builder()
        .add(new SinceUntilFilter(sinceTimeSeconds, NO_FILTER), BooleanClause.Occur.FILTER)
        .build();
  }

  public static Query getUntilQuery(int untilTimeSeconds) {
    return new BooleanQuery.Builder()
        .add(new SinceUntilFilter(NO_FILTER, untilTimeSeconds), BooleanClause.Occur.FILTER)
        .build();
  }

  public static Query getSinceUntilQuery(int sinceTimeSeconds, int untilTimeSeconds) {
    return new BooleanQuery.Builder()
        .add(new SinceUntilFilter(sinceTimeSeconds, untilTimeSeconds), BooleanClause.Occur.FILTER)
        .build();
  }

  private SinceUntilFilter(int sinceTime, int untilTime) {
    this.minTimeInclusive = sinceTime != NO_FILTER ? sinceTime : 0;
    this.maxTimeExclusive = untilTime != NO_FILTER ? untilTime : Integer.MAX_VALUE;
  }

  @Override
  public int hashCode() {
    return (int) (minTimeInclusive * 17 + maxTimeExclusive);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof SinceUntilFilter)) {
      return false;
    }

    SinceUntilFilter filter = SinceUntilFilter.class.cast(obj);
    return (minTimeInclusive == filter.minTimeInclusive)
        && (maxTimeExclusive == filter.maxTimeExclusive);
  }

  @Override
  public String toString(String field) {
    if (minTimeInclusive > 0 && maxTimeExclusive != Integer.MAX_VALUE) {
      return "SinceFilter:" + this.minTimeInclusive + ",UntilFilter:" + maxTimeExclusive;
    } else if (minTimeInclusive > 0) {
      return "SinceFilter:" + this.minTimeInclusive;
    } else {
      return "UntilFilter:" + this.maxTimeExclusive;
    }
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader indexReader = context.reader();
        if (!(indexReader instanceof EarlybirdIndexSegmentAtomicReader)) {
          return new AllDocsIterator(indexReader);
        }

        EarlybirdIndexSegmentAtomicReader reader = (EarlybirdIndexSegmentAtomicReader) indexReader;
        TimeMapper timeMapper = reader.getSegmentData().getTimeMapper();
        int smallestDocID = timeMapper.findFirstDocId(maxTimeExclusive, reader.getSmallestDocID());
        int largestDoc = timeMapper.findFirstDocId(minTimeInclusive, reader.getSmallestDocID());
        int smallestDoc = smallestDocID > 0 ? smallestDocID - 1 : 0;
        return new SinceUntilDocIdSetIterator(
            reader,
            timeMapper,
            smallestDoc,
            largestDoc,
            minTimeInclusive,
            maxTimeExclusive);
      }
    };
  }

  // Returns true if this TimeMapper is at least partially covered by these time filters.
  public static boolean sinceUntilTimesInRange(
      TimeMapper timeMapper, int sinceTime, int untilTime) {
    return (sinceTime == NO_FILTER || sinceTime <= timeMapper.getLastTime())
        && (untilTime == NO_FILTER || untilTime >= timeMapper.getFirstTime());
  }

  private static final class SinceUntilDocIdSetIterator extends RangeFilterDISI {
    private final TimeMapper timeMapper;
    private final int minTimeInclusive;
    private final int maxTimeExclusive;

    public SinceUntilDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
                                      TimeMapper timeMapper,
                                      int smallestDocID,
                                      int largestDocID,
                                      int minTimeInclusive,
                                      int maxExclusive) throws IOException {
      super(reader, smallestDocID, largestDocID);
      this.timeMapper = timeMapper;
      this.minTimeInclusive = minTimeInclusive;
      this.maxTimeExclusive = maxExclusive;
    }

    @Override
    protected boolean shouldReturnDoc() {
      final int docTime = timeMapper.getTime(docID());
      return docTime >= minTimeInclusive && docTime < maxTimeExclusive;
    }
  }
}
Binary file not shown.
@ -1,29 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import org.apache.lucene.index.Term;
import org.apache.lucene.search.TermQuery;

/**
 * Work around an issue where IntTerms and LongTerms are not valid UTF-8,
 * so calling toString on any TermQuery containing an IntTerm or a LongTerm may cause exceptions.
 * This code should produce the same output as TermQuery.toString.
 */
public final class TermQueryWithSafeToString extends TermQuery {
  private final String termValueForToString;

  public TermQueryWithSafeToString(Term term, String termValueForToString) {
    super(term);
    this.termValueForToString = termValueForToString;
  }

  @Override
  public String toString(String field) {
    StringBuilder buffer = new StringBuilder();
    if (!getTerm().field().equals(field)) {
      buffer.append(getTerm().field());
      buffer.append(":");
    }
    buffer.append(termValueForToString);
    return buffer.toString();
  }
}
Binary file not shown.
@ -1,128 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.search.DocIdSetIterator;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.search.EarlyTerminationState;
import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

/**
 * DocIdSetIterator whose nextDoc() and advance() will early terminate by returning NO_MORE_DOCS
 * after the given deadline.
 */
public class TimedDocIdSetIterator extends DocIdSetIterator {
  // check deadline every NEXT_CALL_TIMEOUT_CHECK_PERIOD calls to nextDoc()
  @VisibleForTesting
  protected static final int NEXT_CALL_TIMEOUT_CHECK_PERIOD =
      EarlybirdConfig.getInt("timed_doc_id_set_next_doc_deadline_check_period", 1000);

  // check deadline every ADVANCE_CALL_TIMEOUT_CHECK_PERIOD calls to advance()
  private static final int ADVANCE_CALL_TIMEOUT_CHECK_PERIOD =
      EarlybirdConfig.getInt("timed_doc_id_set_advance_deadline_check_period", 100);

  private final Clock clock;
  private final DocIdSetIterator innerIterator;
  private final SearchCounter timeoutCountStat;

  @Nullable
  private final TerminationTracker terminationTracker;
  private final long deadlineMillisFromEpoch;

  private int docId = -1;
  private int nextCounter = 0;
  private int advanceCounter = 0;

  public TimedDocIdSetIterator(DocIdSetIterator innerIterator,
                               @Nullable TerminationTracker terminationTracker,
                               final long timeoutOverride,
                               @Nullable SearchCounter timeoutCountStat) {
    this(innerIterator, terminationTracker, timeoutOverride, timeoutCountStat, Clock.SYSTEM_CLOCK);
  }

  protected TimedDocIdSetIterator(DocIdSetIterator innerIterator,
                                  @Nullable TerminationTracker terminationTracker,
                                  final long timeoutOverride,
                                  @Nullable SearchCounter timeoutCountStat,
                                  Clock clock) {
    this.clock = clock;
    this.innerIterator = innerIterator;
    this.timeoutCountStat = timeoutCountStat;
    this.terminationTracker = terminationTracker;

    if (terminationTracker == null) {
      deadlineMillisFromEpoch = -1;
    } else {
      if (timeoutOverride > 0) {
        deadlineMillisFromEpoch = terminationTracker.getClientStartTimeMillis() + timeoutOverride;
      } else {
        deadlineMillisFromEpoch = terminationTracker.getTimeoutEndTimeWithReservation();
      }
    }
  }

  @VisibleForTesting
  protected TimedDocIdSetIterator(DocIdSetIterator innerIterator,
                                  final long deadline,
                                  @Nullable SearchCounter timeoutCountStat,
                                  Clock clock) {
    this.clock = clock;
    this.innerIterator = innerIterator;
    this.timeoutCountStat = timeoutCountStat;
    this.terminationTracker = null;

    this.deadlineMillisFromEpoch = deadline;
  }

  @Override
  public int docID() {
    return docId;
  }

  @Override
  public int nextDoc() throws IOException {
    if (++nextCounter % NEXT_CALL_TIMEOUT_CHECK_PERIOD == 0
        && clock.nowMillis() > deadlineMillisFromEpoch) {
      if (timeoutCountStat != null) {
        timeoutCountStat.increment();
      }
      if (terminationTracker != null) {
        terminationTracker.setEarlyTerminationState(
            EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
      }

      return docId = NO_MORE_DOCS;
    }
    return docId = innerIterator.nextDoc();
  }

  @Override
  public int advance(int target) throws IOException {
    if (++advanceCounter % ADVANCE_CALL_TIMEOUT_CHECK_PERIOD == 0
        && clock.nowMillis() > deadlineMillisFromEpoch) {
      if (timeoutCountStat != null) {
        timeoutCountStat.increment();
      }
      if (terminationTracker != null) {
        terminationTracker.setEarlyTerminationState(
            EarlyTerminationState.TERMINATED_TIME_OUT_EXCEEDED);
      }
      return docId = NO_MORE_DOCS;
    }

    return docId = innerIterator.advance(target);
  }

  @Override
  public long cost() {
    return innerIterator.cost();
  }
}
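TimedDocIdSetIterator above only consults the clock every N calls so that the timeout check itself stays cheap on the hot path. A minimal sketch of that counter-gated deadline check follows; it is plain Java with illustrative names, and unlike the real class it does not report stats or update a TerminationTracker.

public final class DeadlineCheckSketch {
  private static final int CHECK_PERIOD = 1000;

  private final long deadlineMillis;
  private int calls = 0;

  DeadlineCheckSketch(long deadlineMillis) {
    this.deadlineMillis = deadlineMillis;
  }

  /** Returns true if work should stop; the clock is read only once per CHECK_PERIOD calls. */
  boolean shouldTerminate() {
    return ++calls % CHECK_PERIOD == 0 && System.currentTimeMillis() > deadlineMillis;
  }

  public static void main(String[] args) {
    DeadlineCheckSketch check = new DeadlineCheckSketch(System.currentTimeMillis() + 50);
    int processed = 0;
    while (!check.shouldTerminate() && processed < 10_000_000) {
      processed++; // stand-in for innerIterator.nextDoc()
    }
    System.out.println("processed " + processed + " docs before stopping");
  }
}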
Binary file not shown.
@ -1,128 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.common.userupdates.UserTable;

public final class UserFlagsExcludeFilter extends Query {
  /**
   * Returns a query that filters hits based on their author flags.
   *
   * @param excludeAntisocial Determines if the filter should exclude hits from antisocial users.
   * @param excludeOffensive Determines if the filter should exclude hits from offensive users.
   * @param excludeProtected Determines if the filter should exclude hits from protected users.
   * @return A query that filters hits based on their author flags.
   */
  public static Query getUserFlagsExcludeFilter(UserTable userTable,
                                                boolean excludeAntisocial,
                                                boolean excludeOffensive,
                                                boolean excludeProtected) {
    return new BooleanQuery.Builder()
        .add(new UserFlagsExcludeFilter(
                userTable, excludeAntisocial, excludeOffensive, excludeProtected),
            BooleanClause.Occur.FILTER)
        .build();
  }

  private final UserTable userTable;
  private final boolean excludeAntisocial;
  private final boolean excludeOffensive;
  private final boolean excludeProtected;

  private UserFlagsExcludeFilter(
      UserTable userTable,
      boolean excludeAntisocial,
      boolean excludeOffensive,
      boolean excludeProtected) {
    this.userTable = userTable;
    this.excludeAntisocial = excludeAntisocial;
    this.excludeOffensive = excludeOffensive;
    this.excludeProtected = excludeProtected;
  }

  @Override
  public int hashCode() {
    return (excludeAntisocial ? 13 : 0) + (excludeOffensive ? 1 : 0) + (excludeProtected ? 2 : 0);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof UserFlagsExcludeFilter)) {
      return false;
    }

    UserFlagsExcludeFilter filter = UserFlagsExcludeFilter.class.cast(obj);
    return (excludeAntisocial == filter.excludeAntisocial)
        && (excludeOffensive == filter.excludeOffensive)
        && (excludeProtected == filter.excludeProtected);
  }

  @Override
  public String toString(String field) {
    return "UserFlagsExcludeFilter";
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        if (userTable == null) {
          return new AllDocsIterator(reader);
        }

        final int bits =
            (excludeAntisocial ? UserTable.ANTISOCIAL_BIT : 0)
                | (excludeOffensive ? UserTable.OFFENSIVE_BIT | UserTable.NSFW_BIT : 0)
                | (excludeProtected ? UserTable.IS_PROTECTED_BIT : 0);
        if (bits != 0) {
          return new UserFlagsExcludeDocIdSetIterator(reader, userTable) {
            @Override
            protected boolean checkUserFlags(UserTable table, long userID) {
              return !table.isSet(userID, bits);
            }
          };
        }

        return new AllDocsIterator(reader);
      }
    };
  }

  private abstract static class UserFlagsExcludeDocIdSetIterator extends RangeFilterDISI {
    private final UserTable userTable;
    private final NumericDocValues fromUserID;

    public UserFlagsExcludeDocIdSetIterator(
        LeafReader indexReader, UserTable table) throws IOException {
      super(indexReader);
      userTable = table;
      fromUserID =
          indexReader.getNumericDocValues(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName());
    }

    @Override
    protected boolean shouldReturnDoc() throws IOException {
      return fromUserID.advanceExact(docID())
          && checkUserFlags(userTable, fromUserID.longValue());
    }

    protected abstract boolean checkUserFlags(UserTable table, long userID);
  }
}
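The filter above folds the requested exclusions into a single bitmask and rejects a document when any of those bits is set for its author. A small self-contained sketch of that bit test follows; the bit values here are made up for illustration, since the real constants live in UserTable.

public final class UserFlagsSketch {
  // Illustrative bit values only; the real constants are defined in UserTable.
  static final int ANTISOCIAL_BIT = 1 << 0;
  static final int OFFENSIVE_BIT = 1 << 1;
  static final int NSFW_BIT = 1 << 2;
  static final int IS_PROTECTED_BIT = 1 << 3;

  /** Mirrors checkUserFlags: keep the doc only if none of the excluded bits are set. */
  static boolean keep(int userFlags, boolean excludeAntisocial, boolean excludeOffensive,
                      boolean excludeProtected) {
    int bits = (excludeAntisocial ? ANTISOCIAL_BIT : 0)
        | (excludeOffensive ? OFFENSIVE_BIT | NSFW_BIT : 0)
        | (excludeProtected ? IS_PROTECTED_BIT : 0);
    return (userFlags & bits) == 0;
  }

  public static void main(String[] args) {
    int flags = OFFENSIVE_BIT;                           // author marked offensive
    System.out.println(keep(flags, true, false, true));  // true: offensive not excluded
    System.out.println(keep(flags, true, true, true));   // false: offensive excluded
  }
}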
Binary file not shown.
@ -1,528 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BulkScorer;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.ConstantScoreWeight;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.BytesRef;

import com.twitter.decider.Decider;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchTimer;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.query.IDDisjunctionQuery;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.base.IndexedNumericFieldSettings;
import com.twitter.search.common.schema.base.Schema;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.common.search.termination.QueryTimeout;
import com.twitter.search.common.util.analysis.LongTermAttributeImpl;
import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData;
import com.twitter.search.core.earlybird.index.inverted.InvertedIndex;
import com.twitter.search.core.earlybird.index.inverted.MultiSegmentTermDictionary;
import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager;
import com.twitter.search.earlybird.queryparser.EarlybirdQueryHelper;
import com.twitter.search.queryparser.query.QueryParserException;

/**
 * A variant of a multi-term ID disjunction query (similar to {@link UserIdMultiSegmentQuery}),
 * that also uses a {@link MultiSegmentTermDictionary} where available, for more efficient
 * term lookups for queries that span multiple segments.
 *
 * By default, an IDDisjunctionQuery (or Lucene's MultiTermQuery) does a term dictionary lookup
 * for all of the terms in its disjunction, and it does it once for each segment (or AtomicReader)
 * that the query is searching.
 * This means that when the term dictionary is large, and the term lookups are expensive, and when
 * we are searching multiple segments, the query needs to make num_terms * num_segments expensive
 * term dictionary lookups.
 *
 * With the help of a MultiSegmentTermDictionary, this multi-term disjunction query implementation
 * only does one lookup for all of the segments managed by the MultiSegmentTermDictionary.
 * If a segment is not supported by the MultiSegmentTermDictionary (e.g. if it's not optimized yet),
 * a regular lookup in that segment's term dictionary will be performed.
 *
 * Usually, we will make 'num_terms' lookups in the current, un-optimized segment, and then if
 * more segments need to be searched, we will make another 'num_terms' lookups, once for all of
 * the remaining segments.
 *
 * When performing lookups in the MultiSegmentTermDictionary, for each supported segment, we save
 * a list of termIds from that segment for all the searched terms that appear in that segment.
 *
 * For example, when querying for UserIdMultiSegmentQuery with user ids: {1L, 2L, 3L} and
 * segments: {1, 2}, where segment 1 has user ids {1L, 2L} indexed under termIds {100, 200},
 * and segment 2 has user ids {1L, 2L, 3L} indexed under termIds {200, 300, 400}, we will build
 * up the following map once:
 *   segment1 -> [100, 200]
 *   segment2 -> [200, 300, 400]
 */
public class UserIdMultiSegmentQuery extends Query {
  @VisibleForTesting
  public static final SearchTimerStats TERM_LOOKUP_STATS =
      SearchTimerStats.export("multi_segment_query_term_lookup", TimeUnit.NANOSECONDS, false);
  public static final SearchTimerStats QUERY_FROM_PRECOMPUTED =
      SearchTimerStats.export("multi_segment_query_from_precomputed", TimeUnit.NANOSECONDS, false);
  public static final SearchTimerStats QUERY_REGULAR =
      SearchTimerStats.export("multi_segment_query_regular", TimeUnit.NANOSECONDS, false);

  @VisibleForTesting
  public static final SearchCounter USED_MULTI_SEGMENT_TERM_DICTIONARY_COUNT = SearchCounter.export(
      "user_id_multi_segment_query_used_multi_segment_term_dictionary_count");
  @VisibleForTesting
  public static final SearchCounter USED_ORIGINAL_TERM_DICTIONARY_COUNT = SearchCounter.export(
      "user_id_multi_segment_query_used_original_term_dictionary_count");

  private static final SearchCounter NEW_QUERY_COUNT =
      SearchCounter.export("user_id_multi_segment_new_query_count");
  private static final SearchCounter OLD_QUERY_COUNT =
      SearchCounter.export("user_id_multi_segment_old_query_count");

  private static final HashMap<String, SearchCounter> QUERY_COUNT_BY_QUERY_NAME = new HashMap<>();
  private static final HashMap<String, SearchCounter> QUERY_COUNT_BY_FIELD_NAME = new HashMap<>();

  private static final String DECIDER_KEY_PREFIX = "use_multi_segment_id_disjunction_queries_in_";

  /**
   * Returns a new user ID disjunction query.
   *
   * @param ids The user IDs.
   * @param field The field storing the user IDs.
   * @param schemaSnapshot A snapshot of earlybird's schema.
   * @param multiSegmentTermDictionaryManager The manager for the term dictionaries that span
   *        multiple segments.
   * @param decider The decider.
   * @param earlybirdCluster The earlybird cluster.
   * @param ranks The hit attribution ranks to be assigned to every user ID.
   * @param hitAttributeHelper The helper that tracks hit attributions.
   * @param queryTimeout The timeout to be enforced on this query.
   * @return A new user ID disjunction query.
   */
  public static Query createIdDisjunctionQuery(
      String queryName,
      List<Long> ids,
      String field,
      ImmutableSchemaInterface schemaSnapshot,
      MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager,
      Decider decider,
      EarlybirdCluster earlybirdCluster,
      List<Integer> ranks,
      @Nullable HitAttributeHelper hitAttributeHelper,
      @Nullable QueryTimeout queryTimeout) throws QueryParserException {
    QUERY_COUNT_BY_QUERY_NAME.computeIfAbsent(queryName, name ->
        SearchCounter.export("multi_segment_query_name_" + name)).increment();
    QUERY_COUNT_BY_FIELD_NAME.computeIfAbsent(field, name ->
        SearchCounter.export("multi_segment_query_count_for_field_" + name)).increment();

    if (DeciderUtil.isAvailableForRandomRecipient(decider, getDeciderName(earlybirdCluster))) {
      NEW_QUERY_COUNT.increment();
      MultiSegmentTermDictionary multiSegmentTermDictionary =
          multiSegmentTermDictionaryManager.getMultiSegmentTermDictionary(field);
      return new UserIdMultiSegmentQuery(
          ids,
          field,
          schemaSnapshot,
          multiSegmentTermDictionary,
          ranks,
          hitAttributeHelper,
          queryTimeout);
    } else {
      OLD_QUERY_COUNT.increment();
      return new IDDisjunctionQuery(ids, field, schemaSnapshot);
    }
  }

  @VisibleForTesting
  public static String getDeciderName(EarlybirdCluster earlybirdCluster) {
    return DECIDER_KEY_PREFIX + earlybirdCluster.name().toLowerCase();
  }

  private final boolean useOrderPreservingEncoding;
  private final HitAttributeHelper hitAttributeHelper;
  private final QueryTimeout queryTimeout;
  private final MultiSegmentTermDictionary multiSegmentTermDictionary;
  private final Schema.FieldInfo fieldInfo;
  private final String field;
  private final List<Long> ids;

  private final List<Integer> ranks;
  // For each segment where we have a multi-segment term dictionary, this map will contain the
  // termIds of all the terms that actually appear in that segment's index.
  @Nullable
  private Map<InvertedIndex, List<TermRankPair>> termIdsPerSegment;

  // A wrapper class that associates a termId with its corresponding search operator rank, if one exists.
  private final class TermRankPair {
    private final int termId;
    private final int rank;

    TermRankPair(int termId, int rank) {
      this.termId = termId;
      this.rank = rank;
    }

    public int getTermId() {
      return termId;
    }

    public int getRank() {
      return rank;
    }
  }

  @VisibleForTesting
  public UserIdMultiSegmentQuery(
      List<Long> ids,
      String field,
      ImmutableSchemaInterface schemaSnapshot,
      MultiSegmentTermDictionary termDictionary,
      List<Integer> ranks,
      @Nullable HitAttributeHelper hitAttributeHelper,
      @Nullable QueryTimeout queryTimeout) {
    this.field = field;
    this.ids = ids;
    this.multiSegmentTermDictionary = termDictionary;
    this.ranks = ranks;
    this.hitAttributeHelper = hitAttributeHelper;
    this.queryTimeout = queryTimeout;

    // Check that ids and ranks have the same size.
    Preconditions.checkArgument(ranks.size() == 0 || ranks.size() == ids.size());
    // hitAttributeHelper is not null iff ranks is not empty
    if (ranks.size() > 0) {
      Preconditions.checkNotNull(hitAttributeHelper);
    } else {
      Preconditions.checkArgument(hitAttributeHelper == null);
    }

    if (!schemaSnapshot.hasField(field)) {
      throw new IllegalStateException("Tried to search a field which does not exist in schema");
    }
    this.fieldInfo = Preconditions.checkNotNull(schemaSnapshot.getFieldInfo(field));

    IndexedNumericFieldSettings numericFieldSettings =
        fieldInfo.getFieldType().getNumericFieldSettings();
    if (numericFieldSettings == null) {
      throw new IllegalStateException("Id field is not numerical");
    }

    this.useOrderPreservingEncoding = numericFieldSettings.isUseSortableEncoding();
  }

  /**
   * If it hasn't been built yet, build up the map containing termIds of all the terms being
   * searched, for all of the segments that are managed by the multi-segment term dictionary.
   *
   * We only do this once, when we have to search the first segment that's supported by our
   * multi-segment term dictionary.
   *
   * Flow here is to:
   * 1. go through all the ids being queried.
   * 2. for each id, get the termIds for that term in all of the segments in the term dictionary
   * 3. for all of the segments that have that term, add the termId to that segment's list of
   *    term ids (in the 'termIdsPerSegment' map).
   */
  private void createTermIdsPerSegment() {
    if (termIdsPerSegment != null) {
      // already created the map
      return;
    }

    long start = System.nanoTime();

    final BytesRef termRef = useOrderPreservingEncoding
        ? SortableLongTermAttributeImpl.newBytesRef()
        : LongTermAttributeImpl.newBytesRef();

    termIdsPerSegment = Maps.newHashMap();
    List<? extends InvertedIndex> segmentIndexes = multiSegmentTermDictionary.getSegmentIndexes();

    for (int idx = 0; idx < ids.size(); ++idx) {
      long longTerm = ids.get(idx);

      if (useOrderPreservingEncoding) {
        SortableLongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
      } else {
        LongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
      }

      int[] termIds = multiSegmentTermDictionary.lookupTermIds(termRef);
      Preconditions.checkState(segmentIndexes.size() == termIds.length,
          "SegmentIndexes: %s, field: %s, termIds: %s",
          segmentIndexes.size(), field, termIds.length);

      for (int indexId = 0; indexId < termIds.length; indexId++) {
        int termId = termIds[indexId];
        if (termId != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) {
          InvertedIndex fieldIndex = segmentIndexes.get(indexId);

          List<TermRankPair> termIdsList = termIdsPerSegment.get(fieldIndex);
          if (termIdsList == null) {
            termIdsList = Lists.newArrayList();
            termIdsPerSegment.put(fieldIndex, termIdsList);
          }
          termIdsList.add(new TermRankPair(
              termId, ranks.size() > 0 ? ranks.get(idx) : -1));
        }
      }
    }

    long elapsed = System.nanoTime() - start;
    TERM_LOOKUP_STATS.timerIncrement(elapsed);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new UserIdMultiSegmentQueryWeight(searcher, scoreMode, boost);
  }

  @Override
  public int hashCode() {
    return Arrays.hashCode(
        new Object[] {useOrderPreservingEncoding, queryTimeout, field, ids, ranks});
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof UserIdMultiSegmentQuery)) {
      return false;
    }

    UserIdMultiSegmentQuery query = UserIdMultiSegmentQuery.class.cast(obj);
    return Arrays.equals(
        new Object[] {useOrderPreservingEncoding, queryTimeout, field, ids, ranks},
        new Object[] {query.useOrderPreservingEncoding,
            query.queryTimeout,
            query.field,
            query.ids,
            query.ranks});
  }

  @Override
  public String toString(String fieldName) {
    StringBuilder builder = new StringBuilder();
    builder.append(getClass().getSimpleName()).append("[").append(fieldName).append(":");
    for (Long id : this.ids) {
      builder.append(id);
      builder.append(",");
    }
    builder.setLength(builder.length() - 1);
    builder.append("]");
    return builder.toString();
  }

  private final class UserIdMultiSegmentQueryWeight extends ConstantScoreWeight {
    private final IndexSearcher searcher;
    private final ScoreMode scoreMode;

    private UserIdMultiSegmentQueryWeight(
        IndexSearcher searcher,
        ScoreMode scoreMode,
        float boost) {
      super(UserIdMultiSegmentQuery.this, boost);
      this.searcher = searcher;
      this.scoreMode = scoreMode;
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      Weight weight = rewrite(context);
      if (weight != null) {
        return weight.scorer(context);
      } else {
        return null;
      }
    }

    @Override
    public BulkScorer bulkScorer(LeafReaderContext context) throws IOException {
      Weight weight = rewrite(context);
      if (weight != null) {
        return weight.bulkScorer(context);
      } else {
        return null;
      }
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      terms.addAll(ids
          .stream()
          .map(id -> new Term(field, LongTermAttributeImpl.copyIntoNewBytesRef(id)))
          .collect(Collectors.toSet()));
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return true;
    }

    private Weight rewrite(LeafReaderContext context) throws IOException {
      final Terms terms = context.reader().terms(field);
      if (terms == null) {
        // field does not exist
        return null;
      }
      final TermsEnum termsEnum = terms.iterator();
      Preconditions.checkNotNull(termsEnum, "No termsEnum for field: %s", field);

      BooleanQuery bq;
      // See if the segment is supported by the multi-segment term dictionary. If so, build up
      // the query using the termIds from the multi-segment term dictionary.
      // If not (for the current segment), do the term lookups directly in the queried segment.
      InvertedIndex fieldIndex = getFieldIndexFromMultiTermDictionary(context);
      if (fieldIndex != null) {
        createTermIdsPerSegment();

        USED_MULTI_SEGMENT_TERM_DICTIONARY_COUNT.increment();
        SearchTimer timer = QUERY_FROM_PRECOMPUTED.startNewTimer();
        bq = addPrecomputedTermQueries(fieldIndex, termsEnum);
        QUERY_FROM_PRECOMPUTED.stopTimerAndIncrement(timer);
      } else {
        USED_ORIGINAL_TERM_DICTIONARY_COUNT.increment();
        // This segment is not supported by the multi-segment term dictionary. Look up terms
        // directly.
        SearchTimer timer = QUERY_REGULAR.startNewTimer();
        bq = addTermQueries(termsEnum);
        QUERY_REGULAR.stopTimerAndIncrement(timer);
      }

      return searcher.rewrite(new ConstantScoreQuery(bq)).createWeight(
          searcher, scoreMode, score());
    }

    /**
     * If the multi-segment term dictionary supports this segment/LeafReader, then return the
     * InvertedIndex representing this segment.
     *
     * If the segment being queried right now is not in the multi-segment term dictionary (e.g.
     * if it's not optimized yet), return null.
     */
    @Nullable
    private InvertedIndex getFieldIndexFromMultiTermDictionary(LeafReaderContext context)
        throws IOException {
      if (multiSegmentTermDictionary == null) {
        return null;
      }

      if (context.reader() instanceof EarlybirdIndexSegmentAtomicReader) {
        EarlybirdIndexSegmentAtomicReader reader =
            (EarlybirdIndexSegmentAtomicReader) context.reader();

        EarlybirdIndexSegmentData segmentData = reader.getSegmentData();
        InvertedIndex fieldIndex = segmentData.getFieldIndex(field);

        if (multiSegmentTermDictionary.supportSegmentIndex(fieldIndex)) {
          return fieldIndex;
        }
      }

      return null;
    }

    private BooleanQuery addPrecomputedTermQueries(
        InvertedIndex fieldIndex,
        TermsEnum termsEnum) throws IOException {

      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      int numClauses = 0;

      List<TermRankPair> termRankPairs = termIdsPerSegment.get(fieldIndex);
      if (termRankPairs != null) {
        for (TermRankPair pair : termRankPairs) {
          int termId = pair.getTermId();
          if (numClauses >= BooleanQuery.getMaxClauseCount()) {
            BooleanQuery saved = bqBuilder.build();
            bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(saved, BooleanClause.Occur.SHOULD);
            numClauses = 1;
          }

          Query query;
          if (pair.getRank() != -1) {
            query = EarlybirdQueryHelper.maybeWrapWithHitAttributionCollector(
                new SimpleTermQuery(termsEnum, termId),
                pair.getRank(),
                fieldInfo,
                hitAttributeHelper);
          } else {
            query = new SimpleTermQuery(termsEnum, termId);
          }
          bqBuilder.add(EarlybirdQueryHelper.maybeWrapWithTimeout(query, queryTimeout),
              BooleanClause.Occur.SHOULD);
          ++numClauses;
        }
      }
      return bqBuilder.build();
    }

    private BooleanQuery addTermQueries(TermsEnum termsEnum) throws IOException {
      final BytesRef termRef = useOrderPreservingEncoding
          ? SortableLongTermAttributeImpl.newBytesRef()
          : LongTermAttributeImpl.newBytesRef();

      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      int numClauses = 0;

      for (int idx = 0; idx < ids.size(); ++idx) {
        long longTerm = ids.get(idx);
        if (useOrderPreservingEncoding) {
          SortableLongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
        } else {
          LongTermAttributeImpl.copyLongToBytesRef(termRef, longTerm);
        }

        if (termsEnum.seekExact(termRef)) {
          if (numClauses >= BooleanQuery.getMaxClauseCount()) {
            BooleanQuery saved = bqBuilder.build();
            bqBuilder = new BooleanQuery.Builder();
            bqBuilder.add(saved, BooleanClause.Occur.SHOULD);
            numClauses = 1;
          }

          if (ranks.size() > 0) {
            bqBuilder.add(EarlybirdQueryHelper.maybeWrapWithHitAttributionCollector(
                new SimpleTermQuery(termsEnum, termsEnum.ord()),
                ranks.get(idx),
|
||||
fieldInfo,
|
||||
hitAttributeHelper),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
} else {
|
||||
bqBuilder.add(new SimpleTermQuery(termsEnum, termsEnum.ord()),
|
||||
BooleanClause.Occur.SHOULD);
|
||||
}
|
||||
++numClauses;
|
||||
}
|
||||
}
|
||||
|
||||
return bqBuilder.build();
|
||||
}
|
||||
}
|
||||
}
|
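A minimal, self-contained sketch of the clause-chunking pattern that addPrecomputedTermQueries and addTermQueries above both use to stay under Lucene's BooleanQuery clause limit, assuming only the public Lucene API; the "user_id" field name and the term values are placeholders, not anything from this commit.

import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class ChunkedDisjunctionSketch {
  static Query build(List<String> terms) {
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    int numClauses = 0;
    for (String value : terms) {
      if (numClauses >= BooleanQuery.getMaxClauseCount()) {
        // Wrap everything accumulated so far as a single nested SHOULD clause,
        // then keep adding terms to a fresh builder.
        BooleanQuery saved = builder.build();
        builder = new BooleanQuery.Builder();
        builder.add(saved, BooleanClause.Occur.SHOULD);
        numClauses = 1;
      }
      builder.add(new TermQuery(new Term("user_id", value)), BooleanClause.Occur.SHOULD);
      ++numClauses;
    }
    return builder.build();
  }
}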
Binary file not shown.
@ -1,82 +0,0 @@
package com.twitter.search.earlybird.search.queries;

import java.io.IOException;
import java.util.Objects;

import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;

import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.query.FilteredQuery;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap;
import com.twitter.search.earlybird.index.TweetIDMapper;

/**
 * Filter that can be used with searches over geo field postings lists in order to filter out
 * tweets that have been geo scrubbed. Determines if a tweet has been geo scrubbed by comparing
 * the tweet's id against the max scrubbed tweet id for that tweet's author, which is stored in
 * the UserScrubGeoMap.
 *
 * See: go/realtime-geo-filtering
 */
public class UserScrubGeoFilter implements FilteredQuery.DocIdFilterFactory {

  private UserScrubGeoMap userScrubGeoMap;

  private final SearchRateCounter totalRequestsUsingFilterCounter =
      SearchRateCounter.export("user_scrub_geo_filter_total_requests");

  public static FilteredQuery.DocIdFilterFactory getDocIdFilterFactory(
      UserScrubGeoMap userScrubGeoMap) {
    return new UserScrubGeoFilter(userScrubGeoMap);
  }

  public UserScrubGeoFilter(UserScrubGeoMap userScrubGeoMap) {
    this.userScrubGeoMap = userScrubGeoMap;
    totalRequestsUsingFilterCounter.increment();
  }

  @Override
  public FilteredQuery.DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException {
    // To determine if a given doc has been geo scrubbed we need two pieces of information about
    // the doc: the associated tweet id and the user id of the tweet's author. We can get the
    // tweet id from the TweetIDMapper for the segment we are currently searching, and we can get
    // the user id of the tweet's author by looking up the doc id in the NumericDocValues for the
    // FROM_USER_ID_CSF.
    //
    // With this information we can check the UserScrubGeoMap to find out if the tweet has been
    // geo scrubbed and filter it out accordingly.
    final EarlybirdIndexSegmentAtomicReader currTwitterReader =
        (EarlybirdIndexSegmentAtomicReader) context.reader();
    final TweetIDMapper tweetIdMapper =
        (TweetIDMapper) currTwitterReader.getSegmentData().getDocIDToTweetIDMapper();
    final NumericDocValues fromUserIdDocValues = currTwitterReader.getNumericDocValues(
        EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName());
    return (docId) -> fromUserIdDocValues.advanceExact(docId)
        && !userScrubGeoMap.isTweetGeoScrubbed(
            tweetIdMapper.getTweetID(docId), fromUserIdDocValues.longValue());
  }

  @Override
  public String toString() {
    return "UserScrubGeoFilter";
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof UserScrubGeoFilter)) {
      return false;
    }

    UserScrubGeoFilter filter = UserScrubGeoFilter.class.cast(obj);
    // filters are considered equal as long as they are using the same UserScrubGeoMap
    return Objects.equals(userScrubGeoMap, filter.userScrubGeoMap);
  }

  @Override
  public int hashCode() {
    return userScrubGeoMap == null ? 0 : userScrubGeoMap.hashCode();
  }
}
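A small illustrative sketch of the check the filter above delegates to: comparing a tweet id against the author's max scrubbed tweet id. The map and method here are hypothetical stand-ins, since UserScrubGeoMap itself is not part of this hunk.

import java.util.HashMap;
import java.util.Map;

final class GeoScrubCheckSketch {
  // Hypothetical stand-in for UserScrubGeoMap: authorUserId -> max scrubbed tweet id.
  private final Map<Long, Long> maxScrubbedTweetIdPerUser = new HashMap<>();

  boolean isTweetGeoScrubbed(long tweetId, long authorUserId) {
    // Snowflake tweet ids grow over time, so any tweet at or below the recorded max was
    // created before the scrub request and should be dropped from geo searches.
    Long maxScrubbed = maxScrubbedTweetIdPerUser.get(authorUserId);
    return maxScrubbed != null && tweetId <= maxScrubbed;
  }
}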
Binary file not shown.
@ -1,422 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import java.util.Arrays;
import java.util.List;

import com.google.common.collect.Lists;

import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;

public class LinearScoringData {
  public static final float NO_BOOST_VALUE = 1.0f;

  // A signal value so we can tell if something is unset, also used in explanation.
  public static final int UNSET_SIGNAL_VALUE = -999;

  // This is somewhat arbitrary, and is here so that we have some limit on
  // how many offline experimental features we support per query.
  public static final int MAX_OFFLINE_EXPERIMENTAL_FIELDS = 5;

  public enum SkipReason {
    NOT_SKIPPED,
    ANTIGAMING,
    LOW_REPUTATION,
    LOW_TEXT_SCORE,
    LOW_RETWEET_COUNT,
    LOW_FAV_COUNT,
    SOCIAL_FILTER,
    LOW_FINAL_SCORE
  }

  // When you add fields here, make sure you also update the clear() function.
  public double luceneScore;
  public double textScore;
  // I am not sure why this has to be double...
  public double tokenAt140DividedByNumTokensBucket;
  public double userRep;
  public double parusScore;
  public final double[] offlineExpFeatureValues = new double[MAX_OFFLINE_EXPERIMENTAL_FIELDS];

  // v1 engagement counters
  public double retweetCountPostLog2;
  public double favCountPostLog2;
  public double replyCountPostLog2;
  public double embedsImpressionCount;
  public double embedsUrlCount;
  public double videoViewCount;

  // v2 engagement counters (that have a v1 counterpart)
  public double retweetCountV2;
  public double favCountV2;
  public double replyCountV2;
  public double embedsImpressionCountV2;
  public double embedsUrlCountV2;
  public double videoViewCountV2;
  // pure v2 engagement counters, they started v2 only
  public double quotedCount;
  public double weightedRetweetCount;
  public double weightedReplyCount;
  public double weightedFavCount;
  public double weightedQuoteCount;

  // card related properties
  public boolean hasCard;
  public byte cardType;

  public boolean hasUrl;
  public boolean isReply;
  public boolean isRetweet;
  public boolean isOffensive;
  public boolean hasTrend;
  public boolean isFromVerifiedAccount;
  public boolean isFromBlueVerifiedAccount;
  public boolean isUserSpam;
  public boolean isUserNSFW;
  public boolean isUserBot;
  public boolean isUserAntiSocial;
  public boolean hasVisibleLink;

  public double luceneContrib;
  public double reputationContrib;
  public double textScoreContrib;
  public double favContrib;
  public double replyContrib;
  public double multipleReplyContrib;
  public double retweetContrib;
  public double parusContrib;
  public final double[] offlineExpFeatureContributions =
      new double[MAX_OFFLINE_EXPERIMENTAL_FIELDS];
  public double embedsImpressionContrib;
  public double embedsUrlContrib;
  public double videoViewContrib;
  public double quotedContrib;

  public double hasUrlContrib;
  public double isReplyContrib;
  public double isFollowRetweetContrib;
  public double isTrustedRetweetContrib;

  // Value passed in the request (ThriftRankingParams.querySpecificScoreAdjustments)
  public double querySpecificScore;

  // Value passed in the request (ThriftRankingParams.authorSpecificScoreAdjustments)
  public double authorSpecificScore;

  public double normalizedLuceneScore;

  public int tweetLangId;
  public double uiLangMult;
  public double userLangMult;
  public boolean hasDifferentLang;
  public boolean hasEnglishTweetAndDifferentUILang;
  public boolean hasEnglishUIAndDifferentTweetLang;

  public int tweetAgeInSeconds;
  public double ageDecayMult;

  // Intermediate scores
  public double scoreBeforeBoost;
  public double scoreAfterBoost;
  public double scoreFinal;
  public double scoreReturned;

  public SkipReason skipReason;

  public boolean isTrusted;
  public boolean isFollow;
  public boolean spamUserDampApplied;
  public boolean nsfwUserDampApplied;
  public boolean botUserDampApplied;
  public boolean trustedCircleBoostApplied;
  public boolean directFollowBoostApplied;
  public boolean outOfNetworkReplyPenaltyApplied;
  public boolean hasMultipleHashtagsOrTrends;

  public boolean tweetHasTrendsBoostApplied;
  public boolean tweetFromVerifiedAccountBoostApplied;
  public boolean tweetFromBlueVerifiedAccountBoostApplied;
  public boolean hasCardBoostApplied;
  public boolean cardDomainMatchBoostApplied;
  public boolean cardAuthorMatchBoostApplied;
  public boolean cardTitleMatchBoostApplied;
  public boolean cardDescriptionMatchBoostApplied;

  public List<String> hitFields;
  public boolean hasNoTextHitDemotionApplied;
  public boolean hasUrlOnlyHitDemotionApplied;
  public boolean hasNameOnlyHitDemotionApplied;
  public boolean hasSeparateTextAndNameHitDemotionApplied;
  public boolean hasSeparateTextAndUrlHitDemotionApplied;

  public long fromUserId;
  // This is actually the retweet status ID, i.e. the ID of the original tweet being (natively)
  // retweeted.
  public long sharedStatusId;
  public long referenceAuthorId; // SEARCH-8564

  public boolean isSelfTweet;
  public boolean selfTweetBoostApplied;
  public double selfTweetMult;

  public boolean hasImageUrl;
  public boolean hasVideoUrl;
  public boolean hasMedialUrlBoostApplied;
  public boolean hasNewsUrl;
  public boolean hasNewsUrlBoostApplied;

  public boolean hasConsumerVideo;
  public boolean hasProVideo;
  public boolean hasVine;
  public boolean hasPeriscope;
  public boolean hasNativeImage;
  public boolean isNullcast;
  public boolean hasQuote;

  public boolean isSensitiveContent;
  public boolean hasMultipleMediaFlag;
  public boolean profileIsEggFlag;
  public boolean isUserNewFlag;

  public int numMentions;
  public int numHashtags;
  public int linkLanguage;
  public int prevUserTweetEngagement;

  public boolean isComposerSourceCamera;

  // health model scores by HML
  public double toxicityScore; // go/toxicity
  public double pBlockScore; // go/pblock
  public double pSpammyTweetScore; // go/pspammytweet
  public double pReportedTweetScore; // go/preportedtweet
  public double spammyTweetContentScore; // go/spammy-tweet-content
  public double experimentalHealthModelScore1;
  public double experimentalHealthModelScore2;
  public double experimentalHealthModelScore3;
  public double experimentalHealthModelScore4;

  public LinearScoringData() {
    hitFields = Lists.newArrayList();
    clear();
  }

  // The following three counters were added later and are denormalized in the standard way.
  // You can choose to apply scaling (for the legacy LinearScoringFunction) or not apply it
  // (for returning in metadata and display in debug).
  public double getEmbedsImpressionCount(boolean scaleForScoring) {
    return scaleForScoring ? logWith0(embedsImpressionCount) : embedsImpressionCount;
  }

  public double getEmbedsUrlCount(boolean scaleForScoring) {
    return scaleForScoring ? logWith0(embedsUrlCount) : embedsUrlCount;
  }

  public double getVideoViewCount(boolean scaleForScoring) {
    return scaleForScoring ? logWith0(videoViewCount) : videoViewCount;
  }

  private static double logWith0(double value) {
    return value > 0 ? Math.log(value) : 0.0;
  }

  /**
   * Returns a string description of all data stored in this instance.
   */
  public String getPropertyExplanation() {
    StringBuilder sb = new StringBuilder();
    sb.append(hasCard ? "CARD " + SearchCardType.cardTypeFromByteValue(cardType) : "");
    sb.append(hasUrl ? "URL " : "");
    sb.append(isReply ? "REPLY " : "");
    sb.append(isRetweet ? "RETWEET " : "");
    sb.append(isOffensive ? "OFFENSIVE " : "");
    sb.append(hasTrend ? "TREND " : "");
    sb.append(hasMultipleHashtagsOrTrends ? "HASHTAG/TREND+ " : "");
    sb.append(isFromVerifiedAccount ? "VERIFIED " : "");
    sb.append(isFromBlueVerifiedAccount ? "BLUE_VERIFIED " : "");
    sb.append(isUserSpam ? "SPAM " : "");
    sb.append(isUserNSFW ? "NSFW " : "");
    sb.append(isUserBot ? "BOT " : "");
    sb.append(isUserAntiSocial ? "ANTISOCIAL " : "");
    sb.append(isTrusted ? "TRUSTED " : "");
    sb.append(isFollow ? "FOLLOW " : "");
    sb.append(isSelfTweet ? "SELF " : "");
    sb.append(hasImageUrl ? "IMAGE " : "");
    sb.append(hasVideoUrl ? "VIDEO " : "");
    sb.append(hasNewsUrl ? "NEWS " : "");
    sb.append(isNullcast ? "NULLCAST " : "");
    sb.append(hasQuote ? "QUOTE " : "");
    sb.append(isComposerSourceCamera ? "Composer Source: CAMERA " : "");
    sb.append(favCountPostLog2 > 0 ? "Faves:" + favCountPostLog2 + " " : "");
    sb.append(retweetCountPostLog2 > 0 ? "Retweets:" + retweetCountPostLog2 + " " : "");
    sb.append(replyCountPostLog2 > 0 ? "Replies:" + replyCountPostLog2 + " " : "");
    sb.append(getEmbedsImpressionCount(false) > 0
        ? "Embedded Imps:" + getEmbedsImpressionCount(false) + " " : "");
    sb.append(getEmbedsUrlCount(false) > 0
        ? "Embedded Urls:" + getEmbedsUrlCount(false) + " " : "");
    sb.append(getVideoViewCount(false) > 0
        ? "Video views:" + getVideoViewCount(false) + " " : "");
    sb.append(weightedRetweetCount > 0 ? "Weighted Retweets:"
        + ((int) weightedRetweetCount) + " " : "");
    sb.append(weightedReplyCount > 0
        ? "Weighted Replies:" + ((int) weightedReplyCount) + " " : "");
    sb.append(weightedFavCount > 0
        ? "Weighted Faves:" + ((int) weightedFavCount) + " " : "");
    sb.append(weightedQuoteCount > 0
        ? "Weighted Quotes:" + ((int) weightedQuoteCount) + " " : "");
    return sb.toString();
  }

  /**
   * Resets all data stored in this instance.
   */
  public void clear() {
    luceneScore = UNSET_SIGNAL_VALUE;
    textScore = UNSET_SIGNAL_VALUE;
    tokenAt140DividedByNumTokensBucket = UNSET_SIGNAL_VALUE;
    userRep = UNSET_SIGNAL_VALUE;
    retweetCountPostLog2 = UNSET_SIGNAL_VALUE;
    favCountPostLog2 = UNSET_SIGNAL_VALUE;
    replyCountPostLog2 = UNSET_SIGNAL_VALUE;
    parusScore = UNSET_SIGNAL_VALUE;
    Arrays.fill(offlineExpFeatureValues, 0);
    embedsImpressionCount = UNSET_SIGNAL_VALUE;
    embedsUrlCount = UNSET_SIGNAL_VALUE;
    videoViewCount = UNSET_SIGNAL_VALUE;
    // v2 engagement, these each have a v1 counterpart
    retweetCountV2 = UNSET_SIGNAL_VALUE;
    favCountV2 = UNSET_SIGNAL_VALUE;
    replyCountV2 = UNSET_SIGNAL_VALUE;
    embedsImpressionCountV2 = UNSET_SIGNAL_VALUE;
    embedsUrlCountV2 = UNSET_SIGNAL_VALUE;
    videoViewCountV2 = UNSET_SIGNAL_VALUE;
    // new engagement counters, they only have one version with the v2 normalizer
    quotedCount = UNSET_SIGNAL_VALUE;
    weightedRetweetCount = UNSET_SIGNAL_VALUE;
    weightedReplyCount = UNSET_SIGNAL_VALUE;
    weightedFavCount = UNSET_SIGNAL_VALUE;
    weightedQuoteCount = UNSET_SIGNAL_VALUE;

    hasUrl = false;
    isReply = false;
    isRetweet = false;
    isOffensive = false;
    hasTrend = false;
    isFromVerifiedAccount = false;
    isFromBlueVerifiedAccount = false;
    isUserSpam = false;
    isUserNSFW = false;
    isUserBot = false;
    isUserAntiSocial = false;
    hasVisibleLink = false;
    isNullcast = false;

    luceneContrib = UNSET_SIGNAL_VALUE;
    reputationContrib = UNSET_SIGNAL_VALUE;
    textScoreContrib = UNSET_SIGNAL_VALUE;
    replyContrib = UNSET_SIGNAL_VALUE;
    multipleReplyContrib = UNSET_SIGNAL_VALUE;
    retweetContrib = UNSET_SIGNAL_VALUE;
    favContrib = UNSET_SIGNAL_VALUE;
    parusContrib = UNSET_SIGNAL_VALUE;
    Arrays.fill(offlineExpFeatureContributions, 0);
    embedsImpressionContrib = UNSET_SIGNAL_VALUE;
    embedsUrlContrib = UNSET_SIGNAL_VALUE;
    videoViewContrib = UNSET_SIGNAL_VALUE;
    hasUrlContrib = UNSET_SIGNAL_VALUE;
    isReplyContrib = UNSET_SIGNAL_VALUE;

    querySpecificScore = UNSET_SIGNAL_VALUE;
    authorSpecificScore = UNSET_SIGNAL_VALUE;

    normalizedLuceneScore = NO_BOOST_VALUE;

    tweetLangId = ThriftLanguage.UNKNOWN.getValue();
    uiLangMult = NO_BOOST_VALUE;
    userLangMult = NO_BOOST_VALUE;
    hasDifferentLang = false;
    hasEnglishTweetAndDifferentUILang = false;
    hasEnglishUIAndDifferentTweetLang = false;

    tweetAgeInSeconds = 0;
    ageDecayMult = NO_BOOST_VALUE;

    // Intermediate scores
    scoreBeforeBoost = UNSET_SIGNAL_VALUE;
    scoreAfterBoost = UNSET_SIGNAL_VALUE;
    scoreFinal = UNSET_SIGNAL_VALUE;
    scoreReturned = UNSET_SIGNAL_VALUE;

    skipReason = SkipReason.NOT_SKIPPED;

    isTrusted = false; // Set later
    isFollow = false; // Set later
    trustedCircleBoostApplied = false;
    directFollowBoostApplied = false;
    outOfNetworkReplyPenaltyApplied = false;
    hasMultipleHashtagsOrTrends = false;
    spamUserDampApplied = false;
    nsfwUserDampApplied = false;
    botUserDampApplied = false;

    tweetHasTrendsBoostApplied = false;
    tweetFromVerifiedAccountBoostApplied = false;
    tweetFromBlueVerifiedAccountBoostApplied = false;

    fromUserId = UNSET_SIGNAL_VALUE;
    sharedStatusId = UNSET_SIGNAL_VALUE;
    referenceAuthorId = UNSET_SIGNAL_VALUE;

    isSelfTweet = false;
    selfTweetBoostApplied = false;
    selfTweetMult = NO_BOOST_VALUE;

    trustedCircleBoostApplied = false;
    directFollowBoostApplied = false;

    hasImageUrl = false;
    hasVideoUrl = false;
    hasMedialUrlBoostApplied = false;
    hasNewsUrl = false;
    hasNewsUrlBoostApplied = false;

    hasCard = false;
    cardType = SearchCardType.UNKNOWN.getByteValue();
    hasCardBoostApplied = false;
    cardDomainMatchBoostApplied = false;
    cardAuthorMatchBoostApplied = false;
    cardTitleMatchBoostApplied = false;
    cardDescriptionMatchBoostApplied = false;

    hitFields.clear();
    hasNoTextHitDemotionApplied = false;
    hasUrlOnlyHitDemotionApplied = false;
    hasNameOnlyHitDemotionApplied = false;
    hasSeparateTextAndNameHitDemotionApplied = false;
    hasSeparateTextAndUrlHitDemotionApplied = false;

    hasConsumerVideo = false;
    hasProVideo = false;
    hasVine = false;
    hasPeriscope = false;
    hasNativeImage = false;

    isSensitiveContent = false;
    hasMultipleMediaFlag = false;
    profileIsEggFlag = false;
    numMentions = 0;
    numHashtags = 0;
    isUserNewFlag = false;
    linkLanguage = 0;
    prevUserTweetEngagement = 0;

    isComposerSourceCamera = false;

    // health model scores by HML
    toxicityScore = UNSET_SIGNAL_VALUE;
    pBlockScore = UNSET_SIGNAL_VALUE;
    pSpammyTweetScore = UNSET_SIGNAL_VALUE;
    pReportedTweetScore = UNSET_SIGNAL_VALUE;
    spammyTweetContentScore = UNSET_SIGNAL_VALUE;
    experimentalHealthModelScore1 = UNSET_SIGNAL_VALUE;
    experimentalHealthModelScore2 = UNSET_SIGNAL_VALUE;
    experimentalHealthModelScore3 = UNSET_SIGNAL_VALUE;
    experimentalHealthModelScore4 = UNSET_SIGNAL_VALUE;
  }
}
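A tiny runnable sketch of what the scaleForScoring flag above does for the three late-added counters: logWith0 maps zero (and the negative unset sentinel) to 0.0 and positive counts to their natural log, so the legacy linear scorer sees a dampened value while debug output keeps the raw count. The class and main method are illustrative only.

final class LogWith0Sketch {
  static double logWith0(double value) {
    return value > 0 ? Math.log(value) : 0.0;
  }

  public static void main(String[] args) {
    System.out.println(logWith0(0));     // 0.0   (no views)
    System.out.println(logWith0(1000));  // ~6.91 (dampened for scoring)
    System.out.println(logWith0(-999));  // 0.0   (UNSET_SIGNAL_VALUE stays harmless)
  }
}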
Binary file not shown.
@ -1,304 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import java.util.Arrays;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;

import com.twitter.search.common.constants.SearchCardType;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.ranking.thriftjava.ThriftAgeDecayRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftCardRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.util.lang.ThriftLanguageUtil;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSocialFilterType;

/*
 * The class for all query-specific parameters, including the parameters from the relevanceOptions
 * and values that are extracted from the request itself.
 */
public class LinearScoringParams {

  public static final double DEFAULT_FEATURE_WEIGHT = 0;
  public static final double DEFAULT_FEATURE_MIN_VAL = 0;
  public static final double DEFAULT_NO_BOOST = 1.0;

  @VisibleForTesting
  static final SearchCounter NULL_USER_LANGS_KEY =
      SearchCounter.export("linear_scoring_params_null_user_langs_key");

  public final double luceneWeight;
  public final double textScoreWeight;
  public final double textScoreMinVal;
  public final double retweetWeight;
  public final double retweetMinVal;
  public final double favWeight;
  public final double favMinVal;
  public final double replyWeight;
  public final double multipleReplyWeight;
  public final double multipleReplyMinVal;
  public final double isReplyWeight;
  public final double parusWeight;
  public final double embedsImpressionWeight;
  public final double embedsUrlWeight;
  public final double videoViewWeight;
  public final double quotedCountWeight;

  public final double[] rankingOfflineExpWeights =
      new double[LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS];

  public final boolean applyBoosts;

  // Storing ranking params for cards; avoid using maps for faster lookup.
  public final double[] hasCardBoosts = new double[SearchCardType.values().length];
  public final double[] cardDomainMatchBoosts = new double[SearchCardType.values().length];
  public final double[] cardAuthorMatchBoosts = new double[SearchCardType.values().length];
  public final double[] cardTitleMatchBoosts = new double[SearchCardType.values().length];
  public final double[] cardDescriptionMatchBoosts = new double[SearchCardType.values().length];

  public final double urlWeight;
  public final double reputationWeight;
  public final double reputationMinVal;
  public final double followRetweetWeight;
  public final double trustedRetweetWeight;

  // Adjustments for specific tweets (tweetId -> score)
  public final Map<Long, Double> querySpecificScoreAdjustments;

  // Adjustments for tweets posted by specific authors (userId -> score)
  public final Map<Long, Double> authorSpecificScoreAdjustments;

  public final double offensiveDamping;
  public final double spamUserDamping;
  public final double nsfwUserDamping;
  public final double botUserDamping;
  public final double trustedCircleBoost;
  public final double directFollowBoost;
  public final double minScore;

  public final boolean applyFiltersAlways;

  public final boolean useLuceneScoreAsBoost;
  public final double maxLuceneScoreBoost;

  public final double langEnglishTweetDemote;
  public final double langEnglishUIDemote;
  public final double langDefaultDemote;
  public final boolean useUserLanguageInfo;
  public final double unknownLanguageBoost;

  public final double outOfNetworkReplyPenalty;

  public final boolean useAgeDecay;
  public final double ageDecayHalflife;
  public final double ageDecayBase;
  public final double ageDecaySlope;

  // hit attribute demotions
  public final boolean enableHitDemotion;
  public final double noTextHitDemotion;
  public final double urlOnlyHitDemotion;
  public final double nameOnlyHitDemotion;
  public final double separateTextAndNameHitDemotion;
  public final double separateTextAndUrlHitDemotion;

  // trends related params
  public final double tweetHasTrendBoost;
  public final double multipleHashtagsOrTrendsDamping;

  public final double tweetFromVerifiedAccountBoost;

  public final double tweetFromBlueVerifiedAccountBoost;

  public final ThriftSocialFilterType socialFilterType;
  public final int uiLangId;
  // Confidences of the understandability of different languages for this user.
  public final double[] userLangs = new double[ThriftLanguage.values().length];

  public final long searcherId;
  public final double selfTweetBoost;

  public final double tweetHasMediaUrlBoost;
  public final double tweetHasNewsUrlBoost;

  // Whether we need metadata indicating which tweet a reply is in reply to.
  public final boolean getInReplyToStatusId;

  // Initialize from a ranking parameter
  public LinearScoringParams(ThriftSearchQuery searchQuery, ThriftRankingParams params) {
    // weights
    luceneWeight = params.isSetLuceneScoreParams()
        ? params.getLuceneScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    textScoreWeight = params.isSetTextScoreParams()
        ? params.getTextScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    retweetWeight = params.isSetRetweetCountParams()
        ? params.getRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    favWeight = params.isSetFavCountParams()
        ? params.getFavCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    replyWeight = params.isSetReplyCountParams()
        ? params.getReplyCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    multipleReplyWeight = params.isSetMultipleReplyCountParams()
        ? params.getMultipleReplyCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    parusWeight = params.isSetParusScoreParams()
        ? params.getParusScoreParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
      Byte featureTypeByte = (byte) i;
      // default weight is 0, thus contribution for unset feature value will be 0.
      rankingOfflineExpWeights[i] = params.getOfflineExperimentalFeatureRankingParamsSize() > 0
          && params.getOfflineExperimentalFeatureRankingParams().containsKey(featureTypeByte)
          ? params.getOfflineExperimentalFeatureRankingParams().get(featureTypeByte).getWeight()
          : DEFAULT_FEATURE_WEIGHT;
    }
    embedsImpressionWeight = params.isSetEmbedsImpressionCountParams()
        ? params.getEmbedsImpressionCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    embedsUrlWeight = params.isSetEmbedsUrlCountParams()
        ? params.getEmbedsUrlCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    videoViewWeight = params.isSetVideoViewCountParams()
        ? params.getVideoViewCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    quotedCountWeight = params.isSetQuotedCountParams()
        ? params.getQuotedCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;

    applyBoosts = params.isApplyBoosts();

    // configure card values
    Arrays.fill(hasCardBoosts, DEFAULT_NO_BOOST);
    Arrays.fill(cardAuthorMatchBoosts, DEFAULT_NO_BOOST);
    Arrays.fill(cardDomainMatchBoosts, DEFAULT_NO_BOOST);
    Arrays.fill(cardTitleMatchBoosts, DEFAULT_NO_BOOST);
    Arrays.fill(cardDescriptionMatchBoosts, DEFAULT_NO_BOOST);
    if (params.isSetCardRankingParams()) {
      for (SearchCardType cardType : SearchCardType.values()) {
        byte cardTypeIndex = cardType.getByteValue();
        ThriftCardRankingParams rankingParams = params.getCardRankingParams().get(cardTypeIndex);
        if (rankingParams != null) {
          hasCardBoosts[cardTypeIndex] = rankingParams.getHasCardBoost();
          cardAuthorMatchBoosts[cardTypeIndex] = rankingParams.getAuthorMatchBoost();
          cardDomainMatchBoosts[cardTypeIndex] = rankingParams.getDomainMatchBoost();
          cardTitleMatchBoosts[cardTypeIndex] = rankingParams.getTitleMatchBoost();
          cardDescriptionMatchBoosts[cardTypeIndex] = rankingParams.getDescriptionMatchBoost();
        }
      }
    }

    urlWeight = params.isSetUrlParams()
        ? params.getUrlParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    reputationWeight = params.isSetReputationParams()
        ? params.getReputationParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    isReplyWeight = params.isSetIsReplyParams()
        ? params.getIsReplyParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    followRetweetWeight = params.isSetDirectFollowRetweetCountParams()
        ? params.getDirectFollowRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;
    trustedRetweetWeight = params.isSetTrustedCircleRetweetCountParams()
        ? params.getTrustedCircleRetweetCountParams().getWeight() : DEFAULT_FEATURE_WEIGHT;

    querySpecificScoreAdjustments = params.getQuerySpecificScoreAdjustments();
    authorSpecificScoreAdjustments = params.getAuthorSpecificScoreAdjustments();

    // min/max filters
    textScoreMinVal = params.isSetTextScoreParams()
        ? params.getTextScoreParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
    reputationMinVal = params.isSetReputationParams()
        ? params.getReputationParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
    multipleReplyMinVal = params.isSetMultipleReplyCountParams()
        ? params.getMultipleReplyCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
    retweetMinVal = params.isSetRetweetCountParams() && params.getRetweetCountParams().isSetMin()
        ? params.getRetweetCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;
    favMinVal = params.isSetFavCountParams() && params.getFavCountParams().isSetMin()
        ? params.getFavCountParams().getMin() : DEFAULT_FEATURE_MIN_VAL;

    // boosts
    spamUserDamping = params.isSetSpamUserBoost() ? params.getSpamUserBoost() : 1.0;
    nsfwUserDamping = params.isSetNsfwUserBoost() ? params.getNsfwUserBoost() : 1.0;
    botUserDamping = params.isSetBotUserBoost() ? params.getBotUserBoost() : 1.0;
    offensiveDamping = params.getOffensiveBoost();
    trustedCircleBoost = params.getInTrustedCircleBoost();
    directFollowBoost = params.getInDirectFollowBoost();

    // language boosts
    langEnglishTweetDemote = params.getLangEnglishTweetBoost();
    langEnglishUIDemote = params.getLangEnglishUIBoost();
    langDefaultDemote = params.getLangDefaultBoost();
    useUserLanguageInfo = params.isUseUserLanguageInfo();
    unknownLanguageBoost = params.getUnknownLanguageBoost();

    // hit demotions
    enableHitDemotion = params.isEnableHitDemotion();
    noTextHitDemotion = params.getNoTextHitDemotion();
    urlOnlyHitDemotion = params.getUrlOnlyHitDemotion();
    nameOnlyHitDemotion = params.getNameOnlyHitDemotion();
    separateTextAndNameHitDemotion = params.getSeparateTextAndNameHitDemotion();
    separateTextAndUrlHitDemotion = params.getSeparateTextAndUrlHitDemotion();

    outOfNetworkReplyPenalty = params.getOutOfNetworkReplyPenalty();

    if (params.isSetAgeDecayParams()) {
      // new age decay settings
      ThriftAgeDecayRankingParams ageDecayParams = params.getAgeDecayParams();
      ageDecaySlope = ageDecayParams.getSlope();
      ageDecayHalflife = ageDecayParams.getHalflife();
      ageDecayBase = ageDecayParams.getBase();
      useAgeDecay = true;
    } else if (params.isSetDeprecatedAgeDecayBase()
        && params.isSetDeprecatedAgeDecayHalflife()
        && params.isSetDeprecatedAgeDecaySlope()) {
      ageDecaySlope = params.getDeprecatedAgeDecaySlope();
      ageDecayHalflife = params.getDeprecatedAgeDecayHalflife();
      ageDecayBase = params.getDeprecatedAgeDecayBase();
      useAgeDecay = true;
    } else {
      ageDecaySlope = 0.0;
      ageDecayHalflife = 0.0;
      ageDecayBase = 0.0;
      useAgeDecay = false;
    }

    // trends
    tweetHasTrendBoost = params.getTweetHasTrendBoost();
    multipleHashtagsOrTrendsDamping = params.getMultipleHashtagsOrTrendsBoost();

    // verified accounts
    tweetFromVerifiedAccountBoost = params.getTweetFromVerifiedAccountBoost();
    tweetFromBlueVerifiedAccountBoost = params.getTweetFromBlueVerifiedAccountBoost();

    // score filter
    minScore = params.getMinScore();

    applyFiltersAlways = params.isApplyFiltersAlways();

    useLuceneScoreAsBoost = params.isUseLuceneScoreAsBoost();
    maxLuceneScoreBoost = params.getMaxLuceneScoreBoost();

    searcherId = searchQuery.isSetSearcherId() ? searchQuery.getSearcherId() : -1;
    selfTweetBoost = params.getSelfTweetBoost();

    socialFilterType = searchQuery.getSocialFilterType();

    // the UI language and the confidences of the languages the user can understand.
    if (!searchQuery.isSetUiLang() || searchQuery.getUiLang().isEmpty()) {
      uiLangId = ThriftLanguage.UNKNOWN.getValue();
    } else {
      uiLangId = ThriftLanguageUtil.getThriftLanguageOf(searchQuery.getUiLang()).getValue();
    }
    if (searchQuery.getUserLangsSize() > 0) {
      for (Map.Entry<ThriftLanguage, Double> lang : searchQuery.getUserLangs().entrySet()) {
        ThriftLanguage thriftLanguage = lang.getKey();
        // SEARCH-13441
        if (thriftLanguage != null) {
          userLangs[thriftLanguage.getValue()] = lang.getValue();
        } else {
          NULL_USER_LANGS_KEY.increment();
        }
      }
    }

    // For now, we will use the same boost for both image and video.
    tweetHasMediaUrlBoost = params.getTweetHasImageUrlBoost();
    tweetHasNewsUrlBoost = params.getTweetHasNewsUrlBoost();

    getInReplyToStatusId =
        searchQuery.isSetResultMetadataOptions()
            && searchQuery.getResultMetadataOptions().isSetGetInReplyToStatusId()
            && searchQuery.getResultMetadataOptions().isGetInReplyToStatusId();
  }
}
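A small illustrative sketch of the array-indexed card boost lookup used above, where the per-hit read is a plain array access keyed by the card type's byte value instead of a Map lookup. The enum and the 1.2 boost value are hypothetical stand-ins, not values from this commit.

final class CardBoostLookupSketch {
  // Hypothetical stand-in for SearchCardType.
  enum CardType { UNKNOWN, PLAYER, SUMMARY; byte byteValue() { return (byte) ordinal(); } }

  private final double[] hasCardBoosts = new double[CardType.values().length];

  CardBoostLookupSketch() {
    java.util.Arrays.fill(hasCardBoosts, 1.0);           // 1.0 == no boost
    hasCardBoosts[CardType.SUMMARY.byteValue()] = 1.2;    // hypothetical configured boost
  }

  double boostFor(CardType type) {
    return hasCardBoosts[type.byteValue()];
  }
}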
Binary file not shown.
@ -1,163 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import java.io.IOException;
import java.util.Objects;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.encoding.features.ByteNormalizer;
import com.twitter.search.common.encoding.features.ClampByteNormalizer;
import com.twitter.search.common.encoding.features.SingleBytePositiveFloatNormalizer;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.query.FilteredQuery;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;

public final class MinFeatureValueFilter extends Query implements FilteredQuery.DocIdFilterFactory {
  private final String featureName;
  private final ByteNormalizer normalizer;
  private final double minValue;

  /**
   * Creates a query that filters out all hits that have a value smaller than the given threshold
   * for the given feature.
   *
   * @param featureName The feature.
   * @param minValue The threshold for the feature values.
   * @return A query that filters out all hits that have a value smaller than the given threshold
   *         for the given feature.
   */
  public static Query getMinFeatureValueFilter(String featureName, double minValue) {
    return new BooleanQuery.Builder()
        .add(new MinFeatureValueFilter(featureName, minValue), BooleanClause.Occur.FILTER)
        .build();
  }

  public static FilteredQuery.DocIdFilterFactory getDocIdFilterFactory(
      String featureName, double minValue) {
    return new MinFeatureValueFilter(featureName, minValue);
  }

  /**
   * Returns the normalizer that should be used to normalize the values for the given feature.
   *
   * @param featureName The feature.
   * @return The normalizer that should be used to normalize the values for the given feature.
   */
  @VisibleForTesting
  public static ByteNormalizer getMinFeatureValueNormalizer(String featureName) {
    if (featureName.equals(EarlybirdFieldConstant.USER_REPUTATION.getFieldName())) {
      return new ClampByteNormalizer(0, 100);
    }

    if (featureName.equals(EarlybirdFieldConstant.FAVORITE_COUNT.getFieldName())
        || featureName.equals(EarlybirdFieldConstant.PARUS_SCORE.getFieldName())
        || featureName.equals(EarlybirdFieldConstant.REPLY_COUNT.getFieldName())
        || featureName.equals(EarlybirdFieldConstant.RETWEET_COUNT.getFieldName())) {
      return new SingleBytePositiveFloatNormalizer();
    }

    throw new IllegalArgumentException("Unknown normalization method for field " + featureName);
  }

  @Override
  public int hashCode() {
    // Probably doesn't make sense to include the schemaSnapshot and normalizer here.
    return (int) ((featureName == null ? 0 : featureName.hashCode() * 7) + minValue);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof MinFeatureValueFilter)) {
      return false;
    }

    // Probably doesn't make sense to include the schemaSnapshot and normalizer here.
    MinFeatureValueFilter filter = MinFeatureValueFilter.class.cast(obj);
    return Objects.equals(featureName, filter.featureName) && (minValue == filter.minValue);
  }

  @Override
  public String toString(String field) {
    return String.format("MinFeatureValueFilter(%s, %f)", featureName, minValue);
  }

  private MinFeatureValueFilter(String featureName, double minValue) {
    this.featureName = featureName;
    this.normalizer = getMinFeatureValueNormalizer(featureName);
    this.minValue = normalizer.normalize(minValue);
  }

  @Override
  public FilteredQuery.DocIdFilter getDocIdFilter(LeafReaderContext context) throws IOException {
    final NumericDocValues featureDocValues = context.reader().getNumericDocValues(featureName);
    return (docId) -> featureDocValues.advanceExact(docId)
        && ((byte) featureDocValues.longValue() >= minValue);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        return new MinFeatureValueDocIdSetIterator(
            context.reader(), featureName, minValue);
      }
    };
  }

  private static final class MinFeatureValueDocIdSetIterator extends RangeFilterDISI {
    private final NumericDocValues featureDocValues;
    private final double minValue;

    MinFeatureValueDocIdSetIterator(LeafReader indexReader,
                                    String featureName,
                                    double minValue) throws IOException {
      super(indexReader);
      this.featureDocValues = indexReader.getNumericDocValues(featureName);
      this.minValue = minValue;
    }

    @Override
    public boolean shouldReturnDoc() throws IOException {
      // We need this explicit casting to byte, because of how we encode and decode features in
      // our encoded_tweet_features field. If a feature is an int (uses all 32 bits of the int),
      // then encoding the feature and then decoding it preserves its original value. However, if
      // the feature does not use the entire int (and especially if it uses bits somewhere in the
      // middle of the int), then the feature value is assumed to be unsigned when it goes through
      // this process of encoding and decoding. So a user rep of
      // RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL (-128) will be correctly encoded as
      // the binary value 10000000, but will be treated as an unsigned value when decoded, and
      // therefore the decoded value will be 128.
      //
      // In retrospect, this seems like a really poor design decision. It seems like it would be
      // better if all feature values were considered to be signed, even if most features can
      // never have negative values. Unfortunately, making this change is not easy, because some
      // features store normalized values, so we would also need to change the range of allowed
      // values produced by those normalizers, as well as all code that depends on those values.
      //
      // So for now, just cast this value to a byte, to get the proper negative value.
      return featureDocValues.advanceExact(docID())
          && ((byte) featureDocValues.longValue() >= minValue);
    }
  }

  public double getMinValue() {
    return minValue;
  }

  public ByteNormalizer getNormalizer() {
    return normalizer;
  }
}
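A tiny runnable sketch of why shouldReturnDoc() above casts the doc value to byte before comparing: a user reputation stored as the -128 sentinel comes back from the unsigned decode path as 128, and the narrowing cast restores the intended signed value so the threshold check behaves correctly. The numbers below illustrate the comment's own example; nothing here is taken from other code.

final class SignedByteDecodeSketch {
  public static void main(String[] args) {
    long decoded = 128L;           // unsigned view of the stored byte 0b1000_0000
    byte signed = (byte) decoded;  // -128, the original sentinel
    double minValue = 0;
    System.out.println(decoded >= minValue);  // true  (would wrongly pass the filter)
    System.out.println(signed >= minValue);   // false (correctly filtered out)
  }
}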
Binary file not shown.
@ -1,104 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import java.util.Comparator;

import javax.annotation.Nullable;

import com.google.common.base.Preconditions;

import com.twitter.common_internal.collections.RandomAccessPriorityQueue;
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
import com.twitter.search.earlybird.search.Hit;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;

public class RelevanceHit extends Hit
    implements RandomAccessPriorityQueue.SignatureProvider<TweetIntegerShingleSignature> {
  @Nullable
  private TweetIntegerShingleSignature signature;

  public RelevanceHit() {
    super(Long.MAX_VALUE, Long.MAX_VALUE);
  }

  public RelevanceHit(long timeSliceID, long statusID,
                      TweetIntegerShingleSignature signature,
                      ThriftSearchResultMetadata metadata) {
    super(timeSliceID, statusID);
    update(timeSliceID, statusID, signature, metadata);
  }

  /**
   * Updates the data for this relevance hit.
   *
   * @param timeSliceID The timeslice ID of the segment that the hit came from.
   * @param statusID The hit's tweet ID.
   * @param tweetSignature The tweet signature generated for this hit.
   * @param metadata The metadata associated with this hit.
   */
  public void update(long timeSliceID, long statusID, TweetIntegerShingleSignature tweetSignature,
                     ThriftSearchResultMetadata metadata) {
    this.statusID = statusID;
    this.timeSliceID = timeSliceID;
    this.metadata = Preconditions.checkNotNull(metadata);
    this.signature = Preconditions.checkNotNull(tweetSignature);
  }

  /**
   * Returns the computed score for this hit.
   */
  public float getScore() {
    if (metadata != null) {
      return (float) metadata.getScore();
    } else {
      return ScoringFunction.SKIP_HIT;
    }
  }

  // We want the score as a double (and not cast to a float) for COMPARATOR_BY_SCORE and
  // PQ_COMPARATOR_BY_SCORE so that the results returned from Earlybirds will be sorted based on
  // the scores in the ThriftSearchResultMetadata objects (and will not lose precision by being
  // cast to floats). Thus, the sorted order on Earlybirds and Earlybird Roots will be consistent.
  private double getScoreDouble() {
    if (metadata != null) {
      return metadata.getScore();
    } else {
      return (double) ScoringFunction.SKIP_HIT;
    }
  }

  @Override @Nullable
  public TweetIntegerShingleSignature getSignature() {
    return signature;
  }

  @Override
  public String toString() {
    return "RelevanceHit[tweetID=" + statusID + ",timeSliceID=" + timeSliceID
        + ",score=" + (metadata == null ? "null" : metadata.getScore())
        + ",signature=" + (signature == null ? "null" : signature) + "]";
  }

  public static final Comparator<RelevanceHit> COMPARATOR_BY_SCORE =
      (d1, d2) -> {
        // if two docs have the same score, then the first one (most recent) wins
        if (d1.getScore() == d2.getScore()) {
          return Long.compare(d2.getStatusID(), d1.getStatusID());
        }
        return Double.compare(d2.getScoreDouble(), d1.getScoreDouble());
      };

  public static final Comparator<RelevanceHit> PQ_COMPARATOR_BY_SCORE =
      (d1, d2) -> {
        // Reverse the order
        return COMPARATOR_BY_SCORE.compare(d2, d1);
      };

  @Override
  public void clear() {
    timeSliceID = Long.MAX_VALUE;
    statusID = Long.MAX_VALUE;
    metadata = null;
    signature = null;
  }
}
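A short runnable sketch of the precision point made in the comment above getScoreDouble(): two scores that differ only past float precision compare as equal once cast to float, which could change the sort order between Earlybirds and roots. The score values are made up for illustration.

final class ScorePrecisionSketch {
  public static void main(String[] args) {
    double a = 0.123456789012;
    double b = 0.123456789013;
    System.out.println(Double.compare(b, a));                 // 1: b ranks ahead of a
    System.out.println(Float.compare((float) b, (float) a));  // 0: tie after the float cast
  }
}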
Binary file not shown.
@ -1,66 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import com.google.common.base.Preconditions;

import org.apache.lucene.search.Query;

import com.twitter.search.common.search.TerminationTracker;
import com.twitter.search.earlybird.QualityFactor;
import com.twitter.search.earlybird.search.SearchRequestInfo;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;

public class RelevanceSearchRequestInfo extends SearchRequestInfo {
  private final ThriftSearchRelevanceOptions relevanceOptions;

  public RelevanceSearchRequestInfo(
      ThriftSearchQuery searchQuery, Query query,
      TerminationTracker terminationTracker, QualityFactor qualityFactor) {
    super(addResultMetadataOptionsIfUnset(searchQuery), query, terminationTracker, qualityFactor);
    this.relevanceOptions = searchQuery.getRelevanceOptions();
  }

  private static ThriftSearchQuery addResultMetadataOptionsIfUnset(ThriftSearchQuery searchQuery) {
    if (!searchQuery.isSetResultMetadataOptions()) {
      searchQuery.setResultMetadataOptions(new ThriftSearchResultMetadataOptions());
    }
    return searchQuery;
  }

  @Override
  protected int calculateMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
    ThriftSearchRelevanceOptions searchRelevanceOptions = thriftSearchQuery.getRelevanceOptions();

    // Don't use the value from the ThriftSearchQuery object if one is provided in the
    // relevance options.
    int requestedMaxHitsToProcess = searchRelevanceOptions.isSetMaxHitsToProcess()
        ? searchRelevanceOptions.getMaxHitsToProcess()
        : super.calculateMaxHitsToProcess(thriftSearchQuery);

    return qualityFactorMaxHitsToProcess(getNumResultsRequested(), requestedMaxHitsToProcess);
  }

  public ThriftSearchRelevanceOptions getRelevanceOptions() {
    return this.relevanceOptions;
  }

  /**
   * Reduces maxHitsToProcess based on the quality factor. Never reduces it below numResults.
   *
   * @param numResults The number of results requested.
   * @param maxHitsToProcess The requested maximum number of hits to process.
   * @return The reduced maxHitsToProcess.
   */
  public int qualityFactorMaxHitsToProcess(int numResults, int maxHitsToProcess) {
    Preconditions.checkNotNull(qualityFactor);

    // Skip quality factoring if maxHitsToProcess is already below the numResults lower bound.
    if (numResults > maxHitsToProcess) {
      return maxHitsToProcess;
    }

    double currentQualityFactor = qualityFactor.get();
    return Math.max(numResults, (int) (currentQualityFactor * maxHitsToProcess));
  }
}
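A minimal worked example of the quality-factor clamp above in plain numbers: with 20 results requested, a 10,000 maxHitsToProcess budget and a quality factor of 0.25, the budget drops to 2,500, and no factor can push it below the number of requested results. The concrete values are illustrative only.

final class QualityFactorSketch {
  static int qualityFactorMaxHitsToProcess(int numResults, int maxHitsToProcess, double qf) {
    if (numResults > maxHitsToProcess) {
      return maxHitsToProcess;  // already below the lower bound, leave it alone
    }
    return Math.max(numResults, (int) (qf * maxHitsToProcess));
  }

  public static void main(String[] args) {
    System.out.println(qualityFactorMaxHitsToProcess(20, 10000, 0.25));    // 2500
    System.out.println(qualityFactorMaxHitsToProcess(20, 10000, 0.0001));  // 20
  }
}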
Binary file not shown.
@ -1,37 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import com.twitter.search.earlybird.search.Hit;
import com.twitter.search.earlybird.search.SimpleSearchResults;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

public class RelevanceSearchResults extends SimpleSearchResults {
  public final ThriftSearchResultMetadata[] resultMetadata;
  private ThriftSearchResultsRelevanceStats relevanceStats = null;
  private long scoringTimeNanos = 0;

  public RelevanceSearchResults(int size) {
    super(size);
    this.resultMetadata = new ThriftSearchResultMetadata[size];
  }

  public void setHit(Hit hit, int hitIndex) {
    hits[hitIndex] = hit;
    resultMetadata[hitIndex] = hit.getMetadata();
  }

  public void setRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
    this.relevanceStats = relevanceStats;
  }

  public ThriftSearchResultsRelevanceStats getRelevanceStats() {
    return relevanceStats;
  }

  public void setScoringTimeNanos(long scoringTimeNanos) {
    this.scoringTimeNanos = scoringTimeNanos;
  }

  public long getScoringTimeNanos() {
    return scoringTimeNanos;
  }
}
Binary file not shown.
@ -1,138 +0,0 @@
package com.twitter.search.earlybird.search.relevance;

import java.io.IOException;

import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider;
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider.NamedScoringFunctionProvider;

/**
 * This filter only accepts documents for which the provided
 * {@link com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction}
 * returns a score that's greater than or equal to the passed-in minScore and smaller than or
 * equal to maxScore.
 */
public final class ScoreFilterQuery extends Query {
  private static final float DEFAULT_LUCENE_SCORE = 1.0F;

  private final float minScore;
  private final float maxScore;
  private final NamedScoringFunctionProvider scoringFunctionProvider;
  private final ImmutableSchemaInterface schema;

  /**
   * Returns a score filter.
   *
   * @param schema The schema to use to extract the feature scores.
   * @param scoringFunctionProvider The scoring function provider.
   * @param minScore The minimum score threshold.
   * @param maxScore The maximum score threshold.
   * @return A score filter with the given configuration.
   */
  public static Query getScoreFilterQuery(
      ImmutableSchemaInterface schema,
      NamedScoringFunctionProvider scoringFunctionProvider,
      float minScore,
      float maxScore) {
    return new BooleanQuery.Builder()
        .add(new ScoreFilterQuery(schema, scoringFunctionProvider, minScore, maxScore),
            BooleanClause.Occur.FILTER)
        .build();
  }

  private ScoreFilterQuery(ImmutableSchemaInterface schema,
                           NamedScoringFunctionProvider scoringFunctionProvider,
                           float minScore,
                           float maxScore) {
    this.schema = schema;
    this.scoringFunctionProvider = scoringFunctionProvider;
    this.minScore = minScore;
    this.maxScore = maxScore;
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        ScoringFunction scoringFunction = scoringFunctionProvider.getScoringFunction();
        scoringFunction.setNextReader((EarlybirdIndexSegmentAtomicReader) context.reader());
        return new ScoreFilterDocIdSetIterator(
            context.reader(), scoringFunction, minScore, maxScore);
      }
    };
  }

  private static final class ScoreFilterDocIdSetIterator extends RangeFilterDISI {
    private final ScoringFunction scoringFunction;
    private final float minScore;
    private final float maxScore;

    public ScoreFilterDocIdSetIterator(LeafReader indexReader, ScoringFunction scoringFunction,
                                       float minScore, float maxScore) throws IOException {
      super(indexReader);
      this.scoringFunction = scoringFunction;
      this.minScore = minScore;
      this.maxScore = maxScore;
    }

    @Override
    protected boolean shouldReturnDoc() throws IOException {
      float score = scoringFunction.score(docID(), DEFAULT_LUCENE_SCORE);
      return score >= minScore && score <= maxScore;
    }
  }

  public float getMinScoreForTest() {
    return minScore;
  }

  public float getMaxScoreForTest() {
    return maxScore;
  }

  public ScoringFunctionProvider getScoringFunctionProviderForTest() {
    return scoringFunctionProvider;
  }

  @Override
  public int hashCode() {
    return (int) (minScore * 29
        + maxScore * 17
        + (scoringFunctionProvider == null ? 0 : scoringFunctionProvider.hashCode()));
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof ScoreFilterQuery)) {
      return false;
    }

    ScoreFilterQuery filter = ScoreFilterQuery.class.cast(obj);
    return (minScore == filter.minScore)
        && (maxScore == filter.maxScore)
        && (scoringFunctionProvider == null
            ? filter.scoringFunctionProvider == null
            : scoringFunctionProvider.equals(filter.scoringFunctionProvider));
  }
||||
|
||||
@Override
|
||||
public String toString(String field) {
|
||||
return "SCORE_FILTER_QUERY[minScore=" + minScore + ",maxScore=" + maxScore + "]";
|
||||
}
|
||||
}
|
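A minimal usage sketch of the factory above, assuming a schema, a NamedScoringFunctionProvider and a main Lucene query are already available from the surrounding searcher code (none of them are constructed in this diff); it keeps only documents whose relevance score falls in [0.5, 10.0]:

// Hypothetical inputs; only the ScoreFilterQuery API below comes from this file.
Query scoreFilter = ScoreFilterQuery.getScoreFilterQuery(
    schema, scoringFunctionProvider, 0.5f, 10.0f);
Query filtered = new BooleanQuery.Builder()
    .add(mainQuery, BooleanClause.Occur.MUST)
    .add(scoreFilter, BooleanClause.Occur.FILTER)
    .build();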
Binary file not shown.
@ -1,147 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.collectors;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
|
||||
import com.twitter.search.core.earlybird.facets.LanguageHistogram;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.search.AbstractResultsCollector;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
|
||||
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
|
||||
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
|
||||
|
||||
/**
|
||||
* AbstractRelevanceCollector is a results collector that collects RelevanceHit results
|
||||
* which include more detailed information than a normal Hit.
|
||||
*/
|
||||
public abstract class AbstractRelevanceCollector
|
||||
extends AbstractResultsCollector<RelevanceSearchRequestInfo, RelevanceSearchResults> {
|
||||
protected final ScoringFunction scoringFunction;
|
||||
private final ThriftSearchResultsRelevanceStats relevanceStats;
|
||||
private final EarlybirdCluster cluster;
|
||||
private final UserTable userTable;
|
||||
|
||||
// Per-language result counts.
|
||||
private final LanguageHistogram languageHistogram = new LanguageHistogram();
|
||||
|
||||
// Accumulated time spent on relevance scoring across all collected hits, including batch scoring.
|
||||
private long scoringTimeNanos = 0;
|
||||
|
||||
public AbstractRelevanceCollector(
|
||||
ImmutableSchemaInterface schema,
|
||||
RelevanceSearchRequestInfo searchRequestInfo,
|
||||
ScoringFunction scoringFunction,
|
||||
EarlybirdSearcherStats searcherStats,
|
||||
EarlybirdCluster cluster,
|
||||
UserTable userTable,
|
||||
Clock clock,
|
||||
int requestDebugMode) {
|
||||
super(schema, searchRequestInfo, clock, searcherStats, requestDebugMode);
|
||||
this.scoringFunction = scoringFunction;
|
||||
this.relevanceStats = new ThriftSearchResultsRelevanceStats();
|
||||
this.cluster = cluster;
|
||||
this.userTable = userTable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclasses must implement this method to actually collect a scored relevance hit.
|
||||
*/
|
||||
protected abstract void doCollectWithScore(long tweetID, float score) throws IOException;
|
||||
|
||||
@Override
|
||||
public final void startSegment() throws IOException {
|
||||
scoringFunction.setNextReader(currTwitterReader);
|
||||
|
||||
ThriftSearchResultMetadataOptions options =
|
||||
searchRequestInfo.getSearchQuery().getResultMetadataOptions();
|
||||
featuresRequested = options != null && options.isReturnSearchResultFeatures();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final void doCollect(long tweetID) throws IOException {
|
||||
final long scoringStartNanos = getClock().nowNanos();
|
||||
float luceneScore = scorer.score();
|
||||
final float score = scoringFunction.score(curDocId, luceneScore);
|
||||
final long scoringEndNanos = getClock().nowNanos();
|
||||
addToOverallScoringTimeNanos(scoringStartNanos, scoringEndNanos);
|
||||
|
||||
scoringFunction.updateRelevanceStats(relevanceStats);
|
||||
|
||||
updateHitCounts(tweetID);
|
||||
|
||||
doCollectWithScore(tweetID, score);
|
||||
}
|
||||
|
||||
protected final void addToOverallScoringTimeNanos(long scoringStartNanos, long scoringEndNanos) {
|
||||
scoringTimeNanos += scoringEndNanos - scoringStartNanos;
|
||||
}
|
||||
|
||||
protected final ThriftSearchResultMetadata collectMetadata() throws IOException {
|
||||
ThriftSearchResultMetadataOptions options =
|
||||
searchRequestInfo.getSearchQuery().getResultMetadataOptions();
|
||||
Preconditions.checkNotNull(options);
|
||||
ThriftSearchResultMetadata metadata =
|
||||
Preconditions.checkNotNull(scoringFunction.getResultMetadata(options));
|
||||
if (metadata.isSetLanguage()) {
|
||||
languageHistogram.increment(metadata.getLanguage().getValue());
|
||||
}
|
||||
|
||||
// Some additional metadata that is not provided by the scoring function, but is instead obtained
|
||||
// by accessing the reader directly.
|
||||
if (currTwitterReader != null) {
|
||||
fillResultGeoLocation(metadata);
|
||||
if (searchRequestInfo.isCollectConversationId()) {
|
||||
long conversationId =
|
||||
documentFeatures.getFeatureValue(EarlybirdFieldConstant.CONVERSATION_ID_CSF);
|
||||
if (conversationId != 0) {
|
||||
ensureExtraMetadataIsSet(metadata);
|
||||
metadata.getExtraMetadata().setConversationId(conversationId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check and collect hit attribution data, if it's available.
|
||||
fillHitAttributionMetadata(metadata);
|
||||
|
||||
long fromUserId = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF);
|
||||
if (searchRequestInfo.isGetFromUserId()) {
|
||||
metadata.setFromUserId(fromUserId);
|
||||
}
|
||||
|
||||
collectExclusiveConversationAuthorId(metadata);
|
||||
collectFacets(metadata);
|
||||
collectFeatures(metadata);
|
||||
collectIsProtected(metadata, cluster, userTable);
|
||||
|
||||
return metadata;
|
||||
}
|
||||
|
||||
protected final ThriftSearchResultsRelevanceStats getRelevanceStats() {
|
||||
return relevanceStats;
|
||||
}
|
||||
|
||||
public final LanguageHistogram getLanguageHistogram() {
|
||||
return languageHistogram;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected final RelevanceSearchResults doGetResults() throws IOException {
|
||||
final RelevanceSearchResults results = doGetRelevanceResults();
|
||||
results.setScoringTimeNanos(scoringTimeNanos);
|
||||
return results;
|
||||
}
|
||||
|
||||
/**
|
||||
* For subclasses to process and aggregate collected hits.
|
||||
*/
|
||||
protected abstract RelevanceSearchResults doGetRelevanceResults() throws IOException;
|
||||
}
|
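For orientation, here is a minimal sketch of what a concrete subclass has to supply, mirroring the constructor signature used by the collectors later in this diff. The class name and its hit-counting behavior are invented for illustration, the imports are the same as those of the sibling collectors, and the sketch assumes the two abstract methods shown in this file are the only ones left to implement.

public class CountingRelevanceCollector extends AbstractRelevanceCollector {
  private int numHits = 0;

  public CountingRelevanceCollector(
      ImmutableSchemaInterface schema,
      RelevanceSearchRequestInfo searchRequestInfo,
      ScoringFunction scoringFunction,
      EarlybirdSearcherStats searcherStats,
      EarlybirdCluster cluster,
      UserTable userTable,
      Clock clock,
      int requestDebugMode) {
    super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
        requestDebugMode);
  }

  @Override
  protected void doCollectWithScore(long tweetID, float score) {
    // A real collector would also call collectMetadata() here; this sketch only counts hits.
    numHits++;
  }

  @Override
  protected RelevanceSearchResults doGetRelevanceResults() {
    RelevanceSearchResults results = new RelevanceSearchResults(0);
    results.setRelevanceStats(getRelevanceStats());
    results.setNumHits(numHits);
    return results;
  }
}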
Binary file not shown.
@ -1,118 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.collectors;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.twitter.common.collections.Pair;
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
|
||||
import com.twitter.search.common.metrics.SearchTimerStats;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
|
||||
import com.twitter.search.common.search.EarlyTerminationState;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
|
||||
import com.twitter.search.earlybird.search.relevance.scoring.BatchHit;
|
||||
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
|
||||
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
|
||||
|
||||
/**
|
||||
* BatchRelevanceTopCollector is similar to the `RelevanceTopCollector` in what it outputs:
|
||||
* Collects the top numResults by score, filtering out duplicates
|
||||
* and results with scores equal to Float.MIN_VALUE.
|
||||
* The way that it achieves that is different though: it will score documents through the batch score
|
||||
* function instead of scoring documents one by one.
|
||||
*/
|
||||
public class BatchRelevanceTopCollector extends RelevanceTopCollector {
|
||||
protected final List<BatchHit> hits;
|
||||
|
||||
public BatchRelevanceTopCollector(
|
||||
ImmutableSchemaInterface schema,
|
||||
RelevanceSearchRequestInfo searchRequestInfo,
|
||||
ScoringFunction scoringFunction,
|
||||
EarlybirdSearcherStats searcherStats,
|
||||
EarlybirdCluster cluster,
|
||||
UserTable userTable,
|
||||
Clock clock,
|
||||
int requestDebugMode) {
|
||||
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
|
||||
requestDebugMode);
|
||||
this.hits = new ArrayList<>((int) getMaxHitsToProcess());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doCollectWithScore(long tweetID, float score) throws IOException {
|
||||
Pair<LinearScoringData, ThriftSearchResultFeatures> pair =
|
||||
scoringFunction.collectFeatures(score);
|
||||
ThriftSearchResultMetadata metadata = collectMetadata();
|
||||
hits.add(new BatchHit(pair.getFirst(),
|
||||
pair.getSecond(),
|
||||
metadata,
|
||||
tweetID,
|
||||
currTimeSliceID));
|
||||
}
|
||||
|
||||
@Override
|
||||
public EarlyTerminationState innerShouldCollectMore() {
|
||||
if (hits.size() >= getMaxHitsToProcess()) {
|
||||
return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED);
|
||||
}
|
||||
return EarlyTerminationState.COLLECTING;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RelevanceSearchResults doGetRelevanceResults() throws IOException {
|
||||
final long scoringStartNanos = getClock().nowNanos();
|
||||
float[] scores = scoringFunction.batchScore(hits);
|
||||
final long scoringEndNanos = getClock().nowNanos();
|
||||
addToOverallScoringTimeNanos(scoringStartNanos, scoringEndNanos);
|
||||
exportBatchScoringTime(scoringEndNanos - scoringStartNanos);
|
||||
|
||||
for (int i = 0; i < hits.size(); i++) {
|
||||
BatchHit hit = hits.get(i);
|
||||
ThriftSearchResultMetadata metadata = hit.getMetadata();
|
||||
|
||||
if (!metadata.isSetExtraMetadata()) {
|
||||
metadata.setExtraMetadata(new ThriftSearchResultExtraMetadata());
|
||||
}
|
||||
metadata.getExtraMetadata().setFeatures(hit.getFeatures());
|
||||
|
||||
|
||||
// Populate the ThriftSearchResultMetadata post batch scoring with information from the
|
||||
// LinearScoringData, which now includes a score.
|
||||
scoringFunction.populateResultMetadataBasedOnScoringData(
|
||||
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
|
||||
metadata,
|
||||
hit.getScoringData());
|
||||
|
||||
collectWithScoreInternal(
|
||||
hit.getTweetID(),
|
||||
hit.getTimeSliceID(),
|
||||
scores[i],
|
||||
metadata
|
||||
);
|
||||
}
|
||||
return getRelevanceResultsInternal();
|
||||
}
|
||||
|
||||
private void exportBatchScoringTime(long scoringTimeNanos) {
|
||||
ThriftSearchRelevanceOptions relevanceOptions = searchRequestInfo.getRelevanceOptions();
|
||||
if (relevanceOptions.isSetRankingParams()
|
||||
&& relevanceOptions.getRankingParams().isSetSelectedTensorflowModel()) {
|
||||
String model = relevanceOptions.getRankingParams().getSelectedTensorflowModel();
|
||||
SearchTimerStats batchScoringPerModelTimer = SearchTimerStats.export(
|
||||
String.format("batch_scoring_time_for_model_%s", model),
|
||||
TimeUnit.NANOSECONDS,
|
||||
false,
|
||||
true);
|
||||
batchScoringPerModelTimer.timerIncrement(scoringTimeNanos);
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,70 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.collectors;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceHit;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
|
||||
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
|
||||
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
|
||||
|
||||
/**
|
||||
* RelevanceAllCollector is a results collector that collects all results sorted by score,
|
||||
* including signature-duplicates and results skipped by the scoring function.
|
||||
*/
|
||||
public class RelevanceAllCollector extends AbstractRelevanceCollector {
|
||||
// All results.
|
||||
protected final List<RelevanceHit> results;
|
||||
|
||||
public RelevanceAllCollector(
|
||||
ImmutableSchemaInterface schema,
|
||||
RelevanceSearchRequestInfo searchRequestInfo,
|
||||
ScoringFunction scoringFunction,
|
||||
EarlybirdSearcherStats searcherStats,
|
||||
EarlybirdCluster cluster,
|
||||
UserTable userTable,
|
||||
Clock clock,
|
||||
int requestDebugMode) {
|
||||
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
|
||||
requestDebugMode);
|
||||
this.results = Lists.newArrayList();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doCollectWithScore(long tweetID, float score) throws IOException {
|
||||
ThriftSearchResultMetadata metadata = collectMetadata();
|
||||
scoringFunction.populateResultMetadataBasedOnScoringData(
|
||||
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
|
||||
metadata,
|
||||
scoringFunction.getScoringDataForCurrentDocument());
|
||||
results.add(new RelevanceHit(
|
||||
currTimeSliceID,
|
||||
tweetID,
|
||||
TweetIntegerShingleSignature.deserialize(metadata.getSignature()),
|
||||
metadata));
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RelevanceSearchResults doGetRelevanceResults() {
|
||||
final int numResults = results.size();
|
||||
RelevanceSearchResults searchResults = new RelevanceSearchResults(numResults);
|
||||
|
||||
// Insert hits in decreasing order by score.
|
||||
results.sort(RelevanceHit.COMPARATOR_BY_SCORE);
|
||||
for (int i = 0; i < numResults; i++) {
|
||||
searchResults.setHit(results.get(i), i);
|
||||
}
|
||||
searchResults.setRelevanceStats(getRelevanceStats());
|
||||
searchResults.setNumHits(numResults);
|
||||
return searchResults;
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,167 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.collectors;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.common_internal.collections.RandomAccessPriorityQueue;
|
||||
import com.twitter.search.common.relevance.features.TweetIntegerShingleSignature;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
|
||||
import com.twitter.search.common.search.EarlyTerminationState;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceHit;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchRequestInfo;
|
||||
import com.twitter.search.earlybird.search.relevance.RelevanceSearchResults;
|
||||
import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunction;
|
||||
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
|
||||
|
||||
/**
|
||||
* RelevanceTopCollector is a results collector that collects the top numResults by
|
||||
* score, filtering out duplicates.
|
||||
*/
|
||||
public class RelevanceTopCollector extends AbstractRelevanceCollector {
|
||||
// Search results are collected in a min-heap.
|
||||
protected final RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature> minQueue;
|
||||
|
||||
// Number of hits actually added to the min queue after dupe filtering and skipping.
|
||||
// Less than or equal to numHitsProcessed.
|
||||
protected int numHitsCollected;
|
||||
|
||||
// The 'top' of the min heap, or, the lowest scored document in the heap.
|
||||
private RelevanceHit pqTop;
|
||||
private float lowestScore = ScoringFunction.SKIP_HIT;
|
||||
|
||||
private final boolean isFilterDupes;
|
||||
|
||||
public RelevanceTopCollector(
|
||||
ImmutableSchemaInterface schema,
|
||||
RelevanceSearchRequestInfo searchRequestInfo,
|
||||
ScoringFunction scoringFunction,
|
||||
EarlybirdSearcherStats searcherStats,
|
||||
EarlybirdCluster cluster,
|
||||
UserTable userTable,
|
||||
Clock clock,
|
||||
int requestDebugMode) {
|
||||
super(schema, searchRequestInfo, scoringFunction, searcherStats, cluster, userTable, clock,
|
||||
requestDebugMode);
|
||||
this.minQueue = new RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature>(
|
||||
searchRequestInfo.getNumResultsRequested(), RelevanceHit.PQ_COMPARATOR_BY_SCORE) {
|
||||
@Override
|
||||
protected RelevanceHit getSentinelObject() {
|
||||
return new RelevanceHit(); // default relevance constructor would create a hit with the
|
||||
// lowest score possible.
|
||||
}
|
||||
};
|
||||
this.pqTop = minQueue.top();
|
||||
this.isFilterDupes = getSearchRequestInfo().getRelevanceOptions().isFilterDups();
|
||||
}
|
||||
|
||||
protected void collectWithScoreInternal(
|
||||
long tweetID,
|
||||
long timeSliceID,
|
||||
float score,
|
||||
ThriftSearchResultMetadata metadata) {
|
||||
// This collector cannot handle NaN scores:
|
||||
assert !Float.isNaN(score);
|
||||
|
||||
if (score <= lowestScore) {
|
||||
// Since docs are returned in-order (i.e., increasing doc Id), a document
|
||||
// with equal score to pqTop.score cannot compete since HitQueue favors
|
||||
// documents with lower doc Ids. Therefore reject those docs too.
|
||||
// IMPORTANT: docs skipped by the scoring function will have scores set
|
||||
// to ScoringFunction.SKIP_HIT, meaning they will not be collected.
|
||||
return;
|
||||
}
|
||||
|
||||
boolean dupFound = false;
|
||||
Preconditions.checkState(metadata.isSetSignature(),
|
||||
"The signature should be set at metadata collection time, but it is null. "
|
||||
+ "Tweet id = %s, metadata = %s",
|
||||
tweetID,
|
||||
metadata);
|
||||
int signatureInt = metadata.getSignature();
|
||||
final TweetIntegerShingleSignature signature =
|
||||
TweetIntegerShingleSignature.deserialize(signatureInt);
|
||||
|
||||
if (isFilterDupes) {
|
||||
// update duplicate if any
|
||||
if (signatureInt != TweetIntegerShingleSignature.DEFAULT_NO_SIGNATURE) {
|
||||
dupFound = minQueue.incrementElement(
|
||||
signature,
|
||||
element -> {
|
||||
if (score > element.getScore()) {
|
||||
element.update(timeSliceID, tweetID, signature, metadata);
|
||||
}
|
||||
}
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (!dupFound) {
|
||||
numHitsCollected++;
|
||||
|
||||
// if we didn't find a duplicate element to update then we add it now as a new element to the
|
||||
// pq
|
||||
pqTop = minQueue.updateTop(top -> top.update(timeSliceID, tweetID, signature, metadata));
|
||||
|
||||
lowestScore = pqTop.getScore();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void doCollectWithScore(final long tweetID, final float score) throws IOException {
|
||||
ThriftSearchResultMetadata metadata = collectMetadata();
|
||||
scoringFunction.populateResultMetadataBasedOnScoringData(
|
||||
searchRequestInfo.getSearchQuery().getResultMetadataOptions(),
|
||||
metadata,
|
||||
scoringFunction.getScoringDataForCurrentDocument());
|
||||
collectWithScoreInternal(tweetID, currTimeSliceID, score, metadata);
|
||||
}
|
||||
|
||||
@Override
|
||||
public EarlyTerminationState innerShouldCollectMore() {
|
||||
// Note that numHitsCollected here might be less than num results collected in the
|
||||
// TwitterEarlyTerminationCollector, if we hit dups or there are very low scores.
|
||||
if (numHitsCollected >= getMaxHitsToProcess()) {
|
||||
return setEarlyTerminationState(EarlyTerminationState.TERMINATED_MAX_HITS_EXCEEDED);
|
||||
}
|
||||
return EarlyTerminationState.COLLECTING;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RelevanceSearchResults doGetRelevanceResults() throws IOException {
|
||||
return getRelevanceResultsInternal();
|
||||
}
|
||||
|
||||
protected RelevanceSearchResults getRelevanceResultsInternal() {
|
||||
return resultsFromQueue(minQueue, getSearchRequestInfo().getNumResultsRequested(),
|
||||
getRelevanceStats());
|
||||
}
|
||||
|
||||
private static RelevanceSearchResults resultsFromQueue(
|
||||
RandomAccessPriorityQueue<RelevanceHit, TweetIntegerShingleSignature> pq,
|
||||
int desiredNumResults,
|
||||
ThriftSearchResultsRelevanceStats relevanceStats) {
|
||||
// Trim first in case we didn't fill up the queue, so that no sentinel values are returned here.
|
||||
int numResults = pq.trim();
|
||||
if (numResults > desiredNumResults) {
|
||||
for (int i = 0; i < numResults - desiredNumResults; i++) {
|
||||
pq.pop();
|
||||
}
|
||||
numResults = desiredNumResults;
|
||||
}
|
||||
RelevanceSearchResults results = new RelevanceSearchResults(numResults);
|
||||
// insert hits in decreasing order by score
|
||||
for (int i = numResults - 1; i >= 0; i--) {
|
||||
RelevanceHit hit = pq.pop();
|
||||
results.setHit(hit, i);
|
||||
}
|
||||
results.setRelevanceStats(relevanceStats);
|
||||
results.setNumHits(numResults);
|
||||
return results;
|
||||
}
|
||||
}
|
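The queue logic above is the standard bounded min-heap pattern for top-K selection, extended with signature-based dedup. Below is a self-contained sketch of just that core pattern in plain Java (the class and method names are invented; the real collector additionally keys the queue by TweetIntegerShingleSignature so duplicates update in place instead of being inserted twice):

import java.util.PriorityQueue;

public class TopKScores {
  private final PriorityQueue<Float> minHeap = new PriorityQueue<>();
  private final int k;

  public TopKScores(int k) {
    this.k = k;
  }

  public void offer(float score) {
    if (minHeap.size() < k) {
      minHeap.add(score);
    } else if (score > minHeap.peek()) {
      minHeap.poll();  // evict the current lowest score
      minHeap.add(score);
    }
  }

  public float lowestRetainedScore() {
    // Mirrors pqTop/lowestScore above: the weakest hit still retained in the queue.
    return minHeap.isEmpty() ? Float.NEGATIVE_INFINITY : minHeap.peek();
  }
}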
Binary file not shown.
@ -1,47 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
|
||||
|
||||
public class BatchHit {
|
||||
private final LinearScoringData scoringData;
|
||||
private final ThriftSearchResultFeatures features;
|
||||
private final ThriftSearchResultMetadata metadata;
|
||||
private final long tweetID;
|
||||
private final long timeSliceID;
|
||||
|
||||
public BatchHit(
|
||||
LinearScoringData scoringData,
|
||||
ThriftSearchResultFeatures features,
|
||||
ThriftSearchResultMetadata metadata,
|
||||
long tweetID,
|
||||
long timeSliceID
|
||||
) {
|
||||
this.scoringData = scoringData;
|
||||
this.features = features;
|
||||
this.metadata = metadata;
|
||||
this.tweetID = tweetID;
|
||||
this.timeSliceID = timeSliceID;
|
||||
}
|
||||
|
||||
public LinearScoringData getScoringData() {
|
||||
return scoringData;
|
||||
}
|
||||
|
||||
public ThriftSearchResultFeatures getFeatures() {
|
||||
return features;
|
||||
}
|
||||
|
||||
public ThriftSearchResultMetadata getMetadata() {
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public long getTweetID() {
|
||||
return tweetID;
|
||||
}
|
||||
|
||||
public long getTimeSliceID() {
|
||||
return timeSliceID;
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,37 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
|
||||
|
||||
/*
|
||||
* A sample scoring function that doesn't do much: it simply passes the Lucene query score through for every document.
|
||||
*/
|
||||
public class DefaultScoringFunction extends ScoringFunction {
|
||||
private float score;
|
||||
|
||||
public DefaultScoringFunction(ImmutableSchemaInterface schema) {
|
||||
super(schema);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected float score(float luceneQueryScore) {
|
||||
score = luceneQueryScore;
|
||||
return luceneQueryScore;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Explanation doExplain(float luceneScore) {
|
||||
// just an example - this scoring function will go away soon
|
||||
return Explanation.match(luceneScore, "luceneScore=" + luceneScore);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
|
||||
relevanceStats.setNumScored(relevanceStats.getNumScored() + 1);
|
||||
if (score == ScoringFunction.SKIP_HIT) {
|
||||
relevanceStats.setNumSkipped(relevanceStats.getNumSkipped() + 1);
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
File diff suppressed because it is too large
Binary file not shown.
@ -1,98 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import com.twitter.search.common.util.ml.prediction_engine.BaseLegacyScoreAccumulator;
|
||||
import com.twitter.search.common.util.ml.prediction_engine.LightweightLinearModel;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
|
||||
import com.twitter.search.modeling.tweet_ranking.TweetScoringFeatures;
|
||||
|
||||
/**
|
||||
* Legacy score accumulator in Earlybird with specific features added.
|
||||
* This class is created to avoid adding LinearScoringData as a dependency to search's common ML
|
||||
* library.
|
||||
*
|
||||
* @deprecated This class is retired and we suggest switching to SchemaBasedScoreAccumulator.
|
||||
*/
|
||||
@Deprecated
|
||||
public class LegacyScoreAccumulator extends BaseLegacyScoreAccumulator<LinearScoringData> {
|
||||
/**
|
||||
* Constructs with a model and LinearScoringData
|
||||
*/
|
||||
LegacyScoreAccumulator(LightweightLinearModel model) {
|
||||
super(model);
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates the accumulator score with features; after this method returns, the score should already
|
||||
* be computed.
|
||||
*
|
||||
* @deprecated This method is retired and we suggest switching to updateScoresWithFeatures in
|
||||
* SchemaBasedScoreAccumulator.
|
||||
*/
|
||||
@Override
|
||||
@Deprecated
|
||||
protected void updateScoreWithFeatures(LinearScoringData data) {
|
||||
addContinuousFeature(TweetScoringFeatures.LUCENE_SCORE, data.luceneScore);
|
||||
addContinuousFeature(TweetScoringFeatures.TEXT_SCORE, data.textScore);
|
||||
addContinuousFeature(TweetScoringFeatures.TWEET_AGE_IN_SECONDS, data.tweetAgeInSeconds);
|
||||
addContinuousFeature(TweetScoringFeatures.REPLY_COUNT, data.replyCountPostLog2);
|
||||
addContinuousFeature(TweetScoringFeatures.RETWEET_COUNT, data.retweetCountPostLog2);
|
||||
addContinuousFeature(TweetScoringFeatures.FAV_COUNT, data.favCountPostLog2);
|
||||
addContinuousFeature(TweetScoringFeatures.REPLY_COUNT_V2, data.replyCountV2);
|
||||
addContinuousFeature(TweetScoringFeatures.RETWEET_COUNT_V2, data.retweetCountV2);
|
||||
addContinuousFeature(TweetScoringFeatures.FAV_COUNT_V2, data.favCountV2);
|
||||
addContinuousFeature(TweetScoringFeatures.EMBEDS_IMPRESSION_COUNT,
|
||||
data.getEmbedsImpressionCount(false));
|
||||
addContinuousFeature(TweetScoringFeatures.EMBEDS_URL_COUNT, data.getEmbedsUrlCount(false));
|
||||
addContinuousFeature(TweetScoringFeatures.VIDEO_VIEW_COUNT, data.getVideoViewCount(false));
|
||||
addContinuousFeature(TweetScoringFeatures.QUOTED_COUNT, data.quotedCount);
|
||||
addContinuousFeature(TweetScoringFeatures.WEIGHTED_RETWEET_COUNT, data.weightedRetweetCount);
|
||||
addContinuousFeature(TweetScoringFeatures.WEIGHTED_REPLY_COUNT, data.weightedReplyCount);
|
||||
addContinuousFeature(TweetScoringFeatures.WEIGHTED_FAV_COUNT, data.weightedFavCount);
|
||||
addContinuousFeature(TweetScoringFeatures.WEIGHTED_QUOTE_COUNT, data.weightedQuoteCount);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_URL, data.hasUrl);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_CARD, data.hasCard);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_VINE, data.hasVine);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_PERISCOPE, data.hasPeriscope);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_NATIVE_IMAGE, data.hasNativeImage);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_IMAGE_URL, data.hasImageUrl);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_NEWS_URL, data.hasNewsUrl);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_VIDEO_URL, data.hasVideoUrl);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_CONSUMER_VIDEO, data.hasConsumerVideo);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_PRO_VIDEO, data.hasProVideo);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_QUOTE, data.hasQuote);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_TREND, data.hasTrend);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_MULTIPLE_HASHTAGS_OR_TRENDS,
|
||||
data.hasMultipleHashtagsOrTrends);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_OFFENSIVE, data.isOffensive);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_REPLY, data.isReply);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_RETWEET, data.isRetweet);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_SELF_TWEET, data.isSelfTweet);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_FOLLOW_RETWEET, data.isRetweet && data.isFollow);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_TRUSTED_RETWEET, data.isRetweet && data.isTrusted);
|
||||
addContinuousFeature(TweetScoringFeatures.QUERY_SPECIFIC_SCORE, data.querySpecificScore);
|
||||
addContinuousFeature(TweetScoringFeatures.AUTHOR_SPECIFIC_SCORE, data.authorSpecificScore);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_FOLLOW, data.isFollow);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_TRUSTED, data.isTrusted);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_VERIFIED, data.isFromVerifiedAccount);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_NSFW, data.isUserNSFW);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_SPAM, data.isUserSpam);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_BOT, data.isUserBot);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_ANTISOCIAL, data.isUserAntiSocial);
|
||||
addContinuousFeature(TweetScoringFeatures.AUTHOR_REPUTATION, data.userRep);
|
||||
addContinuousFeature(TweetScoringFeatures.SEARCHER_LANG_SCORE, data.userLangMult);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_DIFFERENT_LANG, data.hasDifferentLang);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_ENGLISH_TWEET_AND_DIFFERENT_UI_LANG,
|
||||
data.hasEnglishTweetAndDifferentUILang);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_ENGLISH_UI_AND_DIFFERENT_TWEET_LANG,
|
||||
data.hasEnglishUIAndDifferentTweetLang);
|
||||
addBinaryFeature(TweetScoringFeatures.IS_SENSITIVE_CONTENT, data.isSensitiveContent);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_MULTIPLE_MEDIA, data.hasMultipleMediaFlag);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_PROFILE_EGG, data.profileIsEggFlag);
|
||||
addBinaryFeature(TweetScoringFeatures.AUTHOR_IS_NEW, data.isUserNewFlag);
|
||||
addContinuousFeature(TweetScoringFeatures.MENTIONS_COUNT, data.numMentions);
|
||||
addContinuousFeature(TweetScoringFeatures.HASHTAGS_COUNT, data.numHashtags);
|
||||
addContinuousFeature(TweetScoringFeatures.LINK_LANGUAGE_ID, data.linkLanguage);
|
||||
addContinuousFeature(TweetScoringFeatures.LANGUAGE_ID, data.tweetLangId);
|
||||
addBinaryFeature(TweetScoringFeatures.HAS_VISIBLE_LINK, data.hasVisibleLink);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,237 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.search.AntiGamingFilter;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringParams;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
|
||||
|
||||
/**
|
||||
* Scoring function that uses the weights and boosts provided in the scoring parameters from the
|
||||
* request.
|
||||
*/
|
||||
public class LinearScoringFunction extends FeatureBasedScoringFunction {
|
||||
private static final double BASE_SCORE = 0.0001;
|
||||
|
||||
public LinearScoringFunction(
|
||||
ImmutableSchemaInterface schema,
|
||||
ThriftSearchQuery searchQuery,
|
||||
AntiGamingFilter antiGamingFilter,
|
||||
ThriftSearchResultType searchResultType,
|
||||
UserTable userTable) throws IOException {
|
||||
super("LinearScoringFunction", schema, searchQuery, antiGamingFilter, searchResultType,
|
||||
userTable);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double computeScore(LinearScoringData data, boolean forExplanation) throws IOException {
|
||||
double score = BASE_SCORE;
|
||||
|
||||
data.luceneContrib = params.useLuceneScoreAsBoost
|
||||
? 0.0 : params.luceneWeight * data.luceneScore;
|
||||
|
||||
data.reputationContrib = params.reputationWeight * data.userRep;
|
||||
data.textScoreContrib = params.textScoreWeight * data.textScore;
|
||||
data.parusContrib = params.parusWeight * data.parusScore;
|
||||
|
||||
// contributions from engagement counters. Note that we pass the "true" argument to all getters,
|
||||
// which means all values will get scaled down for scoring, since they were unbounded in raw form.
|
||||
data.retweetContrib = params.retweetWeight * data.retweetCountPostLog2;
|
||||
data.favContrib = params.favWeight * data.favCountPostLog2;
|
||||
data.replyContrib = params.replyWeight * data.replyCountPostLog2;
|
||||
data.embedsImpressionContrib =
|
||||
params.embedsImpressionWeight * data.getEmbedsImpressionCount(true);
|
||||
data.embedsUrlContrib =
|
||||
params.embedsUrlWeight * data.getEmbedsUrlCount(true);
|
||||
data.videoViewContrib =
|
||||
params.videoViewWeight * data.getVideoViewCount(true);
|
||||
data.quotedContrib =
|
||||
params.quotedCountWeight * data.quotedCount;
|
||||
|
||||
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
|
||||
data.offlineExpFeatureContributions[i] =
|
||||
params.rankingOfflineExpWeights[i] * data.offlineExpFeatureValues[i];
|
||||
}
|
||||
|
||||
data.hasUrlContrib = params.urlWeight * (data.hasUrl ? 1.0 : 0.0);
|
||||
data.isReplyContrib = params.isReplyWeight * (data.isReply ? 1.0 : 0.0);
|
||||
data.isFollowRetweetContrib =
|
||||
params.followRetweetWeight * (data.isRetweet && data.isFollow ? 1.0 : 0.0);
|
||||
data.isTrustedRetweetContrib =
|
||||
params.trustedRetweetWeight * (data.isRetweet && data.isTrusted ? 1.0 : 0.0);
|
||||
double replyCountOriginal = getUnscaledReplyCountFeatureValue();
|
||||
data.multipleReplyContrib = params.multipleReplyWeight
|
||||
* (replyCountOriginal < params.multipleReplyMinVal ? 0.0 : replyCountOriginal);
|
||||
|
||||
// We directly use the query specific score as the contribution below, as it doesn't need a weight
|
||||
// for contribution computation.
|
||||
score += data.luceneContrib
|
||||
+ data.reputationContrib
|
||||
+ data.textScoreContrib
|
||||
+ data.replyContrib
|
||||
+ data.multipleReplyContrib
|
||||
+ data.retweetContrib
|
||||
+ data.favContrib
|
||||
+ data.parusContrib
|
||||
+ data.embedsImpressionContrib
|
||||
+ data.embedsUrlContrib
|
||||
+ data.videoViewContrib
|
||||
+ data.quotedContrib
|
||||
+ data.hasUrlContrib
|
||||
+ data.isReplyContrib
|
||||
+ data.isFollowRetweetContrib
|
||||
+ data.isTrustedRetweetContrib
|
||||
+ data.querySpecificScore
|
||||
+ data.authorSpecificScore;
|
||||
|
||||
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
|
||||
score += data.offlineExpFeatureContributions[i];
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates the explanation for the linear score.
|
||||
*/
|
||||
@Override
|
||||
protected void generateExplanationForScoring(
|
||||
LinearScoringData scoringData, boolean isHit, List<Explanation> details) throws IOException {
|
||||
// 1. Linear components
|
||||
final List<Explanation> linearDetails = Lists.newArrayList();
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "[LuceneQueryScore]",
|
||||
params.luceneWeight, scoringData.luceneScore, scoringData.luceneContrib);
|
||||
if (scoringData.hasCard) {
|
||||
if (scoringData.cardAuthorMatchBoostApplied) {
|
||||
linearDetails.add(Explanation.match(
|
||||
(float) params.cardAuthorMatchBoosts[scoringData.cardType],
|
||||
"[x] card author match boost"));
|
||||
}
|
||||
if (scoringData.cardDescriptionMatchBoostApplied) {
|
||||
linearDetails.add(Explanation.match(
|
||||
(float) params.cardDescriptionMatchBoosts[scoringData.cardType],
|
||||
"[x] card description match boost"));
|
||||
}
|
||||
if (scoringData.cardDomainMatchBoostApplied) {
|
||||
linearDetails.add(Explanation.match(
|
||||
(float) params.cardDomainMatchBoosts[scoringData.cardType],
|
||||
"[x] card domain match boost"));
|
||||
}
|
||||
if (scoringData.cardTitleMatchBoostApplied) {
|
||||
linearDetails.add(Explanation.match(
|
||||
(float) params.cardTitleMatchBoosts[scoringData.cardType],
|
||||
"[x] card title match boost"));
|
||||
}
|
||||
}
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "reputation",
|
||||
params.reputationWeight, scoringData.userRep, scoringData.reputationContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "text score",
|
||||
params.textScoreWeight, scoringData.textScore, scoringData.textScoreContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "reply count (log2)",
|
||||
params.replyWeight, scoringData.replyCountPostLog2, scoringData.replyContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "multi reply",
|
||||
params.multipleReplyWeight,
|
||||
getUnscaledReplyCountFeatureValue() > params.multipleReplyMinVal ? 1 : 0,
|
||||
scoringData.multipleReplyContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "retweet count (log2)",
|
||||
params.retweetWeight, scoringData.retweetCountPostLog2, scoringData.retweetContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "fav count (log2)",
|
||||
params.favWeight, scoringData.favCountPostLog2, scoringData.favContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "parus score",
|
||||
params.parusWeight, scoringData.parusScore, scoringData.parusContrib);
|
||||
for (int i = 0; i < LinearScoringData.MAX_OFFLINE_EXPERIMENTAL_FIELDS; i++) {
|
||||
if (params.rankingOfflineExpWeights[i] != LinearScoringParams.DEFAULT_FEATURE_WEIGHT) {
|
||||
addLinearElementExplanation(linearDetails,
|
||||
"ranking exp score offline experimental #" + i,
|
||||
params.rankingOfflineExpWeights[i], scoringData.offlineExpFeatureValues[i],
|
||||
scoringData.offlineExpFeatureContributions[i]);
|
||||
}
|
||||
}
|
||||
addLinearElementExplanation(linearDetails,
|
||||
"embedded tweet impression count",
|
||||
params.embedsImpressionWeight, scoringData.getEmbedsImpressionCount(false),
|
||||
scoringData.embedsImpressionContrib);
|
||||
addLinearElementExplanation(linearDetails,
|
||||
"embedded tweet url count",
|
||||
params.embedsUrlWeight, scoringData.getEmbedsUrlCount(false),
|
||||
scoringData.embedsUrlContrib);
|
||||
addLinearElementExplanation(linearDetails,
|
||||
"video view count",
|
||||
params.videoViewWeight, scoringData.getVideoViewCount(false),
|
||||
scoringData.videoViewContrib);
|
||||
addLinearElementExplanation(linearDetails,
|
||||
"quoted count",
|
||||
params.quotedCountWeight, scoringData.quotedCount, scoringData.quotedContrib);
|
||||
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "has url", params.urlWeight, scoringData.hasUrl ? 1.0 : 0.0,
|
||||
scoringData.hasUrlContrib);
|
||||
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "is reply", params.isReplyWeight,
|
||||
scoringData.isReply ? 1.0 : 0.0, scoringData.isReplyContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "is follow retweet", params.followRetweetWeight,
|
||||
scoringData.isRetweet && scoringData.isFollow ? 1.0 : 0.0,
|
||||
scoringData.isFollowRetweetContrib);
|
||||
addLinearElementExplanation(
|
||||
linearDetails, "is trusted retweet", params.trustedRetweetWeight,
|
||||
scoringData.isRetweet && scoringData.isTrusted ? 1.0 : 0.0,
|
||||
scoringData.isTrustedRetweetContrib);
|
||||
|
||||
if (scoringData.querySpecificScore != 0.0) {
|
||||
linearDetails.add(Explanation.match((float) scoringData.querySpecificScore,
|
||||
"[+] query specific score adjustment"));
|
||||
}
|
||||
if (scoringData.authorSpecificScore != 0.0) {
|
||||
linearDetails.add(Explanation.match((float) scoringData.authorSpecificScore,
|
||||
"[+] author specific score adjustment"));
|
||||
}
|
||||
|
||||
|
||||
Explanation linearCombo = isHit
|
||||
? Explanation.match((float) scoringData.scoreBeforeBoost,
|
||||
"(MATCH) Linear components, sum of:", linearDetails)
|
||||
: Explanation.noMatch("Linear components, sum of:", linearDetails);
|
||||
|
||||
|
||||
details.add(linearCombo);
|
||||
}
|
||||
|
||||
private void addLinearElementExplanation(List<Explanation> explanation,
|
||||
String name,
|
||||
double weight,
|
||||
double componentValue,
|
||||
double contrib) {
|
||||
if (contrib == 0.0) {
|
||||
return;
|
||||
}
|
||||
explanation.add(
|
||||
Explanation.match((float) contrib,
|
||||
String.format("[+] %s=%.3f weight=%.3f", name, componentValue, weight)));
|
||||
}
|
||||
|
||||
private double getUnscaledReplyCountFeatureValue() throws IOException {
|
||||
byte featureValue = (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
|
||||
return MutableFeatureNormalizers.BYTE_NORMALIZER.unnormLowerBound(featureValue);
|
||||
}
|
||||
}
|
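A toy illustration of the linear combination computeScore() performs: each contribution is weight * featureValue, and the final score is their sum plus BASE_SCORE. The weights and feature values below are invented and are not the production defaults.

public class LinearScoreToyExample {
  public static void main(String[] args) {
    double baseScore = 0.0001;                 // BASE_SCORE
    double reputationContrib = 0.2 * 55.0;     // reputationWeight * userRep (made up)
    double textScoreContrib = 0.5 * 0.8;       // textScoreWeight * textScore (made up)
    double favContrib = 1.0 * 3.0;             // favWeight * favCountPostLog2 (made up)
    double score = baseScore + reputationContrib + textScoreContrib + favContrib;
    System.out.println(score);                 // approximately 14.4001
  }
}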
Binary file not shown.
@ -1,151 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
||||
import com.google.common.base.Optional;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import org.apache.lucene.search.Explanation;
|
||||
|
||||
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
|
||||
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
|
||||
import com.twitter.search.common.util.ml.prediction_engine.LightweightLinearModel;
|
||||
import com.twitter.search.common.util.ml.prediction_engine.SchemaBasedScoreAccumulator;
|
||||
import com.twitter.search.earlybird.common.userupdates.UserTable;
|
||||
import com.twitter.search.earlybird.exception.ClientException;
|
||||
import com.twitter.search.earlybird.ml.ScoringModelsManager;
|
||||
import com.twitter.search.earlybird.search.AntiGamingFilter;
|
||||
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
|
||||
|
||||
/**
|
||||
* Scoring function that uses the scoring models specified in the request.
|
||||
*/
|
||||
public class ModelBasedScoringFunction extends FeatureBasedScoringFunction {
|
||||
private final SelectedModel[] selectedModels;
|
||||
private final boolean useLogitScore;
|
||||
private final boolean isSchemaBased;
|
||||
|
||||
private static final SearchCounter NUM_LEGACY_MODELS =
|
||||
SearchCounter.export("scoring_function_num_legacy_models");
|
||||
private static final SearchCounter NUM_SCHEMA_BASED_MODELS =
|
||||
SearchCounter.export("scoring_function_num_schema_based_models");
|
||||
private static final SearchCounter MIXED_MODEL_TYPES =
|
||||
SearchCounter.export("scoring_function_mixed_model_types");
|
||||
|
||||
public ModelBasedScoringFunction(
|
||||
ImmutableSchemaInterface schema,
|
||||
ThriftSearchQuery searchQuery,
|
||||
AntiGamingFilter antiGamingFilter,
|
||||
ThriftSearchResultType searchResultType,
|
||||
UserTable userTable,
|
||||
ScoringModelsManager scoringModelsManager
|
||||
) throws IOException, ClientException {
|
||||
|
||||
super("ModelBasedScoringFunction", schema, searchQuery, antiGamingFilter, searchResultType,
|
||||
userTable);
|
||||
|
||||
ThriftRankingParams rankingParams = searchQuery.getRelevanceOptions().getRankingParams();
|
||||
Preconditions.checkNotNull(rankingParams);
|
||||
|
||||
if (rankingParams.getSelectedModelsSize() <= 0) {
|
||||
throw new ClientException("Scoring type is MODEL_BASED but no models were selected");
|
||||
}
|
||||
|
||||
Map<String, Double> models = rankingParams.getSelectedModels();
|
||||
|
||||
selectedModels = new SelectedModel[models.size()];
|
||||
int numSchemaBased = 0;
|
||||
int i = 0;
|
||||
for (Map.Entry<String, Double> nameAndWeight : models.entrySet()) {
|
||||
Optional<LightweightLinearModel> model =
|
||||
scoringModelsManager.getModel(nameAndWeight.getKey());
|
||||
if (!model.isPresent()) {
|
||||
throw new ClientException(String.format(
|
||||
"Scoring function is MODEL_BASED. Selected model '%s' not found",
|
||||
nameAndWeight.getKey()));
|
||||
}
|
||||
selectedModels[i] =
|
||||
new SelectedModel(nameAndWeight.getKey(), nameAndWeight.getValue(), model.get());
|
||||
|
||||
if (selectedModels[i].model.isSchemaBased()) {
|
||||
++numSchemaBased;
|
||||
NUM_SCHEMA_BASED_MODELS.increment();
|
||||
} else {
|
||||
NUM_LEGACY_MODELS.increment();
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
// All selected models should be schema-based, or none of them should be. If the types are mixed,
|
||||
// we increment a counter and reject the request with a ClientException.
|
||||
if (numSchemaBased > 0 && numSchemaBased != selectedModels.length) {
|
||||
MIXED_MODEL_TYPES.increment();
|
||||
throw new ClientException(
|
||||
"You cannot mix schema-based and non-schema-based models in the same request, "
|
||||
+ "models are: " + models.keySet());
|
||||
}
|
||||
|
||||
isSchemaBased = selectedModels[0].model.isSchemaBased();
|
||||
useLogitScore = rankingParams.isUseLogitScore();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected double computeScore(LinearScoringData data, boolean forExplanation) throws IOException {
|
||||
ThriftSearchResultFeatures features =
|
||||
isSchemaBased ? createFeaturesForDocument(data, false).getFeatures() : null;
|
||||
|
||||
double score = 0;
|
||||
for (SelectedModel selectedModel : selectedModels) {
|
||||
double modelScore = isSchemaBased
|
||||
? new SchemaBasedScoreAccumulator(selectedModel.model).scoreWith(features, useLogitScore)
|
||||
: new LegacyScoreAccumulator(selectedModel.model).scoreWith(data, useLogitScore);
|
||||
score += selectedModel.weight * modelScore;
|
||||
}
|
||||
|
||||
return score;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void generateExplanationForScoring(
|
||||
LinearScoringData scoringData, boolean isHit, List<Explanation> details) throws IOException {
|
||||
boolean schemaBased = selectedModels[0].model.isSchemaBased();
|
||||
ThriftSearchResultFeatures features =
|
||||
schemaBased ? createFeaturesForDocument(scoringData, false).getFeatures() : null;
|
||||
|
||||
// 1. Model-based score
|
||||
final List<Explanation> modelExplanations = Lists.newArrayList();
|
||||
float finalScore = 0;
|
||||
for (SelectedModel selectedModel : selectedModels) {
|
||||
double modelScore = schemaBased
|
||||
? new SchemaBasedScoreAccumulator(selectedModel.model).scoreWith(features, useLogitScore)
|
||||
: new LegacyScoreAccumulator(selectedModel.model).scoreWith(scoringData, useLogitScore);
|
||||
float weightedScore = (float) (selectedModel.weight * modelScore);
|
||||
details.add(Explanation.match(
|
||||
weightedScore, String.format("model=%s score=%.6f weight=%.3f useLogitScore=%s",
|
||||
selectedModel.name, modelScore, selectedModel.weight, useLogitScore)));
|
||||
finalScore += weightedScore;
|
||||
}
|
||||
|
||||
details.add(Explanation.match(
|
||||
finalScore, String.format("Total model-based score (hit=%s)", isHit), modelExplanations));
|
||||
}
|
||||
|
||||
private static final class SelectedModel {
|
||||
public final String name;
|
||||
public final double weight;
|
||||
public final LightweightLinearModel model;
|
||||
|
||||
private SelectedModel(String name, double weight, LightweightLinearModel model) {
|
||||
this.name = name;
|
||||
this.weight = weight;
|
||||
this.model = model;
|
||||
}
|
||||
}
|
||||
}
|
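A toy illustration of how computeScore() combines the selected models: each model produces its own score and the request supplies a per-model weight, so the final score is the weighted sum. The model names, weights and scores below are invented.

public class ModelComboToyExample {
  public static void main(String[] args) {
    // e.g. "model_a" -> weight 0.4, "model_b" -> weight 0.6 (hypothetical request config)
    double modelAScore = 0.7;
    double modelBScore = 0.3;
    double finalScore = 0.4 * modelAScore + 0.6 * modelBScore;
    System.out.println(finalScore);  // approximately 0.46
  }
}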
Binary file not shown.
@ -1,164 +0,0 @@
|
||||
package com.twitter.search.earlybird.search.relevance.scoring;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Objects;
|
||||
import java.util.Set;
|
||||
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.Scorer;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.results.thriftjava.FieldHitAttribution;

/**
 * A wrapper for a Lucene query which first computes Lucene's query score
 * and then delegates to a {@link ScoringFunction} for final score computation.
 */
public class RelevanceQuery extends Query {
  private static final Logger LOG = LoggerFactory.getLogger(RelevanceQuery.class.getName());

  protected final Query luceneQuery;
  protected final ScoringFunction scoringFunction;

  // True when the lucene query's score should be ignored for debug explanations.
  protected final boolean ignoreLuceneQueryScoreExplanation;

  public RelevanceQuery(Query luceneQuery, ScoringFunction scoringFunction) {
    this(luceneQuery, scoringFunction, false);
  }

  public RelevanceQuery(Query luceneQuery,
                        ScoringFunction scoringFunction,
                        boolean ignoreLuceneQueryScoreExplanation) {
    this.luceneQuery = luceneQuery;
    this.scoringFunction = scoringFunction;
    this.ignoreLuceneQueryScoreExplanation = ignoreLuceneQueryScoreExplanation;
  }

  public ScoringFunction getScoringFunction() {
    return scoringFunction;
  }

  public Query getLuceneQuery() {
    return luceneQuery;
  }

  @Override
  public Query rewrite(IndexReader reader) throws IOException {
    Query rewritten = luceneQuery.rewrite(reader);
    if (rewritten == luceneQuery) {
      return this;
    }
    return new RelevanceQuery(rewritten, scoringFunction, ignoreLuceneQueryScoreExplanation);
  }

  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    Weight luceneWeight = luceneQuery.createWeight(searcher, scoreMode, boost);
    if (luceneWeight == null) {
      return null;
    }
    return new RelevanceWeight(searcher, luceneWeight);
  }

  public class RelevanceWeight extends Weight {
    private final Weight luceneWeight;

    public RelevanceWeight(IndexSearcher searcher, Weight luceneWeight) {
      super(RelevanceQuery.this);
      this.luceneWeight = luceneWeight;
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      this.luceneWeight.extractTerms(terms);
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      return explain(context, doc, null);
    }

    /**
     * Returns an explanation of the scoring for the given document.
     *
     * @param context The context of the reader that returned this document.
     * @param doc The document.
     * @param fieldHitAttribution Per-hit field attribution information.
     * @return An explanation of the scoring for the given document.
     */
    public Explanation explain(LeafReaderContext context, int doc,
        @Nullable FieldHitAttribution fieldHitAttribution) throws IOException {

      Explanation luceneExplanation = Explanation.noMatch("LuceneQuery explain skipped");
      if (!ignoreLuceneQueryScoreExplanation) {
        // get Lucene score
        try {
          luceneExplanation = luceneWeight.explain(context, doc);
        } catch (Exception e) {
          // We sometimes see exceptions resulting from term queries that do not store
          // utf8-text, which TermQuery.toString() assumes. Catch here and allow at least
          // scoring function explanations to be returned.
          LOG.error("Exception in explain", e);
          luceneExplanation = Explanation.noMatch("LuceneQuery explain failed");
        }
      }

      Explanation scoringFunctionExplanation;
      scoringFunction.setFieldHitAttribution(fieldHitAttribution);
      scoringFunctionExplanation = scoringFunction.explain(
          context.reader(), doc, luceneExplanation.getValue().floatValue());

      // just add a wrapper for a better structure of the final explanation
      Explanation luceneExplanationWrapper = Explanation.match(
          luceneExplanation.getValue(), "LuceneQuery", luceneExplanation);

      return Explanation.match(scoringFunctionExplanation.getValue(), "RelevanceQuery",
          scoringFunctionExplanation, luceneExplanationWrapper);
    }

    @Override
    public Scorer scorer(LeafReaderContext context) throws IOException {
      return luceneWeight.scorer(context);
    }

    @Override
    public boolean isCacheable(LeafReaderContext ctx) {
      return luceneWeight.isCacheable(ctx);
    }
  }

  @Override
  public int hashCode() {
    return (luceneQuery == null ? 0 : luceneQuery.hashCode())
        + (scoringFunction == null ? 0 : scoringFunction.hashCode()) * 13;
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof RelevanceQuery)) {
      return false;
    }

    RelevanceQuery query = RelevanceQuery.class.cast(obj);
    return Objects.equals(luceneQuery, query.luceneQuery)
        && Objects.equals(scoringFunction, query.scoringFunction);
  }

  @Override
  public String toString(String field) {
    return "RelevanceQuery[q=" + luceneQuery.toString(field) + "]";
  }
}
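For illustration, a minimal caller-side sketch (not part of the original file): it assumes the same package and imports as the class above, and it reuses TestScoringFunction (defined later in this commit) only because that class needs no extra state; "schema" stands in for whatever ImmutableSchemaInterface the caller already has. Note that RelevanceWeight.scorer() delegates to the wrapped query, so the ScoringFunction is applied by Earlybird's own collectors and by the explain() path, not by Lucene's scorer.

// Hypothetical helper, shown for illustration only.
final class RelevanceQueryExample {
  static RelevanceQuery wrapWithTestScoring(
      org.apache.lucene.search.Query textQuery,
      com.twitter.search.common.schema.base.ImmutableSchemaInterface schema) {
    // The scoring function travels with the query; Lucene matching is unchanged.
    ScoringFunction scoringFunction = new TestScoringFunction(schema);
    return new RelevanceQuery(textQuery, scoringFunction);
  }
}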
Binary file not shown.
@ -1,165 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.io.IOException;

import org.apache.lucene.search.Explanation;

import com.twitter.search.common.relevance.features.MutableFeatureNormalizers;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

/**
 * A toptweets query cache index selection scoring function that is based purely on retweet counts.
 * The goal of this scoring function is to deprecate the itweet score in its entirety.
 *
 * Once all legacy itweet scores are drained from the existing earlybird index and the new parus
 * score replaces the existing itweet score position, this class will be deprecated and a new
 * scoring function using the parus score shall replace it.
 *
 * This scoring function is only used in the query cache for marking top tweets in the background.
 * When searched, those tweets are still ranked with the linear or model-based scoring function.
 */
public class RetweetBasedTopTweetsScoringFunction extends ScoringFunction {
  private static final double DEFAULT_RECENCY_SCORE_FRACTION = 0.1;
  private static final double DEFAULT_SIGMOID_ALPHA = 0.008;
  private static final int DEFAULT_RECENCY_CENTER_MINUTES = 1080;

  // If you update the default cutoff, make sure you update the query cache filter in
  // querycache.yml.
  //
  // We know that currently each time slice, each partition has about 10K entries in the toptweets
  // query cache. These are unique tweets. Looking at retweet updates, each time slice, each
  // partition has about 650K unique tweets that received a retweet. To create a roughly similar
  // number of entries in the query cache, we need the top 2% of such tweets, which sets the min
  // retweet count to 4. In this linear scoring function, we rescale the retweet count to the
  // [0, 1] range, with an input range of [0, 20]. Given the recency factor's weight of 0.1, that
  // gives a minimal retweet score threshold of 4/20 * 0.9 = 0.18.
  // Testing on prod showed a much higher volume due to the generous max value of 20 (the highest
  // we have seen is 14). Adjusted to 0.21, which gave us a similar volume.
  private static final double DEFAULT_CUT_OFF_SCORE = 0.21;

  // Normalize retweet counts from the [0, 20] range to the [0, 1] range.
  private static final double MAX_RETWEET_COUNT = 20.0;
  private static final double MIN_USER_REPUTATION = 40.0; // matches itweet system threshold

  /**
   * The scores for the retweet based top tweets have to be in the [0, 1] interval. So we can't use
   * SKIP_HIT as the lowest possible score, and instead have to use Float.MIN_VALUE.
   *
   * It's OK to use different values for these constants, because they do not interfere with each
   * other. This constant is only used in RetweetBasedTopTweetsScoringFunction, which is only used
   * to filter the hits for the [score_filter retweets minScore maxScore] operator. So the scores
   * returned by RetweetBasedTopTweetsScoringFunction.score() do not have any impact on the final
   * hit score.
   *
   * See EarlybirdLuceneQueryVisitor.visitScoredFilterOperator() and ScoreFilterQuery for more
   * details.
   */
  private static final float RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE = Float.MIN_VALUE;

  private final double recencyScoreFraction;
  private final double sigmoidAlpha;
  private final double cutOffScore;
  private final int recencyCenterMinutes;
  private final double maxRecency;

  private final int currentTimeSeconds;

  private ThriftSearchResultMetadata metadata = null;
  private double score;
  private double retweetCount;

  public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema) {
    this(schema, DEFAULT_RECENCY_SCORE_FRACTION,
        DEFAULT_SIGMOID_ALPHA,
        DEFAULT_CUT_OFF_SCORE,
        DEFAULT_RECENCY_CENTER_MINUTES);
  }

  /**
   * Creates a no-decay scoring function (used by the top archive).
   * Otherwise the same as the default constructor.
   * @param nodecay If set to true, alpha is set to 0.0 (no time decay).
   */
  public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema, boolean nodecay) {
    this(schema, DEFAULT_RECENCY_SCORE_FRACTION,
        nodecay ? 0.0 : DEFAULT_SIGMOID_ALPHA,
        DEFAULT_CUT_OFF_SCORE,
        DEFAULT_RECENCY_CENTER_MINUTES);
  }

  public RetweetBasedTopTweetsScoringFunction(ImmutableSchemaInterface schema,
                                              double recencyScoreFraction, double sigmoidAlpha,
                                              double cutOffScore, int recencyCenterMinutes) {
    super(schema);
    this.recencyScoreFraction = recencyScoreFraction;
    this.sigmoidAlpha = sigmoidAlpha;
    this.cutOffScore = cutOffScore;
    this.recencyCenterMinutes = recencyCenterMinutes;
    this.maxRecency = computeSigmoid(0);
    this.currentTimeSeconds = (int) (System.currentTimeMillis() / 1000);
  }

  @Override
  protected float score(float luceneQueryScore) throws IOException {
    // Reset the data for each tweet!!!
    metadata = null;
    if (documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)
        || (documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION)
            < MIN_USER_REPUTATION)) {
      score = RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE;
    } else {
      // Note that here we want the post-log2 value, as MAX_RETWEET_COUNT was actually
      // set up for that.
      retweetCount = MutableFeatureNormalizers.BYTE_NORMALIZER.unnormAndLog2(
          (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT));
      final double recencyScore = computeTopTweetRecencyScore();

      score = (retweetCount / MAX_RETWEET_COUNT) * (1 - recencyScoreFraction)
          + recencyScoreFraction * recencyScore;

      if (score < this.cutOffScore) {
        score = RETWEET_BASED_TOP_TWEETS_LOWEST_SCORE;
      }
    }

    return (float) score;
  }

  private double computeSigmoid(double x) {
    return 1.0f / (1.0f + Math.exp(sigmoidAlpha * (x - recencyCenterMinutes)));
  }

  private double computeTopTweetRecencyScore() {
    double diffMinutes =
        Math.max(0, currentTimeSeconds - timeMapper.getTime(getCurrentDocID())) / 60.0;
    return computeSigmoid(diffMinutes) / maxRecency;
  }

  @Override
  protected Explanation doExplain(float luceneScore) {
    return null;
  }

  @Override
  public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
    if (metadata == null) {
      metadata = new ThriftSearchResultMetadata()
          .setResultType(ThriftSearchResultType.POPULAR)
          .setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
      metadata.setRetweetCount((int) retweetCount);
      metadata.setScore(score);
    }
    return metadata;
  }

  @Override
  public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
  }
}
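For illustration, a small standalone worked example (not part of the original file) of the cutoff arithmetic described in the comment above, using the default parameters. With the recency fraction at 0.1, a tweet with no recency credit needs a post-log2 retweet count of about 4.67 to clear the 0.21 cutoff, while a brand-new tweet (recency score close to 1.0) only needs about 2.44.

// Standalone illustration of the formula in score(); constants mirror the defaults above.
final class RetweetTopTweetsScoreExample {
  public static void main(String[] args) {
    double recencyScoreFraction = 0.1;
    double maxRetweetCount = 20.0;
    double cutOffScore = 0.21;

    // A tweet with a post-log2 retweet count of 4 and no recency credit:
    double oldTweet = (4 / maxRetweetCount) * (1 - recencyScoreFraction) + recencyScoreFraction * 0.0;
    // A brand-new tweet (recency score ~1.0) with a post-log2 retweet count of 2.5:
    double newTweet = (2.5 / maxRetweetCount) * (1 - recencyScoreFraction) + recencyScoreFraction * 1.0;

    System.out.println(oldTweet + " survives cutoff: " + (oldTweet >= cutOffScore));  // 0.18 -> false
    System.out.println(newTweet + " survives cutoff: " + (newTweet >= cutOffScore));  // 0.2125 -> true
  }
}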
Binary file not shown.
@ -1,213 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.io.IOException;
import java.util.List;

import com.google.common.base.Preconditions;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.search.Explanation;

import com.twitter.common.collections.Pair;
import com.twitter.search.common.constants.thriftjava.ThriftLanguage;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures;
import com.twitter.search.common.results.thriftjava.FieldHitAttribution;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.TimeMapper;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;
import com.twitter.search.queryparser.query.Query;

/**
 * Defines a ranking function which computes the score of a document that matches a query.
 */
public abstract class ScoringFunction {
  /**
   * Returned by {@link #score(int, float)} to indicate that a hit should be scored below all
   * others.
   *
   * We have some equality tests like:
   * "if (score == ScoringFunction.SKIP_HIT) {...}" (DefaultScoringFunction#updateRelevanceStats)
   * We might also have double to float casts.
   *
   * Such casts seem to work with the equality test, but there might be corner cases where casting
   * this float value to a double (and back) does not work properly.
   *
   * If possible, we should choose a constant that is not in the valid score range. Then we can
   * turn the float equality tests into Math.abs(...) < EPSILON tests.
   */
  public static final float SKIP_HIT = -Float.MAX_VALUE;

  private final ImmutableSchemaInterface schema;

  // The current doc ID and the reader for the current segment should be private, because we don't
  // want sub-classes to incorrectly update them. The doc ID should only be updated by the score()
  // and explain() methods, and the reader should only be updated by the setNextReader() method.
  private int currentDocID = -1;

  protected DocIDToTweetIDMapper tweetIDMapper = null;
  protected TimeMapper timeMapper = null;
  protected EarlybirdDocumentFeatures documentFeatures;

  protected int debugMode = 0;
  protected HitAttributeHelper hitAttributeHelper;
  protected Query query;

  protected FieldHitAttribution fieldHitAttribution;

  public ScoringFunction(ImmutableSchemaInterface schema) {
    this.schema = Preconditions.checkNotNull(schema);
  }

  protected ImmutableSchemaInterface getSchema() {
    return schema;
  }

  /**
   * Updates the reader that will be used to retrieve the tweet IDs and creation times associated
   * with scored doc IDs, as well as the values for various CSFs. Should be called every time the
   * searcher starts searching in a new segment.
   */
  public void setNextReader(EarlybirdIndexSegmentAtomicReader reader) throws IOException {
    tweetIDMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
    timeMapper = reader.getSegmentData().getTimeMapper();
    documentFeatures = new EarlybirdDocumentFeatures(reader);
    initializeNextSegment(reader);
  }

  public void setHitAttributeHelperAndQuery(HitAttributeHelper newHitAttributeHelper,
                                            Query parsedQuery) {
    this.hitAttributeHelper = newHitAttributeHelper;
    this.query = parsedQuery;
  }

  public void setFieldHitAttribution(FieldHitAttribution fieldHitAttribution) {
    this.fieldHitAttribution = fieldHitAttribution;
  }

  public void setDebugMode(int debugMode) {
    this.debugMode = debugMode;
  }

  /**
   * Allows scoring functions to perform more per-segment-specific setup.
   */
  protected void initializeNextSegment(EarlybirdIndexSegmentAtomicReader reader)
      throws IOException {
    // No-op by default.
  }

  // Updates the current document ID and advances all NumericDocValues to this doc ID.
  private void setCurrentDocID(int currentDocID) throws IOException {
    this.currentDocID = currentDocID;
    documentFeatures.advance(currentDocID);
  }

  /**
   * Returns the current doc ID stored in this scoring function.
   */
  public int getCurrentDocID() {
    return currentDocID;
  }

  /**
   * Computes the score for the current hit. This is not expected to be thread safe.
   *
   * @param internalDocID internal id of the matching hit
   * @param luceneQueryScore the score that Lucene's text query computed for this hit
   */
  public float score(int internalDocID, float luceneQueryScore) throws IOException {
    setCurrentDocID(internalDocID);
    return score(luceneQueryScore);
  }

  /**
   * Computes the score for the current hit. This is not expected to be thread safe.
   *
   * @param luceneQueryScore the score that Lucene's text query computed for this hit
   */
  protected abstract float score(float luceneQueryScore) throws IOException;

  /** Returns an explanation for the given hit. */
  public final Explanation explain(IndexReader reader, int internalDocID, float luceneScore)
      throws IOException {
    setNextReader((EarlybirdIndexSegmentAtomicReader) reader);
    setCurrentDocID(internalDocID);
    return doExplain(luceneScore);
  }

  /** Returns an explanation for the current document. */
  protected abstract Explanation doExplain(float luceneScore) throws IOException;

  /**
   * Returns the scoring metadata for the current doc ID.
   */
  public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options)
      throws IOException {
    ThriftSearchResultMetadata metadata = new ThriftSearchResultMetadata();
    metadata.setResultType(ThriftSearchResultType.RELEVANCE);
    metadata.setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
    metadata.setLanguage(ThriftLanguage.findByValue(
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)));
    metadata.setSignature(
        (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.TWEET_SIGNATURE));
    metadata.setIsNullcast(documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG));
    return metadata;
  }

  /**
   * Updates the given ThriftSearchResultsRelevanceStats instance based on the scoring metadata for
   * the current doc ID.
   */
  public abstract void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats);

  /**
   * Scores a list of hits. Not thread safe.
   */
  public float[] batchScore(List<BatchHit> hits) throws IOException {
    throw new UnsupportedOperationException("This operation (batchScore) is not implemented!");
  }

  /**
   * Collects the features and CSFs for the current document. Used for scoring and generating the
   * returned metadata.
   */
  public Pair<LinearScoringData, ThriftSearchResultFeatures> collectFeatures(
      float luceneQueryScore) throws IOException {
    throw new UnsupportedOperationException("This operation (collectFeatures) is not implemented!");
  }

  /**
   * Implement this function to populate the result metadata based on the given scoring data.
   * Otherwise, this is a no-op.
   *
   * Scoring functions that implement this should also implement
   * getScoringDataForCurrentDocument().
   */
  public void populateResultMetadataBasedOnScoringData(
      ThriftSearchResultMetadataOptions options,
      ThriftSearchResultMetadata metadata,
      LinearScoringData data) throws IOException {
    // Make sure that the scoring data passed in is null because getScoringDataForCurrentDocument()
    // returns null by default, and if a subclass overrides one of these two methods, it should
    // override both.
    Preconditions.checkState(data == null, "LinearScoringData should be null");
  }

  /**
   * This should only be called at hit collection time because it relies on the internal doc id.
   *
   * Scoring functions that implement this should also implement the function
   * populateResultMetadataBasedOnScoringData().
   */
  public LinearScoringData getScoringDataForCurrentDocument() {
    return null;
  }
}
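For illustration, a minimal hypothetical subclass (not part of the original file) showing the contract a concrete scoring function has to satisfy: the searcher calls setNextReader() once per segment, then score(internalDocID, luceneQueryScore) per hit, and doExplain() only on the debug/explanation path. It assumes the same package and imports as the class above; the boost parameter and class name are made up for this sketch.

// Illustrative only; a real implementation would read CSFs via documentFeatures.
final class ConstantBoostScoringFunction extends ScoringFunction {
  private final float boost;

  ConstantBoostScoringFunction(ImmutableSchemaInterface schema, float boost) {
    super(schema);
    this.boost = boost;
  }

  @Override
  protected float score(float luceneQueryScore) {
    // getCurrentDocID() and documentFeatures are already positioned on the current hit here.
    return luceneQueryScore * boost;
  }

  @Override
  protected Explanation doExplain(float luceneScore) {
    return Explanation.match(luceneScore * boost, "lucene score * constant boost");
  }

  @Override
  public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
    // No stats for this sketch.
  }
}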
Binary file not shown.
@ -1,216 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.io.IOException;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.query.HitAttributeHelper;
import com.twitter.search.common.ranking.thriftjava.ThriftRankingParams;
import com.twitter.search.common.ranking.thriftjava.ThriftScoringFunctionType;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.exception.ClientException;
import com.twitter.search.earlybird.ml.ScoringModelsManager;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.queryparser.query.Query;

/**
 * Returns a scoring function for a particular experiment ID.
 *
 * Can be used for A/B testing of different scoring formulas.
 */
public abstract class ScoringFunctionProvider {
  private static final Logger LOG = LoggerFactory.getLogger(ScoringFunctionProvider.class);

  /**
   * Returns the scoring function.
   */
  public abstract ScoringFunction getScoringFunction() throws IOException, ClientException;

  public static final String RETWEETS_SCORER_NAME = "retweets";
  public static final String NO_SPAM_SCORER_NAME = "no_spam";
  public static final String TEST_SCORER_NAME = "test";

  // Whether to avoid time decay when scoring top tweets.
  // The top archive does not need time decay.
  private static final boolean TOP_TWEET_WITH_DECAY =
      EarlybirdConfig.getBool("top_tweet_scoring_with_decay", true);

  /**
   * Abstract class that can be used for ScoringFunctions that don't throw a ClientException.
   *
   * It does throw an IOException but it doesn't throw a ClientException, so the name can be a bit
   * misleading.
   */
  public abstract static class NamedScoringFunctionProvider extends ScoringFunctionProvider {
    /**
     * Returns the scoring function.
     */
    public abstract ScoringFunction getScoringFunction() throws IOException;
  }

  /**
   * Returns the scoring function provider with the given name, or null if no such provider exists.
   */
  public static NamedScoringFunctionProvider getScoringFunctionProviderByName(
      String name, final ImmutableSchemaInterface schema) {
    if (name.equals(NO_SPAM_SCORER_NAME)) {
      return new NamedScoringFunctionProvider() {
        @Override
        public ScoringFunction getScoringFunction() throws IOException {
          return new SpamVectorScoringFunction(schema);
        }
      };
    } else if (name.equals(RETWEETS_SCORER_NAME)) {
      return new NamedScoringFunctionProvider() {
        @Override
        public ScoringFunction getScoringFunction() throws IOException {
          // Production top tweets actually uses this.
          if (TOP_TWEET_WITH_DECAY) {
            return new RetweetBasedTopTweetsScoringFunction(schema);
          } else {
            return new RetweetBasedTopTweetsScoringFunction(schema, true);
          }
        }
      };
    } else if (name.equals(TEST_SCORER_NAME)) {
      return new NamedScoringFunctionProvider() {
        @Override
        public ScoringFunction getScoringFunction() throws IOException {
          return new TestScoringFunction(schema);
        }
      };
    }
    return null;
  }

  /**
   * Returns default scoring functions for the different scoring function types
   * and provides fallback behavior if a model-based scoring function fails.
   */
  public static class DefaultScoringFunctionProvider extends ScoringFunctionProvider {
    private final EarlybirdRequest request;
    private final ImmutableSchemaInterface schema;
    private final ThriftSearchQuery searchQuery;
    private final AntiGamingFilter antiGamingFilter;
    private final UserTable userTable;
    private final HitAttributeHelper hitAttributeHelper;
    private final Query parsedQuery;
    private final ScoringModelsManager scoringModelsManager;
    private final TensorflowModelsManager tensorflowModelsManager;

    private static final SearchCounter MODEL_BASED_SCORING_FUNCTION_CREATED =
        SearchCounter.export("model_based_scoring_function_created");
    private static final SearchCounter MODEL_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION =
        SearchCounter.export("model_based_fallback_to_linear_scoring_function");

    private static final SearchCounter TENSORFLOW_BASED_SCORING_FUNCTION_CREATED =
        SearchCounter.export("tensorflow_based_scoring_function_created");
    private static final SearchCounter TENSORFLOW_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION =
        SearchCounter.export("tensorflow_fallback_to_linear_function_scoring_function");

    public DefaultScoringFunctionProvider(
        final EarlybirdRequest request,
        final ImmutableSchemaInterface schema,
        final ThriftSearchQuery searchQuery,
        final AntiGamingFilter antiGamingFilter,
        final UserTable userTable,
        final HitAttributeHelper hitAttributeHelper,
        final Query parsedQuery,
        final ScoringModelsManager scoringModelsManager,
        final TensorflowModelsManager tensorflowModelsManager) {
      this.request = request;
      this.schema = schema;
      this.searchQuery = searchQuery;
      this.antiGamingFilter = antiGamingFilter;
      this.userTable = userTable;
      this.hitAttributeHelper = hitAttributeHelper;
      this.parsedQuery = parsedQuery;
      this.scoringModelsManager = scoringModelsManager;
      this.tensorflowModelsManager = tensorflowModelsManager;
    }

    @Override
    public ScoringFunction getScoringFunction() throws IOException, ClientException {
      if (searchQuery.isSetRelevanceOptions()
          && searchQuery.getRelevanceOptions().isSetRankingParams()) {
        ThriftRankingParams params = searchQuery.getRelevanceOptions().getRankingParams();
        ThriftScoringFunctionType type = params.isSetType()
            ? params.getType() : ThriftScoringFunctionType.LINEAR; // default type
        switch (type) {
          case LINEAR:
            return createLinear();
          case MODEL_BASED:
            if (scoringModelsManager.isEnabled()) {
              MODEL_BASED_SCORING_FUNCTION_CREATED.increment();
              return createModelBased();
            } else {
              // From ScoringModelsManager.NO_OP_MANAGER. Fall back to LinearScoringFunction.
              MODEL_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION.increment();
              return createLinear();
            }
          case TENSORFLOW_BASED:
            if (tensorflowModelsManager.isEnabled()) {
              TENSORFLOW_BASED_SCORING_FUNCTION_CREATED.increment();
              return createTensorflowBased();
            } else {
              // Fall back to linear scoring if the TF manager is disabled.
              TENSORFLOW_BASED_FALLBACK_TO_LINEAR_SCORING_FUNCTION.increment();
              return createLinear();
            }
          case TOPTWEETS:
            return createTopTweets();
          default:
            throw new IllegalArgumentException("Unknown scoring type in: " + searchQuery);
        }
      } else {
        LOG.error("No relevance options provided, query = " + searchQuery);
        return new DefaultScoringFunction(schema);
      }
    }

    private ScoringFunction createLinear() throws IOException {
      LinearScoringFunction scoringFunction = new LinearScoringFunction(
          schema, searchQuery, antiGamingFilter, ThriftSearchResultType.RELEVANCE,
          userTable);
      scoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);

      return scoringFunction;
    }

    /**
     * For the model-based scoring function, a ClientException will be thrown if the client
     * selects an unknown model for the scoring manager.
     * {@link com.twitter.search.earlybird.search.relevance.scoring.ModelBasedScoringFunction}
     */
    private ScoringFunction createModelBased() throws IOException, ClientException {
      ModelBasedScoringFunction scoringFunction = new ModelBasedScoringFunction(
          schema, searchQuery, antiGamingFilter, ThriftSearchResultType.RELEVANCE, userTable,
          scoringModelsManager);
      scoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);

      return scoringFunction;
    }

    private ScoringFunction createTopTweets() throws IOException {
      return new LinearScoringFunction(
          schema, searchQuery, antiGamingFilter, ThriftSearchResultType.POPULAR, userTable);
    }

    private TensorflowBasedScoringFunction createTensorflowBased()
        throws IOException, ClientException {
      TensorflowBasedScoringFunction tfScoringFunction = new TensorflowBasedScoringFunction(
          request, schema, searchQuery, antiGamingFilter,
          ThriftSearchResultType.RELEVANCE, userTable, tensorflowModelsManager);
      tfScoringFunction.setHitAttributeHelperAndQuery(hitAttributeHelper, parsedQuery);
      return tfScoringFunction;
    }
  }
}
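For illustration, a short usage sketch (not part of the original file): query-cache style callers look providers up by name, while the request path builds a DefaultScoringFunctionProvider and lets getScoringFunction() pick the type from the ranking params. The helper below assumes the same package as the class above; "schema" is whatever ImmutableSchemaInterface the caller already has.

// Hypothetical caller-side helper, shown for illustration only.
final class ScoringFunctionProviderExample {
  static ScoringFunction retweetsScorerOrNull(
      com.twitter.search.common.schema.base.ImmutableSchemaInterface schema)
      throws java.io.IOException {
    ScoringFunctionProvider.NamedScoringFunctionProvider provider =
        ScoringFunctionProvider.getScoringFunctionProviderByName(
            ScoringFunctionProvider.RETWEETS_SCORER_NAME, schema);
    return provider == null ? null : provider.getScoringFunction();
  }
}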
Binary file not shown.
@ -1,85 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.io.IOException;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.search.Explanation;

import com.twitter.search.common.relevance.features.RelevanceSignalConstants;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

public class SpamVectorScoringFunction extends ScoringFunction {
  private static final int MIN_TWEEPCRED_WITH_LINK =
      EarlybirdConfig.getInt("min_tweepcred_with_non_whitelisted_link", 25);

  // The engagement threshold that prevents us from filtering users with low tweepcred.
  private static final int ENGAGEMENTS_NO_FILTER = 1;

  @VisibleForTesting
  static final float NOT_SPAM_SCORE = 0.5f;
  @VisibleForTesting
  static final float SPAM_SCORE = -0.5f;

  public SpamVectorScoringFunction(ImmutableSchemaInterface schema) {
    super(schema);
  }

  @Override
  protected float score(float luceneQueryScore) throws IOException {
    if (documentFeatures.isFlagSet(EarlybirdFieldConstant.FROM_VERIFIED_ACCOUNT_FLAG)) {
      return NOT_SPAM_SCORE;
    }

    int tweepCredThreshold = 0;
    if (documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_LINK_FLAG)
        && !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_IMAGE_URL_FLAG)
        && !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_VIDEO_URL_FLAG)
        && !documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_NEWS_URL_FLAG)) {
      // Contains a non-media, non-news link: a definite spam vector.
      tweepCredThreshold = MIN_TWEEPCRED_WITH_LINK;
    }

    int tweepcred = (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION);

    // For new users, tweepcred is set to a sentinel value of -128, specified at
    // src/thrift/com/twitter/search/common/indexing/status.thrift
    if (tweepcred >= tweepCredThreshold
        || tweepcred == (int) RelevanceSignalConstants.UNSET_REPUTATION_SENTINEL) {
      return NOT_SPAM_SCORE;
    }

    double retweetCount =
        documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.RETWEET_COUNT);
    double replyCount =
        documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.REPLY_COUNT);
    double favoriteCount =
        documentFeatures.getUnnormalizedFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT);

    // If the tweet has enough engagements, do not mark it as spam.
    if (retweetCount + replyCount + favoriteCount >= ENGAGEMENTS_NO_FILTER) {
      return NOT_SPAM_SCORE;
    }

    return SPAM_SCORE;
  }

  @Override
  protected Explanation doExplain(float luceneScore) {
    return null;
  }

  @Override
  public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
    return null;
  }

  @Override
  public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
  }
}
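For illustration, a self-contained restatement (not part of the original file) of the decision rule above with the feature reads replaced by plain parameters, to make the threshold interplay explicit. The parameter names and the inlined defaults (25 for the link threshold, -128 for the unset-reputation sentinel, 1 for the engagement floor) mirror the constants used above.

// Illustrative sketch of the score() decision flow; returns true when the tweet would get SPAM_SCORE.
final class SpamVectorDecisionExample {
  static boolean looksLikeSpam(boolean verified, boolean bareLink, int tweepcred, double engagements) {
    if (verified) {
      return false;                                       // verified accounts are never marked
    }
    int threshold = bareLink ? 25 : 0;                    // min_tweepcred_with_non_whitelisted_link default
    if (tweepcred >= threshold || tweepcred == -128) {    // -128 = unset-reputation sentinel for new users
      return false;
    }
    return engagements < 1;                               // ENGAGEMENTS_NO_FILTER
  }
}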
Binary file not shown.
@ -1,87 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Ideally, this part should live somewhere in the Cortex common
// code. Today, it is not possible to create
// a `SparseTensor` that relies only on ByteBuffer.
public class SparseTensor {

  private ByteBuffer sparseIndices;
  private ByteBuffer sparseValues;
  private ByteBuffer sparseShape;

  private int numDocs;
  private final long[] sparseShapeShapeDimension = new long[] {2L};
  // Note: "1 << 63" is an int shift, so this evaluates to Integer.MIN_VALUE widened to a long.
  private final long inputBitSize = 1 << 63;

  private long numRecordsSeen = 0;
  private final long numFeatures;
  private int numValuesSeen;

  public SparseTensor(int numDocs, int numFeatures) {
    this.numDocs = numDocs;
    this.numFeatures = (long) numFeatures;
    this.sparseValues =
        ByteBuffer
            .allocate(numFeatures * numDocs * Float.BYTES)
            .order(ByteOrder.LITTLE_ENDIAN);
    this.sparseIndices =
        ByteBuffer
            .allocate(2 * numFeatures * numDocs * Long.BYTES)
            .order(ByteOrder.LITTLE_ENDIAN);
    this.sparseShape =
        ByteBuffer
            .allocate(2 * Long.BYTES)
            .order(ByteOrder.LITTLE_ENDIAN);
  }

  public void incNumRecordsSeen() {
    numRecordsSeen++;
  }

  /**
   * Adds the given value to this tensor.
   */
  public void addValue(long featureId, float value) {
    sparseValues.putFloat(value);
    sparseIndices.putLong(numRecordsSeen);
    sparseIndices.putLong(featureId);
    numValuesSeen++;
  }

  public ByteBuffer getSparseValues() {
    sparseValues.limit(numValuesSeen * Float.BYTES);
    sparseValues.rewind();
    return sparseValues;
  }

  public long[] getSparseValuesShape() {
    return new long[] {numValuesSeen};
  }

  public long[] getSparseIndicesShape() {
    return new long[] {numValuesSeen, 2L};
  }

  public long[] getSparseShapeShape() {
    return sparseShapeShapeDimension;
  }

  public ByteBuffer getSparseIndices() {
    sparseIndices.limit(2 * numValuesSeen * Long.BYTES);
    sparseIndices.rewind();
    return sparseIndices;
  }

  /**
   * Returns the sparse shape for this tensor.
   */
  public ByteBuffer getSparseShape() {
    sparseShape.putLong(numRecordsSeen);
    sparseShape.putLong(inputBitSize);
    sparseShape.rewind();
    return sparseShape;
  }
}
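For illustration, a small worked example (not part of the original file) of the COO layout this class produces, assuming the same package as above. Each addValue() appends one float to the values buffer and a (record, featureId) pair to the indices buffer; incNumRecordsSeen() advances the record index, and getSparseShape() reports [numRecordsSeen, inputBitSize] as the dense shape.

// Two documents, with two and one features respectively; mirrors how
// TensorflowBasedScoringFunction fills the tensor.
final class SparseTensorExample {
  public static void main(String[] args) {
    SparseTensor tensor = new SparseTensor(2, 4);

    tensor.addValue(0L, 1.0f);   // doc 0, ML API feature id 0
    tensor.addValue(3L, 0.25f);  // doc 0, ML API feature id 3
    tensor.incNumRecordsSeen();

    tensor.addValue(1L, 7.0f);   // doc 1, ML API feature id 1
    tensor.incNumRecordsSeen();

    // values shape is [3]; indices shape is [3, 2], with rows (record, featureId):
    // (0, 0), (0, 3), (1, 1)
    System.out.println(java.util.Arrays.toString(tensor.getSparseValuesShape()));   // [3]
    System.out.println(java.util.Arrays.toString(tensor.getSparseIndicesShape()));  // [3, 2]
  }
}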
Binary file not shown.
@ -1,339 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import java.io.IOException;
import java.nio.FloatBuffer;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ImmutableMap;

import org.apache.lucene.search.Explanation;
import org.tensorflow.Tensor;

import com.twitter.common.collections.Pair;
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
import com.twitter.search.common.features.EarlybirdRankingDerivedFeature;
import com.twitter.search.common.features.FeatureHandler;
import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures;
import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager;
import com.twitter.search.earlybird.EarlybirdSearcher;
import com.twitter.search.earlybird.common.userupdates.UserTable;
import com.twitter.search.earlybird.exception.ClientException;
import com.twitter.search.earlybird.search.AntiGamingFilter;
import com.twitter.search.earlybird.search.relevance.LinearScoringData;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.modeling.common.TweetFeaturesUtils;
import com.twitter.tfcompute_java.TFModelRunner;

/**
 * TensorflowBasedScoringFunction relies on a TF model for scoring tweets.
 * Only the `batchScore` part is implemented.
 */
public class TensorflowBasedScoringFunction extends FeatureBasedScoringFunction {
  private final TFModelRunner tfModelRunner;

  // See https://stackoverflow.com/questions/37849322/how-to-understand-the-term-tensor-in-tensorflow
  // for more information on this notation - in short, a TF graph is made
  // of TF operations and doesn't have a first-order notion of tensors.
  // The notation <operation>:<index> maps to the <index>-th output of the
  // <operation> contained in the TF graph.
  private static final String INPUT_VALUES = "input_sparse_tensor_values:0";
  private static final String INPUT_INDICES = "input_sparse_tensor_indices:0";
  private static final String INPUT_SHAPE = "input_sparse_tensor_shape:0";
  private static final String OUTPUT_NODE = "output_scores:0";

  private final Map<Integer, Long> featureSchemaIdToMlApiId;
  private final Map<Long, Float> tweetIdToScoreMap = new HashMap<>();
  private final EarlybirdRequest request;

  public TensorflowBasedScoringFunction(
      EarlybirdRequest request,
      ImmutableSchemaInterface schema,
      ThriftSearchQuery searchQuery,
      AntiGamingFilter antiGamingFilter,
      ThriftSearchResultType searchResultType,
      UserTable userTable,
      TensorflowModelsManager tensorflowModelsManager
  ) throws IOException, ClientException {
    super(
        "TensorflowBasedScoringFunction",
        schema,
        searchQuery,
        antiGamingFilter,
        searchResultType,
        userTable
    );
    this.request = request;
    String modelName = searchQuery.getRelevanceOptions().getRankingParams().selectedTensorflowModel;
    this.featureSchemaIdToMlApiId = tensorflowModelsManager.getFeatureSchemaIdToMlApiId();

    if (modelName == null) {
      throw new ClientException("Scoring type is TENSORFLOW_BASED but no model was selected");
    } else if (!tensorflowModelsManager.getModel(modelName).isPresent()) {
      throw new ClientException(
          "Scoring type is TENSORFLOW_BASED. Model "
              + modelName
              + " is not present."
      );
    }

    if (searchQuery.getRelevanceOptions().getRankingParams().isEnableHitDemotion()) {
      throw new ClientException(
          "Hit attribute demotion is not supported with the TENSORFLOW_BASED scoring type");
    }

    tfModelRunner = tensorflowModelsManager.getModel(modelName).get();
  }

  /**
   * Single item scoring just returns the Lucene score, to be used during the batching phase.
   */
  @Override
  protected float score(float luceneQueryScore) {
    return luceneQueryScore;
  }

  @Override
  public Pair<LinearScoringData, ThriftSearchResultFeatures> collectFeatures(
      float luceneQueryScore) throws IOException {
    LinearScoringData linearScoringData = updateLinearScoringData(luceneQueryScore);
    ThriftSearchResultFeatures features =
        createFeaturesForDocument(linearScoringData, true).getFeatures();

    return new Pair<>(linearScoringData, features);
  }

  @Override
  protected FeatureHandler createFeaturesForDocument(
      LinearScoringData linearScoringData,
      boolean ignoreDefaultValues) throws IOException {
    return super.createFeaturesForDocument(linearScoringData,
        ignoreDefaultValues)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TREND_CLICK,
            request.querySource == ThriftQuerySource.TREND_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TYPED_QUERY,
            request.querySource == ThriftQuerySource.TYPED_QUERY)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_TYPEAHEAD_CLICK,
            request.querySource == ThriftQuerySource.TYPEAHEAD_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_HASHTAG_CLICK,
            request.querySource == ThriftQuerySource.RECENT_SEARCH_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_RECENT_SEARCH_CLICK,
            request.querySource == ThriftQuerySource.RECENT_SEARCH_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_PROFILE_CLICK,
            request.querySource == ThriftQuerySource.PROFILE_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_API_CALL,
            request.querySource == ThriftQuerySource.API_CALL)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_PROMOTED_TREND_CLICK,
            request.querySource == ThriftQuerySource.PROMOTED_TREND_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SAVED_SEARCH_CLICK,
            request.querySource == ThriftQuerySource.SAVED_SEARCH_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_CASHTAG_CLICK,
            request.querySource == ThriftQuerySource.CASHTAG_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SPELLING_EXPANSION_REVERT_CLICK,
            request.querySource == ThriftQuerySource.SPELLING_EXPANSION_REVERT_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_SPELLING_SUGGESTION_CLICK,
            request.querySource == ThriftQuerySource.SPELLING_SUGGESTION_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_LOGGED_OUT_HOME_TREND_CLICK,
            request.querySource == ThriftQuerySource.LOGGED_OUT_HOME_TREND_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_RELATED_QUERY_CLICK,
            request.querySource == ThriftQuerySource.RELATED_QUERY_CLICK)
        .addBoolean(EarlybirdRankingDerivedFeature.QUERY_SOURCE_AUTO_SPELL_CORRECT_REVERT_CLICK,
            request.querySource == ThriftQuerySource.AUTO_SPELL_CORRECT_REVERT_CLICK);
  }

  /**
   * Returns the scores computed in batchScore() if forExplanation is true.
   */
  @Override
  protected double computeScore(LinearScoringData data, boolean forExplanation) {
    Preconditions.checkState(forExplanation,
        "forExplanation is false. computeScore() should only be used for explanation creation");
    return tweetIdToScoreMap.get(tweetIDMapper.getTweetID(getCurrentDocID()));
  }

  @Override
  protected void generateExplanationForScoring(
      LinearScoringData scoringData, boolean isHit, List<Explanation> details) {
  }

  @VisibleForTesting
  SparseTensor createInputTensor(ThriftSearchResultFeatures[] featuresForDocs) {
    // Moving this allocation outside of the request path
    // would reduce the allocation cost and make the `ByteBuffer`s
    // long lived - we would need one per thread.
    SparseTensor sparseTensor =
        new SparseTensor(featuresForDocs.length, featureSchemaIdToMlApiId.size());
    for (ThriftSearchResultFeatures features : featuresForDocs) {
      updateSparseTensor(sparseTensor, features);
    }
    return sparseTensor;
  }

  private void addSchemaBooleanFeatures(SparseTensor sparseTensor,
                                        Map<Integer, Boolean> booleanMap) {
    if (booleanMap == null || booleanMap.isEmpty()) {
      return;
    }
    for (Map.Entry<Integer, Boolean> entry : booleanMap.entrySet()) {
      Preconditions.checkState(featureSchemaIdToMlApiId.containsKey(entry.getKey()));
      sparseTensor.addValue(
          featureSchemaIdToMlApiId.get(entry.getKey()), entry.getValue() ? 1f : 0f);
    }
  }

  private void addSchemaContinuousFeatures(SparseTensor sparseTensor,
                                           Map<Integer, ? extends Number> valueMap) {
    if (valueMap == null || valueMap.isEmpty()) {
      return;
    }
    for (Map.Entry<Integer, ? extends Number> entry : valueMap.entrySet()) {
      Integer id = entry.getKey();
      // SEARCH-26795
      if (!TweetFeaturesUtils.isFeatureDiscrete(id)) {
        Preconditions.checkState(featureSchemaIdToMlApiId.containsKey(id));
        sparseTensor.addValue(
            featureSchemaIdToMlApiId.get(id), entry.getValue().floatValue());
      }
    }
  }

  private void updateSparseTensor(SparseTensor sparseTensor, ThriftSearchResultFeatures features) {
    addSchemaBooleanFeatures(sparseTensor, features.getBoolValues());
    addSchemaContinuousFeatures(sparseTensor, features.getIntValues());
    addSchemaContinuousFeatures(sparseTensor, features.getLongValues());
    addSchemaContinuousFeatures(sparseTensor, features.getDoubleValues());

    sparseTensor.incNumRecordsSeen();
  }

  private float[] batchScoreInternal(ThriftSearchResultFeatures[] featuresForDocs) {
    int nbDocs = featuresForDocs.length;
    float[] backingArrayResults = new float[nbDocs];
    SparseTensor sparseTensor = createInputTensor(featuresForDocs);
    Tensor<?> sparseValues =
        Tensor.create(
            Float.class,
            sparseTensor.getSparseValuesShape(),
            sparseTensor.getSparseValues());
    Tensor<?> sparseIndices =
        Tensor.create(
            Long.class,
            sparseTensor.getSparseIndicesShape(),
            sparseTensor.getSparseIndices());
    Tensor<?> sparseShape =
        Tensor.create(
            Long.class,
            sparseTensor.getSparseShapeShape(),
            sparseTensor.getSparseShape());
    Map<String, Tensor<?>> inputMap = ImmutableMap.of(
        INPUT_VALUES, sparseValues,
        INPUT_INDICES, sparseIndices,
        INPUT_SHAPE, sparseShape
    );
    List<String> output = ImmutableList.of(OUTPUT_NODE);

    Map<String, Tensor<?>> outputs = tfModelRunner.run(
        inputMap,
        output,
        ImmutableList.of()
    );
    Tensor<?> outputTensor = outputs.get(OUTPUT_NODE);
    try {
      FloatBuffer finalResultBuffer =
          FloatBuffer.wrap(backingArrayResults, 0, nbDocs);

      outputTensor.writeTo(finalResultBuffer);
    } finally {
      // Close tensors to avoid memory leaks.
      sparseValues.close();
      sparseIndices.close();
      sparseShape.close();
      if (outputTensor != null) {
        outputTensor.close();
      }
    }
    return backingArrayResults;
  }

  /**
   * Computes the score for a list of hits. Not thread safe.
   * @return Array of scores
   */
  @Override
  public float[] batchScore(List<BatchHit> hits) throws IOException {
    ThriftSearchResultFeatures[] featuresForDocs = new ThriftSearchResultFeatures[hits.size()];

    for (int i = 0; i < hits.size(); i++) {
      // This is a gigantic allocation, but the models are trained to depend on unset values having
      // a default.
      BatchHit hit = hits.get(i);
      ThriftSearchResultFeatures features = hit.getFeatures().deepCopy();

      // Adjust the features of a hit based on overrides provided by the relevance options. Should
      // mostly be used for debugging purposes.
      adjustHitScoringFeatures(hit, features);

      setDefaultFeatureValues(features);
      featuresForDocs[i] = features;
    }

    float[] scores = batchScoreInternal(featuresForDocs);
    float[] finalScores = new float[hits.size()];

    for (int i = 0; i < hits.size(); i++) {
      LinearScoringData data = hits.get(i).getScoringData();
      if (data.skipReason != null && data.skipReason != LinearScoringData.SkipReason.NOT_SKIPPED) {
        // If the hit should be skipped, overwrite the score with SKIP_HIT.
        scores[i] = SKIP_HIT;
      }

      // If explanations are enabled, add the scores to the map. They will be used in computeScore().
      if (EarlybirdSearcher.explanationsEnabled(debugMode)) {
        tweetIdToScoreMap.put(hits.get(i).getTweetID(), scores[i]);
      }

      finalScores[i] = postScoreComputation(
          data,
          scores[i],
          false, // cannot get the hit attribution info for this hit at this point in time
          null);
    }
    return finalScores;
  }

  private void adjustHitScoringFeatures(BatchHit hit, ThriftSearchResultFeatures features) {
    if (request.isSetSearchQuery() && request.getSearchQuery().isSetRelevanceOptions()) {
      ThriftSearchRelevanceOptions relevanceOptions =
          request.getSearchQuery().getRelevanceOptions();

      if (relevanceOptions.isSetPerTweetFeaturesOverride()
          && relevanceOptions.getPerTweetFeaturesOverride().containsKey(hit.getTweetID())) {
        overrideFeatureValues(
            features,
            relevanceOptions.getPerTweetFeaturesOverride().get(hit.getTweetID()));
      }

      if (relevanceOptions.isSetPerUserFeaturesOverride()
          && relevanceOptions.getPerUserFeaturesOverride().containsKey(
              hit.getScoringData().fromUserId)) {
        overrideFeatureValues(
            features,
            relevanceOptions.getPerUserFeaturesOverride().get(hit.getScoringData().fromUserId));
      }

      if (relevanceOptions.isSetGlobalFeaturesOverride()) {
        overrideFeatureValues(
            features, relevanceOptions.getGlobalFeaturesOverride());
      }
    }
  }
}
Binary file not shown.
@ -1,52 +0,0 @@
package com.twitter.search.earlybird.search.relevance.scoring;

import org.apache.lucene.search.Explanation;

import com.twitter.search.common.schema.base.ImmutableSchemaInterface;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata;
import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions;
import com.twitter.search.earlybird.thrift.ThriftSearchResultType;
import com.twitter.search.earlybird.thrift.ThriftSearchResultsRelevanceStats;

/**
 * A dummy scoring function for tests; the score is always tweetId/10000.0.
 * Since the score_filter: operator requires all scores to be between [0, 1], if you want to use
 * this with it, don't use any tweet id larger than 10000 in your test.
 */
public class TestScoringFunction extends ScoringFunction {
  private ThriftSearchResultMetadata metadata = null;
  private float score;

  public TestScoringFunction(ImmutableSchemaInterface schema) {
    super(schema);
  }

  @Override
  protected float score(float luceneQueryScore) {
    long tweetId = tweetIDMapper.getTweetID(getCurrentDocID());
    this.score = (float) (tweetId / 10000.0);
    System.out.println(String.format("score for tweet %10d is %6.3f", tweetId, score));
    return this.score;
  }

  @Override
  protected Explanation doExplain(float luceneScore) {
    return null;
  }

  @Override
  public ThriftSearchResultMetadata getResultMetadata(ThriftSearchResultMetadataOptions options) {
    if (metadata == null) {
      metadata = new ThriftSearchResultMetadata()
          .setResultType(ThriftSearchResultType.RELEVANCE)
          .setPenguinVersion(EarlybirdConfig.getPenguinVersionByte());
      metadata.setScore(score);
    }
    return metadata;
  }

  @Override
  public void updateRelevanceStats(ThriftSearchResultsRelevanceStats relevanceStats) {
  }
}
Binary file not shown.
@ -1,62 +0,0 @@
package com.twitter.search.earlybird.segment;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.base.Segment;
import com.twitter.search.common.util.io.dl.DLReaderWriterFactory;
import com.twitter.search.common.util.io.dl.SegmentDLUtil;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

/**
 * An implementation of SegmentDataProvider using DistributedLog.
 */
public class DLSegmentDataProvider implements SegmentDataProvider {
  private final int hashPartitionID;
  private final DLReaderWriterFactory dlFactory;
  private final SegmentDataReaderSet readerSet;

  public DLSegmentDataProvider(
      int hashPartitionID,
      EarlybirdIndexConfig earlybirdIndexConfig,
      DLReaderWriterFactory dlReaderWriterFactory) throws IOException {
    this(hashPartitionID, earlybirdIndexConfig, dlReaderWriterFactory,
        Clock.SYSTEM_CLOCK);
  }

  public DLSegmentDataProvider(
      int hashPartitionID,
      EarlybirdIndexConfig earlybirdIndexConfig,
      DLReaderWriterFactory dlReaderWriterFactory,
      Clock clock) throws IOException {
    this.hashPartitionID = hashPartitionID;
    this.dlFactory = dlReaderWriterFactory;
    this.readerSet = new DLSegmentDataReaderSet(
        dlFactory,
        earlybirdIndexConfig,
        clock);
  }

  @Override
  public SegmentDataReaderSet getSegmentDataReaderSet() {
    return readerSet;
  }

  @Override
  public List<Segment> newSegmentList() throws IOException {
    Set<String> segmentNames = SegmentDLUtil.getSegmentNames(dlFactory, null, hashPartitionID);
    List<Segment> segmentList = new ArrayList<>(segmentNames.size());
    for (String segmentName : segmentNames) {
      Segment segment = Segment.fromSegmentName(segmentName, EarlybirdConfig.getMaxSegmentSize());
      segmentList.add(segment);
    }
    // Sort the segments by ID.
    Collections.sort(segmentList);
    return segmentList;
  }
}
Binary file not shown.
@ -1,237 +0,0 @@
package com.twitter.search.earlybird.segment;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Optional;
import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Preconditions;

import org.apache.thrift.TException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchRequestStats;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.ReaderWithStatsFactory;
import com.twitter.search.common.util.io.TransformingRecordReader;
import com.twitter.search.common.util.io.dl.DLMultiStreamReader;
import com.twitter.search.common.util.io.dl.DLReaderWriterFactory;
import com.twitter.search.common.util.io.dl.DLTimestampedReaderFactory;
import com.twitter.search.common.util.io.dl.SegmentDLUtil;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReaderFactory;
import com.twitter.search.common.util.thrift.ThriftUtils;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.SegmentInfo;

public class DLSegmentDataReaderSet implements SegmentDataReaderSet {
  private static final Logger LOG = LoggerFactory.getLogger(DLSegmentDataReaderSet.class);

  public static final SearchRequestStats STATUS_DL_READ_STATS =
      SearchRequestStats.export("status_dlreader", TimeUnit.MICROSECONDS, false);
  private static final SearchRequestStats UPDATE_EVENT_DL_READ_STATS =
      SearchRequestStats.export("update_events_dlreader", TimeUnit.MICROSECONDS, false);
  // The number of tweets not indexed because they failed deserialization.
  private static final SearchCounter STATUS_SKIPPED_DUE_TO_FAILED_DESERIALIZATION_COUNTER =
      SearchCounter.export("statuses_skipped_due_to_failed_deserialization");

  @VisibleForTesting
  public static final int FRESH_READ_THRESHOLD = (int) TimeUnit.MINUTES.toMillis(1);

  private final int documentReadFreshnessThreshold =
      EarlybirdConfig.getInt("documents_reader_freshness_threshold_millis", 10000);
  private final int updateReadFreshnessThreshold =
      EarlybirdConfig.getInt("updates_freshness_threshold_millis", FRESH_READ_THRESHOLD);
  private final int dlReaderVersion = EarlybirdConfig.getInt("dl_reader_version");

  private final DLReaderWriterFactory dlFactory;
  private final RecordReaderFactory<byte[]> dlUpdateEventsFactory;
  private final EarlybirdIndexConfig indexConfig;
  private final Clock clock;

  private RecordReader<TweetDocument> documentReader;

  // RecordReaders for update events that span all live segments.
  private final RecordReader<ThriftVersionedEvents> updateEventsReader;
  private final DLMultiStreamReader updateEventsMultiReader;
  private final Map<Long, RecordReader<ThriftVersionedEvents>> updateEventReaders = new HashMap<>();

  DLSegmentDataReaderSet(
      DLReaderWriterFactory dlFactory,
      final EarlybirdIndexConfig indexConfig,
      Clock clock) throws IOException {
    this.dlFactory = dlFactory;
    this.indexConfig = indexConfig;
    this.clock = clock;

    this.dlUpdateEventsFactory = new ReaderWithStatsFactory(
        new DLTimestampedReaderFactory(dlFactory, clock, updateReadFreshnessThreshold),
        UPDATE_EVENT_DL_READ_STATS);
    this.updateEventsMultiReader =
        new DLMultiStreamReader("update_events", dlUpdateEventsFactory, true, clock);
    this.updateEventsReader =
        new TransformingRecordReader<>(updateEventsMultiReader, record ->
            (record != null) ? deserializeTVE(record.getBytes()) : null);

    SearchCustomGauge.export("open_dl_update_events_streams", updateEventReaders::size);
  }

  private ThriftVersionedEvents deserializeTVE(byte[] bytes) {
    ThriftVersionedEvents event = new ThriftVersionedEvents();
    try {
      ThriftUtils.fromCompactBinaryFormat(bytes, event);
      return event;
    } catch (TException e) {
      LOG.error("error deserializing TVE", e);
      return null;
    }
  }

  @Override
  public void attachDocumentReaders(SegmentInfo segmentInfo) throws IOException {
    // Close any document reader left open before.
    if (documentReader != null) {
      LOG.warn("Previous documentReader not closed: {}", documentReader);
      completeSegmentDocs(segmentInfo);
    }
    documentReader = newDocumentReader(segmentInfo);
  }

  @Override
  public void attachUpdateReaders(SegmentInfo segmentInfo) throws IOException {
    if (updateEventsMultiReader == null) {
      return;
    }

    String segmentName = segmentInfo.getSegmentName();
    if (getUpdateEventsReaderForSegment(segmentInfo) != null) {
      LOG.info("Update events reader for segment {} is already attached.", segmentName);
      return;
    }

    long updateEventStreamOffsetTimestamp = segmentInfo.getUpdatesStreamOffsetTimestamp();
    LOG.info("Attaching update events reader for segment {} with timestamp: {}.",
        segmentName, updateEventStreamOffsetTimestamp);

    String topic = SegmentDLUtil.getDLTopicForUpdateEvents(segmentName, dlReaderVersion);
    RecordReader<byte[]> recordReader =
        dlUpdateEventsFactory.newRecordReaderForTimestamp(topic, updateEventStreamOffsetTimestamp);
    updateEventsMultiReader.addRecordReader(recordReader, topic);
    updateEventReaders.put(segmentInfo.getTimeSliceID(),
        new TransformingRecordReader<>(recordReader, this::deserializeTVE));
  }

  @Override
  public void stopAll() {
    if (documentReader != null) {
      documentReader.close();
    }
    if (updateEventsReader != null) {
      updateEventsReader.close();
    }
    try {
      dlFactory.close();
    } catch (IOException e) {
      LOG.error("Exception while closing DL factory", e);
    }
  }

  @Override
  public void completeSegmentDocs(SegmentInfo segmentInfo) {
    if (documentReader != null) {
      documentReader.close();
      documentReader = null;
    }
  }

  @Override
  public void stopSegmentUpdates(SegmentInfo segmentInfo) {
    if (updateEventsMultiReader != null) {
      updateEventsMultiReader.removeStream(
          SegmentDLUtil.getDLTopicForUpdateEvents(segmentInfo.getSegmentName(), dlReaderVersion));
      updateEventReaders.remove(segmentInfo.getTimeSliceID());
    }
  }

  @Override
  public RecordReader<TweetDocument> newDocumentReader(SegmentInfo segmentInfo) throws IOException {
    String topic = SegmentDLUtil.getDLTopicForTweets(segmentInfo.getSegmentName(),
        EarlybirdConfig.getPenguinVersion(), dlReaderVersion);
    final long timeSliceId = segmentInfo.getTimeSliceID();
    final DocumentFactory<ThriftIndexingEvent> docFactory = indexConfig.createDocumentFactory();

    // Create the underlying DLRecordReader wrapped with the tweet reader stats.
    RecordReader<byte[]> dlReader = new ReaderWithStatsFactory(
        new DLTimestampedReaderFactory(
            dlFactory,
            clock,
            documentReadFreshnessThreshold),
        STATUS_DL_READ_STATS)
        .newRecordReader(topic);

    // Create the wrapped reader which transforms serialized byte[] to TweetDocument.
    return new TransformingRecordReader<>(
        dlReader,
        new Function<byte[], TweetDocument>() {
          @Override
          public TweetDocument apply(byte[] input) {
            ThriftIndexingEvent event = new ThriftIndexingEvent();
            try {
              ThriftUtils.fromCompactBinaryFormat(input, event);
            } catch (TException e) {
              LOG.error("Could not deserialize status document", e);
              STATUS_SKIPPED_DUE_TO_FAILED_DESERIALIZATION_COUNTER.increment();
              return null;
            }

            Preconditions.checkNotNull(event.getDocument());
            return new TweetDocument(
                docFactory.getStatusId(event),
                timeSliceId,
                EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
                docFactory.newDocument(event));
          }
        });
  }

  @Override
  public RecordReader<TweetDocument> getDocumentReader() {
    return documentReader;
  }

  @Override
  public RecordReader<ThriftVersionedEvents> getUpdateEventsReader() {
    return updateEventsReader;
  }

  @Override
  public RecordReader<ThriftVersionedEvents> getUpdateEventsReaderForSegment(
      SegmentInfo segmentInfo) {
    return updateEventReaders.get(segmentInfo.getTimeSliceID());
  }

  @Override
  public Optional<Long> getUpdateEventsStreamOffsetForSegment(SegmentInfo segmentInfo) {
    String topic =
        SegmentDLUtil.getDLTopicForUpdateEvents(segmentInfo.getSegmentName(), dlReaderVersion);
    return updateEventsMultiReader.getUnderlyingOffsetForSegmentWithTopic(topic);
  }

  @Override
  public boolean allCaughtUp() {
    return ((getDocumentReader() == null) || getDocumentReader().isCaughtUp())
        && ((getUpdateEventsReader() == null) || getUpdateEventsReader().isCaughtUp());
  }
}
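To make the lifecycle above easier to follow (this note is not part of the removed file): a hedged sketch of the per-segment sequence implied by the reader set, using only methods that appear in this class; the real indexing loop that drains the readers lives elsewhere in Earlybird.

// Illustrative only: attach, drain until caught up, then detach.
readerSet.attachDocumentReaders(segmentInfo);   // opens the tweet (status) DL stream
readerSet.attachUpdateReaders(segmentInfo);     // opens the update-events DL stream

while (!readerSet.allCaughtUp()) {
  // drain readerSet.getDocumentReader() and readerSet.getUpdateEventsReader() here
}

readerSet.completeSegmentDocs(segmentInfo);     // closes and clears the document reader
readerSet.stopSegmentUpdates(segmentInfo);      // removes this segment's update stream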
Binary file not shown.
@ -1,72 +0,0 @@
package com.twitter.search.earlybird.segment;

import java.util.Optional;

import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;
import com.twitter.search.common.util.io.EmptyRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.SegmentInfo;

/**
 * A SegmentDataReaderSet that returns no data. Uses a DocumentReader that is
 * always caught up, but never gets exhausted.
 * Can be used for bringing up an earlybird against a static set of segments,
 * and will not incorporate any new updates.
 */
public class EmptySegmentDataReaderSet implements SegmentDataReaderSet {
  public static final EmptySegmentDataReaderSet INSTANCE = new EmptySegmentDataReaderSet();

  @Override
  public void attachDocumentReaders(SegmentInfo segmentInfo) {
  }

  @Override
  public void attachUpdateReaders(SegmentInfo segmentInfo) {
  }

  @Override
  public void completeSegmentDocs(SegmentInfo segmentInfo) {
  }

  @Override
  public void stopSegmentUpdates(SegmentInfo segmentInfo) {
  }

  @Override
  public void stopAll() {
  }

  @Override
  public boolean allCaughtUp() {
    // ALWAYS CAUGHT UP
    return true;
  }

  @Override
  public RecordReader<TweetDocument> newDocumentReader(SegmentInfo segmentInfo)
      throws Exception {
    return null;
  }

  @Override
  public RecordReader<TweetDocument> getDocumentReader() {
    return new EmptyRecordReader<>();
  }

  @Override
  public RecordReader<ThriftVersionedEvents> getUpdateEventsReader() {
    return null;
  }

  @Override
  public RecordReader<ThriftVersionedEvents> getUpdateEventsReaderForSegment(
      SegmentInfo segmentInfo) {
    return null;
  }

  @Override
  public Optional<Long> getUpdateEventsStreamOffsetForSegment(SegmentInfo segmentInfo) {
    return Optional.of(0L);
  }
}
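The javadoc above says this class is meant for serving a static set of segments; the following is a hypothetical sketch (not part of the removed file) of a provider wired to it. The SegmentProvider parent interface is not shown in this diff, so the sketch assumes newSegmentList() is the only method it requires.

// Hypothetical provider that serves a fixed list of segments and never tails new data.
public class StaticSegmentDataProvider implements SegmentDataProvider {
  private final List<Segment> segments;

  public StaticSegmentDataProvider(List<Segment> segments) {
    this.segments = segments;
  }

  @Override
  public SegmentDataReaderSet getSegmentDataReaderSet() {
    return EmptySegmentDataReaderSet.INSTANCE;
  }

  @Override
  public List<Segment> newSegmentList() {
    return segments;
  }
}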
Binary file not shown.
@ -1,14 +0,0 @@
package com.twitter.search.earlybird.segment;

/**
 * SegmentDataProvider provides information about available segments for indexing. This interface
 * abstracts away the actual source of the segment data. It might be a MySQL database, a mock
 * object, or a directory of flat files. It also provides access to the segmentInfoMap itself, which
 * contains information about the indexing state of Segments.
 */
public interface SegmentDataProvider extends SegmentProvider {
  /**
   * Returns the set of segment data record readers.
   */
  SegmentDataReaderSet getSegmentDataReaderSet();
}
Some files were not shown because too many files have changed in this diff.