the-algorithm/src/java/com/twitter/search/earlybird/search/queries/SinceMaxIDFilter.java
twitter-team ef4c5eb65e Twitter Recommendation Algorithm
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
2023-03-31 17:36:31 -05:00

212 lines
8.1 KiB
Java

package com.twitter.search.earlybird.search.queries;
import java.io.IOException;
import com.google.common.annotations.VisibleForTesting;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;
import com.twitter.search.common.query.DefaultFilterWeight;
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
import com.twitter.search.earlybird.index.TweetIDMapper;
/**
 * A Lucene {@link Query} that filters tweet ids according to the since_id and max_id parameters.
 *
 * <p>Note that since_id is exclusive and max_id is inclusive. Passing {@link #NO_FILTER} for
 * either bound leaves that side of the range open.
 */
public final class SinceMaxIDFilter extends Query {
  /** Sentinel value meaning "no bound on this side of the range". */
  public static final long NO_FILTER = -1;

  private final long sinceIdExclusive;
  private final long maxIdInclusive;

  /**
   * Builds a query restricted to tweet IDs in {@code (sinceIdExclusive, maxIdInclusive]}.
   *
   * @param sinceIdExclusive exclusive lower bound on the tweet ID, or {@link #NO_FILTER}
   * @param maxIdInclusive inclusive upper bound on the tweet ID, or {@link #NO_FILTER}
   * @return a {@link BooleanQuery} with a single FILTER clause wrapping this filter
   */
  public static Query getSinceMaxIDQuery(long sinceIdExclusive, long maxIdInclusive) {
    return new BooleanQuery.Builder()
        .add(new SinceMaxIDFilter(sinceIdExclusive, maxIdInclusive), BooleanClause.Occur.FILTER)
        .build();
  }

  /**
   * Builds a query restricted to tweet IDs strictly greater than {@code sinceIdExclusive}.
   *
   * @param sinceIdExclusive exclusive lower bound on the tweet ID
   * @return a {@link BooleanQuery} with a single FILTER clause wrapping this filter
   */
  public static Query getSinceIDQuery(long sinceIdExclusive) {
    return getSinceMaxIDQuery(sinceIdExclusive, NO_FILTER);
  }

  /**
   * Builds a query restricted to tweet IDs less than or equal to {@code maxIdInclusive}.
   *
   * @param maxIdInclusive inclusive upper bound on the tweet ID
   * @return a {@link BooleanQuery} with a single FILTER clause wrapping this filter
   */
  public static Query getMaxIDQuery(long maxIdInclusive) {
    return getSinceMaxIDQuery(NO_FILTER, maxIdInclusive);
  }

  private SinceMaxIDFilter(long sinceIdExclusive, long maxIdInclusive) {
    this.sinceIdExclusive = sinceIdExclusive;
    this.maxIdInclusive = maxIdInclusive;
  }

  @Override
  public int hashCode() {
    // Fold all 64 bits of each bound into the hash. A plain narrowing cast of the combined long
    // would keep only the low 32 bits, so IDs differing only in their upper half would collide.
    return 31 * Long.hashCode(sinceIdExclusive) + Long.hashCode(maxIdInclusive);
  }

  @Override
  public boolean equals(Object obj) {
    if (!(obj instanceof SinceMaxIDFilter)) {
      return false;
    }
    SinceMaxIDFilter other = (SinceMaxIDFilter) obj;
    return sinceIdExclusive == other.sinceIdExclusive
        && maxIdInclusive == other.maxIdInclusive;
  }

  @Override
  public String toString(String field) {
    boolean hasSinceId = sinceIdExclusive != NO_FILTER;
    boolean hasMaxId = maxIdInclusive != NO_FILTER;
    if (hasSinceId && hasMaxId) {
      return "SinceIdFilter:" + sinceIdExclusive + ",MaxIdFilter:" + maxIdInclusive;
    }
    if (hasMaxId) {
      return "MaxIdFilter:" + maxIdInclusive;
    }
    // Also reached when neither bound is set; the since id is then printed as NO_FILTER.
    return "SinceIdFilter:" + sinceIdExclusive;
  }

  /**
   * Determines if this segment is at least partially covered by the given tweet ID range.
   *
   * @param tweetIdMapper mapper supplying the segment's minimum and maximum tweet IDs
   * @param sinceIdExclusive exclusive lower bound, or {@link #NO_FILTER}
   * @param maxIdInclusive inclusive upper bound, or {@link #NO_FILTER}
   * @return false if the range cannot overlap the segment, true otherwise
   */
  public static boolean sinceMaxIDsInRange(
      TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
    // Check for since id out of range. Note that since this ID is exclusive,
    // equality is out of range too.
    if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMaxTweetID()) {
      return false;
    }
    // Check for max id in range.
    return maxIdInclusive == NO_FILTER || maxIdInclusive >= tweetIdMapper.getMinTweetID();
  }

  // Returns true if this segment is completely covered by these id filters.
  private static boolean sinceMaxIdsCoverRange(
      TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
    // Check for since_id specified AND since_id newer than (or equal to) the first tweet.
    if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMinTweetID()) {
      return false;
    }
    // Check for max id in range.
    // NOTE(review): max_id is inclusive, so ">=" would presumably also cover the whole segment.
    // The strict comparison is kept to preserve existing behavior -- confirm before relaxing.
    return maxIdInclusive == NO_FILTER || maxIdInclusive > tweetIdMapper.getMaxTweetID();
  }

  /**
   * Creates a filter weight whose per-segment iterator is chosen from the segment's tweet ID
   * range: null when the segment is out of range, a match-all iterator when the segment is fully
   * covered (or the reader is not an Earlybird segment reader), and a range-checking iterator
   * otherwise.
   */
  @Override
  public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
      throws IOException {
    return new DefaultFilterWeight(this) {
      @Override
      protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
        LeafReader reader = context.reader();
        if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
          return new AllDocsIterator(reader);
        }

        EarlybirdIndexSegmentAtomicReader twitterInMemoryIndexReader =
            (EarlybirdIndexSegmentAtomicReader) reader;
        TweetIDMapper tweetIdMapper =
            (TweetIDMapper) twitterInMemoryIndexReader.getSegmentData().getDocIDToTweetIDMapper();

        // Important to return a null DocIdSetIterator here, so the Scorer will skip searching
        // this segment completely.
        if (!sinceMaxIDsInRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
          return null;
        }

        // Optimization: just return a match-all iterator when the whole segment is in range.
        // This avoids having to do so many status id lookups.
        if (sinceMaxIdsCoverRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
          return new AllDocsIterator(reader);
        }

        return new SinceMaxIDDocIdSetIterator(
            twitterInMemoryIndexReader, sinceIdExclusive, maxIdInclusive);
      }
    };
  }

  /**
   * Iterates over the doc ID range bounded by the doc IDs looked up for max_id and since_id,
   * re-checking each candidate's tweet ID against the requested range.
   */
  @VisibleForTesting
  static class SinceMaxIDDocIdSetIterator extends RangeFilterDISI {
    private final DocIDToTweetIDMapper docIdToTweetIdMapper;
    private final long sinceIdExclusive;
    private final long maxIdInclusive;

    /**
     * @param reader segment reader supplying the doc ID to tweet ID mapper
     * @param sinceIdExclusive exclusive lower bound on the tweet ID, or {@link #NO_FILTER}
     * @param maxIdInclusive inclusive upper bound on the tweet ID, or {@link #NO_FILTER}
     */
    public SinceMaxIDDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
                                      long sinceIdExclusive,
                                      long maxIdInclusive) throws IOException {
      // The max_id doc ID is the lower bound of the doc ID scan and the since_id doc ID is the
      // upper bound.
      super(reader,
            findMaxIdDocID(reader, maxIdInclusive),
            findSinceIdDocID(reader, sinceIdExclusive));
      this.docIdToTweetIdMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
      // sinceIdExclusive == NO_FILTER is OK as is: the bound is exclusive and NO_FILTER is -1.
      this.sinceIdExclusive = sinceIdExclusive;
      // An absent max_id becomes Long.MAX_VALUE so the inclusive check below always passes.
      this.maxIdInclusive = maxIdInclusive != NO_FILTER ? maxIdInclusive : Long.MAX_VALUE;
    }

    /**
     * This is a necessary check when we have out of order tweets in the archive.
     * When tweets are out of order, this guarantees that no false positive results are returned.
     * I.e. we can still miss some tweets in the specified range, but we never incorrectly return
     * anything that's not in the range.
     */
    @Override
    protected boolean shouldReturnDoc() {
      final long statusID = docIdToTweetIdMapper.getTweetID(docID());
      return statusID > sinceIdExclusive && statusID <= maxIdInclusive;
    }

    // Doc ID bound for since_id. We use this as an upper bound on the search, so we want to find
    // the highest possible doc ID for this tweet ID.
    private static int findSinceIdDocID(
        EarlybirdIndexSegmentAtomicReader reader, long sinceIdExclusive) throws IOException {
      return findBoundDocID(reader, sinceIdExclusive, true);
    }

    // Doc ID bound for max_id. We use this as a lower bound on the search, so we want to find
    // the lowest possible doc ID for this tweet ID.
    private static int findMaxIdDocID(
        EarlybirdIndexSegmentAtomicReader reader, long maxIdInclusive) throws IOException {
      return findBoundDocID(reader, maxIdInclusive, false);
    }

    // Shared lookup: ID_NOT_FOUND when the bound is unset, otherwise the mapper's doc ID bound
    // for tweetId, searched between the reader's smallest doc ID and its last doc ID.
    private static int findBoundDocID(
        EarlybirdIndexSegmentAtomicReader reader, long tweetId, boolean findMaxDocID)
        throws IOException {
      // The cast happens before the NO_FILTER check, so an unexpected mapper type still fails.
      TweetIDMapper tweetIdMapper =
          (TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
      if (tweetId == SinceMaxIDFilter.NO_FILTER) {
        return DocIDToTweetIDMapper.ID_NOT_FOUND;
      }
      return tweetIdMapper.findDocIdBound(
          tweetId,
          findMaxDocID,
          reader.getSmallestDocID(),
          reader.maxDoc() - 1);
    }
  }
}