mirror of
https://github.com/twitter/the-algorithm.git
synced 2024-06-01 08:48:46 +02:00
ef4c5eb65e
Please note we have force-pushed a new initial commit in order to remove some publicly-available Twitter user information. Note that this process may be required in the future.
212 lines
8.1 KiB
Java
212 lines
8.1 KiB
Java
package com.twitter.search.earlybird.search.queries;
|
|
|
|
import java.io.IOException;
|
|
|
|
import com.google.common.annotations.VisibleForTesting;
|
|
|
|
import org.apache.lucene.index.LeafReader;
|
|
import org.apache.lucene.index.LeafReaderContext;
|
|
import org.apache.lucene.search.BooleanClause;
|
|
import org.apache.lucene.search.BooleanQuery;
|
|
import org.apache.lucene.search.DocIdSetIterator;
|
|
import org.apache.lucene.search.IndexSearcher;
|
|
import org.apache.lucene.search.Query;
|
|
import org.apache.lucene.search.ScoreMode;
|
|
import org.apache.lucene.search.Weight;
|
|
|
|
import com.twitter.search.common.query.DefaultFilterWeight;
|
|
import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper;
|
|
import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader;
|
|
import com.twitter.search.core.earlybird.index.util.AllDocsIterator;
|
|
import com.twitter.search.core.earlybird.index.util.RangeFilterDISI;
|
|
import com.twitter.search.earlybird.index.TweetIDMapper;
|
|
|
|
/**
|
|
* Filters tweets by ID according to the since_id and max_id parameters.
|
|
*
|
|
* Note that since_id is exclusive and max_id is inclusive.
|
|
*/
|
|
public final class SinceMaxIDFilter extends Query {
|
|
public static final long NO_FILTER = -1;
|
|
|
|
private final long sinceIdExclusive;
|
|
private final long maxIdInclusive;
|
|
|
|
public static Query getSinceMaxIDQuery(long sinceIdExclusive, long maxIdInclusive) {
|
|
return new BooleanQuery.Builder()
|
|
.add(new SinceMaxIDFilter(sinceIdExclusive, maxIdInclusive), BooleanClause.Occur.FILTER)
|
|
.build();
|
|
}
|
|
|
|
public static Query getSinceIDQuery(long sinceIdExclusive) {
|
|
return new BooleanQuery.Builder()
|
|
.add(new SinceMaxIDFilter(sinceIdExclusive, NO_FILTER), BooleanClause.Occur.FILTER)
|
|
.build();
|
|
}
|
|
|
|
public static Query getMaxIDQuery(long maxIdInclusive) {
|
|
return new BooleanQuery.Builder()
|
|
.add(new SinceMaxIDFilter(NO_FILTER, maxIdInclusive), BooleanClause.Occur.FILTER)
|
|
.build();
|
|
}
|
|
|
|
/**
 * Creates the filter. Private: instances are obtained through the static query factories
 * of this class.
 *
 * @param sinceIdExclusive exclusive lower bound on the tweet ID, or {@link #NO_FILTER}
 * @param maxIdInclusive inclusive upper bound on the tweet ID, or {@link #NO_FILTER}
 */
private SinceMaxIDFilter(long sinceIdExclusive, long maxIdInclusive) {
  this.maxIdInclusive = maxIdInclusive;
  this.sinceIdExclusive = sinceIdExclusive;
}
|
|
|
|
@Override
|
|
public int hashCode() {
|
|
return (int) (sinceIdExclusive * 13 + maxIdInclusive);
|
|
}
|
|
|
|
@Override
|
|
public boolean equals(Object obj) {
|
|
if (!(obj instanceof SinceMaxIDFilter)) {
|
|
return false;
|
|
}
|
|
|
|
SinceMaxIDFilter filter = SinceMaxIDFilter.class.cast(obj);
|
|
return (sinceIdExclusive == filter.sinceIdExclusive)
|
|
&& (maxIdInclusive == filter.maxIdInclusive);
|
|
}
|
|
|
|
@Override
|
|
public String toString(String field) {
|
|
if (sinceIdExclusive != NO_FILTER && maxIdInclusive != NO_FILTER) {
|
|
return "SinceIdFilter:" + sinceIdExclusive + ",MaxIdFilter:" + maxIdInclusive;
|
|
} else if (maxIdInclusive != NO_FILTER) {
|
|
return "MaxIdFilter:" + maxIdInclusive;
|
|
} else {
|
|
return "SinceIdFilter:" + sinceIdExclusive;
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Determines if this segment is at least partially covered by the given tweet ID range.
|
|
*/
|
|
public static boolean sinceMaxIDsInRange(
|
|
TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
|
|
// Check for since id out of range. Note that since this ID is exclusive,
|
|
// equality is out of range too.
|
|
if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMaxTweetID()) {
|
|
return false;
|
|
}
|
|
|
|
// Check for max id in range.
|
|
return maxIdInclusive == NO_FILTER || maxIdInclusive >= tweetIdMapper.getMinTweetID();
|
|
}
|
|
|
|
// Returns true if this segment is completely covered by these id filters.
|
|
private static boolean sinceMaxIdsCoverRange(
|
|
TweetIDMapper tweetIdMapper, long sinceIdExclusive, long maxIdInclusive) {
|
|
// Check for since_id specified AND since_id newer than than first tweet.
|
|
if (sinceIdExclusive != NO_FILTER && sinceIdExclusive >= tweetIdMapper.getMinTweetID()) {
|
|
return false;
|
|
}
|
|
|
|
// Check for max id in range.
|
|
return maxIdInclusive == NO_FILTER || maxIdInclusive > tweetIdMapper.getMaxTweetID();
|
|
}
|
|
|
|
@Override
|
|
public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost)
|
|
throws IOException {
|
|
return new DefaultFilterWeight(this) {
|
|
@Override
|
|
protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException {
|
|
LeafReader reader = context.reader();
|
|
if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) {
|
|
return new AllDocsIterator(reader);
|
|
}
|
|
|
|
EarlybirdIndexSegmentAtomicReader twitterInMemoryIndexReader =
|
|
(EarlybirdIndexSegmentAtomicReader) reader;
|
|
TweetIDMapper tweetIdMapper =
|
|
(TweetIDMapper) twitterInMemoryIndexReader.getSegmentData().getDocIDToTweetIDMapper();
|
|
|
|
// Important to return a null DocIdSetIterator here, so the Scorer will skip searching
|
|
// this segment completely.
|
|
if (!sinceMaxIDsInRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
|
|
return null;
|
|
}
|
|
|
|
// Optimization: just return a match-all iterator when the whole segment is in range.
|
|
// This avoids having to do so many status id lookups.
|
|
if (sinceMaxIdsCoverRange(tweetIdMapper, sinceIdExclusive, maxIdInclusive)) {
|
|
return new AllDocsIterator(reader);
|
|
}
|
|
|
|
return new SinceMaxIDDocIdSetIterator(
|
|
twitterInMemoryIndexReader, sinceIdExclusive, maxIdInclusive);
|
|
}
|
|
};
|
|
}
|
|
|
|
@VisibleForTesting
|
|
static class SinceMaxIDDocIdSetIterator extends RangeFilterDISI {
|
|
private final DocIDToTweetIDMapper docIdToTweetIdMapper;
|
|
private final long sinceIdExclusive;
|
|
private final long maxIdInclusive;
|
|
|
|
public SinceMaxIDDocIdSetIterator(EarlybirdIndexSegmentAtomicReader reader,
|
|
long sinceIdExclusive,
|
|
long maxIdInclusive) throws IOException {
|
|
super(reader,
|
|
findMaxIdDocID(reader, maxIdInclusive),
|
|
findSinceIdDocID(reader, sinceIdExclusive));
|
|
this.docIdToTweetIdMapper = reader.getSegmentData().getDocIDToTweetIDMapper();
|
|
this.sinceIdExclusive = sinceIdExclusive; // sinceStatusId == NO_FILTER is OK, it's exclusive
|
|
this.maxIdInclusive = maxIdInclusive != NO_FILTER ? maxIdInclusive : Long.MAX_VALUE;
|
|
}
|
|
|
|
/**
|
|
* This is a necessary check when we have out of order tweets in the archive.
|
|
* When tweets are out of order, this guarantees that no false positive results are returned.
|
|
* I.e. we can still miss some tweets in the specified range, but we never incorrectly return
|
|
* anything that's not in the range.
|
|
*/
|
|
@Override
|
|
protected boolean shouldReturnDoc() {
|
|
final long statusID = docIdToTweetIdMapper.getTweetID(docID());
|
|
return statusID > sinceIdExclusive && statusID <= maxIdInclusive;
|
|
}
|
|
|
|
private static int findSinceIdDocID(
|
|
EarlybirdIndexSegmentAtomicReader reader, long sinceIdExclusive) throws IOException {
|
|
TweetIDMapper tweetIdMapper =
|
|
(TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
|
|
if (sinceIdExclusive != SinceMaxIDFilter.NO_FILTER) {
|
|
// We use this as an upper bound on the search, so we want to find the highest possible
|
|
// doc ID for this tweet ID.
|
|
boolean findMaxDocID = true;
|
|
return tweetIdMapper.findDocIdBound(
|
|
sinceIdExclusive,
|
|
findMaxDocID,
|
|
reader.getSmallestDocID(),
|
|
reader.maxDoc() - 1);
|
|
} else {
|
|
return DocIDToTweetIDMapper.ID_NOT_FOUND;
|
|
}
|
|
}
|
|
|
|
private static int findMaxIdDocID(
|
|
EarlybirdIndexSegmentAtomicReader reader, long maxIdInclusive) throws IOException {
|
|
TweetIDMapper tweetIdMapper =
|
|
(TweetIDMapper) reader.getSegmentData().getDocIDToTweetIDMapper();
|
|
if (maxIdInclusive != SinceMaxIDFilter.NO_FILTER) {
|
|
// We use this as a lower bound on the search, so we want to find the lowest possible
|
|
// doc ID for this tweet ID.
|
|
boolean findMaxDocID = false;
|
|
return tweetIdMapper.findDocIdBound(
|
|
maxIdInclusive,
|
|
findMaxDocID,
|
|
reader.getSmallestDocID(),
|
|
reader.maxDoc() - 1);
|
|
} else {
|
|
return DocIDToTweetIDMapper.ID_NOT_FOUND;
|
|
}
|
|
}
|
|
}
|
|
}
|