diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.docx new file mode 100644 index 000000000..c193edb41 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.java b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.java deleted file mode 100644 index 7620ace9b..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentUpdater.java +++ /dev/null @@ -1,279 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.Date; - -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; - -import org.apache.commons.lang.time.FastDateFormat; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.metrics.SearchStatsReceiverImpl; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentHdfsFlusher; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentLoader; -import com.twitter.search.earlybird.partition.SegmentOptimizer; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.partition.SimpleSegmentIndexer; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; - -/** - * Given a segment, this class checks if the segment has an index built on HDFS: - * if not, use SimpleSegmentIndexer to build an index - * if yes, load the HDFS index, build a new index for the new status data which has dates newer - * than the HDFS index, then append the loaded HDFS index. 
- */ -public class ArchiveSegmentUpdater { - private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentUpdater.class); - - private final SegmentSyncConfig sync; - private final EarlybirdIndexConfig earlybirdIndexConfig; - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl(); - private final SearchIndexingMetricSet searchIndexingMetricSet = - new SearchIndexingMetricSet(statsReceiver); - private final EarlybirdSearcherStats searcherStats = - new EarlybirdSearcherStats(statsReceiver); - private final SearchRateCounter indexNewSegment = - new SearchRateCounter("index_new_segment"); - private final SearchRateCounter updateExistingSegment = - new SearchRateCounter("update_existing_segment"); - private final SearchRateCounter skipExistingSegment = - new SearchRateCounter("skip_existing_segment"); - private Clock clock; - - public ArchiveSegmentUpdater(ZooKeeperTryLockFactory zooKeeperTryLockFactory, - SegmentSyncConfig sync, - EarlybirdIndexConfig earlybirdIndexConfig, - Clock clock) { - this.sync = sync; - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.zkTryLockFactory = zooKeeperTryLockFactory; - this.clock = clock; - } - - private boolean canUpdateSegment(SegmentInfo segmentInfo) { - if (!(segmentInfo.getSegment() instanceof ArchiveSegment)) { - LOG.info("only ArchiveSegment is available for updating now: " - + segmentInfo); - return false; - } - - if (!segmentInfo.isEnabled()) { - LOG.debug("Segment is disabled: " + segmentInfo); - return false; - } - - if (segmentInfo.isComplete() || segmentInfo.isIndexing() - || segmentInfo.getSyncInfo().isLoaded()) { - LOG.debug("Cannot update already indexed segment: " + segmentInfo); - return false; - } - - return true; - } - - /** - * Given a segment, checks if the segment has an index built on HDFS: - * if not, use SimpleSegmentIndexer to build an index - * if yes, load the HDFS index, build a new index for the new status data which has dates newer - * than the HDFS index, then append the loaded HDFS index. - * - * Returns whether the segment was successfully updated. 
- */ - public boolean updateSegment(SegmentInfo segmentInfo) { - Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment); - if (!canUpdateSegment(segmentInfo)) { - return false; - } - - if (segmentInfo.isIndexing()) { - LOG.error("Segment is already being indexed: " + segmentInfo); - return false; - } - - final Date hdfsEndDate = ArchiveHDFSUtils.getSegmentEndDateOnHdfs(sync, segmentInfo); - if (hdfsEndDate == null) { - indexNewSegment.increment(); - if (!indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE)) { - return false; - } - } else { - final Date curEndDate = ((ArchiveSegment) segmentInfo.getSegment()).getDataEndDate(); - if (!hdfsEndDate.before(curEndDate)) { - skipExistingSegment.increment(); - LOG.info("Segment is up-to-date: " + segmentInfo.getSegment().getTimeSliceID() - + " Found flushed segment on HDFS with end date: " - + FastDateFormat.getInstance("yyyyMMdd").format(hdfsEndDate)); - segmentInfo.setComplete(true); - segmentInfo.getSyncInfo().setFlushed(true); - return true; - } - - updateExistingSegment.increment(); - LOG.info("Updating segment: " + segmentInfo.getSegment().getTimeSliceID() - + "; new endDate will be " + FastDateFormat.getInstance("yyyyMMdd").format(curEndDate)); - - if (!updateSegment(segmentInfo, hdfsEndDate)) { - return false; - } - } - - boolean success = SegmentOptimizer.optimize(segmentInfo); - if (!success) { - // Clean up the segment dir on local disk - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Error optimizing segment: " + segmentInfo); - return false; - } - - // Verify segment before uploading. - success = ArchiveSegmentVerifier.verifySegment(segmentInfo); - if (!success) { - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Segment not uploaded to HDFS because it did not pass verification: " + segmentInfo); - return false; - } - - // upload the index to HDFS - success = new SegmentHdfsFlusher(zkTryLockFactory, sync, false) - .flushSegmentToDiskAndHDFS(segmentInfo); - if (success) { - ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, false, true); - } else { - // Clean up the segment dir on hdfs - ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, true, false); - LOG.info("Error uploading segment to HDFS: " + segmentInfo); - } - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - - return success; - } - - /** - * Build index for the given segmentInfo. Only those statuses passing the dateFilter are indexed. 
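The skip/update/rebuild decision inside updateSegment() above can be summarized as a small pure function. The sketch below is illustrative only (the enum and class names are not part of this codebase); the real method goes on to drive SimpleSegmentIndexer, SegmentOptimizer, ArchiveSegmentVerifier and SegmentHdfsFlusher rather than returning an action.

import java.util.Date;

enum SegmentAction { SKIP_UP_TO_DATE, APPEND_NEW_DAYS, BUILD_FROM_SCRATCH }

final class SegmentUpdateDecision {
  /** Decides what updateSegment() should do, given the end dates of the HDFS and local data. */
  static SegmentAction decide(Date hdfsEndDate, Date segmentDataEndDate) {
    if (hdfsEndDate == null) {
      // No flushed index found on HDFS: index every day of the timeslice from scratch.
      return SegmentAction.BUILD_FROM_SCRATCH;
    }
    if (!hdfsEndDate.before(segmentDataEndDate)) {
      // The flushed index already covers all days in this segment: mark complete and skip.
      return SegmentAction.SKIP_UP_TO_DATE;
    }
    // Index only the days strictly after hdfsEndDate, then append the loaded HDFS index.
    return SegmentAction.APPEND_NEW_DAYS;
  }
}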
- */ - private boolean indexSegment(final SegmentInfo segmentInfo, Predicate dateFilter) { - Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment); - - RecordReader documentReader = null; - try { - ArchiveSegment archiveSegment = (ArchiveSegment) segmentInfo.getSegment(); - DocumentFactory documentFactory = - earlybirdIndexConfig.createDocumentFactory(); - documentReader = archiveSegment.getStatusRecordReader(documentFactory, dateFilter); - - // Read and index the statuses - boolean success = new SimpleSegmentIndexer(documentReader, searchIndexingMetricSet) - .indexSegment(segmentInfo); - if (!success) { - // Clean up segment dir on local disk - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Error indexing segment: " + segmentInfo); - } - - return success; - } catch (IOException e) { - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Exception while indexing segment: " + segmentInfo, e); - return false; - } finally { - if (documentReader != null) { - documentReader.stop(); - } - } - } - - /** - * Load the index built on HDFS for the given segmentInfo, index the new data and append the - * HDFS index to the new indexed segment - */ - private boolean updateSegment(final SegmentInfo segmentInfo, final Date hdfsEndDate) { - SegmentInfo hdfsSegmentInfo = loadSegmentFromHdfs(segmentInfo, hdfsEndDate); - if (hdfsSegmentInfo == null) { - return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE); - } - - boolean success = indexSegment(segmentInfo, input -> { - // we're updating the segment - only index days after the old end date, - // and we're sure that the previous days have already been indexed. - return input.after(hdfsEndDate); - }); - if (!success) { - LOG.error("Error indexing new data: " + segmentInfo); - return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE); - } - - // Now, append the index loaded from hdfs - try { - segmentInfo.getIndexSegment().append(hdfsSegmentInfo.getIndexSegment()); - hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Deleted local segment directories with end date " + hdfsEndDate + " : " - + segmentInfo); - } catch (IOException e) { - LOG.warn("Caught IOException while appending segment " + hdfsSegmentInfo.getSegmentName(), e); - hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - return false; - } - - segmentInfo.setComplete(true); - return true; - } - - /** - * Load the index built on HDFS for the given segmentInfo and end date - */ - private SegmentInfo loadSegmentFromHdfs(final SegmentInfo segmentInfo, final Date hdfsEndDate) { - Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment); - - ArchiveSegment segment = new ArchiveSegment( - segmentInfo.getTimeSliceID(), - EarlybirdConfig.getMaxSegmentSize(), - segmentInfo.getNumPartitions(), - segmentInfo.getSegment().getHashPartitionID(), - hdfsEndDate); - EarlybirdSegmentFactory factory = new EarlybirdSegmentFactory( - earlybirdIndexConfig, - searchIndexingMetricSet, - searcherStats, - clock); - - SegmentInfo hdfsSegmentInfo; - - try { - hdfsSegmentInfo = new SegmentInfo(segment, factory, sync); - CriticalExceptionHandler criticalExceptionHandler = - new CriticalExceptionHandler(); - - boolean success = new SegmentLoader(sync, criticalExceptionHandler) - .load(hdfsSegmentInfo); - if (!success) { - // If not successful, segmentLoader has already cleaned up the local dir. 
- LOG.info("Error loading hdfs segment " + hdfsSegmentInfo - + ", building segment from scratch."); - hdfsSegmentInfo = null; - } - } catch (IOException e) { - LOG.error("Exception while loading segment from hdfs: " + segmentInfo, e); - hdfsSegmentInfo = null; - } - - return hdfsSegmentInfo; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.docx new file mode 100644 index 000000000..9ba90dfa8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.java b/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.java deleted file mode 100644 index 2eb23265e..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveSegmentVerifier.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.List; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.store.Directory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.partition.SegmentInfo; - -public final class ArchiveSegmentVerifier { - private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentVerifier.class); - - private ArchiveSegmentVerifier() { - } - - @VisibleForTesting - static boolean shouldVerifySegment(SegmentInfo segmentInfo) { - if (segmentInfo.isIndexing()) { - LOG.warn("ArchiveSegmentVerifier got segment still indexing."); - return false; - } - - if (!segmentInfo.isComplete()) { - LOG.warn("ArchiveSegmentVerifyer got incomplete segment."); - return false; - } - - if (!segmentInfo.isOptimized()) { - LOG.warn("ArchiveSegmentVerifyer got unoptimized segment."); - return false; - } - - return true; - } - - /** - * Verifies an archive segment has a sane number of leaves. - */ - public static boolean verifySegment(SegmentInfo segmentInfo) { - if (!shouldVerifySegment(segmentInfo)) { - return false; - } - Directory directory = segmentInfo.getIndexSegment().getLuceneDirectory(); - return verifyLuceneIndex(directory); - } - - private static boolean verifyLuceneIndex(Directory directory) { - try { - DirectoryReader indexerReader = DirectoryReader.open(directory); - List leaves = indexerReader.getContext().leaves(); - if (leaves.size() != 1) { - LOG.warn("Lucene index does not have exactly one segment: " + leaves.size() + " != 1. 
" - + "Lucene segments should have been merged during optimization."); - return false; - } - - LeafReader reader = leaves.get(0).reader(); - if (reader.numDocs() <= 0) { - LOG.warn("Lucene index has no document: " + reader); - return false; - } - return true; - } catch (IOException e) { - LOG.warn("Found bad lucene index at: " + directory); - return false; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.docx b/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.docx new file mode 100644 index 000000000..94814314c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.java b/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.java deleted file mode 100644 index c326c76be..000000000 --- a/src/java/com/twitter/search/earlybird/archive/ArchiveTimeSlicer.java +++ /dev/null @@ -1,322 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Calendar; -import java.util.Collections; -import java.util.Comparator; -import java.util.Date; -import java.util.List; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; -import com.google.common.collect.Lists; - - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.io.MergingSortedRecordReader; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.earlybird.config.TierConfig; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; - - -/** - * Responsible for taking a number of daily status batches and partitioning them into time slices - * which will be used to build segments. - * - * We try to put at most N number of tweets into a time slice. - */ -public class ArchiveTimeSlicer { - private static final Logger LOG = LoggerFactory.getLogger(ArchiveTimeSlicer.class); - - private static final Comparator ASCENDING = - (o1, o2) -> Long.compare(o1.getTweetID(), o2.getTweetID()); - - private static final Comparator DESCENDING = - (o1, o2) -> Long.compare(o2.getTweetID(), o1.getTweetID()); - - // Represents a number of daily batches which will go into a segment. - public static final class ArchiveTimeSlice { - private Date startDate; - private Date endDate; - private int statusCount; - private final DailyStatusBatches directory; - private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig; - - // This list is always ordered from oldest day, to the newest day. - // For the on-disk archive, we reverse the days in getTweetReaders(). - private final List batches = Lists.newArrayList(); - - private ArchiveTimeSlice(DailyStatusBatches directory, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig) { - this.directory = directory; - this.earlybirdIndexConfig = earlybirdIndexConfig; - } - - public Date getEndDate() { - return endDate; - } - - public int getStatusCount() { - return statusCount; - } - - public int getNumHashPartitions() { - return batches.isEmpty() ? 0 : batches.get(0).getNumHashPartitions(); - } - - /** - * Returns a reader for reading tweets from this timeslice. 
- * - * @param archiveSegment The segment to which the timeslice belongs. - * @param documentFactory The ThriftIndexingEvent to TweetDocument converter. - * @param filter A filter that determines what dates should be read. - */ - public RecordReader getStatusReader( - ArchiveSegment archiveSegment, - DocumentFactory documentFactory, - Predicate filter) throws IOException { - // We no longer support ThriftStatus based document factories. - Preconditions.checkState(documentFactory instanceof ThriftIndexingEventDocumentFactory); - - final int hashPartitionID = archiveSegment.getHashPartitionID(); - List> readers = new ArrayList<>(batches.size()); - List orderedForReading = orderBatchesForReading(batches); - LOG.info("Creating new status reader for hashPartition: " - + hashPartitionID + " timeslice: " + getDescription()); - - for (DailyStatusBatch batch : orderedForReading) { - if (filter.apply(batch.getDate())) { - LOG.info("Adding reader for " + batch.getDate() + " " + getDescription()); - PartitionedBatch partitionedBatch = batch.getPartition(hashPartitionID); - // Don't even try to create a reader if the partition is empty. - // There does not seem to be any problem in production now, but HDFS FileSystem's javadoc - // does indicate that listStatus() is allowed to throw a FileNotFoundException if the - // partition does not exist. This check makes the code more robust against future - // HDFS FileSystem implementation changes. - if (partitionedBatch.getStatusCount() > 0) { - RecordReader tweetReaders = partitionedBatch.getTweetReaders( - archiveSegment, - directory.getStatusPathToUseForDay(batch.getDate()), - documentFactory); - readers.add(tweetReaders); - } - } else { - LOG.info("Filtered reader for " + batch.getDate() + " " + getDescription()); - } - } - - LOG.info("Creating reader for timeslice: " + getDescription() - + " with " + readers.size() + " readers"); - - return new MergingSortedRecordReader(getMergingComparator(), readers); - } - - private List orderBatchesForReading(List orderedBatches) { - // For the index formats using stock lucene, we want the most recent days to be indexed first. - // In the twitter in-memory optimized indexes, older tweets will be added first, and - // optimization will reverse the documents to make most recent tweets be first. - return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering() - ? orderedBatches : Lists.reverse(orderedBatches); - } - - private Comparator getMergingComparator() { - // We always want to retrieve larger tweet ids first. - // LIFO means that the smaller ids get inserted first --> ASCENDING order. - // FIFO would mean that we want to first insert the larger ids --> DESCENDING order. - return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering() - ? ASCENDING : DESCENDING; - } - - /** - * Returns the smallest indexed tweet ID in this timeslice for the given partition. - * - * @param hashPartitionID The partition. - */ - public long getMinStatusID(int hashPartitionID) { - if (batches.isEmpty()) { - return 0; - } - - for (int i = 0; i < batches.size(); i++) { - long minStatusID = batches.get(i).getPartition(hashPartitionID).getMinStatusID(); - if (minStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) { - return minStatusID; - } - } - - return 0; - } - - /** - * Returns the highest indexed tweet ID in this timeslice for the given partition. - * - * @param hashPartitionID The partition. 
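The merge that MergingSortedRecordReader performs over the per-day readers returned above can be pictured with plain iterators: each source is already sorted by tweet ID, and the comparator (ascending for LIFO document ordering, descending for FIFO) decides which head to emit next. The sketch below is a generic k-way merge, not the actual reader implementation.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.List;
import java.util.PriorityQueue;

final class SortedIdMerge {
  /** Merges per-day iterators of tweet IDs, each already sorted according to the given order. */
  static List<Long> merge(List<Iterator<Long>> perDay, Comparator<Long> order) {
    // Heap entries are {currentValue, sourceIndex}, ordered by currentValue.
    PriorityQueue<long[]> heap = new PriorityQueue<>((a, b) -> order.compare(a[0], b[0]));
    for (int i = 0; i < perDay.size(); i++) {
      if (perDay.get(i).hasNext()) {
        heap.add(new long[] {perDay.get(i).next(), i});
      }
    }
    List<Long> merged = new ArrayList<>();
    while (!heap.isEmpty()) {
      long[] head = heap.poll();
      merged.add(head[0]);
      Iterator<Long> source = perDay.get((int) head[1]);
      if (source.hasNext()) {
        heap.add(new long[] {source.next(), head[1]});
      }
    }
    return merged;
  }
}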
- */ - public long getMaxStatusID(int hashPartitionID) { - if (batches.isEmpty()) { - return Long.MAX_VALUE; - } - - for (int i = batches.size() - 1; i >= 0; i--) { - long maxStatusID = batches.get(i).getPartition(hashPartitionID).getMaxStatusID(); - if (maxStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) { - return maxStatusID; - } - } - - return Long.MAX_VALUE; - } - - /** - * Returns a string with some information for this timeslice. - */ - public String getDescription() { - StringBuilder builder = new StringBuilder(); - builder.append("TimeSlice[start date="); - builder.append(DailyStatusBatches.DATE_FORMAT.format(startDate)); - builder.append(", end date="); - builder.append(DailyStatusBatches.DATE_FORMAT.format(endDate)); - builder.append(", status count="); - builder.append(statusCount); - builder.append(", days count="); - builder.append(batches.size()); - builder.append("]"); - return builder.toString(); - } - } - - private final int maxSegmentSize; - private final DailyStatusBatches dailyStatusBatches; - private final Date tierStartDate; - private final Date tierEndDate; - private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig; - - private List lastCachedTimeslices = null; - - public ArchiveTimeSlicer(int maxSegmentSize, - DailyStatusBatches dailyStatusBatches, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig) { - this(maxSegmentSize, dailyStatusBatches, TierConfig.DEFAULT_TIER_START_DATE, - TierConfig.DEFAULT_TIER_END_DATE, earlybirdIndexConfig); - } - - public ArchiveTimeSlicer(int maxSegmentSize, - DailyStatusBatches dailyStatusBatches, - Date tierStartDate, - Date tierEndDate, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig) { - this.maxSegmentSize = maxSegmentSize; - this.dailyStatusBatches = dailyStatusBatches; - this.tierStartDate = tierStartDate; - this.tierEndDate = tierEndDate; - this.earlybirdIndexConfig = earlybirdIndexConfig; - } - - private boolean cacheIsValid() throws IOException { - return lastCachedTimeslices != null - && !lastCachedTimeslices.isEmpty() - && cacheIsValid(lastCachedTimeslices.get(lastCachedTimeslices.size() - 1).endDate); - } - - private boolean cacheIsValid(Date lastDate) throws IOException { - if (lastCachedTimeslices == null || lastCachedTimeslices.isEmpty()) { - return false; - } - - // Check if we have a daily batch newer than the last batch used for the newest timeslice. - Calendar cal = Calendar.getInstance(); - cal.setTime(lastDate); - cal.add(Calendar.DATE, 1); - Date nextDate = cal.getTime(); - - boolean foundBatch = dailyStatusBatches.hasValidBatchForDay(nextDate); - - LOG.info("Checking cache: Looked for valid batch for day {}. Found: {}", - DailyStatusBatches.DATE_FORMAT.format(nextDate), foundBatch); - - return !foundBatch; - } - - private boolean timesliceIsFull(ArchiveTimeSlice timeSlice, DailyStatusBatch batch) { - return timeSlice.statusCount + batch.getMaxPerPartitionStatusCount() > maxSegmentSize; - } - - private void doTimeSlicing() throws IOException { - dailyStatusBatches.refresh(); - - lastCachedTimeslices = Lists.newArrayList(); - ArchiveTimeSlice currentTimeSlice = null; - - // Iterate over each day and add it to the current timeslice, until it gets full. 
- for (DailyStatusBatch batch : dailyStatusBatches.getStatusBatches()) { - if (!batch.isValid()) { - LOG.warn("Skipping hole: " + batch.getDate()); - continue; - } - - if (currentTimeSlice == null || timesliceIsFull(currentTimeSlice, batch)) { - if (currentTimeSlice != null) { - LOG.info("Filled timeslice: " + currentTimeSlice.getDescription()); - } - currentTimeSlice = new ArchiveTimeSlice(dailyStatusBatches, earlybirdIndexConfig); - currentTimeSlice.startDate = batch.getDate(); - lastCachedTimeslices.add(currentTimeSlice); - } - - currentTimeSlice.endDate = batch.getDate(); - currentTimeSlice.statusCount += batch.getMaxPerPartitionStatusCount(); - currentTimeSlice.batches.add(batch); - } - LOG.info("Last timeslice: {}", currentTimeSlice.getDescription()); - - LOG.info("Done with time slicing. Number of timeslices: {}", - lastCachedTimeslices.size()); - } - - /** - * Returns all timeslices for this earlybird. - */ - public List getTimeSlices() throws IOException { - if (cacheIsValid()) { - return lastCachedTimeslices; - } - - LOG.info("Cache is outdated. Loading new daily batches now..."); - - doTimeSlicing(); - - return lastCachedTimeslices != null ? Collections.unmodifiableList(lastCachedTimeslices) : null; - } - - /** - * Return the timeslices that overlap the tier start/end date ranges if they are specified - */ - public List getTimeSlicesInTierRange() throws IOException { - List timeSlices = getTimeSlices(); - if (tierStartDate == TierConfig.DEFAULT_TIER_START_DATE - && tierEndDate == TierConfig.DEFAULT_TIER_END_DATE) { - return timeSlices; - } - - List filteredTimeSlice = Lists.newArrayList(); - for (ArchiveTimeSlice timeSlice : timeSlices) { - if (timeSlice.startDate.before(tierEndDate) && !timeSlice.endDate.before(tierStartDate)) { - filteredTimeSlice.add(timeSlice); - } - } - - return filteredTimeSlice; - } - - @VisibleForTesting - protected DailyStatusBatches getDailyStatusBatches() { - return dailyStatusBatches; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.docx b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.docx new file mode 100644 index 000000000..c8b820d87 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.java b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.java deleted file mode 100644 index 6dcc852ec..000000000 --- a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatch.java +++ /dev/null @@ -1,166 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.IOException; -import java.util.Date; -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Maps; -import com.google.gson.Gson; -import com.google.gson.JsonParseException; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Represents a day's worth of statuses (tweets) for multiple hash partitions. - * - * Note that what this class contains is not the data, but metadata. - * - * A day of tweets will come from: - * - A scrubgen, if it has happened before the scrubgen date. - * - Our daily jobs pipeline, if it has happened after that. - * - * This class checks the _SUCCESS file exists in the "statuses" subdirectory and extracts the status - * count, min status id and max status id. 
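The core of doTimeSlicing() in ArchiveTimeSlicer above is a greedy packing loop: walk the valid daily batches in date order and start a new timeslice whenever adding the next day would push the per-partition status count over maxSegmentSize. A minimal sketch, using a stand-in DayCount class instead of DailyStatusBatch:

import java.util.ArrayList;
import java.util.List;

final class TimeSlicingSketch {
  static final class DayCount {
    final String date;                 // e.g. "20130704"
    final int maxPerPartitionCount;    // stand-in for getMaxPerPartitionStatusCount()

    DayCount(String date, int maxPerPartitionCount) {
      this.date = date;
      this.maxPerPartitionCount = maxPerPartitionCount;
    }
  }

  /** Packs consecutive days into slices whose per-partition counts stay under maxSegmentSize. */
  static List<List<DayCount>> slice(List<DayCount> days, int maxSegmentSize) {
    List<List<DayCount>> slices = new ArrayList<>();
    List<DayCount> current = null;
    int currentCount = 0;
    for (DayCount day : days) {
      if (current == null || currentCount + day.maxPerPartitionCount > maxSegmentSize) {
        current = new ArrayList<>();
        currentCount = 0;
        slices.add(current);
      }
      current.add(day);
      currentCount += day.maxPerPartitionCount;
    }
    return slices;
  }
}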
- */ -public class DailyStatusBatch implements Comparable { - private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatch.class); - - public static final long EMPTY_BATCH_STATUS_ID = -1; - private static final String PARTITION_FORMAT = "p_%d_of_%d"; - private static final String SUCCESS_FILE_NAME = "_SUCCESS"; - - private final Map hashPartitionToStatuses = Maps.newHashMap(); - - private final Date date; - private final int numHashPartitions; - private final boolean hasSuccessFiles; - - public DailyStatusBatch(Date date, int numHashPartitions, Path statusPath, FileSystem hdfs) { - this.date = date; - this.numHashPartitions = numHashPartitions; - this.hasSuccessFiles = checkForSuccessFile(hdfs, date, statusPath); - } - - public Date getDate() { - return date; - } - - /** - * Check for the presence of the _SUCCESS file for the given day's path on HDFS for the statuses - * field group. - */ - private boolean checkForSuccessFile(FileSystem hdfs, Date inputDate, Path statusPath) { - Path dayPath = new Path(statusPath, ArchiveHDFSUtils.dateToPath(inputDate, "/")); - Path successFilePath = new Path(dayPath, SUCCESS_FILE_NAME); - try { - return hdfs.getFileStatus(successFilePath).isFile(); - } catch (IOException e) { - LOG.error("Could not verify existence of the _SUCCESS file. Assuming it doesn't exist.", e); - } - return false; - } - - /** - * Loads the data for this day for the given partition. - */ - public PartitionedBatch addPartition(FileSystem hdfs, Path dayPath, int hashPartitionID) - throws IOException { - String partitionDir = String.format(PARTITION_FORMAT, hashPartitionID, numHashPartitions); - Path path = new Path(dayPath, partitionDir); - PartitionedBatch batch = - new PartitionedBatch(path, hashPartitionID, numHashPartitions, date); - batch.load(hdfs); - hashPartitionToStatuses.put(hashPartitionID, batch); - return batch; - } - - public PartitionedBatch getPartition(int hashPartitionID) { - return hashPartitionToStatuses.get(hashPartitionID); - } - - /** - * Returns the greatest status count in all partitions belonging to this batch. 
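A day's batch is only trusted once the producing job has dropped an empty _SUCCESS marker into the day directory, which is what checkForSuccessFile() above looks for. A minimal sketch of that convention against a plain Hadoop FileSystem (the paths are placeholders):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class SuccessMarkerCheck {
  /** Returns true if the day directory under statusRoot contains a _SUCCESS marker file. */
  static boolean dayIsComplete(FileSystem fs, Path statusRoot, String yyyyMmDdPath)
      throws IOException {
    Path dayPath = new Path(statusRoot, yyyyMmDdPath);   // e.g. "2013/07/04"
    return fs.exists(new Path(dayPath, "_SUCCESS"));
  }
}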
- */ - public int getMaxPerPartitionStatusCount() { - int maxPerPartitionStatusCount = 0; - for (PartitionedBatch batch : hashPartitionToStatuses.values()) { - maxPerPartitionStatusCount = Math.max(batch.getStatusCount(), maxPerPartitionStatusCount); - } - return maxPerPartitionStatusCount; - } - - public int getNumHashPartitions() { - return numHashPartitions; - } - - @VisibleForTesting - boolean hasSuccessFiles() { - return hasSuccessFiles; - } - - /** - * Returns true if the _status_counts files could be found in each - * hash partition subfolder that belongs to this timeslice - * AND the _SUCCESS file can be found at the root folder for day - */ - public boolean isValid() { - // make sure we have data for all hash partitions - for (int i = 0; i < numHashPartitions; i++) { - PartitionedBatch day = hashPartitionToStatuses.get(i); - if (day == null || !day.hasStatusCount() || day.isDisallowedEmptyPartition()) { - return false; - } - } - return hasSuccessFiles; - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append("DailyStatusBatch[date=").append(date) - .append(",valid=").append(isValid()) - .append(",hasSuccessFiles=").append(hasSuccessFiles) - .append(",numHashPartitions=").append(numHashPartitions) - .append("]:\n"); - for (int i = 0; i < numHashPartitions; i++) { - builder.append('\t').append(hashPartitionToStatuses.get(i).toString()).append('\n'); - } - return builder.toString(); - } - - @Override - public int compareTo(DailyStatusBatch o) { - return date.compareTo(o.date); - } - - /** - * Serialize DailyStatusBatch to a json string. - */ - public String serializeToJson() { - return serializeToJson(new Gson()); - } - - @VisibleForTesting - String serializeToJson(Gson gson) { - return gson.toJson(this); - } - - /** - * Given a json string, parse its fields and construct a daily status batch. - * @param batchStr the json string representation of a daily status batch. - * @return the daily status batch constructed; if the string is of invalid format, null will be - * returned. 
- */ - static DailyStatusBatch deserializeFromJson(String batchStr) { - try { - return new Gson().fromJson(batchStr, DailyStatusBatch.class); - } catch (JsonParseException e) { - LOG.error("Error parsing json string: " + batchStr, e); - return null; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.docx b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.docx new file mode 100644 index 000000000..ef745e791 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.java b/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.java deleted file mode 100644 index fa45a6ca3..000000000 --- a/src/java/com/twitter/search/earlybird/archive/DailyStatusBatches.java +++ /dev/null @@ -1,702 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileWriter; -import java.io.IOException; -import java.util.Calendar; -import java.util.Collection; -import java.util.Date; -import java.util.NavigableMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.collect.Maps; - -import org.apache.commons.io.IOUtils; -import org.apache.commons.lang3.time.FastDateFormat; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.util.date.DateUtil; -import com.twitter.search.common.util.io.LineRecordFileReader; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.partition.HdfsUtil; -import com.twitter.search.earlybird.partition.StatusBatchFlushVersion; - -/** - * Provides access to preprocessed statuses (tweets) to be indexed by archive search earlybirds. - * - * These tweets can be coming from a scrub gen or from the output of the daily jobs. 
- */ -public class DailyStatusBatches { - private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatches.class); - - // Maximum time to spend on obtaining daily status batches by computing or loading from HDFS - private static final Amount MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES = - Amount.of(EarlybirdConfig.getLong("daily_status_batches_max_initial_load_time_minutes"), - Time.MINUTES); - // Time to wait before trying again when obtaining daily status batches fails - private static final Amount DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES = - Amount.of(EarlybirdConfig.getLong("daily_status_batches_waiting_time_minutes"), - Time.MINUTES); - private static final String DAILY_STATUS_BATCHES_SYNC_PATH = - EarlybirdProperty.ZK_APP_ROOT.get() + "/daily_batches_sync"; - private static final String DAILY_BATCHES_ZK_LOCK = "daily_batches_zk_lock"; - private static final Amount DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES = - Amount.of(EarlybirdConfig.getLong("daily_status_batches_zk_lock_expiration_minutes"), - Time.MINUTES); - - static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance("yyyyMMdd"); - - // before this date, there was no twitter - private static final Date FIRST_TWITTER_DAY = DateUtil.toDate(2006, 2, 1); - - private static final String STATUS_BATCHES_PREFIX = "status_batches"; - - private final String rootDir = - EarlybirdConfig.getString("hdfs_offline_segment_sync_dir", "top_archive_statuses"); - - private final String buildGen = - EarlybirdConfig.getString("offline_segment_build_gen", "bg_1"); - - public static final String STATUS_SUBDIR_NAME = "statuses"; - public static final String LAYOUT_SUBDIR_NAME = "layouts"; - public static final String SCRUB_GEN_SUFFIX_PATTERN = "scrubbed/%s"; - - private static final String INTERMEDIATE_COUNTS_SUBDIR_NAME = "counts"; - private static final String SUCCESS_FILE_NAME = "_SUCCESS"; - private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)"); - private static final Date FIRST_TWEET_DAY = DateUtil.toDate(2006, 3, 21); - - private final Path rootPath = new Path(rootDir); - private final Path buildGenPath = new Path(rootPath, buildGen); - private final Path statusPath = new Path(buildGenPath, STATUS_SUBDIR_NAME); - - private final NavigableMap statusBatches = Maps.newTreeMap(); - - private Date firstValidDay = null; - private Date lastValidDay = null; - - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final Date scrubGenDay; - private long numberOfDaysWithValidScrubGenData; - - public DailyStatusBatches( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, Date scrubGenDay) throws IOException { - this.zkTryLockFactory = zooKeeperTryLockFactory; - this.scrubGenDay = scrubGenDay; - - FileSystem hdfs = null; - try { - hdfs = HdfsUtil.getHdfsFileSystem(); - verifyDirectory(hdfs); - } finally { - IOUtils.closeQuietly(hdfs); - } - } - - @VisibleForTesting - public Date getScrubGenDay() { - return scrubGenDay; - } - - public Collection getStatusBatches() { - return statusBatches.values(); - } - - /** - * Reset the states of the directory - */ - private void resetDirectory() { - statusBatches.clear(); - firstValidDay = null; - lastValidDay = null; - } - - /** - * Indicate whether the directory has been initialized - */ - private boolean isInitialized() { - return lastValidDay != null; - } - - /** - * Load the daily status batches from HDFS; return true if one or more batches could be loaded. 
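initializeDailyStatusBatches() below implements a bounded load-or-compute loop: try to load a precomputed batch summary from HDFS, otherwise recompute it under a global ZooKeeper lock, sleeping between attempts until a time budget runs out. The generic sketch here captures just that control flow; the suppliers stand in for the HDFS load and ZK-locked compute steps.

import java.util.function.BooleanSupplier;

final class LoadOrComputeWithBudget {
  /** Alternates tryLoad and tryCompute until one succeeds or budgetMs elapses. */
  static boolean run(BooleanSupplier tryLoad, BooleanSupplier tryCompute,
                     long budgetMs, long sleepBetweenAttemptsMs) throws InterruptedException {
    long deadline = System.currentTimeMillis() + budgetMs;
    boolean firstAttempt = true;
    while (System.currentTimeMillis() <= deadline) {
      if (!firstAttempt) {
        Thread.sleep(sleepBetweenAttemptsMs);
      }
      firstAttempt = false;
      if (tryLoad.getAsBoolean() || tryCompute.getAsBoolean()) {
        return true;
      }
    }
    return false;
  }
}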
- **/ - private boolean refreshByLoadingHDFSStatusBatches(final FileSystem fs) throws IOException { - // first find the latest valid end date of statuses - final Date lastValidStatusDay = getLastValidInputDateFromNow(fs); - if (lastValidStatusDay != null) { - if (hasStatusBatchesOnHdfs(fs, lastValidStatusDay)) { - if (loadStatusBatchesFromHdfs(fs, lastValidStatusDay)) { - return true; - } - } - } - - resetDirectory(); - return false; - } - - /** - * Checks the directory for new data and returns true, if one or more new batches could be loaded. - */ - public void refresh() throws IOException { - final FileSystem hdfs = HdfsUtil.getHdfsFileSystem(); - - final Stopwatch stopwatch = Stopwatch.createStarted(); - try { - if (!isInitialized()) { - if (initializeDailyStatusBatches(hdfs, stopwatch)) { - LOG.info("Successfully obtained daily status batches after {}", stopwatch); - } else { - String errMsg = "Failed to load or compute daily status batches after " - + stopwatch.toString(); - LOG.error(errMsg); - throw new IOException(errMsg); - } - } else { - loadNewDailyBatches(hdfs); - } - } finally { - IOUtils.closeQuietly(hdfs); - } - } - - private boolean initializeDailyStatusBatches(final FileSystem hdfs, - final Stopwatch stopwatch) throws IOException { - long timeSpentOnDailyBatches = 0L; - long maxAllowedTimeMs = MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES.as(Time.MILLISECONDS); - long waitingTimeMs = DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES.as(Time.MILLISECONDS); - boolean firstLoop = true; - LOG.info("Starting to load or compute daily status batches for the first time."); - while (timeSpentOnDailyBatches <= maxAllowedTimeMs && !Thread.currentThread().isInterrupted()) { - if (!firstLoop) { - try { - LOG.info("Sleeping " + waitingTimeMs - + " millis before trying to obtain daily batches again"); - Thread.sleep(waitingTimeMs); - } catch (InterruptedException e) { - LOG.warn("Interrupted while waiting to load daily batches", e); - Thread.currentThread().interrupt(); - break; - } - } - - if (isStatusBatchLoadingEnabled() && refreshByLoadingHDFSStatusBatches(hdfs)) { - LOG.info("Successfully loaded daily status batches after {}", stopwatch); - return true; - } - - final AtomicBoolean successRef = new AtomicBoolean(false); - if (computeDailyBatchesWithZKLock(hdfs, successRef, stopwatch)) { - return successRef.get(); - } - - timeSpentOnDailyBatches = stopwatch.elapsed(TimeUnit.MILLISECONDS); - firstLoop = false; - } - - return false; - } - - private boolean computeDailyBatchesWithZKLock(final FileSystem hdfs, - final AtomicBoolean successRef, - final Stopwatch stopwatch) throws IOException { - // Using a global lock to coordinate among earlybirds and segment builders so that only - // one instance would hit the HDFS name node to query the daily status directories - TryLock lock = zkTryLockFactory.createTryLock( - DatabaseConfig.getLocalHostname(), - DAILY_STATUS_BATCHES_SYNC_PATH, - DAILY_BATCHES_ZK_LOCK, - DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES); - - return lock.tryWithLock(() -> { - LOG.info("Obtained ZK lock to compute daily status batches after {}", stopwatch); - successRef.set(initialLoadDailyBatchInfos(hdfs)); - if (successRef.get()) { - LOG.info("Successfully computed daily status batches after {}", stopwatch); - if (isStatusBatchFlushingEnabled()) { - LOG.info("Starting to store daily status batches to HDFS"); - if (storeStatusBatchesToHdfs(hdfs, lastValidDay)) { - LOG.info("Successfully stored daily status batches to HDFS"); - } else { - LOG.warn("Failed storing daily status 
batches to HDFS"); - } - } - } else { - LOG.info("Failed loading daily status info"); - } - }); - } - - private void verifyDirectory(FileSystem hdfs) throws IOException { - if (!hdfs.exists(rootPath)) { - throw new IOException("Root dir '" + rootPath + "' does not exist."); - } - - if (!hdfs.exists(buildGenPath)) { - throw new IOException("Build gen dir '" + buildGenPath + "' does not exist."); - } - - if (!hdfs.exists(statusPath)) { - throw new IOException("Status dir '" + statusPath + "' does not exist."); - } - } - - private void loadNewDailyBatches(FileSystem hdfs) throws IOException { - Preconditions.checkNotNull(lastValidDay); - - Calendar day = Calendar.getInstance(); - day.setTime(lastValidDay); - day.add(Calendar.DATE, 1); - - while (loadDay(hdfs, day.getTime()) != null) { - lastValidDay = day.getTime(); - day.add(Calendar.DATE, 1); - } - } - - private boolean initialLoadDailyBatchInfos(FileSystem hdfs) throws IOException { - LOG.info("Starting to build timeslice map from scratch."); - - final Date lastValidStatusDay = getLastValidInputDateFromNow(hdfs); - - if (lastValidStatusDay == null) { - LOG.warn("No data found in " + statusPath + " and scrubbed path"); - return false; - } - int mostRecentYear = DateUtil.getCalendar(lastValidStatusDay).get(Calendar.YEAR); - for (int year = 2006; year <= mostRecentYear; ++year) { - // construct path to avoid hdfs.listStatus() calls - Calendar day = Calendar.getInstance(); - day.set(year, Calendar.JANUARY, 1, 0, 0, 0); - day.set(Calendar.MILLISECOND, 0); - - Calendar yearEnd = Calendar.getInstance(); - yearEnd.set(year, Calendar.DECEMBER, 31, 0, 0, 0); - yearEnd.set(Calendar.MILLISECOND, 0); - - if (lastValidDay != null) { - // We're updating. - if (lastValidDay.after(yearEnd.getTime())) { - // This year was already loaded. - continue; - } - if (lastValidDay.after(day.getTime())) { - // Start one day after last valid date. - day.setTime(lastValidDay); - day.add(Calendar.DATE, 1); - } - } - - for (; !day.after(yearEnd); day.add(Calendar.DATE, 1)) { - loadDay(hdfs, day.getTime()); - } - } - - boolean updated = false; - numberOfDaysWithValidScrubGenData = 0; - - // Iterate batches in sorted order. 
- for (DailyStatusBatch batch : statusBatches.values()) { - if (!batch.isValid()) { - break; - } - if (batch.getDate().before(scrubGenDay)) { - numberOfDaysWithValidScrubGenData++; - } - if (firstValidDay == null) { - firstValidDay = batch.getDate(); - } - if (lastValidDay == null || lastValidDay.before(batch.getDate())) { - lastValidDay = batch.getDate(); - updated = true; - } - } - - LOG.info("Number of statusBatches: {}", statusBatches.size()); - return updated; - } - - private static String filesToString(FileStatus[] files) { - if (files == null) { - return "null"; - } - StringBuilder b = new StringBuilder(); - for (FileStatus s : files) { - b.append(s.getPath().toString()).append(", "); - } - return b.toString(); - } - - @VisibleForTesting - protected DailyStatusBatch loadDay(FileSystem hdfs, Date day) throws IOException { - Path dayPath = new Path(getStatusPathToUseForDay(day), ArchiveHDFSUtils.dateToPath(day, "/")); - LOG.debug("Looking for batch in " + dayPath.toString()); - DailyStatusBatch result = this.statusBatches.get(day); - if (result != null) { - return result; - } - - final FileStatus[] files; - try { - files = hdfs.listStatus(dayPath); - LOG.debug("Files found: " + filesToString(files)); - } catch (FileNotFoundException e) { - LOG.debug("loadDay() called, but directory does not exist for day: " + day - + " in: " + dayPath); - return null; - } - - if (files != null && files.length > 0) { - for (FileStatus file : files) { - Matcher matcher = HASH_PARTITION_PATTERN.matcher(file.getPath().getName()); - if (matcher.matches()) { - int numHashPartitions = Integer.parseInt(matcher.group(2)); - result = new DailyStatusBatch( - day, numHashPartitions, getStatusPathToUseForDay(day), hdfs); - - for (int partitionID = 0; partitionID < numHashPartitions; partitionID++) { - result.addPartition(hdfs, dayPath, partitionID); - } - - if (result.isValid()) { - statusBatches.put(day, result); - return result; - } else { - LOG.info("Invalid batch found for day: " + day + ", batch: " + result); - } - } else { - // skip logging the intermediate count subdirectories or _SUCCESS files. - if (!INTERMEDIATE_COUNTS_SUBDIR_NAME.equals(file.getPath().getName()) - && !SUCCESS_FILE_NAME.equals(file.getPath().getName())) { - LOG.warn("Path does not match hash partition pattern: " + file.getPath()); - } - } - } - } else { - LOG.warn("No data found for day: " + day + " in: " + dayPath - + " files null: " + (files == null)); - } - - return null; - } - - /** - * Determines if this directory has a valid batch for the given day. 
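loadDay() above recognizes partition subdirectories by the p_&lt;partition&gt;_of_&lt;numPartitions&gt; naming convention; the second capture group tells it how many hash partitions the day was written with. A small sketch of that parsing:

import java.util.regex.Matcher;
import java.util.regex.Pattern;

final class PartitionDirName {
  private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)");

  /** Returns the partition count encoded in a directory name, or -1 if the name does not match. */
  static int numHashPartitions(String dirName) {
    Matcher matcher = HASH_PARTITION_PATTERN.matcher(dirName);
    return matcher.matches() ? Integer.parseInt(matcher.group(2)) : -1;
  }
}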
- */ - public boolean hasValidBatchForDay(Date day) throws IOException { - FileSystem hdfs = null; - try { - hdfs = HdfsUtil.getHdfsFileSystem(); - return hasValidBatchForDay(hdfs, day); - } finally { - IOUtils.closeQuietly(hdfs); - } - } - - private boolean hasValidBatchForDay(FileSystem fs, Date day) throws IOException { - DailyStatusBatch batch = loadDay(fs, day); - - return batch != null && batch.isValid(); - } - - @VisibleForTesting - Date getFirstValidDay() { - return firstValidDay; - } - - @VisibleForTesting - Date getLastValidDay() { - return lastValidDay; - } - - private Date getLastValidInputDateFromNow(FileSystem hdfs) throws IOException { - Calendar cal = Calendar.getInstance(); - cal.setTime(new Date()); // current date - return getLastValidInputDate(hdfs, cal); - } - - /** - * Starting from current date, probe backward till we find a valid input Date - */ - @VisibleForTesting - Date getLastValidInputDate(FileSystem hdfs, Calendar cal) throws IOException { - cal.set(Calendar.MILLISECOND, 0); - cal.set(Calendar.HOUR_OF_DAY, 0); - cal.set(Calendar.MINUTE, 0); - cal.set(Calendar.SECOND, 0); - cal.set(Calendar.MILLISECOND, 0); - Date lastValidInputDate = cal.getTime(); - LOG.info("Probing backwards for last valid data date from " + lastValidInputDate); - while (lastValidInputDate.after(FIRST_TWITTER_DAY)) { - if (hasValidBatchForDay(hdfs, lastValidInputDate)) { - LOG.info("Found latest valid data on date " + lastValidInputDate); - LOG.info(" Used path: {}", getStatusPathToUseForDay(lastValidInputDate)); - return lastValidInputDate; - } - cal.add(Calendar.DATE, -1); - lastValidInputDate = cal.getTime(); - } - - return null; - } - - /** - * Check if the daily status batches are already on HDFS - */ - @VisibleForTesting - boolean hasStatusBatchesOnHdfs(FileSystem fs, Date lastDataDay) { - String hdfsFileName = getHdfsStatusBatchSyncFileName(lastDataDay); - try { - return fs.exists(new Path(hdfsFileName)); - } catch (IOException ex) { - LOG.error("Failed checking status batch file on HDFS: " + hdfsFileName, ex); - return false; - } - } - - /** - * Load the daily status batches from HDFS by first copying the file from HDFS to local disk - * and then reading from the local disk. - * - * @param day the latest day of valid statuses. - * @return true if the loading is successful. 
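getLastValidInputDate() above probes backwards from "now": truncate the calendar to midnight, then step back one day at a time until a day with a valid batch is found (or the first Twitter day is reached). A sketch of that loop with the batch check abstracted behind a predicate:

import java.util.Calendar;
import java.util.Date;
import java.util.function.Predicate;

final class LastValidDayProbe {
  /** Walks backwards from the given calendar day until hasValidBatch accepts a date. */
  static Date probeBackwards(Calendar from, Date earliestAllowed, Predicate<Date> hasValidBatch) {
    from.set(Calendar.HOUR_OF_DAY, 0);
    from.set(Calendar.MINUTE, 0);
    from.set(Calendar.SECOND, 0);
    from.set(Calendar.MILLISECOND, 0);
    Date candidate = from.getTime();
    while (candidate.after(earliestAllowed)) {
      if (hasValidBatch.test(candidate)) {
        return candidate;
      }
      from.add(Calendar.DATE, -1);
      candidate = from.getTime();
    }
    return null; // no valid day found in range
  }
}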
- */ - @VisibleForTesting - boolean loadStatusBatchesFromHdfs(FileSystem fs, Date day) { - // set the directory state to initial state - resetDirectory(); - - String fileHdfsPath = getHdfsStatusBatchSyncFileName(day); - String fileLocalPath = getLocalStatusBatchSyncFileName(day); - - LOG.info("Using " + fileHdfsPath + " as the HDFS batch summary load path."); - LOG.info("Using " + fileLocalPath + " as the local batch summary sync path."); - - LineRecordFileReader lineReader = null; - try { - fs.copyToLocalFile(new Path(fileHdfsPath), new Path(fileLocalPath)); - - lineReader = new LineRecordFileReader(fileLocalPath); - String batchLine; - while ((batchLine = lineReader.readNext()) != null) { - DailyStatusBatch batch = DailyStatusBatch.deserializeFromJson(batchLine); - if (batch == null) { - LOG.error("Invalid daily status batch constructed from line: " + batchLine); - resetDirectory(); - return false; - } - Date date = batch.getDate(); - if (firstValidDay == null || firstValidDay.after(date)) { - firstValidDay = date; - } - if (lastValidDay == null || lastValidDay.before(date)) { - lastValidDay = date; - } - statusBatches.put(date, batch); - } - LOG.info("Loaded {} status batches from HDFS: {}", - statusBatches.size(), fileHdfsPath); - LOG.info("First entry: {}", statusBatches.firstEntry().getValue().toString()); - LOG.info("Last entry: {}", statusBatches.lastEntry().getValue().toString()); - - return true; - } catch (IOException ex) { - LOG.error("Failed loading time slices from HDFS: " + fileHdfsPath, ex); - resetDirectory(); - return false; - } finally { - if (lineReader != null) { - lineReader.stop(); - } - } - } - - /** - * Flush the daily status batches to local disk and then upload to HDFS. - */ - private boolean storeStatusBatchesToHdfs(FileSystem fs, Date day) { - Preconditions.checkNotNull(lastValidDay); - - if (!StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.isOfficial()) { - LOG.info("Status batch flush version is not official, no batches will be flushed to HDFS"); - return true; - } - - String fileLocalPath = getLocalStatusBatchSyncFileName(day); - - // Flush to local disk - File outputFile = null; - FileWriter fileWriter = null; - try { - LOG.info("Flushing daily status batches into: " + fileLocalPath); - outputFile = new File(fileLocalPath); - outputFile.getParentFile().mkdirs(); - if (!outputFile.getParentFile().exists()) { - LOG.error("Cannot create directory: " + outputFile.getParentFile().toString()); - return false; - } - fileWriter = new FileWriter(outputFile, false); - for (Date date : statusBatches.keySet()) { - fileWriter.write(statusBatches.get(date).serializeToJson()); - fileWriter.write("\n"); - } - fileWriter.flush(); - - // Upload the file to HDFS - return uploadStatusBatchesToHdfs(fs, day); - } catch (IOException e) { - String fileHdfsPath = getHdfsStatusBatchSyncFileName(day); - LOG.error("Failed storing status batches to HDFS: " + fileHdfsPath, e); - return false; - } finally { - try { - if (fileWriter != null) { - fileWriter.close(); - } - } catch (IOException e) { - LOG.error("Error to close fileWrite.", e); - } - if (outputFile != null) { - // Delete the local file - outputFile.delete(); - } - } - } - - /** - * Upload the status batches to HDFS. 
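The batch summary written by storeStatusBatchesToHdfs() above and read back by loadStatusBatchesFromHdfs() is newline-delimited JSON: one Gson-serialized DailyStatusBatch per line, so the file can be replayed with a simple line reader. A local-file sketch of that round trip, with a placeholder Summary class standing in for DailyStatusBatch:

import com.google.gson.Gson;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;

final class NdjsonSummaryFile {
  static class Summary {
    String date;
    int statusCount;
  }

  /** Writes one JSON object per line. */
  static void write(Path file, List<Summary> batches) throws IOException {
    Gson gson = new Gson();
    List<String> lines = new ArrayList<>();
    for (Summary batch : batches) {
      lines.add(gson.toJson(batch));
    }
    Files.write(file, lines, StandardCharsets.UTF_8);
  }

  /** Reads the file back line by line, deserializing each line independently. */
  static List<Summary> read(Path file) throws IOException {
    Gson gson = new Gson();
    List<Summary> batches = new ArrayList<>();
    for (String line : Files.readAllLines(file, StandardCharsets.UTF_8)) {
      batches.add(gson.fromJson(line, Summary.class));
    }
    return batches;
  }
}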
- */ - @VisibleForTesting - boolean uploadStatusBatchesToHdfs(FileSystem fs, Date day) { - String localFileName = getLocalStatusBatchSyncFileName(day); - String hdfsFileName = getHdfsStatusBatchSyncFileName(day); - - LOG.info("Using " + hdfsFileName + " as the HDFS batch summary upload path."); - LOG.info("Using " + localFileName + " as the local batch summary sync path."); - - try { - Path hdfsFilePath = new Path(hdfsFileName); - if (fs.exists(hdfsFilePath)) { - LOG.warn("Found status batch file on HDFS: " + hdfsFileName); - return true; - } - - String hdfsTempName = getHdfsStatusBatchTempSyncFileName(day); - Path hdfsTempPath = new Path(hdfsTempName); - if (fs.exists(hdfsTempPath)) { - LOG.info("Found existing temporary status batch file on HDFS, removing: " + hdfsTempName); - if (!fs.delete(hdfsTempPath, false)) { - LOG.error("Failed to delete temporary file: " + hdfsTempName); - return false; - } - } - fs.copyFromLocalFile(new Path(localFileName), hdfsTempPath); - - if (fs.rename(hdfsTempPath, hdfsFilePath)) { - LOG.debug("Renamed " + hdfsTempName + " on HDFS to: " + hdfsFileName); - return true; - } else { - LOG.error("Failed to rename " + hdfsTempName + " on HDFS to: " + hdfsFileName); - return false; - } - } catch (IOException ex) { - LOG.error("Failed uploading status batch file to HDFS: " + hdfsFileName, ex); - return false; - } - } - - private static boolean isStatusBatchFlushingEnabled() { - return EarlybirdProperty.ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED.get(false); - } - - private static boolean isStatusBatchLoadingEnabled() { - return EarlybirdConfig.getBool("archive_daily_status_batch_loading_enabled", false); - } - - private static String getVersionFileExtension() { - return StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension(); - } - - String getStatusBatchSyncRootDir() { - return EarlybirdConfig.getString("archive_daily_status_batch_sync_dir", - "daily_status_batches") + "/" + scrubGenSuffix(); - } - - @VisibleForTesting - String getLocalStatusBatchSyncFileName(Date day) { - return getStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_" - + DATE_FORMAT.format(day) + getVersionFileExtension(); - } - - String getHdfsStatusBatchSyncRootDir() { - return EarlybirdConfig.getString("hdfs_archive_daily_status_batch_sync_dir", - "daily_status_batches") + "/" + scrubGenSuffix(); - } - - @VisibleForTesting - String getHdfsStatusBatchSyncFileName(Date day) { - return getHdfsStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_" - + DATE_FORMAT.format(day) + getVersionFileExtension(); - } - - private String getHdfsStatusBatchTempSyncFileName(Date day) { - return getHdfsStatusBatchSyncRootDir() + "/" + DatabaseConfig.getLocalHostname() + "_" - + STATUS_BATCHES_PREFIX + "_" + DATE_FORMAT.format(day) + getVersionFileExtension(); - } - - private String scrubGenSuffix() { - return String.format(SCRUB_GEN_SUFFIX_PATTERN, DATE_FORMAT.format(scrubGenDay)); - } - - /** - * Returns the path to the directory that stores the statuses for the given day. 
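uploadStatusBatchesToHdfs() above publishes the summary with a copy-to-temp-then-rename pattern so that readers never observe a partially written file. A minimal sketch of that pattern against the Hadoop FileSystem API (paths are placeholders):

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class AtomicHdfsPublish {
  /** Copies localFile to tempFile, then renames tempFile to finalFile in one step. */
  static boolean publish(FileSystem fs, Path localFile, Path tempFile, Path finalFile)
      throws IOException {
    if (fs.exists(finalFile)) {
      return true; // another instance already published this summary
    }
    if (fs.exists(tempFile) && !fs.delete(tempFile, false)) {
      return false; // stale temporary file we could not clean up
    }
    fs.copyFromLocalFile(localFile, tempFile);
    return fs.rename(tempFile, finalFile);
  }
}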
- */ - public Path getStatusPathToUseForDay(Date day) { - if (!day.before(scrubGenDay)) { - return statusPath; - } - - String suffix = scrubGenSuffix(); - Preconditions.checkArgument(!suffix.isEmpty()); - Path scrubPath = new Path(buildGenPath, suffix); - return new Path(scrubPath, STATUS_SUBDIR_NAME); - } - - /** - * Determines if the data for the specified scrub gen was fully built, by checking the number of - * days for which data was built against the expected number of days extracted from the specified - * scrub gen date. - */ - public boolean isScrubGenDataFullyBuilt(FileSystem hdfs) throws IOException { - initialLoadDailyBatchInfos(hdfs); - if (numberOfDaysWithValidScrubGenData == 0) { - LOG.warn("numberOfDaysWithValidScrubGenData is 0"); - } - long expectedDays = getDiffBetweenDays(scrubGenDay); - return expectedDays == numberOfDaysWithValidScrubGenData; - } - - @VisibleForTesting - long getDiffBetweenDays(Date day) { - long diff = day.getTime() - FIRST_TWEET_DAY.getTime(); - return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.docx b/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.docx new file mode 100644 index 000000000..526914c6d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.java b/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.java deleted file mode 100644 index b72e8c7f2..000000000 --- a/src/java/com/twitter/search/earlybird/archive/PartitionedBatch.java +++ /dev/null @@ -1,333 +0,0 @@ -package com.twitter.search.earlybird.archive; - -import java.io.FileNotFoundException; -import java.io.IOException; -import java.util.Comparator; -import java.util.Date; -import java.util.List; -import java.util.concurrent.TimeUnit; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Function; -import com.google.common.base.Predicate; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.Lists; - -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.config.Config; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.date.DateUtil; -import com.twitter.search.common.util.io.EmptyRecordReader; -import com.twitter.search.common.util.io.LzoThriftBlockFileReader; -import com.twitter.search.common.util.io.MergingSortedRecordReader; -import com.twitter.search.common.util.io.TransformingRecordReader; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.partition.HdfsUtil; - -/** - * A batch of pre-processed tweets for a single hash partition from a particular day. 
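getStatusPathToUseForDay() above routes each day to one of two roots: days on or after the scrub-gen date come from the regular statuses output, while older days come from the scrubbed copy under the same build gen. A sketch of that selection, with the directory names and date format mirroring the constants in the file and the base path left as a placeholder:

import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.fs.Path;

final class StatusPathChooser {
  /** Chooses between the daily-pipeline output and the scrubbed copy for the given day. */
  static Path choose(Path buildGenPath, Date scrubGenDay, Date day) {
    if (!day.before(scrubGenDay)) {
      return new Path(buildGenPath, "statuses");
    }
    String suffix = "scrubbed/" + new SimpleDateFormat("yyyyMMdd").format(scrubGenDay);
    return new Path(new Path(buildGenPath, suffix), "statuses");
  }
}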
- */ -public class PartitionedBatch { - private static final Logger LOG = LoggerFactory.getLogger(PartitionedBatch.class); - private static final Date START_DATE_INCLUSIVE = DateUtil.toDate(2006, 03, 21); - private static final String STATUS_COUNT_FILE_PREFIX = "_status_count_"; - private static final Pattern STATUS_COUNT_FILE_PATTERN = - Pattern.compile(STATUS_COUNT_FILE_PREFIX + "(\\d+)_minid_(\\d+)_maxid_(\\d+)"); - private static final int MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS = - EarlybirdConfig.getInt("archive_max_out_of_order_tolerance_hours", 12); - private static final int READER_INIT_IOEXCEPTION_RETRIES = 20; - private static final PathFilter LZO_DATA_FILES_FILTER = file -> file.getName().endsWith(".lzo"); - private static final PathFilter TXT_DATA_FILES_FILTER = file -> file.getName().endsWith(".txt"); - - private static final Comparator DESC_THRIFT_INDEXING_EVENT_COMPARATOR = - (o1, o2) -> ComparisonChain.start() - .compare(o2.getSortId(), o1.getSortId()) - .compare(o2.getUid(), o1.getUid()) - .result(); - - // Number archive tweets skipped because they are too out-of-order. - private static final SearchCounter OUT_OF_ORDER_STATUSES_SKIPPED = - SearchCounter.export("out_of_order_archive_statuses_skipped"); - - @VisibleForTesting - protected static final long MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS = - TimeUnit.HOURS.toMillis(MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS); - - private final Date date; - private final Path path; - private int statusCount; - private long minStatusID; - private long maxStatusID; - private final int hashPartitionID; - private boolean hasStatusCountFile; - private final int numHashPartitions; - - @VisibleForTesting - public PartitionedBatch( - Path path, - int hashPartitionID, - int numHashPartitions, - Date date) { - this.path = path; - this.hashPartitionID = hashPartitionID; - this.numHashPartitions = numHashPartitions; - this.date = date; - } - - /** - * Loads all the information (tweet count, etc.) for this partition and day from HDFS. - */ - public void load(FileSystem hdfs) throws IOException { - FileStatus[] dailyBatchFiles = null; - try { - // listStatus() javadoc says it throws FileNotFoundException when path does not exist. - // However, the actual implementations return null or an empty array instead. - // We handle all 3 cases: null, empty array, or FileNotFoundException. - dailyBatchFiles = hdfs.listStatus(path); - } catch (FileNotFoundException e) { - // don't do anything here and the day will be handled as empty. - } - - if (dailyBatchFiles != null && dailyBatchFiles.length > 0) { - for (FileStatus file : dailyBatchFiles) { - String fileName = file.getPath().getName(); - if (fileName.equals(STATUS_COUNT_FILE_PREFIX)) { - // zero tweets in this partition - this can happen for early days in 2006 - handleEmptyPartition(); - } else { - Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName); - if (matcher.matches()) { - try { - statusCount = Integer.parseInt(matcher.group(1)); - // Only adjustMinStatusId in production. For tests, this makes the tests harder to - // understand. - minStatusID = Config.environmentIsTest() ? Long.parseLong(matcher.group(2)) - : adjustMinStatusId(Long.parseLong(matcher.group(2)), date); - maxStatusID = Long.parseLong(matcher.group(3)); - hasStatusCountFile = true; - } catch (NumberFormatException e) { - // invalid file - ignore - LOG.warn("Could not parse status count file name.", e); - } - } - } - } - } else { - // Partition folder does not exist. 
This case can happen for early days of twitter - // where some partitions are empty. Set us to having a status count file, the validity of - // the parent DailyStatusBatch will still be determined by whether there was a _SUCCESS file - // in the day root. - handleEmptyPartition(); - - if (date.after(getEarliestDenseDay())) { - LOG.error("Unexpected empty directory {} for {}", path, date); - } - } - } - - private void handleEmptyPartition() { - statusCount = 0; - minStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID; - maxStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID; - hasStatusCountFile = true; - } - - /** - * Sometimes tweets are out-of-order (E.g. a tweet from Sep 2012 got into a - * batch in July 2013). See SEARCH-1750 for more details. - * This adjust the minStatusID if it is badly out-of-order. - */ - @VisibleForTesting - protected static long adjustMinStatusId(long minStatusID, Date date) { - long dateTime = date.getTime(); - // If the daily batch is for a day before we started using snow flake IDs. Never adjust. - if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(dateTime)) { - return minStatusID; - } - - long earliestStartTime = dateTime - MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS; - long minStatusTime = SnowflakeIdParser.getTimestampFromTweetId(minStatusID); - if (minStatusTime < earliestStartTime) { - long newMinId = SnowflakeIdParser.generateValidStatusId(earliestStartTime, 0); - LOG.info("Daily batch for " + date + " has badly out of order tweet: " + minStatusID - + ". The minStatusID for the day this batch is adjusted to " + newMinId); - return newMinId; - } else { - return minStatusID; - } - } - - /** - * Returns a reader that reads tweets from the given directory. - * - * @param archiveSegment Determines the timeslice ID of all read tweets. - * @param tweetsPath The path to the directory where the tweets for this day are stored. - * @param documentFactory The ThriftIndexingEvent to TweetDocument converter. - */ - public RecordReader getTweetReaders( - ArchiveSegment archiveSegment, - Path tweetsPath, - DocumentFactory documentFactory) throws IOException { - RecordReader tweetDocumentReader = - new TransformingRecordReader<>( - createTweetReader(tweetsPath), new Function() { - @Override - public TweetDocument apply(ThriftIndexingEvent event) { - return new TweetDocument( - event.getSortId(), - archiveSegment.getTimeSliceID(), - EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()), - documentFactory.newDocument(event) - ); - } - }); - - tweetDocumentReader.setExhaustStream(true); - return tweetDocumentReader; - } - - private RecordReader createTweetReader(Path tweetsPath) throws IOException { - if (date.before(START_DATE_INCLUSIVE)) { - return new EmptyRecordReader<>(); - } - - List> readers = Lists.newArrayList(); - FileSystem hdfs = HdfsUtil.getHdfsFileSystem(); - try { - Path dayPath = new Path(tweetsPath, ArchiveHDFSUtils.dateToPath(date, "/")); - Path partitionPath = - new Path(dayPath, String.format("p_%d_of_%d", hashPartitionID, numHashPartitions)); - PathFilter pathFilter = - Config.environmentIsTest() ? 
TXT_DATA_FILES_FILTER : LZO_DATA_FILES_FILTER; - FileStatus[] files = hdfs.listStatus(partitionPath, pathFilter); - for (FileStatus fileStatus : files) { - String fileStatusPath = fileStatus.getPath().toString().replaceAll("file:/", "/"); - RecordReader reader = createRecordReaderWithRetries(fileStatusPath); - readers.add(reader); - } - } finally { - IOUtils.closeQuietly(hdfs); - } - - if (readers.isEmpty()) { - return new EmptyRecordReader<>(); - } - - return new MergingSortedRecordReader<>(DESC_THRIFT_INDEXING_EVENT_COMPARATOR, readers); - } - - private RecordReader createRecordReaderWithRetries(String filePath) - throws IOException { - Predicate recordFilter = getRecordFilter(); - int numTries = 0; - while (true) { - try { - ++numTries; - return new LzoThriftBlockFileReader<>(filePath, ThriftIndexingEvent.class, recordFilter); - } catch (IOException e) { - if (numTries < READER_INIT_IOEXCEPTION_RETRIES) { - LOG.warn("Failed to open LzoThriftBlockFileReader for " + filePath + ". Will retry.", e); - } else { - LOG.error("Failed to open LzoThriftBlockFileReader for " + filePath - + " after too many retries.", e); - throw e; - } - } - } - } - - private Predicate getRecordFilter() { - return Config.environmentIsTest() ? null : input -> { - if (input == null) { - return false; - } - // We only guard against status IDs that are too small, because it is possible - // for a very old tweet to get into today's batch, but not possible for a very - // large ID (a future tweet ID that is not yet published) to get in today's - // batch, unless tweet ID generation messed up. - long statusId = input.getSortId(); - boolean keep = statusId >= minStatusID; - if (!keep) { - LOG.debug("Out of order documentId: {} minStatusID: {} Date: {} Path: {}", - statusId, minStatusID, date, path); - OUT_OF_ORDER_STATUSES_SKIPPED.increment(); - } - return keep; - }; - } - - /** - * Returns the number of statuses in this batch - */ - public int getStatusCount() { - return statusCount; - } - - /** - * Was the _status_count file was found in this folder. - */ - public boolean hasStatusCount() { - return hasStatusCountFile; - } - - public long getMinStatusID() { - return minStatusID; - } - - public long getMaxStatusID() { - return maxStatusID; - } - - public Date getDate() { - return date; - } - - public Path getPath() { - return path; - } - - /** - * Check whether the partition is - * . empty and - * . it is disallowed (empty partition can only happen before 2010) - * (Empty partition means that the directory is missing when scan happens.) - * - * @return true if the partition has no documents and it is not allowed. 
- */ - public boolean isDisallowedEmptyPartition() { - return hasStatusCountFile - && statusCount == 0 - && minStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID - && maxStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID - && date.after(getEarliestDenseDay()); - } - - @Override - public String toString() { - return "PartitionedBatch[hashPartitionId=" + hashPartitionID - + ",numHashPartitions=" + numHashPartitions - + ",date=" + date - + ",path=" + path - + ",hasStatusCountFile=" + hasStatusCountFile - + ",statusCount=" + statusCount + "]"; - } - - private Date getEarliestDenseDay() { - return EarlybirdConfig.getDate("archive_search_earliest_dense_day"); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.bazel b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.bazel deleted file mode 100644 index f630ffd06..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.bazel +++ /dev/null @@ -1,64 +0,0 @@ -java_library( - name = "segment_builder_lib", - sources = ["**/*.java"], - platform = "java8", - tags = [ - "bazel-compatible", - "bazel-only", - ], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "decider/src/main/scala", - "finatra/inject/inject-core/src/main/scala", - "finatra/inject/inject-server/src/main/scala/com/twitter/inject/server", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/quantity", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/database", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/base", - "src/java/com/twitter/search/common/partitioning/zookeeper", - "src/java/com/twitter/search/common/schema", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/util:closeresourceutil", - "src/java/com/twitter/search/common/util:gcutil", - "src/java/com/twitter/search/common/util:kerberos", - "src/java/com/twitter/search/common/util/date", - "src/java/com/twitter/search/common/util/io:flushable", - "src/java/com/twitter/search/common/util/zktrylock", - "src/java/com/twitter/search/common/util/zookeeper", - "src/java/com/twitter/search/earlybird:earlybird-lib", - "src/java/com/twitter/search/earlybird/common", - "src/java/com/twitter/search/earlybird/common/config", - "src/java/com/twitter/search/earlybird/common/userupdates", - "util/util-core:scala", - ], -) - -# Using hadoop_binary target can automatically exclude hadoop related jars in the built jar -# and load in the right jars based on hadoop config. 
-hadoop_binary( - name = "segment_builder_binary", - basename = "segment_builder", - main = "com.twitter.search.earlybird.archive.segmentbuilder.SegmentBuilderMain", - platform = "java8", - runtime_platform = "java8", - tags = [ - "bazel-compatible", - "bazel-compatible:migrated", - "bazel-only", - ], - dependencies = [ - ":segment_builder_lib", - "src/java/com/twitter/search/common/logging:search-log4j", - ], -) diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.docx new file mode 100644 index 000000000..f7a60c30b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.docx new file mode 100644 index 000000000..d64072414 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.java deleted file mode 100644 index a185d41f2..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/BuiltAndFinalizedSegment.java +++ /dev/null @@ -1,29 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; - -public class BuiltAndFinalizedSegment extends SegmentBuilderSegment { - public BuiltAndFinalizedSegment( - SegmentInfo segmentInfo, - SegmentConfig segmentConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - int alreadyRetriedCount, - SegmentSyncConfig sync) { - - super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync); - } - - @Override - public SegmentBuilderSegment handle() throws SegmentInfoConstructionException, - SegmentUpdaterException { - - throw new IllegalStateException("Should not handle a BuildAndFinalizedSegment."); - } - - @Override - public boolean isBuilt() { - return true; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.docx new file mode 100644 index 000000000..e910bd1d0 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.java deleted file mode 100644 index 16249a7b1..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/NotYetBuiltSegment.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.util.concurrent.atomic.AtomicBoolean; - -import com.google.common.base.Stopwatch; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.util.GCUtil; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.earlybird.archive.ArchiveSegmentUpdater; -import 
com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; - -public class NotYetBuiltSegment extends SegmentBuilderSegment { - private static final Logger LOG = LoggerFactory.getLogger(NotYetBuiltSegment.class); - - public NotYetBuiltSegment( - SegmentInfo segmentInfo, - SegmentConfig segmentConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - int alreadyRetriedCount, - SegmentSyncConfig sync) { - - super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync); - } - - /** - * 1. Grab the ZK lock for this segment. - * 2a. if lock fails, another host is updating; return the SOMEONE_ELSE_IS_BUILDING state. - * 2b. if lock succeeds, check again if the updated segment exists on HDFS. - * 3a. if so, just move on. - * 3b. if not, update the segment. - * In both cases, we need to check if the segment can now be marked as BUILT_AND_FINALIZED. - */ - @Override - public SegmentBuilderSegment handle() - throws SegmentUpdaterException, SegmentInfoConstructionException { - LOG.info("Handling a not yet built segment: {}", this.getSegmentName()); - Stopwatch stopwatch = Stopwatch.createStarted(); - TryLock lock = getZooKeeperTryLock(); - - // The tryWithLock can only access variables from parent class that are final. However, we - // would like to pass the process() return value to the parent class. So here we use - // AtomicBoolean reference instead of Boolean. - final AtomicBoolean successRef = new AtomicBoolean(false); - boolean gotLock = lock.tryWithLock(() -> { - ArchiveSegmentUpdater updater = new ArchiveSegmentUpdater( - segmentConfig.getTryLockFactory(), - sync, - segmentConfig.getEarlybirdIndexConfig(), - Clock.SYSTEM_CLOCK); - - boolean success = updater.updateSegment(segmentInfo); - successRef.set(success); - }); - - if (!gotLock) { - LOG.info("cannot acquire zookeeper lock for: " + segmentInfo); - return new SomeoneElseIsBuildingSegment( - segmentInfo, - segmentConfig, - earlybirdSegmentFactory, - alreadyRetriedCount, - sync); - } - - // 1. we want to make sure the heap is clean right after building a segment so that it's ready - // for us to start allocations for a new segment - // — I think we've had cases where we were seeing OOM's while building - // 2. the thing that I think it helps with is compaction (vs just organically running CMS) - // — which would clean up the heap, but may leave it in a fragmented state - // — and running a Full GC is supposed to compact the remaining tenured space. 
- GCUtil.runGC(); - - if (successRef.get()) { - LOG.info("Indexing segment {} took {}", segmentInfo, stopwatch); - LOG.info("Finished building {}", segmentInfo.getSegment().getSegmentName()); - return new BuiltAndFinalizedSegment( - segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync); - } else { - int alreadyTried = alreadyRetriedCount + 1; - String errMsg = "failed updating segment for: " + segmentInfo - + " for " + alreadyTried + " times"; - LOG.error(errMsg); - if (alreadyTried < segmentConfig.getMaxRetriesOnFailure()) { - return new NotYetBuiltSegment( - createNewSegmentInfo(segmentInfo), - segmentConfig, - earlybirdSegmentFactory, - alreadyTried, - sync); - } else { - throw new SegmentUpdaterException(errMsg); - } - } - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.docx new file mode 100644 index 000000000..071d34881 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.java deleted file mode 100644 index 9ef883672..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/RateLimitingSegmentHandler.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.util.HashMap; -import java.util.Map; - -import com.twitter.common.util.Clock; - -/** - * A class that prevents handling a given segment more than once every hdfsCheckIntervalMillis - */ -public class RateLimitingSegmentHandler { - private final long hdfsCheckIntervalMillis; - private final Clock clock; - private final Map segmentNameToLastUpdatedTimeMillis = new HashMap<>(); - - RateLimitingSegmentHandler(long hdfsCheckIntervalMillis, Clock clock) { - this.hdfsCheckIntervalMillis = hdfsCheckIntervalMillis; - this.clock = clock; - } - - SegmentBuilderSegment processSegment(SegmentBuilderSegment segment) - throws SegmentUpdaterException, SegmentInfoConstructionException { - - String segmentName = segment.getSegmentName(); - - Long lastUpdatedMillis = segmentNameToLastUpdatedTimeMillis.get(segmentName); - if (lastUpdatedMillis == null) { - lastUpdatedMillis = 0L; - } - - long nowMillis = clock.nowMillis(); - if (nowMillis - lastUpdatedMillis < hdfsCheckIntervalMillis) { - return segment; - } - segmentNameToLastUpdatedTimeMillis.put(segmentName, nowMillis); - - return segment.handle(); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.docx new file mode 100644 index 000000000..1bbca6b3f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.java deleted file mode 100644 index 1f3f47cf9..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilder.java +++ /dev/null @@ -1,540 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import 
java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Random; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.collect.ComparisonChain; -import com.google.common.collect.ImmutableList; -import com.google.common.util.concurrent.Uninterruptibles; -import com.google.inject.Inject; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.inject.annotations.Flag; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.metrics.SearchStatsReceiverImpl; -import com.twitter.search.common.partitioning.zookeeper.SearchZkClient; -import com.twitter.search.common.util.Kerberos; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig; -import com.twitter.search.earlybird.archive.ArchiveSegment; -import com.twitter.search.earlybird.archive.DailyStatusBatches; -import com.twitter.search.earlybird.archive.ArchiveTimeSlicer; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.util.ScrubGenUtil; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; - -/** - * This class provides the core logic to build segment indices offline. - * For each server, it coordinate via zookeeper to pick the next segment, build the indices for it - * and upload them to HDFS. A state machine is used to handle the build state transitions. There - * are three states: - * NOT_BUILD_YET: a segment that needs to be built - * SOMEONE_ELSE_IS_BUILDING: another server is building the segment. - * BUILT_AND_FINALIZED: the indices of this segment have already been built. - */ -public class SegmentBuilder { - private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilder.class); - - private final boolean onlyRunOnce; - private final int waitBetweenLoopsMins; - private final int startUpBatchSize; - private final int instance; - private final int waitBetweenSegmentsSecs; - private final int waitBeforeQuitMins; - - // When multiple segment builders start simultaneously, they might make the HDFS name node and - // zookeeper overwhelmed. So, we let some instances sleep sometimes before they start to avoid - // the issues. - private final long startUpSleepMins; - - // If no more segments to built, wait this interval before checking again. - private final long processWaitingInterval = TimeUnit.MINUTES.toMillis(10); - - // The hash partitions that segments will be built. 
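  // When the hash_partitions flag is empty this field is left null; createPairs() then falls
  // back to range(slice.getNumHashPartitions()), so every hash partition of every timeslice is
  // considered buildable.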
- private final ImmutableList hashPartitions; - - private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl(); - private final SearchIndexingMetricSet searchIndexingMetricSet = - new SearchIndexingMetricSet(statsReceiver); - private final EarlybirdSearcherStats searcherStats = - new EarlybirdSearcherStats(statsReceiver); - - private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig; - - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final RateLimitingSegmentHandler segmentHandler; - private final Clock clock; - private final int numSegmentBuilderPartitions; - private final int myPartitionId; - private final SegmentConfig segmentConfig; - private final EarlybirdSegmentFactory segmentFactory; - private final SegmentBuilderCoordinator segmentBuilderCoordinator; - private final SegmentSyncConfig segmentSyncConfig; - private final Random random = new Random(); - - private static final double SLEEP_RANDOMIZATION_RATIO = .2; - - // Stats - // The flush version used to build segments - private static final SearchLongGauge CURRENT_FLUSH_VERSION = - SearchLongGauge.export("current_flush_version"); - - // Accumulated number and time in seconds spent on building segments locally - private static SearchCounter segmentsBuiltLocally = - SearchCounter.export("segments_built_locally"); - private static SearchCounter timeSpentOnSuccessfulBuildSecs = - SearchCounter.export("time_spent_on_successful_build_secs"); - - // The total number of segments to be built - private static final SearchLongGauge SEGMENTS_TO_BUILD = - SearchLongGauge.export("segments_to_build"); - - // How many segments failed locally - private static final SearchCounter FAILED_SEGMENTS = - SearchCounter.export("failed_segments"); - - @Inject - protected SegmentBuilder(@Flag("onlyRunOnce") boolean onlyRunOnceFlag, - @Flag("waitBetweenLoopsMins") int waitBetweenLoopsMinsFlag, - @Flag("startup_batch_size") int startUpBatchSizeFlag, - @Flag("instance") int instanceFlag, - @Flag("segmentZkLockExpirationHours") - int segmentZkLockExpirationHoursFlag, - @Flag("startupSleepMins") long startupSleepMinsFlag, - @Flag("maxRetriesOnFailure") int maxRetriesOnFailureFlag, - @Flag("hash_partitions") List hashPartitionsFlag, - @Flag("numSegmentBuilderPartitions") int numSegmentBuilderPartitionsFlag, - @Flag("waitBetweenSegmentsSecs") int waitBetweenSegmentsSecsFlag, - @Flag("waitBeforeQuitMins") int waitBeforeQuitMinsFlag, - @Flag("scrubGen") String scrubGen, - Decider decider) { - this(onlyRunOnceFlag, - waitBetweenLoopsMinsFlag, - startUpBatchSizeFlag, - instanceFlag, - segmentZkLockExpirationHoursFlag, - startupSleepMinsFlag, - hashPartitionsFlag, - maxRetriesOnFailureFlag, - waitBetweenSegmentsSecsFlag, - waitBeforeQuitMinsFlag, - SearchZkClient.getSZooKeeperClient().createZooKeeperTryLockFactory(), - new RateLimitingSegmentHandler(TimeUnit.MINUTES.toMillis(10), Clock.SYSTEM_CLOCK), - Clock.SYSTEM_CLOCK, - numSegmentBuilderPartitionsFlag, - decider, - getSyncConfig(scrubGen)); - } - - @VisibleForTesting - protected SegmentBuilder(boolean onlyRunOnceFlag, - int waitBetweenLoopsMinsFlag, - int startUpBatchSizeFlag, - int instanceFlag, - int segmentZkLockExpirationHoursFlag, - long startupSleepMinsFlag, - List hashPartitions, - int maxRetriesOnFailure, - int waitBetweenSegmentsSecsFlag, - int waitBeforeQuitMinsFlag, - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - RateLimitingSegmentHandler segmentHandler, - Clock clock, - int numSegmentBuilderPartitions, - Decider decider, - SegmentSyncConfig 
syncConfig) { - LOG.info("Creating SegmentBuilder"); - LOG.info("Penguin version in use: " + EarlybirdConfig.getPenguinVersion()); - - // Set command line flag values - this.onlyRunOnce = onlyRunOnceFlag; - this.waitBetweenLoopsMins = waitBetweenLoopsMinsFlag; - this.startUpBatchSize = startUpBatchSizeFlag; - this.instance = instanceFlag; - this.waitBetweenSegmentsSecs = waitBetweenSegmentsSecsFlag; - this.waitBeforeQuitMins = waitBeforeQuitMinsFlag; - - this.segmentHandler = segmentHandler; - this.zkTryLockFactory = zooKeeperTryLockFactory; - this.segmentSyncConfig = syncConfig; - this.startUpSleepMins = startupSleepMinsFlag; - - if (!hashPartitions.isEmpty()) { - this.hashPartitions = ImmutableList.copyOf(hashPartitions); - } else { - this.hashPartitions = null; - } - - Amount segmentZKLockExpirationTime = Amount.of((long) - segmentZkLockExpirationHoursFlag, Time.HOURS); - - this.earlybirdIndexConfig = - new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet, - new CriticalExceptionHandler()); - - this.segmentConfig = new SegmentConfig( - earlybirdIndexConfig, - segmentZKLockExpirationTime, - maxRetriesOnFailure, - zkTryLockFactory); - this.segmentFactory = new EarlybirdSegmentFactory( - earlybirdIndexConfig, - searchIndexingMetricSet, - searcherStats, - clock); - this.segmentBuilderCoordinator = new SegmentBuilderCoordinator( - zkTryLockFactory, syncConfig, clock); - - this.clock = clock; - - this.numSegmentBuilderPartitions = numSegmentBuilderPartitions; - this.myPartitionId = instance % numSegmentBuilderPartitions; - SearchLongGauge.export("segment_builder_partition_id_" + myPartitionId).set(1); - - CURRENT_FLUSH_VERSION.set(earlybirdIndexConfig.getSchema().getMajorVersionNumber()); - } - - void run() { - LOG.info("Config values: {}", EarlybirdConfig.allValuesAsString()); - - // Sleep some time uninterruptibly before get started so that if multiple instances are running, - // the HDFS name node and zookeeper wont be overwhelmed - // Say, we have 100 instances (instance_arg will have value from 0 - 99, our - // STARTUP_BATCH_SIZE_ARG is 20 and startUpSleepMins is 3 mins. Then the first 20 instances - // will not sleep, but start immediately. then instance 20 - 39 will sleep 3 mins and then - // start to run. instance 40 - 59 will sleep 6 mins then start to run. instances 60 - 79 will - // sleep 9 mins and then start to run and so forth. - long sleepTime = instance / startUpBatchSize * startUpSleepMins; - LOG.info("Instance={}, Start up batch size={}", instance, startUpBatchSize); - LOG.info("Sleep {} minutes to void HDFS name node and ZooKeeper overwhelmed.", sleepTime); - Uninterruptibles.sleepUninterruptibly(sleepTime, TimeUnit.MINUTES); - - // Kinit here. 
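    // The kinit call below authenticates the segment builder with Kerberos before it touches HDFS;
    // the principal and keytab path are taken from the kerberos_user and kerberos_keytab_path
    // config values.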
- Kerberos.kinit( - EarlybirdConfig.getString("kerberos_user", ""), - EarlybirdConfig.getString("kerberos_keytab_path", "") - ); - - long waitBetweenLoopsMs = TimeUnit.MINUTES.toMillis(waitBetweenLoopsMins); - if (onlyRunOnce) { - LOG.info("This segment builder will run the full rebuild of all the segments"); - } else { - LOG.info("This segment builder will incrementally check for new data and rebuilt " - + "current segments as needed."); - LOG.info("The waiting interval between two new data checking is: " - + waitBetweenLoopsMs + " ms."); - } - - boolean scrubGenPresent = segmentSyncConfig.getScrubGen().isPresent(); - LOG.info("Scrub gen present: {}", scrubGenPresent); - boolean scrubGenDataFullyBuilt = segmentBuilderCoordinator.isScrubGenDataFullyBuilt(instance); - LOG.info("Scrub gen data fully built: {}", scrubGenDataFullyBuilt); - - if (!scrubGenPresent || scrubGenDataFullyBuilt) { - LOG.info("Starting segment building loop..."); - while (!Thread.currentThread().isInterrupted()) { - try { - indexingLoop(); - if (onlyRunOnce) { - LOG.info("only run once is true, breaking"); - break; - } - clock.waitFor(waitBetweenLoopsMs); - } catch (InterruptedException e) { - LOG.info("Interrupted, quitting segment builder"); - Thread.currentThread().interrupt(); - } catch (SegmentInfoConstructionException e) { - LOG.error("Error creating new segmentInfo, quitting segment builder: ", e); - break; - } catch (SegmentUpdaterException e) { - FAILED_SEGMENTS.increment(); - // Before the segment builder quits, sleep for WAIT_BEFORE_QUIT_MINS minutes so that the - // FAILED_SEGMENTS stat can be exported. - try { - clock.waitFor(TimeUnit.MINUTES.toMillis(waitBeforeQuitMins)); - } catch (InterruptedException ex) { - LOG.info("Interrupted, quitting segment builder"); - Thread.currentThread().interrupt(); - } - LOG.error("SegmentUpdater processing segment error, quitting segment builder: ", e); - break; - } - } - } else { - LOG.info("Cannot build the segments for scrub gen yet."); - } - } - - // Refactoring the run loop to here for unittest - @VisibleForTesting - void indexingLoop() - throws SegmentInfoConstructionException, InterruptedException, SegmentUpdaterException { - // This map contains all the segments to be processed; if a segment is built, it will be removed - // from the map. - Map buildableSegmentInfoMap; - try { - buildableSegmentInfoMap = createSegmentInfoMap(); - printSegmentInfoMap(buildableSegmentInfoMap); - } catch (IOException e) { - LOG.error("Error creating segmentInfoMap: ", e); - return; - } - - while (!buildableSegmentInfoMap.isEmpty()) { - boolean hasBuiltSegment = processSegments(buildableSegmentInfoMap); - - if (!hasBuiltSegment) { - // If we successfully built a segment, no need to sleep since building a segment takes a - // long time - clock.waitFor(processWaitingInterval); - } - } - } - - // Actual shutdown. - protected void doShutdown() { - LOG.info("doShutdown()..."); - try { - earlybirdIndexConfig.getResourceCloser().shutdownExecutor(); - } catch (InterruptedException e) { - LOG.error("Interrupted during shutdown. 
", e); - } - - LOG.info("Segment builder stopped!"); - } - - private List createTimeSlices() throws IOException { - Preconditions.checkState(segmentSyncConfig.getScrubGen().isPresent()); - Date scrubGen = ScrubGenUtil.parseScrubGenToDate(segmentSyncConfig.getScrubGen().get()); - - final DailyStatusBatches dailyStatusBatches = - new DailyStatusBatches(zkTryLockFactory, scrubGen); - final ArchiveTimeSlicer archiveTimeSlicer = new ArchiveTimeSlicer( - EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, earlybirdIndexConfig); - - Stopwatch stopwatch = Stopwatch.createStarted(); - List timeSlices = archiveTimeSlicer.getTimeSlices(); - - if (timeSlices == null) { - LOG.error("Failed to load timeslice map after {}", stopwatch); - return Collections.emptyList(); - } - - LOG.info("Took {} to get timeslices", stopwatch); - return timeSlices; - } - - private static class TimeSliceAndHashPartition implements Comparable { - public final ArchiveTimeSlicer.ArchiveTimeSlice timeSlice; - public final Integer hashPartition; - - public TimeSliceAndHashPartition( - ArchiveTimeSlicer.ArchiveTimeSlice timeSlice, - Integer hashPartition) { - this.timeSlice = timeSlice; - this.hashPartition = hashPartition; - } - - @Override - public int compareTo(TimeSliceAndHashPartition o) { - Integer myHashPartition = this.hashPartition; - Integer otherHashPartition = o.hashPartition; - - long myTimeSliceId = this.timeSlice.getMinStatusID(myHashPartition); - long otherTimeSliceId = o.timeSlice.getMinStatusID(otherHashPartition); - - return ComparisonChain.start() - .compare(myHashPartition, otherHashPartition) - .compare(myTimeSliceId, otherTimeSliceId) - .result(); - } - } - - /** - * For all the timeslices, create the corresponding SegmentInfo and store in a map - */ - @VisibleForTesting - Map createSegmentInfoMap() throws IOException { - final List timeSlices = createTimeSlices(); - - List timeSlicePairs = createPairs(timeSlices); - // Export how many segments should be built - SEGMENTS_TO_BUILD.set(timeSlicePairs.size()); - LOG.info("Total number of segments to be built across all segment builders: {}", - timeSlicePairs.size()); - - List mySegments = getSegmentsForMyPartition(timeSlicePairs); - - Map segmentInfoMap = new HashMap<>(); - for (TimeSliceAndHashPartition mySegment : mySegments) { - ArchiveSegment segment = new ArchiveSegment(mySegment.timeSlice, mySegment.hashPartition, - EarlybirdConfig.getMaxSegmentSize()); - SegmentInfo segmentInfo = new SegmentInfo(segment, segmentFactory, segmentSyncConfig); - - segmentInfoMap.put(segmentInfo.getSegment().getSegmentName(), new NotYetBuiltSegment( - segmentInfo, segmentConfig, segmentFactory, 0, segmentSyncConfig)); - } - - return segmentInfoMap; - } - - private List createPairs( - List timeSlices) { - - List timeSlicePairs = new ArrayList<>(); - - for (ArchiveTimeSlicer.ArchiveTimeSlice slice : timeSlices) { - List localPartitions = hashPartitions; - if (localPartitions == null) { - localPartitions = range(slice.getNumHashPartitions()); - } - - for (Integer partition : localPartitions) { - timeSlicePairs.add(new TimeSliceAndHashPartition(slice, partition)); - } - } - return timeSlicePairs; - } - - private List getSegmentsForMyPartition( - List timeSlicePairs) { - - Collections.sort(timeSlicePairs); - - List myTimeSlices = new ArrayList<>(); - for (int i = myPartitionId; i < timeSlicePairs.size(); i += numSegmentBuilderPartitions) { - myTimeSlices.add(timeSlicePairs.get(i)); - } - - LOG.info("Getting segments to be built for partition: {}", myPartitionId); - 
LOG.info("Total number of partitions: {}", numSegmentBuilderPartitions); - LOG.info("Number of segments picked: {}", myTimeSlices.size()); - return myTimeSlices; - } - - /** - * Print out the segmentInfo Map for debugging - */ - private void printSegmentInfoMap(Map segmentInfoMap) { - LOG.info("SegmentInfoMap: "); - for (Map.Entry entry : segmentInfoMap.entrySet()) { - LOG.info(entry.getValue().toString()); - } - LOG.info("Total SegmentInfoMap size: " + segmentInfoMap.size() + ". done."); - } - - /** - * Build indices or refresh state for the segments in the specified segmentInfoMap, which only - * contains the segments that need to build or are building. When a segment has not been built, - * it is built here. If built successfully, it will be removed from the map; otherwise, its - * state will be updated in the map. - * - * Returns true iff this process has built a segment. - */ - @VisibleForTesting - boolean processSegments(Map segmentInfoMap) - throws SegmentInfoConstructionException, SegmentUpdaterException, InterruptedException { - - boolean hasBuiltSegment = false; - - Iterator> iter = - segmentInfoMap.entrySet().iterator(); - while (iter.hasNext()) { - Map.Entry entry = iter.next(); - SegmentBuilderSegment originalSegment = entry.getValue(); - - LOG.info("About to process segment: {}", originalSegment.getSegmentName()); - long startMillis = System.currentTimeMillis(); - SegmentBuilderSegment updatedSegment = segmentHandler.processSegment(originalSegment); - - if (updatedSegment.isBuilt()) { - iter.remove(); - hasBuiltSegment = true; - - if (originalSegment instanceof NotYetBuiltSegment) { - // Record the total time spent on successfully building a semgent, used to compute the - // average segment building time. - long timeSpent = System.currentTimeMillis() - startMillis; - segmentsBuiltLocally.increment(); - timeSpentOnSuccessfulBuildSecs.add(timeSpent / 1000); - } - } else { - entry.setValue(updatedSegment); - } - - clock.waitFor(getSegmentSleepTime()); - } - - return hasBuiltSegment; - } - - private long getSegmentSleepTime() { - // The Hadoop name node can handle only about 200 requests/sec before it gets overloaded. - // Updating the state of a node that has been built takes about 1 second. In the worst case - // scenario with 800 segment builders, we end up with about 800 requests/sec. Adding a 10 - // second sleep lowers the worst case to about 80 requests/sec. - - long sleepMillis = TimeUnit.SECONDS.toMillis(waitBetweenSegmentsSecs); - - // Use randomization so that we can't get all segment builders hitting it at the exact same time - - int lowerSleepBoundMillis = (int) (sleepMillis * (1.0 - SLEEP_RANDOMIZATION_RATIO)); - int upperSleepBoundMillis = (int) (sleepMillis * (1.0 + SLEEP_RANDOMIZATION_RATIO)); - return randRange(lowerSleepBoundMillis, upperSleepBoundMillis); - } - - /** - * Returns a pseudo-random number between min and max, inclusive. - */ - private int randRange(int min, int max) { - return random.nextInt((max - min) + 1) + min; - } - - /** - * Returns list of integers 0, 1, 2, ..., count-1. 
- */ - private static List range(int count) { - List nums = new ArrayList<>(count); - - for (int i = 0; i < count; i++) { - nums.add(i); - } - - return nums; - } - - private static SegmentSyncConfig getSyncConfig(String scrubGen) { - if (scrubGen == null || scrubGen.isEmpty()) { - throw new RuntimeException( - "Scrub gen expected, but could not get it from the arguments."); - } - - LOG.info("Scrub gen: " + scrubGen); - return new SegmentSyncConfig(Optional.of(scrubGen)); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.docx new file mode 100644 index 000000000..b800cbed8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.java deleted file mode 100644 index dc4565ede..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderApp.java +++ /dev/null @@ -1,109 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.util.Collection; - -import com.google.common.collect.ImmutableList; -import com.google.inject.Module; - - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.app.Flaggable; -import com.twitter.inject.server.AbstractTwitterServer; -import com.twitter.util.Future; -import com.twitter.util.Time; - -public class SegmentBuilderApp extends AbstractTwitterServer { - private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderApp.class); - - public SegmentBuilderApp() { - createFlag("onlyRunOnce", - true, - "whether to stop segment builder after one loop", - Flaggable.ofBoolean()); - - createFlag("waitBetweenLoopsMins", - 60, - "how many minutes to wait between building loops", - Flaggable.ofInt()); - - createFlag("startup_batch_size", - 30, - "How many instances can start and read timeslice info from HDFS at the same time. " - + "If you don't know what this parameter is, please do not change this parameter.", - Flaggable.ofInt()); - - createFlag("instance", - 20, - "the job instance number", - Flaggable.ofInt()); - - createFlag("segmentZkLockExpirationHours", - 0, - "max hours to hold the zookeeper lock while building segment", - Flaggable.ofInt()); - - createFlag("startupSleepMins", - 2L, - "sleep multiplier of startupSleepMins before job runs", - Flaggable.ofLong()); - - createFlag("maxRetriesOnFailure", - 3, - "how many times we should try to rebuild a segment when failure happens", - Flaggable.ofInt()); - - createFlag("hash_partitions", - ImmutableList.of(), - "comma separated hash partition ids, e.g., 0,1,3,4. 
" - + "If not specified, all the partitions will be built.", - Flaggable.ofJavaList(Flaggable.ofInt())); - - createFlag("numSegmentBuilderPartitions", - 100, - "Number of partitions for dividing up all segment builder work", - Flaggable.ofInt()); - - createFlag("waitBetweenSegmentsSecs", - 10, - "Time to sleep between processing segments.", - Flaggable.ofInt()); - - createFlag("waitBeforeQuitMins", - 2, - "How many minutes to sleep before quitting.", - Flaggable.ofInt()); - - createFlag("scrubGen", - "", - "Scrub gen for which segment builders should be run.", - Flaggable.ofString()); - } - - @Override - public void start() { - SegmentBuilder segmentBuilder = injector().instance(SegmentBuilder.class); - closeOnExit((Time time) -> { - segmentBuilder.doShutdown(); - return Future.Unit(); - }); - - LOG.info("Starting run()"); - segmentBuilder.run(); - LOG.info("run() complete"); - - // Now shutdown - shutdown(); - } - - protected void shutdown() { - LOG.info("Calling close() to initiate shutdown"); - close(); - } - - @Override - public Collection javaModules() { - return ImmutableList.of(new SegmentBuilderModule()); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.docx new file mode 100644 index 000000000..0b6e1add4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.java deleted file mode 100644 index 79925ab5a..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderCoordinator.java +++ /dev/null @@ -1,200 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.io.IOException; -import java.util.Date; -import java.util.Optional; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.archive.DailyStatusBatches; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.util.ScrubGenUtil; -import com.twitter.search.earlybird.partition.HdfsUtil; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.util.Duration; - -/** - * Coordinate between segment builders for scrubbing pipeline. - * When segment builder is running, all of them will try to find a HDFS file indicating if data is - * ready. If the file does not exist, only one of them will go through the files and see if - * scrubbing pipeline has generated all data for this scrub gen. - * - * If the instance that got the lock found all data, it still exists, because otherwise we will - * have one single segmentbuilder instance trying to build all segments, which is not what we want. 
- * But if it exists, then the next time all segmentbuilder instances are scheduled, they will all - * find the file, and will start building segments. - */ -class SegmentBuilderCoordinator { - private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderCoordinator.class); - - private static final Amount ZK_LOCK_EXPIRATION_MIN = Amount.of(5L, Time.MINUTES); - private static final String SEGMENT_BUILDER_SYNC_NODE = "scrub_gen_data_sync"; - private static final String SEGMENT_BUILDER_SYNC_ZK_PATH = - EarlybirdProperty.ZK_APP_ROOT.get() + "/segment_builder_sync"; - private static final String DATA_FULLY_BUILT_FILE = "_data_fully_built"; - static final int FIRST_INSTANCE = 0; - - private static final long NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS = - Duration.fromHours(1).inMillis(); - - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final SegmentSyncConfig syncConfig; - private final Optional scrubGenDayOpt; - private final Optional scrubGenOpt; - private final Clock clock; - - SegmentBuilderCoordinator( - ZooKeeperTryLockFactory zkTryLockFactory, SegmentSyncConfig syncConfig, Clock clock) { - this.zkTryLockFactory = zkTryLockFactory; - this.syncConfig = syncConfig; - this.scrubGenOpt = syncConfig.getScrubGen(); - this.scrubGenDayOpt = scrubGenOpt.map(ScrubGenUtil::parseScrubGenToDate); - this.clock = clock; - } - - - public boolean isScrubGenDataFullyBuilt(int instanceNumber) { - // Only segment builder that takes scrub gen should use isPartitioningOutputReady to coordinate - Preconditions.checkArgument(scrubGenDayOpt.isPresent()); - - final FileSystem hdfs; - try { - hdfs = HdfsUtil.getHdfsFileSystem(); - } catch (IOException e) { - LOG.error("Could not create HDFS file system.", e); - return false; - } - - return isScrubGenDataFullyBuilt( - instanceNumber, - scrubGenDayOpt.get(), - NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS, - hdfs - ); - } - - @VisibleForTesting - boolean isScrubGenDataFullyBuilt( - int instanceNumber, - Date scrubGenDay, - long nonFirstInstanceSleepBeforeRetryDuration, - FileSystem hdfs) { - // Check if the scrub gen has been fully built file exists. - if (checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs)) { - return true; - } - - // If it doesn't exist, let first instance see if scrub gen has been fully built and create the - // file. - if (instanceNumber == FIRST_INSTANCE) { - // We were missing some data on HDFS for this scrub gen in previous run, - // but we might've gotten more data in the meantime, check again. - // Only allow instance 0 to do this mainly for 2 reasons: - // 1) Since instances are scheduled in batches, it's possible that a instance from latter - // batch find the fully built file in hdfs and start processing. We end up doing work with - // only partial instances. - // 2) If we sleep before we release lock, it's hard to estimate how long a instance will - // be scheduled. - // For deterministic reason, we simplify a bit and only allow instance 0 to check and write - // data is fully build file to hdfs. - try { - checkIfScrubGenDataIsFullyBuilt(hdfs, scrubGenDay); - } catch (IOException e) { - LOG.error("Failed to grab lock and check scrub gen data.", e); - } - } else { - // for all other instances, sleep for a bit to give time for first instance to check if scrub - // gen has been fully built and create the file, then check again. 
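      // Net effect: only instance 0 (FIRST_INSTANCE) ever takes the ZK lock, verifies the scrub
      // gen data on HDFS and writes the _data_fully_built marker; every other instance waits
      // nonFirstInstanceSleepBeforeRetryDuration and then simply re-checks for that marker below.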
- try { - LOG.info( - "Sleeping for {} ms before re-checking if scrub gen has been fully built file exists", - nonFirstInstanceSleepBeforeRetryDuration); - clock.waitFor(nonFirstInstanceSleepBeforeRetryDuration); - return checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs); - } catch (InterruptedException e) { - LOG.warn("Interrupted when sleeping before re-checking if scrub gen has been fully built " - + "file exists", e); - } - } - - // if hasSuccessFileToHdfs returns false, then should always return false in the end. - // next run will find success file for this scrub gen and move forward. - return false; - } - - private void checkIfScrubGenDataIsFullyBuilt( - FileSystem hdfs, Date scrubGenDay) throws IOException { - // Build the lock, try to acquire it, and check the data on HDFS - TryLock lock = zkTryLockFactory.createTryLock( - DatabaseConfig.getLocalHostname(), - SEGMENT_BUILDER_SYNC_ZK_PATH, - SEGMENT_BUILDER_SYNC_NODE, - ZK_LOCK_EXPIRATION_MIN); - Preconditions.checkState(scrubGenOpt.isPresent()); - String scrubGen = scrubGenOpt.get(); - - lock.tryWithLock(() -> { - LOG.info(String.format( - "Obtained ZK lock to check if data for scrub gen %s is ready.", scrubGen)); - final DailyStatusBatches directory = - new DailyStatusBatches(zkTryLockFactory, scrubGenDay); - if (directory.isScrubGenDataFullyBuilt(hdfs) - && createScrubGenDataFullyBuiltFileOnHdfs(hdfs)) { - LOG.info(String.format("All data for scrub gen %s is ready.", scrubGen)); - } else { - LOG.info(String.format("Data for scrub gen %s is not ready yet.", scrubGen)); - } - }); - } - - private boolean createScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) { - Path path = getScrubGenDataFullyBuiltFilePath(); - try { - fs.mkdirs(new Path(statusReadyHDFSPath())); - if (fs.createNewFile(path)) { - LOG.info("Successfully created file " + path + " on HDFS."); - return true; - } else { - LOG.warn("Failed to create file " + path + " on HDFS."); - } - } catch (IOException e) { - LOG.error("Failed to create file on HDFS " + path.toString(), e); - } - return false; - } - - private boolean checkHaveScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) { - Path path = getScrubGenDataFullyBuiltFilePath(); - try { - boolean ret = fs.exists(path); - LOG.info("Checking if file exists showing scrubgen is fully built."); - LOG.info("Path checked: {}, Exist check: {}", path, ret); - return ret; - } catch (IOException e) { - LOG.error("Failed to check file on HDFS " + path.toString(), e); - return false; - } - } - - @VisibleForTesting - Path getScrubGenDataFullyBuiltFilePath() { - return new Path(statusReadyHDFSPath(), DATA_FULLY_BUILT_FILE); - } - - @VisibleForTesting - String statusReadyHDFSPath() { - return syncConfig.getHdfsSegmentSyncRootDir() + "/segment_builder_sync"; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.docx new file mode 100644 index 000000000..561e499f7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.java deleted file mode 100644 index 85db7e855..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderMain.java +++ /dev/null @@ -1,10 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -public final 
class SegmentBuilderMain { - - private SegmentBuilderMain() { } - - public static void main(String[] args) { - new SegmentBuilderApp().main(args); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.docx new file mode 100644 index 000000000..4dade1dd5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.java deleted file mode 100644 index ea0520a0b..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderModule.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.io.File; - -import com.google.inject.Provides; -import com.google.inject.Singleton; - -import com.twitter.app.Flaggable; -import com.twitter.decider.Decider; -import com.twitter.inject.TwitterModule; -import com.twitter.inject.annotations.Flag; -import com.twitter.search.common.config.LoggerConfiguration; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.util.EarlybirdDecider; - -public class SegmentBuilderModule extends TwitterModule { - - private static final String CONFIG_FILE_FLAG_NAME = "config_file"; - private static final String SEGMENT_LOG_DIR_FLAG_NAME = "segment_log_dir"; - - public SegmentBuilderModule() { - createFlag(CONFIG_FILE_FLAG_NAME, - new File("earlybird-search.yml"), - "specify config file", - Flaggable.ofFile()); - - createFlag(SEGMENT_LOG_DIR_FLAG_NAME, - "", - "override log dir from config file", - Flaggable.ofString()); - } - - /** - * Initializes the Earlybird config and the log configuration, and returns an EarlybirdDecider - * object, which will be injected into the SegmentBuilder instance. - * - * @param configFile The config file to use to initialize EarlybirdConfig - * @param segmentLogDir If not empty, used to override the log directory from the config file - * @return An initialized EarlybirdDecider - */ - @Provides - @Singleton - public Decider provideDecider(@Flag(CONFIG_FILE_FLAG_NAME) File configFile, - @Flag(SEGMENT_LOG_DIR_FLAG_NAME) String segmentLogDir) { - // By default Guice will build singletons eagerly: - // https://github.com/google/guice/wiki/Scopes#eager-singletons - // So in order to ensure that the EarlybirdConfig and LoggerConfiguration initializations occur - // before the EarlybirdDecider initialization, we place them here. 
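    // Initialization order below: load the static Earlybird config from the YAML file, optionally
    // override its log directory, configure logging, and only then initialize the decider.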
- EarlybirdConfig.init(configFile.getName()); - if (!segmentLogDir.isEmpty()) { - EarlybirdConfig.overrideLogDir(segmentLogDir); - } - new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), EarlybirdConfig.getLogDir()) - .configure(); - - return EarlybirdDecider.initialize(); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.docx new file mode 100644 index 000000000..5ebf91ed7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.java deleted file mode 100644 index 428113bf9..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentBuilderSegment.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.archive.ArchiveSegment; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; - -public abstract class SegmentBuilderSegment { - protected final SegmentInfo segmentInfo; - protected final SegmentConfig segmentConfig; - protected final EarlybirdSegmentFactory earlybirdSegmentFactory; - protected final int alreadyRetriedCount; - protected final SegmentSyncConfig sync; - - public SegmentBuilderSegment(SegmentInfo segmentInfo, - SegmentConfig segmentConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - int alreadyRetriedCount, - SegmentSyncConfig segmentSyncConfig) { - this.segmentConfig = segmentConfig; - this.earlybirdSegmentFactory = earlybirdSegmentFactory; - this.alreadyRetriedCount = alreadyRetriedCount; - this.sync = segmentSyncConfig; - Preconditions.checkState(segmentInfo.getSegment() instanceof ArchiveSegment); - this.segmentInfo = Preconditions.checkNotNull(segmentInfo); - } - - public SegmentInfo getSegmentInfo() { - return segmentInfo; - } - - public String getSegmentName() { - return segmentInfo.getSegmentName(); - } - - public int getAlreadyRetriedCount() { - return alreadyRetriedCount; - } - - /** - * Handle the segment, potentially transitioning to a new state. - * @return The state after handling. - */ - public abstract SegmentBuilderSegment handle() - throws SegmentInfoConstructionException, SegmentUpdaterException; - - public boolean isBuilt() { - return false; - } - - @Override - public String toString() { - return "SegmentBuilderSegment{" - + "segmentInfo=" + segmentInfo - + ", state=" + this.getClass().getSimpleName() - + ", alreadyRetriedCount=" + alreadyRetriedCount + '}'; - } - - /** - * Given a SegmentInfo, create a new one with the same time slice and partitionID but clean - * internal state. 
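The classes in this package form a small per-segment state machine: each concrete SegmentBuilderSegment returns its successor from handle(). The driver loop that walks the states is not shown in this section, so the following is only a hypothetical sketch built from the methods declared above (handle(), isBuilt(), getAlreadyRetriedCount(), getSegmentName()) and the retry budget exposed by SegmentConfig.

```java
// Hypothetical driver sketch, not the actual SegmentBuilder loop: advance one segment
// through its states until it reports that it is built, honoring the retry budget.
static void buildSegment(SegmentBuilderSegment initialState, SegmentConfig segmentConfig)
    throws SegmentInfoConstructionException, SegmentUpdaterException {
  SegmentBuilderSegment state = initialState;  // e.g. a NotYetBuiltSegment
  while (!state.isBuilt()) {
    if (state.getAlreadyRetriedCount() > segmentConfig.getMaxRetriesOnFailure()) {
      throw new SegmentUpdaterException("Giving up on segment " + state.getSegmentName());
    }
    state = state.handle();  // may return the same state, a retry state, or a built state
  }
}
```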
- */ - protected SegmentInfo createNewSegmentInfo(SegmentInfo oldSegmentInfo) - throws SegmentInfoConstructionException { - Preconditions.checkArgument(oldSegmentInfo.getSegment() instanceof ArchiveSegment); - ArchiveSegment archiveSegment = (ArchiveSegment) oldSegmentInfo.getSegment(); - - try { - ArchiveSegment segment = new ArchiveSegment(archiveSegment.getArchiveTimeSlice(), - archiveSegment.getHashPartitionID(), EarlybirdConfig.getMaxSegmentSize()); - - return new SegmentInfo(segment, earlybirdSegmentFactory, sync); - } catch (IOException e) { - throw new SegmentInfoConstructionException("Error creating new segments", e); - } - } - - protected TryLock getZooKeeperTryLock() { - ZooKeeperTryLockFactory tryLockFactory = segmentConfig.getTryLockFactory(); - String zkRootPath = sync.getZooKeeperSyncFullPath(); - String nodeName = segmentInfo.getZkNodeName(); - Amount expirationTime = segmentConfig.getSegmentZKLockExpirationTime(); - - return tryLockFactory.createTryLock( - DatabaseConfig.getLocalHostname(), - zkRootPath, - nodeName, - expirationTime); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.docx new file mode 100644 index 000000000..782c407ec Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.java deleted file mode 100644 index e53f060c4..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentConfig.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig; - -public class SegmentConfig { - private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig; - private final Amount segmentZKLockExpirationTime; - private final int maxRetriesOnFailure; - private final ZooKeeperTryLockFactory tryLockFactory; - - public SegmentConfig( - ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig, - Amount segmentZKLockExpirationTime, - int maxRetriesOnFailure, - ZooKeeperTryLockFactory tryLockFactory) { - - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.segmentZKLockExpirationTime = segmentZKLockExpirationTime; - this.maxRetriesOnFailure = maxRetriesOnFailure; - this.tryLockFactory = tryLockFactory; - } - - public ArchiveOnDiskEarlybirdIndexConfig getEarlybirdIndexConfig() { - return earlybirdIndexConfig; - } - - public Amount getSegmentZKLockExpirationTime() { - return segmentZKLockExpirationTime; - } - - public int getMaxRetriesOnFailure() { - return maxRetriesOnFailure; - } - - public ZooKeeperTryLockFactory getTryLockFactory() { - return tryLockFactory; - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.docx new file mode 100644 index 000000000..f223cd669 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.java deleted file mode 100644 index d7b69b96c..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentInfoConstructionException.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.io.IOException; - -/** - * Used if exceptions are thrown while creating a new SegmentInfo during the indexing loop - */ -class SegmentInfoConstructionException extends Exception { - SegmentInfoConstructionException(String msg, IOException e) { - super(msg, e); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.docx new file mode 100644 index 000000000..c31a995fa Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.java deleted file mode 100644 index 5ccbbdc25..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SegmentUpdaterException.java +++ /dev/null @@ -1,13 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import com.google.common.annotations.VisibleForTesting; - -/** - * Used when SegmentUpdater fails while processing segments. - */ -@VisibleForTesting -class SegmentUpdaterException extends Exception { - SegmentUpdaterException(String msg) { - super(msg); - } -} diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.docx b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.docx new file mode 100644 index 000000000..81cb0fd1d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.java b/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.java deleted file mode 100644 index c4f30c70d..000000000 --- a/src/java/com/twitter/search/earlybird/archive/segmentbuilder/SomeoneElseIsBuildingSegment.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.earlybird.archive.segmentbuilder; - -import java.util.concurrent.atomic.AtomicBoolean; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.common.base.Command; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.earlybird.archive.ArchiveHDFSUtils; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; - -public class SomeoneElseIsBuildingSegment extends SegmentBuilderSegment { - public SomeoneElseIsBuildingSegment( - SegmentInfo segmentInfo, - SegmentConfig segmentConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - int alreadyRetriedCount, - SegmentSyncConfig sync) { - - super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync); - } - - /** - * This method refreshes the local state of a segment. - * 1. 
Try to grab the ZK lock - * 2a. if got the lock, the segment is not being built; mark segment as NOT_BUILT_YET. - * 2b. otherwise, the segment is being built; keep the SOMEONE_ELSE_IS_BUILDING state - */ - @Override - public SegmentBuilderSegment handle() - throws SegmentInfoConstructionException, SegmentUpdaterException { - - TryLock lock = getZooKeeperTryLock(); - - final AtomicBoolean alreadyBuilt = new AtomicBoolean(false); - boolean gotLock = lock.tryWithLock((Command) () -> { - // The segment might have already finished built by others - if (segmentExistsOnHdfs()) { - alreadyBuilt.set(true); - } - }); - - if (!gotLock) { - return this; - } - - if (alreadyBuilt.get()) { - return new BuiltAndFinalizedSegment( - segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync); - } else { - // When a segment failed building, its state might not be clean. So, it is necessary to - // create a new SegmentInfo with a clean state - SegmentInfo newSegmentInfo = createNewSegmentInfo(segmentInfo); - return new NotYetBuiltSegment( - newSegmentInfo, - segmentConfig, - earlybirdSegmentFactory, - alreadyRetriedCount + 1, - sync); - } - } - - @VisibleForTesting - boolean segmentExistsOnHdfs() { - return ArchiveHDFSUtils.hasSegmentIndicesOnHDFS(sync, segmentInfo); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/BUILD b/src/java/com/twitter/search/earlybird/common/BUILD deleted file mode 100644 index 797ad1f25..000000000 --- a/src/java/com/twitter/search/earlybird/common/BUILD +++ /dev/null @@ -1,37 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/twitter/elephantbird:core", - "3rdparty/jvm/commons-codec", - "3rdparty/jvm/commons-httpclient", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "decider/src/main/scala", - "finagle/finagle-core/src/main", - "finagle/finagle-thrift/src/main/java", - "finagle/finagle-thrift/src/main/scala", - "scrooge/scrooge-core/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/optional", - "src/java/com/twitter/search/common/decider", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util:finagleutil", - "src/java/com/twitter/search/common/util/earlybird", - "src/java/com/twitter/search/common/util/thrift:thrift-utils", - "src/java/com/twitter/search/queryparser/query:core-query-nodes", - "src/thrift/com/twitter/context:twitter-context-scala", - "src/thrift/com/twitter/search:earlybird-java", - "src/thrift/com/twitter/search/common:caching-java", - "src/thrift/com/twitter/search/common:constants-java", - "src/thrift/com/twitter/search/common:query-java", - "strato/src/main/scala/com/twitter/strato/opcontext", - "twitter-context/src/main/scala", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/earlybird/common/BUILD.docx b/src/java/com/twitter/search/earlybird/common/BUILD.docx new file mode 100644 index 000000000..ac1131d90 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.docx b/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.docx new file mode 100644 index 000000000..cd2834bb4 Binary files 
/dev/null and b/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.java b/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.java deleted file mode 100644 index a2f2206ad..000000000 --- a/src/java/com/twitter/search/earlybird/common/Base64RequestResponseForLogging.java +++ /dev/null @@ -1,120 +0,0 @@ -package com.twitter.search.earlybird.common; - -import org.apache.commons.codec.binary.Base64; -import org.apache.thrift.TException; -import org.apache.thrift.TSerializer; -import org.apache.thrift.protocol.TBinaryProtocol; -import org.slf4j.Logger; - -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; - -public final class Base64RequestResponseForLogging { - private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger( - Base64RequestResponseForLogging.class); - private static final Logger FAILED_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger( - Base64RequestResponseForLogging.class.getName() + ".FailedRequests"); - private static final Logger RANDOM_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger( - Base64RequestResponseForLogging.class.getName() + ".RandomRequests"); - private static final Logger SLOW_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger( - Base64RequestResponseForLogging.class.getName() + ".SlowRequests"); - - private enum LogType { - FAILED, - RANDOM, - SLOW, - }; - - private final LogType logtype; - private final String logLine; - private final EarlybirdRequest request; - private final EarlybirdResponse response; - private final Base64 base64 = new Base64(); - - // TSerializer is not threadsafe, so create a new one for each request - private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory()); - - private Base64RequestResponseForLogging( - LogType logType, String logLine, EarlybirdRequest request, EarlybirdResponse response) { - this.logtype = logType; - this.logLine = logLine; - this.request = request; - this.response = response; - } - - public static Base64RequestResponseForLogging randomRequest( - String logLine, EarlybirdRequest request, EarlybirdResponse response) { - return new Base64RequestResponseForLogging(LogType.RANDOM, logLine, request, response); - } - - public static Base64RequestResponseForLogging failedRequest( - String logLine, EarlybirdRequest request, EarlybirdResponse response) { - return new Base64RequestResponseForLogging(LogType.FAILED, logLine, request, response); - } - - public static Base64RequestResponseForLogging slowRequest( - String logLine, EarlybirdRequest request, EarlybirdResponse response) { - return new Base64RequestResponseForLogging(LogType.SLOW, logLine, request, response); - } - - private String asBase64(EarlybirdRequest clearedRequest) { - try { - // The purpose of this log is to make it easy to re-issue requests in formz to reproduce - // issues. If queries are re-issued as is they will be treated as late-arriving queries and - // dropped due to the clientRequestTimeMs being set to the original query time. For ease of - // use purposes we clear clientRequestTimeMs and log it out separately for the rare case it - // is needed. 
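Because the request is logged as Base64-encoded TBinaryProtocol bytes, turning a logged blob back into an EarlybirdRequest for replay is just the two steps in reverse. The helper below is a hedged sketch of that reverse path and is not part of the original sources; TDeserializer and Base64 are the standard libthrift and commons-codec APIs.

```java
import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TDeserializer;
import org.apache.thrift.TException;
import org.apache.thrift.protocol.TBinaryProtocol;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;

// Hypothetical helper class for re-issuing logged requests.
public final class LoggedRequestDecoder {
  private LoggedRequestDecoder() { }

  /** Decodes one Base64 blob from the request log back into an EarlybirdRequest. */
  public static EarlybirdRequest decode(String base64Blob) throws TException {
    EarlybirdRequest request = new EarlybirdRequest();
    new TDeserializer(new TBinaryProtocol.Factory())
        .deserialize(request, Base64.decodeBase64(base64Blob));
    // clientRequestTimeMs was cleared before logging; set it to "now" before re-issuing.
    return request;
  }
}
```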
- clearedRequest.unsetClientRequestTimeMs(); - return base64.encodeToString(serializer.serialize(clearedRequest)); - } catch (TException e) { - GENERAL_LOG.error("Failed to serialize request for logging.", e); - return "failed_to_serialize"; - } - } - - private String asBase64(EarlybirdResponse earlybirdResponse) { - try { - return base64.encodeToString(serializer.serialize(earlybirdResponse)); - } catch (TException e) { - GENERAL_LOG.error("Failed to serialize response for logging.", e); - return "failed_to_serialize"; - } - } - - private String getFormattedMessage() { - String base64Request = asBase64( - EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request)); - String base64Response = asBase64(response); - return logLine + ", clientRequestTimeMs: " + request.getClientRequestTimeMs() - + ", " + base64Request + ", " + base64Response; - } - - /** - * Logs the Base64-encoded request and response to the success or failure log. - */ - public void log() { - // Do the serializing/concatting this way so it happens on the background thread for - // async logging - Object logObject = new Object() { - @Override - public String toString() { - return getFormattedMessage(); - } - }; - - switch (logtype) { - case FAILED: - FAILED_REQUEST_LOG.info("{}", logObject); - break; - case RANDOM: - RANDOM_REQUEST_LOG.info("{}", logObject); - break; - case SLOW: - SLOW_REQUEST_LOG.info("{}", logObject); - break; - default: - // Not logging anything for other log types. - break; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.docx b/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.docx new file mode 100644 index 000000000..430f6081b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.java b/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.java deleted file mode 100644 index cd6d49c06..000000000 --- a/src/java/com/twitter/search/earlybird/common/CaughtUpMonitor.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.twitter.search.earlybird.common; - -import java.util.concurrent.atomic.AtomicBoolean; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCustomGauge; - -/** - * A monitor which enforces the condition that a single thread's work is caught up, and allows - * other threads to wait to be notified when the work is complete. An AtomicBoolean ensures the - * current status is visible to all threads. - */ -public class CaughtUpMonitor { - private static final Logger LOG = LoggerFactory.getLogger(CaughtUpMonitor.class); - - protected final AtomicBoolean isCaughtUp = new AtomicBoolean(false); - - public CaughtUpMonitor(String statPrefix) { - SearchCustomGauge.export(statPrefix + "_is_caught_up", () -> isCaughtUp() ? 1 : 0); - } - - public boolean isCaughtUp() { - return isCaughtUp.get(); - } - - /** - * Set caught up state, and notify waiting threads if caught up. - */ - public synchronized void setAndNotify(boolean caughtUp) { - isCaughtUp.set(caughtUp); - if (caughtUp) { - // Readers are caught up, notify waiting threads - notifyAll(); - } - } - - /** - * Wait using Object.wait() until caught up or until thread is interrupted. 
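A hedged usage sketch of CaughtUpMonitor (the thread and the indexing work are invented for illustration): one thread blocks until another signals that it has caught up.

```java
CaughtUpMonitor monitor = new CaughtUpMonitor("indexing");  // exports indexing_is_caught_up

Thread indexer = new Thread(() -> {
  // ... drain the backlog (hypothetical work) ...
  monitor.setAndNotify(true);  // wakes every thread blocked in resetAndWaitUntilCaughtUp()
});
indexer.start();

// Resets isCaughtUp to false, then blocks until setAndNotify(true) is called.
monitor.resetAndWaitUntilCaughtUp();
```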
- */ - public synchronized void resetAndWaitUntilCaughtUp() { - LOG.info("Waiting to catch up."); - // Explicitly set isCaughtUp to false before waiting - isCaughtUp.set(false); - try { - while (!isCaughtUp()) { - wait(); - } - } catch (InterruptedException e) { - LOG.error("{} was interrupted while waiting to catch up", Thread.currentThread()); - } - LOG.info("Caught up."); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx b/src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx new file mode 100644 index 000000000..d76924cd7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/ClientIdUtil.java b/src/java/com/twitter/search/earlybird/common/ClientIdUtil.java deleted file mode 100644 index 46b916adf..000000000 --- a/src/java/com/twitter/search/earlybird/common/ClientIdUtil.java +++ /dev/null @@ -1,85 +0,0 @@ -package com.twitter.search.earlybird.common; - -import java.util.Optional; - -import com.twitter.common.optional.Optionals; -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.strato.opcontext.Attribution; -import com.twitter.strato.opcontext.HttpEndpoint; - -public final class ClientIdUtil { - // Blenders should always set the EarlybirdRequest.clientId field. It should be set to the Finagle - // client ID of the client that caused the blender to send this request to the roots. If the - // Finagle ID of the blender's client cannot be determined, it will be set to "unknown" (see - // com.twitter.search.common.util.FinagleUtil.UNKNOWN_CLIENT_NAME). However, other services that - // send requests to roots might not set EarlybirdRequest.clientId. - // - // So an "unset" clientId means: EarlybirdRequest.clientId was null. - // An "unknown" clientId means: the client that sent us the request - // tried setting EarlybirdRequest.clientId, but couldn't figure out a good value for it. - public static final String UNSET_CLIENT_ID = "unset"; - - private static final String CLIENT_ID_FOR_UNKNOWN_CLIENTS = "unknown_client_id"; - - private static final String CLIENT_ID_PREFIX = "client_id_"; - - private static final String FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN = - "finagle_id_%s_and_client_id_%s"; - - private static final String CLIENT_ID_AND_REQUEST_TYPE = "client_id_%s_and_type_%s"; - - private ClientIdUtil() { - } - - /** Returns the ID of the client that initiated this request or UNSET_CLIENT_ID if not set. */ - public static String getClientIdFromRequest(EarlybirdRequest request) { - return Optional - .ofNullable(request.getClientId()) - .map(String::toLowerCase) - .orElse(UNSET_CLIENT_ID); - } - - /** - * Returns the Strato http endpoint attribution as an Optional. - */ - public static Optional getClientIdFromHttpEndpointAttribution() { - return Optionals - .optional(Attribution.httpEndpoint()) - .map(HttpEndpoint::name) - .map(String::toLowerCase); - } - - /** Formats the given clientId into a string that can be used for stats. */ - public static String formatClientId(String clientId) { - return CLIENT_ID_PREFIX + clientId; - } - - /** - * Formats the given Finagle clientId and the given clientId into a single string that can be used - * for stats, or other purposes where the two IDs need to be combined. 
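A hedged example of how the ClientIdUtil helpers compose into stat names (the client name is made up; the lowercase conversion and the "unset" fallback come from getClientIdFromRequest above):

```java
EarlybirdRequest withClient = new EarlybirdRequest();
withClient.setClientId("MyBlenderCaller");  // hypothetical caller

// "client_id_myblendercaller"
String stat = ClientIdUtil.formatClientId(ClientIdUtil.getClientIdFromRequest(withClient));

// "client_id_unset": the clientId field was never set on this request
String unsetStat =
    ClientIdUtil.formatClientId(ClientIdUtil.getClientIdFromRequest(new EarlybirdRequest()));
```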
- */ - public static String formatFinagleClientIdAndClientId(String finagleClientId, String clientId) { - return String.format(FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN, finagleClientId, clientId); - } - - /** - * Formats the given clientId and requestType into a single string that can be used - * for stats or other purposes. - */ - public static String formatClientIdAndRequestType( - String clientId, String requestType) { - return String.format(CLIENT_ID_AND_REQUEST_TYPE, clientId, requestType); - } - - /** - * Format the quota client id - */ - public static String getQuotaClientId(String clientId) { - if (FinagleUtil.UNKNOWN_CLIENT_NAME.equals(clientId) || UNSET_CLIENT_ID.equals(clientId)) { - return CLIENT_ID_FOR_UNKNOWN_CLIENTS; - } - - return clientId; - } -} diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.docx b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.docx new file mode 100644 index 000000000..c25241a6b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.java b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.java deleted file mode 100644 index 303507b2b..000000000 --- a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestLogger.java +++ /dev/null @@ -1,365 +0,0 @@ -package com.twitter.search.earlybird.common; - -import java.util.EnumMap; -import java.util.Map; - -import scala.Option; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.Maps; - -import com.twitter.context.TwitterContext; -import com.twitter.context.thriftscala.Viewer; -import com.twitter.decider.Decider; -import com.twitter.finagle.thrift.ClientId; -import com.twitter.finagle.thrift.ClientId$; -import com.twitter.search.TwitterContextPermit; -import com.twitter.search.common.constants.thriftjava.ThriftQuerySource; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.logging.RPCLogger; -import com.twitter.search.common.metrics.FailureRatioCounter; -import com.twitter.search.common.metrics.Timer; -import com.twitter.search.common.util.earlybird.TermStatisticsUtil; -import com.twitter.search.common.util.earlybird.ThriftSearchResultUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest; -import com.twitter.search.earlybird.thrift.ThriftHistogramSettings; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest; - -import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil - .responseConsideredFailed; - - -public class EarlybirdRequestLogger extends RPCLogger { - protected enum ExtraFields { - QUERY_MAX_HITS_TO_PROCESS, - COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS, - RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS, - NUM_HITS_PROCESSED, - QUERY_COST, - CPU_TOTAL, - QUERY_SOURCE, - CLIENT_ID, - FINAGLE_CLIENT_ID - } - - protected enum ShardOnlyExtraFields { - NUM_SEARCHED_SEGMENTS, - SCORING_TIME_NANOS - } - - protected enum RootOnlyExtraFields { - CACHING_ALLOWED, - DEBUG_MODE, - CACHE_HIT, - USER_AGENT, - // See JIRA APPSEC-2303 for IP addresses logging - } - - private static final String LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY = - "log_full_request_details_on_error"; - private static final String 
LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY = - "log_full_request_details_random_fraction"; - private static final String LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY = - "log_full_slow_request_details_random_fraction"; - private static final String SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY = - "slow_request_latency_threshold_ms"; - - private final Decider decider; - private final boolean enableLogUnknownClientRequests; - - private static final Map - FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE = preBuildFailureRatioCounters(); - private static final FailureRatioCounter NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER = - new FailureRatioCounter("earlybird_logger", "query_source", "not_set"); - - static EarlybirdRequestLogger buildForRoot( - String loggerName, int latencyWarnThreshold, Decider decider) { - - return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold, - decider, true, RPCLogger.Fields.values(), ExtraFields.values(), - RootOnlyExtraFields.values()); - } - - static EarlybirdRequestLogger buildForShard( - String loggerName, int latencyWarnThreshold, Decider decider) { - - return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold, - decider, false, RPCLogger.Fields.values(), ExtraFields.values(), - ShardOnlyExtraFields.values()); - } - - @VisibleForTesting - EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider) { - this(loggerName, latencyWarnThreshold, decider, false, RPCLogger.Fields.values(), - ExtraFields.values(), RootOnlyExtraFields.values(), ShardOnlyExtraFields.values()); - } - - private EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider, - boolean enableLogUnknownClientRequests, Enum[]... fieldEnums) { - super(loggerName, fieldEnums); - this.decider = decider; - this.enableLogUnknownClientRequests = enableLogUnknownClientRequests; - setLatencyWarnThreshold(latencyWarnThreshold); - } - - /** - * Logs the given earlybird request and response. - * - * @param request The earlybird request. - * @param response The earlybird response. - * @param timer The time it took to process this request. - */ - public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) { - try { - LogEntry entry = newLogEntry(); - - setRequestLogEntries(entry, request); - setResponseLogEntries(entry, response); - if (timer != null) { - entry.setField(ExtraFields.CPU_TOTAL, Long.toString(timer.getElapsedCpuTotal())); - } - - boolean wasError = response != null && responseConsideredFailed(response.getResponseCode()); - - long responseTime = response != null ? response.getResponseTime() : 0L; - - String logLine = writeLogLine(entry, responseTime, wasError); - - // This code path is called for pre/post logging - // Prevent same request showing up twice by only logging on post logging - if (response != null && DeciderUtil.isAvailableForRandomRecipient( - decider, LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) { - Base64RequestResponseForLogging.randomRequest(logLine, request, response).log(); - } - - // Unknown client request logging only applies to pre-logging. 
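The same logger instance serves a pre-logging pass (before the request is processed, so the response is null) and a post-logging pass. Below is a hedged sketch of that flow using the EarlybirdRequestPreLogger and EarlybirdRequestPostLogger wrappers that appear later in this diff; the process() call and the warn threshold are invented, and passing a null Timer is tolerated by the logger above.

```java
// Hedged sketch of the pre/post logging flow; not the actual serving path.
static EarlybirdResponse serve(EarlybirdRequest request, Decider decider) {
  EarlybirdRequestPreLogger preLogger =
      EarlybirdRequestPreLogger.buildForShard(200, decider);   // warn threshold is arbitrary
  EarlybirdRequestPostLogger postLogger =
      EarlybirdRequestPostLogger.buildForShard(200, decider);

  preLogger.logRequest(request);                   // pre-log: response is null, unknown-client logging
  EarlybirdResponse response = process(request);   // hypothetical processing call
  postLogger.logRequest(request, response, null);  // post-log: sampling, failure and slow-request logs
  return response;
}
```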
- if (enableLogUnknownClientRequests && response == null) { - UnknownClientRequestForLogging unknownClientRequestLogger = - UnknownClientRequestForLogging.unknownClientRequest(logLine, request); - if (unknownClientRequestLogger != null) { - unknownClientRequestLogger.log(); - } - } - - if (wasError - && DeciderUtil.isAvailableForRandomRecipient( - decider, LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY)) { - new RequestResponseForLogging(request, response).logFailedRequest(); - Base64RequestResponseForLogging.failedRequest(logLine, request, response).log(); - } - - boolean wasSlow = response != null - && responseTime >= DeciderUtil.getAvailability( - decider, SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY); - if (wasSlow - && DeciderUtil.isAvailableForRandomRecipient( - decider, LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) { - Base64RequestResponseForLogging.slowRequest(logLine, request, response).log(); - } - - FailureRatioCounter failureRatioCounter = - FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE.get(request.getQuerySource()); - if (failureRatioCounter != null) { - failureRatioCounter.requestFinished(!wasError); - } else { - NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER.requestFinished(!wasError); - } - - } catch (Exception e) { - LOG.error("Exception building log entry ", e); - } - } - - private void setRequestLogEntries(LogEntry entry, EarlybirdRequest request) { - entry.setField(Fields.CLIENT_HOST, request.getClientHost()); - entry.setField(Fields.CLIENT_REQUEST_ID, request.getClientRequestID()); - entry.setField(Fields.REQUEST_TYPE, requestTypeForLog(request)); - - if (request.isSetSearchQuery()) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - entry.setField(Fields.QUERY, searchQuery.getSerializedQuery()); - - if (searchQuery.isSetMaxHitsToProcess()) { - entry.setField(ExtraFields.QUERY_MAX_HITS_TO_PROCESS, - Integer.toString(searchQuery.getMaxHitsToProcess())); - } - - if (searchQuery.isSetCollectorParams() - && searchQuery.getCollectorParams().isSetTerminationParams() - && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) { - entry.setField(ExtraFields.COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS, - Integer.toString(searchQuery.getCollectorParams().getTerminationParams() - .getMaxHitsToProcess())); - } - - if (searchQuery.isSetRelevanceOptions() - && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) { - entry.setField(ExtraFields.RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS, - Integer.toString(searchQuery.getRelevanceOptions().getMaxHitsToProcess())); - } - } - - entry.setField(Fields.NUM_REQUESTED, Integer.toString(numRequestedForLog(request))); - - if (request.isSetQuerySource()) { - entry.setField(ExtraFields.QUERY_SOURCE, request.getQuerySource().name()); - } - - if (request.isSetClientId()) { - entry.setField(ExtraFields.CLIENT_ID, request.getClientId()); - } - - entry.setField(RootOnlyExtraFields.CACHING_ALLOWED, - Boolean.toString(EarlybirdRequestUtil.isCachingAllowed(request))); - - entry.setField(RootOnlyExtraFields.DEBUG_MODE, Byte.toString(request.getDebugMode())); - - Option clientIdOption = ClientId$.MODULE$.current(); - if (clientIdOption.isDefined()) { - entry.setField(ExtraFields.FINAGLE_CLIENT_ID, clientIdOption.get().name()); - } - - setLogEntriesFromTwitterContext(entry); - } - - @VisibleForTesting - Option getTwitterContext() { - return TwitterContext.acquire(TwitterContextPermit.get()).apply(); - } - - private void setLogEntriesFromTwitterContext(LogEntry entry) { - Option viewerOption = getTwitterContext(); - if 
(viewerOption.nonEmpty()) { - Viewer viewer = viewerOption.get(); - - if (viewer.userAgent().nonEmpty()) { - String userAgent = viewer.userAgent().get(); - - // we only replace the comma in the user-agent with %2C to make it easily parseable, - // specially with command line tools like cut/sed/awk - userAgent = userAgent.replace(",", "%2C"); - - entry.setField(RootOnlyExtraFields.USER_AGENT, userAgent); - } - } - } - - private void setResponseLogEntries(LogEntry entry, EarlybirdResponse response) { - if (response != null) { - entry.setField(Fields.NUM_RETURNED, Integer.toString(numResultsForLog(response))); - entry.setField(Fields.RESPONSE_CODE, String.valueOf(response.getResponseCode())); - entry.setField(Fields.RESPONSE_TIME_MICROS, Long.toString(response.getResponseTimeMicros())); - if (response.isSetSearchResults()) { - entry.setField(ExtraFields.NUM_HITS_PROCESSED, - Integer.toString(response.getSearchResults().getNumHitsProcessed())); - entry.setField(ExtraFields.QUERY_COST, - Double.toString(response.getSearchResults().getQueryCost())); - if (response.getSearchResults().isSetScoringTimeNanos()) { - entry.setField(ShardOnlyExtraFields.SCORING_TIME_NANOS, - Long.toString(response.getSearchResults().getScoringTimeNanos())); - } - } - if (response.isSetCacheHit()) { - entry.setField(RootOnlyExtraFields.CACHE_HIT, String.valueOf(response.isCacheHit())); - } - if (response.isSetNumSearchedSegments()) { - entry.setField(ShardOnlyExtraFields.NUM_SEARCHED_SEGMENTS, - Integer.toString(response.getNumSearchedSegments())); - } - } - } - - private static int numRequestedForLog(EarlybirdRequest request) { - int num = 0; - if (request.isSetFacetRequest() && request.getFacetRequest().isSetFacetFields()) { - for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) { - num += field.getNumResults(); - } - } else if (request.isSetTermStatisticsRequest()) { - num = request.getTermStatisticsRequest().getTermRequestsSize(); - } else if (request.isSetSearchQuery()) { - num = request.getSearchQuery().isSetCollectorParams() - ? request.getSearchQuery().getCollectorParams().getNumResultsToReturn() : 0; - if (request.getSearchQuery().getSearchStatusIdsSize() > 0) { - num = Math.max(num, request.getSearchQuery().getSearchStatusIdsSize()); - } - } - return num; - } - - /** - * Returns the number of results in the given response. If the response is a term stats response, - * then the returned value will be the number of term results. If the response is a facet - * response, then the returned value will be the number of facet results. Otherwise, the returned - * value will be the number of search results. - */ - public static int numResultsForLog(EarlybirdResponse response) { - if (response == null) { - return 0; - } else if (response.isSetFacetResults()) { - return ThriftSearchResultUtil.numFacetResults(response.getFacetResults()); - } else if (response.isSetTermStatisticsResults()) { - return response.getTermStatisticsResults().getTermResultsSize(); - } else { - return ThriftSearchResultUtil.numResults(response.getSearchResults()); - } - } - - private static String requestTypeForLog(EarlybirdRequest request) { - StringBuilder requestType = new StringBuilder(64); - if (request.isSetFacetRequest()) { - requestType.append("FACETS"); - int numFields = request.getFacetRequest().getFacetFieldsSize(); - if (numFields > 0) { - // For 1 or 2 fields, just put them in the request type. For more, just log the number. 
- if (numFields <= 2) { - for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) { - requestType.append(":").append(field.getFieldName().toUpperCase()); - } - } else { - requestType.append(":MULTI-").append(numFields); - } - } - } else if (request.isSetTermStatisticsRequest()) { - ThriftTermStatisticsRequest termStatsRequest = request.getTermStatisticsRequest(); - requestType.append("TERMSTATS-") - .append(termStatsRequest.getTermRequestsSize()); - - ThriftHistogramSettings histoSettings = termStatsRequest.getHistogramSettings(); - if (histoSettings != null) { - String binSizeVal = String.valueOf(TermStatisticsUtil.determineBinSize(histoSettings)); - String numBinsVal = String.valueOf(histoSettings.getNumBins()); - requestType.append(":NUMBINS-").append(numBinsVal).append(":BINSIZE-").append(binSizeVal); - } - } else if (request.isSetSearchQuery()) { - requestType.append("SEARCH:"); - requestType.append(request.getSearchQuery().getRankingMode().name()); - // Denote when a from user id is present. - if (request.getSearchQuery().isSetFromUserIDFilter64()) { - requestType.append(":NETWORK-") - .append(request.getSearchQuery().getFromUserIDFilter64Size()); - } - // Denote when required status ids are present. - if (request.getSearchQuery().getSearchStatusIdsSize() > 0) { - requestType.append(":IDS-").append(request.getSearchQuery().getSearchStatusIdsSize()); - } - } - return requestType.toString(); - } - - private static Map preBuildFailureRatioCounters() { - Map counterByQuerySource = - new EnumMap<>(ThriftQuerySource.class); - - for (ThriftQuerySource thriftQuerySource : ThriftQuerySource.values()) { - FailureRatioCounter counter = new FailureRatioCounter("earlybird_logger", "query_source", - thriftQuerySource.toString()); - counterByQuerySource.put(thriftQuerySource, counter); - } - - return Maps.immutableEnumMap(counterByQuerySource); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.docx b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.docx new file mode 100644 index 000000000..48e235c77 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.java b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.java deleted file mode 100644 index ab0d709f4..000000000 --- a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPostLogger.java +++ /dev/null @@ -1,37 +0,0 @@ -package com.twitter.search.earlybird.common; - -import com.twitter.decider.Decider; -import com.twitter.search.common.metrics.Timer; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; - -public final class EarlybirdRequestPostLogger { - private final EarlybirdRequestLogger logger; - - public static EarlybirdRequestPostLogger buildForRoot( - int latencyWarnThreshold, Decider decider) { - - EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot( - EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider); - - return new EarlybirdRequestPostLogger(requestLogger); - } - - public static EarlybirdRequestPostLogger buildForShard( - int latencyWarnThreshold, Decider decider) { - - EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard( - EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider); - - return new 
EarlybirdRequestPostLogger(requestLogger); - } - - private EarlybirdRequestPostLogger(EarlybirdRequestLogger logger) { - this.logger = logger; - } - - public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) { - EarlybirdRequestUtil.updateHitsCounters(request); - logger.logRequest(request, response, timer); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.docx b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.docx new file mode 100644 index 000000000..1a19197ac Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.java b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.java deleted file mode 100644 index 66d1d8b29..000000000 --- a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestPreLogger.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.earlybird.common; - -import com.twitter.decider.Decider; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; - -public final class EarlybirdRequestPreLogger { - private final EarlybirdRequestLogger logger; - - public static EarlybirdRequestPreLogger buildForRoot(Decider decider) { - EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot( - EarlybirdRequestPreLogger.class.getName(), Integer.MAX_VALUE, decider); - - return new EarlybirdRequestPreLogger(requestLogger); - } - - public static EarlybirdRequestPreLogger buildForShard( - int latencyWarnThreshold, Decider decider) { - - EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard( - EarlybirdRequestPreLogger.class.getName(), latencyWarnThreshold, decider); - - return new EarlybirdRequestPreLogger(requestLogger); - } - - private EarlybirdRequestPreLogger(EarlybirdRequestLogger logger) { - this.logger = logger; - } - - public void logRequest(EarlybirdRequest request) { - logger.logRequest(request, null, null); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.docx b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.docx new file mode 100644 index 000000000..cbb7925a2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.java b/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.java deleted file mode 100644 index 6cdd322c5..000000000 --- a/src/java/com/twitter/search/earlybird/common/EarlybirdRequestUtil.java +++ /dev/null @@ -1,244 +0,0 @@ -package com.twitter.search.earlybird.common; - -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchMovingAverage; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.query.thriftjava.CollectorParams; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions; - -public final class EarlybirdRequestUtil { - // This 
logger is setup to log to a separate set of log files (request_info) and use an - // async logger so as to not block the searcher thread. See search/earlybird/config/log4j.xml - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdRequestUtil.class); - - @VisibleForTesting - static final SearchMovingAverage REQUESTED_NUM_RESULTS_STAT = - SearchMovingAverage.export("requested_num_results"); - - @VisibleForTesting - static final SearchMovingAverage REQUESTED_MAX_HITS_TO_PROCESS_STAT = - SearchMovingAverage.export("requested_max_hits_to_process"); - - @VisibleForTesting - static final SearchMovingAverage REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT = - SearchMovingAverage.export("requested_collector_params_max_hits_to_process"); - - @VisibleForTesting - static final SearchMovingAverage REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT = - SearchMovingAverage.export("requested_relevance_options_max_hits_to_process"); - - @VisibleForTesting - static final SearchCounter REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT = - SearchCounter.export("requested_max_hits_to_process_are_different"); - - private static final SearchRateCounter REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT = - SearchRateCounter.export("request_with_more_than_2k_num_result"); - private static final SearchRateCounter REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT = - SearchRateCounter.export("request_with_more_than_4k_num_result"); - - // Stats for tracking clock skew between earlybird and the client-specified request timestamp. - @VisibleForTesting - public static final SearchTimerStats CLIENT_CLOCK_DIFF_ABS = - SearchTimerStats.export("client_clock_diff_abs", TimeUnit.MILLISECONDS, false, true); - @VisibleForTesting - public static final SearchTimerStats CLIENT_CLOCK_DIFF_POS = - SearchTimerStats.export("client_clock_diff_pos", TimeUnit.MILLISECONDS, false, true); - @VisibleForTesting - public static final SearchTimerStats CLIENT_CLOCK_DIFF_NEG = - SearchTimerStats.export("client_clock_diff_neg", TimeUnit.MILLISECONDS, false, true); - @VisibleForTesting - public static final SearchRateCounter CLIENT_CLOCK_DIFF_MISSING = - SearchRateCounter.export("client_clock_diff_missing"); - - private static final int MAX_NUM_RESULTS = 4000; - private static final int OLD_MAX_NUM_RESULTS = 2000; - - private EarlybirdRequestUtil() { - } - - /** - * Logs and fixes some potentially excessive values in the given request. - */ - public static void logAndFixExcessiveValues(EarlybirdRequest request) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - if (searchQuery != null) { - int maxHitsToProcess = 0; - int numResultsToReturn = 0; - - if (searchQuery.isSetCollectorParams()) { - numResultsToReturn = searchQuery.getCollectorParams().getNumResultsToReturn(); - - if (searchQuery.getCollectorParams().isSetTerminationParams()) { - maxHitsToProcess = - searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess(); - } - } - - if (maxHitsToProcess > 50000) { - LOG.warn("Excessive max hits in " + request.toString()); - } - - // We used to limit number of results to 2000. These two counters help us track if we receive - // too many requests with large number of results set. 
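The clamp itself is applied just below. As a hedged illustration of the resulting behaviour (the request setup is invented; thrift setters chain, so it is compact):

```java
EarlybirdRequest request = new EarlybirdRequest();
request.setSearchQuery(new ThriftSearchQuery()
    .setCollectorParams(new CollectorParams().setNumResultsToReturn(10000)));

EarlybirdRequestUtil.logAndFixExcessiveValues(request);

// numResultsToReturn is now clamped to MAX_NUM_RESULTS (4000) and the >4k counter was bumped;
// a request asking for 3000 results would only have been counted against the >2k counter.
int clamped = request.getSearchQuery().getCollectorParams().getNumResultsToReturn();
```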
- String warningMessageTemplate = "Exceed %d num result in %s"; - if (numResultsToReturn > MAX_NUM_RESULTS) { - LOG.warn(String.format(warningMessageTemplate, MAX_NUM_RESULTS, request.toString())); - REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT.increment(); - searchQuery.getCollectorParams().setNumResultsToReturn(MAX_NUM_RESULTS); - } else if (numResultsToReturn > OLD_MAX_NUM_RESULTS) { - LOG.warn(String.format(warningMessageTemplate, OLD_MAX_NUM_RESULTS, request.toString())); - REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT.increment(); - } - - ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions(); - if (options != null) { - if (options.getMaxHitsToProcess() > 50000) { - LOG.warn("Excessive max hits in " + request.toString()); - } - } - } - } - - /** - * Sets {@code request.searchQuery.collectorParams} if they are not already set. - */ - public static void checkAndSetCollectorParams(EarlybirdRequest request) { - ThriftSearchQuery searchQuery = request.getSearchQuery(); - if (searchQuery == null) { - return; - } - - if (!searchQuery.isSetCollectorParams()) { - searchQuery.setCollectorParams(new CollectorParams()); - } - if (!searchQuery.getCollectorParams().isSetNumResultsToReturn()) { - searchQuery.getCollectorParams().setNumResultsToReturn(searchQuery.getNumResults()); - } - if (!searchQuery.getCollectorParams().isSetTerminationParams()) { - CollectorTerminationParams terminationParams = new CollectorTerminationParams(); - if (request.isSetTimeoutMs()) { - terminationParams.setTimeoutMs(request.getTimeoutMs()); - } - if (request.isSetMaxQueryCost()) { - terminationParams.setMaxQueryCost(request.getMaxQueryCost()); - } - searchQuery.getCollectorParams().setTerminationParams(terminationParams); - } - setMaxHitsToProcess(searchQuery); - } - - // Earlybirds will only look for maxHitsToProcess in CollectorParameters.TerminationParameters. - // The priority for setting CollectorParameters.TerminationParameters.maxHitsToProcess is: - // 1. CollectorParameters - // 2. RelevanceOptions - // 3. ThriftSearchQuery.maxHitsToProcess - private static void setMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) { - CollectorTerminationParams terminationParams = thriftSearchQuery - .getCollectorParams().getTerminationParams(); - if (!terminationParams.isSetMaxHitsToProcess()) { - if (thriftSearchQuery.isSetRelevanceOptions() - && thriftSearchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) { - terminationParams.setMaxHitsToProcess( - thriftSearchQuery.getRelevanceOptions().getMaxHitsToProcess()); - } else { - terminationParams.setMaxHitsToProcess(thriftSearchQuery.getMaxHitsToProcess()); - } - } - } - - /** - * Creates a copy of the given request and unsets the binary fields to make the logged line for - * this request look nicer. - */ - public static EarlybirdRequest copyAndClearUnnecessaryValuesForLogging(EarlybirdRequest request) { - EarlybirdRequest copiedRequest = request.deepCopy(); - - if (copiedRequest.isSetSearchQuery()) { - // These fields are very large and the binary data doesn't play well with formz - copiedRequest.getSearchQuery().unsetTrustedFilter(); - copiedRequest.getSearchQuery().unsetDirectFollowFilter(); - } - - return copiedRequest; - } - - /** - * Updates some hit-related stats based on the parameters in the given request. 
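setMaxHitsToProcess() above resolves maxHitsToProcess from three possible places in a fixed order. A hedged example of that precedence; the values are arbitrary and the setup uses chained thrift setters:

```java
ThriftSearchQuery query = new ThriftSearchQuery();
query.setMaxHitsToProcess(1000);                       // lowest-priority fallback
query.setRelevanceOptions(
    new ThriftSearchRelevanceOptions().setMaxHitsToProcess(2000));
query.setCollectorParams(new CollectorParams());       // no termination params set explicitly

EarlybirdRequest request = new EarlybirdRequest().setSearchQuery(query);
EarlybirdRequestUtil.checkAndSetCollectorParams(request);

// The relevance options win because the collector params carried no value of their own:
// query.getCollectorParams().getTerminationParams().getMaxHitsToProcess() == 2000
```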
- */ - public static void updateHitsCounters(EarlybirdRequest request) { - if ((request == null) || !request.isSetSearchQuery()) { - return; - } - - ThriftSearchQuery searchQuery = request.getSearchQuery(); - - if (searchQuery.isSetNumResults()) { - REQUESTED_NUM_RESULTS_STAT.addSample(searchQuery.getNumResults()); - } - - if (searchQuery.isSetMaxHitsToProcess()) { - REQUESTED_MAX_HITS_TO_PROCESS_STAT.addSample(searchQuery.getMaxHitsToProcess()); - } - - Integer collectorParamsMaxHitsToProcess = null; - if (searchQuery.isSetCollectorParams() - && searchQuery.getCollectorParams().isSetTerminationParams() - && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) { - collectorParamsMaxHitsToProcess = - searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess(); - REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT - .addSample(collectorParamsMaxHitsToProcess); - } - - Integer relevanceOptionsMaxHitsToProcess = null; - if (searchQuery.isSetRelevanceOptions() - && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) { - relevanceOptionsMaxHitsToProcess = searchQuery.getRelevanceOptions().getMaxHitsToProcess(); - REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT - .addSample(relevanceOptionsMaxHitsToProcess); - } - - if ((collectorParamsMaxHitsToProcess != null) - && (relevanceOptionsMaxHitsToProcess != null) - && (collectorParamsMaxHitsToProcess != relevanceOptionsMaxHitsToProcess)) { - REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT.increment(); - } - } - - public static boolean isCachingAllowed(EarlybirdRequest request) { - return !request.isSetCachingParams() || request.getCachingParams().isCache(); - } - - /** - * Track the clock difference between this server and its client's specified request time. - * When there is no clock drift between machines, this will record the inflight time between this - * server and the client. - * - * @param request the incoming earlybird request. 
- */ - public static void recordClientClockDiff(EarlybirdRequest request) { - if (request.isSetClientRequestTimeMs()) { - final long timeDiff = System.currentTimeMillis() - request.getClientRequestTimeMs(); - final long timeDiffAbs = Math.abs(timeDiff); - if (timeDiff >= 0) { - CLIENT_CLOCK_DIFF_POS.timerIncrement(timeDiffAbs); - } else { - CLIENT_CLOCK_DIFF_NEG.timerIncrement(timeDiffAbs); - } - CLIENT_CLOCK_DIFF_ABS.timerIncrement(timeDiffAbs); - } else { - CLIENT_CLOCK_DIFF_MISSING.increment(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.docx b/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.docx new file mode 100644 index 000000000..b32783db3 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.java b/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.java deleted file mode 100644 index 52a7fa898..000000000 --- a/src/java/com/twitter/search/earlybird/common/EarlybirdThriftBackend.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.search.earlybird.common; - -import javax.inject.Inject; -import javax.inject.Singleton; - -import org.apache.thrift.protocol.TProtocolFactory; - -import com.twitter.finagle.Service; -import com.twitter.search.common.util.thrift.ThriftToBytesFilter; -import com.twitter.search.earlybird.thrift.EarlybirdService; - -@Singleton -public class EarlybirdThriftBackend extends EarlybirdService.ServiceToClient { - - /** - * Wrapping the bytes svc back to a EarlybirdService.ServiceToClient, which - * is a EarlybirdService.ServiceIface again. - */ - @Inject - public EarlybirdThriftBackend( - ThriftToBytesFilter thriftToBytesFilter, - Service byteService, - TProtocolFactory protocolFactory) { - - super(thriftToBytesFilter.andThen(byteService), protocolFactory); - } - -} diff --git a/src/java/com/twitter/search/earlybird/common/NonPagingAssert.docx b/src/java/com/twitter/search/earlybird/common/NonPagingAssert.docx new file mode 100644 index 000000000..357c71f7f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/NonPagingAssert.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/NonPagingAssert.java b/src/java/com/twitter/search/earlybird/common/NonPagingAssert.java deleted file mode 100644 index 837adbb0a..000000000 --- a/src/java/com/twitter/search/earlybird/common/NonPagingAssert.java +++ /dev/null @@ -1,34 +0,0 @@ -package com.twitter.search.earlybird.common; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchRateCounter; - -/** - * When incremented, a non-paging alert will be triggered. Use this to assert for bad conditions - * that should generally never happen. 
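NonPagingAssert, defined just below, is typically created once per "should never happen" condition and bumped when that branch is hit. A hedged usage sketch; the assert name and surrounding class are invented:

```java
public class SegmentHandler {  // hypothetical caller
  private static final NonPagingAssert UNEXPECTED_NULL_SEGMENT =
      new NonPagingAssert("unexpected_null_segment");

  void handle(SegmentInfo segmentInfo) {
    if (segmentInfo == null) {
      // Increments non_paging_assert_unexpected_null_segment and logs an error.
      UNEXPECTED_NULL_SEGMENT.assertFailed();
      return;
    }
    // ... normal handling ...
  }
}
```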
- */ -public class NonPagingAssert { - private static final Logger LOG = LoggerFactory.getLogger(NonPagingAssert.class); - - private static final String ASSERT_STAT_PREFIX = "non_paging_assert_"; - - private final String name; - private final SearchRateCounter assertCounter; - - public NonPagingAssert(String name) { - this.name = name; - this.assertCounter = SearchRateCounter.export(ASSERT_STAT_PREFIX + name); - } - - public void assertFailed() { - LOG.error("NonPagingAssert failed: {}", name); - assertCounter.increment(); - } - - public static void assertFailed(String name) { - NonPagingAssert nonPagingAssert = new NonPagingAssert(name); - nonPagingAssert.assertFailed(); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.docx b/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.docx new file mode 100644 index 000000000..f4d6e92fb Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.java b/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.java deleted file mode 100644 index 695ce4503..000000000 --- a/src/java/com/twitter/search/earlybird/common/RequestResponseForLogging.java +++ /dev/null @@ -1,55 +0,0 @@ -package com.twitter.search.earlybird.common; - - -import org.apache.thrift.TException; -import org.apache.thrift.TSerializer; -import org.apache.thrift.protocol.TSimpleJSONProtocol; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; - -public class RequestResponseForLogging { - private static final Logger LOG = LoggerFactory.getLogger( - RequestResponseForLogging.class); - - private static final Logger FAILED_REQUEST_LOG = LoggerFactory.getLogger( - RequestResponseForLogging.class.getName() + ".FailedRequests"); - - private final EarlybirdRequest request; - private final EarlybirdResponse response; - - public RequestResponseForLogging(EarlybirdRequest request, - EarlybirdResponse response) { - this.request = request; - this.response = response; - } - - private String serialize(EarlybirdRequest clearedRequest, EarlybirdResponse theResponse) { - TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory()); - try { - String requestJson = serializer.toString(clearedRequest); - String responseJson = serializer.toString(theResponse); - return "{\"request\":" + requestJson + ", \"response\":" + responseJson + "}"; - } catch (TException e) { - LOG.error("Failed to serialize request/response for logging.", e); - return ""; - } - } - - /** - * Logs the request and response stored in this instance to the failure log file. 
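Both Base64RequestResponseForLogging.log() above and logFailedRequest() below hand the logger an anonymous Object whose toString() does the expensive serialization. A hedged, generic illustration of that idiom; the logger argument and buildMessage() helper are invented:

```java
static void logLazily(org.slf4j.Logger asyncLog,
                      EarlybirdRequest request, EarlybirdResponse response) {
  asyncLog.info("{}", new Object() {
    @Override
    public String toString() {
      // Runs only when the (async) appender formats the event, keeping the serialization
      // cost off the request-serving thread.
      return buildMessage(request, response);  // hypothetical expensive serialization
    }
  });
}
```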
- */ - public void logFailedRequest() { - // Do the serializing/concatting this way so it happens on the background thread for - // async logging - FAILED_REQUEST_LOG.info("{}", new Object() { - @Override - public String toString() { - return serialize( - EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request), response); - } - }); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/RequestResponsePair.docx b/src/java/com/twitter/search/earlybird/common/RequestResponsePair.docx new file mode 100644 index 000000000..cc6d6a0de Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/RequestResponsePair.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/RequestResponsePair.java b/src/java/com/twitter/search/earlybird/common/RequestResponsePair.java deleted file mode 100644 index 2a6c4b299..000000000 --- a/src/java/com/twitter/search/earlybird/common/RequestResponsePair.java +++ /dev/null @@ -1,44 +0,0 @@ -package com.twitter.search.earlybird.common; - -import org.apache.lucene.search.Query; - -import com.twitter.search.earlybird.thrift.EarlybirdRequest; -import com.twitter.search.earlybird.thrift.EarlybirdResponse; - -public class RequestResponsePair { - private final EarlybirdRequest request; - private final EarlybirdResponse response; - private final org.apache.lucene.search.Query luceneQuery; - - // The serialized query in its final form, after various modifications have been applied to it. - // As a note, we have some code paths in which this can be null, but I don't really see them - // triggered in production right now. - private final com.twitter.search.queryparser.query.Query finalSerializedQuery; - - public RequestResponsePair( - EarlybirdRequest request, - com.twitter.search.queryparser.query.Query finalSerializedQuery, - org.apache.lucene.search.Query luceneQuery, - EarlybirdResponse response) { - this.request = request; - this.luceneQuery = luceneQuery; - this.response = response; - this.finalSerializedQuery = finalSerializedQuery; - } - - public String getFinalSerializedQuery() { - return finalSerializedQuery != null ? finalSerializedQuery.serialize() : "N/A"; - } - - public EarlybirdRequest getRequest() { - return request; - } - - public EarlybirdResponse getResponse() { - return response; - } - - public Query getLuceneQuery() { - return luceneQuery; - } -} diff --git a/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.docx b/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.docx new file mode 100644 index 000000000..c1defa0d7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.java b/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.java deleted file mode 100644 index f0345d6a2..000000000 --- a/src/java/com/twitter/search/earlybird/common/UnknownClientRequestForLogging.java +++ /dev/null @@ -1,77 +0,0 @@ -package com.twitter.search.earlybird.common; - -import org.apache.commons.codec.binary.Base64; -import org.apache.thrift.TException; -import org.apache.thrift.TSerializer; -import org.apache.thrift.protocol.TBinaryProtocol; -import org.slf4j.Logger; - -import com.twitter.search.common.util.FinagleUtil; -import com.twitter.search.earlybird.thrift.EarlybirdRequest; - -/** - * This class logs all requests that misses either the finagle Id or the client Id. 
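A hedged sketch of how the two classes above fit together in a response-handling path; the wrapper class and the requestFailed flag are hypothetical, and only the constructor and accessors shown above are assumed. The JSON serialization is deferred to the async logger's background thread by logFailedRequest().

import com.twitter.search.earlybird.common.RequestResponseForLogging;
import com.twitter.search.earlybird.common.RequestResponsePair;

// Hypothetical post-processing hook; RequestResponsePair and RequestResponseForLogging
// are the classes shown above.
public final class FailedRequestLoggingExample {
  public static void logIfFailed(RequestResponsePair pair, boolean requestFailed) {
    if (requestFailed) {
      // Serialization to JSON happens inside the logged object's toString(), i.e. on the
      // async logging thread rather than on the search request path.
      new RequestResponseForLogging(pair.getRequest(), pair.getResponse()).logFailedRequest();
    }
  }
}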
- */ -public final class UnknownClientRequestForLogging { - private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger( - UnknownClientRequestForLogging.class); - private static final Logger LOG = org.slf4j.LoggerFactory.getLogger( - UnknownClientRequestForLogging.class.getName() + ".unknownClientRequests"); - - private final String logLine; - private final EarlybirdRequest request; - private final String clientId; - private final String finagleId; - - private final Base64 base64 = new Base64(); - private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory()); - - private UnknownClientRequestForLogging( - String logLine, - EarlybirdRequest request, - String clientId, - String finagleId) { - - this.logLine = logLine; - this.request = request; - this.clientId = clientId; - this.finagleId = finagleId; - } - - /** - * Returns an UnknownClientRequestForLogging instance if a client ID is not set on the given - * earlybird request. If the request has a client ID set, {@code null} is returned. - * - * @param logLine Additional information to propagate to the log file, when logging this request. - * @param request The earlybird request. - */ - public static UnknownClientRequestForLogging unknownClientRequest( - String logLine, EarlybirdRequest request) { - String clientId = ClientIdUtil.getClientIdFromRequest(request); - String finagleId = FinagleUtil.getFinagleClientName(); - - if (clientId.equals(ClientIdUtil.UNSET_CLIENT_ID)) { - return new UnknownClientRequestForLogging(logLine, request, clientId, finagleId); - } else { - return null; - } - } - - private String asBase64() { - try { - // Need to make a deepCopy() here, because the request may still be in use (e.g. if we are - // doing this in the pre-logger), and we should not be modifying crucial fields on the - // EarlybirdRequest in place. 
- EarlybirdRequest clearedRequest = request.deepCopy(); - clearedRequest.unsetClientRequestTimeMs(); - return base64.encodeToString(serializer.serialize(clearedRequest)); - } catch (TException e) { - GENERAL_LOG.error("Failed to serialize request for logging.", e); - return "failed_to_serialize"; - } - } - - public void log() { - LOG.info("{},{},{},{}", clientId, finagleId, logLine, asBase64()); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/config/BUILD b/src/java/com/twitter/search/earlybird/common/config/BUILD deleted file mode 100644 index 4d2634365..000000000 --- a/src/java/com/twitter/search/earlybird/common/config/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/org/apache/commons:commons-lang3", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "3rdparty/jvm/org/yaml:snakeyaml", - "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common_internal/text/version", - "src/java/com/twitter/search/common/aurora", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/util/zookeeper", - ], -) diff --git a/src/java/com/twitter/search/earlybird/common/config/BUILD.docx b/src/java/com/twitter/search/earlybird/common/config/BUILD.docx new file mode 100644 index 000000000..31f946db6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/config/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.docx b/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.docx new file mode 100644 index 000000000..4b1047451 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.java b/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.java deleted file mode 100644 index ed18aab08..000000000 --- a/src/java/com/twitter/search/earlybird/common/config/EarlybirdConfig.java +++ /dev/null @@ -1,363 +0,0 @@ -package com.twitter.search.earlybird.common.config; - -import java.util.Date; -import java.util.List; -import java.util.Map; -import javax.annotation.Nullable; - -import com.google.common.collect.ImmutableMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.aurora.AuroraInstanceKey; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.config.ConfigFile; -import com.twitter.search.common.config.ConfigurationException; -import com.twitter.search.common.config.SearchPenguinVersionsConfig; - -public final class EarlybirdConfig { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdConfig.class); - - private static final String DEFAULT_CONFIG_FILE = "earlybird-search.yml"; - private static final String LATE_TWEET_BUFFER_KEY = "late_tweet_buffer"; - - public static final String EARLYBIRD_ZK_CONFIG_DIR = "/twitter/search/production/earlybird/"; - public static final String EARLYBIRD_CONFIG_DIR = "earlybird/config"; - - public static final String USER_SNAPSHOT_BASE_DIR 
= "user_snapshot_base_dir"; - - private static volatile ConfigFile earlybirdConfig = null; - private static volatile Map overrideValueMap = ImmutableMap.of(); - - private static String logDirOverride = null; - private static AuroraInstanceKey auroraInstanceKey = null; - - private static int adminPort; - - private EarlybirdConfig() { } - - private static final class PenguinVersionHolder { - private static final PenguinVersion PENGUIN_VERSION_SINGLETON = - SearchPenguinVersionsConfig.getSingleSupportedVersion( - EarlybirdProperty.PENGUIN_VERSION.get()); - private static final byte PENGUIN_VERSION_BYTE_VALUE = - PENGUIN_VERSION_SINGLETON.getByteValue(); - } - - public static byte getPenguinVersionByte() { - return PenguinVersionHolder.PENGUIN_VERSION_BYTE_VALUE; - } - - public static PenguinVersion getPenguinVersion() { - return PenguinVersionHolder.PENGUIN_VERSION_SINGLETON; - } - - /** - * Reads the earlybird configuration from the given file. - */ - public static synchronized void init(@Nullable String configFile) { - if (earlybirdConfig == null) { - String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile; - earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, file); - } - } - - public static synchronized void setOverrideValues(Map overrideValues) { - overrideValueMap = ImmutableMap.copyOf(overrideValues); - } - - /** - * Pack all values in a string that can be printed for informational purposes. - * @return the string. - */ - public static String allValuesAsString() { - Map stringMap = earlybirdConfig.getStringMap(); - - StringBuilder stringBuilder = new StringBuilder(); - - stringBuilder.append("Config environment: " + Config.getEnvironment() + "\n\n"); - stringBuilder.append( - String.format("Values from earlybird-search.yml (total %d):\n", stringMap.size())); - - stringMap.forEach((key, value) -> { - stringBuilder.append(String.format(" %s: %s\n", key, value.toString())); - if (overrideValueMap.containsKey(key)) { - stringBuilder.append(String.format( - " override value: %s\n", overrideValueMap.get(key).toString())); - } - }); - - stringBuilder.append(String.format( - "\n\nAll command-line overrides (total: %d):\n", overrideValueMap.size())); - overrideValueMap.forEach((key, value) -> { - stringBuilder.append(String.format(" %s: %s\n", key, value.toString())); - }); - - return stringBuilder.toString(); - } - - /** - * Returns the value of the given property as a string. If the property is not set, a runtime - * exception is thrown. - */ - public static String getString(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (String) overrideValue; - } - - try { - return earlybirdConfig.getString(property); - } catch (ConfigurationException e) { - LOG.error("Fatal error: could not get config string " + property, e); - throw new RuntimeException(e); - } - } - - /** - * Returns the value of the given property as a string. - */ - public static String getString(String property, String defaultValue) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (String) overrideValue; - } - - return earlybirdConfig.getString(property, defaultValue); - } - - /** - * Returns the value of the given property as an integer. If the property is not set, a runtime - * exception is thrown. 
- */ - public static int getInt(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (int) overrideValue; - } - - try { - return earlybirdConfig.getInt(property); - } catch (ConfigurationException e) { - LOG.error("Fatal error: could not get config int " + property, e); - throw new RuntimeException(e); - } - } - - /** - * Returns the value of the given property as an integer. - */ - public static int getInt(String property, int defaultValue) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (int) overrideValue; - } - - return earlybirdConfig.getInt(property, defaultValue); - } - - /** - * Returns the value of the given property as a double. - */ - public static double getDouble(String property, double defaultValue) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (double) overrideValue; - } - - return earlybirdConfig.getDouble(property, defaultValue); - } - - /** - * Returns the value of the given property as a long. If the property is not set, a runtime - * exception is thrown. - */ - public static long getLong(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (long) overrideValue; - } - - try { - return earlybirdConfig.getLong(property); - } catch (ConfigurationException e) { - LOG.error("Fatal error: could not get config long " + property, e); - throw new RuntimeException(e); - } - } - - /** - * Returns the value of the given property as a long. - */ - public static long getLong(String property, long defaultValue) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (long) overrideValue; - } - - return earlybirdConfig.getLong(property, defaultValue); - } - - /** - * Returns the value of the given property as a boolean. If the property is not set, a runtime - * exception is thrown. - */ - public static boolean getBool(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (boolean) overrideValue; - } - - try { - return earlybirdConfig.getBool(property); - } catch (ConfigurationException e) { - LOG.error("Fatal error: could not get config boolean " + property, e); - throw new RuntimeException(e); - } - } - - /** - * Returns the value of the given property as a boolean. - */ - public static boolean getBool(String property, boolean defaultValue) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (boolean) overrideValue; - } - - return earlybirdConfig.getBool(property, defaultValue); - } - - /** - * Returns the value of the given property as a date. - */ - public static Date getDate(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (Date) overrideValue; - } - - Date date = (Date) earlybirdConfig.getObject(property, null); - if (date == null) { - throw new RuntimeException("Could not get config date: " + property); - } - return date; - } - - /** - * Returns the value of the given property as a list of strings. 
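All of the getters above follow the same precedence: a command-line override registered via setOverrideValues wins over the value from earlybird-search.yml, and the no-default variants turn a missing key into a RuntimeException. A small illustrative sequence, assuming a hypothetical bootstrap class; "max_segment_size" and "late_tweet_buffer" are keys this config class reads elsewhere.

import com.google.common.collect.ImmutableMap;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

// Hypothetical bootstrap snippet for illustration.
public final class ConfigOverrideExample {
  public static void main(String[] args) {
    EarlybirdConfig.init(null);  // loads the default earlybird-search.yml
    EarlybirdConfig.setOverrideValues(
        ImmutableMap.<String, Object>of("max_segment_size", 1 << 17));

    // The override wins over the yml value; the inline default is only used when
    // neither source defines the key.
    int maxSegmentSize = EarlybirdConfig.getInt("max_segment_size", 1 << 16);  // 131072

    // No-default getters surface a missing key as a RuntimeException wrapping the
    // underlying ConfigurationException.
    int lateTweetBuffer = EarlybirdConfig.getInt("late_tweet_buffer");

    System.out.println(maxSegmentSize + " " + lateTweetBuffer);
  }
}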
- */ - public static List getListOfStrings(String property) { - Object overrideValue = overrideValueMap.get(property); - if (overrideValue != null) { - return (List) overrideValue; - } - - List list = (List) earlybirdConfig.getObject(property, null); - if (list == null) { - throw new RuntimeException("Could not get list of strings: " + property); - } - return list; - } - - /** - * Returns the value of the given property as a map. - */ - @SuppressWarnings("unchecked") - public static Map getMap(String property) { - Map map = (Map) earlybirdConfig.getObject(property, null); - if (map == null) { - throw new RuntimeException("Could not find config property: " + property); - } - return map; - } - - public static int getMaxSegmentSize() { - return EarlybirdConfig.getInt("max_segment_size", 1 << 16); - } - - /** - * Returns the log properties file. - */ - public static String getLogPropertiesFile() { - try { - String filename = earlybirdConfig.getString("log_properties_filename"); - return earlybirdConfig.getConfigFilePath(filename); - } catch (ConfigurationException e) { - // Print here rather than use LOG - log was probably not initialized yet. - LOG.error("Fatal error: could not get log properties file", e); - throw new RuntimeException(e); - } - } - - /** - * Returns the log directory. - */ - public static String getLogDir() { - if (logDirOverride != null) { - return logDirOverride; - } else { - return EarlybirdConfig.getString("log_dir"); - } - } - - public static void overrideLogDir(String logDir) { - EarlybirdConfig.logDirOverride = logDir; - } - - public static int getThriftPort() { - return EarlybirdProperty.THRIFT_PORT.get(); - } - - public static int getWarmUpThriftPort() { - return EarlybirdProperty.WARMUP_THRIFT_PORT.get(); - } - - public static int getSearcherThreads() { - return EarlybirdProperty.SEARCHER_THREADS.get(); - } - - public static int getLateTweetBuffer() { - return getInt(LATE_TWEET_BUFFER_KEY); - } - - public static int getAdminPort() { - return adminPort; - } - - public static void setAdminPort(int adminPort) { - EarlybirdConfig.adminPort = adminPort; - } - - public static boolean isRealtimeOrProtected() { - String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get(); - return earlybirdName.contains("realtime") || earlybirdName.contains("protected"); - } - - public static boolean consumeUserScrubGeoEvents() { - return EarlybirdProperty.CONSUME_GEO_SCRUB_EVENTS.get(); - } - - @Nullable - public static AuroraInstanceKey getAuroraInstanceKey() { - return auroraInstanceKey; - } - - public static void setAuroraInstanceKey(AuroraInstanceKey auroraInstanceKey) { - EarlybirdConfig.auroraInstanceKey = auroraInstanceKey; - } - - public static boolean isAurora() { - return auroraInstanceKey != null; - } - - public static void setForTests(String property, Object value) { - earlybirdConfig.setForTests(DEFAULT_CONFIG_FILE, property, value); - } - - public static synchronized void clearForTests() { - earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, DEFAULT_CONFIG_FILE); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.docx b/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.docx new file mode 100644 index 000000000..b384dc5fc Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.java b/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.java deleted file mode 
100644 index f8534bce5..000000000 --- a/src/java/com/twitter/search/earlybird/common/config/EarlybirdProperty.java +++ /dev/null @@ -1,390 +0,0 @@ -package com.twitter.search.earlybird.common.config; - -import java.lang.reflect.Modifier; -import java.util.Arrays; -import java.util.List; -import java.util.function.BiFunction; -import java.util.function.Function; -import java.util.stream.Collectors; - -import com.google.common.collect.ImmutableList; - -import com.twitter.app.Flag; -import com.twitter.app.Flaggable; -import com.twitter.app.Flags; -import com.twitter.finagle.mtls.authentication.ServiceIdentifier; - -/** - * Stateless class that represents an Earlybird property that can be specified by a command line - * flag. - *

- * This is a regular Java class instead of enum to have a generic type. - * - * @param - */ -public final class EarlybirdProperty { - - private static final class PropertyType { - - private static final PropertyType BOOLEAN = new PropertyType<>( - Flaggable.ofJavaBoolean(), EarlybirdConfig::getBool, EarlybirdConfig::getBool); - - private static final PropertyType INT = new PropertyType<>( - Flaggable.ofJavaInteger(), EarlybirdConfig::getInt, EarlybirdConfig::getInt); - - private static final PropertyType STRING = new PropertyType<>( - Flaggable.ofString(), EarlybirdConfig::getString, EarlybirdConfig::getString); - - private final Flaggable flaggable; - private final Function getter; - private final BiFunction getterWithDefault; - - private PropertyType(Flaggable flaggable, Function getter, - BiFunction getterWithDefault) { - this.flaggable = flaggable; - this.getter = getter; - this.getterWithDefault = getterWithDefault; - } - } - - public static final EarlybirdProperty PENGUIN_VERSION = - new EarlybirdProperty<>( - "penguin_version", - "The penguin version to index.", - PropertyType.STRING, - false); - - public static final EarlybirdProperty THRIFT_PORT = new EarlybirdProperty<>( - "thrift_port", - "override thrift port from config file", - PropertyType.INT, - false); - - public static final EarlybirdProperty WARMUP_THRIFT_PORT = new EarlybirdProperty<>( - "warmup_thrift_port", - "override warmup thrift port from config file", - PropertyType.INT, - false); - - public static final EarlybirdProperty SEARCHER_THREADS = new EarlybirdProperty<>( - "searcher_threads", - "override number of searcher threads from config file", - PropertyType.INT, - false); - - public static final EarlybirdProperty EARLYBIRD_TIER = new EarlybirdProperty<>( - "earlybird_tier", - "the earlybird tier (e.g. 
tier1), used on Aurora", - PropertyType.STRING, - true); - - public static final EarlybirdProperty REPLICA_ID = new EarlybirdProperty<>( - "replica_id", - "the ID in a partition, used on Aurora", - PropertyType.INT, - true); - - public static final EarlybirdProperty PARTITION_ID = new EarlybirdProperty<>( - "partition_id", - "partition ID, used on Aurora", - PropertyType.INT, - true); - - public static final EarlybirdProperty NUM_PARTITIONS = new EarlybirdProperty<>( - "num_partitions", - "number of partitions, used on Aurora", - PropertyType.INT, - true); - - public static final EarlybirdProperty NUM_INSTANCES = new EarlybirdProperty<>( - "num_instances", - "number of instances in the job, used on Aurora", - PropertyType.INT, - true); - - public static final EarlybirdProperty SERVING_TIMESLICES = new EarlybirdProperty<>( - "serving_timeslices", - "number of time slices to serve, used on Aurora", - PropertyType.INT, - true); - - public static final EarlybirdProperty ROLE = new EarlybirdProperty<>( - "role", - "Role in the service path of Earlybird", - PropertyType.STRING, - true, - true); - - public static final EarlybirdProperty EARLYBIRD_NAME = new EarlybirdProperty<>( - "earlybird_name", - "Name in the service path of Earlybird without hash partition suffix", - PropertyType.STRING, - true, - true); - - public static final EarlybirdProperty ENV = new EarlybirdProperty<>( - "env", - "Environment in the service path of Earlybird", - PropertyType.STRING, - true, - true); - - public static final EarlybirdProperty ZONE = new EarlybirdProperty<>( - "zone", - "Zone (data center) in the service path of Earlybird", - PropertyType.STRING, - true, - true); - - public static final EarlybirdProperty DL_URI = new EarlybirdProperty<>( - "dl_uri", - "DistributedLog URI for default DL reader", - PropertyType.STRING, - false); - - public static final EarlybirdProperty USER_UPDATES_DL_URI = new EarlybirdProperty<>( - "user_updates_dl_uri", - "DistributedLog URI for user updates DL reader", - PropertyType.STRING, - false); - - public static final EarlybirdProperty ANTISOCIAL_USERUPDATES_DL_STREAM = - new EarlybirdProperty<>( - "antisocial_userupdates_dl_stream", - "DL stream name for antisocial user updates without DL version suffix", - PropertyType.STRING, - false); - - public static final EarlybirdProperty ZK_APP_ROOT = new EarlybirdProperty<>( - "zk_app_root", - "SZooKeeper base root path for this application", - PropertyType.STRING, - true); - - public static final EarlybirdProperty SEGMENT_LOAD_FROM_HDFS_ENABLED = - new EarlybirdProperty<>( - "segment_load_from_hdfs_enabled", - "Whether to load segment data from HDFS", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty SEGMENT_FLUSH_TO_HDFS_ENABLED = - new EarlybirdProperty<>( - "segment_flush_to_hdfs_enabled", - "Whether to flush segment data to HDFS", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty HDFS_SEGMENT_SYNC_DIR = new EarlybirdProperty<>( - "hdfs_segment_sync_dir", - "HDFS directory to sync segment data", - PropertyType.STRING, - false); - - public static final EarlybirdProperty HDFS_SEGMENT_UPLOAD_DIR = new EarlybirdProperty<>( - "hdfs_segment_upload_dir", - "HDFS directory to upload segment data", - PropertyType.STRING, - false); - - public static final EarlybirdProperty ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED = - new EarlybirdProperty<>( - "archive_daily_status_batch_flushing_enabled", - "Whether to enable archive daily status batch flushing", - PropertyType.BOOLEAN, - false); - - 
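Each property above binds a flag name to a typed getter backed by EarlybirdConfig; the get() and get(default) accessors are defined further down in this class. A hedged sketch of reading a few of the Aurora-related properties at startup; the wrapper class, method, and fallback string are invented.

import com.twitter.search.earlybird.common.config.EarlybirdProperty;

// Hypothetical startup snippet; the properties referenced are declared in this class.
public final class PartitionInfoExample {
  public static String describePartition() {
    int partitionId = EarlybirdProperty.PARTITION_ID.get();         // required on Aurora
    int numPartitions = EarlybirdProperty.NUM_PARTITIONS.get();
    String tier = EarlybirdProperty.EARLYBIRD_TIER.get("unset");    // falls back when absent
    return "partition " + partitionId + "/" + numPartitions + " in tier " + tier;
  }
}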
public static final EarlybirdProperty HDFS_INDEX_SYNC_DIR = new EarlybirdProperty<>( - "hdfs_index_sync_dir", - "HDFS directory to sync index data", - PropertyType.STRING, - true); - - public static final EarlybirdProperty READ_INDEX_FROM_PROD_LOCATION = - new EarlybirdProperty<>( - "read_index_from_prod_location", - "Read index from prod to speed up startup on staging / loadtest", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty USE_DECIDER_OVERLAY = new EarlybirdProperty<>( - "use_decider_overlay", - "Whether to use decider overlay", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty DECIDER_OVERLAY_CONFIG = new EarlybirdProperty<>( - "decider_overlay_config", - "Path to decider overlay config", - PropertyType.STRING, - false); - - public static final EarlybirdProperty MAX_CONCURRENT_SEGMENT_INDEXERS = - new EarlybirdProperty<>( - "max_concurrent_segment_indexers", - "Maximum number of segments indexed concurrently", - PropertyType.INT, - false); - - public static final EarlybirdProperty TF_MODELS_ENABLED = - new EarlybirdProperty<>( - "tf_models_enabled", - "Whether tensorflow models should be loaded", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty TF_MODELS_CONFIG_PATH = - new EarlybirdProperty<>( - "tf_models_config_path", - "The configuration path of the yaml file containing the list of tensorflow models to load.", - PropertyType.STRING, - false); - - public static final EarlybirdProperty TF_INTER_OP_THREADS = - new EarlybirdProperty<>( - "tf_inter_op_threads", - "How many tensorflow inter op threads to use. See TF documentation for more information.", - PropertyType.INT, - false); - - public static final EarlybirdProperty TF_INTRA_OP_THREADS = - new EarlybirdProperty<>( - "tf_intra_op_threads", - "How many tensorflow intra op threads to use. See TF documentation for more information.", - PropertyType.INT, - false); - - public static final EarlybirdProperty MAX_ALLOWED_REPLICAS_NOT_IN_SERVER_SET = - new EarlybirdProperty<>( - "max_allowed_replicas_not_in_server_set", - "How many replicas are allowed to be missing from the Earlybird server set.", - PropertyType.INT, - false); - - public static final EarlybirdProperty CHECK_NUM_REPLICAS_IN_SERVER_SET = - new EarlybirdProperty<>( - "check_num_replicas_in_server_set", - "Whether CoordinatedEarlybirdActions should check the number of alive replicas", - PropertyType.BOOLEAN, - false); - - public static final EarlybirdProperty MAX_QUEUE_SIZE = - new EarlybirdProperty<>( - "max_queue_size", - "Maximum size of searcher worker executor queue. 
If <= 0 queue is unbounded.", - PropertyType.INT, - false); - - public static final EarlybirdProperty KAFKA_ENV = - new EarlybirdProperty<>( - "kafka_env", - "The environment to use for kafka topics.", - PropertyType.STRING, - false); - public static final EarlybirdProperty KAFKA_PATH = - new EarlybirdProperty<>( - "kafka_path", - "Wily path to the Search kafka cluster.", - PropertyType.STRING, - false); - public static final EarlybirdProperty TWEET_EVENTS_KAFKA_PATH = - new EarlybirdProperty<>( - "tweet_events_kafka_path", - "Wily path to the tweet-events kafka cluster.", - PropertyType.STRING, - false); - public static final EarlybirdProperty USER_UPDATES_KAFKA_TOPIC = - new EarlybirdProperty<>( - "user_updates_topic", - "Name of the Kafka topic that contain user updates.", - PropertyType.STRING, - false); - public static final EarlybirdProperty USER_SCRUB_GEO_KAFKA_TOPIC = - new EarlybirdProperty<>( - "user_scrub_geo_topic", - "Name of the Kafka topic that contain UserScrubGeoEvents.", - PropertyType.STRING, - false); - public static final EarlybirdProperty EARLYBIRD_SCRUB_GEN = - new EarlybirdProperty<>( - "earlybird_scrub_gen", - "SCRUB_GEN TO DEPLOY", - PropertyType.STRING, - false); - public static final EarlybirdProperty CONSUME_GEO_SCRUB_EVENTS = - new EarlybirdProperty<>( - "consume_geo_scrub_events", - "Whether to consume user scrub geo events or not", - PropertyType.BOOLEAN, - false); - - private static final List> ALL_PROPERTIES = - Arrays.stream(EarlybirdProperty.class.getDeclaredFields()) - .filter(field -> - (field.getModifiers() & Modifier.STATIC) > 0 - && field.getType() == EarlybirdProperty.class) - .map(field -> { - try { - return (EarlybirdProperty) field.get(EarlybirdProperty.class); - } catch (Exception e) { - throw new RuntimeException(e); - } - }) - .collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf)); - - public static ServiceIdentifier getServiceIdentifier() { - return new ServiceIdentifier( - ROLE.get(), - EARLYBIRD_NAME.get(), - ENV.get(), - ZONE.get()); - } - - private final String name; - private final String help; - private final PropertyType type; - private final boolean requiredOnAurora; - private final boolean requiredOnDedicated; - - private EarlybirdProperty(String name, String help, PropertyType type, - boolean requiredOnAurora) { - this(name, help, type, requiredOnAurora, false); - } - - private EarlybirdProperty(String name, String help, PropertyType type, - boolean requiredOnAurora, boolean requiredOnDedicated) { - this.name = name; - this.help = help; - this.type = type; - this.requiredOnAurora = requiredOnAurora; - this.requiredOnDedicated = requiredOnDedicated; - } - - public String name() { - return name; - } - - public boolean isRequiredOnAurora() { - return requiredOnAurora; - } - - public boolean isRequiredOnDedicated() { - return requiredOnDedicated; - } - - public Flag createFlag(Flags flags) { - return flags.createMandatory(name, help, null, type.flaggable); - } - - public T get() { - return type.getter.apply(name); - } - - public T get(T devaultValue) { - return type.getterWithDefault.apply(name, devaultValue); - } - - public static EarlybirdProperty[] values() { - return ALL_PROPERTIES.toArray(new EarlybirdProperty[0]); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/BUILD b/src/java/com/twitter/search/earlybird/common/userupdates/BUILD deleted file mode 100644 index 27a3c8c8f..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/BUILD +++ /dev/null @@ -1,45 
+0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/commons-io", - "3rdparty/jvm/geo/google:geoGoogle", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server", - "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider", - "3rdparty/jvm/org/apache/hadoop:hadoop-client-default", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-core", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "3rdparty/src/jvm/com/twitter/scalding:core", - "3rdparty/src/jvm/com/twitter/scalding:date", - "3rdparty/src/jvm/com/twitter/scalding:parquet", - "decider/src/main/scala", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/common_internal/hadoop", - "src/java/com/twitter/search/common/logging", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/common/util/hash", - "src/java/com/twitter/search/common/util/io", - "src/java/com/twitter/search/common/util/io:dl-reader-writer", - "src/java/com/twitter/search/common/util/io:flushable", - "src/java/com/twitter/search/common/util/io:record-reader-api", - "src/java/com/twitter/search/earlybird/common/config", - "src/scala/com/twitter/scalding_internal/error_handling", - "src/scala/com/twitter/scalding_internal/multiformat", - "src/scala/com/twitter/scalding_internal/source", - "src/scala/com/twitter/search/user_table/sources", - "src/thrift/com/twitter/search/common:indexing-java", - "src/thrift/com/twitter/tweetypie:events-java", - "util/util-core:scala", - ], -) diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/BUILD.docx b/src/java/com/twitter/search/earlybird/common/userupdates/BUILD.docx new file mode 100644 index 000000000..b7108e8d2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.docx b/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.docx new file mode 100644 index 000000000..dc010d477 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.java b/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.java deleted file mode 100644 index c0c6c3be7..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/UserScrubGeoMap.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.search.earlybird.common.userupdates; - -import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.TimeUnit; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent; - -/** - * Map of users who have actioned to delete 
location data from their tweets. UserID's are mapped - * to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId). - * - * ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast - * while writes are done with a lock. This is ideal since many Earlybird Searcher threads could - * be reading from the map at once, whereas we will only be adding to the map via kafka. - * - * This map is checked against to filter out tweets that should not be returned to geo queries. - * See: go/realtime-geo-filtering - */ -public class UserScrubGeoMap { - // The number of geo events that contain a user ID already present in the map. This count is used - // to verify the number of users in the map against the number of events consumed from kafka. - private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT = - SearchCounter.export("user_scrub_geo_event_existing_user_count"); - public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT = - SearchTimerStats.export("user_scrub_geo_event_lag", - TimeUnit.MILLISECONDS, - false, - true); - private ConcurrentHashMap map; - - public UserScrubGeoMap() { - map = new ConcurrentHashMap<>(); - SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap); - } - - /** - * Ensure that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored - * in the map for the given user id (if any) before updating the entry for this user. - * This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents - * appear in the incoming Kafka stream. - * - * @param userScrubGeoEvent - */ - public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) { - long userId = userScrubGeoEvent.getUser_id(); - long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id(); - long oldMaxTweetId = map.getOrDefault(userId, 0L); - if (map.containsKey(userId)) { - USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment(); - } - map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId)); - USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId)); - } - - /** - * A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's - * author. - * If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed. - * - * @param tweetId - * @param fromUserId - * @return - */ - public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) { - return tweetId <= map.getOrDefault(fromUserId, 0L); - } - - /** - * The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the - * UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp. - * The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests. 
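A short usage sketch of the map described above, with invented IDs: indexing an event records the largest max_tweet_id seen for that user, and isTweetGeoScrubbed then filters any of the user's tweets at or below that ID. The thrift setters setUser_id/setMax_tweet_id are assumed to match the getters used in this class.

import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap;
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;

// Hypothetical demo; IDs are invented.
public final class GeoScrubExample {
  public static void demo() {
    UserScrubGeoMap geoMap = new UserScrubGeoMap();

    UserScrubGeoEvent event = new UserScrubGeoEvent();
    event.setUser_id(12345L);
    // A snowflake-era tweet id, so the lag stat can derive a creation timestamp from it.
    event.setMax_tweet_id(1_500_000_000_000_000_000L);
    geoMap.indexUserScrubGeoEvent(event);

    geoMap.isTweetGeoScrubbed(1_499_999_999_999_999_999L, 12345L);  // true: at or below the max
    geoMap.isTweetGeoScrubbed(1_500_000_000_000_000_001L, 12345L);  // false: newer than the max
    geoMap.isTweetGeoScrubbed(1_499_999_999_999_999_999L, 67890L);  // false: author has no entry
  }
}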
- * - * @param maxTweetId - * @return - */ - private long computeEventLag(long maxTweetId) { - long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId); - return System.currentTimeMillis() - eventCreatedAtTime; - } - - public long getNumUsersInMap() { - return map.size(); - } - - public ConcurrentHashMap getMap() { - return map; - } - - public boolean isEmpty() { - return map.isEmpty(); - } - - public boolean isSet(long userId) { - return map.containsKey(userId); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.docx b/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.docx new file mode 100644 index 000000000..8b68b1bab Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.java b/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.java deleted file mode 100644 index 3df08a5df..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/UserTable.java +++ /dev/null @@ -1,572 +0,0 @@ -package com.twitter.search.earlybird.common.userupdates; - -import java.util.Iterator; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.Predicate; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.util.hash.GeneralLongHashFunction; - -/** - * Table containing metadata about users, like NSFW or Antisocial status. - * Used for result filtering. - */ -public class UserTable { - private static final Logger LOG = LoggerFactory.getLogger(UserTable.class); - - @VisibleForTesting // Not final for testing. - protected static long userUpdateTableMaxCapacity = 1L << 30; - - private static final int DEFAULT_INITIAL_CAPACITY = 1024; - private static final int BYTE_WIDTH = 8; - - private static final String USER_TABLE_CAPACITY = "user_table_capacity"; - private static final String USER_TABLE_SIZE = "user_table_size"; - private static final String - USER_NUM_USERS_WITH_NO_BITS_SET = "user_table_users_with_no_bits_set"; - private static final String USER_TABLE_ANTISOCIAL_USERS = "user_table_antisocial_users"; - private static final String USER_TABLE_OFFENSIVE_USERS = "user_table_offensive_users"; - private static final String USER_TABLE_NSFW_USERS = "user_table_nsfw_users"; - private static final String USER_TABLE_IS_PROTECTED_USERS = "user_table_is_protected_users"; - - /** - * number of users filtered - */ - private static final SearchRateCounter USER_TABLE_USERS_FILTERED_COUNTER = - new SearchRateCounter("user_table_users_filtered"); - - private SearchLongGauge userTableCapacity; - private SearchLongGauge userTableSize; - private SearchLongGauge userTableNumUsersWithNoBitsSet; - private SearchLongGauge userTableAntisocialUsers; - private SearchLongGauge userTableOffensiveUsers; - private SearchLongGauge userTableNsfwUsers; - private SearchLongGauge userTableIsProtectedUsers; - - private final Predicate userIdFilter; - private long lastRecordTimestamp; - - private static final class HashTable { - private int numUsersInTable; - private int numUsersWithNoBitsSet; - // size 8 array contains the number of users who have the bit set at the index (0-7) position - // e.g. 
setBitCounts[0] stores the number of users who have the 0 bit set in their bytes - private long[] setBitCounts; - - private final long[] hash; - private final byte[] bits; - - private final int hashMask; - - HashTable(int size) { - this.hash = new long[size]; - this.bits = new byte[size]; - this.hashMask = size - 1; - this.numUsersInTable = 0; - this.setBitCounts = new long[BYTE_WIDTH]; - } - - protected int hashSize() { - return hash.length; - } - - // If we want to decrease the number of users in the table, we can delete as many users - // as this table returns, by calling filterTableAndCountValidItems. - public void setCountOfNumUsersWithNoBitsSet() { - int count = 0; - for (int i = 0; i < hash.length; i++) { - if ((hash[i] > 0) && (bits[i] == 0)) { - count++; - } - } - - numUsersWithNoBitsSet = count; - } - - public void setSetBitCounts() { - long[] counts = new long[BYTE_WIDTH]; - for (int i = 0; i < hash.length; i++) { - if (hash[i] > 0) { - int tempBits = bits[i] & 0xff; - int curBitPos = 0; - while (tempBits != 0) { - if ((tempBits & 1) != 0) { - counts[curBitPos]++; - } - tempBits = tempBits >>> 1; - curBitPos++; - } - } - } - setBitCounts = counts; - } - } - - public static final int ANTISOCIAL_BIT = 1; - public static final int OFFENSIVE_BIT = 1 << 1; - public static final int NSFW_BIT = 1 << 2; - public static final int IS_PROTECTED_BIT = 1 << 3; - - public long getLastRecordTimestamp() { - return this.lastRecordTimestamp; - } - - public void setLastRecordTimestamp(long lastRecordTimestamp) { - this.lastRecordTimestamp = lastRecordTimestamp; - } - - public void setOffensive(long userID, boolean offensive) { - set(userID, OFFENSIVE_BIT, offensive); - } - - public void setAntisocial(long userID, boolean antisocial) { - set(userID, ANTISOCIAL_BIT, antisocial); - } - - public void setNSFW(long userID, boolean nsfw) { - set(userID, NSFW_BIT, nsfw); - } - - public void setIsProtected(long userID, boolean isProtected) { - set(userID, IS_PROTECTED_BIT, isProtected); - } - - /** - * Adds the given user update to this table. - */ - public boolean indexUserUpdate(UserUpdatesChecker checker, UserUpdate userUpdate) { - if (checker.skipUserUpdate(userUpdate)) { - return false; - } - - switch (userUpdate.updateType) { - case ANTISOCIAL: - setAntisocial(userUpdate.twitterUserID, userUpdate.updateValue != 0); - break; - case NSFW: - setNSFW(userUpdate.twitterUserID, userUpdate.updateValue != 0); - break; - case OFFENSIVE: - setOffensive(userUpdate.twitterUserID, userUpdate.updateValue != 0); - break; - case PROTECTED: - setIsProtected(userUpdate.twitterUserID, userUpdate.updateValue != 0); - break; - default: - return false; - } - - return true; - } - - private final AtomicReference hashTable = new AtomicReference<>(); - - private int hashCode(long userID) { - return (int) GeneralLongHashFunction.hash(userID); - } - - /** - * Returns an iterator for user IDs that have at least one of the bits set. 
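A usage sketch of the per-user bit flags above, with invented user IDs: each state is one bit in the table's byte per user, set through the typed setters and queried with isSet, which accepts an OR of bits and matches if any of them is set.

import com.twitter.search.earlybird.common.userupdates.UserTable;

// Hypothetical demo; user IDs are invented.
public final class UserFlagExample {
  public static void demo() {
    UserTable userTable = UserTable.newTableNonFilteredWithDefaultCapacity();

    userTable.setNSFW(42L, true);
    userTable.setAntisocial(43L, true);

    boolean a = userTable.isSet(42L, UserTable.NSFW_BIT);                            // true
    boolean b = userTable.isSet(42L, UserTable.ANTISOCIAL_BIT);                      // false
    boolean c = userTable.isSet(43L, UserTable.ANTISOCIAL_BIT | UserTable.NSFW_BIT); // true
    System.out.println(a + " " + b + " " + c);
  }
}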
- */ - public Iterator getFlaggedUserIdIterator() { - HashTable table = hashTable.get(); - - final long[] currUserIdTable = table.hash; - final byte[] currBitsTable = table.bits; - return new Iterator() { - private int index = findNext(0); - - private int findNext(int index) { - int startingIndex = index; - while (startingIndex < currUserIdTable.length) { - if (currUserIdTable[startingIndex] != 0 && currBitsTable[startingIndex] != 0) { - break; - } - ++startingIndex; - } - return startingIndex; - } - - @Override - public boolean hasNext() { - return index < currUserIdTable.length; - } - - @Override - public Long next() { - Long r = currUserIdTable[index]; - index = findNext(index + 1); - return r; - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - }; - } - - /** - * Constructs an UserUpdatesTable with an given HashTable instance. - * Use useIdFilter as a Predicate that returns true for the elements - * needed to be kept in the table. - * Use shouldRehash to force a rehasing on the given HashTable. - */ - private UserTable(HashTable hashTable, Predicate userIdFilter, - boolean shouldRehash) { - - Preconditions.checkNotNull(userIdFilter); - - this.hashTable.set(hashTable); - this.userIdFilter = userIdFilter; - - exportUserUpdatesTableStats(); - - LOG.info("User table num users: {}. Users with no bits set: {}. " - + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.", - this.getNumUsersInTable(), - this.getNumUsersWithNoBitsSet(), - this.getSetBitCount(ANTISOCIAL_BIT), - this.getSetBitCount(OFFENSIVE_BIT), - this.getSetBitCount(NSFW_BIT), - this.getSetBitCount(IS_PROTECTED_BIT)); - - if (shouldRehash) { - int filteredTableSize = filterTableAndCountValidItems(); - // Having exactly 100% usage can impact lookup. Maintain the table at under 50% usage. - int newTableCapacity = computeDesiredHashTableCapacity(filteredTableSize * 2); - - rehash(newTableCapacity); - - LOG.info("User table num users after rehash: {}. Users with no bits set: {}. " - + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. 
IsProtected users: {}.", - this.getNumUsersInTable(), - this.getNumUsersWithNoBitsSet(), - this.getSetBitCount(ANTISOCIAL_BIT), - this.getSetBitCount(OFFENSIVE_BIT), - this.getSetBitCount(NSFW_BIT), - this.getSetBitCount(IS_PROTECTED_BIT)); - } - } - - private UserTable(int initialSize, Predicate userIdFilter) { - this(new HashTable(computeDesiredHashTableCapacity(initialSize)), userIdFilter, false); - } - - @VisibleForTesting - public UserTable(int initialSize) { - this(initialSize, userId -> true); - } - - public static UserTable - newTableWithDefaultCapacityAndPredicate(Predicate userIdFilter) { - - return new UserTable(DEFAULT_INITIAL_CAPACITY, userIdFilter); - } - - public static UserTable newTableNonFilteredWithDefaultCapacity() { - return newTableWithDefaultCapacityAndPredicate(userId -> true); - } - - private void exportUserUpdatesTableStats() { - userTableSize = SearchLongGauge.export(USER_TABLE_SIZE); - userTableCapacity = SearchLongGauge.export(USER_TABLE_CAPACITY); - userTableNumUsersWithNoBitsSet = SearchLongGauge.export( - USER_NUM_USERS_WITH_NO_BITS_SET - ); - userTableAntisocialUsers = SearchLongGauge.export(USER_TABLE_ANTISOCIAL_USERS); - userTableOffensiveUsers = SearchLongGauge.export(USER_TABLE_OFFENSIVE_USERS); - userTableNsfwUsers = SearchLongGauge.export(USER_TABLE_NSFW_USERS); - userTableIsProtectedUsers = SearchLongGauge.export(USER_TABLE_IS_PROTECTED_USERS); - - LOG.info( - "Exporting stats for user table. Starting with numUsersInTable={}, usersWithZeroBits={}, " - + "antisocialUsers={}, offensiveUsers={}, nsfwUsers={}, isProtectedUsers={}.", - getNumUsersInTable(), - getNumUsersWithNoBitsSet(), - getSetBitCount(ANTISOCIAL_BIT), - getSetBitCount(OFFENSIVE_BIT), - getSetBitCount(NSFW_BIT), - getSetBitCount(IS_PROTECTED_BIT)); - updateStats(); - } - - private void updateStats() { - HashTable table = this.hashTable.get(); - userTableSize.set(table.numUsersInTable); - userTableNumUsersWithNoBitsSet.set(table.numUsersWithNoBitsSet); - userTableCapacity.set(table.hashSize()); - userTableAntisocialUsers.set(getSetBitCount(ANTISOCIAL_BIT)); - userTableOffensiveUsers.set(getSetBitCount(OFFENSIVE_BIT)); - userTableNsfwUsers.set(getSetBitCount(NSFW_BIT)); - userTableIsProtectedUsers.set(getSetBitCount(IS_PROTECTED_BIT)); - } - - /** - * Computes the size of the hashtable as the first power of two greater than or equal to initialSize - */ - private static int computeDesiredHashTableCapacity(int initialSize) { - long powerOfTwoSize = 2; - while (initialSize > powerOfTwoSize) { - powerOfTwoSize *= 2; - } - if (powerOfTwoSize > Integer.MAX_VALUE) { - LOG.error("Error: powerOfTwoSize overflowed Integer.MAX_VALUE! Initial size: " + initialSize); - powerOfTwoSize = 1 << 30; // max power of 2 - } - - return (int) powerOfTwoSize; - } - - public int getNumUsersInTable() { - return hashTable.get().numUsersInTable; - } - - /** - * Get the number of users who have the bit set at the `userStateBit` position - */ - public long getSetBitCount(int userStateBit) { - int bit = userStateBit; - int bitPosition = 0; - while (bit != 0 && (bit & 1) == 0) { - bit = bit >>> 1; - bitPosition++; - } - return hashTable.get().setBitCounts[bitPosition]; - } - - public Predicate getUserIdFilter() { - return userIdFilter::test; - } - - /** - * Updates a user flag in this table. 
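A tiny worked example, with invented values, of the bitsDiff computation used in the set() method below to keep setBitCounts in sync: the & 0xff masks matter because a byte with its high bit set sign-extends when promoted to int.

import com.twitter.search.earlybird.common.userupdates.UserTable;

// Hypothetical illustration of the bit-diff bookkeeping; values are invented.
public final class BitsDiffExample {
  public static void demo() {
    byte bitsOriginally = (byte) 0b1000_0101;                   // bits 0, 2 and 7 set
    byte bits = (byte) (bitsOriginally | UserTable.NSFW_BIT);   // bit 2 requested, already set
    int bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);     // 0: nothing changed, no count bump

    bits = (byte) (bitsOriginally | UserTable.IS_PROTECTED_BIT); // bit 3 newly set
    bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);          // 0b1000: bump setBitCounts[3]
    System.out.println(bitsDiff);
  }
}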
- */ - public final void set(long userID, int bit, boolean value) { - // if userID is filtered return immediately - if (!shouldKeepUser(userID)) { - USER_TABLE_USERS_FILTERED_COUNTER.increment(); - return; - } - - HashTable table = this.hashTable.get(); - - int hashPos = findHashPosition(table, userID); - long item = table.hash[hashPos]; - byte bits = 0; - int bitsDiff = 0; - - if (item != 0) { - byte bitsOriginally = bits = table.bits[hashPos]; - if (value) { - bits |= bit; - } else { - // AND'ing with the inverse map clears the desired bit, but - // doesn't change any of the other bits - bits &= ~bit; - } - - // Find the changed bits after the above operation, it is possible that no bit is changed if - // the input 'bit' is already set/unset in the table. - // Since bitwise operators cannot be directly applied on Byte, Byte is promoted into int to - // apply the operators. When that happens, if the most significant bit of the Byte is set, - // the promoted int has all significant bits set to 1. 0xff bitmask is applied here to make - // sure only the last 8 bits are considered. - bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff); - - if (bitsOriginally > 0 && bits == 0) { - table.numUsersWithNoBitsSet++; - } else if (bitsOriginally == 0 && bits > 0) { - table.numUsersWithNoBitsSet--; - } - } else { - if (!value) { - // no need to add this user, since all bits would be false anyway - return; - } - - // New user string. - if (table.numUsersInTable + 1 >= (table.hashSize() >> 1) - && table.hashSize() != userUpdateTableMaxCapacity) { - if (2L * (long) table.hashSize() < userUpdateTableMaxCapacity) { - rehash(2 * table.hashSize()); - table = this.hashTable.get(); - } else { - if (table.hashSize() < (int) userUpdateTableMaxCapacity) { - rehash((int) userUpdateTableMaxCapacity); - table = this.hashTable.get(); - LOG.warn("User update table size reached Integer.MAX_VALUE, performance will degrade."); - } - } - - // Must repeat this operation with the resized hashTable. - hashPos = findHashPosition(table, userID); - } - - item = userID; - bits |= bit; - bitsDiff = bit & 0xff; - - table.numUsersInTable++; - } - - table.hash[hashPos] = item; - table.bits[hashPos] = bits; - - // update setBitCounts for the changed bits after applying the input 'bit' - int curBitsDiffPos = 0; - while (bitsDiff != 0) { - if ((bitsDiff & 1) != 0) { - if (value) { - table.setBitCounts[curBitsDiffPos]++; - } else { - table.setBitCounts[curBitsDiffPos]--; - } - } - bitsDiff = bitsDiff >>> 1; - curBitsDiffPos++; - } - - updateStats(); - } - - public final boolean isSet(long userID, int bits) { - HashTable table = hashTable.get(); - int hashPos = findHashPosition(table, userID); - return table.hash[hashPos] != 0 && (table.bits[hashPos] & bits) != 0; - } - - /** - * Returns true when userIdFilter condition is being met. - * If filter is not present returns true - */ - private boolean shouldKeepUser(long userID) { - return userIdFilter.test(userID); - } - - private int findHashPosition(final HashTable table, final long userID) { - int code = hashCode(userID); - int hashPos = code & table.hashMask; - - // Locate user in hash - long item = table.hash[hashPos]; - - if (item != 0 && item != userID) { - // Conflict: keep searching different locations in - // the hash table. 
- final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & table.hashMask; - item = table.hash[hashPos]; - } while (item != 0 && item != userID); - } - - return hashPos; - } - - /** - * Applies the filtering predicate and returns the size of the filtered table. - */ - private synchronized int filterTableAndCountValidItems() { - final HashTable oldTable = this.hashTable.get(); - int newSize = 0; - - int clearNoItemSet = 0; - int clearNoBitsSet = 0; - int clearDontKeepUser = 0; - - for (int i = 0; i < oldTable.hashSize(); i++) { - final long item = oldTable.hash[i]; // this is the userID - final byte bits = oldTable.bits[i]; - - boolean clearSlot = false; - if (item == 0) { - clearSlot = true; - clearNoItemSet++; - } else if (bits == 0) { - clearSlot = true; - clearNoBitsSet++; - } else if (!shouldKeepUser(item)) { - clearSlot = true; - clearDontKeepUser++; - } - - if (clearSlot) { - oldTable.hash[i] = 0; - oldTable.bits[i] = 0; - } else { - newSize += 1; - } - } - - oldTable.setCountOfNumUsersWithNoBitsSet(); - oldTable.setSetBitCounts(); - - LOG.info("Done filtering table: clearNoItemSet={}, clearNoBitsSet={}, clearDontKeepUser={}", - clearNoItemSet, clearNoBitsSet, clearDontKeepUser); - - return newSize; - } - - /** - * Called when hash is too small (> 50% occupied) - */ - private void rehash(final int newSize) { - final HashTable oldTable = this.hashTable.get(); - final HashTable newTable = new HashTable(newSize); - - final int newMask = newTable.hashMask; - final long[] newHash = newTable.hash; - final byte[] newBits = newTable.bits; - - for (int i = 0; i < oldTable.hashSize(); i++) { - final long item = oldTable.hash[i]; - final byte bits = oldTable.bits[i]; - if (item != 0 && bits != 0) { - int code = hashCode(item); - - int hashPos = code & newMask; - assert hashPos >= 0; - if (newHash[hashPos] != 0) { - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & newMask; - } while (newHash[hashPos] != 0); - } - newHash[hashPos] = item; - newBits[hashPos] = bits; - newTable.numUsersInTable++; - } - } - - newTable.setCountOfNumUsersWithNoBitsSet(); - newTable.setSetBitCounts(); - this.hashTable.set(newTable); - - updateStats(); - } - - public void setTable(UserTable newTable) { - hashTable.set(newTable.hashTable.get()); - updateStats(); - } - - @VisibleForTesting - protected int getHashTableCapacity() { - return hashTable.get().hashSize(); - } - - @VisibleForTesting - protected int getNumUsersWithNoBitsSet() { - return hashTable.get().numUsersWithNoBitsSet; - } -} diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.docx b/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.docx new file mode 100644 index 000000000..7b6842e93 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.java b/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.java deleted file mode 100644 index 76b14de5a..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/UserTableBuilderFromSnapshot.java +++ /dev/null @@ -1,263 +0,0 @@ -package com.twitter.search.earlybird.common.userupdates; - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.util.Arrays; -import java.util.Iterator; -import java.util.List; -import 
java.util.NoSuchElementException; -import java.util.Optional; -import java.util.Spliterator; -import java.util.Spliterators; -import java.util.concurrent.TimeUnit; -import java.util.function.Predicate; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; -import javax.annotation.Nullable; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hdfs.HdfsConfiguration; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common_internal.hadoop.HdfsUtils; -import com.twitter.scalding.DateRange; -import com.twitter.scalding.Hours; -import com.twitter.scalding.RichDate; -import com.twitter.search.user_table.sources.MostRecentGoodSafetyUserStateSource; -import com.twitter.search.common.indexing.thriftjava.SafetyUserState; -import com.twitter.search.common.util.io.LzoThriftBlockFileReader; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.util.Duration; -import com.twitter.util.Time; - -/** - * Builds a user table from a user safety snapshot on HDFS. - */ -public class UserTableBuilderFromSnapshot { - private static final Logger LOG = LoggerFactory.getLogger(UserTableBuilderFromSnapshot.class); - - private static final int MAX_DAYS_TO_CHECK = 7; - public static final String DATA_DIR = "user_states"; - public static final String METADATA_DIR = "last_updated_ms"; - - private final String snapshotBaseDir; - - private String snapshotDataPath; - private String snapshotMetaDataPath; - private UserTable userTable; - - private long nsfwCount; - private long antisocialCount; - private long isProtectedCount; - - public UserTableBuilderFromSnapshot() { - snapshotBaseDir = - EarlybirdConfig.getString(EarlybirdConfig.USER_SNAPSHOT_BASE_DIR, null); - - LOG.info("Configured user snapshot directory: " + snapshotBaseDir); - } - - private static final class UserUpdate { - public final long userId; - @Nullable public final Boolean antisocial; - @Nullable public final Boolean nsfw; - @Nullable public final Boolean isProtected; - - private UserUpdate(long userId, - @Nullable Boolean antisocial, - @Nullable Boolean nsfw, - @Nullable Boolean isProtected) { - this.userId = userId; - this.antisocial = antisocial; - this.nsfw = nsfw; - this.isProtected = isProtected; - } - - public static UserUpdate fromUserState(SafetyUserState safetyUserState) { - long userId = safetyUserState.getUserID(); - @Nullable Boolean antisocial = null; - @Nullable Boolean nsfw = null; - @Nullable Boolean isProtected = null; - - if (safetyUserState.isIsAntisocial()) { - antisocial = true; - } - if (safetyUserState.isIsNsfw()) { - nsfw = true; - } - if (safetyUserState.isSetIsProtected() && safetyUserState.isIsProtected()) { - isProtected = true; - } - - return new UserUpdate(userId, antisocial, nsfw, isProtected); - } - } - - /** - * Builds a user table from an HDFS user snapshot. - * @return The table, or nothing if something went wrong. - */ - public Optional build(Predicate userFilter) { - userTable = UserTable.newTableWithDefaultCapacityAndPredicate(userFilter); - nsfwCount = 0; - antisocialCount = 0; - isProtectedCount = 0; - - if (snapshotBaseDir == null || snapshotBaseDir.isEmpty()) { - LOG.info("No snapshot directory. 
Can't build user table."); - return Optional.empty(); - } - - LOG.info("Starting to build user table."); - - Stream stream = null; - - try { - setSnapshotPath(); - - stream = getUserUpdates(); - stream.forEach(this::insertUser); - } catch (IOException e) { - LOG.error("IOException while building table: {}", e.getMessage(), e); - - return Optional.empty(); - } finally { - if (stream != null) { - stream.close(); - } - } - - LOG.info("Built user table with {} users, {} nsfw, {} antisocial and {} protected.", - userTable.getNumUsersInTable(), - nsfwCount, - antisocialCount, - isProtectedCount); - - try { - userTable.setLastRecordTimestamp(readTimestampOfLastSeenUpdateFromSnapshot()); - } catch (IOException e) { - LOG.error("IOException reading timestamp of last update: {}", e.getMessage(), e); - return Optional.empty(); - } - - LOG.info("Setting last record timestamp to {}.", userTable.getLastRecordTimestamp()); - - return Optional.of(userTable); - } - - private void setSnapshotPath() { - snapshotDataPath = - new MostRecentGoodSafetyUserStateSource( - snapshotBaseDir, - DATA_DIR, - METADATA_DIR, - DateRange.apply( - RichDate.now().$minus(Hours.apply(MAX_DAYS_TO_CHECK * 24)), - RichDate.now()) - ).partitionHdfsPaths(new HdfsConfiguration()) - ._1() - .head() - .replaceAll("\\*$", ""); - snapshotMetaDataPath = snapshotDataPath.replace(DATA_DIR, METADATA_DIR); - - LOG.info("Snapshot data path: {}", snapshotDataPath); - LOG.info("Snapshot metadata path: {}", snapshotMetaDataPath); - } - - private Stream getUserUpdates() throws IOException { - FileSystem fs = FileSystem.get(new Configuration()); - List lzoFiles = - Arrays.stream(fs.listStatus(new Path(snapshotDataPath), - path -> path.getName().startsWith("part-"))) - .map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath()) - .toString()) - .collect(Collectors.toList()); - - final LzoThriftBlockFileReader thriftReader = - new LzoThriftBlockFileReader<>(lzoFiles, SafetyUserState.class, null); - - Iterator iter = new Iterator() { - private SafetyUserState next; - - @Override - public boolean hasNext() { - if (next != null) { - return true; - } - - do { - try { - next = thriftReader.readNext(); - } catch (IOException e) { - throw new RuntimeException(e); - } - } while (next == null && !thriftReader.isExhausted()); - return next != null; - } - - @Override - public UserUpdate next() { - if (next != null || hasNext()) { - UserUpdate userUpdate = UserUpdate.fromUserState(next); - next = null; - return userUpdate; - } - throw new NoSuchElementException(); - } - }; - - return StreamSupport - .stream( - Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL), - false) - .onClose(thriftReader::stop); - } - - private long readTimestampOfLastSeenUpdateFromSnapshot() throws IOException { - String timestampFile = snapshotMetaDataPath + "part-00000"; - BufferedReader buffer = new BufferedReader(new InputStreamReader( - HdfsUtils.getInputStreamSupplier(timestampFile).openStream())); - - long timestampMillis = Long.parseLong(buffer.readLine()); - LOG.info("read timestamp {} from HDFS:{}", timestampMillis, timestampFile); - - Time time = Time.fromMilliseconds(timestampMillis) - .minus(Duration.fromTimeUnit(10, TimeUnit.MINUTES)); - return time.inMilliseconds(); - } - - private void insertUser(UserUpdate userUpdate) { - if (userUpdate == null) { - return; - } - - if (userUpdate.antisocial != null) { - userTable.set( - userUpdate.userId, - UserTable.ANTISOCIAL_BIT, - userUpdate.antisocial); - antisocialCount++; - } - - 
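The three UserTable.set calls in insertUser pack independent per-user states (antisocial, NSFW, protected) into single-byte bit flags. A minimal, self-contained sketch of that bit-packing idea; the constants and method names below are illustrative only, not the actual UserTable API:

```java
// Illustrative sketch of packing several boolean user states into one byte per user.
final class UserFlagsSketch {
  static final byte ANTISOCIAL_BIT = 1;        // 0b001
  static final byte NSFW_BIT = 1 << 1;         // 0b010
  static final byte IS_PROTECTED_BIT = 1 << 2; // 0b100

  private byte bits;

  // Set or clear one flag without disturbing the others.
  void set(byte bit, boolean value) {
    bits = value ? (byte) (bits | bit) : (byte) (bits & ~bit);
  }

  boolean isSet(byte bit) {
    return (bits & bit) != 0;
  }
}
```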
if (userUpdate.nsfw != null) { - userTable.set( - userUpdate.userId, - UserTable.NSFW_BIT, - userUpdate.nsfw); - nsfwCount++; - } - - if (userUpdate.isProtected != null) { - userTable.set( - userUpdate.userId, - UserTable.IS_PROTECTED_BIT, - userUpdate.isProtected); - isProtectedCount++; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.docx b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.docx new file mode 100644 index 000000000..c18acff09 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.java b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.java deleted file mode 100644 index 6cfb0814c..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdate.java +++ /dev/null @@ -1,38 +0,0 @@ -package com.twitter.search.earlybird.common.userupdates; - -import java.util.Date; - -import com.twitter.search.common.indexing.thriftjava.UserUpdateType; - -/** - * Contains an update for a user. - */ -public class UserUpdate { - public final long twitterUserID; - public final UserUpdateType updateType; - public final int updateValue; - private final Date updatedAt; - - public UserUpdate(long twitterUserID, - UserUpdateType updateType, - int updateValue, - Date updatedAt) { - - this.twitterUserID = twitterUserID; - this.updateType = updateType; - this.updateValue = updateValue; - this.updatedAt = (Date) updatedAt.clone(); - } - - @Override public String toString() { - return "UserInfoUpdate[userID=" + twitterUserID + ",updateType=" + updateType - + ",updateValue=" + updateValue + ",updatedAt=" + getUpdatedAt() + "]"; - } - - /** - * Returns a copy of the updated-at date. - */ - public Date getUpdatedAt() { - return (Date) updatedAt.clone(); - } -} diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.docx b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.docx new file mode 100644 index 000000000..d203ff6b9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.docx differ diff --git a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.java b/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.java deleted file mode 100644 index b12558fe1..000000000 --- a/src/java/com/twitter/search/earlybird/common/userupdates/UserUpdatesChecker.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.search.earlybird.common.userupdates; - -import java.util.Date; -import java.util.concurrent.TimeUnit; - -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.indexing.thriftjava.UserUpdateType; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; - -/** - * Contains logic for deciding whether to apply a certain user update to the {@link UserTable}. - */ -public class UserUpdatesChecker { - private final Date antisocialStartDate; - private final Decider decider; - private final boolean isFullArchiveCluster; - - public UserUpdatesChecker(Clock clock, Decider decider, EarlybirdCluster cluster) { - // How many days of antisocial users to keep. A value of -1 means keeping all user updates. 
- long antisocialRecordDays = - EarlybirdConfig.getLong("keep_recent_antisocial_user_updates_days", 30); - this.antisocialStartDate = antisocialRecordDays > 0 - ? new Date(clock.nowMillis() - TimeUnit.DAYS.toMillis(antisocialRecordDays)) : null; - this.decider = decider; - this.isFullArchiveCluster = cluster == EarlybirdCluster.FULL_ARCHIVE; - } - - /** - * Decides whether to skip the given UserInfoUpdate. - */ - public boolean skipUserUpdate(UserUpdate userUpdate) { - if (userUpdate == null) { // always skip null updates - return true; - } - - UserUpdateType type = userUpdate.updateType; - - if (type == UserUpdateType.PROTECTED && skipProtectedUserUpdate()) { - return true; - } - - if (type == UserUpdateType.ANTISOCIAL && skipAntisocialUserUpdate(userUpdate)) { - return true; - } - - // NSFW users can continue to tweet even after they are marked as NSFW. That means - // that the snapshot needs to have all NSFW users from the beginning of time. Hence, no NSFW - // users updates check here. - - // pass all checks, do not skip this user update - return false; - } - - // Antisocial/suspended users can't tweet after they are suspended. Thus if our index stores - // tweets from the last 10 days, and they were suspended 60 days ago, we don't need them since - // there will be no tweets from them. We can save space by not storing info about those users. - - // (For archive, at rebuild time we filter out all suspended users tweets, so for a user that - // was suspended before a rebuild, no need to use space to store that the user is suspended) - private boolean skipAntisocialUserUpdate(UserUpdate userUpdate) { - return antisocialStartDate != null && userUpdate.getUpdatedAt().before(antisocialStartDate); - } - - // skip protected user updates for realtime and protected clusters - private boolean skipProtectedUserUpdate() { - return !isFullArchiveCluster; - } -} diff --git a/src/java/com/twitter/search/earlybird/config/BUILD b/src/java/com/twitter/search/earlybird/config/BUILD deleted file mode 100644 index 3bfb3ee1f..000000000 --- a/src/java/com/twitter/search/earlybird/config/BUILD +++ /dev/null @@ -1,21 +0,0 @@ -java_library( - sources = ["**/*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/com/google/code/findbugs:jsr305", - "3rdparty/jvm/com/google/guava", - "3rdparty/jvm/com/google/inject:guice", - "3rdparty/jvm/org/apache/thrift:libthrift", - "3rdparty/jvm/org/apache/zookeeper:zookeeper-client", - "3rdparty/jvm/org/slf4j:slf4j-api", - "src/java/com/twitter/common/base", - "src/java/com/twitter/common/util:system-mocks", - "src/java/com/twitter/search/common/config", - "src/java/com/twitter/search/common/metrics", - "src/java/com/twitter/search/common/partitioning/snowflakeparser", - "src/java/com/twitter/search/common/util/date", - "src/java/com/twitter/search/common/util/zookeeper", - "src/java/com/twitter/search/earlybird/common/config", - ], -) diff --git a/src/java/com/twitter/search/earlybird/config/BUILD.docx b/src/java/com/twitter/search/earlybird/config/BUILD.docx new file mode 100644 index 000000000..5f636e181 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/ServingRange.docx b/src/java/com/twitter/search/earlybird/config/ServingRange.docx new file mode 100644 index 000000000..81d0ba621 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/ServingRange.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/config/ServingRange.java b/src/java/com/twitter/search/earlybird/config/ServingRange.java deleted file mode 100644 index 076f3bd80..000000000 --- a/src/java/com/twitter/search/earlybird/config/ServingRange.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.earlybird.config; - -/** - * An interface for abstracting a tier's serving range. - */ -public interface ServingRange { - /** - * Returns the serving range's lowest tweet ID. - */ - long getServingRangeSinceId(); - - /** - * Returns the serving range's highest tweet ID. - */ - long getServingRangeMaxId(); - - /** - * Returns the serving range's earliest time, in seconds since epoch. - */ - long getServingRangeSinceTimeSecondsFromEpoch(); - - /** - * Returns the serving range's latest time, in seconds since epoch. - */ - long getServingRangeUntilTimeSecondsFromEpoch(); -} diff --git a/src/java/com/twitter/search/earlybird/config/TierConfig.docx b/src/java/com/twitter/search/earlybird/config/TierConfig.docx new file mode 100644 index 000000000..93fc691b7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierConfig.java b/src/java/com/twitter/search/earlybird/config/TierConfig.java deleted file mode 100644 index 4ef7d339c..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierConfig.java +++ /dev/null @@ -1,175 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.Date; -import java.util.Map; -import java.util.Set; - -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.config.ConfigFile; -import com.twitter.search.common.config.ConfigurationException; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.util.date.DateUtil; - -/** - * This class provides APIs to access the tier configurations for a cluster. - * Each tier has tier name, number of partitions, tier start time and end time. - */ -public final class TierConfig { - private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TierConfig.class); - - private static final String DEFAULT_CONFIG_DIR = "common/config"; - public static final String DEFAULT_TIER_FILE = "earlybird-tiers.yml"; - - public static final Date DEFAULT_TIER_START_DATE = DateUtil.toDate(2006, 3, 21); - // It's convenient for DEFAULT_TIER_END_DATE to be before ~2100, because then the output of - // FieldTermCounter.getHourValue(DEFAULT_TIER_END_END_DATE) can still fit into an integer. 
- public static final Date DEFAULT_TIER_END_DATE = DateUtil.toDate(2099, 1, 1); - - public static final String DEFAULT_TIER_NAME = "all"; - public static final boolean DEFAULT_ENABLED = true; - public static final TierInfo.RequestReadType DEFAULT_READ_TYPE = TierInfo.RequestReadType.LIGHT; - - private static ConfigFile tierConfigFile = null; - private static ConfigSource tierConfigSource = null; - - public enum ConfigSource { - LOCAL, - ZOOKEEPER - } - - private TierConfig() { } - - private static synchronized void init() { - if (tierConfigFile == null) { - tierConfigFile = new ConfigFile(DEFAULT_CONFIG_DIR, DEFAULT_TIER_FILE); - tierConfigSource = ConfigSource.LOCAL; - SearchLongGauge.export("tier_config_source_" + tierConfigSource.name()).set(1); - LOG.info("Tier config file " + DEFAULT_TIER_FILE + " is successfully loaded from bundle."); - } - } - - public static ConfigFile getConfigFile() { - init(); - return tierConfigFile; - } - - public static String getConfigFileName() { - return getConfigFile().getConfigFileName(); - } - - /** - * Return all the tier names specified in the config file. - */ - public static Set getTierNames() { - return Config.getConfig().getMapCopy(getConfigFileName()).keySet(); - } - - /** - * Sets the value of the given tier config property to the given value. - */ - public static void setForTests(String property, Object value) { - Config.getConfig().setForTests(DEFAULT_TIER_FILE, property, value); - } - - /** - * Returns the config info for the specified tier. - */ - public static TierInfo getTierInfo(String tierName) { - return getTierInfo(tierName, null /* use current environment */); - } - - /** - * Returns the config info for the specified tier and environment. - */ - public static TierInfo getTierInfo(String tierName, @Nullable String environment) { - String tierConfigFileType = getConfigFileName(); - Map tierInfo; - try { - tierInfo = (Map) Config.getConfig() - .getFromEnvironment(environment, tierConfigFileType, tierName); - } catch (ConfigurationException e) { - throw new RuntimeException(e); - } - if (tierInfo == null) { - LOG.error("Cannot find tier config for " - + tierName + "in config file: " + tierConfigFileType); - throw new RuntimeException("Configuration error: " + tierConfigFileType); - } - - Long partitions = (Long) tierInfo.get("number_of_partitions"); - if (partitions == null) { - LOG.error("No number of partition is specified for tier " - + tierName + " in tier config file " + tierConfigFileType); - throw new RuntimeException("Configuration error: " + tierConfigFileType); - } - - Long numTimeslices = (Long) tierInfo.get("serving_timeslices"); - if (numTimeslices == null) { - LOG.info("No max timeslices is specified for tier " - + tierName + " in tier config file " + tierConfigFileType - + ", not setting a cap on number of serving timeslices"); - // NOTE: we use max int32 here because it will ultimately be cast to an int, but the config - // map expects Longs for all integral types. Using Long.MAX_VALUE leads to max serving - // timeslices being set to -1 when it is truncated to an int. 
- numTimeslices = (long) Integer.MAX_VALUE; - } - - Date tierStartDate = (Date) tierInfo.get("data_range_start_date_inclusive"); - if (tierStartDate == null) { - tierStartDate = DEFAULT_TIER_START_DATE; - } - Date tierEndDate = (Date) tierInfo.get("data_range_end_date_exclusive"); - if (tierEndDate == null) { - tierEndDate = DEFAULT_TIER_END_DATE; - } - - Boolean tierEnabled = (Boolean) tierInfo.get("tier_enabled"); - if (tierEnabled == null) { - tierEnabled = DEFAULT_ENABLED; - } - - TierInfo.RequestReadType readType = - getRequestReadType((String) tierInfo.get("tier_read_type"), DEFAULT_READ_TYPE); - TierInfo.RequestReadType readTypeOverride = - getRequestReadType((String) tierInfo.get("tier_read_type_override"), readType); - - return new TierInfo( - tierName, - tierStartDate, - tierEndDate, - partitions.intValue(), - numTimeslices.intValue(), - tierEnabled, - (String) tierInfo.get("serving_range_since_id_exclusive"), - (String) tierInfo.get("serving_range_max_id_inclusive"), - (Date) tierInfo.get("serving_range_start_date_inclusive_override"), - (Date) tierInfo.get("serving_range_end_date_exclusive_override"), - readType, - readTypeOverride, - Clock.SYSTEM_CLOCK); - } - - public static synchronized void clear() { - tierConfigFile = null; - tierConfigSource = null; - } - - protected static synchronized ConfigSource getTierConfigSource() { - return tierConfigSource; - } - - private static TierInfo.RequestReadType getRequestReadType( - String readTypeEnumName, TierInfo.RequestReadType defaultReadType) { - TierInfo.RequestReadType readType = defaultReadType; - if (readTypeEnumName != null) { - readType = TierInfo.RequestReadType.valueOf(readTypeEnumName.trim().toUpperCase()); - Preconditions.checkState(readType != null); - } - return readType; - } -} diff --git a/src/java/com/twitter/search/earlybird/config/TierInfo.docx b/src/java/com/twitter/search/earlybird/config/TierInfo.docx new file mode 100644 index 000000000..3e62705bf Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierInfo.java b/src/java/com/twitter/search/earlybird/config/TierInfo.java deleted file mode 100644 index b4640224f..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierInfo.java +++ /dev/null @@ -1,180 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.Date; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; - -/** - * Properties of a single tier. - */ -public class TierInfo implements ServingRange { - // What I'm seeing historically is that this has been used when adding a new tier. First you - // add it and send dark traffic to it, then possibly grey and then you launch it by turning on - // light traffic. - public static enum RequestReadType { - // Light read: send request, wait for results, and results are returned - LIGHT, - // Dark read: send request, do not wait for results, and results are discarded - DARK, - // Grey read: send request, wait for results, but discard after results come back. - // Same results as dark read; similar latency as light read. 
- GREY, - } - - private final String tierName; - private final Date dataStartDate; - private final Date dataEndDate; - private final int numPartitions; - private final int maxTimeslices; - private final TierServingBoundaryEndPoint servingRangeSince; - private final TierServingBoundaryEndPoint servingRangeMax; - private final TierServingBoundaryEndPoint servingRangeSinceOverride; - private final TierServingBoundaryEndPoint servingRangeMaxOverride; - - // These two properties are only used by clients of Earlybird (E.g. roots), - // but not by Earlybirds. - private final boolean enabled; - private final RequestReadType readType; - private final RequestReadType readTypeOverride; - - public TierInfo(String tierName, - Date dataStartDate, - Date dataEndDate, - int numPartitions, - int maxTimeslices, - boolean enabled, - String sinceIdString, - String maxIdString, - Date servingStartDateOverride, - Date servingEndDateOverride, - RequestReadType readType, - RequestReadType readTypeOverride, - Clock clock) { - Preconditions.checkArgument(numPartitions > 0); - Preconditions.checkArgument(maxTimeslices > 0); - this.tierName = tierName; - this.dataStartDate = dataStartDate; - this.dataEndDate = dataEndDate; - this.numPartitions = numPartitions; - this.maxTimeslices = maxTimeslices; - this.enabled = enabled; - this.readType = readType; - this.readTypeOverride = readTypeOverride; - this.servingRangeSince = TierServingBoundaryEndPoint - .newTierServingBoundaryEndPoint(sinceIdString, dataStartDate, clock); - this.servingRangeMax = TierServingBoundaryEndPoint - .newTierServingBoundaryEndPoint(maxIdString, dataEndDate, clock); - if (servingStartDateOverride != null) { - this.servingRangeSinceOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint( - TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingStartDateOverride, clock); - } else { - this.servingRangeSinceOverride = servingRangeSince; - } - - if (servingEndDateOverride != null) { - this.servingRangeMaxOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint( - TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingEndDateOverride, clock); - } else { - this.servingRangeMaxOverride = servingRangeMax; - } - } - - @VisibleForTesting - public TierInfo(String tierName, - Date dataStartDate, - Date dataEndDate, - int numPartitions, - int maxTimeslices, - boolean enabled, - String sinceIdString, - String maxIdString, - RequestReadType readType, - Clock clock) { - // No overrides: - // servingRangeSinceOverride == servingRangeSince - // servingRangeMaxOverride == servingRangeMax - // readTypeOverride == readType - this(tierName, dataStartDate, dataEndDate, numPartitions, maxTimeslices, enabled, sinceIdString, - maxIdString, null, null, readType, readType, clock); - } - - @Override - public String toString() { - return tierName; - } - - public String getTierName() { - return tierName; - } - - public Date getDataStartDate() { - return dataStartDate; - } - - public Date getDataEndDate() { - return dataEndDate; - } - - public int getNumPartitions() { - return numPartitions; - } - - public int getMaxTimeslices() { - return maxTimeslices; - } - - public TierConfig.ConfigSource getSource() { - return TierConfig.getTierConfigSource(); - } - - public boolean isEnabled() { - return enabled; - } - - public boolean isDarkRead() { - return readType == RequestReadType.DARK; - } - - public RequestReadType getReadType() { - return readType; - } - - public RequestReadType getReadTypeOverride() { - return readTypeOverride; - } - - 
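The LIGHT/DARK/GREY read types described above differ only in how a caller treats the response. A hedged sketch (not actual Earlybird root code; the dispatch method and names are assumptions) of how a client could act on each type:

```java
import java.util.concurrent.CompletableFuture;
import java.util.function.Supplier;

// Sketch of acting on a tier's read type: LIGHT uses the response, GREY waits for
// latency parity but discards it, DARK does not even wait for the response.
final class ReadTypeSketch {
  enum RequestReadType { LIGHT, DARK, GREY }

  static String dispatch(RequestReadType readType, Supplier<String> sendRequest) {
    CompletableFuture<String> response = CompletableFuture.supplyAsync(sendRequest);
    switch (readType) {
      case LIGHT:
        return response.join();   // wait for results and return them
      case GREY:
        response.join();          // wait (same latency as light), then discard
        return null;
      case DARK:
      default:
        return null;              // fire and forget
    }
  }
}
```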
public long getServingRangeSinceId() { - return servingRangeSince.getBoundaryTweetId(); - } - - public long getServingRangeMaxId() { - return servingRangeMax.getBoundaryTweetId(); - } - - long getServingRangeOverrideSinceId() { - return servingRangeSinceOverride.getBoundaryTweetId(); - } - - long getServingRangeOverrideMaxId() { - return servingRangeMaxOverride.getBoundaryTweetId(); - } - - public long getServingRangeSinceTimeSecondsFromEpoch() { - return servingRangeSince.getBoundaryTimeSecondsFromEpoch(); - } - - public long getServingRangeUntilTimeSecondsFromEpoch() { - return servingRangeMax.getBoundaryTimeSecondsFromEpoch(); - } - - long getServingRangeOverrideSinceTimeSecondsFromEpoch() { - return servingRangeSinceOverride.getBoundaryTimeSecondsFromEpoch(); - } - - long getServingRangeOverrideUntilTimeSecondsFromEpoch() { - return servingRangeMaxOverride.getBoundaryTimeSecondsFromEpoch(); - } -} diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoSource.docx b/src/java/com/twitter/search/earlybird/config/TierInfoSource.docx new file mode 100644 index 000000000..d3bd1e7e9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierInfoSource.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoSource.java b/src/java/com/twitter/search/earlybird/config/TierInfoSource.java deleted file mode 100644 index 9835ca6ef..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierInfoSource.java +++ /dev/null @@ -1,39 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -import javax.inject.Inject; - -import com.twitter.search.common.util.zookeeper.ZooKeeperProxy; - -public class TierInfoSource { - private final ZooKeeperProxy zkClient; - - @Inject - public TierInfoSource(ZooKeeperProxy sZooKeeperClient) { - this.zkClient = sZooKeeperClient; - } - - public List getTierInformation() { - return getTierInfoWithPrefix("tier"); - } - - public String getConfigFileType() { - return TierConfig.getConfigFileName(); - } - - private List getTierInfoWithPrefix(String tierPrefix) { - Set tierNames = TierConfig.getTierNames(); - List tierInfos = new ArrayList<>(); - for (String name : tierNames) { - if (name.startsWith(tierPrefix)) { - TierInfo tierInfo = TierConfig.getTierInfo(name); - tierInfos.add(tierInfo); - } - } - return tierInfos; - } - -} diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx b/src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx new file mode 100644 index 000000000..67acf3448 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoUtil.java b/src/java/com/twitter/search/earlybird/config/TierInfoUtil.java deleted file mode 100644 index 995de7b81..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierInfoUtil.java +++ /dev/null @@ -1,78 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.Comparator; -import java.util.SortedSet; - -import com.google.common.base.Preconditions; - -public final class TierInfoUtil { - public static final Comparator TIER_COMPARATOR = (t1, t2) -> { - // Reverse sort order based on date. - return t2.getDataStartDate().compareTo(t1.getDataStartDate()); - }; - - private TierInfoUtil() { - } - - /** - * Checks that the serving ranges and the override serving ranges of the given tiers do not - * overlap, and do not have gaps. 
Dark reads tiers are ignored. - */ - public static void checkTierServingRanges(SortedSet tierInfos) { - boolean tierServingRangesOverlap = false; - boolean tierOverrideServingRangesOverlap = false; - boolean tierServingRangesHaveGaps = false; - boolean tierOverrideServingRangesHaveGaps = false; - - TierInfoWrapper previousTierInfoWrapper = null; - TierInfoWrapper previousOverrideTierInfoWrapper = null; - for (TierInfo tierInfo : tierInfos) { - TierInfoWrapper tierInfoWrapper = new TierInfoWrapper(tierInfo, false); - TierInfoWrapper overrideTierInfoWrapper = new TierInfoWrapper(tierInfo, true); - - // Check only the tiers to which we send light reads. - if (!tierInfoWrapper.isDarkRead()) { - if (previousTierInfoWrapper != null) { - if (TierInfoWrapper.servingRangesOverlap(previousTierInfoWrapper, tierInfoWrapper)) { - // In case of rebalancing, we may have an overlap data range while - // overriding with a good serving range. - if (previousOverrideTierInfoWrapper == null - || TierInfoWrapper.servingRangesOverlap( - previousOverrideTierInfoWrapper, overrideTierInfoWrapper)) { - tierServingRangesOverlap = true; - } - } - if (TierInfoWrapper.servingRangesHaveGap(previousTierInfoWrapper, tierInfoWrapper)) { - tierServingRangesHaveGaps = true; - } - } - - previousTierInfoWrapper = tierInfoWrapper; - } - - if (!overrideTierInfoWrapper.isDarkRead()) { - if (previousOverrideTierInfoWrapper != null) { - if (TierInfoWrapper.servingRangesOverlap(previousOverrideTierInfoWrapper, - overrideTierInfoWrapper)) { - tierOverrideServingRangesOverlap = true; - } - if (TierInfoWrapper.servingRangesHaveGap(previousOverrideTierInfoWrapper, - overrideTierInfoWrapper)) { - tierOverrideServingRangesHaveGaps = true; - } - } - - previousOverrideTierInfoWrapper = overrideTierInfoWrapper; - } - } - - Preconditions.checkState(!tierServingRangesOverlap, - "Serving ranges of light reads tiers must not overlap."); - Preconditions.checkState(!tierServingRangesHaveGaps, - "Serving ranges of light reads tiers must not have gaps."); - Preconditions.checkState(!tierOverrideServingRangesOverlap, - "Override serving ranges of light reads tiers must not overlap."); - Preconditions.checkState(!tierOverrideServingRangesHaveGaps, - "Override serving ranges of light reads tiers must not have gaps."); - } -} diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.docx b/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.docx new file mode 100644 index 000000000..15dffa7b0 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.java b/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.java deleted file mode 100644 index b6c3110dd..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierInfoWrapper.java +++ /dev/null @@ -1,89 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.Date; - -import com.google.common.base.Preconditions; - -/** - * A simple wrapper around TierInfo that returns the "real" or the "overriden" values from the given - * {@code TierInfo} instance, based on the given {@code useOverrideTierConfig} flag. 
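The overlap/gap validation in checkTierServingRanges above relies on since_id being exclusive and max_id inclusive. A small worked example with assumed tweet IDs, mirroring the predicates defined in TierInfoWrapper below:

```java
// Worked example (assumed values) of the overlap/gap predicates over tier serving ranges.
final class ServingRangePredicatesSketch {
  static boolean overlap(long since1, long max1, long since2, long max2) {
    return max1 > since2 && max2 > since1;
  }

  static boolean gap(long since1, long max1, long since2, long max2) {
    return max1 < since2 || max2 < since1;
  }

  public static void main(String[] args) {
    // Adjoining tiers share boundary id 5999: no overlap, no gap.
    System.out.println(overlap(4999, 5999, 5999, 6999)); // false
    System.out.println(gap(4999, 5999, 5999, 6999));     // false
    // A hole between 5999 and 6500 is reported as a gap.
    System.out.println(gap(4999, 5999, 6500, 6999));     // true
  }
}
```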
- */ -public class TierInfoWrapper implements ServingRange { - private final TierInfo tierInfo; - private final boolean useOverrideTierConfig; - - public TierInfoWrapper(TierInfo tierInfo, boolean useOverrideTierConfig) { - this.tierInfo = Preconditions.checkNotNull(tierInfo); - this.useOverrideTierConfig = useOverrideTierConfig; - } - - public String getTierName() { - return tierInfo.getTierName(); - } - - public Date getDataStartDate() { - return tierInfo.getDataStartDate(); - } - - public Date getDataEndDate() { - return tierInfo.getDataEndDate(); - } - - public int getNumPartitions() { - return tierInfo.getNumPartitions(); - } - - public int getMaxTimeslices() { - return tierInfo.getMaxTimeslices(); - } - - public TierConfig.ConfigSource getSource() { - return tierInfo.getSource(); - } - - public boolean isEnabled() { - return tierInfo.isEnabled(); - } - - public boolean isDarkRead() { - return getReadType() == TierInfo.RequestReadType.DARK; - } - - public TierInfo.RequestReadType getReadType() { - return useOverrideTierConfig ? tierInfo.getReadTypeOverride() : tierInfo.getReadType(); - } - - public long getServingRangeSinceId() { - return useOverrideTierConfig - ? tierInfo.getServingRangeOverrideSinceId() - : tierInfo.getServingRangeSinceId(); - } - - public long getServingRangeMaxId() { - return useOverrideTierConfig - ? tierInfo.getServingRangeOverrideMaxId() - : tierInfo.getServingRangeMaxId(); - } - - public long getServingRangeSinceTimeSecondsFromEpoch() { - return useOverrideTierConfig - ? tierInfo.getServingRangeOverrideSinceTimeSecondsFromEpoch() - : tierInfo.getServingRangeSinceTimeSecondsFromEpoch(); - } - - public long getServingRangeUntilTimeSecondsFromEpoch() { - return useOverrideTierConfig - ? tierInfo.getServingRangeOverrideUntilTimeSecondsFromEpoch() - : tierInfo.getServingRangeUntilTimeSecondsFromEpoch(); - } - - public static boolean servingRangesOverlap(TierInfoWrapper tier1, TierInfoWrapper tier2) { - return (tier1.getServingRangeMaxId() > tier2.getServingRangeSinceId()) - && (tier2.getServingRangeMaxId() > tier1.getServingRangeSinceId()); - } - - public static boolean servingRangesHaveGap(TierInfoWrapper tier1, TierInfoWrapper tier2) { - return (tier1.getServingRangeMaxId() < tier2.getServingRangeSinceId()) - || (tier2.getServingRangeMaxId() < tier1.getServingRangeSinceId()); - } -} diff --git a/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.docx b/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.docx new file mode 100644 index 000000000..775c896ba Binary files /dev/null and b/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.docx differ diff --git a/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.java b/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.java deleted file mode 100644 index 8a0eb852b..000000000 --- a/src/java/com/twitter/search/earlybird/config/TierServingBoundaryEndPoint.java +++ /dev/null @@ -1,146 +0,0 @@ -package com.twitter.search.earlybird.config; - -import java.util.Date; - -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; - -/** - * The start or end boundary of a tier's serving range. - * This is used to add since_id and max_id operators onto search queries. 
- */ -public class TierServingBoundaryEndPoint { - @VisibleForTesting - public static final String INFERRED_FROM_DATA_RANGE = "inferred_from_data_range"; - public static final String RELATIVE_TO_CURRENT_TIME_MS = "relative_to_current_time_ms"; - - // Either offsetToCurrentTimeMillis is set or (absoluteTweetId and timeBoundarySecondsFromEpoch) - // are set. - @Nullable - private final Long offsetToCurrentTimeMillis; - @Nullable - private final Long absoluteTweetId; - @Nullable - private final Long timeBoundarySecondsFromEpoch; - private final Clock clock; - - TierServingBoundaryEndPoint(Long absoluteTweetId, - Long timeBoundarySecondsFromEpoch, - Long offsetToCurrentTimeMillis, - Clock clock) { - this.offsetToCurrentTimeMillis = offsetToCurrentTimeMillis; - this.absoluteTweetId = absoluteTweetId; - this.timeBoundarySecondsFromEpoch = timeBoundarySecondsFromEpoch; - this.clock = clock; - } - - /** - * Parse the boundary string and construct a TierServingBoundaryEndPoint instance. - * @param boundaryString boundary configuration string. Valid values are: - *

  • "inferred_from_data_range" infers serving range from data range. This only works after Nov 2010 when Twitter switched to snowflake IDs. This is the default value.
  • "absolute_tweet_id_and_timestamp_millis:id:timestamp" a tweet ID/timestamp is given explicitly as the serving range boundary.
  • "relative_to_current_time_ms:offset" adds offset onto current timestamp in millis to compute serving range.
  • - * - * @param boundaryDate the data boundary. This is used in conjunction with - * inferred_from_data_date to determine the serving boundary. - * @param clock Clock used to obtain current time, when relative_to_current_time_ms is used. - * Tests pass in a FakeClock. - */ - public static TierServingBoundaryEndPoint newTierServingBoundaryEndPoint(String boundaryString, - Date boundaryDate, - Clock clock) { - if (boundaryString == null || boundaryString.trim().equals( - INFERRED_FROM_DATA_RANGE)) { - return inferBoundaryFromDataRange(boundaryDate, clock); - } else if (boundaryString.trim().startsWith(RELATIVE_TO_CURRENT_TIME_MS)) { - return getRelativeBoundary(boundaryString, clock); - } else { - throw new IllegalStateException("Cannot parse serving range string: " + boundaryString); - } - } - - private static TierServingBoundaryEndPoint inferBoundaryFromDataRange(Date boundaryDate, - Clock clock) { - // infer from data range - // handle default start date and end date, in case the dates are not specified in the config - if (boundaryDate.equals(TierConfig.DEFAULT_TIER_START_DATE)) { - return new TierServingBoundaryEndPoint( - -1L, TierConfig.DEFAULT_TIER_START_DATE.getTime() / 1000, null, clock); - } else if (boundaryDate.equals(TierConfig.DEFAULT_TIER_END_DATE)) { - return new TierServingBoundaryEndPoint( - Long.MAX_VALUE, TierConfig.DEFAULT_TIER_END_DATE.getTime() / 1000, null, clock); - } else { - // convert data start / end dates into since / max ID. - long boundaryTimeMillis = boundaryDate.getTime(); - if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(boundaryTimeMillis)) { - throw new IllegalStateException("Serving time range can not be determined, because " - + boundaryDate + " is before Twitter switched to snowflake tweet IDs."); - } - // Earlybird since_id is inclusive and max_id is exclusive. We substract 1 here. - // Consider example: - // full0: 5000 (inclusive) - 6000 (exclusive) - // full1: 6000 (inclusive) - 7000 (exclusive) - // For tier full0, we should use max_id 5999 instead of 6000. - // For tier full1, we should use since_id 5999 instead of 6000. - // Hence we substract 1 here. - long adjustedTweetId = - SnowflakeIdParser.generateValidStatusId(boundaryTimeMillis, 0) - 1; - Preconditions.checkState(adjustedTweetId >= 0, "boundary tweet ID must be non-negative"); - return new TierServingBoundaryEndPoint( - adjustedTweetId, boundaryTimeMillis / 1000, null, clock); - } - } - - private static TierServingBoundaryEndPoint getRelativeBoundary(String boundaryString, - Clock clock) { - // An offset relative to current time is given - String[] parts = boundaryString.split(":"); - Preconditions.checkState(parts.length == 2); - long offset = Long.parseLong(parts[1]); - return new TierServingBoundaryEndPoint(null, null, offset, clock); - } - - /** - * Returns the tweet ID for this tier boundary. If the tier boundary was created using a tweet ID, - * that tweet ID is returned. Otherwise, a tweet ID is derived from the time boundary. - */ - @VisibleForTesting - public long getBoundaryTweetId() { - // If absoluteTweetId is available, use it. - if (absoluteTweetId != null) { - return absoluteTweetId; - } else { - Preconditions.checkNotNull(offsetToCurrentTimeMillis); - long boundaryTime = clock.nowMillis() + offsetToCurrentTimeMillis; - return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0); - } - } - - /** - * Returns the time boundary for this tier boundary, in seconds since epoch. 
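The boundary inference described above converts a data-range date into a snowflake tweet ID and subtracts 1 so that adjacent tiers adjoin without overlap or gap. A rough sketch using the standard snowflake epoch and 22-bit timestamp shift; this is not the actual SnowflakeIdParser implementation:

```java
// Sketch of mapping a boundary date to a since_id/max_id boundary, with the minus-1
// adjustment described in the comments above.
final class SnowflakeBoundarySketch {
  private static final long TWEPOCH_MS = 1288834974657L; // standard snowflake epoch
  private static final int TIMESTAMP_SHIFT = 22;         // worker id + sequence bits

  // Smallest ID that could have been generated at boundaryTimeMillis.
  static long firstIdAt(long boundaryTimeMillis) {
    return (boundaryTimeMillis - TWEPOCH_MS) << TIMESTAMP_SHIFT;
  }

  public static void main(String[] args) {
    long boundaryMs = 1577836800000L; // 2020-01-01T00:00:00Z, an assumed example boundary
    long firstId = firstIdAt(boundaryMs);
    // The earlier tier serves max_id = firstId - 1 (inclusive); the later tier serves
    // since_id = firstId - 1 (exclusive), so the two adjoin exactly at the boundary.
    System.out.println("boundary id: " + (firstId - 1));
  }
}
```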
- */ - public long getBoundaryTimeSecondsFromEpoch() { - if (timeBoundarySecondsFromEpoch != null) { - return timeBoundarySecondsFromEpoch; - } else { - Preconditions.checkNotNull(offsetToCurrentTimeMillis); - return (clock.nowMillis() + offsetToCurrentTimeMillis) / 1000; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/document/DeletedStatus.docx b/src/java/com/twitter/search/earlybird/document/DeletedStatus.docx new file mode 100644 index 000000000..293aa27d4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/DeletedStatus.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/DeletedStatus.java b/src/java/com/twitter/search/earlybird/document/DeletedStatus.java deleted file mode 100644 index 6da8d23a0..000000000 --- a/src/java/com/twitter/search/earlybird/document/DeletedStatus.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.search.earlybird.document; - -/** - * DeletedStatus is a marker indicating that the specified tweet in the specified - * timeslice has been deleted. - */ -public final class DeletedStatus { - public final long timeSliceID; - public final long statusID; - - public DeletedStatus(long timeSliceID, long statusID) { - this.timeSliceID = timeSliceID; - this.statusID = statusID; - } -} diff --git a/src/java/com/twitter/search/earlybird/document/DocumentFactory.docx b/src/java/com/twitter/search/earlybird/document/DocumentFactory.docx new file mode 100644 index 000000000..04fba2d94 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/DocumentFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/DocumentFactory.java b/src/java/com/twitter/search/earlybird/document/DocumentFactory.java deleted file mode 100644 index 4745f329c..000000000 --- a/src/java/com/twitter/search/earlybird/document/DocumentFactory.java +++ /dev/null @@ -1,110 +0,0 @@ -package com.twitter.search.earlybird.document; - -import java.io.IOException; -import javax.annotation.Nullable; - -import org.apache.commons.codec.binary.Base64; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.IndexableField; -import org.apache.thrift.TBase; -import org.apache.thrift.TException; -import org.apache.thrift.TSerializer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.util.text.OmitNormTextField; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; - -/** - * Factory that constructs a Lucene document from a thrift object stored in T format. - * - * @param ThriftStatus or ThriftIndexingEvent, to be converted to a Lucene Document. - */ -public abstract class DocumentFactory> { - private static final Logger LOG = LoggerFactory.getLogger(DocumentFactory.class); - private static final int MAX_ALLOWED_INVALID_DOCUMENTS = 100; - - private static final SearchCounter INVALID_DOCUMENTS_COUNTER = - SearchCounter.export("invalid_documents"); - - private final CriticalExceptionHandler criticalExceptionHandler; - - public DocumentFactory(CriticalExceptionHandler criticalExceptionHandler) { - this.criticalExceptionHandler = criticalExceptionHandler; - } - - /** - * Given the thrift representation of a tweet, returns the associated tweetId. 
- */ - public abstract long getStatusId(T thriftObject); - - /** - * Given the thrift representation of a tweet, returns a Lucene Document with all the fields - * that need to be indexed. - */ - @Nullable - public final Document newDocument(T thriftObject) { - try { - return innerNewDocument(thriftObject); - } catch (Exception e) { - String statusId = "Not available"; - if (thriftObject != null) { - try { - statusId = Long.toString(getStatusId(thriftObject)); - } catch (Exception ex) { - LOG.error("Unable to get tweet id for document", ex); - statusId = "Not parsable"; - } - } - LOG.error("Unexpected exception while indexing. Status id: " + statusId, e); - - if (thriftObject != null) { - // Log the status in base64 for debugging - try { - LOG.warn("Bad ThriftStatus. Id: " + statusId + " base 64: " - + Base64.encodeBase64String(new TSerializer().serialize(thriftObject))); - } catch (TException e1) { - // Ignored since this is logging for debugging. - } - } - INVALID_DOCUMENTS_COUNTER.increment(); - if (INVALID_DOCUMENTS_COUNTER.get() > MAX_ALLOWED_INVALID_DOCUMENTS) { - criticalExceptionHandler.handle(this, e); - } - return new Document(); - } - } - - /** - * Given the thrift representation of a tweet, returns a Lucene Document with all the fields - * that need to be indexed. - * - * Return null if the given thrift object is invalid. - * - * @throws IOException if there are problems reading the input of producing the output. Exception - * is handled in {@link #newDocument(TBase)}. - */ - @Nullable - protected abstract Document innerNewDocument(T thriftObject) throws IOException; - - // Helper methods that prevent us from adding null fields to the lucene index - protected void addField(Document document, IndexableField field) { - if (field != null) { - document.add(field); - } - } - - protected Field newField(String data, String fieldName) { - return newField(data, fieldName, OmitNormTextField.TYPE_NOT_STORED); - } - - protected Field newField(String data, String fieldName, FieldType fieldType) { - if (data != null) { - return new Field(fieldName, data, fieldType); - } - return null; - } -} diff --git a/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.docx b/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.docx new file mode 100644 index 000000000..359e895e6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.java b/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.java deleted file mode 100644 index 4ef5909e5..000000000 --- a/src/java/com/twitter/search/earlybird/document/ThriftDocumentPreprocessor.java +++ /dev/null @@ -1,170 +0,0 @@ -package com.twitter.search.earlybird.document; - -import java.io.IOException; -import java.util.List; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTruthTableCounter; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import 
com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeaturesUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; - -import geo.google.datamodel.GeoAddressAccuracy; - -/** - * Used to preprocess a ThriftDocument before indexing. - */ -public final class ThriftDocumentPreprocessor { - private static final FieldNameToIdMapping ID_MAP = new EarlybirdFieldConstants(); - private static final String FILTER_LINK_VALUE = EarlybirdThriftDocumentUtil.formatFilter( - EarlybirdFieldConstant.LINKS_FIELD.getFieldName()); - private static final String HAS_LINK_VALUE = EarlybirdFieldConstant.getFacetSkipFieldName( - EarlybirdFieldConstant.LINKS_FIELD.getFieldName()); - - private ThriftDocumentPreprocessor() { - } - - /** - * Processes the given document. - */ - public static ThriftDocument preprocess( - ThriftDocument doc, EarlybirdCluster cluster, ImmutableSchemaInterface schema) - throws IOException { - patchArchiveThriftDocumentAccuracy(doc, cluster); - patchArchiveHasLinks(doc, cluster); - addAllMissingMinEngagementFields(doc, cluster, schema); - return doc; - } - - private static final SearchCounter GEO_SCRUBBED_COUNT = - SearchCounter.export("geo_scrubbed_count"); - private static final SearchCounter GEO_ARCHIVE_PATCHED_ACCURACY_COUNT = - SearchCounter.export("geo_archive_patched_accuracy_count"); - private static final SearchCounter GEO_MISSING_COORDINATE_COUNT = - SearchCounter.export("geo_missing_coordinate_count"); - private static final SearchCounter ARCHIVED_LINKS_FIELD_PATCHED_COUNT = - SearchCounter.export("links_field_patched_count"); - - /** - * Counter for all the combinations of nullcast bit set and nullcast filter set. - * - * Sum over `ThriftDocumentPreprocessor_nullcast_doc_stats__nullcastBitSet_true_*` to get all docs - * with nullcast bit set to true. - */ - private static final SearchTruthTableCounter NULLCAST_DOC_STATS = - SearchTruthTableCounter.export( - "ThriftDocumentPreprocessor_nullcast_doc_stats", - "nullcastBitSet", - "nullcastFilterSet"); - - /*** - * See JIRA SEARCH-7329 - */ - private static void patchArchiveThriftDocumentAccuracy(ThriftDocument doc, - EarlybirdCluster cluster) { - ThriftField geoField = ThriftDocumentUtil.getField( - doc, - EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), - ID_MAP); - if (geoField != null) { - if (!geoField.getFieldData().isSetGeoCoordinate()) { - GEO_MISSING_COORDINATE_COUNT.increment(); - return; - } - - // -1 means that the data is geo scrubbed. - if (geoField.getFieldData().getGeoCoordinate().getAccuracy() == -1) { - doc.getFields().remove(geoField); - GEO_SCRUBBED_COUNT.increment(); - } else if (EarlybirdCluster.isArchive(cluster)) { - // In archive indexing, we base precision on SearchArchiveStatus.getPrecision, which is not - // in the scale we want. We always use POINT_LEVEL scale for now. - geoField.getFieldData().getGeoCoordinate().setAccuracy( - GeoAddressAccuracy.POINT_LEVEL.getCode()); - GEO_ARCHIVE_PATCHED_ACCURACY_COUNT.increment(); - } - } - } - - /** - * See SEARCH-9635 - * This patch is used to replace - * ("field":"internal","term":"__filter_links") with - * ("field":"internal","term":"__has_links"). 
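NULLCAST_DOC_STATS above counts every combination of two booleans (nullcast bit set, nullcast filter set). A hypothetical sketch of that truth-table-counter idea; SearchTruthTableCounter's real API may differ:

```java
import java.util.concurrent.atomic.AtomicLongArray;

// Sketch of a truth-table counter: one counter per (a, b) combination.
final class TruthTableCounterSketch {
  // counts[0]=false/false, counts[1]=false/true, counts[2]=true/false, counts[3]=true/true
  private final AtomicLongArray counts = new AtomicLongArray(4);

  void record(boolean a, boolean b) {
    counts.incrementAndGet((a ? 2 : 0) + (b ? 1 : 0));
  }

  long get(boolean a, boolean b) {
    return counts.get((a ? 2 : 0) + (b ? 1 : 0));
  }
}
```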
- */ - private static void patchArchiveHasLinks(ThriftDocument doc, EarlybirdCluster cluster) { - if (!EarlybirdCluster.isArchive(cluster)) { - return; - } - - List fieldList = ThriftDocumentUtil.getFields(doc, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - ID_MAP); - for (ThriftField field : fieldList) { - if (field.getFieldData().getStringValue().equals(FILTER_LINK_VALUE)) { - field.getFieldData().setStringValue(HAS_LINK_VALUE); - ARCHIVED_LINKS_FIELD_PATCHED_COUNT.increment(); - break; - } - } - } - - /** - * Check whether the nullcast bit and nullcast filter are consistent in the given doc. - */ - public static boolean isNullcastBitAndFilterConsistent(ThriftDocument doc, - ImmutableSchemaInterface schema) { - return isNullcastBitAndFilterConsistent(doc, schema, NULLCAST_DOC_STATS); - } - - @VisibleForTesting - static boolean isNullcastBitAndFilterConsistent( - ThriftDocument doc, ImmutableSchemaInterface schema, SearchTruthTableCounter nullCastStats) { - final boolean isNullcastBitSet = EarlybirdThriftDocumentUtil.isNullcastBitSet(schema, doc); - final boolean isNullcastFilterSet = EarlybirdThriftDocumentUtil.isNullcastFilterSet(doc); - - // Track stats. - nullCastStats.record(isNullcastBitSet, isNullcastFilterSet); - - return isNullcastBitSet == isNullcastFilterSet; - } - - @VisibleForTesting - static void addAllMissingMinEngagementFields( - ThriftDocument doc, EarlybirdCluster cluster, ImmutableSchemaInterface schema - ) throws IOException { - if (!EarlybirdCluster.isArchive(cluster)) { - return; - } - EarlybirdFieldConstants.EarlybirdFieldConstant encodedFeatureFieldConstant = - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD; - byte[] encodedFeaturesBytes = ThriftDocumentUtil.getBytesValue(doc, - encodedFeatureFieldConstant.getFieldName(), ID_MAP); - if (encodedFeaturesBytes == null) { - return; - } - EarlybirdEncodedFeatures encodedFeatures = EarlybirdEncodedFeaturesUtil.fromBytes( - schema, - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD, - encodedFeaturesBytes, - 0); - for (String field: EarlybirdFieldConstants.MIN_ENGAGEMENT_FIELD_TO_CSF_NAME_MAP.keySet()) { - EarlybirdFieldConstant csfEngagementField = EarlybirdFieldConstants - .MIN_ENGAGEMENT_FIELD_TO_CSF_NAME_MAP.get(field); - Preconditions.checkState(csfEngagementField != null); - int engagementCounter = encodedFeatures.getFeatureValue(csfEngagementField); - EarlybirdThriftDocumentUtil.addNormalizedMinEngagementField(doc, field, engagementCounter); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.docx b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.docx new file mode 100644 index 000000000..c2bba8c59 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.java b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.java deleted file mode 100644 index d225a387e..000000000 --- a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventDocumentFactory.java +++ /dev/null @@ -1,246 +0,0 @@ -package com.twitter.search.earlybird.document; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.apache.lucene.document.Document; -import org.slf4j.Logger; 
-import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.schema.SchemaDocumentFactory; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.util.text.filter.NormalizedTokenFilter; -import com.twitter.search.common.util.text.splitter.HashtagMentionPunctuationSplitter; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; - -public class ThriftIndexingEventDocumentFactory extends DocumentFactory { - private static final Logger LOG = - LoggerFactory.getLogger(ThriftIndexingEventDocumentFactory.class); - - private static final FieldNameToIdMapping ID_MAPPING = new EarlybirdFieldConstants(); - private static final long TIMESTAMP_ALLOWED_FUTURE_DELTA_MS = TimeUnit.SECONDS.toMillis(60); - private static final String FILTER_TWEETS_WITH_FUTURE_TWEET_ID_AND_CREATED_AT_DECIDER_KEY = - "filter_tweets_with_future_tweet_id_and_created_at"; - - private static final SearchCounter NUM_TWEETS_WITH_FUTURE_TWEET_ID_AND_CREATED_AT_MS = - SearchCounter.export("num_tweets_with_future_tweet_id_and_created_at_ms"); - private static final SearchCounter NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_FOUND = - SearchCounter.export("num_tweets_with_inconsistent_tweet_id_and_created_at_ms_found"); - private static final SearchCounter - NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_ADJUSTED = - SearchCounter.export("num_tweets_with_inconsistent_tweet_id_and_created_at_ms_adjusted"); - private static final SearchCounter NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_DROPPED - = SearchCounter.export("num_tweets_with_inconsistent_tweet_id_and_created_at_ms_dropped"); - - @VisibleForTesting - static final String ENABLE_ADJUST_CREATED_AT_TIME_IF_MISMATCH_WITH_SNOWFLAKE = - "enable_adjust_created_at_time_if_mismatch_with_snowflake"; - - @VisibleForTesting - static final String ENABLE_DROP_CREATED_AT_TIME_IF_MISMATCH_WITH_SNOWFLAKE = - "enable_drop_created_at_time_if_mismatch_with_snowflake"; - - private final SchemaDocumentFactory schemaDocumentFactory; - private final EarlybirdCluster cluster; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final Decider decider; - private final Schema schema; - private final Clock clock; - - public ThriftIndexingEventDocumentFactory( - Schema schema, - EarlybirdCluster cluster, - Decider decider, - SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - this( - schema, - getSchemaDocumentFactory(schema, cluster, decider), - 
cluster, - searchIndexingMetricSet, - decider, - Clock.SYSTEM_CLOCK, - criticalExceptionHandler - ); - } - - /** - * Returns a document factory that knows how to convert ThriftDocuments to Documents based on the - * provided schema. - */ - public static SchemaDocumentFactory getSchemaDocumentFactory( - Schema schema, - EarlybirdCluster cluster, - Decider decider) { - return new SchemaDocumentFactory(schema, - Lists.newArrayList( - new TruncationTokenStreamWriter(cluster, decider), - (fieldInfo, stream) -> { - // Strip # @ $ symbols, and break up underscore connected tokens. - if (fieldInfo.getFieldType().useTweetSpecificNormalization()) { - return new HashtagMentionPunctuationSplitter(new NormalizedTokenFilter(stream)); - } - - return stream; - })); - } - - @VisibleForTesting - protected ThriftIndexingEventDocumentFactory( - Schema schema, - SchemaDocumentFactory schemaDocumentFactory, - EarlybirdCluster cluster, - SearchIndexingMetricSet searchIndexingMetricSet, - Decider decider, - Clock clock, - CriticalExceptionHandler criticalExceptionHandler) { - super(criticalExceptionHandler); - this.schema = schema; - this.schemaDocumentFactory = schemaDocumentFactory; - this.cluster = cluster; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.decider = decider; - this.clock = clock; - } - - @Override - public long getStatusId(ThriftIndexingEvent event) { - Preconditions.checkNotNull(event); - if (event.isSetDocument() && event.getDocument() != null) { - ThriftDocument thriftDocument = event.getDocument(); - try { - // Ideally, we should not call getSchemaSnapshot() here. But, as this is called only to - // retrieve status id and the ID field is static, this is fine for the purpose. - thriftDocument = ThriftDocumentPreprocessor.preprocess( - thriftDocument, cluster, schema.getSchemaSnapshot()); - } catch (IOException e) { - throw new IllegalStateException("Unable to obtain tweet ID from ThriftDocument", e); - } - return ThriftDocumentUtil.getLongValue( - thriftDocument, EarlybirdFieldConstant.ID_FIELD.getFieldName(), ID_MAPPING); - } else { - throw new IllegalArgumentException("ThriftDocument is null inside ThriftIndexingEvent."); - } - } - - @Override - protected Document innerNewDocument(ThriftIndexingEvent event) throws IOException { - Preconditions.checkNotNull(event); - Preconditions.checkNotNull(event.getDocument()); - - ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot(); - - // If the tweet id and create_at are in the future, do not index it. - if (areTweetIDAndCreateAtInTheFuture(event) - && DeciderUtil.isAvailableForRandomRecipient(decider, - FILTER_TWEETS_WITH_FUTURE_TWEET_ID_AND_CREATED_AT_DECIDER_KEY)) { - NUM_TWEETS_WITH_FUTURE_TWEET_ID_AND_CREATED_AT_MS.increment(); - return null; - } - - if (isNullcastBitAndFilterConsistent(schemaSnapshot, event)) { - ThriftDocument thriftDocument = - adjustOrDropIfTweetIDAndCreatedAtAreInconsistent( - ThriftDocumentPreprocessor.preprocess(event.getDocument(), cluster, schemaSnapshot)); - - if (thriftDocument != null) { - return schemaDocumentFactory.newDocument(thriftDocument); - } else { - return null; - } - } else { - return null; - } - } - - private ThriftDocument adjustOrDropIfTweetIDAndCreatedAtAreInconsistent(ThriftDocument document) { - final long tweetID = EarlybirdThriftDocumentUtil.getID(document); - // Thrift document is storing created at in seconds. 
- final long createdAtMs = EarlybirdThriftDocumentUtil.getCreatedAtMs(document); - - if (!SnowflakeIdParser.isTweetIDAndCreatedAtConsistent(tweetID, createdAtMs)) { - // Increment found counter. - NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_FOUND.increment(); - LOG.error( - "Found inconsistent tweet ID and created at timestamp: [tweetID={}], [createdAtMs={}]", - tweetID, createdAtMs); - - if (DeciderUtil.isAvailableForRandomRecipient( - decider, ENABLE_ADJUST_CREATED_AT_TIME_IF_MISMATCH_WITH_SNOWFLAKE)) { - // Update created at (and csf) with the time stamp in snow flake ID. - final long createdAtMsInID = SnowflakeIdParser.getTimestampFromTweetId(tweetID); - EarlybirdThriftDocumentUtil.replaceCreatedAtAndCreatedAtCSF( - document, (int) (createdAtMsInID / 1000)); - - // Increment adjusted counter. - NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_ADJUSTED.increment(); - LOG.error( - "Updated created at to match tweet ID: createdAtMs={}, tweetID={}, createdAtMsInID={}", - createdAtMs, tweetID, createdAtMsInID); - } else if (DeciderUtil.isAvailableForRandomRecipient( - decider, ENABLE_DROP_CREATED_AT_TIME_IF_MISMATCH_WITH_SNOWFLAKE)) { - // Drop and increment counter! - NUM_TWEETS_WITH_INCONSISTENT_TWEET_ID_AND_CREATED_AT_MS_DROPPED.increment(); - LOG.error( - "Dropped tweet with inconsistent ID and timestamp: createdAtMs={}, tweetID={}", - createdAtMs, tweetID); - return null; - } - } - - return document; - } - - private boolean isNullcastBitAndFilterConsistent( - ImmutableSchemaInterface schemaSnapshot, - ThriftIndexingEvent event) { - return ThriftDocumentPreprocessor.isNullcastBitAndFilterConsistent( - event.getDocument(), schemaSnapshot); - } - - /** - * Check if the tweet ID and create_at are in the future and beyond the allowed - * TIMESTAMP_ALLOWED_FUTURE_DELTA_MS range from current time stamp. 
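For intuition, here is a minimal sketch of the ID/timestamp relationship that the adjust/drop logic and this future check rely on. It assumes the public Snowflake layout (a millisecond timestamp in the bits above the 22 worker/sequence bits, offset from the Twitter epoch 1288834974657 ms); the production parsing and tolerance live in SnowflakeIdParser, and TIMESTAMP_ALLOWED_FUTURE_DELTA_MS above is 60 seconds.

// Illustrative only; the real parsing and tolerance checks live in SnowflakeIdParser.
final class SnowflakeTimestampSketch {
  private static final long TWITTER_EPOCH_MS = 1288834974657L;
  private static final long ALLOWED_FUTURE_DELTA_MS = 60_000L;

  // The millisecond timestamp sits above the 10 worker bits and 12 sequence bits.
  static long timestampFromTweetId(long tweetId) {
    return (tweetId >>> 22) + TWITTER_EPOCH_MS;
  }

  // Mirrors areTweetIDAndCreateAtInTheFuture: reject only if both the ID-derived
  // time and created_at are beyond the allowed future window.
  static boolean isInFuture(long tweetId, long createdAtMs, long nowMs) {
    long allowedFutureMs = nowMs + ALLOWED_FUTURE_DELTA_MS;
    return timestampFromTweetId(tweetId) > allowedFutureMs && createdAtMs > allowedFutureMs;
  }
}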
- */ - private boolean areTweetIDAndCreateAtInTheFuture(ThriftIndexingEvent event) { - ThriftDocument document = event.getDocument(); - - final long tweetID = EarlybirdThriftDocumentUtil.getID(document); - if (tweetID < SnowflakeIdParser.SNOWFLAKE_ID_LOWER_BOUND) { - return false; - } - - final long tweetIDTimestampMs = SnowflakeIdParser.getTimestampFromTweetId(tweetID); - final long allowedFutureTimestampMs = clock.nowMillis() + TIMESTAMP_ALLOWED_FUTURE_DELTA_MS; - - final long createdAtMs = EarlybirdThriftDocumentUtil.getCreatedAtMs(document); - if (tweetIDTimestampMs > allowedFutureTimestampMs && createdAtMs > allowedFutureTimestampMs) { - LOG.error( - "Found future tweet ID and created at timestamp: " - + "[tweetID={}], [createdAtMs={}], [compareDeltaMs={}]", - tweetID, createdAtMs, TIMESTAMP_ALLOWED_FUTURE_DELTA_MS); - return true; - } - - return false; - } -} diff --git a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.docx b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.docx new file mode 100644 index 000000000..3d0bdaa87 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.java b/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.java deleted file mode 100644 index 63a4ced1b..000000000 --- a/src/java/com/twitter/search/earlybird/document/ThriftIndexingEventUpdateFactory.java +++ /dev/null @@ -1,91 +0,0 @@ -package com.twitter.search.earlybird.document; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.document.Document; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.SchemaDocumentFactory; -import com.twitter.search.common.schema.base.FieldNameToIdMapping; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; - -/** - * Builds a Lucene Document from a ThriftIndexingEvent. A simplified version of - * {@link ThriftIndexingEventDocumentFactory} that can be used for update events, which exclude - * many fields that the tweet indexing events contain. 
- */ -public class ThriftIndexingEventUpdateFactory extends DocumentFactory { - private static final FieldNameToIdMapping ID_MAPPING = new EarlybirdFieldConstants(); - - private final SchemaDocumentFactory schemaDocumentFactory; - private final EarlybirdCluster cluster; - private final Schema schema; - - public ThriftIndexingEventUpdateFactory( - Schema schema, - EarlybirdCluster cluster, - Decider decider, - CriticalExceptionHandler criticalExceptionHandler) { - this( - schema, - ThriftIndexingEventDocumentFactory.getSchemaDocumentFactory(schema, cluster, decider), - cluster, - criticalExceptionHandler - ); - } - - @VisibleForTesting - protected ThriftIndexingEventUpdateFactory( - Schema schema, - SchemaDocumentFactory schemaDocumentFactory, - EarlybirdCluster cluster, - CriticalExceptionHandler criticalExceptionHandler) { - super(criticalExceptionHandler); - this.schema = schema; - this.schemaDocumentFactory = schemaDocumentFactory; - this.cluster = cluster; - } - - @Override - public long getStatusId(ThriftIndexingEvent event) { - Preconditions.checkNotNull(event); - Preconditions.checkState( - event.isSetDocument(), "ThriftDocument is null inside ThriftIndexingEvent."); - - ThriftDocument thriftDocument; - try { - // Ideally, we should not call getSchemaSnapshot() here. But, as this is called only to - // retrieve status id and the ID field is static, this is fine for the purpose. - thriftDocument = ThriftDocumentPreprocessor.preprocess( - event.getDocument(), cluster, schema.getSchemaSnapshot()); - } catch (IOException e) { - throw new IllegalStateException("Unable to obtain tweet ID from ThriftDocument: " + event, e); - } - return ThriftDocumentUtil.getLongValue( - thriftDocument, EarlybirdFieldConstant.ID_FIELD.getFieldName(), ID_MAPPING); - } - - @Override - protected Document innerNewDocument(ThriftIndexingEvent event) throws IOException { - Preconditions.checkNotNull(event); - Preconditions.checkNotNull(event.getDocument()); - - ImmutableSchemaInterface schemaSnapshot = schema.getSchemaSnapshot(); - - ThriftDocument document = ThriftDocumentPreprocessor.preprocess( - event.getDocument(), cluster, schemaSnapshot); - - return schemaDocumentFactory.newDocument(document); - } -} diff --git a/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.docx b/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.docx new file mode 100644 index 000000000..94b2590f8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.java b/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.java deleted file mode 100644 index 0e791a008..000000000 --- a/src/java/com/twitter/search/earlybird/document/TimeSlicedThriftIndexingEvent.java +++ /dev/null @@ -1,40 +0,0 @@ -package com.twitter.search.earlybird.document; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; - -/** - * Object to encapsulate {@link ThriftIndexingEvent} with a time slice ID. 
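A minimal usage sketch for the wrapper described above (the field values are made up; ThriftIndexingEvent is the Thrift-generated event type, so setUid is its generated setter):

ThriftIndexingEvent event = new ThriftIndexingEvent();
event.setUid(1234567890123456789L);  // tweet ID carried on the event
TimeSlicedThriftIndexingEvent timeSliced =
    new TimeSlicedThriftIndexingEvent(20230101L, event);
long statusId = timeSliced.getStatusID();      // delegates to event.getUid()
long timeSliceId = timeSliced.getTimeSliceID();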
- */ -public class TimeSlicedThriftIndexingEvent { - private final long timeSliceID; - private final ThriftIndexingEvent thriftIndexingEvent; - - public TimeSlicedThriftIndexingEvent(long timeSliceID, ThriftIndexingEvent thriftIndexingEvent) { - Preconditions.checkNotNull(thriftIndexingEvent); - - this.timeSliceID = timeSliceID; - this.thriftIndexingEvent = thriftIndexingEvent; - } - - public long getStatusID() { - return thriftIndexingEvent.getUid(); - } - - public long getTimeSliceID() { - return timeSliceID; - } - - public ThriftIndexingEvent getThriftIndexingEvent() { - return thriftIndexingEvent; - } - - @Override - public String toString() { - return "TimeSlicedThriftIndexingEvent{" - + "timeSliceID=" + timeSliceID - + ", thriftIndexingEvent=" + thriftIndexingEvent - + '}'; - } -} diff --git a/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.docx b/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.docx new file mode 100644 index 000000000..7af640cae Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.java b/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.java deleted file mode 100644 index 830ae7946..000000000 --- a/src/java/com/twitter/search/earlybird/document/TruncationTokenStreamWriter.java +++ /dev/null @@ -1,86 +0,0 @@ -package com.twitter.search.earlybird.document; - -import com.twitter.common.text.token.TokenProcessor; -import com.twitter.common.text.token.TwitterTokenStream; -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.schema.SchemaDocumentFactory; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; - -public class TruncationTokenStreamWriter implements SchemaDocumentFactory.TokenStreamRewriter { - private static final int NEVER_TRUNCATE_CHARS_BELOW_POSITION = 140; - private static final String TRUNCATE_LONG_TWEETS_DECIDER_KEY_PREFIX = - "truncate_long_tweets_in_"; - private static final String NUM_TWEET_CHARACTERS_SUPPORTED_DECIDER_KEY_PREFIX = - "num_tweet_characters_supported_in_"; - - private static final SearchCounter NUM_TWEETS_TRUNCATED = - SearchCounter.export("num_tweets_truncated"); - private static final SearchLongGauge NUM_TWEET_CHARACTERS_SUPPORTED = - SearchLongGauge.export("num_tweet_characters_supported"); - - private final Decider decider; - private final String truncateLongTweetsDeciderKey; - private final String numCharsSupportedDeciderKey; - - /** - * Creates a TruncationTokenStreamWriter - */ - public TruncationTokenStreamWriter(EarlybirdCluster cluster, Decider decider) { - this.decider = decider; - - this.truncateLongTweetsDeciderKey = - TRUNCATE_LONG_TWEETS_DECIDER_KEY_PREFIX + cluster.name().toLowerCase(); - this.numCharsSupportedDeciderKey = - NUM_TWEET_CHARACTERS_SUPPORTED_DECIDER_KEY_PREFIX + cluster.name().toLowerCase(); - } - - @Override - public TwitterTokenStream rewrite(Schema.FieldInfo fieldInfo, TwitterTokenStream stream) { - if (EarlybirdFieldConstant.TEXT_FIELD.getFieldName().equals(fieldInfo.getName())) { - final int maxPosition = 
getTruncatePosition(); - NUM_TWEET_CHARACTERS_SUPPORTED.set(maxPosition); - if (maxPosition >= NEVER_TRUNCATE_CHARS_BELOW_POSITION) { - return new TokenProcessor(stream) { - @Override - public final boolean incrementToken() { - if (incrementInputStream()) { - if (offset() < maxPosition) { - return true; - } - NUM_TWEETS_TRUNCATED.increment(); - } - - return false; - } - }; - } - } - - return stream; - } - - /** - * Get the truncation position. - * - * @return the truncation position or -1 if truncation is disabled. - */ - private int getTruncatePosition() { - int maxPosition; - if (!DeciderUtil.isAvailableForRandomRecipient(decider, truncateLongTweetsDeciderKey)) { - return -1; - } - maxPosition = DeciderUtil.getAvailability(decider, numCharsSupportedDeciderKey); - - if (maxPosition < NEVER_TRUNCATE_CHARS_BELOW_POSITION) { - // Never truncate below NEVER_TRUNCATE_CHARS_BELOW_POSITION chars - maxPosition = NEVER_TRUNCATE_CHARS_BELOW_POSITION; - } - - return maxPosition; - } -} diff --git a/src/java/com/twitter/search/earlybird/document/TweetDocument.docx b/src/java/com/twitter/search/earlybird/document/TweetDocument.docx new file mode 100644 index 000000000..c5cd5b49d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/document/TweetDocument.docx differ diff --git a/src/java/com/twitter/search/earlybird/document/TweetDocument.java b/src/java/com/twitter/search/earlybird/document/TweetDocument.java deleted file mode 100644 index 5d4dae6f4..000000000 --- a/src/java/com/twitter/search/earlybird/document/TweetDocument.java +++ /dev/null @@ -1,52 +0,0 @@ -package com.twitter.search.earlybird.document; - -import org.apache.lucene.document.Document; - -/** - * TweetDocument is a record produced by DocumentReader and TweetIndexUpdateReader - * for consumption by the partition indexer. 
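For the TruncationTokenStreamWriter above, the decider-driven truncation position boils down to the following sketch (illustrative; the real values come from DeciderUtil and the per-cluster decider keys): truncation is disabled entirely when the truncate decider is off, and otherwise the supported character count is clamped so text is never truncated below 140 characters.

// Sketch of getTruncatePosition(): -1 disables truncation, otherwise clamp to >= 140.
static int truncatePosition(boolean truncateDeciderOn, int numCharsSupportedAvailability) {
  if (!truncateDeciderOn) {
    return -1;
  }
  return Math.max(numCharsSupportedAvailability, 140);
}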
- */ -public final class TweetDocument { - private final long tweetID; - private final long timeSliceID; - private final long eventTimeMs; - private final Document document; - - public TweetDocument( - long tweetID, - long timeSliceID, - long eventTimeMs, - Document document - ) { - this.tweetID = tweetID; - this.timeSliceID = timeSliceID; - this.eventTimeMs = eventTimeMs; - this.document = document; - } - - public long getTweetID() { - return tweetID; - } - - public long getTimeSliceID() { - return timeSliceID; - } - - public long getEventTimeMs() { - return eventTimeMs; - } - - public Document getDocument() { - return document; - } - - @Override - public String toString() { - return "TweetDocument{" - + "tweetID=" + tweetID - + ", timeSliceID=" + timeSliceID - + ", eventTimeMs=" + eventTimeMs - + ", document=" + document - + '}'; - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.docx b/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.docx new file mode 100644 index 000000000..7c2014a85 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.java b/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.java deleted file mode 100644 index d6db98dad..000000000 --- a/src/java/com/twitter/search/earlybird/exception/AlreadyInServerSetUpdateException.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import com.twitter.common.zookeeper.ServerSet; - -/** - * Used when trying to join a server set when this earlybird is already in a server set. - */ -public class AlreadyInServerSetUpdateException extends ServerSet.UpdateException { - public AlreadyInServerSetUpdateException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/BadRequestException.docx b/src/java/com/twitter/search/earlybird/exception/BadRequestException.docx new file mode 100644 index 000000000..e6da39aae Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/BadRequestException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/BadRequestException.java b/src/java/com/twitter/search/earlybird/exception/BadRequestException.java deleted file mode 100644 index b19db6571..000000000 --- a/src/java/com/twitter/search/earlybird/exception/BadRequestException.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class BadRequestException extends Exception { - public BadRequestException(String message, Throwable cause) { - super(message, cause); - } - - public BadRequestException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/ClientException.docx b/src/java/com/twitter/search/earlybird/exception/ClientException.docx new file mode 100644 index 000000000..4f94ebf23 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/ClientException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/ClientException.java b/src/java/com/twitter/search/earlybird/exception/ClientException.java deleted file mode 100644 index 3387f0662..000000000 --- a/src/java/com/twitter/search/earlybird/exception/ClientException.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class 
ClientException extends Exception { - public ClientException(Throwable t) { - super(t); - } - - public ClientException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.docx b/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.docx new file mode 100644 index 000000000..9bdaa269b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.java b/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.java deleted file mode 100644 index a2b72511f..000000000 --- a/src/java/com/twitter/search/earlybird/exception/CriticalExceptionHandler.java +++ /dev/null @@ -1,114 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.Marker; -import org.slf4j.MarkerFactory; - -import com.twitter.search.common.config.Config; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.earlybird.EarlybirdStatus; - -/** - * Used for handling exceptions considered critical. - * - * When you handle an exception with this class, two things might happen. - * 1. If earlybirds are still starting, we'll shut them down. - * 2. If earlybirds have started, we'll increment a counter that will cause alerts. - * - * If you want to verify that your code handles exceptions as you expect, you can use the - * helper class ExceptionCauser. - */ -public class CriticalExceptionHandler { - private static final Logger LOG = LoggerFactory.getLogger(CriticalExceptionHandler.class); - private static final Marker FATAL = MarkerFactory.getMarker("FATAL"); - - // This stat should remain at 0 during normal operations. - // This stat being non-zero should trigger alerts. - public static final SearchCounter CRITICAL_EXCEPTION_COUNT = - SearchCounter.export("fatal_exception_count"); - - public static final SearchCounter UNSAFE_MEMORY_ACCESS = - SearchCounter.export("unsafe_memory_access"); - - private Runnable shutdownHook; - - public void setShutdownHook(Runnable shutdownHook) { - this.shutdownHook = shutdownHook; - } - - /** - * Handle a critical exception. - * - * @param thrower Instance of the class where the exception was thrown. - * @param thrown The exception. - */ - public void handle(Object thrower, Throwable thrown) { - if (thrown == null) { - return; - } - - try { - handleFatalException(thrower, thrown); - } catch (Throwable e) { - LOG.error("Unexpected exception in EarlybirdExceptionHandler.handle() while handling an " - + "unexpected exception from " + thrower.getClass(), e); - } - } - - @VisibleForTesting - boolean shouldIncrementFatalExceptionCounter(Throwable thrown) { - // See D212952 - // We don't want to get pages when this happens. - for (Throwable t = thrown; t != null; t = t.getCause()) { - if (t instanceof InternalError && t.getMessage() != null - && t.getMessage().contains("unsafe memory access operation")) { - // Don't treat InternalError caused by unsafe memory access operation which is usually - // triggered by SIGBUS for accessing a corrupted memory block. - UNSAFE_MEMORY_ACCESS.increment(); - return false; - } - } - - return true; - } - - /** - * Handle an exception that's considered fatal. - * - * @param thrower instance of the class where the exception was thrown. 
- * @param thrown The Error or Exception. - */ - private void handleFatalException(Object thrower, Throwable thrown) { - LOG.error(FATAL, "Fatal exception in " + thrower.getClass() + ":", thrown); - - if (shouldIncrementFatalExceptionCounter(thrown)) { - CRITICAL_EXCEPTION_COUNT.increment(); - } - - if (EarlybirdStatus.isStarting()) { - LOG.error(FATAL, "Got fatal exception while starting up, exiting ..."); - if (this.shutdownHook != null) { - this.shutdownHook.run(); - } else { - LOG.error("earlybirdServer not set, can't shut down."); - } - - if (!Config.environmentIsTest()) { - // Sleep for 3 minutes to allow the fatal exception to be caught by observability. - try { - Thread.sleep(3 * 60 * 1000); - } catch (InterruptedException e) { - LOG.error(FATAL, "interupted sleep while shutting down."); - } - LOG.info("Terminate JVM."); - //CHECKSTYLE:OFF RegexpSinglelineJava - // See SEARCH-15256 - System.exit(-1); - //CHECKSTYLE:ON RegexpSinglelineJava - } - } - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdException.docx b/src/java/com/twitter/search/earlybird/exception/EarlybirdException.docx new file mode 100644 index 000000000..4fd90ec80 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/EarlybirdException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdException.java b/src/java/com/twitter/search/earlybird/exception/EarlybirdException.java deleted file mode 100644 index fe82488a4..000000000 --- a/src/java/com/twitter/search/earlybird/exception/EarlybirdException.java +++ /dev/null @@ -1,18 +0,0 @@ -package com.twitter.search.earlybird.exception; - -/** - * General Earlybird exception class to use instead of the Java exception class. - */ -public class EarlybirdException extends Exception { - public EarlybirdException(Throwable cause) { - super(cause); - } - - public EarlybirdException(String message) { - super(message); - } - - public EarlybirdException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.docx b/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.docx new file mode 100644 index 000000000..de9a332d6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.java b/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.java deleted file mode 100644 index 92971b48c..000000000 --- a/src/java/com/twitter/search/earlybird/exception/EarlybirdFinagleServerMonitor.java +++ /dev/null @@ -1,25 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import com.twitter.finagle.Failure; -import com.twitter.util.AbstractMonitor; - -public class EarlybirdFinagleServerMonitor extends AbstractMonitor { - private final CriticalExceptionHandler criticalExceptionHandler; - - public EarlybirdFinagleServerMonitor(CriticalExceptionHandler criticalExceptionHandler) { - this.criticalExceptionHandler = criticalExceptionHandler; - } - - @Override - public boolean handle(Throwable e) { - if (e instanceof Failure) { - // skip Finagle failure - return true; - } - - criticalExceptionHandler.handle(this, e); - - // We return true here because we handle all exceptions. 
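A hypothetical wiring sketch for the CriticalExceptionHandler described above: the server installs a shutdown hook once (EarlybirdServerFactory does this with earlybirdServer::shutdown), and callers report unexpected throwables through handle(), which increments fatal_exception_count and, during startup, shuts the server down. indexNextBatch is a made-up placeholder for a unit of work.

CriticalExceptionHandler handler = new CriticalExceptionHandler();
handler.setShutdownHook(earlybirdServer::shutdown);
try {
  indexNextBatch();         // hypothetical unit of work
} catch (Throwable t) {
  handler.handle(this, t);  // alerts after startup, exits during startup
}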
- return true; - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.docx b/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.docx new file mode 100644 index 000000000..51545a36e Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.java b/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.java deleted file mode 100644 index a570324fd..000000000 --- a/src/java/com/twitter/search/earlybird/exception/EarlybirdRuntimeException.java +++ /dev/null @@ -1,7 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class EarlybirdRuntimeException extends RuntimeException { - public EarlybirdRuntimeException(Throwable cause) { - super(cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.docx b/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.docx new file mode 100644 index 000000000..fee6f1e01 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.java b/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.java deleted file mode 100644 index 5c40cd0e3..000000000 --- a/src/java/com/twitter/search/earlybird/exception/EarlybirdStartupException.java +++ /dev/null @@ -1,20 +0,0 @@ -package com.twitter.search.earlybird.exception; - -/** - * Thrown by code that is executed during startup and used to communicate to caller that startup - * has failed. Generally results in shutting down of the server, but check on your own if you - * need to. 
- */ -public class EarlybirdStartupException extends Exception { - public EarlybirdStartupException(Throwable cause) { - super(cause); - } - - public EarlybirdStartupException(String message) { - super(message); - } - - public EarlybirdStartupException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.docx b/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.docx new file mode 100644 index 000000000..8b75ea9d6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.java b/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.java deleted file mode 100644 index e6cee497d..000000000 --- a/src/java/com/twitter/search/earlybird/exception/FlushVersionMismatchException.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import java.io.IOException; - -public class FlushVersionMismatchException extends IOException { - public FlushVersionMismatchException(Throwable cause) { - super(cause); - } - - public FlushVersionMismatchException(String message) { - super(message); - } - - public FlushVersionMismatchException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.docx b/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.docx new file mode 100644 index 000000000..5214089a9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.java b/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.java deleted file mode 100644 index f6d1c675d..000000000 --- a/src/java/com/twitter/search/earlybird/exception/MissingKafkaTopicException.java +++ /dev/null @@ -1,11 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class MissingKafkaTopicException extends Exception { - public MissingKafkaTopicException(String message) { - super(message); - } - - public MissingKafkaTopicException(String message, Throwable cause) { - super(message, cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/MissingUserException.docx b/src/java/com/twitter/search/earlybird/exception/MissingUserException.docx new file mode 100644 index 000000000..ed35d6841 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/MissingUserException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/MissingUserException.java b/src/java/com/twitter/search/earlybird/exception/MissingUserException.java deleted file mode 100644 index fab3886f2..000000000 --- a/src/java/com/twitter/search/earlybird/exception/MissingUserException.java +++ /dev/null @@ -1,4 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class MissingUserException extends Exception { -} diff --git a/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.docx b/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.docx new file mode 100644 index 000000000..e5610b69e Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.java b/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.java deleted file mode 100644 index 1be7e7679..000000000 --- a/src/java/com/twitter/search/earlybird/exception/NotInServerSetUpdateException.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import com.twitter.common.zookeeper.ServerSet; - -/** - * Used when trying to leave a server set when this earlybird is already out of the server set. - */ -public class NotInServerSetUpdateException extends ServerSet.UpdateException { - public NotInServerSetUpdateException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/TransientException.docx b/src/java/com/twitter/search/earlybird/exception/TransientException.docx new file mode 100644 index 000000000..d8d44b35f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/TransientException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/TransientException.java b/src/java/com/twitter/search/earlybird/exception/TransientException.java deleted file mode 100644 index 76f6b8dc6..000000000 --- a/src/java/com/twitter/search/earlybird/exception/TransientException.java +++ /dev/null @@ -1,15 +0,0 @@ -package com.twitter.search.earlybird.exception; - -public class TransientException extends Exception { - public TransientException(Throwable t) { - super(t); - } - - public TransientException(String message, Throwable cause) { - super(message, cause); - } - - public TransientException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.docx b/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.docx new file mode 100644 index 000000000..0cc3fa4c8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.java b/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.java deleted file mode 100644 index 300e855fa..000000000 --- a/src/java/com/twitter/search/earlybird/exception/UncaughtExceptionHandler.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import com.twitter.util.AbstractMonitor; - -public class UncaughtExceptionHandler extends AbstractMonitor { - private final CriticalExceptionHandler criticalExceptionHandler; - - public UncaughtExceptionHandler() { - this.criticalExceptionHandler = new CriticalExceptionHandler(); - } - - public void setShutdownHook(Runnable shutdown) { - this.criticalExceptionHandler.setShutdownHook(shutdown); - } - - @Override - public boolean handle(Throwable e) { - criticalExceptionHandler.handle(this, e); - - // We return true here because we handle all exceptions. 
- return true; - } -} diff --git a/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.docx b/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.docx new file mode 100644 index 000000000..8e523a9ae Binary files /dev/null and b/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.docx differ diff --git a/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.java b/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.java deleted file mode 100644 index de5126dad..000000000 --- a/src/java/com/twitter/search/earlybird/exception/WrappedKafkaApiException.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.earlybird.exception; - -import org.apache.kafka.common.errors.ApiException; - -/** - * Kafka's ApiException class doesn't retain its stack trace (see its source code). - * As a result a kafka exception that propagates up the call chain can't point to where exactly - * did the exception happen in our code. As a solution, use this class when calling kafka API - * methods. - */ -public class WrappedKafkaApiException extends RuntimeException { - public WrappedKafkaApiException(ApiException cause) { - super(cause); - } - - public WrappedKafkaApiException(String message, ApiException cause) { - super(message, cause); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.docx b/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.docx new file mode 100644 index 000000000..5ae4682fd Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.java b/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.java deleted file mode 100644 index 7fcef3e0b..000000000 --- a/src/java/com/twitter/search/earlybird/factory/EarlybirdIndexConfigUtil.java +++ /dev/null @@ -1,53 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.RealtimeEarlybirdIndexConfig; -import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig; -import com.twitter.search.earlybird.archive.ArchiveSearchPartitionManager; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; - -public final class EarlybirdIndexConfigUtil { - private EarlybirdIndexConfigUtil() { - } - - /** - * Creates the index config for this earlybird. 
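Looking back at WrappedKafkaApiException above, a small sketch of the wrapping pattern its javadoc calls for: catch the Kafka ApiException at the call site and rethrow it wrapped, so the resulting stack trace points into our code. commitSync is just one example of a Kafka API that can surface ApiException subclasses; clientId is a placeholder.

try {
  kafkaConsumer.commitSync();
} catch (org.apache.kafka.common.errors.ApiException e) {
  throw new WrappedKafkaApiException("commitSync failed for " + clientId, e);
}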
- */ - public static EarlybirdIndexConfig createEarlybirdIndexConfig( - Decider decider, SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - if (isArchiveSearch()) { - return new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet, - criticalExceptionHandler); - } else if (isProtectedSearch()) { - return new RealtimeEarlybirdIndexConfig( - EarlybirdCluster.PROTECTED, decider, searchIndexingMetricSet, criticalExceptionHandler); - } else if (isRealtimeCG()) { - return new RealtimeEarlybirdIndexConfig( - EarlybirdCluster.REALTIME_CG, decider, searchIndexingMetricSet, criticalExceptionHandler); - } else { - return new RealtimeEarlybirdIndexConfig( - EarlybirdCluster.REALTIME, decider, searchIndexingMetricSet, criticalExceptionHandler); - } - } - - public static boolean isArchiveSearch() { - // Re-reading config on each call so that tests can reliably overwrite this - return EarlybirdConfig.getString("partition_manager", "realtime") - .equals(ArchiveSearchPartitionManager.CONFIG_NAME); - } - - private static boolean isProtectedSearch() { - // Re-reading config on each call so that tests can reliably overwrite this - return EarlybirdConfig.getBool("protected_index", false); - } - - private static boolean isRealtimeCG() { - // Re-reading config on each call so that tests can reliably overwrite this - return EarlybirdConfig.getBool("realtime_cg_index", false); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.docx b/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.docx new file mode 100644 index 000000000..58f649ebf Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.java b/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.java deleted file mode 100644 index e360a3df1..000000000 --- a/src/java/com/twitter/search/earlybird/factory/EarlybirdKafkaConsumersFactory.java +++ /dev/null @@ -1,19 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import org.apache.kafka.clients.consumer.KafkaConsumer; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; - -public interface EarlybirdKafkaConsumersFactory { - /** - * Create a kafka consumer with default records to be polled. - */ - KafkaConsumer createKafkaConsumer( - String clientID); - - /** - * Create a kafka consumer with a set number of records to be polled. 
- */ - KafkaConsumer createKafkaConsumer( - String clientID, int maxPollRecords); -} diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.docx b/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.docx new file mode 100644 index 000000000..4b856c76d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.java b/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.java deleted file mode 100644 index c8459a3f0..000000000 --- a/src/java/com/twitter/search/earlybird/factory/EarlybirdServerFactory.java +++ /dev/null @@ -1,353 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.aurora.AuroraInstanceKey; -import com.twitter.search.common.aurora.AuroraSchedulerClient; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.EarlybirdDarkProxy; -import com.twitter.search.earlybird.EarlybirdFinagleServerManager; -import com.twitter.search.earlybird.EarlybirdFuturePoolManager; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.EarlybirdServer; -import com.twitter.search.earlybird.EarlybirdServerSetManager; -import com.twitter.search.earlybird.EarlybirdWarmUpManager; -import com.twitter.search.earlybird.QualityFactor; -import com.twitter.search.earlybird.UpdateableEarlybirdStateManager; -import com.twitter.search.earlybird.archive.ArchiveEarlybirdIndexConfig; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserUpdatesChecker; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.ml.ScoringModelsManager; -import com.twitter.search.earlybird.partition.AudioSpaceEventsStreamIndexer; -import com.twitter.search.earlybird.partition.AudioSpaceTable; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.EarlybirdStartup; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.partition.PartitionConfig; -import com.twitter.search.earlybird.partition.PartitionManager; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.partition.UserScrubGeoEventStreamIndexer; -import com.twitter.search.earlybird.partition.UserUpdatesStreamIndexer; -import com.twitter.search.earlybird.querycache.QueryCacheConfig; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import 
com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.util.TermCountMonitor; -import com.twitter.search.earlybird.util.TweetCountMonitor; - -/** - * This is the wiring file that builds EarlybirdServers. - * Production and test code share this same wiring file. - *


    - * To supply mocks for testing, one can do so by supplying a different - * EarlybirdWiringModule to this wiring file. - */ -public final class EarlybirdServerFactory { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdServerFactory.class); - - /** - * Creates the EarlybirdServer based on the bindings in the given wire module. - * - * @param earlybirdWireModule The wire module that specifies all required bindings. - */ - public EarlybirdServer makeEarlybirdServer(EarlybirdWireModule earlybirdWireModule) - throws IOException { - LOG.info("Started making an Earlybird server"); - CriticalExceptionHandler criticalExceptionHandler = new CriticalExceptionHandler(); - Decider decider = earlybirdWireModule.provideDecider(); - SearchDecider searchDecider = new SearchDecider(decider); - - EarlybirdWireModule.ZooKeeperClients zkClients = earlybirdWireModule.provideZooKeeperClients(); - ZooKeeperTryLockFactory zkTryLockFactory = - zkClients.stateClient.createZooKeeperTryLockFactory(); - - EarlybirdIndexConfig earlybirdIndexConfig = - earlybirdWireModule.provideEarlybirdIndexConfig( - decider, earlybirdWireModule.provideSearchIndexingMetricSet(), - criticalExceptionHandler); - - SearchStatsReceiver earlybirdServerStats = - earlybirdWireModule.provideEarlybirdServerStatsReceiver(); - - EarlybirdSearcherStats tweetsSearcherStats = - earlybirdWireModule.provideTweetsSearcherStats(); - - DynamicPartitionConfig dynamicPartitionConfig = - earlybirdWireModule.provideDynamicPartitionConfig(); - - PartitionConfig partitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - LOG.info("Partition config info [Cluster: {}, Tier: {}, Partition: {}, Replica: {}]", - partitionConfig.getClusterName(), - partitionConfig.getTierName(), - partitionConfig.getIndexingHashPartitionID(), - partitionConfig.getHostPositionWithinHashPartition()); - - Clock clock = earlybirdWireModule.provideClock(); - UserUpdatesChecker userUpdatesChecker = - new UserUpdatesChecker(clock, decider, earlybirdIndexConfig.getCluster()); - - UserTable userTable = UserTable.newTableWithDefaultCapacityAndPredicate( - earlybirdIndexConfig.getUserTableFilter(partitionConfig)::apply); - - UserScrubGeoMap userScrubGeoMap = new UserScrubGeoMap(); - - AudioSpaceTable audioSpaceTable = new AudioSpaceTable(clock); - - SegmentSyncConfig segmentSyncConfig = - earlybirdWireModule.provideSegmentSyncConfig(earlybirdIndexConfig.getCluster()); - - SegmentManager segmentManager = earlybirdWireModule.provideSegmentManager( - dynamicPartitionConfig, - earlybirdIndexConfig, - earlybirdWireModule.provideSearchIndexingMetricSet(), - tweetsSearcherStats, - earlybirdServerStats, - userUpdatesChecker, - segmentSyncConfig, - userTable, - userScrubGeoMap, - clock, - criticalExceptionHandler); - - QueryCacheConfig config = earlybirdWireModule.provideQueryCacheConfig(earlybirdServerStats); - - QueryCacheManager queryCacheManager = earlybirdWireModule.provideQueryCacheManager( - config, - earlybirdIndexConfig, - partitionConfig.getMaxEnabledLocalSegments(), - userTable, - userScrubGeoMap, - earlybirdWireModule.provideQueryCacheUpdateTaskScheduledExecutorFactory(), - earlybirdServerStats, - tweetsSearcherStats, - decider, - criticalExceptionHandler, - clock); - - EarlybirdServerSetManager serverSetManager = earlybirdWireModule.provideServerSetManager( - zkClients.discoveryClient, - dynamicPartitionConfig, - earlybirdServerStats, - EarlybirdConfig.getThriftPort(), - ""); - - EarlybirdWarmUpManager warmUpManager = - 
earlybirdWireModule.provideWarmUpManager(zkClients.discoveryClient, - dynamicPartitionConfig, - earlybirdServerStats, - decider, - clock, - EarlybirdConfig.getWarmUpThriftPort(), - "warmup_"); - - EarlybirdDarkProxy earlybirdDarkProxy = earlybirdWireModule.provideEarlybirdDarkProxy( - new SearchDecider(decider), - earlybirdWireModule.provideFinagleStatsReceiver(), - serverSetManager, - warmUpManager, - partitionConfig.getClusterName()); - - UserUpdatesStreamIndexer userUpdatesStreamIndexer = - earlybirdWireModule.provideUserUpdatesKafkaConsumer(segmentManager); - - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer = - earlybirdWireModule.provideUserScrubGeoEventKafkaConsumer(segmentManager); - - AudioSpaceEventsStreamIndexer audioSpaceEventsStreamIndexer = - earlybirdWireModule.provideAudioSpaceEventsStreamIndexer(audioSpaceTable, clock); - - MultiSegmentTermDictionaryManager.Config termDictionaryConfig = - earlybirdWireModule.provideMultiSegmentTermDictionaryManagerConfig(); - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager = - earlybirdWireModule.provideMultiSegmentTermDictionaryManager( - termDictionaryConfig, - segmentManager, - earlybirdServerStats, - decider, - earlybirdIndexConfig.getCluster()); - - TermCountMonitor termCountMonitor = - earlybirdWireModule.provideTermCountMonitor( - segmentManager, earlybirdWireModule.provideTermCountMonitorScheduledExecutorFactory(), - earlybirdServerStats, - criticalExceptionHandler); - TweetCountMonitor tweetCountMonitor = - earlybirdWireModule.provideTweetCountMonitor( - segmentManager, earlybirdWireModule.provideTweetCountMonitorScheduledExecutorFactory(), - earlybirdServerStats, - criticalExceptionHandler); - - ScoringModelsManager scoringModelsManager = earlybirdWireModule.provideScoringModelsManager( - earlybirdServerStats, - earlybirdIndexConfig - ); - - TensorflowModelsManager tensorflowModelsManager = - earlybirdWireModule.provideTensorflowModelsManager( - earlybirdServerStats, - "tf_loader", - decider, - earlybirdIndexConfig - ); - - AuroraSchedulerClient schedulerClient = null; - AuroraInstanceKey auroraInstanceKey = EarlybirdConfig.getAuroraInstanceKey(); - if (auroraInstanceKey != null) { - schedulerClient = new AuroraSchedulerClient(auroraInstanceKey.getCluster()); - } - - UpdateableEarlybirdStateManager earlybirdStateManager = - earlybirdWireModule.provideUpdateableEarlybirdStateManager( - earlybirdIndexConfig, - dynamicPartitionConfig, - zkClients.stateClient, - schedulerClient, - earlybirdWireModule.provideStateUpdateManagerExecutorFactory(), - scoringModelsManager, - tensorflowModelsManager, - earlybirdServerStats, - new SearchDecider(decider), - criticalExceptionHandler); - - EarlybirdFuturePoolManager futurePoolManager = earlybirdWireModule.provideFuturePoolManager(); - EarlybirdFinagleServerManager finagleServerManager = - earlybirdWireModule.provideFinagleServerManager(criticalExceptionHandler); - - PartitionManager partitionManager = null; - if (EarlybirdIndexConfigUtil.isArchiveSearch()) { - partitionManager = buildArchivePartitionManager( - earlybirdWireModule, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - zkTryLockFactory, - earlybirdIndexConfig, - dynamicPartitionConfig, - segmentManager, - queryCacheManager, - earlybirdServerStats, - serverSetManager, - earlybirdWireModule.providePartitionManagerExecutorFactory(), - earlybirdWireModule.provideSimpleUserUpdateIndexerScheduledExecutorFactory(), - clock, - segmentSyncConfig, - criticalExceptionHandler); - } else { - 
LOG.info("Not creating PartitionManager"); - } - - EarlybirdSegmentFactory earlybirdSegmentFactory = new EarlybirdSegmentFactory( - earlybirdIndexConfig, - earlybirdWireModule.provideSearchIndexingMetricSet(), - tweetsSearcherStats, - clock); - - EarlybirdStartup earlybirdStartup = earlybirdWireModule.provideEarlybirdStartup( - partitionManager, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - audioSpaceEventsStreamIndexer, - dynamicPartitionConfig, - criticalExceptionHandler, - segmentManager, - multiSegmentTermDictionaryManager, - queryCacheManager, - zkTryLockFactory, - serverSetManager, - clock, - segmentSyncConfig, - earlybirdSegmentFactory, - earlybirdIndexConfig.getCluster(), - searchDecider); - - QualityFactor qualityFactor = earlybirdWireModule.provideQualityFactor( - decider, - earlybirdServerStats); - - EarlybirdServer earlybirdServer = new EarlybirdServer( - queryCacheManager, - zkClients.stateClient, - decider, - earlybirdIndexConfig, - dynamicPartitionConfig, - partitionManager, - segmentManager, - audioSpaceTable, - termCountMonitor, - tweetCountMonitor, - earlybirdStateManager, - futurePoolManager, - finagleServerManager, - serverSetManager, - warmUpManager, - earlybirdServerStats, - tweetsSearcherStats, - scoringModelsManager, - tensorflowModelsManager, - clock, - multiSegmentTermDictionaryManager, - earlybirdDarkProxy, - segmentSyncConfig, - earlybirdWireModule.provideQueryTimeoutFactory(), - earlybirdStartup, - qualityFactor, - earlybirdWireModule.provideSearchIndexingMetricSet()); - - earlybirdStateManager.setEarlybirdServer(earlybirdServer); - criticalExceptionHandler.setShutdownHook(earlybirdServer::shutdown); - - return earlybirdServer; - } - - private PartitionManager buildArchivePartitionManager( - EarlybirdWireModule earlybirdWireModule, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - ZooKeeperTryLockFactory zkTryLockFactory, - EarlybirdIndexConfig earlybirdIndexConfig, - DynamicPartitionConfig dynamicPartitionConfig, - SegmentManager segmentManager, - QueryCacheManager queryCacheManager, - SearchStatsReceiver searchStatsReceiver, - EarlybirdServerSetManager serverSetManager, - ScheduledExecutorServiceFactory partitionManagerExecutorServiceFactory, - ScheduledExecutorServiceFactory simpleUserUpdateIndexerExecutorFactory, - Clock clock, - SegmentSyncConfig segmentSyncConfig, - CriticalExceptionHandler criticalExceptionHandler) - throws IOException { - - Preconditions.checkState(earlybirdIndexConfig instanceof ArchiveEarlybirdIndexConfig); - LOG.info("Creating ArchiveSearchPartitionManager"); - return earlybirdWireModule.provideFullArchivePartitionManager( - zkTryLockFactory, - queryCacheManager, - segmentManager, - dynamicPartitionConfig, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - searchStatsReceiver, - (ArchiveEarlybirdIndexConfig) earlybirdIndexConfig, - serverSetManager, - partitionManagerExecutorServiceFactory, - simpleUserUpdateIndexerExecutorFactory, - earlybirdWireModule.provideSearchIndexingMetricSet(), - clock, - segmentSyncConfig, - criticalExceptionHandler); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.docx b/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.docx new file mode 100644 index 000000000..b68776709 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.java b/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.java deleted file mode 100644 index a6b67e021..000000000 --- a/src/java/com/twitter/search/earlybird/factory/EarlybirdWireModule.java +++ /dev/null @@ -1,901 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import java.io.IOException; -import java.lang.management.ManagementFactory; -import java.util.Optional; -import java.util.concurrent.ScheduledThreadPoolExecutor; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; -import com.sun.management.OperatingSystemMXBean; - -import org.apache.directory.api.util.Strings; -import org.apache.hadoop.fs.FileSystem; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.decider.Decider; -import com.twitter.finagle.stats.MetricsStatsReceiver; -import com.twitter.finagle.stats.StatsReceiver; -import com.twitter.search.common.aurora.AuroraSchedulerClient; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.file.FileUtils; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.metrics.SearchStatsReceiverImpl; -import com.twitter.search.common.partitioning.zookeeper.SearchZkClient; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.termination.QueryTimeoutFactory; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; -import com.twitter.search.common.util.io.kafka.ThriftDeserializer; -import com.twitter.search.common.util.ml.tensorflow_engine.TensorflowModelsManager; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.common.util.zookeeper.ZooKeeperProxy; -import com.twitter.search.earlybird.EarlybirdCPUQualityFactor; -import com.twitter.search.earlybird.EarlybirdDarkProxy; -import com.twitter.search.earlybird.EarlybirdFinagleServerManager; -import com.twitter.search.earlybird.EarlybirdFuturePoolManager; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.EarlybirdProductionFinagleServerManager; -import com.twitter.search.earlybird.EarlybirdServerSetManager; -import com.twitter.search.earlybird.EarlybirdWarmUpManager; -import com.twitter.search.earlybird.QualityFactor; -import com.twitter.search.earlybird.ServerSetMember; -import com.twitter.search.earlybird.UpdateableEarlybirdStateManager; -import com.twitter.search.earlybird.archive.ArchiveEarlybirdIndexConfig; -import com.twitter.search.earlybird.archive.ArchiveSearchPartitionManager; -import com.twitter.search.earlybird.common.CaughtUpMonitor; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import 
com.twitter.search.earlybird.common.userupdates.UserUpdatesChecker; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.MissingKafkaTopicException; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.ml.ScoringModelsManager; -import com.twitter.search.earlybird.partition.AudioSpaceEventsStreamIndexer; -import com.twitter.search.earlybird.partition.AudioSpaceTable; -import com.twitter.search.earlybird.partition.DynamicPartitionConfig; -import com.twitter.search.earlybird.partition.EarlybirdIndexFlusher; -import com.twitter.search.earlybird.partition.EarlybirdIndexLoader; -import com.twitter.search.earlybird.partition.EarlybirdKafkaConsumer; -import com.twitter.search.earlybird.partition.EarlybirdStartup; -import com.twitter.search.earlybird.partition.OptimizationAndFlushingCoordinationLock; -import com.twitter.search.earlybird.partition.TimeLimitedHadoopExistsCall; -import com.twitter.search.earlybird.partition.UserScrubGeoEventStreamIndexer; -import com.twitter.search.earlybird.partition.freshstartup.FreshStartupHandler; -import com.twitter.search.earlybird.partition.HdfsUtil; -import com.twitter.search.earlybird.partition.KafkaStartup; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.partition.PartitionManager; -import com.twitter.search.earlybird.partition.PartitionManagerStartup; -import com.twitter.search.earlybird.partition.PartitionWriter; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentSyncConfig; -import com.twitter.search.earlybird.partition.StartupUserEventIndexer; -import com.twitter.search.earlybird.partition.TweetCreateHandler; -import com.twitter.search.earlybird.partition.TweetUpdateHandler; -import com.twitter.search.earlybird.partition.UserUpdatesStreamIndexer; -import com.twitter.search.earlybird.querycache.QueryCacheConfig; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdAction; -import com.twitter.search.earlybird.util.EarlybirdDecider; -import com.twitter.search.earlybird.util.TermCountMonitor; -import com.twitter.search.earlybird.util.TweetCountMonitor; -import com.twitter.ubs.thriftjava.AudioSpaceBaseEvent; - -/** - * Production module that provides Earlybird components. - */ -public class EarlybirdWireModule { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdWireModule.class); - private static final int MAX_POLL_RECORDS = 1000; - - /** - * How many threads we will use for building up the query cache during startup. - * The number of threads will be set to 1 after this earlybird is current. - */ - private static final int QUERY_CACHE_NUM_WORKER_THREADS_AT_STARTUP = - EarlybirdConfig.getInt("query_cache_updater_startup_threads", 1); - - /** - * Scheduled executor service factory can be re-used in production. - * All the managers can share the same executor service factory. 
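The EarlybirdServerFactory javadoc earlier notes that tests supply mocks by passing a different wiring module; since EarlybirdWireModule's provide* methods are public instance methods, a hypothetical test subclass could override just the pieces it needs (TestDecider here is a made-up stand-in for a canned Decider):

public class TestEarlybirdWireModule extends EarlybirdWireModule {
  @Override
  public Decider provideDecider() {
    return new TestDecider();  // hypothetical decider returning fixed availabilities
  }
}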
- */ - private final ScheduledExecutorServiceFactory sharedExecutorServiceFactory = - new ScheduledExecutorServiceFactory(); - - private final SearchStatsReceiver sharedSearchStatsReceiver = new SearchStatsReceiverImpl(); - private final StatsReceiver sharedFinagleStatsReceiver = new MetricsStatsReceiver(); - - private final SearchIndexingMetricSet searchIndexingMetricSet = - new SearchIndexingMetricSet(sharedSearchStatsReceiver); - - private final EarlybirdSearcherStats tweetsSearcherStats = - new EarlybirdSearcherStats(sharedSearchStatsReceiver); - - private final CaughtUpMonitor indexCaughtUpMonitor = new CaughtUpMonitor("dl_index"); - - public CaughtUpMonitor provideIndexCaughtUpMonitor() { - return indexCaughtUpMonitor; - } - - private final CaughtUpMonitor kafkaIndexCaughtUpMonitor = new CaughtUpMonitor("kafka_index"); - - public CaughtUpMonitor provideKafkaIndexCaughtUpMonitor() { - return kafkaIndexCaughtUpMonitor; - } - - private final OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock = - new OptimizationAndFlushingCoordinationLock(); - - public OptimizationAndFlushingCoordinationLock provideOptimizationAndFlushingCoordinationLock() { - return optimizationAndFlushingCoordinationLock; - } - - public QueryTimeoutFactory provideQueryTimeoutFactory() { - return new QueryTimeoutFactory(); - } - - public static class ZooKeeperClients { - public ZooKeeperProxy discoveryClient; - public ZooKeeperProxy stateClient; - - public ZooKeeperClients() { - this( - SearchZkClient.getServiceDiscoveryZooKeeperClient(), - SearchZkClient.getSZooKeeperClient()); - } - - public ZooKeeperClients(ZooKeeperProxy discoveryClient, ZooKeeperProxy stateClient) { - this.discoveryClient = discoveryClient; - this.stateClient = stateClient; - } - } - - /** - * Provides the earlybird decider. - */ - public Decider provideDecider() { - return EarlybirdDecider.initialize(); - } - - /** - * Provides the set of ZooKeeper clients to be used by earlybird. - */ - public ZooKeeperClients provideZooKeeperClients() { - return new ZooKeeperClients(); - } - - /** - * Provides the query cache config. - */ - public QueryCacheConfig provideQueryCacheConfig(SearchStatsReceiver searchStatsReceiver) { - return new QueryCacheConfig(searchStatsReceiver); - } - - /** - * Provides the earlybird index config. - */ - public EarlybirdIndexConfig provideEarlybirdIndexConfig( - Decider decider, SearchIndexingMetricSet indexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler) { - return EarlybirdIndexConfigUtil.createEarlybirdIndexConfig(decider, indexingMetricSet, - criticalExceptionHandler); - } - - public DynamicPartitionConfig provideDynamicPartitionConfig() { - return new DynamicPartitionConfig(PartitionConfigUtil.initPartitionConfig()); - } - - /** - * Provides the segment manager to be used by this earlybird. 
- */ - public SegmentManager provideSegmentManager( - DynamicPartitionConfig dynamicPartitionConfig, - EarlybirdIndexConfig earlybirdIndexConfig, - SearchIndexingMetricSet partitionIndexingMetricSet, - EarlybirdSearcherStats searcherStats, - SearchStatsReceiver earlybirdServerStats, - UserUpdatesChecker userUpdatesChecker, - SegmentSyncConfig segmentSyncConfig, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - Clock clock, - CriticalExceptionHandler criticalExceptionHandler) { - return new SegmentManager( - dynamicPartitionConfig, - earlybirdIndexConfig, - partitionIndexingMetricSet, - searcherStats, - earlybirdServerStats, - userUpdatesChecker, - segmentSyncConfig, - userTable, - userScrubGeoMap, - clock, - EarlybirdConfig.getMaxSegmentSize(), - criticalExceptionHandler, - provideKafkaIndexCaughtUpMonitor()); - } - - public QueryCacheManager provideQueryCacheManager( - QueryCacheConfig config, - EarlybirdIndexConfig indexConfig, - int maxEnabledSegments, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - ScheduledExecutorServiceFactory queryCacheUpdaterScheduledExecutorFactory, - SearchStatsReceiver searchStatsReceiver, - EarlybirdSearcherStats searcherStats, - Decider decider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - return new QueryCacheManager(config, indexConfig, maxEnabledSegments, userTable, - userScrubGeoMap, queryCacheUpdaterScheduledExecutorFactory, searchStatsReceiver, - searcherStats, decider, criticalExceptionHandler, clock); - } - - public TermCountMonitor provideTermCountMonitor( - SegmentManager segmentManager, ScheduledExecutorServiceFactory executorServiceFactory, - SearchStatsReceiver searchStatsReceiver, - CriticalExceptionHandler criticalExceptionHandler) { - return new TermCountMonitor(segmentManager, executorServiceFactory, 500, TimeUnit.MILLISECONDS, - searchStatsReceiver, criticalExceptionHandler); - } - - public TweetCountMonitor provideTweetCountMonitor( - SegmentManager segmentManager, - ScheduledExecutorServiceFactory executorServiceFactory, - SearchStatsReceiver searchStatsReceiver, - CriticalExceptionHandler criticalExceptionHandler) { - return new TweetCountMonitor(segmentManager, executorServiceFactory, 500, - TimeUnit.MILLISECONDS, searchStatsReceiver, criticalExceptionHandler); - } - - /** - * Returns a manager that keeps track of earlybird's global state while it runs. 
- */ - public UpdateableEarlybirdStateManager provideUpdateableEarlybirdStateManager( - EarlybirdIndexConfig earlybirdIndexConfig, - DynamicPartitionConfig dynamicPartitionConfig, - ZooKeeperProxy zooKeeperClient, - AuroraSchedulerClient schedulerClient, - ScheduledExecutorServiceFactory executorServiceFactory, - ScoringModelsManager scoringModelsManager, - TensorflowModelsManager tensorflowModelsManager, - SearchStatsReceiver searchStatsReceiver, - SearchDecider searchDecider, - CriticalExceptionHandler criticalExceptionHandler) { - Clock clock = provideClockForStateManager(); - - return new UpdateableEarlybirdStateManager( - earlybirdIndexConfig, dynamicPartitionConfig, zooKeeperClient, schedulerClient, - executorServiceFactory, scoringModelsManager, tensorflowModelsManager, searchStatsReceiver, - searchDecider, criticalExceptionHandler, - clock); - } - - public Clock provideClockForStateManager() { - return this.provideClock(); - } - - public ScheduledExecutorServiceFactory providePartitionManagerExecutorFactory() { - return sharedExecutorServiceFactory; - } - - public ScheduledExecutorServiceFactory provideStateUpdateManagerExecutorFactory() { - return sharedExecutorServiceFactory; - } - - public ScheduledExecutorServiceFactory provideTermCountMonitorScheduledExecutorFactory() { - return sharedExecutorServiceFactory; - } - - public ScheduledExecutorServiceFactory provideTweetCountMonitorScheduledExecutorFactory() { - return sharedExecutorServiceFactory; - } - - /** - * Provides the ScheduledExecutorServiceFactory that will be used to schedule all query cache - * update tasks. - */ - public ScheduledExecutorServiceFactory provideQueryCacheUpdateTaskScheduledExecutorFactory() { - return new ScheduledExecutorServiceFactory() { - @Override - public QueryCacheUpdaterScheduledExecutorService build( - String threadNameFormat, boolean isDaemon) { - ScheduledThreadPoolExecutor threadpoolExecutor = - new ScheduledThreadPoolExecutor(QUERY_CACHE_NUM_WORKER_THREADS_AT_STARTUP, - buildThreadFactory(threadNameFormat, isDaemon)); - threadpoolExecutor.setMaximumPoolSize(QUERY_CACHE_NUM_WORKER_THREADS_AT_STARTUP); - threadpoolExecutor.setCorePoolSize(QUERY_CACHE_NUM_WORKER_THREADS_AT_STARTUP); - threadpoolExecutor.setExecuteExistingDelayedTasksAfterShutdownPolicy(false); - threadpoolExecutor.setContinueExistingPeriodicTasksAfterShutdownPolicy(false); - threadpoolExecutor.setRemoveOnCancelPolicy(true); - LOG.info("Starting query cache executor with {} thread.", - QUERY_CACHE_NUM_WORKER_THREADS_AT_STARTUP); - - return new QueryCacheUpdaterScheduledExecutorService( - threadpoolExecutor) { - @Override public void setWorkerPoolSizeAfterStartup() { - delegate.setCorePoolSize(1); - delegate.setMaximumPoolSize(1); - LOG.info("Reset query cache executor to be single threaded."); - } - }; - } - }; - } - - public ScheduledExecutorServiceFactory provideSimpleUserUpdateIndexerScheduledExecutorFactory() { - return sharedExecutorServiceFactory; - } - - /** - * Returns the manager that manages the pool of searcher threads. - */ - public EarlybirdFuturePoolManager provideFuturePoolManager() { - return new EarlybirdFuturePoolManager("SearcherWorker"); - } - - /** - * Returns the manager that manages all earlybird finagle servers (warm up and production). - */ - public EarlybirdFinagleServerManager provideFinagleServerManager( - CriticalExceptionHandler criticalExceptionHandler) { - return new EarlybirdProductionFinagleServerManager(criticalExceptionHandler); - } - - /** - * Creates the production serverset manager. 
- */ - public EarlybirdServerSetManager provideServerSetManager( - ZooKeeperProxy discoveryClient, - DynamicPartitionConfig dynamicPartitionConfig, - SearchStatsReceiver searchStatsReceiver, - int port, - String serverSetNamePrefix) { - return new EarlybirdServerSetManager( - searchStatsReceiver, - discoveryClient, - dynamicPartitionConfig.getCurrentPartitionConfig(), - port, - serverSetNamePrefix); - } - - /** - * Creates the warm up serverset manager. - */ - public EarlybirdWarmUpManager provideWarmUpManager( - ZooKeeperProxy discoveryClient, - DynamicPartitionConfig dynamicPartitionConfig, - SearchStatsReceiver searchStatsReceiver, - Decider decider, - Clock clock, - int port, - String serverSetNamePrefix) { - return new EarlybirdWarmUpManager( - new EarlybirdServerSetManager( - searchStatsReceiver, - discoveryClient, - dynamicPartitionConfig.getCurrentPartitionConfig(), - port, - serverSetNamePrefix), - dynamicPartitionConfig.getCurrentPartitionConfig(), - searchIndexingMetricSet, - decider, - clock); - } - - /** - * Returns a dark proxy that knows how to send dark traffic to the warm up earlybird serverset. - */ - public EarlybirdDarkProxy provideEarlybirdDarkProxy( - SearchDecider searchDecider, - StatsReceiver finagleStatsReceiver, - EarlybirdServerSetManager earlybirdServerSetManager, - EarlybirdWarmUpManager earlybirdWarmUpManager, - String clusterName) { - return new EarlybirdDarkProxy(searchDecider, - finagleStatsReceiver.scope("dark_proxy"), - earlybirdServerSetManager, - earlybirdWarmUpManager, - clusterName); - } - - - /** - * Returns the manager for all (non-Tensorflow) scoring models. - */ - public ScoringModelsManager provideScoringModelsManager( - SearchStatsReceiver serverStats, - EarlybirdIndexConfig earlybirdIndexConfig) { - boolean modelsEnabled = EarlybirdConfig.getBool("scoring_models_enabled", false); - if (!modelsEnabled) { - LOG.info("Scoring Models - Disabled in the config. Not loading any models."); - serverStats.getCounter("scoring_models_disabled_in_config").increment(); - return ScoringModelsManager.NO_OP_MANAGER; - } - - String hdfsNameNode = EarlybirdConfig.getString("scoring_models_namenode"); - String hdfsModelsPath = EarlybirdConfig.getString("scoring_models_basedir"); - try { - return ScoringModelsManager.create( - serverStats, hdfsNameNode, hdfsModelsPath, earlybirdIndexConfig.getSchema()); - } catch (IOException e) { - LOG.error("Scoring Models - Error creating ScoringModelsManager", e); - serverStats.getCounter("scoring_models_initialization_errors").increment(); - return ScoringModelsManager.NO_OP_MANAGER; - } - } - - /** - * Provides the manager for all Tensorflow models. - */ - public TensorflowModelsManager provideTensorflowModelsManager( - SearchStatsReceiver serverStats, - String statsPrefix, - Decider decider, - EarlybirdIndexConfig earlybirdIndexConfig) { - - boolean modelsEnabled = EarlybirdProperty.TF_MODELS_ENABLED.get(false); - - if (!modelsEnabled) { - LOG.info("Tensorflow Models - Disabled in the config. 
Not loading any models."); - serverStats.getCounter("tf_models_disabled_in_config").increment(); - return TensorflowModelsManager.createNoOp(statsPrefix); - } - - String modelsConfigPath = - Preconditions.checkNotNull(EarlybirdProperty.TF_MODELS_CONFIG_PATH.get()); - - - int intraOpThreads = Preconditions.checkNotNull(EarlybirdProperty.TF_INTRA_OP_THREADS.get(0)); - int interOpThreads = Preconditions.checkNotNull(EarlybirdProperty.TF_INTER_OP_THREADS.get(0)); - - TensorflowModelsManager.initTensorflowThreadPools(intraOpThreads, interOpThreads); - - return TensorflowModelsManager.createUsingConfigFile( - FileUtils.getFileHandle(modelsConfigPath), - true, - statsPrefix, - () -> DeciderUtil.isAvailableForRandomRecipient( - decider, "enable_tf_serve_models"), - () -> decider.isAvailable("enable_tf_load_models"), - earlybirdIndexConfig.getSchema()); - } - - public SearchStatsReceiver provideEarlybirdServerStatsReceiver() { - return sharedSearchStatsReceiver; - } - - public StatsReceiver provideFinagleStatsReceiver() { - return sharedFinagleStatsReceiver; - } - - public SearchIndexingMetricSet provideSearchIndexingMetricSet() { - return searchIndexingMetricSet; - } - - public EarlybirdSearcherStats provideTweetsSearcherStats() { - return tweetsSearcherStats; - } - - /** - * Provides the clock to be used by this earlybird. - */ - public Clock provideClock() { - return Clock.SYSTEM_CLOCK; - } - - /** - * Provides the config for the multi-segment term dictionary manager. - */ - public MultiSegmentTermDictionaryManager.Config provideMultiSegmentTermDictionaryManagerConfig() { - return new MultiSegmentTermDictionaryManager.Config( - Lists.newArrayList( - EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())); - } - - /** - * Provides the manager for the term dictionary that spans all segments. - */ - public MultiSegmentTermDictionaryManager provideMultiSegmentTermDictionaryManager( - MultiSegmentTermDictionaryManager.Config termDictionaryConfig, - SegmentManager segmentManager, - SearchStatsReceiver statsReceiver, - Decider decider, - EarlybirdCluster earlybirdCluster) { - return new MultiSegmentTermDictionaryManager( - termDictionaryConfig, segmentManager, statsReceiver, decider, earlybirdCluster); - } - - /** - * Returns the partition manager to be used by the archive earlybirds. 
- */ - public PartitionManager provideFullArchivePartitionManager( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - QueryCacheManager queryCacheManager, - SegmentManager segmentManager, - DynamicPartitionConfig dynamicPartitionConfig, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - SearchStatsReceiver searchStatsReceiver, - ArchiveEarlybirdIndexConfig earlybirdIndexConfig, - ServerSetMember serverSetMember, - ScheduledExecutorServiceFactory executorServiceFactory, - ScheduledExecutorServiceFactory userUpdateIndexerExecutorFactory, - SearchIndexingMetricSet earlybirdSearchIndexingMetricSet, - Clock clock, - SegmentSyncConfig segmentSyncConfig, - CriticalExceptionHandler criticalExceptionHandler) throws IOException { - - return new ArchiveSearchPartitionManager( - zooKeeperTryLockFactory, - queryCacheManager, - segmentManager, - dynamicPartitionConfig, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - searchStatsReceiver, - earlybirdIndexConfig, - serverSetMember, - executorServiceFactory, - userUpdateIndexerExecutorFactory, - earlybirdSearchIndexingMetricSet, - segmentSyncConfig, - clock, - criticalExceptionHandler); - } - - /** - * Provides the SegmentSyncConfig instance to be used by earlybird. - */ - public SegmentSyncConfig provideSegmentSyncConfig(EarlybirdCluster cluster) { - String scrubGen = null; - if (cluster == EarlybirdCluster.FULL_ARCHIVE) { - scrubGen = EarlybirdProperty.EARLYBIRD_SCRUB_GEN.get(); - LOG.info("The scrubGen provided from Aurora is: {}", scrubGen); - Preconditions.checkState(Strings.isNotEmpty(scrubGen)); - } - return new SegmentSyncConfig(Optional.ofNullable(scrubGen)); - } - - protected void storeEarlybirdStartupProducts( - TweetCreateHandler tweetCreateHandler, - PartitionWriter partitionWriter, - EarlybirdIndexFlusher earlybirdIndexFlusher - ) { - // TestWireModule wants to store these for further use. - } - - /** - * What directory are we going to load segments from on startup. - * - * When you're running loadtests or stagingN instances and they don't have a recent index - * flushed, it can take hours to generate a new index with a fresh startup. This slows - * down development. If the read_index_from_prod_location flag is set to true, we will read - * the index from the location where prod instances are flushing their index to. - * Unset it if you want to generate your own index. - * - * @return a string with the directory. - */ - public String getIndexLoadingDirectory() { - boolean readIndexFromProdLocation = EarlybirdProperty.READ_INDEX_FROM_PROD_LOCATION.get(false); - String environment = EarlybirdProperty.ENV.get("no_env_specified"); // default value for tests. - String readIndexDir = EarlybirdProperty.HDFS_INDEX_SYNC_DIR.get(); - - if (readIndexFromProdLocation) { - LOG.info("Will attempt to read index from prod locations"); - LOG.info("Index directory provided: {}", readIndexDir); - // Replacing the path is a bit hacky, but it works ok. - readIndexDir = readIndexDir.replace("/" + environment + "/", "/prod/"); - LOG.info("Will instead use index directory: {}", readIndexDir); - } - - return readIndexDir; - } - - /** - * Indexer for audio space events. 
- */ - public AudioSpaceEventsStreamIndexer provideAudioSpaceEventsStreamIndexer( - AudioSpaceTable audioSpaceTable, - Clock clock) { - try { - return new AudioSpaceEventsStreamIndexer( - FinagleKafkaClientUtils.newKafkaConsumerForAssigning( - "", - new ThriftDeserializer<>(AudioSpaceBaseEvent.class), - "", - 20 - ), audioSpaceTable, clock); - } catch (MissingKafkaTopicException ex) { - LOG.error("Missing kafka stream", ex); - return null; - } - } - - /** - * Returns a class to start the Earlybird. See {@link EarlybirdStartup}. - */ - public EarlybirdStartup provideEarlybirdStartup( - PartitionManager partitionManager, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - AudioSpaceEventsStreamIndexer audioSpaceEventsStreamIndexer, - DynamicPartitionConfig dynamicPartitionConfig, - CriticalExceptionHandler criticalExceptionHandler, - SegmentManager segmentManager, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - QueryCacheManager queryCacheManager, - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - ServerSetMember serverSetMember, - Clock clock, - SegmentSyncConfig segmentSyncConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - EarlybirdCluster cluster, - SearchDecider decider) throws IOException { - if (cluster == EarlybirdCluster.FULL_ARCHIVE) { - return new PartitionManagerStartup(clock, partitionManager); - } - - // Check that the earlybird name is what we're expecting so we can build the kafka topics. - String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get(); - Preconditions.checkArgument("earlybird-realtime".equals(earlybirdName) - || "earlybird-protected".equals(earlybirdName) - || "earlybird-realtime-exp0".equals(earlybirdName) - || "earlybird-realtime_cg".equals(earlybirdName)); - - StartupUserEventIndexer startupUserEventIndexer = new StartupUserEventIndexer( - provideSearchIndexingMetricSet(), - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - segmentManager, - clock); - - // Coordinate leaving the serverset to flush segments to HDFS. 
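The wiring that follows pairs a ZooKeeper try-lock with serverset membership so that a replica drops out of rotation only while its flush (and, further down, rebuild/GC) action is running. A minimal sketch of that coordination pattern, using stand-in interfaces rather than the real CoordinatedEarlybirdAction and ServerSetMember APIs:

interface ClusterTryLock { boolean tryAcquire(); void release(); }
interface ServerSetMembership { void leave(String reason); void join(String reason); }

final class CoordinatedFlushSketch {
  /** Runs the expensive action only if the cluster-wide lock is free, leaving the serverset for its duration. */
  static void runCoordinated(ClusterTryLock lock, ServerSetMembership membership, Runnable expensiveAction) {
    if (!lock.tryAcquire()) {
      return; // another replica holds the lock; try again on the next scheduled pass
    }
    try {
      membership.leave("segment_flusher");  // stop taking traffic while the flush runs
      expensiveAction.run();                // e.g. uploading the segment to HDFS
    } finally {
      membership.join("segment_flusher");
      lock.release();
    }
  }
}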
- CoordinatedEarlybirdAction actionCoordinator = new CoordinatedEarlybirdAction( - zooKeeperTryLockFactory, - "segment_flusher", - dynamicPartitionConfig, - serverSetMember, - criticalExceptionHandler, - segmentSyncConfig); - actionCoordinator.setShouldSynchronize(true); - - FileSystem hdfsFileSystem = HdfsUtil.getHdfsFileSystem(); - EarlybirdIndexFlusher earlybirdIndexFlusher = new EarlybirdIndexFlusher( - actionCoordinator, - hdfsFileSystem, - EarlybirdProperty.HDFS_INDEX_SYNC_DIR.get(), - segmentManager, - dynamicPartitionConfig.getCurrentPartitionConfig(), - clock, - new TimeLimitedHadoopExistsCall(hdfsFileSystem), - provideOptimizationAndFlushingCoordinationLock()); - - String baseTopicName = "search_ingester_%s_events_%s_%s"; - - String earlybirdType; - - if ("earlybird-protected".equals(earlybirdName)) { - earlybirdType = "protected"; - } else if ("earlybird-realtime_cg".equals(earlybirdName)) { - earlybirdType = "realtime_cg"; - } else { - earlybirdType = "realtime"; - } - - String tweetTopicName = String.format( - baseTopicName, - "indexing", - earlybirdType, - EarlybirdProperty.KAFKA_ENV.get()); - - String updateTopicName = String.format( - baseTopicName, - "update", - earlybirdType, - EarlybirdProperty.KAFKA_ENV.get()); - - LOG.info("Tweet topic: {}", tweetTopicName); - LOG.info("Update topic: {}", updateTopicName); - - TopicPartition tweetTopic = new TopicPartition( - tweetTopicName, - dynamicPartitionConfig.getCurrentPartitionConfig().getIndexingHashPartitionID()); - TopicPartition updateTopic = new TopicPartition( - updateTopicName, - dynamicPartitionConfig.getCurrentPartitionConfig().getIndexingHashPartitionID()); - - EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory = - provideEarlybirdKafkaConsumersFactory(); - FreshStartupHandler freshStartupHandler = new FreshStartupHandler( - clock, - earlybirdKafkaConsumersFactory, - tweetTopic, - updateTopic, - segmentManager, - EarlybirdConfig.getMaxSegmentSize(), - EarlybirdConfig.getLateTweetBuffer(), - criticalExceptionHandler - ); - - TweetUpdateHandler updateHandler = new TweetUpdateHandler(segmentManager); - - CoordinatedEarlybirdAction postOptimizationRebuilds = new CoordinatedEarlybirdAction( - zooKeeperTryLockFactory, - "post_optimization_rebuilds", - dynamicPartitionConfig, - serverSetMember, - criticalExceptionHandler, - segmentSyncConfig - ); - postOptimizationRebuilds.setShouldSynchronize(true); - CoordinatedEarlybirdAction gcAction = new CoordinatedEarlybirdAction( - zooKeeperTryLockFactory, - "gc_before_optimization", - dynamicPartitionConfig, - serverSetMember, - criticalExceptionHandler, - segmentSyncConfig - ); - gcAction.setShouldSynchronize(true); - - TweetCreateHandler createHandler = new TweetCreateHandler( - segmentManager, - provideSearchIndexingMetricSet(), - criticalExceptionHandler, - multiSegmentTermDictionaryManager, - queryCacheManager, - postOptimizationRebuilds, - gcAction, - EarlybirdConfig.getLateTweetBuffer(), - EarlybirdConfig.getMaxSegmentSize(), - provideKafkaIndexCaughtUpMonitor(), - provideOptimizationAndFlushingCoordinationLock()); - - PartitionWriter partitionWriter = new PartitionWriter( - createHandler, - updateHandler, - criticalExceptionHandler, - PenguinVersion.versionFromByteValue(EarlybirdConfig.getPenguinVersionByte()), - clock); - - KafkaConsumer rawKafkaConsumer = - earlybirdKafkaConsumersFactory.createKafkaConsumer( - "earlybird_tweet_kafka_consumer"); - - EarlybirdKafkaConsumer earlybirdKafkaConsumer = provideKafkaConsumer( - criticalExceptionHandler, - 
rawKafkaConsumer, - tweetTopic, - updateTopic, - partitionWriter, - earlybirdIndexFlusher); - - EarlybirdIndexLoader earlybirdIndexLoader = new EarlybirdIndexLoader( - hdfsFileSystem, - getIndexLoadingDirectory(), // See SEARCH-32839 - EarlybirdProperty.ENV.get("default_env_value"), - dynamicPartitionConfig.getCurrentPartitionConfig(), - earlybirdSegmentFactory, - segmentSyncConfig, - clock); - - this.storeEarlybirdStartupProducts( - createHandler, - partitionWriter, - earlybirdIndexFlusher - ); - - return new KafkaStartup( - segmentManager, - earlybirdKafkaConsumer, - startupUserEventIndexer, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - audioSpaceEventsStreamIndexer, - queryCacheManager, - earlybirdIndexLoader, - freshStartupHandler, - provideSearchIndexingMetricSet(), - multiSegmentTermDictionaryManager, - criticalExceptionHandler, - decider - ); - } - - public QualityFactor provideQualityFactor( - Decider decider, - SearchStatsReceiver searchStatsReceiver - ) { - return new EarlybirdCPUQualityFactor(decider, - ManagementFactory.getPlatformMXBean(OperatingSystemMXBean.class), - searchStatsReceiver); - } - - /** - * Returns a new UserUpdatesKafkaConsumer to read user updates. - */ - public UserUpdatesStreamIndexer provideUserUpdatesKafkaConsumer( - SegmentManager segmentManager) { - try { - return new UserUpdatesStreamIndexer( - UserUpdatesStreamIndexer.provideKafkaConsumer(), - EarlybirdProperty.USER_UPDATES_KAFKA_TOPIC.get(), - provideSearchIndexingMetricSet(), - segmentManager); - } catch (MissingKafkaTopicException ex) { - // Yes, it will crash the server. We've never seen this topic missing, but - // we've seen some others, so we had to build this functionality in the - // constructor. If one day this one goes missing, we'll have to figure out - // how to handle it. For now, we crash. - throw new RuntimeException(ex); - } - } - - /** - * Returns a new UserScrubGeosKafkaConsumer to read geo scrubbing events. - */ - public UserScrubGeoEventStreamIndexer provideUserScrubGeoEventKafkaConsumer( - SegmentManager segmentManager) { - try { - return new UserScrubGeoEventStreamIndexer( - UserScrubGeoEventStreamIndexer.provideKafkaConsumer(), - EarlybirdProperty.USER_SCRUB_GEO_KAFKA_TOPIC.get(), - provideSearchIndexingMetricSet(), - segmentManager); - } catch (MissingKafkaTopicException ex) { - /** - * See {@link #provideUserUpdatesKafkaConsumer} - */ - throw new RuntimeException(ex); - } - } - - /** - * Returns a new ProductionEarlybirdKafkaConsumer to read ThriftVersionedEvents. - */ - public EarlybirdKafkaConsumersFactory provideEarlybirdKafkaConsumersFactory() { - return new ProductionEarlybirdKafkaConsumersFactory( - EarlybirdProperty.KAFKA_PATH.get(), - MAX_POLL_RECORDS - ); - } - - /** - * Returns a class to read Tweets in the Earlybird. See {@link EarlybirdKafkaConsumer}. 
- */ - public EarlybirdKafkaConsumer provideKafkaConsumer( - CriticalExceptionHandler criticalExceptionHandler, - KafkaConsumer rawKafkaConsumer, - TopicPartition tweetTopic, - TopicPartition updateTopic, - PartitionWriter partitionWriter, - EarlybirdIndexFlusher earlybirdIndexFlusher - ) { - return new EarlybirdKafkaConsumer( - rawKafkaConsumer, - provideSearchIndexingMetricSet(), - criticalExceptionHandler, - partitionWriter, - tweetTopic, - updateTopic, - earlybirdIndexFlusher, - provideKafkaIndexCaughtUpMonitor()); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.docx b/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.docx new file mode 100644 index 000000000..731ed2942 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.java b/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.java deleted file mode 100644 index 3a183a219..000000000 --- a/src/java/com/twitter/search/earlybird/factory/PartitionConfigUtil.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.config.TierConfig; -import com.twitter.search.earlybird.config.TierInfo; -import com.twitter.search.earlybird.partition.PartitionConfig; - -public final class PartitionConfigUtil { - private static final Logger LOG = LoggerFactory.getLogger(PartitionConfigUtil.class); - - private PartitionConfigUtil() { - } - - /** - * Initiate PartitionConfig for earlybirds running on Aurora - */ - public static PartitionConfig initPartitionConfigForAurora(int numOfInstances) { - String tier = EarlybirdProperty.EARLYBIRD_TIER.get(); - int partitionId = EarlybirdProperty.PARTITION_ID.get(); - int replicaId = EarlybirdProperty.REPLICA_ID.get(); - if (tier.equals(PartitionConfig.DEFAULT_TIER_NAME)) { - // realtime or protected earlybird - return new PartitionConfig( - partitionId, - EarlybirdProperty.SERVING_TIMESLICES.get(), - replicaId, - numOfInstances, - EarlybirdProperty.NUM_PARTITIONS.get()); - } else { - // archive earlybird - TierInfo tierInfo = TierConfig.getTierInfo(tier); - return new PartitionConfig(tier, tierInfo.getDataStartDate(), tierInfo.getDataEndDate(), - partitionId, tierInfo.getMaxTimeslices(), replicaId, numOfInstances, - tierInfo.getNumPartitions()); - } - } - - /** - * Tries to create a new PartitionConfig instance based on the Aurora flags - */ - public static PartitionConfig initPartitionConfig() { - return initPartitionConfigForAurora(EarlybirdProperty.NUM_INSTANCES.get()); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.docx b/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.docx new file mode 100644 index 000000000..c6566e894 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.java b/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.java deleted file mode 100644 index e024f27ee..000000000 --- a/src/java/com/twitter/search/earlybird/factory/ProductionEarlybirdKafkaConsumersFactory.java +++ /dev/null @@ -1,41 +0,0 @@ -package 
com.twitter.search.earlybird.factory; - -import org.apache.kafka.clients.consumer.KafkaConsumer; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.util.io.kafka.CompactThriftDeserializer; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; - -/** - * Responsible for creating kafka consumers. - */ -public class ProductionEarlybirdKafkaConsumersFactory implements EarlybirdKafkaConsumersFactory { - private final String kafkaPath; - private final int defaultMaxPollRecords; - - ProductionEarlybirdKafkaConsumersFactory(String kafkaPath, int defaultMaxPollRecords) { - this.kafkaPath = kafkaPath; - this.defaultMaxPollRecords = defaultMaxPollRecords; - } - - /** - * Create a kafka consumer with set maximum of records to be polled. - */ - @Override - public KafkaConsumer createKafkaConsumer( - String clientID, int maxPollRecords) { - return FinagleKafkaClientUtils.newKafkaConsumerForAssigning( - kafkaPath, - new CompactThriftDeserializer<>(ThriftVersionedEvents.class), - clientID, - maxPollRecords); - } - - /** - * Create a kafka consumer with default records to be polled. - */ - @Override - public KafkaConsumer createKafkaConsumer(String clientID) { - return createKafkaConsumer(clientID, defaultMaxPollRecords); - } -} diff --git a/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.docx b/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.docx new file mode 100644 index 000000000..d58651925 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.docx differ diff --git a/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.java b/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.java deleted file mode 100644 index ff0619441..000000000 --- a/src/java/com/twitter/search/earlybird/factory/QueryCacheUpdaterScheduledExecutorService.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.earlybird.factory; - -import java.util.concurrent.Callable; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.common.util.concurrent.ForwardingExecutorService; - -/** - * This delegate type is intended for QueryCacheUpdater because it uses multiple threads to - * create query cache during startup and then switch later to use single thread to update the - * cache. - */ -public abstract class QueryCacheUpdaterScheduledExecutorService - extends ForwardingExecutorService implements ScheduledExecutorService { - public QueryCacheUpdaterScheduledExecutorService(T executor) { - super(executor); - } - - /** - * Sets the number of worker threads in this executor service to an appropriate value after the - * earlybird startup has finished. While earlybird is starting up, we might want this executor - * service to have more threads, in order to parallelize more some start up tasks. But once - * earlybird is up, it might make sense to lower the number of worker threads. 
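 * (In the production wiring, provideQueryCacheUpdateTaskScheduledExecutorFactory in EarlybirdWireModule implements this by shrinking the backing ScheduledThreadPoolExecutor's core and maximum pool size from query_cache_updater_startup_threads worker threads down to 1.)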
- */ - public abstract void setWorkerPoolSizeAfterStartup(); - - @Override - public ScheduledFuture schedule(Runnable command, long delay, TimeUnit unit) { - return delegate.schedule(command, delay, unit); - } - - @Override - public ScheduledFuture scheduleAtFixedRate( - Runnable command, long initialDelay, long period, TimeUnit unit) { - return delegate.scheduleAtFixedRate(command, initialDelay, period, unit); - } - - @Override - public ScheduledFuture scheduleWithFixedDelay( - Runnable command, long initialDelay, long delay, TimeUnit unit) { - return delegate.scheduleWithFixedDelay(command, initialDelay, delay, unit); - } - - @Override - public ScheduledFuture schedule(Callable callable, long delay, TimeUnit unit) { - return delegate.schedule(callable, delay, unit); - } - - @VisibleForTesting - public T getDelegate() { - return delegate; - } -} diff --git a/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.docx b/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.docx new file mode 100644 index 000000000..e20395de3 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.java b/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.java deleted file mode 100644 index 4e9ef4f7c..000000000 --- a/src/java/com/twitter/search/earlybird/index/AbstractInMemoryTimeMapper.java +++ /dev/null @@ -1,83 +0,0 @@ -package com.twitter.search.earlybird.index; - -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; -import com.twitter.search.core.earlybird.index.util.SearchSortUtils; -import com.twitter.search.earlybird.search.queries.SinceUntilFilter; - -public abstract class AbstractInMemoryTimeMapper implements TimeMapper { - // Reverse map: timestamp to first doc ID seen with that timestamp. - // This is two arrays: the timestamps (sorted), and the doc ids. - protected final IntBlockPool reverseMapTimes; - protected final IntBlockPool reverseMapIds; - protected volatile int reverseMapLastIndex; - - public AbstractInMemoryTimeMapper() { - this.reverseMapTimes = new IntBlockPool(ILLEGAL_TIME, "time_mapper_times"); - this.reverseMapIds = new IntBlockPool(ILLEGAL_TIME, "time_mapper_ids"); - this.reverseMapLastIndex = -1; - } - - protected AbstractInMemoryTimeMapper(int reverseMapLastIndex, - IntBlockPool reverseMapTimes, - IntBlockPool reverseMapIds) { - this.reverseMapTimes = reverseMapTimes; - this.reverseMapIds = reverseMapIds; - this.reverseMapLastIndex = reverseMapLastIndex; - } - - @Override - public final int getLastTime() { - return reverseMapLastIndex == -1 ? ILLEGAL_TIME : reverseMapTimes.get(reverseMapLastIndex); - } - - @Override - public final int getFirstTime() { - return reverseMapLastIndex == -1 ? ILLEGAL_TIME : reverseMapTimes.get(0); - } - - @Override - public final int findFirstDocId(int timeSeconds, int smallestDocID) { - if (timeSeconds == SinceUntilFilter.NO_FILTER || reverseMapLastIndex == -1) { - return smallestDocID; - } - - final int index = SearchSortUtils.binarySearch( - new IntArrayComparator(), 0, reverseMapLastIndex, timeSeconds, false); - - if (index == reverseMapLastIndex && reverseMapTimes.get(index) < timeSeconds) { - // Special case for out of bounds time. 
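// (This branch is taken when timeSeconds is newer than the newest recorded timestamp, so no entry
// in the reverse map can satisfy the filter; fall back to the caller-provided smallestDocID.)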
- return smallestDocID; - } - - return reverseMapIds.get(index); - } - - protected abstract void setTime(int docID, int timeSeconds); - - protected void doAddMapping(int docID, int timeSeconds) { - setTime(docID, timeSeconds); - int lastTime = getLastTime(); - if (timeSeconds > lastTime) { - // Found a timestamp newer than any timestamp we've seen before. - // Add a reverse mapping to this tweet (the first seen with this timestamp). - // - // When indexing out of order tweets, we could have gaps in the timestamps recorded in - // reverseMapTimes. For example, if we get 3 tweets with timestamp T0, T0 + 5, T0 + 3, then we - // will only record T0 and T0 + 5 in reverseMapTimes. However, this should not be an issue, - // because reverseMapTimes is only used by findFirstDocId(), and it's OK for that method to - // return a smaller doc ID than strictly necessary (in this case, findFirstDocId(T0 + 3) will - // return the doc ID of the second tweet, instead of returning the doc ID of the third tweet). - reverseMapTimes.add(timeSeconds); - reverseMapIds.add(docID); - reverseMapLastIndex++; - } - } - - private class IntArrayComparator implements SearchSortUtils.Comparator { - @Override - public int compare(int index, Integer value) { - return Integer.compare(reverseMapTimes.get(index), value); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.docx b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.docx new file mode 100644 index 000000000..5bc055b3d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.java b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.java deleted file mode 100644 index 9b7770de6..000000000 --- a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTimeMapper.java +++ /dev/null @@ -1,146 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.DocIdSetIterator; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.util.analysis.IntTermAttributeImpl; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; - -/** - * A few caveats when using this class: - * - This class only supports in-order createdAt! - * - Before actually using this class, one must call prepareToRead() with a Lucene AtomicReader - * - prepareToRead() will load docID to createdAt mapping into memory, if not already done. - */ -public class DocValuesBasedTimeMapper implements TimeMapper { - private LeafReader reader; - private ColumnStrideFieldIndex docValues; - - protected int minTimestamp = ILLEGAL_TIME; - protected int maxTimestamp = ILLEGAL_TIME; - - /** - * When indexing finishes, this method should be called with a index reader that - * can see all documents. 
- * @param leafReader Lucene index reader used to access "TweetID" to "createdAt" mapping. - */ - public void initializeWithLuceneReader(LeafReader leafReader, ColumnStrideFieldIndex csf) - throws IOException { - reader = Preconditions.checkNotNull(leafReader); - docValues = Preconditions.checkNotNull(csf); - - // Find the min and max timestamps. - // See SEARCH-5534 - // In the archive, tweets are always sorted in descending order by tweet ID, but - // that does not mean that the documents are necessarily sorted by time. We've observed tweet ID - // generation be decoupled from timestamp creation (i.e. a larger tweet ID having a smaller - // created_at time). - minTimestamp = Integer.MAX_VALUE; - maxTimestamp = Integer.MIN_VALUE; - - NumericDocValues onDiskDocValues = reader.getNumericDocValues( - EarlybirdFieldConstants.EarlybirdFieldConstant.CREATED_AT_CSF_FIELD.getFieldName()); - for (int i = 0; i < reader.maxDoc(); ++i) { - Preconditions.checkArgument(onDiskDocValues.advanceExact(i)); - int timestamp = (int) onDiskDocValues.longValue(); - docValues.setValue(i, timestamp); - - if (timestamp < minTimestamp) { - minTimestamp = timestamp; - } - if (timestamp > maxTimestamp) { - maxTimestamp = timestamp; - } - } - } - - @Override - public int getLastTime() { - return maxTimestamp; - } - - @Override - public int getFirstTime() { - return minTimestamp; - } - - @Override - public int getTime(int docID) { - if (docID < 0 || docID > reader.maxDoc()) { - return ILLEGAL_TIME; - } - return (int) docValues.get(docID); - } - - @Override - public int findFirstDocId(int timeSeconds, int smallestDocID) throws IOException { - // In the full archive, the smallest doc id corresponds to largest timestamp. - if (timeSeconds > maxTimestamp) { - return smallestDocID; - } - if (timeSeconds < minTimestamp) { - return reader.maxDoc() - 1; - } - - int docId = DocValuesHelper.getLargestDocIdWithCeilOfValue( - reader, - EarlybirdFieldConstants.EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName(), - IntTermAttributeImpl.copyIntoNewBytesRef(timeSeconds)); - if (docId == DocIdSetIterator.NO_MORE_DOCS) { - return ILLEGAL_TIME; - } - - return docId; - } - - @Override - public TimeMapper optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) { - // DocValuesBasedTimerMapper instances are not flushed or loaded, - // so their optimization is a no-op. - return this; - } - - @Override - public Flushable.Handler getFlushHandler() { - // EarlybirdIndexSegmentData will still try to flush the DocValuesBasedTimeMapper for the - // respective segment, so we need to pass in a DocValuesBasedTimeMapper instance to this - // flusher: otherwise, Flushable.Handler.flush() will throw a NullPointerException. - return new FlushHandler(new DocValuesBasedTimeMapper()); - } - - // Full archive earlybirds don't actually flush or load the DocValuesBasedTimeMapper. This is - // why doFlush() is a no-op, and doLoad() returns a new DocValuesBasedTimeMapper instance - // (initializeWithLuceneReader() will be called at load time to initialize this new - // DocValuesBasedTimeMapper instance). 
- public static class FlushHandler extends Flushable.Handler { - public FlushHandler() { - super(); - } - - public FlushHandler(DocValuesBasedTimeMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) { - } - - @Override - protected DocValuesBasedTimeMapper doLoad(FlushInfo flushInfo, DataDeserializer in) { - return new DocValuesBasedTimeMapper(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.docx b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.docx new file mode 100644 index 000000000..4b5ebef79 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.java b/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.java deleted file mode 100644 index 6fe1cee4d..000000000 --- a/src/java/com/twitter/search/earlybird/index/DocValuesBasedTweetIDMapper.java +++ /dev/null @@ -1,149 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.DocIdSetIterator; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.util.analysis.SortableLongTermAttributeImpl; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; - -/** - * A few caveats when using this class: - * - Before actually using this class, one must call prepareToRead() with a Lucene AtomicReader - * - prepareToRead() will load docID to tweetID mapping into memory, if not already done. - */ -public class DocValuesBasedTweetIDMapper extends TweetIDMapper implements Flushable { - private LeafReader reader; - private ColumnStrideFieldIndex docValues; - - /** - * When indexing finishes, this method should be called with a index reader that - * can see all documents. - * @param leafReader Lucene index reader used to access TweetID to internal ID mapping - */ - public void initializeWithLuceneReader(LeafReader leafReader, ColumnStrideFieldIndex csf) - throws IOException { - reader = Preconditions.checkNotNull(leafReader); - docValues = Preconditions.checkNotNull(csf); - - NumericDocValues onDiskDocValues = reader.getNumericDocValues( - EarlybirdFieldConstants.EarlybirdFieldConstant.ID_CSF_FIELD.getFieldName()); - for (int i = 0; i < reader.maxDoc(); ++i) { - Preconditions.checkArgument(onDiskDocValues.advanceExact(i)); - docValues.setValue(i, onDiskDocValues.longValue()); - } - - // In the archive, tweets are always sorted in descending order of tweet ID. 
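// (So doc ID 0 holds the largest tweet ID and doc ID maxDoc() - 1 holds the smallest, which is
// why the min and max below are read from opposite ends of the doc ID range.)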
- setMinTweetID(docValues.get(reader.maxDoc() - 1)); - setMaxTweetID(docValues.get(0)); - setMinDocID(0); - setMaxDocID(reader.maxDoc() - 1); - setNumDocs(reader.maxDoc()); - } - - @Override - public int getDocID(long tweetID) throws IOException { - int docId = DocValuesHelper.getFirstDocIdWithValue( - reader, - EarlybirdFieldConstants.EarlybirdFieldConstant.ID_FIELD.getFieldName(), - SortableLongTermAttributeImpl.copyIntoNewBytesRef(tweetID)); - if (docId == DocIdSetIterator.NO_MORE_DOCS) { - return ID_NOT_FOUND; - } - return docId; - } - - @Override - protected int getNextDocIDInternal(int docID) { - // The doc IDs are consecutive and TweetIDMapper already checked the boundary conditions. - return docID + 1; - } - - @Override - protected int getPreviousDocIDInternal(int docID) { - // The doc IDs are consecutive and TweetIDMapper already checked the boundary conditions. - return docID - 1; - } - - @Override - public long getTweetID(int internalID) { - if (internalID < 0 || internalID > getMaxDocID()) { - return ID_NOT_FOUND; - } - return docValues.get(internalID); - } - - @Override - protected int addMappingInternal(long tweetID) { - throw new UnsupportedOperationException( - "ArchiveTweetIDMapper should be written through Lucene instead of TweetIDMappingWriter"); - } - - @Override - protected final int findDocIDBoundInternal(long tweetID, - boolean findMaxDocID) throws IOException { - // TermsEnum has a seekCeil() method, but doesn't have a seekFloor() method, so the best we can - // do here is ignore findLow and always return the ceiling if the tweet ID cannot be found. - // However, in practice, we do a seekExact() in both cases: see the inner classes in - // com.twitter.search.core.earlybird.index.inverted.RealtimeIndexTerms. - int docId = DocValuesHelper.getLargestDocIdWithCeilOfValue( - reader, - EarlybirdFieldConstants.EarlybirdFieldConstant.ID_FIELD.getFieldName(), - SortableLongTermAttributeImpl.copyIntoNewBytesRef(tweetID)); - if (docId == DocIdSetIterator.NO_MORE_DOCS) { - return ID_NOT_FOUND; - } - - // The docId is the upper bound of the search, so if we want the lower bound, - // because doc IDs are dense, we subtract one. - return findMaxDocID ? docId : docId - 1; - } - - @Override - public DocIDToTweetIDMapper optimize() { - // DocValuesBasedTweetIDMapper instances are not flushed or loaded, - // so their optimization is a no-op. - return this; - } - - @Override - public Flushable.Handler getFlushHandler() { - // EarlybirdIndexSegmentData will still try to flush the DocValuesBasedTweetIDMapper - // for the respective segment, so we need to pass in a DocValuesBasedTweetIDMapper instance to - // this flusher: otherwise, Flushable.Handler.flush() will throw a NullPointerException. - return new FlushHandler(new DocValuesBasedTweetIDMapper()); - } - - // Full archive earlybirds don't actually flush or load the DocValuesBasedTweetIDMapper. This is - // why doFlush() is a no-op, and doLoad() returns a new DocValuesBasedTweetIDMapper instance - // (initializeWithLuceneReader() will be called at load time to initialize this new - // DocValuesBasedTweetIDMapper instance). 
- public static class FlushHandler extends Flushable.Handler { - public FlushHandler() { - super(); - } - - public FlushHandler(DocValuesBasedTweetIDMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) { - } - - @Override - protected DocValuesBasedTweetIDMapper doLoad(FlushInfo flushInfo, DataDeserializer in) { - return new DocValuesBasedTweetIDMapper(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesHelper.docx b/src/java/com/twitter/search/earlybird/index/DocValuesHelper.docx new file mode 100644 index 000000000..af4f65ef4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/DocValuesHelper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/DocValuesHelper.java b/src/java/com/twitter/search/earlybird/index/DocValuesHelper.java deleted file mode 100644 index 417a3e640..000000000 --- a/src/java/com/twitter/search/earlybird/index/DocValuesHelper.java +++ /dev/null @@ -1,70 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; - -public final class DocValuesHelper { - private DocValuesHelper() { - } - - /** - * Reverse lookup. Given a value, returns the first doc ID with this value. This requires a field - * that indexes the values. - * - * @param reader The reader to use to look up field values. - * @param value The value to lookup. - * @param indexField The field containing an index of the values. - */ - public static int getFirstDocIdWithValue( - LeafReader reader, String indexField, BytesRef value) throws IOException { - TermsEnum termsEnum = getTermsEnum(reader, indexField); - if (termsEnum == null || !termsEnum.seekExact(value)) { - return DocIdSetIterator.NO_MORE_DOCS; - } - - DocIdSetIterator docsIterator = termsEnum.postings(null); - return docsIterator.nextDoc(); - } - - /** - * Reverse lookup. Same as getFirstDocIdWithValue(), but if no document with the given value - * exists, the next bigger value is used for looking up the first doc ID. - * - * If there are multiple documents that match the value, all documents will be scanned, and the - * largest doc ID that matches will be returned. - * - * @param reader The reader to use to look up field values. - * @param value The value to lookup. - * @param indexField The field containing an index of the values. 
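 * @return the largest doc ID among the postings of the smallest indexed value that is greater
 *         than or equal to the given value, or DocIdSetIterator.NO_MORE_DOCS if no such value
 *         exists in the index.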
- */ - public static int getLargestDocIdWithCeilOfValue( - LeafReader reader, String indexField, BytesRef value) throws IOException { - TermsEnum termsEnum = getTermsEnum(reader, indexField); - if (termsEnum == null) { - return DocIdSetIterator.NO_MORE_DOCS; - } - if (termsEnum.seekCeil(value) == TermsEnum.SeekStatus.END) { - return DocIdSetIterator.NO_MORE_DOCS; - } - - DocIdSetIterator docsIterator = termsEnum.postings(null); - int docId = docsIterator.nextDoc(); - while (docsIterator.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { - docId = docsIterator.docID(); - } - return docId; - } - - private static TermsEnum getTermsEnum(LeafReader reader, String indexField) throws IOException { - Terms terms = reader.terms(indexField); - if (terms == null) { - return null; - } - return terms.iterator(); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.docx b/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.docx new file mode 100644 index 000000000..834eb815d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.java b/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.java deleted file mode 100644 index a902dc890..000000000 --- a/src/java/com/twitter/search/earlybird/index/EarlybirdSegment.java +++ /dev/null @@ -1,1070 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.time.Instant; -import java.time.ZoneOffset; -import java.time.ZonedDateTime; -import java.time.format.DateTimeFormatter; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.concurrent.atomic.AtomicReference; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.HashBasedTable; -import com.google.common.collect.Table; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.apache.commons.io.FileUtils; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.IndexableField; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.IOContext; -import org.apache.lucene.store.IndexOutput; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.base.FeatureConfiguration; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.ThriftDocumentUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdEncodedFeaturesUtil; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.thriftjava.ThriftDocument; -import com.twitter.search.common.schema.thriftjava.ThriftField; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import 
com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentWriter; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.column.DocValuesUpdate; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.exception.FlushVersionMismatchException; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentIndexStats; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.snowflake.id.SnowflakeId; - -public class EarlybirdSegment { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdSegment.class); - private static final Logger UPDATES_ERRORS_LOG = - LoggerFactory.getLogger(EarlybirdSegment.class.getName() + ".UpdatesErrors"); - private static final String SUCCESS_FILE = "EARLYBIRD_SUCCESS"; - private static final DateTimeFormatter HOURLY_COUNT_DATE_TIME_FORMATTER = - DateTimeFormatter.ofPattern("yyyy_MM_dd_HH"); - - @VisibleForTesting - public static final String NUM_TWEETS_CREATED_AT_PATTERN = "num_tweets_%s_%s_created_at_%s"; - - private static final String INVALID_FEATURE_UPDATES_DROPPED_PREFIX = - "invalid_index_feature_update_dropped_"; - - // The number of tweets not indexed because they have been previously indexed. - private static final SearchCounter DUPLICATE_TWEET_SKIPPED_COUNTER = - SearchCounter.export("duplicate_tweet_skipped"); - - // The number of tweets that came out of order. - private static final SearchCounter OUT_OF_ORDER_TWEET_COUNTER = - SearchCounter.export("out_of_order_tweet"); - - // The number partial updates dropped because the field could not be found in the schema. - // This counter is incremented once per field rather than once per partial update event. - // Note: caller may retry update, this counter will be incremented multiple times for same update. - private static final SearchCounter INVALID_FIELDS_IN_PARTIAL_UPDATES = - SearchCounter.export("invalid_fields_in_partial_updates"); - - // The number partial updates dropped because the tweet id could not be found in the segment. - // Note: caller may retry update, this counter will be incremented multiple times for same update. - private static final SearchCounter PARTIAL_UPDATE_FOR_TWEET_NOT_IN_INDEX = - SearchCounter.export("partial_update_for_tweet_id_not_in_index"); - - // The number of partial updates that were applied only partially, because the update could not - // be applied for at least one of the fields. - private static final SearchCounter PARTIAL_UPDATE_PARTIAL_FAILURE = - SearchCounter.export("partial_update_partial_failure"); - - // Both the indexing chain and the index writer are lazily initialized when adding docs for - // the first time. 
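// (The AtomicReference below therefore starts out empty; read paths such as getLowestTweetId()
// and getQueryCachesCardinality() must tolerate a missing segment writer.)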
- private final AtomicReference segmentWriterReference = - new AtomicReference<>(); - - // Stats from the PartitionIndexer / SimpleSegmentIndexer. - private final SegmentIndexStats indexStats; - private final String segmentName; - private final int maxSegmentSize; - private final long timeSliceID; - private final AtomicReference luceneIndexReader = - new AtomicReference<>(); - private final Directory luceneDir; - private final File luceneDirFile; - private final EarlybirdIndexConfig indexConfig; - private final List closableResources = Lists.newArrayList(); - private long lastInOrderTweetId = 0; - - private final EarlybirdIndexExtensionsFactory extensionsFactory; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final EarlybirdSearcherStats searcherStats; - - private final Map indexedTweetsCounters = Maps.newHashMap(); - private final PerFieldCounters perFieldCounters; - private final Clock clock; - - @VisibleForTesting - public volatile boolean appendedLuceneIndex = false; - - public EarlybirdSegment( - String segmentName, - long timeSliceID, - int maxSegmentSize, - Directory luceneDir, - EarlybirdIndexConfig indexConfig, - SearchIndexingMetricSet searchIndexingMetricSet, - EarlybirdSearcherStats searcherStats, - Clock clock) { - this.segmentName = segmentName; - this.maxSegmentSize = maxSegmentSize; - this.timeSliceID = timeSliceID; - this.luceneDir = luceneDir; - this.indexConfig = indexConfig; - this.indexStats = new SegmentIndexStats(); - this.perFieldCounters = new PerFieldCounters(); - this.extensionsFactory = new TweetSearchIndexExtensionsFactory(); - - if (luceneDir != null && luceneDir instanceof FSDirectory) { - // getDirectory() throws if the luceneDir is already closed. - // To delete a directory, we need to close it first. - // Obtain a reference to the File now, so we can delete it later. - // See SEARCH-5281 - this.luceneDirFile = ((FSDirectory) luceneDir).getDirectory().toFile(); - } else { - this.luceneDirFile = null; - } - this.searchIndexingMetricSet = Preconditions.checkNotNull(searchIndexingMetricSet); - this.searcherStats = searcherStats; - this.clock = clock; - } - - @VisibleForTesting - public Directory getLuceneDirectory() { - return luceneDir; - } - - public SegmentIndexStats getIndexStats() { - return indexStats; - } - - /** - * Returns the smallest tweet ID in this segment. If the segment is not loaded yet, or is empty, - * DocIDToTweetIDMapper.ID_NOT_FOUND is returned (-1). - * - * @return The smallest tweet ID in this segment. - */ - public long getLowestTweetId() { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return DocIDToTweetIDMapper.ID_NOT_FOUND; - } - - DocIDToTweetIDMapper mapper = segmentWriter.getSegmentData().getDocIDToTweetIDMapper(); - int highestDocID = mapper.getPreviousDocID(Integer.MAX_VALUE); - return mapper.getTweetID(highestDocID); - } - - /** - * Returns the cardinality (size) sum of the cardinality of each - * query cache set. - */ - public long getQueryCachesCardinality() { - EarlybirdIndexSegmentWriter writer = getIndexSegmentWriter(); - if (writer == null) { - // The segment is not loaded yet, or the query caches for this segment are not built yet. 
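A point of reference for getLowestTweetId()/getHighestTweetId() above: doc IDs in an optimized segment are assigned in descending tweet-ID order (newest tweet first, as the OptimizedTweetIDMapper later in this diff also notes), so the lowest tweet ID sits at the highest doc ID and vice versa. A self-contained sketch of that lookup, with SimpleTweetIdMapper standing in for the real DocIDToTweetIDMapper interface:

interface SimpleTweetIdMapper {
  int ID_NOT_FOUND = -1;
  int getNextDocID(int docID);      // smallest doc ID strictly greater than docID
  int getPreviousDocID(int docID);  // largest doc ID strictly smaller than docID
  long getTweetID(int docID);
}

final class TweetIdBounds {
  // Doc IDs ascend while tweet IDs descend, so the bounds are read off opposite ends.
  static long lowestTweetId(SimpleTweetIdMapper mapper) {
    int highestDocId = mapper.getPreviousDocID(Integer.MAX_VALUE);
    return highestDocId == SimpleTweetIdMapper.ID_NOT_FOUND
        ? SimpleTweetIdMapper.ID_NOT_FOUND
        : mapper.getTweetID(highestDocId);
  }

  static long highestTweetId(SimpleTweetIdMapper mapper) {
    int lowestDocId = mapper.getNextDocID(-1);
    return lowestDocId == SimpleTweetIdMapper.ID_NOT_FOUND
        ? SimpleTweetIdMapper.ID_NOT_FOUND
        : mapper.getTweetID(lowestDocId);
  }
}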
- return -1; - } - - EarlybirdIndexSegmentData earlybirdIndexSegmentData = writer.getSegmentData(); - return earlybirdIndexSegmentData.getQueryCachesCardinality(); - } - - public List> getQueryCachesData() { - return getIndexSegmentWriter().getSegmentData().getPerQueryCacheCardinality(); - } - - - /** - * Returns the highest tweet ID in this segment. If the segment is not loaded yet, or is empty, - * DocIDToTweetIDMapper.ID_NOT_FOUND is returned (-1). - * - * @return The highest tweet ID in this segment. - */ - public long getHighestTweetId() { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return DocIDToTweetIDMapper.ID_NOT_FOUND; - } - - DocIDToTweetIDMapper mapper = segmentWriter.getSegmentData().getDocIDToTweetIDMapper(); - int lowestDocID = mapper.getNextDocID(-1); - return mapper.getTweetID(lowestDocID); - } - - /** - * Optimizes the underlying segment data. - */ - public void optimizeIndexes() throws IOException { - EarlybirdIndexSegmentWriter unoptimizedWriter = segmentWriterReference.get(); - Preconditions.checkNotNull(unoptimizedWriter); - - unoptimizedWriter.forceMerge(); - unoptimizedWriter.close(); - - // Optimize our own data structures in the indexing chain - // In the archive this is pretty much a no-op. - // The indexWriter in writeableSegment should no longer be used and referenced, and - // writeableSegment.writer can be garbage collected at this point. - EarlybirdIndexSegmentData optimized = indexConfig.optimize(unoptimizedWriter.getSegmentData()); - resetSegmentWriterReference(newWriteableSegment(optimized), true); - - addSuccessFile(); - } - - /** - * Returns a new, optimized, realtime segment, by copying the data in this segment. - */ - public EarlybirdSegment makeOptimizedSegment() throws IOException { - EarlybirdIndexSegmentWriter unoptimizedWriter = segmentWriterReference.get(); - Preconditions.checkNotNull(unoptimizedWriter); - EarlybirdSegment optimizedSegment = new EarlybirdSegment( - segmentName, - timeSliceID, - maxSegmentSize, - luceneDir, - indexConfig, - searchIndexingMetricSet, - searcherStats, - clock); - - EarlybirdIndexSegmentData optimizedSegmentData = - indexConfig.optimize(unoptimizedWriter.getSegmentData()); - LOG.info("Done optimizing, setting segment data"); - - optimizedSegment.setSegmentData( - optimizedSegmentData, - indexStats.getPartialUpdateCount(), - indexStats.getOutOfOrderUpdateCount()); - return optimizedSegment; - } - - public String getSegmentName() { - return segmentName; - } - - public boolean isOptimized() { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - return segmentWriter != null && segmentWriter.getSegmentData().isOptimized(); - } - - /** - * Removes the document for the given tweet ID from this segment, if this segment contains a - * document for this tweet ID. - */ - public boolean delete(long tweetID) throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (!hasDocument(tweetID)) { - return false; - } - - segmentWriter.deleteDocuments(new TweetIDQuery(tweetID)); - return true; - } - - protected void updateDocValues(long tweetID, String field, DocValuesUpdate update) - throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - segmentWriter.updateDocValues(new TweetIDQuery(tweetID), field, update); - } - - /** - * Appends the Lucene index from another segment to this segment. 
- */ - public void append(EarlybirdSegment otherSegment) throws IOException { - if (indexConfig.isIndexStoredOnDisk()) { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - Preconditions.checkNotNull(segmentWriter); - EarlybirdIndexSegmentWriter otherSegmentWriter = otherSegment.segmentWriterReference.get(); - if (otherSegmentWriter != null) { - otherSegmentWriter.close(); - } - segmentWriter.addIndexes(otherSegment.luceneDir); - LOG.info("Calling forceMerge now after appending segment."); - segmentWriter.forceMerge(); - appendedLuceneIndex = true; - LOG.info("Appended {} docs to segment {}. New doc count = {}", - otherSegment.indexStats.getStatusCount(), luceneDir.toString(), - indexStats.getStatusCount()); - - indexStats.setIndexSizeOnDiskInBytes(getSegmentSizeOnDisk()); - } - } - - /** - * Only needed for the on disk archive. - * Creates TwitterIndexReader used for searching. This is shared by all Searchers. - * This method also initializes the Lucene based mappers and CSF for the on disk archive. - * - * This method should be called after optimizing/loading a segment, but before the segment starts - * to serve search queries. - */ - public void warmSegment() throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - Preconditions.checkNotNull(segmentWriter); - - // only need to pre-create reader and initialize mappers and CSF in the on disk archive cluster - if (indexConfig.isIndexStoredOnDisk() && luceneIndexReader.get() == null) { - EarlybirdIndexSegmentAtomicReader luceneAtomicReader = - segmentWriter.getSegmentData().createAtomicReader(); - - luceneIndexReader.set(luceneAtomicReader); - closableResources.add(luceneAtomicReader); - closableResources.add(luceneDir); - } - } - - /** - * Create a tweet index searcher on the segment. - * - * For production search session, the schema snapshot should be always passed in to make sure - * that the schema usage inside scoring is consistent. - * - * For non-production usage, like one-off debugging search, you can use the function call without - * the schema snapshot. - */ - @Nullable - public EarlybirdSingleSegmentSearcher getSearcher( - UserTable userTable, - ImmutableSchemaInterface schemaSnapshot) throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return null; - } - return new EarlybirdSingleSegmentSearcher( - schemaSnapshot, getIndexReader(segmentWriter), userTable, searcherStats, clock); - } - - /** - * Returns a new searcher for this segment. - */ - @Nullable - public EarlybirdSingleSegmentSearcher getSearcher( - UserTable userTable) throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return null; - } - return new EarlybirdSingleSegmentSearcher( - segmentWriter.getSegmentData().getSchema().getSchemaSnapshot(), - getIndexReader(segmentWriter), - userTable, - searcherStats, - clock); - } - - /** - * Returns a new reader for this segment. 
- */ - @Nullable - public EarlybirdIndexSegmentAtomicReader getIndexReader() throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return null; - } - return getIndexReader(segmentWriter); - } - - private EarlybirdIndexSegmentAtomicReader getIndexReader( - EarlybirdIndexSegmentWriter segmentWriter - ) throws IOException { - EarlybirdIndexSegmentAtomicReader reader = luceneIndexReader.get(); - if (reader != null) { - return reader; - } - Preconditions.checkState(!indexConfig.isIndexStoredOnDisk()); - - // Realtime EB mode. - return segmentWriter.getSegmentData().createAtomicReader(); - } - - /** - * Gets max tweet id in this segment. - * - * @return the tweet id or -1 if not found. - */ - public long getMaxTweetId() { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return -1; - } else { - TweetIDMapper tweetIDMapper = - (TweetIDMapper) segmentWriter.getSegmentData().getDocIDToTweetIDMapper(); - return tweetIDMapper.getMaxTweetID(); - } - } - - private EarlybirdIndexSegmentWriter newWriteableSegment(EarlybirdIndexSegmentData segmentData) - throws IOException { - EarlybirdIndexSegmentWriter old = segmentWriterReference.get(); - if (old != null) { - old.close(); - } - - LOG.info("Creating new segment writer for {} on {}", segmentName, luceneDir); - IndexWriterConfig indexWriterConfig = indexConfig.newIndexWriterConfig(); - return segmentData.createEarlybirdIndexSegmentWriter(indexWriterConfig); - } - - private void resetSegmentWriterReference( - EarlybirdIndexSegmentWriter segmentWriter, boolean previousSegmentWriterAllowed) { - EarlybirdIndexSegmentWriter previousSegmentWriter = - segmentWriterReference.getAndSet(segmentWriter); - if (!previousSegmentWriterAllowed) { - Preconditions.checkState( - previousSegmentWriter == null, - "A previous segment writer must have been set for segment " + segmentName); - } - - // Reset the stats for the number of indexed tweets per hour and recompute them. - // See SEARCH-23619 - for (SearchCounter indexedTweetsCounter : indexedTweetsCounters.values()) { - indexedTweetsCounter.reset(); - } - - if (segmentWriter != null) { - indexStats.setSegmentData(segmentWriter.getSegmentData()); - - if (indexConfig.getCluster() != EarlybirdCluster.FULL_ARCHIVE) { - initHourlyTweetCounts(segmentWriterReference.get()); - } - } else { - // It's important to unset segment data so that there are no references to it - // and it can be GC-ed. - indexStats.unsetSegmentDataAndSaveCounts(); - } - } - - /** - * Add a document if it is not already in segment. - */ - public void addDocument(TweetDocument doc) throws IOException { - if (indexConfig.isIndexStoredOnDisk()) { - addDocumentToArchiveSegment(doc); - } else { - addDocumentToRealtimeSegment(doc); - } - } - - private void addDocumentToArchiveSegment(TweetDocument doc) throws IOException { - // For archive, the document id should come in order, to drop duplicates, only need to - // compare current id with last one. 
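A simplified sketch of the archive-ordering check described in the comment above, with the exported counters replaced by return values. Archive documents arrive newest-to-oldest, so a tweet ID equal to the last in-order ID is a duplicate to skip, and a larger (newer) one is out of order but still indexed:

final class ArchiveOrderChecker {
  enum Disposition { INDEX, DROP_DUPLICATE, INDEX_OUT_OF_ORDER }

  // Starts at 0, as in the original lastInOrderTweetId field.
  private long lastInOrderTweetId = 0;

  Disposition classify(long tweetId) {
    if (tweetId == lastInOrderTweetId) {
      // Same ID as the previous document: a duplicate that can safely be skipped.
      return Disposition.DROP_DUPLICATE;
    }
    if (tweetId > lastInOrderTweetId && lastInOrderTweetId != 0) {
      // Newer than the last in-order ID: out of order for the archive; the original
      // code only bumps a counter and still indexes the document.
      return Disposition.INDEX_OUT_OF_ORDER;
    }
    lastInOrderTweetId = tweetId;
    return Disposition.INDEX;
  }
}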
- long tweetId = doc.getTweetID(); - if (tweetId == lastInOrderTweetId) { - LOG.warn("Dropped duplicate tweet for archive: {}", tweetId); - DUPLICATE_TWEET_SKIPPED_COUNTER.increment(); - return; - } - - if (tweetId > lastInOrderTweetId && lastInOrderTweetId != 0) { - // Archive orders document from newest to oldest, so this shouldn't happen - LOG.warn("Encountered out-of-order tweet for archive: {}", tweetId); - OUT_OF_ORDER_TWEET_COUNTER.increment(); - } else { - lastInOrderTweetId = tweetId; - } - - addDocumentInternal(doc); - } - - private void addDocumentToRealtimeSegment(TweetDocument doc) throws IOException { - long tweetId = doc.getTweetID(); - boolean outOfOrder = tweetId <= lastInOrderTweetId; - if (outOfOrder) { - OUT_OF_ORDER_TWEET_COUNTER.increment(); - } else { - lastInOrderTweetId = tweetId; - } - - // We only need to call hasDocument() for out-of-order tweets. - if (outOfOrder && hasDocument(tweetId)) { - // We do get duplicates sometimes so you'll see some amount of these. - DUPLICATE_TWEET_SKIPPED_COUNTER.increment(); - } else { - addDocumentInternal(doc); - incrementHourlyTweetCount(doc.getTweetID()); - } - } - - private void addDocumentInternal(TweetDocument tweetDocument) throws IOException { - Document doc = tweetDocument.getDocument(); - - // Never write blank documents into the index. - if (doc == null || doc.getFields() == null || doc.getFields().size() == 0) { - return; - } - - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - EarlybirdIndexSegmentData segmentData = indexConfig.newSegmentData( - maxSegmentSize, - timeSliceID, - luceneDir, - extensionsFactory); - segmentWriter = newWriteableSegment(segmentData); - resetSegmentWriterReference(segmentWriter, false); - } - - Preconditions.checkState(segmentWriter.numDocs() < maxSegmentSize, - "Reached max segment size %s", maxSegmentSize); - - IndexableField[] featuresField = doc.getFields( - EarlybirdFieldConstants.ENCODED_TWEET_FEATURES_FIELD_NAME); - Preconditions.checkState(featuresField.length == 1, - "featuresField.length should be 1, but is %s", featuresField.length); - - // We require the createdAt field to be set so we can properly filter tweets based on time. - IndexableField[] createdAt = - doc.getFields(EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName()); - Preconditions.checkState(createdAt.length == 1); - - EarlybirdEncodedFeatures features = EarlybirdEncodedFeaturesUtil.fromBytes( - indexConfig.getSchema().getSchemaSnapshot(), - EarlybirdFieldConstant.ENCODED_TWEET_FEATURES_FIELD, - featuresField[0].binaryValue().bytes, - featuresField[0].binaryValue().offset); - boolean currentDocIsOffensive = features.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG); - perFieldCounters.increment(ThriftIndexingEventType.INSERT, doc); - segmentWriter.addTweet(doc, tweetDocument.getTweetID(), currentDocIsOffensive); - } - - private void incrementHourlyTweetCount(long tweetId) { - // SEARCH-23619, We won't attempt to increment the count for pre-snowflake IDs, since - // extracting an exact create time is pretty tricky at this point, and the stat is mostly - // useful for checking realtime tweet indexing. - if (SnowflakeId.isSnowflakeId(tweetId)) { - long tweetCreateTime = SnowflakeId.unixTimeMillisFromId(tweetId); - String tweetHour = HOURLY_COUNT_DATE_TIME_FORMATTER.format( - ZonedDateTime.ofInstant(Instant.ofEpochMilli(tweetCreateTime), ZoneOffset.UTC)); - - String segmentOptimizedSuffix = isOptimized() ? 
"optimized" : "unoptimized"; - SearchCounter indexedTweetsCounter = indexedTweetsCounters.computeIfAbsent( - tweetHour + "_" + segmentOptimizedSuffix, - (tweetHourKey) -> SearchCounter.export(String.format( - NUM_TWEETS_CREATED_AT_PATTERN, segmentOptimizedSuffix, segmentName, tweetHour))); - indexedTweetsCounter.increment(); - } - } - - private void initHourlyTweetCounts(EarlybirdIndexSegmentWriter segmentWriter) { - DocIDToTweetIDMapper mapper = segmentWriter.getSegmentData().getDocIDToTweetIDMapper(); - int docId = Integer.MIN_VALUE; - while ((docId = mapper.getNextDocID(docId)) != DocIDToTweetIDMapper.ID_NOT_FOUND) { - incrementHourlyTweetCount(mapper.getTweetID(docId)); - } - } - - /** - * Adds the given document for the given tweet ID to the segment, potentially out of order. - */ - public boolean appendOutOfOrder(Document doc, long tweetID) throws IOException { - // Never write blank documents into the index. - if (doc == null || doc.getFields() == null || doc.getFields().size() == 0) { - return false; - } - - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - logAppendOutOfOrderFailure(tweetID, doc, "segment is null"); - return false; - } - - if (!indexConfig.supportOutOfOrderIndexing()) { - logAppendOutOfOrderFailure(tweetID, doc, "out of order indexing not supported"); - return false; - } - - if (!hasDocument(tweetID)) { - logAppendOutOfOrderFailure(tweetID, doc, "tweet ID index lookup failed"); - searchIndexingMetricSet.updateOnMissingTweetCounter.increment(); - perFieldCounters.incrementTweetNotInIndex(ThriftIndexingEventType.OUT_OF_ORDER_APPEND, doc); - return false; - } - - perFieldCounters.increment(ThriftIndexingEventType.OUT_OF_ORDER_APPEND, doc); - segmentWriter.appendOutOfOrder(new TweetIDQuery(tweetID), doc); - indexStats.incrementOutOfOrderUpdateCount(); - return true; - } - - private void logAppendOutOfOrderFailure(long tweetID, Document doc, String reason) { - UPDATES_ERRORS_LOG.debug( - "appendOutOfOrder() failed to apply update document with hash {} on tweet ID {}: {}", - Objects.hashCode(doc), tweetID, reason); - } - - /** - * Determines if this segment contains the given tweet ID. - */ - public boolean hasDocument(long tweetID) throws IOException { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter == null) { - return false; - } - - return segmentWriter.getSegmentData().getDocIDToTweetIDMapper().getDocID(tweetID) - != DocIDToTweetIDMapper.ID_NOT_FOUND; - } - - private static final String VERSION_PROP_NAME = "version"; - private static final String VERSION_DESC_PROP_NAME = "versionDescription"; - private static final String PARTIAL_UPDATES_COUNT = "partialUpdatesCount"; - private static final String OUT_OF_ORDER_UPDATES_COUNT = "outOfOrderUpdatesCount"; - - private void checkIfFlushedDataVersionMatchesExpected(FlushInfo flushInfo) throws IOException { - int expectedVersionNumber = indexConfig.getSchema().getMajorVersionNumber(); - String expectedVersionDesc = indexConfig.getSchema().getVersionDescription(); - int version = flushInfo.getIntProperty(VERSION_PROP_NAME); - final String versionDesc = flushInfo.getStringProperty(VERSION_DESC_PROP_NAME); - - if (version != expectedVersionNumber) { - throw new FlushVersionMismatchException("Flushed version mismatch. 
Expected: " - + expectedVersionNumber + ", but was: " + version); - } - - if (!expectedVersionDesc.equals(versionDesc)) { - final String message = "Flush version " + expectedVersionNumber + " is ambiguous" - + " Expected: " + expectedVersionDesc - + " Found: " + versionDesc - + " Please clean up segments with bad flush version from HDFS and Earlybird local disk."; - throw new FlushVersionMismatchException(message); - } - } - - /** - * Loads the segment data and properties from the given deserializer and flush info. - * - * @param in The deserializer from which the segment's data will be read. - * @param flushInfo The flush info from which the segment's properties will be read. - */ - public void load(DataDeserializer in, FlushInfo flushInfo) throws IOException { - checkIfFlushedDataVersionMatchesExpected(flushInfo); - - int partialUpdatesCount = flushInfo.getIntProperty(PARTIAL_UPDATES_COUNT); - int outOfOrderUpdatesCount = flushInfo.getIntProperty(OUT_OF_ORDER_UPDATES_COUNT); - - EarlybirdIndexSegmentData loadedSegmentData = indexConfig.loadSegmentData( - flushInfo, in, luceneDir, extensionsFactory); - - setSegmentData(loadedSegmentData, partialUpdatesCount, outOfOrderUpdatesCount); - } - - /** - * Update the data backing this EarlyirdSegment. - */ - public void setSegmentData( - EarlybirdIndexSegmentData segmentData, - int partialUpdatesCount, - int outOfOrderUpdatesCount) throws IOException { - resetSegmentWriterReference(newWriteableSegment(segmentData), false); - try { - warmSegment(); - } catch (IOException e) { - LOG.error("Failed to create IndexReader for segment {}. Will destroy unreadable segment.", - segmentName, e); - destroyImmediately(); - throw e; - } - - LOG.info("Starting segment {} with {} partial updates, {} out of order updates and {} deletes.", - segmentName, partialUpdatesCount, outOfOrderUpdatesCount, indexStats.getDeleteCount()); - indexStats.setPartialUpdateCount(partialUpdatesCount); - indexStats.setOutOfOrderUpdateCount(outOfOrderUpdatesCount); - indexStats.setIndexSizeOnDiskInBytes(getSegmentSizeOnDisk()); - } - - /** - * Flushes the this segment's properties to the given FlushInfo instance, and this segment's data - * to the given DataSerializer instance. - * - * @param flushInfo The FlushInfo instance where all segment properties should be added. - * @param out The serializer to which all segment data should be flushed. - */ - public void flush(FlushInfo flushInfo, DataSerializer out) throws IOException { - flushInfo.addIntProperty(VERSION_PROP_NAME, indexConfig.getSchema().getMajorVersionNumber()); - flushInfo.addStringProperty(VERSION_DESC_PROP_NAME, - indexConfig.getSchema().getVersionDescription()); - flushInfo.addIntProperty(PARTIAL_UPDATES_COUNT, indexStats.getPartialUpdateCount()); - flushInfo.addIntProperty(OUT_OF_ORDER_UPDATES_COUNT, indexStats.getOutOfOrderUpdateCount()); - if (segmentWriterReference.get() == null) { - LOG.warn("Segment writer is null. flushInfo: {}", flushInfo); - } else if (segmentWriterReference.get().getSegmentData() == null) { - LOG.warn("Segment data is null. segment writer: {}, flushInfo: {}", - segmentWriterReference.get(), flushInfo); - } - segmentWriterReference.get().getSegmentData().flushSegment(flushInfo, out); - indexStats.setIndexSizeOnDiskInBytes(getSegmentSizeOnDisk()); - } - - /** - * Check to see if this segment can be loaded from an on-disk index, and load it if it can be. - * - * This should only be applicable to the current segment for the on-disk archive. 
It's not - * fully flushed until it's full, but we do have a lucene index on local disk which can be - * used at startup (rather than have to reindex all the current timeslice documents again). - * - * If loaded, the index reader will be pre-created, and the segment will be marked as - * optimized. - * - * If the index directory exists but it cannot be loaded, the index directory will be deleted. - * - * @return true if the index exists on disk, and was loaded. - */ - public boolean tryToLoadExistingIndex() throws IOException { - Preconditions.checkState(segmentWriterReference.get() == null); - if (indexConfig.isIndexStoredOnDisk()) { - if (DirectoryReader.indexExists(luceneDir) && checkSuccessFile()) { - LOG.info("Index directory already exists for {} at {}", segmentName, luceneDir); - - // set the optimized flag, since we don't need to optimize any more, and pre-create - // the index reader (for the on-disk index optimize() is a noop that just sets the - // optimized flag). - EarlybirdIndexSegmentData earlybirdIndexSegmentData = indexConfig.newSegmentData( - maxSegmentSize, - timeSliceID, - luceneDir, - extensionsFactory); - EarlybirdIndexSegmentData optimizedEarlybirdIndexSegmentData = - indexConfig.optimize(earlybirdIndexSegmentData); - resetSegmentWriterReference(newWriteableSegment(optimizedEarlybirdIndexSegmentData), false); - - warmSegment(); - - LOG.info("Used existing lucene index for {} with {} documents", - segmentName, indexStats.getStatusCount()); - - indexStats.setIndexSizeOnDiskInBytes(getSegmentSizeOnDisk()); - - return true; - } else { - // Check if there is an existing lucene dir without a SUCCESS file on disk. - // If so, we will remove it and reindex from scratch. - if (moveFSDirectoryIfExists(luceneDir)) { - // Throw here to be cleaned up and retried by SimpleSegmentIndexer. - throw new IOException("Found invalid existing lucene directory at: " + luceneDir); - } - } - } - return false; - } - - /** - * Partially updates a document with the field value(s) specified by event. - * Returns true if all writes were successful and false if one or more writes fail or if - * tweet id isn't found in the segment. 
- */ - public boolean applyPartialUpdate(ThriftIndexingEvent event) throws IOException { - Preconditions.checkArgument(event.getEventType() == ThriftIndexingEventType.PARTIAL_UPDATE); - Preconditions.checkArgument(event.isSetUid()); - Preconditions.checkArgument(!ThriftDocumentUtil.hasDuplicateFields(event.getDocument())); - ImmutableSchemaInterface schemaSnapshot = indexConfig.getSchema().getSchemaSnapshot(); - - long tweetId = event.getUid(); - ThriftDocument doc = event.getDocument(); - - if (!hasDocument(tweetId)) { - // no need to attempt field writes, fail early - PARTIAL_UPDATE_FOR_TWEET_NOT_IN_INDEX.increment(); - perFieldCounters.incrementTweetNotInIndex( - ThriftIndexingEventType.PARTIAL_UPDATE, doc); - return false; - } - - int invalidFields = 0; - for (ThriftField field : doc.getFields()) { - String featureName = schemaSnapshot.getFieldName(field.getFieldConfigId()); - FeatureConfiguration featureConfig = - schemaSnapshot.getFeatureConfigurationByName(featureName); - if (featureConfig == null) { - INVALID_FIELDS_IN_PARTIAL_UPDATES.increment(); - invalidFields++; - continue; - } - - perFieldCounters.increment(ThriftIndexingEventType.PARTIAL_UPDATE, featureName); - - updateDocValues( - tweetId, - featureName, - (docValues, docID) -> updateFeatureValue(docID, featureConfig, docValues, field)); - } - - if (invalidFields > 0 && invalidFields != doc.getFieldsSize()) { - PARTIAL_UPDATE_PARTIAL_FAILURE.increment(); - } - - if (invalidFields == 0) { - indexStats.incrementPartialUpdateCount(); - } else { - UPDATES_ERRORS_LOG.warn("Failed to apply update for tweetID {}, found {} invalid fields: {}", - tweetId, invalidFields, event); - } - - return invalidFields == 0; - } - - @VisibleForTesting - static void updateFeatureValue(int docID, - FeatureConfiguration featureConfig, - ColumnStrideFieldIndex docValues, - ThriftField updateField) { - int oldValue = Math.toIntExact(docValues.get(docID)); - int newValue = updateField.getFieldData().getIntValue(); - - if (!featureConfig.validateFeatureUpdate(oldValue, newValue)) { - // Counter values can only increase - SearchCounter.export( - INVALID_FEATURE_UPDATES_DROPPED_PREFIX + featureConfig.getName()).increment(); - } else { - docValues.setValue(docID, newValue); - } - } - - /** - * Checks if the provided directory exists and is not empty, - * and if it does moves it out to a diff directory for later inspection. - * @param luceneDirectory the dir to move if it exists. - * @return true iff we found an existing directory. - */ - private static boolean moveFSDirectoryIfExists(Directory luceneDirectory) { - Preconditions.checkState(luceneDirectory instanceof FSDirectory); - File directory = ((FSDirectory) luceneDirectory).getDirectory().toFile(); - if (directory != null && directory.exists() && directory.list().length > 0) { - // Save the bad lucene index by moving it out, for later inspection. - File movedDir = new File(directory.getParent(), - directory.getName() + ".failed." + System.currentTimeMillis()); - LOG.warn("Moving existing non-successful index for {} from {} to {}", - luceneDirectory, directory, movedDir); - boolean success = directory.renameTo(movedDir); - if (!success) { - LOG.warn("Unable to rename non-successful index: {}", luceneDirectory); - } - return true; - } - return false; - } - - /** - * For the on-disk archive, if we were able to successfully merge and flush the Lucene index to - * disk, we mark it explicitly with a SUCCESS file, so that it can be safely reused. 
- */ - private void addSuccessFile() throws IOException { - if (indexConfig.isIndexStoredOnDisk()) { - IndexOutput successFile = luceneDir.createOutput(SUCCESS_FILE, IOContext.DEFAULT); - successFile.close(); - } - } - - /** - * Returns the current number of documents in this segment. - */ - public int getNumDocs() throws IOException { - return indexStats.getStatusCount(); - } - - /** - * Reclaim resources used by this segment (E.g. closing lucene index reader). - * Resources will be reclaimed within the calling thread with no delay. - */ - public void destroyImmediately() { - try { - closeSegmentWriter(); - maybeDeleteSegmentOnDisk(); - unloadSegmentFromMemory(); - } finally { - indexConfig.getResourceCloser().closeResourcesImmediately(closableResources); - } - } - - /** - * Close the in-memory resources belonging to this segment. This should allow the in-memory - * segment data to be garbage collected. After closing, the segment is not writable. - */ - public void close() { - if (segmentWriterReference.get() == null) { - LOG.info("Segment {} already closed.", segmentName); - return; - } - - LOG.info("Closing segment {}.", segmentName); - try { - closeSegmentWriter(); - unloadSegmentFromMemory(); - } finally { - indexConfig.getResourceCloser().closeResourcesImmediately(closableResources); - } - } - - private void closeSegmentWriter() { - EarlybirdIndexSegmentWriter segmentWriter = segmentWriterReference.get(); - if (segmentWriter != null) { - closableResources.add(() -> { - LOG.info("Closing writer for segment: {}", segmentName); - segmentWriter.close(); - }); - } - } - - private void maybeDeleteSegmentOnDisk() { - if (indexConfig.isIndexStoredOnDisk()) { - Preconditions.checkState( - luceneDir instanceof FSDirectory, - "On-disk indexes should have an underlying directory that we can close and remove."); - closableResources.add(luceneDir); - - if (luceneDirFile != null && luceneDirFile.exists()) { - closableResources.add(new Closeable() { - @Override - public void close() throws IOException { - FileUtils.deleteDirectory(luceneDirFile); - } - - @Override - public String toString() { - return "delete {" + luceneDirFile + "}"; - } - }); - } - } - } - - private void unloadSegmentFromMemory() { - // Make sure we don't retain a reference to the IndexWriter or SegmentData. - resetSegmentWriterReference(null, true); - } - - private long getSegmentSizeOnDisk() throws IOException { - searchIndexingMetricSet.segmentSizeCheckCount.increment(); - - long totalSize = 0; - if (luceneDir != null) { - for (String file : luceneDir.listAll()) { - totalSize += luceneDir.fileLength(file); - } - } - return totalSize; - } - - ////////////////////////// - // for unit tests only - ////////////////////////// - - public EarlybirdIndexConfig getEarlybirdIndexConfig() { - return indexConfig; - } - - @VisibleForTesting - public boolean checkSuccessFile() { - return new File(luceneDirFile, SUCCESS_FILE).exists(); - } - - @VisibleForTesting - EarlybirdIndexSegmentWriter getIndexSegmentWriter() { - return segmentWriterReference.get(); - } - - // Helper class to encapsulate counter tables, patterns and various ways to increment - private class PerFieldCounters { - // The number of update/append events for each field in the schema. 
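The size-on-disk bookkeeping above (getSegmentSizeOnDisk) is just a sum of file lengths in the segment's Lucene Directory. A standalone sketch, assuming Lucene on the classpath:

import java.io.IOException;

import org.apache.lucene.store.Directory;

final class DirectorySize {
  // The segment's on-disk size is the sum of the lengths of all files in its Directory.
  static long sizeInBytes(Directory dir) throws IOException {
    long totalSize = 0;
    for (String file : dir.listAll()) {
      totalSize += dir.fileLength(file);
    }
    return totalSize;
  }
}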
- private static final String PER_FIELD_EVENTS_COUNTER_PATTERN = "%s_for_field_%s"; - // The number of dropped update/append events for each field due to tweetId not found - private static final String TWEET_NOT_IN_INDEX_PER_FIELD_EVENTS_COUNTER_PATTERN = - "%s_for_tweet_id_not_in_index_for_field_%s"; - private final Table perFieldTable = - HashBasedTable.create(); - private final Table notInIndexPerFieldTable = - HashBasedTable.create(); - - public void increment( - ThriftIndexingEventType eventType, ThriftDocument doc) { - ImmutableSchemaInterface schemaSnapshot = indexConfig.getSchema().getSchemaSnapshot(); - for (ThriftField field : doc.getFields()) { - String fieldName = schemaSnapshot.getFieldName(field.getFieldConfigId()); - incrementForPattern( - eventType, fieldName, perFieldTable, PER_FIELD_EVENTS_COUNTER_PATTERN); - } - } - - public void incrementTweetNotInIndex( - ThriftIndexingEventType eventType, ThriftDocument doc) { - ImmutableSchemaInterface schemaSnapshot = indexConfig.getSchema().getSchemaSnapshot(); - for (ThriftField field : doc.getFields()) { - String fieldName = schemaSnapshot.getFieldName(field.getFieldConfigId()); - incrementForPattern( - eventType, fieldName, notInIndexPerFieldTable, - TWEET_NOT_IN_INDEX_PER_FIELD_EVENTS_COUNTER_PATTERN); - } - } - - public void increment(ThriftIndexingEventType eventType, Document doc) { - for (IndexableField field : doc.getFields()) { - incrementForPattern( - eventType, field.name(), - perFieldTable, PER_FIELD_EVENTS_COUNTER_PATTERN); - } - } - - public void increment(ThriftIndexingEventType eventType, String fieldName) { - incrementForPattern(eventType, fieldName, perFieldTable, PER_FIELD_EVENTS_COUNTER_PATTERN); - } - - public void incrementTweetNotInIndex(ThriftIndexingEventType eventType, Document doc) { - for (IndexableField field : doc.getFields()) { - incrementForPattern( - eventType, field.name(), - notInIndexPerFieldTable, - TWEET_NOT_IN_INDEX_PER_FIELD_EVENTS_COUNTER_PATTERN); - } - } - - private void incrementForPattern( - ThriftIndexingEventType eventType, String fieldName, - Table counterTable, String pattern) { - - SearchCounter stat; - if (counterTable.contains(eventType, fieldName)) { - stat = counterTable.get(eventType, fieldName); - } else { - stat = SearchCounter.export(String.format(pattern, eventType, fieldName).toLowerCase()); - counterTable.put(eventType, fieldName, stat); - } - stat.increment(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.docx b/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.docx new file mode 100644 index 000000000..35b1e0c98 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.java b/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.java deleted file mode 100644 index 1a43cf52b..000000000 --- a/src/java/com/twitter/search/earlybird/index/EarlybirdSegmentFactory.java +++ /dev/null @@ -1,58 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import org.apache.lucene.store.Directory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.partition.SearchIndexingMetricSet; -import com.twitter.search.earlybird.partition.SegmentSyncInfo; 
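A sketch of the per-(event type, field) counter caching that PerFieldCounters implements above. SearchCounter is an internal metrics class, so a plain AtomicLong stands in for it here; the lazy Guava Table lookup and the formatted, lower-cased counter name follow the original pattern:

import java.util.concurrent.atomic.AtomicLong;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

final class PerFieldEventCounters {
  private final Table<String, String, AtomicLong> counters = HashBasedTable.create();

  void increment(String eventType, String fieldName) {
    AtomicLong counter = counters.get(eventType, fieldName);
    if (counter == null) {
      // First event for this (type, field) pair. The original code additionally exports a
      // SearchCounter named String.format("%s_for_field_%s", eventType, fieldName).toLowerCase().
      counter = new AtomicLong();
      counters.put(eventType, fieldName, counter);
    }
    counter.incrementAndGet();
  }
}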
-import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; - -public class EarlybirdSegmentFactory { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdSegmentFactory.class); - - private final EarlybirdIndexConfig earlybirdIndexConfig; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final EarlybirdSearcherStats searcherStats; - private Clock clock; - - public EarlybirdSegmentFactory( - EarlybirdIndexConfig earlybirdIndexConfig, - SearchIndexingMetricSet searchIndexingMetricSet, - EarlybirdSearcherStats searcherStats, - Clock clock) { - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.searcherStats = searcherStats; - this.clock = clock; - } - - public EarlybirdIndexConfig getEarlybirdIndexConfig() { - return earlybirdIndexConfig; - } - - /** - * Creates a new earlybird segment. - */ - public EarlybirdSegment newEarlybirdSegment(Segment segment, SegmentSyncInfo segmentSyncInfo) - throws IOException { - Directory dir = earlybirdIndexConfig.newLuceneDirectory(segmentSyncInfo); - - LOG.info("Creating EarlybirdSegment on " + dir.toString()); - - return new EarlybirdSegment( - segment.getSegmentName(), - segment.getTimeSliceID(), - segment.getMaxSegmentSize(), - dir, - earlybirdIndexConfig, - searchIndexingMetricSet, - searcherStats, - clock); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.docx b/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.docx new file mode 100644 index 000000000..1c4bd3f9b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.java b/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.java deleted file mode 100644 index 848a1bd6f..000000000 --- a/src/java/com/twitter/search/earlybird/index/EarlybirdSingleSegmentSearcher.java +++ /dev/null @@ -1,423 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Map.Entry; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.CollectionStatistics; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.LeafCollector; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.TermStatistics; -import org.apache.lucene.search.Weight; -import org.apache.lucene.util.BytesRef; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures; -import com.twitter.search.common.results.thriftjava.FieldHitAttribution; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.TwitterCollector; -import com.twitter.search.common.search.TwitterIndexSearcher; -import 
com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.common.util.lang.ThriftLanguageUtil; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.earlybird.EarlybirdSearcher; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher; -import com.twitter.search.earlybird.search.Hit; -import com.twitter.search.earlybird.search.SearchRequestInfo; -import com.twitter.search.earlybird.search.SimpleSearchResults; -import com.twitter.search.earlybird.search.facets.AbstractFacetTermCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsRequestInfo; -import com.twitter.search.earlybird.search.relevance.scoring.RelevanceQuery; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTermRequest; -import com.twitter.search.earlybird.thrift.ThriftTermResults; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; - -public class EarlybirdSingleSegmentSearcher extends EarlybirdLuceneSearcher { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdSingleSegmentSearcher.class); - - private final EarlybirdIndexSegmentAtomicReader twitterReader; - private final ImmutableSchemaInterface schema; - private final UserTable userTable; - private final long timeSliceID; - - private final EarlybirdSearcherStats searcherStats; - private Clock clock; - - public EarlybirdSingleSegmentSearcher( - ImmutableSchemaInterface schema, - EarlybirdIndexSegmentAtomicReader reader, - UserTable userTable, - EarlybirdSearcherStats searcherStats, - Clock clock) { - super(reader); - this.schema = schema; - this.twitterReader = reader; - this.userTable = userTable; - this.timeSliceID = reader.getSegmentData().getTimeSliceID(); - this.searcherStats = searcherStats; - this.clock = clock; - } - - public final long getTimeSliceID() { - return timeSliceID; - } - - public EarlybirdIndexSegmentAtomicReader getTwitterIndexReader() { - return twitterReader; - } - - /** - * search() main loop. - * This behaves exactly like IndexSearcher.search() if a stock Lucene collector passed in. - * However, if a TwitterCollector is passed in, this class performs Twitter style early - * termination without relying on - * {@link org.apache.lucene.search.CollectionTerminatedException}. - * This method is nearly identical to TwitterIndexSearcher.search() with two differences: - * 1) advances to smallest docID before searching. Important to skip incomplete docs in - * realtime segments. 
- * 2) skips deletes using twitterReader - */ - @Override - protected void search(List leaves, Weight weight, Collector coll) - throws IOException { - // If an TwitterCollector is passed in, we can do a few extra things in here, such - // as early termination. Otherwise we can just fall back to IndexSearcher.search(). - if (!(coll instanceof TwitterCollector)) { - super.search(leaves, weight, coll); - return; - } - - TwitterCollector collector = (TwitterCollector) coll; - if (collector.isTerminated()) { - return; - } - - LOG.debug("Starting segment {}", timeSliceID); - - // Notify the collector that we're starting this segment, and check for early - // termination criteria again. setNextReader() performs 'expensive' early - // termination checks in some implementations such as TwitterEarlyTerminationCollector. - LeafCollector leafCollector = collector.getLeafCollector(twitterReader.getContext()); - if (collector.isTerminated()) { - return; - } - - // Initialize the scorer: - // Note that constructing the scorer may actually do real work, such as advancing to the - // first hit. - // The scorer may be null if we can tell right away that the query has no hits: e.g. if the - // first hit does not actually exist. - Scorer scorer = weight.scorer(twitterReader.getContext()); - if (scorer == null) { - LOG.debug("Scorer was null, not searching segment {}", timeSliceID); - collector.finishSegment(DocIdSetIterator.NO_MORE_DOCS); - return; - } - leafCollector.setScorer(scorer); - - // Make sure to start searching at the smallest docID. - DocIdSetIterator docIdSetIterator = scorer.iterator(); - int smallestDocId = twitterReader.getSmallestDocID(); - int docID = docIdSetIterator.advance(smallestDocId); - - // Collect results. - while (docID != DocIdSetIterator.NO_MORE_DOCS) { - // Exclude deleted docs. - if (!twitterReader.getDeletesView().isDeleted(docID)) { - leafCollector.collect(docID); - } - - // Check if we're done after we consumed the document. - if (collector.isTerminated()) { - break; - } - - docID = docIdSetIterator.nextDoc(); - } - - // Always finish the segment, providing the last docID advanced to. 
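Condensed into a standalone form, the collection loop above advances the scorer's iterator to the segment's smallest doc ID, skips deleted documents, and stops as soon as the collector reports early termination. In this sketch, isDeleted, collect and isTerminated are placeholders for the deletes view, the leaf collector and the TwitterCollector termination check used in the real code:

import java.io.IOException;
import java.util.function.BooleanSupplier;
import java.util.function.IntConsumer;
import java.util.function.IntPredicate;

import org.apache.lucene.search.DocIdSetIterator;

final class SegmentCollectLoop {
  // Returns the last doc ID advanced to, which the caller passes to finishSegment(...).
  static int collect(DocIdSetIterator iterator,
                     int smallestDocId,
                     IntPredicate isDeleted,
                     IntConsumer collect,
                     BooleanSupplier isTerminated) throws IOException {
    int docId = iterator.advance(smallestDocId);
    while (docId != DocIdSetIterator.NO_MORE_DOCS) {
      if (!isDeleted.test(docId)) {
        collect.accept(docId);
      }
      // Termination is checked after the document has been consumed, as above.
      if (isTerminated.getAsBoolean()) {
        break;
      }
      docId = iterator.nextDoc();
    }
    return docId;
  }
}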
- collector.finishSegment(docID); - } - - @Override - public void fillFacetResults( - AbstractFacetTermCollector collector, ThriftSearchResults searchResults) - throws IOException { - if (searchResults == null || searchResults.getResultsSize() == 0) { - return; - } - - EarlybirdIndexSegmentData segmentData = twitterReader.getSegmentData(); - collector.resetFacetLabelProviders( - segmentData.getFacetLabelProviders(), segmentData.getFacetIDMap()); - DocIDToTweetIDMapper docIdMapper = segmentData.getDocIDToTweetIDMapper(); - for (ThriftSearchResult result : searchResults.getResults()) { - int docId = docIdMapper.getDocID(result.getId()); - if (docId < 0) { - continue; - } - - segmentData.getFacetCountingArray().collectForDocId(docId, collector); - collector.fillResultAndClear(result); - } - } - - @Override - public TermStatisticsCollector.TermStatisticsSearchResults collectTermStatistics( - TermStatisticsRequestInfo searchRequestInfo, - EarlybirdSearcher searcher, int requestDebugMode) throws IOException { - TermStatisticsCollector collector = new TermStatisticsCollector( - schema, searchRequestInfo, searcherStats, clock, requestDebugMode); - - search(searchRequestInfo.getLuceneQuery(), collector); - searcher.maybeSetCollectorDebugInfo(collector); - return collector.getResults(); - } - - /** This method is only used for debugging, so it's not optimized for speed */ - @Override - public void explainSearchResults(SearchRequestInfo searchRequestInfo, - SimpleSearchResults hits, - ThriftSearchResults searchResults) throws IOException { - Weight weight = - createWeight(rewrite(searchRequestInfo.getLuceneQuery()), ScoreMode.COMPLETE, 1.0f); - - DocIDToTweetIDMapper docIdMapper = twitterReader.getSegmentData().getDocIDToTweetIDMapper(); - for (int i = 0; i < hits.numHits(); i++) { - final Hit hit = hits.getHit(i); - Preconditions.checkState(hit.getTimeSliceID() == timeSliceID, - "hit: " + hit.toString() + " is not in timeslice: " + timeSliceID); - final ThriftSearchResult result = searchResults.getResults().get(i); - if (!result.isSetMetadata()) { - result.setMetadata(new ThriftSearchResultMetadata() - .setPenguinVersion(EarlybirdConfig.getPenguinVersionByte())); - } - - final int docIdToExplain = docIdMapper.getDocID(hit.getStatusID()); - if (docIdToExplain == DocIDToTweetIDMapper.ID_NOT_FOUND) { - result.getMetadata().setExplanation( - "ERROR: Could not find doc ID to explain for " + hit.toString()); - } else { - Explanation explanation; - FieldHitAttribution fieldHitAttribution = result.getMetadata().getFieldHitAttribution(); - if (weight instanceof RelevanceQuery.RelevanceWeight && fieldHitAttribution != null) { - RelevanceQuery.RelevanceWeight relevanceWeight = - (RelevanceQuery.RelevanceWeight) weight; - - explanation = relevanceWeight.explain( - twitterReader.getContext(), docIdToExplain, fieldHitAttribution); - } else { - explanation = weight.explain(twitterReader.getContext(), docIdToExplain); - } - hit.setHasExplanation(true); - result.getMetadata().setExplanation(explanation.toString()); - } - } - } - - @Override - public void fillFacetResultMetadata(Map facetResults, - ImmutableSchemaInterface documentSchema, - byte debugMode) throws IOException { - FacetLabelProvider provider = twitterReader.getFacetLabelProviders( - documentSchema.getFacetFieldByFacetName(EarlybirdFieldConstant.TWIMG_FACET)); - - FacetLabelProvider.FacetLabelAccessor photoAccessor = null; - - if (provider != null) { - photoAccessor = provider.getLabelAccessor(); - } - - for (Entry facetResult : facetResults.entrySet()) 
{ - Term term = facetResult.getKey(); - ThriftFacetCount facetCount = facetResult.getValue(); - - ThriftFacetCountMetadata metadata = facetCount.getMetadata(); - if (metadata == null) { - metadata = new ThriftFacetCountMetadata(); - facetCount.setMetadata(metadata); - } - - fillTermMetadata(term, metadata, photoAccessor, debugMode); - } - } - - @Override - public void fillTermStatsMetadata(ThriftTermStatisticsResults termStatsResults, - ImmutableSchemaInterface documentSchema, - byte debugMode) throws IOException { - - FacetLabelProvider provider = twitterReader.getFacetLabelProviders( - documentSchema.getFacetFieldByFacetName(EarlybirdFieldConstant.TWIMG_FACET)); - - FacetLabelProvider.FacetLabelAccessor photoAccessor = null; - - if (provider != null) { - photoAccessor = provider.getLabelAccessor(); - } - - for (Map.Entry entry - : termStatsResults.termResults.entrySet()) { - - ThriftTermRequest termRequest = entry.getKey(); - if (termRequest.getFieldName().isEmpty()) { - continue; - } - Schema.FieldInfo facetField = schema.getFacetFieldByFacetName(termRequest.getFieldName()); - Term term = null; - if (facetField != null) { - term = new Term(facetField.getName(), termRequest.getTerm()); - } - if (term == null) { - continue; - } - - ThriftFacetCountMetadata metadata = entry.getValue().getMetadata(); - if (metadata == null) { - metadata = new ThriftFacetCountMetadata(); - entry.getValue().setMetadata(metadata); - } - - fillTermMetadata(term, metadata, photoAccessor, debugMode); - } - } - - private void fillTermMetadata(Term term, ThriftFacetCountMetadata metadata, - FacetLabelProvider.FacetLabelAccessor photoAccessor, - byte debugMode) throws IOException { - boolean isTwimg = term.field().equals(EarlybirdFieldConstant.TWIMG_LINKS_FIELD.getFieldName()); - int internalDocID = DocIDToTweetIDMapper.ID_NOT_FOUND; - long statusID = -1; - long userID = -1; - Term facetTerm = term; - - // Deal with the from_user_id facet. - if (term.field().equals(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName())) { - userID = Long.parseLong(term.text()); - facetTerm = new Term(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName(), - LongTermAttributeImpl.copyIntoNewBytesRef(userID)); - } else if (isTwimg) { - statusID = Long.parseLong(term.text()); - internalDocID = twitterReader.getSegmentData().getDocIDToTweetIDMapper().getDocID(statusID); - } - - if (internalDocID == DocIDToTweetIDMapper.ID_NOT_FOUND) { - // If this is not a twimg, this is how statusID should be looked up - // - // If this is a twimg but we couldn't find the internalDocID, that means this segment, - // or maybe even this earlybird, does not contain the original tweet. Then we treat this as - // a normal facet for now - internalDocID = twitterReader.getOldestDocID(facetTerm); - if (internalDocID >= 0) { - statusID = - twitterReader.getSegmentData().getDocIDToTweetIDMapper().getTweetID(internalDocID); - } else { - statusID = -1; - } - } - - // make sure tweet is not deleted - if (internalDocID < 0 || twitterReader.getDeletesView().isDeleted(internalDocID)) { - return; - } - - if (metadata.isSetStatusId() - && metadata.getStatusId() > 0 - && metadata.getStatusId() <= statusID) { - // we already have the metadata for this facet from an earlier tweet - return; - } - - // now check if this tweet is offensive, e.g. 
antisocial, nsfw, sensitive - EarlybirdDocumentFeatures documentFeatures = new EarlybirdDocumentFeatures(twitterReader); - documentFeatures.advance(internalDocID); - boolean isOffensiveFlagSet = - documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG); - boolean isSensitiveFlagSet = - documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_SENSITIVE_CONTENT); - boolean offensive = isOffensiveFlagSet || isSensitiveFlagSet; - - // also, user should not be marked as antisocial, nsfw or offensive - if (userID < 0) { - userID = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF); - } - offensive |= userTable.isSet(userID, - UserTable.ANTISOCIAL_BIT - | UserTable.OFFENSIVE_BIT - | UserTable.NSFW_BIT); - - metadata.setStatusId(statusID); - metadata.setTwitterUserId(userID); - metadata.setCreated_at(twitterReader.getSegmentData().getTimeMapper().getTime(internalDocID)); - int langId = (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE); - Locale lang = ThriftLanguageUtil.getLocaleOf(ThriftLanguage.findByValue(langId)); - metadata.setStatusLanguage(ThriftLanguageUtil.getThriftLanguageOf(lang)); - metadata.setStatusPossiblySensitive(offensive); - if (isTwimg && photoAccessor != null && !metadata.isSetNativePhotoUrl()) { - int termID = twitterReader.getTermID(term); - if (termID != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - BytesRef termPayload = photoAccessor.getTermPayload(termID); - if (termPayload != null) { - metadata.setNativePhotoUrl(termPayload.utf8ToString()); - } - } - } - - if (debugMode > 3) { - StringBuilder sb = new StringBuilder(256); - if (metadata.isSetExplanation()) { - sb.append(metadata.getExplanation()); - } - sb.append(String.format("TweetId=%d (%s %s), UserId=%d (%s %s), Term=%s\n", - statusID, - isOffensiveFlagSet ? "OFFENSIVE" : "", - isSensitiveFlagSet ? "SENSITIVE" : "", - userID, - userTable.isSet(userID, UserTable.ANTISOCIAL_BIT) ? "ANTISOCIAL" : "", - userTable.isSet(userID, UserTable.NSFW_BIT) ? 
"NSFW" : "", - term.toString())); - metadata.setExplanation(sb.toString()); - } - } - - public ImmutableSchemaInterface getSchemaSnapshot() { - return schema; - } - - @Override - public CollectionStatistics collectionStatistics(String field) throws IOException { - return TwitterIndexSearcher.collectionStatistics(field, getIndexReader()); - } - - @Override - public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) { - return TwitterIndexSearcher.termStats(term, docFreq, totalTermFreq); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.docx b/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.docx new file mode 100644 index 000000000..ad8633cc2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.java b/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.java deleted file mode 100644 index 95267cad9..000000000 --- a/src/java/com/twitter/search/earlybird/index/OptimizedTimeMapper.java +++ /dev/null @@ -1,109 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; -import java.util.Arrays; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -/** - * A TimeMapper implementation that stores the timestamps associated with the doc IDs in an array. - */ -public class OptimizedTimeMapper extends AbstractInMemoryTimeMapper implements Flushable { - // Doc id to timestamp map. Timestamps that are negative are out-of-order. - protected final int[] timeMap; - - // Size must be greater than the max doc ID stored in the optimized tweet ID mapper. 
- public OptimizedTimeMapper(RealtimeTimeMapper realtimeTimeMapper, - DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - super(); - int maxDocId = optimizedTweetIdMapper.getPreviousDocID(Integer.MAX_VALUE); - timeMap = new int[maxDocId + 1]; - Arrays.fill(timeMap, ILLEGAL_TIME); - - int docId = maxDocId; - while (docId != DocIDToTweetIDMapper.ID_NOT_FOUND) { - int originalDocId = originalTweetIdMapper.getDocID(optimizedTweetIdMapper.getTweetID(docId)); - Preconditions.checkState(originalDocId != DocIDToTweetIDMapper.ID_NOT_FOUND); - - int docIdTimestamp = realtimeTimeMapper.getTime(originalDocId); - Preconditions.checkState(docIdTimestamp != TimeMapper.ILLEGAL_TIME); - - doAddMapping(docId, docIdTimestamp); - - docId = optimizedTweetIdMapper.getPreviousDocID(docId); - } - } - - private OptimizedTimeMapper(int[] timeMap, - int reverseMapLastIndex, - IntBlockPool reverseMapTimes, - IntBlockPool reverseMapIds) { - super(reverseMapLastIndex, reverseMapTimes, reverseMapIds); - this.timeMap = timeMap; - } - - @Override - public int getTime(int docID) { - return timeMap[docID]; - } - - @Override - protected void setTime(int docID, int timeSeconds) { - timeMap[docID] = timeSeconds; - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - private static final String REVERSE_MAP_LAST_INDEX_PROP = "reverseMapLastIndex"; - private static final String TIMES_SUB_PROP = "times"; - private static final String IDS_SUB_PROP = "ids"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedTimeMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedTimeMapper mapper = getObjectToFlush(); - out.writeIntArray(mapper.timeMap); - flushInfo.addIntProperty(REVERSE_MAP_LAST_INDEX_PROP, mapper.reverseMapLastIndex); - mapper.reverseMapTimes.getFlushHandler().flush( - flushInfo.newSubProperties(TIMES_SUB_PROP), out); - mapper.reverseMapIds.getFlushHandler().flush( - flushInfo.newSubProperties(IDS_SUB_PROP), out); - } - - @Override - protected OptimizedTimeMapper doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - return new OptimizedTimeMapper( - in.readIntArray(), - flushInfo.getIntProperty(REVERSE_MAP_LAST_INDEX_PROP), - new IntBlockPool.FlushHandler().load(flushInfo.getSubProperties(TIMES_SUB_PROP), in), - new IntBlockPool.FlushHandler().load(flushInfo.getSubProperties(IDS_SUB_PROP), in)); - } - } - - @Override - public TimeMapper optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) { - throw new UnsupportedOperationException("OptimizedTimeMapper instances are already optimized."); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.docx b/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.docx new file mode 100644 index 000000000..ddf1007ac Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.java b/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.java deleted file mode 100644 index a9bdb7b54..000000000 --- a/src/java/com/twitter/search/earlybird/index/OptimizedTweetIDMapper.java +++ /dev/null @@ -1,145 +0,0 @@ -package 
com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.longs.Long2IntMap; -import it.unimi.dsi.fastutil.longs.Long2IntOpenHashMap; -import it.unimi.dsi.fastutil.longs.LongArrays; - -/** - * After a segment is complete, we call {@link EarlybirdSegment#optimizeIndexes()} to compact the - * doc IDs assigned to the tweets in this segment, so that we can do faster ceil and floor lookups. - */ -public class OptimizedTweetIDMapper extends TweetIDMapper { - // Maps doc IDs to tweet IDs. Therefore, it should be sorted in descending order of tweet IDs. - protected final long[] inverseMap; - private final Long2IntMap tweetIdToDocIdMap; - - private OptimizedTweetIDMapper(long[] inverseMap, - long minTweetID, - long maxTweetID, - int minDocID, - int maxDocID) { - super(minTweetID, maxTweetID, minDocID, maxDocID, inverseMap.length); - this.inverseMap = inverseMap; - this.tweetIdToDocIdMap = buildTweetIdToDocIdMap(); - } - - public OptimizedTweetIDMapper(OutOfOrderRealtimeTweetIDMapper source) throws IOException { - super(source.getMinTweetID(), - source.getMaxTweetID(), - 0, - source.getNumDocs() - 1, - source.getNumDocs()); - inverseMap = source.sortTweetIds(); - tweetIdToDocIdMap = buildTweetIdToDocIdMap(); - } - - private Long2IntMap buildTweetIdToDocIdMap() { - int[] values = new int[inverseMap.length]; - for (int i = 0; i < values.length; i++) { - values[i] = i; - } - - Long2IntMap map = new Long2IntOpenHashMap(inverseMap, values); - map.defaultReturnValue(-1); - return map; - } - - @Override - public int getDocID(long tweetID) { - return tweetIdToDocIdMap.getOrDefault(tweetID, ID_NOT_FOUND); - } - - @Override - protected int getNextDocIDInternal(int docID) { - // The doc IDs are consecutive and TweetIDMapper already checked the boundary conditions. - return docID + 1; - } - - @Override - protected int getPreviousDocIDInternal(int docID) { - // The doc IDs are consecutive and TweetIDMapper already checked the boundary conditions. - return docID - 1; - } - - @Override - public long getTweetID(int internalID) { - return inverseMap[internalID]; - } - - @Override - protected int findDocIDBoundInternal(long tweetID, boolean findMaxDocID) { - int docId = tweetIdToDocIdMap.get(tweetID); - if (docId >= 0) { - return docId; - } - - int binarySearchResult = - LongArrays.binarySearch(inverseMap, tweetID, (k1, k2) -> -Long.compare(k1, k2)); - // Since the tweet ID is not present in this mapper, the binary search should return a negative - // value (-insertionPoint - 1). And since TweetIDMapper.findDocIdBound() already verified that - // tweetID is not smaller than all tweet IDs in this mapper, and not larger than all tweet IDs - // in this mapper, the insertionPoint should never be 0 or inverseMap.length. - int insertionPoint = -binarySearchResult - 1; - // The insertion point is the index in the tweet array of the upper bound of the search, so if - // we want the lower bound, because doc IDs are dense, we subtract one. - return findMaxDocID ? 
insertionPoint : insertionPoint - 1; - } - - @Override - protected final int addMappingInternal(final long tweetID) { - throw new UnsupportedOperationException("The OptimizedTweetIDMapper is immutable."); - } - - @Override - public DocIDToTweetIDMapper optimize() { - throw new UnsupportedOperationException("OptimizedTweetIDMapper is already optimized."); - } - - @Override - public FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String MIN_TWEET_ID_PROP_NAME = "MinTweetID"; - private static final String MAX_TWEET_ID_PROP_NAME = "MaxTweetID"; - private static final String MIN_DOC_ID_PROP_NAME = "MinDocID"; - private static final String MAX_DOC_ID_PROP_NAME = "MaxDocID"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OptimizedTweetIDMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) throws IOException { - OptimizedTweetIDMapper objectToFlush = getObjectToFlush(); - flushInfo.addLongProperty(MIN_TWEET_ID_PROP_NAME, objectToFlush.getMinTweetID()); - flushInfo.addLongProperty(MAX_TWEET_ID_PROP_NAME, objectToFlush.getMaxTweetID()); - flushInfo.addIntProperty(MIN_DOC_ID_PROP_NAME, objectToFlush.getMinDocID()); - flushInfo.addIntProperty(MAX_DOC_ID_PROP_NAME, objectToFlush.getMaxDocID()); - out.writeLongArray(objectToFlush.inverseMap); - } - - @Override - protected OptimizedTweetIDMapper doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - return new OptimizedTweetIDMapper(in.readLongArray(), - flushInfo.getLongProperty(MIN_TWEET_ID_PROP_NAME), - flushInfo.getLongProperty(MAX_TWEET_ID_PROP_NAME), - flushInfo.getIntProperty(MIN_DOC_ID_PROP_NAME), - flushInfo.getIntProperty(MAX_DOC_ID_PROP_NAME)); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.docx b/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.docx new file mode 100644 index 000000000..651a9a926 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.java b/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.java deleted file mode 100644 index f03e45f50..000000000 --- a/src/java/com/twitter/search/earlybird/index/OutOfOrderRealtimeTweetIDMapper.java +++ /dev/null @@ -1,531 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; -import java.util.Arrays; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -import it.unimi.dsi.fastutil.ints.Int2ByteOpenHashMap; -import it.unimi.dsi.fastutil.ints.Int2LongMap; -import it.unimi.dsi.fastutil.ints.Int2LongOpenHashMap; - -/** - * A mapper that maps tweet IDs to doc IDs based on the tweet timestamps. 
This mapper guarantees - * that if creationTime(A) > creationTime(B), then docId(A) < docId(B), no matter in which order - * the tweets are added to this mapper. However, if creationTime(A) == creationTime(B), then there - * is no guarantee on the order between docId(A) and docId(B). - * - * Essentially, this mapper guarantees that tweets with a later creation time are mapped to smaller - * doc IDs, but it does not provide any ordering for tweets with the same timestamp (down to - * millisecond granularity, which is what Snowflake provides). Our claim is that ordering tweets - * with the same timestamp is not needed, because for the purposes of realtime search, the only - * significant part of the tweet ID is the timestamp. So any such ordering would just be an ordering - * for the Snowflake shards and/or sequence numbers, rather than a time based ordering for tweets. - * - * The mapper uses the following scheme to assign docIDs to tweets: - * +----------+-----------------------------+------------------------------+ - * | Bit 0 | Bits 1 - 27 | Bits 28 - 31 | - * + ---------+-----------------------------+------------------------------+ - * | sign | tweet ID timestamp - | Allow 16 tweets to be posted | - * | always 0 | segment boundary timestamp | on the same millisecond | - * + ---------+-----------------------------+------------------------------+ - * - * Important assumptions: - * * Snowflake IDs have millisecond granularity. Therefore, 27 bits is enough to represent a time - * period of 2^27 / (3600 * 100) = ~37 hours, which is more than enough to cover one realtime - * segment (our realtime segments currently span ~13 hours). - * * At peak times, the tweet posting rate is less than 10,000 tps. Given our current partitioning - * scheme (22 partitions), each realtime earlybird should expect to get less than 500 tweets per - * second, which comes down to less than 1 tweet per millisecond, assuming the partitioning hash - * function distributes the tweets fairly randomly independent of their timestamps. Therefore, - * providing space for 16 tweets (4 bits) in every millisecond should be more than enough to - * accommodate the current requirements, and any potential future changes (higher tweet rate, - * fewer partitions, etc.). - * - * How the mapper works: - * * The tweetId -> docId conversion is implicit (using the tweet's timestamp). - * * We use a IntToByteMap to store the number of tweets for each timestamp, so that we can - * allocate different doc IDs to tweets posted on the same millisecond. The size of this map is: - * segmentSize * 2 (load factor) * 1 (size of byte) = 16MB - * * The docId -> tweetId mappings are stored in an IntToLongMap. The size of this map is: - * segmentSize * 2 (load factor) * 8 (size of long) = 128MB - * * The mapper takes the "segment boundary" (the timestamp of the timeslice ID) as a parameter. - * This segment boundary determines the earliest tweet that this mapper can correctly index - * (it is subtracted from the timestamp of all tweets added to the mapper). Therefore, in order - * to correctly handle late tweets, we move back this segment boundary by twelve hour. - * * Tweets created before (segment boundary - 12 hours) are stored as if their timestamp was the - * segment boundary. - * * The largest timestamp that the mapper can store is: - * LARGEST_RELATIVE_TIMESTAMP = (1 << TIMESTAMP_BITS) - LUCENE_TIMESTAMP_BUFFER. 
- * Tweets created after (segmentBoundaryTimestamp + LARGEST_RELATIVE_TIMESTAMP) are stored as if - * their timestamp was (segmentBoundaryTimestamp + LARGEST_RELATIVE_TIMESTAMP). - * * When a tweet is added, we compute its doc ID as: - * int relativeTimestamp = tweetTimestamp - segmentBoundaryTimestamp; - * int docIdTimestamp = LARGEST_RELATIVE_TIMESTAMP - relativeTimestamp; - * int numTweetsForTimestamp = tweetsPerTimestamp.get(docIdTimestamp); - * int docId = (docIdTimestamp << DOC_ID_BITS) - * + MAX_DOCS_PER_TIMESTAMP - numTweetsForTimestamp - 1 - * - * This doc ID distribution scheme guarantees that tweets created later will be assigned smaller doc - * IDs (as long as we don't have more than 16 tweets created in the same millisecond). However, - * there is no ordering guarantee for tweets created at the same timestamp -- they are assigned doc - * IDs in the order in which they're added to the mapper. - * - * If we have more than 16 tweets created at time T, the mapper will still gracefully handle that - * case: the "extra" tweets will be assigned doc IDs from the pool of doc IDs for timestamp (T + 1). - * However, the ordering guarantee might no longer hold for those "extra" tweets. Also, the "extra" - * tweets might be missed by certain since_id/max_id queries (the findDocIdBound() method might not - * be able to correctly work for these tweet IDs). - */ -public class OutOfOrderRealtimeTweetIDMapper extends TweetIDMapper { - private static final Logger LOG = LoggerFactory.getLogger(OutOfOrderRealtimeTweetIDMapper.class); - - // The number of bits used to represent the tweet timestamp. - private static final int TIMESTAMP_BITS = 27; - - // The number of bits used to represent the number of tweets with a certain timestamp. - @VisibleForTesting - static final int DOC_ID_BITS = Integer.SIZE - TIMESTAMP_BITS - 1; - - // The maximum number of tweets/docs that we can store per timestamp. - @VisibleForTesting - static final int MAX_DOCS_PER_TIMESTAMP = 1 << DOC_ID_BITS; - - // Lucene has some logic that doesn't deal well with doc IDs close to Integer.MAX_VALUE. - // For example, BooleanScorer has a SIZE constant set to 2048, which gets added to the doc IDs - // inside the score() method. So when the doc IDs are close to Integer.MAX_VALUE, this causes an - // overflow, which can send Lucene into an infinite loop. Therefore, we need to make sure that - // we do not assign doc IDs close to Integer.MAX_VALUE. - private static final int LUCENE_TIMESTAMP_BUFFER = 1 << 16; - - @VisibleForTesting - public static final int LATE_TWEETS_TIME_BUFFER_MILLIS = 12 * 3600 * 1000; // 12 hours - - // The largest relative timestamp that this mapper can store. 
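A worked example of the doc ID packing described in the class comment above, using the constants defined just below (DOC_ID_BITS = 4, MAX_DOCS_PER_TIMESTAMP = 16, LARGEST_RELATIVE_TIMESTAMP = 2^27 - 2^16 = 134,152,192); the concrete timestamps are made up for illustration:

    // First tweet to arrive exactly 1,000 ms after the segment boundary:
    int relativeTimestamp = 1000;                           // tweetTimestamp - segmentBoundaryTimestamp
    int docIdTimestamp = 134_152_192 - relativeTimestamp;   // = 134,151,192
    int docId = (docIdTimestamp << 4) + 16 - 0 - 1;         // = 2,146,419,087

    // A second tweet in the same millisecond gets 2,146,419,086, and the first tweet of the
    // next millisecond gets 2,146,419,071 (16 lower), so later creation times always map to
    // smaller doc IDs, exactly as the class comment claims.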
- @VisibleForTesting - static final int LARGEST_RELATIVE_TIMESTAMP = (1 << TIMESTAMP_BITS) - LUCENE_TIMESTAMP_BUFFER; - - private final long segmentBoundaryTimestamp; - private final int segmentSize; - - private final Int2LongOpenHashMap tweetIds; - private final Int2ByteOpenHashMap tweetsPerTimestamp; - - private static final SearchRateCounter BAD_BUCKET_RATE = - SearchRateCounter.export("tweets_assigned_to_bad_timestamp_bucket"); - private static final SearchRateCounter TWEETS_NOT_ASSIGNED_RATE = - SearchRateCounter.export("tweets_not_assigned"); - private static final SearchRateCounter OLD_TWEETS_DROPPED = - SearchRateCounter.export("old_tweets_dropped"); - - public OutOfOrderRealtimeTweetIDMapper(int segmentSize, long timesliceID) { - long firstTimestamp = SnowflakeIdParser.getTimestampFromTweetId(timesliceID); - // Leave a buffer so that we can handle tweets that are up to twelve hours late. - this.segmentBoundaryTimestamp = firstTimestamp - LATE_TWEETS_TIME_BUFFER_MILLIS; - this.segmentSize = segmentSize; - - tweetIds = new Int2LongOpenHashMap(segmentSize); - tweetIds.defaultReturnValue(ID_NOT_FOUND); - - tweetsPerTimestamp = new Int2ByteOpenHashMap(segmentSize); - tweetsPerTimestamp.defaultReturnValue((byte) ID_NOT_FOUND); - } - - @VisibleForTesting - int getDocIdTimestamp(long tweetId) { - long tweetTimestamp = SnowflakeIdParser.getTimestampFromTweetId(tweetId); - if (tweetTimestamp < segmentBoundaryTimestamp) { - return ID_NOT_FOUND; - } - - long relativeTimestamp = tweetTimestamp - segmentBoundaryTimestamp; - if (relativeTimestamp > LARGEST_RELATIVE_TIMESTAMP) { - relativeTimestamp = LARGEST_RELATIVE_TIMESTAMP; - } - - return LARGEST_RELATIVE_TIMESTAMP - (int) relativeTimestamp; - } - - private int getDocIdForTimestamp(int docIdTimestamp, byte docIndexInTimestamp) { - return (docIdTimestamp << DOC_ID_BITS) + MAX_DOCS_PER_TIMESTAMP - docIndexInTimestamp; - } - - @VisibleForTesting - long[] getTweetsForDocIdTimestamp(int docIdTimestamp) { - byte numDocsForTimestamp = tweetsPerTimestamp.get(docIdTimestamp); - if (numDocsForTimestamp == ID_NOT_FOUND) { - // This should never happen in prod, but better to be safe. - return new long[0]; - } - - long[] tweetIdsInBucket = new long[numDocsForTimestamp]; - int startingDocId = (docIdTimestamp << DOC_ID_BITS) + MAX_DOCS_PER_TIMESTAMP - 1; - for (int i = 0; i < numDocsForTimestamp; ++i) { - tweetIdsInBucket[i] = tweetIds.get(startingDocId - i); - } - return tweetIdsInBucket; - } - - private int newDocId(long tweetId) { - int expectedDocIdTimestamp = getDocIdTimestamp(tweetId); - if (expectedDocIdTimestamp == ID_NOT_FOUND) { - LOG.info("Dropping tweet {} because it is from before the segment boundary timestamp {}", - tweetId, - segmentBoundaryTimestamp); - OLD_TWEETS_DROPPED.increment(); - return ID_NOT_FOUND; - } - - int docIdTimestamp = expectedDocIdTimestamp; - byte numDocsForTimestamp = tweetsPerTimestamp.get(docIdTimestamp); - - if (numDocsForTimestamp == MAX_DOCS_PER_TIMESTAMP) { - BAD_BUCKET_RATE.increment(); - } - - while ((docIdTimestamp > 0) && (numDocsForTimestamp == MAX_DOCS_PER_TIMESTAMP)) { - --docIdTimestamp; - numDocsForTimestamp = tweetsPerTimestamp.get(docIdTimestamp); - } - - if (numDocsForTimestamp == MAX_DOCS_PER_TIMESTAMP) { - // The relative timestamp 0 already has MAX_DOCS_PER_TIMESTAMP. Can't add more docs. 
- LOG.error("Tweet {} could not be assigned a doc ID in any bucket, because the bucket for " - + "timestamp 0 is already full: {}", - tweetId, Arrays.toString(getTweetsForDocIdTimestamp(0))); - TWEETS_NOT_ASSIGNED_RATE.increment(); - return ID_NOT_FOUND; - } - - if (docIdTimestamp != expectedDocIdTimestamp) { - LOG.warn("Tweet {} could not be assigned a doc ID in the bucket for its timestamp {}, " - + "because this bucket is full. Instead, it was assigned a doc ID in the bucket for " - + "timestamp {}. The tweets in the correct bucket are: {}", - tweetId, - expectedDocIdTimestamp, - docIdTimestamp, - Arrays.toString(getTweetsForDocIdTimestamp(expectedDocIdTimestamp))); - } - - if (numDocsForTimestamp == ID_NOT_FOUND) { - numDocsForTimestamp = 0; - } - ++numDocsForTimestamp; - tweetsPerTimestamp.put(docIdTimestamp, numDocsForTimestamp); - - return getDocIdForTimestamp(docIdTimestamp, numDocsForTimestamp); - } - - @Override - public int getDocID(long tweetId) { - int docIdTimestamp = getDocIdTimestamp(tweetId); - while (docIdTimestamp >= 0) { - int numDocsForTimestamp = tweetsPerTimestamp.get(docIdTimestamp); - int startingDocId = (docIdTimestamp << DOC_ID_BITS) + MAX_DOCS_PER_TIMESTAMP - 1; - for (int docId = startingDocId; docId > startingDocId - numDocsForTimestamp; --docId) { - if (tweetIds.get(docId) == tweetId) { - return docId; - } - } - - // If we have MAX_DOCS_PER_TIMESTAMP docs with this timestamp, then we might've mis-assigned - // a tweet to the previous docIdTimestamp bucket. In that case, we need to keep searching. - // Otherwise, the tweet is not in the index. - if (numDocsForTimestamp < MAX_DOCS_PER_TIMESTAMP) { - break; - } - - --docIdTimestamp; - } - - return ID_NOT_FOUND; - } - - @Override - protected int getNextDocIDInternal(int docId) { - // Check if docId + 1 is an assigned doc ID in this mapper. This might be the case when we have - // multiple tweets posted on the same millisecond. - if (tweetIds.get(docId + 1) != ID_NOT_FOUND) { - return docId + 1; - } - - // If (docId + 1) is not assigned, then it means we do not have any more tweets posted at the - // timestamp corresponding to docId. We need to find the next relative timestamp for which this - // mapper has tweets, and return the first tweet for that timestamp. Note that iterating over - // the space of all possible timestamps is faster than iterating over the space of all possible - // doc IDs (it's MAX_DOCS_PER_TIMESTAMP times faster). - int nextDocIdTimestamp = (docId >> DOC_ID_BITS) + 1; - byte numDocsForTimestamp = tweetsPerTimestamp.get(nextDocIdTimestamp); - int maxDocIdTimestamp = getMaxDocID() >> DOC_ID_BITS; - while ((nextDocIdTimestamp <= maxDocIdTimestamp) - && (numDocsForTimestamp == ID_NOT_FOUND)) { - ++nextDocIdTimestamp; - numDocsForTimestamp = tweetsPerTimestamp.get(nextDocIdTimestamp); - } - - if (numDocsForTimestamp != ID_NOT_FOUND) { - return getDocIdForTimestamp(nextDocIdTimestamp, numDocsForTimestamp); - } - - return ID_NOT_FOUND; - } - - @Override - protected int getPreviousDocIDInternal(int docId) { - // Check if docId - 1 is an assigned doc ID in this mapper. This might be the case when we have - // multiple tweets posted on the same millisecond. - if (tweetIds.get(docId - 1) != ID_NOT_FOUND) { - return docId - 1; - } - - // If (docId - 1) is not assigned, then it means we do not have any more tweets posted at the - // timestamp corresponding to docId. We need to find the previous relative timestamp for which - // this mapper has tweets, and return the first tweet for that timestamp. 
Note that iterating - // over the space of all possible timestamps is faster than iterating over the space of all - // possible doc IDs (it's MAX_DOCS_PER_TIMESTAMP times faster). - int previousDocIdTimestamp = (docId >> DOC_ID_BITS) - 1; - byte numDocsForTimestamp = tweetsPerTimestamp.get(previousDocIdTimestamp); - int minDocIdTimestamp = getMinDocID() >> DOC_ID_BITS; - while ((previousDocIdTimestamp >= minDocIdTimestamp) - && (numDocsForTimestamp == ID_NOT_FOUND)) { - --previousDocIdTimestamp; - numDocsForTimestamp = tweetsPerTimestamp.get(previousDocIdTimestamp); - } - - if (numDocsForTimestamp != ID_NOT_FOUND) { - return getDocIdForTimestamp(previousDocIdTimestamp, (byte) 1); - } - - return ID_NOT_FOUND; - } - - @Override - public long getTweetID(int docId) { - return tweetIds.get(docId); - } - - @Override - protected int addMappingInternal(long tweetId) { - int docId = newDocId(tweetId); - if (docId == ID_NOT_FOUND) { - return ID_NOT_FOUND; - } - - tweetIds.put(docId, tweetId); - return docId; - } - - @Override - protected int findDocIDBoundInternal(long tweetId, boolean findMaxDocId) { - // Note that it would be incorrect to lookup the doc ID for the given tweet ID and return that - // doc ID, as we would skip over tweets created in the same millisecond but with a lower doc ID. - int docIdTimestamp = getDocIdTimestamp(tweetId); - - // The docIdTimestamp is ID_NOT_FOUND only if the tweet is from before the segment boundary and - // this should never happen here because TweetIDMapper.findDocIdBound ensures that the tweet id - // passed into this method is >= minTweetID which means the tweet is from after the segment - // boundary. - Preconditions.checkState( - docIdTimestamp != ID_NOT_FOUND, - "Tried to find doc id bound for tweet %d which is from before the segment boundary %d", - tweetId, - segmentBoundaryTimestamp); - - // It's OK to return a doc ID that doesn't correspond to any tweet ID in the index, - // as the doc ID is simply used as a starting point and ending point for range queries, - // not a source of truth. - if (findMaxDocId) { - // Return the largest possible doc ID for the timestamp. - return getDocIdForTimestamp(docIdTimestamp, (byte) 1); - } else { - // Return the smallest possible doc ID for the timestamp. - byte tweetsInTimestamp = tweetsPerTimestamp.getOrDefault(docIdTimestamp, (byte) 0); - return getDocIdForTimestamp(docIdTimestamp, tweetsInTimestamp); - } - } - - /** - * Returns the array of all tweet IDs stored in this mapper in a sorted (descending) order. - * Essentially, this method remaps all tweet IDs stored in this mapper to a compressed doc ID - * space of [0, numDocs). - * - * Note that this method is not thread safe, and it's meant to be called only at segment - * optimization time. If addMappingInternal() is called during the execution of this method, - * the behavior is undefined (it will most likely return bad results or throw an exception). - * - * @return An array of all tweet IDs stored in this mapper, in a sorted (descending) order. - */ - public long[] sortTweetIds() { - int numDocs = getNumDocs(); - if (numDocs == 0) { - return new long[0]; - } - - // Add all tweets stored in this mapper to sortTweetIds. - long[] sortedTweetIds = new long[numDocs]; - int sortedTweetIdsIndex = 0; - for (int docId = getMinDocID(); docId != ID_NOT_FOUND; docId = getNextDocID(docId)) { - sortedTweetIds[sortedTweetIdsIndex++] = getTweetID(docId); - } - Preconditions.checkState(sortedTweetIdsIndex == numDocs, - "Could not traverse all documents in the mapper. 
Expected to find " - + numDocs + " docs, but found only " + sortedTweetIdsIndex); - - // Sort sortedTweetIdsIndex in descending order. There's no way to sort a primitive array in - // descending order, so we have to sort it in ascending order and then reverse it. - Arrays.sort(sortedTweetIds); - for (int i = 0; i < numDocs / 2; ++i) { - long tmp = sortedTweetIds[i]; - sortedTweetIds[i] = sortedTweetIds[numDocs - 1 - i]; - sortedTweetIds[numDocs - 1 - i] = tmp; - } - - return sortedTweetIds; - } - - @Override - public DocIDToTweetIDMapper optimize() throws IOException { - return new OptimizedTweetIDMapper(this); - } - - /** - * Returns the largest Tweet ID that this doc ID mapper could handle. The returned Tweet ID - * would be safe to put into the mapper, but any larger ones would not be correctly handled. - */ - public static long calculateMaxTweetID(long timesliceID) { - long numberOfUsableTimestamps = LARGEST_RELATIVE_TIMESTAMP - LATE_TWEETS_TIME_BUFFER_MILLIS; - long firstTimestamp = SnowflakeIdParser.getTimestampFromTweetId(timesliceID); - long lastTimestamp = firstTimestamp + numberOfUsableTimestamps; - return SnowflakeIdParser.generateValidStatusId( - lastTimestamp, SnowflakeIdParser.RESERVED_BITS_MASK); - } - - /** - * Evaluates whether two instances of OutOfOrderRealtimeTweetIDMapper are equal by value. It is - * slow because it has to check every tweet ID/doc ID in the map. - */ - @VisibleForTesting - boolean verySlowEqualsForTests(OutOfOrderRealtimeTweetIDMapper that) { - return getMinTweetID() == that.getMinTweetID() - && getMaxTweetID() == that.getMaxTweetID() - && getMinDocID() == that.getMinDocID() - && getMaxDocID() == that.getMaxDocID() - && segmentBoundaryTimestamp == that.segmentBoundaryTimestamp - && segmentSize == that.segmentSize - && tweetsPerTimestamp.equals(that.tweetsPerTimestamp) - && tweetIds.equals(that.tweetIds); - } - - @Override - public OutOfOrderRealtimeTweetIDMapper.FlushHandler getFlushHandler() { - return new OutOfOrderRealtimeTweetIDMapper.FlushHandler(this); - } - - private OutOfOrderRealtimeTweetIDMapper( - long minTweetID, - long maxTweetID, - int minDocID, - int maxDocID, - long segmentBoundaryTimestamp, - int segmentSize, - int[] docIDs, - long[] tweetIDList - ) { - super(minTweetID, maxTweetID, minDocID, maxDocID, docIDs.length); - - Preconditions.checkState(docIDs.length == tweetIDList.length); - - this.segmentBoundaryTimestamp = segmentBoundaryTimestamp; - this.segmentSize = segmentSize; - - tweetIds = new Int2LongOpenHashMap(segmentSize); - tweetIds.defaultReturnValue(ID_NOT_FOUND); - - tweetsPerTimestamp = new Int2ByteOpenHashMap(segmentSize); - tweetsPerTimestamp.defaultReturnValue((byte) ID_NOT_FOUND); - - for (int i = 0; i < docIDs.length; i++) { - int docID = docIDs[i]; - long tweetID = tweetIDList[i]; - tweetIds.put(docID, tweetID); - - int timestampBucket = docID >> DOC_ID_BITS; - if (tweetsPerTimestamp.containsKey(timestampBucket)) { - tweetsPerTimestamp.addTo(timestampBucket, (byte) 1); - } else { - tweetsPerTimestamp.put(timestampBucket, (byte) 1); - } - } - } - - public static class FlushHandler extends Flushable.Handler { - private static final String MIN_TWEET_ID_PROP_NAME = "MinTweetID"; - private static final String MAX_TWEET_ID_PROP_NAME = "MaxTweetID"; - private static final String MIN_DOC_ID_PROP_NAME = "MinDocID"; - private static final String MAX_DOC_ID_PROP_NAME = "MaxDocID"; - private static final String SEGMENT_BOUNDARY_TIMESTAMP_PROP_NAME = "SegmentBoundaryTimestamp"; - private static final String SEGMENT_SIZE_PROP_NAME 
= "SegmentSize"; - - public FlushHandler() { - super(); - } - - public FlushHandler(OutOfOrderRealtimeTweetIDMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer serializer) throws IOException { - OutOfOrderRealtimeTweetIDMapper mapper = getObjectToFlush(); - - flushInfo.addLongProperty(MIN_TWEET_ID_PROP_NAME, mapper.getMinTweetID()); - flushInfo.addLongProperty(MAX_TWEET_ID_PROP_NAME, mapper.getMaxTweetID()); - flushInfo.addIntProperty(MIN_DOC_ID_PROP_NAME, mapper.getMinDocID()); - flushInfo.addIntProperty(MAX_DOC_ID_PROP_NAME, mapper.getMaxDocID()); - flushInfo.addLongProperty(SEGMENT_BOUNDARY_TIMESTAMP_PROP_NAME, - mapper.segmentBoundaryTimestamp); - flushInfo.addIntProperty(SEGMENT_SIZE_PROP_NAME, mapper.segmentSize); - - serializer.writeInt(mapper.tweetIds.size()); - for (Int2LongMap.Entry entry : mapper.tweetIds.int2LongEntrySet()) { - serializer.writeInt(entry.getIntKey()); - serializer.writeLong(entry.getLongValue()); - } - } - - @Override - protected OutOfOrderRealtimeTweetIDMapper doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - - int size = in.readInt(); - int[] docIds = new int[size]; - long[] tweetIds = new long[size]; - for (int i = 0; i < size; i++) { - docIds[i] = in.readInt(); - tweetIds[i] = in.readLong(); - } - - return new OutOfOrderRealtimeTweetIDMapper( - flushInfo.getLongProperty(MIN_TWEET_ID_PROP_NAME), - flushInfo.getLongProperty(MAX_TWEET_ID_PROP_NAME), - flushInfo.getIntProperty(MIN_DOC_ID_PROP_NAME), - flushInfo.getIntProperty(MAX_DOC_ID_PROP_NAME), - flushInfo.getLongProperty(SEGMENT_BOUNDARY_TIMESTAMP_PROP_NAME), - flushInfo.getIntProperty(SEGMENT_SIZE_PROP_NAME), - docIds, - tweetIds); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.docx b/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.docx new file mode 100644 index 000000000..29ac793df Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.java b/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.java deleted file mode 100644 index d78d971e6..000000000 --- a/src/java/com/twitter/search/earlybird/index/RealtimeTimeMapper.java +++ /dev/null @@ -1,149 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.inverted.IntBlockPool; - -import it.unimi.dsi.fastutil.ints.Int2IntMap; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; - -/** - * Maps 32-bit document IDs to seconds-since-epoch timestamps. - */ -public class RealtimeTimeMapper extends AbstractInMemoryTimeMapper { - // Doc id to timestamp map. Timestamps that are negative are out-of-order. 
- protected final Int2IntOpenHashMap timeMap; - private final int capacity; - - public RealtimeTimeMapper(int capacity) { - super(); - this.capacity = capacity; - - timeMap = new Int2IntOpenHashMap(capacity); - timeMap.defaultReturnValue(ILLEGAL_TIME); - } - - @Override - public int getTime(int docID) { - return timeMap.get(docID); - } - - @Override - protected void setTime(int docID, int timeSeconds) { - timeMap.put(docID, timeSeconds); - } - - public final void addMapping(int docID, int timeSeconds) { - doAddMapping(docID, timeSeconds); - } - - @Override - public TimeMapper optimize(DocIDToTweetIDMapper originalTweetIdMapper, - DocIDToTweetIDMapper optimizedTweetIdMapper) throws IOException { - return new OptimizedTimeMapper(this, originalTweetIdMapper, optimizedTweetIdMapper); - } - - /** - * Evaluates whether two instances of RealtimeTimeMapper are equal by value. It is - * slow because it has to check every tweet ID/timestamp in the map. - */ - @VisibleForTesting - boolean verySlowEqualsForTests(RealtimeTimeMapper that) { - return reverseMapLastIndex == that.reverseMapLastIndex - && reverseMapIds.verySlowEqualsForTests(that.reverseMapIds) - && reverseMapTimes.verySlowEqualsForTests(that.reverseMapTimes) - && capacity == that.capacity - && timeMap.equals(that.timeMap); - } - - private RealtimeTimeMapper( - int capacity, - int reverseMapLastIndex, - int[] docIds, - int[] timestamps, - IntBlockPool reverseMapTimes, - IntBlockPool reverseMapIds - ) { - super(reverseMapLastIndex, reverseMapTimes, reverseMapIds); - - this.capacity = capacity; - - timeMap = new Int2IntOpenHashMap(capacity); - timeMap.defaultReturnValue(ILLEGAL_TIME); - - Preconditions.checkState(docIds.length == timestamps.length); - - for (int i = 0; i < docIds.length; i++) { - timeMap.put(docIds[i], timestamps[i]); - } - } - - @Override - public RealtimeTimeMapper.FlushHandler getFlushHandler() { - return new RealtimeTimeMapper.FlushHandler(this); - } - - public static class FlushHandler extends Flushable.Handler { - private static final String REVERSE_MAP_LAST_INDEX_PROP = "reverseMapLastIndex"; - private static final String TIMES_SUB_PROP = "times"; - private static final String IDS_SUB_PROP = "ids"; - private static final String CAPACITY_PROP = "capacity"; - - public FlushHandler() { - super(); - } - - public FlushHandler(RealtimeTimeMapper objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer serializer) throws IOException { - RealtimeTimeMapper mapper = getObjectToFlush(); - - flushInfo.addIntProperty(CAPACITY_PROP, mapper.capacity); - flushInfo.addIntProperty(REVERSE_MAP_LAST_INDEX_PROP, mapper.reverseMapLastIndex); - - serializer.writeInt(mapper.timeMap.size()); - for (Int2IntMap.Entry entry : mapper.timeMap.int2IntEntrySet()) { - serializer.writeInt(entry.getIntKey()); - serializer.writeInt(entry.getIntValue()); - } - - mapper.reverseMapTimes.getFlushHandler().flush( - flushInfo.newSubProperties(TIMES_SUB_PROP), serializer); - mapper.reverseMapIds.getFlushHandler().flush( - flushInfo.newSubProperties(IDS_SUB_PROP), serializer); - } - - @Override - protected RealtimeTimeMapper doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - - int size = in.readInt(); - int[] docIds = new int[size]; - int[] timestamps = new int[size]; - for (int i = 0; i < size; i++) { - docIds[i] = in.readInt(); - timestamps[i] = in.readInt(); - } - - return new RealtimeTimeMapper( - flushInfo.getIntProperty(CAPACITY_PROP), - 
flushInfo.getIntProperty(REVERSE_MAP_LAST_INDEX_PROP), - docIds, - timestamps, - new IntBlockPool.FlushHandler().load(flushInfo.getSubProperties(TIMES_SUB_PROP), in), - new IntBlockPool.FlushHandler().load(flushInfo.getSubProperties(IDS_SUB_PROP), in)); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.docx b/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.docx new file mode 100644 index 000000000..684490de8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.java b/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.java deleted file mode 100644 index acc1cafe7..000000000 --- a/src/java/com/twitter/search/earlybird/index/TimeMappingWriter.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import org.apache.lucene.util.AttributeSource; - -import com.twitter.search.common.util.analysis.IntTermAttribute; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter; - -public class TimeMappingWriter implements EarlybirdRealtimeIndexSegmentWriter.InvertedDocConsumer { - private IntTermAttribute termAtt; - private final RealtimeTimeMapper mapper; - - public TimeMappingWriter(RealtimeTimeMapper mapper) { - this.mapper = mapper; - } - - @Override - public final void start(AttributeSource attributeSource, boolean currentDocIsOffensive) { - termAtt = attributeSource.addAttribute(IntTermAttribute.class); - } - - @Override - public final void add(int docId, int position) throws IOException { - final int timeSec = termAtt.getTerm(); - mapper.addMapping(docId, timeSec); - } - - @Override - public void finish() { - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDMapper.docx b/src/java/com/twitter/search/earlybird/index/TweetIDMapper.docx new file mode 100644 index 000000000..7ed369af9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TweetIDMapper.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDMapper.java b/src/java/com/twitter/search/earlybird/index/TweetIDMapper.java deleted file mode 100644 index e97d58cad..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetIDMapper.java +++ /dev/null @@ -1,183 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public abstract class TweetIDMapper implements DocIDToTweetIDMapper, Flushable { - private long minTweetID; - private long maxTweetID; - private int minDocID; - private int maxDocID; - private int numDocs; - - protected TweetIDMapper() { - this(Long.MAX_VALUE, Long.MIN_VALUE, Integer.MAX_VALUE, Integer.MIN_VALUE, 0); - } - - protected TweetIDMapper( - long minTweetID, long maxTweetID, int minDocID, int maxDocID, int numDocs) { - this.minTweetID = minTweetID; - this.maxTweetID = maxTweetID; - this.minDocID = minDocID; - this.maxDocID = maxDocID; - this.numDocs = numDocs; - } - - // Realtime updates minTweetID and maxTweetID in addMapping. - // Archives updates minTweetID and maxTweetID in prepareToRead. 
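These bounds are what let findDocIdBound (defined further down in this file) short-circuit: a tweet ID newer than maxTweetID clamps to the smallest doc ID passed in, and one older than minTweetID clamps to the largest, because doc IDs run in the opposite direction of tweet IDs in these mappers. A hedged sketch of a typical call; sinceId, smallestDocID and largestDocID are hypothetical values supplied by the caller:

    // Largest doc ID that could still hold tweets as old as sinceId, clamped to this segment's
    // doc ID range. Passing findMaxDocID = true asks for the upper end of the millisecond
    // bucket, so no tweet in that bucket is skipped by the range filter.
    int highestDocIdToScan = mapper.findDocIdBound(sinceId, true, smallestDocID, largestDocID);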
- protected void setMinTweetID(long minTweetID) { - this.minTweetID = minTweetID; - } - - protected void setMaxTweetID(long maxTweetID) { - this.maxTweetID = maxTweetID; - } - - protected void setMinDocID(int minDocID) { - this.minDocID = minDocID; - } - - protected void setMaxDocID(int maxDocID) { - this.maxDocID = maxDocID; - } - - protected void setNumDocs(int numDocs) { - this.numDocs = numDocs; - } - - public long getMinTweetID() { - return this.minTweetID; - } - - public long getMaxTweetID() { - return this.maxTweetID; - } - - public int getMinDocID() { - return minDocID; - } - - public int getMaxDocID() { - return maxDocID; - } - - @Override - public int getNumDocs() { - return numDocs; - } - - /** - * Given a tweetId, find the corresponding doc ID to start, or end, a search. - * - * In the ordered, dense doc ID mappers, this returns either the doc ID assigned to the tweet ID, - * or doc ID of the next lowest tweet ID, if the tweet is not in the index. In this case - * findMaxDocID is ignored. - * - * In {@link OutOfOrderRealtimeTweetIDMapper}, doc IDs are not ordered within a millisecond, so we - * want to search the entire millisecond bucket for a filter. To accomplish this, - * if findMaxDocId is true we return the largest possible doc ID for that millisecond. - * If findMaxDocId is false, we return the smallest possible doc ID for that millisecond. - * - * The returned doc ID will be between smallestDocID and largestDocID (inclusive). - * The returned doc ID may not be in the index. - */ - public int findDocIdBound(long tweetID, - boolean findMaxDocID, - int smallestDocID, - int largestDocID) throws IOException { - if (tweetID > maxTweetID) { - return smallestDocID; - } - if (tweetID < minTweetID) { - return largestDocID; - } - - int internalID = findDocIDBoundInternal(tweetID, findMaxDocID); - - return Math.max(smallestDocID, Math.min(largestDocID, internalID)); - } - - @Override - public final int getNextDocID(int docID) { - if (numDocs <= 0) { - return ID_NOT_FOUND; - } - if (docID < minDocID) { - return minDocID; - } - if (docID >= maxDocID) { - return ID_NOT_FOUND; - } - return getNextDocIDInternal(docID); - } - - @Override - public final int getPreviousDocID(int docID) { - if (numDocs <= 0) { - return ID_NOT_FOUND; - } - if (docID <= minDocID) { - return ID_NOT_FOUND; - } - if (docID > maxDocID) { - return maxDocID; - } - return getPreviousDocIDInternal(docID); - } - - @Override - public int addMapping(final long tweetID) { - int docId = addMappingInternal(tweetID); - if (docId != ID_NOT_FOUND) { - ++numDocs; - if (tweetID > maxTweetID) { - maxTweetID = tweetID; - } - if (tweetID < minTweetID) { - minTweetID = tweetID; - } - if (docId > maxDocID) { - maxDocID = docId; - } - if (docId < minDocID) { - minDocID = docId; - } - } - - return docId; - } - - /** - * Returns the smallest valid doc ID in this mapper that's strictly higher than the given doc ID. - * If no such doc ID exists, ID_NOT_FOUND must be returned. - * - * The given docID is guaranteed to be in the range [minDocID, maxDocID). - * - * @param docID The current doc ID. - * @return The smallest valid doc ID in this mapper that's strictly higher than the given doc ID, - * or a negative number, if no such doc ID exists. - */ - protected abstract int getNextDocIDInternal(int docID); - - /** - * Returns the smallest valid doc ID in this mapper that's strictly higher than the given doc ID. - * If no such doc ID exists, ID_NOT_FOUND must be returned. 
- * - * The given docID is guaranteed to be in the range (minDocID, maxDocID]. - * - * @param docID The current doc ID. - * @return The smallest valid doc ID in this mapper that's strictly higher than the given doc ID, - * or a negative number, if no such doc ID exists. - */ - protected abstract int getPreviousDocIDInternal(int docID); - - protected abstract int addMappingInternal(final long tweetID); - - /** - * See {@link TweetIDMapper#findDocIdBound}. - */ - protected abstract int findDocIDBoundInternal(long tweetID, - boolean findMaxDocID) throws IOException; -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDQuery.docx b/src/java/com/twitter/search/earlybird/index/TweetIDQuery.docx new file mode 100644 index 000000000..3f9ce5d9a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TweetIDQuery.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDQuery.java b/src/java/com/twitter/search/earlybird/index/TweetIDQuery.java deleted file mode 100644 index 6f4c39a02..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetIDQuery.java +++ /dev/null @@ -1,81 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; -import java.util.Arrays; -import java.util.Set; - -import com.google.common.collect.Sets; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -import com.twitter.search.common.query.DefaultFilterWeight; -import com.twitter.search.common.search.IntArrayDocIdSetIterator; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; - -public class TweetIDQuery extends Query { - private final Set tweetIDs = Sets.newHashSet(); - - public TweetIDQuery(long... 
tweetIDs) { - for (long tweetID : tweetIDs) { - this.tweetIDs.add(tweetID); - } - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { - return new DefaultFilterWeight(this) { - @Override - protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) throws IOException { - EarlybirdIndexSegmentData segmentData = - ((EarlybirdIndexSegmentAtomicReader) context.reader()).getSegmentData(); - DocIDToTweetIDMapper docIdToTweetIdMapper = segmentData.getDocIDToTweetIDMapper(); - - Set set = Sets.newHashSet(); - for (long tweetID : tweetIDs) { - int docID = docIdToTweetIdMapper.getDocID(tweetID); - if (docID != DocIDToTweetIDMapper.ID_NOT_FOUND) { - set.add(docID); - } - } - - if (set.isEmpty()) { - return DocIdSetIterator.empty(); - } - - int[] docIDs = new int[set.size()]; - int i = 0; - for (int docID : set) { - docIDs[i++] = docID; - } - Arrays.sort(docIDs); - return new IntArrayDocIdSetIterator(docIDs); - } - }; - } - - @Override - public int hashCode() { - return tweetIDs.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof TweetIDQuery)) { - return false; - } - - return tweetIDs.equals(TweetIDQuery.class.cast(obj).tweetIDs); - } - - @Override - public String toString(String field) { - return "TWEET_ID_QUERY: " + tweetIDs; - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.docx b/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.docx new file mode 100644 index 000000000..73adc108e Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.java b/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.java deleted file mode 100644 index 87204a623..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetIDToInternalIDMap.java +++ /dev/null @@ -1,154 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; -import java.util.Arrays; - -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.Flushable; -import com.twitter.search.core.earlybird.index.DocIDToTweetIDMapper; - -public final class TweetIDToInternalIDMap implements Flushable { - private final int size; - private final int[] hash; - public final int halfSize; - private final int mask; - public int numMappings; - - static final int PRIME_NUMBER = 37; - - // For FlushHandler.load() use only - private TweetIDToInternalIDMap(final int[] hash, - final int numMappings) { - this.hash = hash; - this.size = hash.length; - this.halfSize = size >> 1; - this.mask = size - 1; - this.numMappings = numMappings; - } - - TweetIDToInternalIDMap(final int size) { - this.hash = new int[size]; - Arrays.fill(hash, DocIDToTweetIDMapper.ID_NOT_FOUND); - this.size = size; - this.halfSize = size >> 1; - this.mask = size - 1; - this.numMappings = 0; - } - - // Slightly different hash function from the one used to partition tweets to Earlybirds. 
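The methods below implement open addressing with double hashing: hashPos masks the hash code with size - 1 (so the table size is expected to be a power of two), and the probe step from incrementHashCode is forced odd, which keeps it coprime with the power-of-two table size so the probe sequence can reach every slot. Only doc IDs are stored in the table, so a slot is matched by checking the tweet ID through inverseMap. Condensed, the lookup in get() amounts to:

    int code = hashCode(tweetID);
    int pos = hashPos(code);                                // code & (size - 1)
    if (hash[pos] != DocIDToTweetIDMapper.ID_NOT_FOUND && inverseMap[hash[pos]] != tweetID) {
      int inc = incrementHashCode(code);                    // ((code >> 8) + code) | 1, always odd
      do {
        code += inc;
        pos = hashPos(code);
      } while (hash[pos] != DocIDToTweetIDMapper.ID_NOT_FOUND && inverseMap[hash[pos]] != tweetID);
    }
    return hash[pos];                                       // the doc ID, or ID_NOT_FOUND if absent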
- protected static int hashCode(final long tweetID) { - long timestamp = SnowflakeIdParser.getTimestampFromTweetId(tweetID); - int code = (int) ((timestamp - 1) ^ (timestamp >>> 32)); - code = PRIME_NUMBER * (int) (tweetID & SnowflakeIdParser.RESERVED_BITS_MASK) + code; - return code; - } - - protected static int incrementHashCode(int code) { - return ((code >> 8) + code) | 1; - } - - private int hashPos(int code) { - return code & mask; - } - - /** - * Associates the given tweet ID with the given internal doc ID. - * - * @param tweetID The tweet ID. - * @param internalID The doc ID that should be associated with this tweet ID. - * @param inverseMap The map that stores the doc ID to tweet ID associations. - */ - public void add(final long tweetID, final int internalID, final long[] inverseMap) { - int code = hashCode(tweetID); - int hashPos = hashPos(code); - int value = hash[hashPos]; - assert inverseMap[internalID] == tweetID; - - if (value != DocIDToTweetIDMapper.ID_NOT_FOUND) { - final int inc = incrementHashCode(code); - do { - code += inc; - hashPos = hashPos(code); - value = hash[hashPos]; - } while (value != DocIDToTweetIDMapper.ID_NOT_FOUND); - } - - assert value == DocIDToTweetIDMapper.ID_NOT_FOUND; - - hash[hashPos] = internalID; - numMappings++; - } - - /** - * Returns the doc ID corresponding to the given tweet ID. - * - * @param tweetID The tweet ID. - * @param inverseMap The map that stores the doc ID to tweet ID associations. - * @return The doc ID corresponding to the given tweet ID. - */ - public int get(long tweetID, final long[] inverseMap) { - int code = hashCode(tweetID); - int hashPos = hashPos(code); - int value = hash[hashPos]; - - if (value != DocIDToTweetIDMapper.ID_NOT_FOUND && inverseMap[value] != tweetID) { - final int inc = incrementHashCode(code); - - do { - code += inc; - hashPos = hashPos(code); - value = hash[hashPos]; - } while (value != DocIDToTweetIDMapper.ID_NOT_FOUND && inverseMap[value] != tweetID); - } - - if (hashPos == -1) { - return DocIDToTweetIDMapper.ID_NOT_FOUND; - } - return hash[hashPos]; - } - - @Override - public TweetIDToInternalIDMap.FlushHandler getFlushHandler() { - return new FlushHandler(this); - } - - public static final class FlushHandler extends Flushable.Handler { - public FlushHandler() { - super(); - } - - private static final String HASH_ARRAY_SIZE_PROP_NAME = "HashArraySize"; - private static final String MASK_PROP_NAME = "Mask"; - private static final String NUM_MAPPINGS_PROP_NAME = "NumMappings"; - - public FlushHandler(TweetIDToInternalIDMap objectToFlush) { - super(objectToFlush); - } - - @Override - protected void doFlush(FlushInfo flushInfo, DataSerializer out) - throws IOException { - TweetIDToInternalIDMap mapper = getObjectToFlush(); - - flushInfo - .addIntProperty(HASH_ARRAY_SIZE_PROP_NAME, mapper.hash.length) - .addIntProperty(MASK_PROP_NAME, mapper.mask) - .addIntProperty(NUM_MAPPINGS_PROP_NAME, mapper.numMappings); - - out.writeIntArray(mapper.hash); - } - - @Override - protected TweetIDToInternalIDMap doLoad(FlushInfo flushInfo, DataDeserializer in) - throws IOException { - final int[] hash = in.readIntArray(); - - final int numMappings = flushInfo.getIntProperty(NUM_MAPPINGS_PROP_NAME); - - return new TweetIDToInternalIDMap(hash, numMappings); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.docx b/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.docx new file mode 100644 index 000000000..ec96918b7 Binary files /dev/null and 
b/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.java b/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.java deleted file mode 100644 index 3e782aa1f..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetSearchIndexExtensionsFactory.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.twitter.search.earlybird.index; - -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsData; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsFactory; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdRealtimeIndexExtensionsData; - -public class TweetSearchIndexExtensionsFactory extends EarlybirdIndexExtensionsFactory { - @Override - public EarlybirdRealtimeIndexExtensionsData newRealtimeIndexExtensionsData() { - return new TweetSearchRealtimeIndexExtensionsData(); - } - - @Override - public EarlybirdIndexExtensionsData newLuceneIndexExtensionsData() { - return new TweetSearchLuceneIndexExtensionsData(); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.docx b/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.docx new file mode 100644 index 000000000..04ac8d3d9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.java b/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.java deleted file mode 100644 index 84ba879e1..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetSearchLuceneIndexExtensionsData.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.earlybird.index; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.schema.base.EarlybirdFieldType; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.column.ColumnStrideFieldIndex; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdIndexExtensionsData; - -public class TweetSearchLuceneIndexExtensionsData implements EarlybirdIndexExtensionsData { - @Override - public void setupExtensions(EarlybirdIndexSegmentAtomicReader atomicReader) throws IOException { - // If we use stock lucene to back the mappers and column stride fields, - // we need to initialize them - EarlybirdIndexSegmentData segmentData = atomicReader.getSegmentData(); - DocValuesBasedTweetIDMapper tweetIDMapper = - (DocValuesBasedTweetIDMapper) segmentData.getDocIDToTweetIDMapper(); - tweetIDMapper.initializeWithLuceneReader( - atomicReader, - getColumnStrideFieldIndex(segmentData, EarlybirdFieldConstant.ID_CSF_FIELD)); - - DocValuesBasedTimeMapper timeMapper = - (DocValuesBasedTimeMapper) segmentData.getTimeMapper(); - timeMapper.initializeWithLuceneReader( - atomicReader, - getColumnStrideFieldIndex(segmentData, EarlybirdFieldConstant.CREATED_AT_CSF_FIELD)); - } - - private ColumnStrideFieldIndex getColumnStrideFieldIndex( - EarlybirdIndexSegmentData segmentData, EarlybirdFieldConstant csfField) { - String csfFieldName = 
csfField.getFieldName(); - EarlybirdFieldType fieldType = - segmentData.getSchema().getFieldInfo(csfFieldName).getFieldType(); - Preconditions.checkState(fieldType.isCsfLoadIntoRam()); - return segmentData.getDocValuesManager().addColumnStrideField(csfFieldName, fieldType); - } -} diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.docx b/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.docx new file mode 100644 index 000000000..3890e87d4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.java b/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.java deleted file mode 100644 index 02752d08d..000000000 --- a/src/java/com/twitter/search/earlybird/index/TweetSearchRealtimeIndexExtensionsData.java +++ /dev/null @@ -1,33 +0,0 @@ -package com.twitter.search.earlybird.index; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter.InvertedDocConsumerBuilder; -import com.twitter.search.core.earlybird.index.EarlybirdRealtimeIndexSegmentWriter.StoredFieldsConsumerBuilder; -import com.twitter.search.core.earlybird.index.extensions.EarlybirdRealtimeIndexExtensionsData; - -public class TweetSearchRealtimeIndexExtensionsData - implements EarlybirdRealtimeIndexExtensionsData { - @Override - public void createStoredFieldsConsumer(StoredFieldsConsumerBuilder builder) { - // no extensions necessary here - } - - @Override - public void createInvertedDocConsumer(InvertedDocConsumerBuilder builder) { - if (EarlybirdFieldConstant.ID_FIELD.getFieldName().equals(builder.getFieldName())) { - // The tweet ID should've already been added to the tweet ID <-> doc ID mapper. 
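Both branches of this method bypass the default Lucene inverted-doc consumer: the call just below switches it off for the ID field, whose value was already captured when the doc ID mapper assigned the document its doc ID, and the CREATED_AT branch further down routes the timestamp into the RealtimeTimeMapper instead, via the TimeMappingWriter shown earlier. Roughly, per CREATED_AT token:

    // Inside TimeMappingWriter.add(docId, position):
    int timeSec = termAtt.getTerm();    // IntTermAttribute holding seconds since the epoch
    mapper.addMapping(docId, timeSec);  // fills in the RealtimeTimeMapper for this document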
- builder.setUseDefaultConsumer(false); - } - - if (EarlybirdFieldConstant.CREATED_AT_FIELD.getFieldName().equals(builder.getFieldName())) { - RealtimeTimeMapper timeMapper = (RealtimeTimeMapper) builder.getSegmentData().getTimeMapper(); - builder.addConsumer(new TimeMappingWriter(timeMapper)); - builder.setUseDefaultConsumer(false); - } - } - - @Override - public void setupExtensions(EarlybirdIndexSegmentAtomicReader atomicReader) { - } -} diff --git a/src/java/com/twitter/search/earlybird/index/facets/BUILD b/src/java/com/twitter/search/earlybird/index/facets/BUILD deleted file mode 100644 index 8041b4394..000000000 --- a/src/java/com/twitter/search/earlybird/index/facets/BUILD +++ /dev/null @@ -1,16 +0,0 @@ -java_library( - sources = ["*.java"], - platform = "java8", - tags = ["bazel-compatible"], - dependencies = [ - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common", - "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn", - "3rdparty/jvm/org/apache/lucene:lucene-facet", - "3rdparty/jvm/org/apache/lucene:lucene-queries", - "src/java/com/twitter/search/common/constants", - "src/java/com/twitter/search/common/schema/base", - "src/java/com/twitter/search/common/schema/earlybird", - "src/java/com/twitter/search/core/earlybird", - "src/thrift/com/twitter/search:earlybird-java", - ], -) diff --git a/src/java/com/twitter/search/earlybird/index/facets/BUILD.docx b/src/java/com/twitter/search/earlybird/index/facets/BUILD.docx new file mode 100644 index 000000000..3dc5b6ba1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/facets/BUILD.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.docx b/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.docx new file mode 100644 index 000000000..d93e21b8b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.docx differ diff --git a/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.java b/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.java deleted file mode 100644 index 8735f82a2..000000000 --- a/src/java/com/twitter/search/earlybird/index/facets/FacetSkipList.java +++ /dev/null @@ -1,126 +0,0 @@ -package com.twitter.search.earlybird.index.facets; - -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; - -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.facets.FacetCountState; -import com.twitter.search.earlybird.thrift.ThriftTermRequest; - -public abstract class FacetSkipList { - public static class SkipTokenStream extends TokenStream { - private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - - private Iterator iterator; - private Set facetFields = new HashSet<>(); - - public void add(Schema.FieldInfo field) { - this.facetFields.add(field); - } - - @Override - public final boolean incrementToken() throws IOException { - if (iterator == null) { - iterator = facetFields.iterator(); - } - - while (iterator.hasNext()) { - Schema.FieldInfo field = 
iterator.next(); - if (field.getFieldType().isStoreFacetSkiplist()) { - termAtt.setEmpty(); - termAtt.append(EarlybirdFieldConstant.getFacetSkipFieldName(field.getName())); - - return true; - } - } - - return false; - } - } - - /** - * Returns a Term query to search in the given facet field. - */ - public static Term getSkipListTerm(Schema.FieldInfo facetField) { - if (facetField.getFieldType().isStoreFacetSkiplist()) { - return new Term(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.getFacetSkipFieldName(facetField.getName())); - } - return null; - } - - /** - * Returns a disjunction query that searches in all facet fields in the given facet count state. - */ - public static Query getSkipListQuery(FacetCountState facetCountState) { - Set fieldsWithSkipLists = - facetCountState.getFacetFieldsToCountWithSkipLists(); - - if (fieldsWithSkipLists == null || fieldsWithSkipLists.isEmpty()) { - return null; - } - - Query skipLists; - - if (fieldsWithSkipLists.size() == 1) { - skipLists = new TermQuery(getSkipListTerm(fieldsWithSkipLists.iterator().next())); - } else { - BooleanQuery.Builder disjunctionBuilder = new BooleanQuery.Builder(); - for (Schema.FieldInfo facetField : fieldsWithSkipLists) { - disjunctionBuilder.add( - new TermQuery(new Term( - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.getFacetSkipFieldName(facetField.getName()))), - BooleanClause.Occur.SHOULD); - } - skipLists = disjunctionBuilder.build(); - } - - return skipLists; - } - - /** - * Returns a term request that can be used to get term statistics for the skip list term - * associated with the provided facet. Returns null, if this FacetField is configured to not - * store a skiplist. - */ - public static ThriftTermRequest getSkipListTermRequest(Schema schema, String facetName) { - return getSkipListTermRequest(schema.getFacetFieldByFacetName(facetName)); - } - - /** - * Returns a term request that can be used to get term statistics for the skip list term - * associated with the provided facet. Returns null, if this FacetField is configured to not - * store a skiplist. - */ - public static ThriftTermRequest getSkipListTermRequest(Schema.FieldInfo facetField) { - return facetField != null && facetField.getFieldType().isStoreFacetSkiplist() - ? new ThriftTermRequest( - EarlybirdFieldConstant.getFacetSkipFieldName(facetField.getName())) - .setFieldName(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName()) - : null; - } - - /** - * Returns a term request using the specified fieldName. This is only a temporary solution until - * Blender can access the Schema to pass the FacetIDMap into the method above. 
- * - * @deprecated Temporary solution until Blender - */ - @Deprecated - public static ThriftTermRequest getSkipListTermRequest(String fieldName) { - return new ThriftTermRequest(EarlybirdFieldConstant.getFacetSkipFieldName(fieldName)) - .setFieldName(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName()); - } -} diff --git a/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.docx b/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.docx new file mode 100644 index 000000000..9ba7cd8b9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.java b/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.java deleted file mode 100644 index 0e12f18c7..000000000 --- a/src/java/com/twitter/search/earlybird/ml/ScoringModelsManager.java +++ /dev/null @@ -1,155 +0,0 @@ -package com.twitter.search.earlybird.ml; - -import java.io.IOException; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.file.AbstractFile; -import com.twitter.search.common.file.FileUtils; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.schema.DynamicSchema; -import com.twitter.search.common.util.ml.prediction_engine.CompositeFeatureContext; -import com.twitter.search.common.util.ml.prediction_engine.LightweightLinearModel; -import com.twitter.search.common.util.ml.prediction_engine.ModelLoader; - -import static com.twitter.search.modeling.tweet_ranking.TweetScoringFeatures.CONTEXT; -import static com.twitter.search.modeling.tweet_ranking.TweetScoringFeatures.FeatureContextVersion.CURRENT_VERSION; - -/** - * Loads the scoring models for tweets and provides access to them. - * - * This class relies on a list of ModelLoader objects to retrieve the objects from them. It will - * return the first model found according to the order in the list. - * - * For production, we load models from 2 sources: classpath and HDFS. If a model is available - * from HDFS, we return it, otherwise we use the model from the classpath. - * - * The models used for default requests (i.e. not experiments) MUST be present in the - * classpath, this allows us to avoid errors if they can't be loaded from HDFS. - * Models for experiments can live only in HDFS, so we don't need to redeploy Earlybird if we - * want to test them. - */ -public class ScoringModelsManager { - - private static final Logger LOG = LoggerFactory.getLogger(ScoringModelsManager.class); - - /** - * Used when - * 1. Testing - * 2. The scoring models are disabled in the config - * 3. Exceptions thrown during loading the scoring models - */ - public static final ScoringModelsManager NO_OP_MANAGER = new ScoringModelsManager() { - @Override - public boolean isEnabled() { - return false; - } - }; - - private final ModelLoader[] loaders; - private final DynamicSchema dynamicSchema; - - public ScoringModelsManager(ModelLoader... loaders) { - this.loaders = loaders; - this.dynamicSchema = null; - } - - public ScoringModelsManager(DynamicSchema dynamicSchema, ModelLoader... 
loaders) { - this.loaders = loaders; - this.dynamicSchema = dynamicSchema; - } - - /** - * Indicates that the scoring models were enabled in the config and were loaded successfully - */ - public boolean isEnabled() { - return true; - } - - public void reload() { - for (ModelLoader loader : loaders) { - loader.run(); - } - } - - /** - * Loads and returns the model with the given name, if one exists. - */ - public Optional getModel(String modelName) { - for (ModelLoader loader : loaders) { - Optional model = loader.getModel(modelName); - if (model.isPresent()) { - return model; - } - } - return Optional.absent(); - } - - /** - * Creates an instance that loads models first from HDFS and the classpath resources. - * - * If the models are not found in HDFS, it uses the models from the classpath as fallback. - */ - public static ScoringModelsManager create( - SearchStatsReceiver serverStats, - String hdfsNameNode, - String hdfsBasedPath, - DynamicSchema dynamicSchema) throws IOException { - // Create a composite feature context so we can load both legacy and schema-based models - CompositeFeatureContext featureContext = new CompositeFeatureContext( - CONTEXT, dynamicSchema::getSearchFeatureSchema); - ModelLoader hdfsLoader = createHdfsLoader( - serverStats, hdfsNameNode, hdfsBasedPath, featureContext); - ModelLoader classpathLoader = createClasspathLoader( - serverStats, featureContext); - - // Explicitly load the models from the classpath - classpathLoader.run(); - - ScoringModelsManager manager = new ScoringModelsManager(hdfsLoader, classpathLoader); - LOG.info("Initialized ScoringModelsManager for loading models from HDFS and the classpath"); - return manager; - } - - protected static ModelLoader createHdfsLoader( - SearchStatsReceiver serverStats, - String hdfsNameNode, - String hdfsBasedPath, - CompositeFeatureContext featureContext) { - String hdfsVersionedPath = hdfsBasedPath + "/" + CURRENT_VERSION.getVersionDirectory(); - LOG.info("Starting to load scoring models from HDFS: {}:{}", - hdfsNameNode, hdfsVersionedPath); - return ModelLoader.forHdfsDirectory( - hdfsNameNode, - hdfsVersionedPath, - featureContext, - "scoring_models_hdfs_", - serverStats); - } - - /** - * Creates a loader that loads models from a default location in the classpath. 
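A minimal usage sketch (hypothetical caller; serverStats, hdfsNameNode, hdfsBasedPath, dynamicSchema and the model name are placeholders) of the HDFS-then-classpath fallback that create() and getModel() implement:

ScoringModelsManager manager =
    ScoringModelsManager.create(serverStats, hdfsNameNode, hdfsBasedPath, dynamicSchema);
manager.reload();  // ask every loader to refresh its models
Optional<LightweightLinearModel> model = manager.getModel("experiment_model");
// The HDFS copy wins when both loaders have the model, because the HDFS loader is first in the
// loader list; if neither loader has it, Optional.absent() is returned and callers fall back to
// default scoring.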
- */ - @VisibleForTesting - public static ModelLoader createClasspathLoader( - SearchStatsReceiver serverStats, CompositeFeatureContext featureContext) - throws IOException { - AbstractFile defaultModelsBaseDir = FileUtils.getTmpDirHandle( - ScoringModelsManager.class, - "/com/twitter/search/earlybird/ml/default_models"); - AbstractFile defaultModelsDir = defaultModelsBaseDir.getChild( - CURRENT_VERSION.getVersionDirectory()); - - LOG.info("Starting to load scoring models from the classpath: {}", - defaultModelsDir.getPath()); - return ModelLoader.forDirectory( - defaultModelsDir, - featureContext, - "scoring_models_classpath_", - serverStats); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.docx b/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.docx new file mode 100644 index 000000000..3d961d7ff Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.java b/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.java deleted file mode 100644 index 2cd655695..000000000 --- a/src/java/com/twitter/search/earlybird/partition/AudioSpaceEventsStreamIndexer.java +++ /dev/null @@ -1,75 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.earlybird.exception.MissingKafkaTopicException; -import com.twitter.ubs.thriftjava.AudioSpaceBaseEvent; -import com.twitter.ubs.thriftjava.AudioSpaceEvent; -import com.twitter.util.Duration; - -/** - * - * An example publish event looks like this: - * - */ -public class AudioSpaceEventsStreamIndexer extends SimpleStreamIndexer { - private static final Logger LOG = LoggerFactory.getLogger(AudioSpaceEventsStreamIndexer.class); - - private static final String AUDIO_SPACE_EVENTS_TOPIC = "audio_space_events_v1"; - - @VisibleForTesting - // We use this to filter out old space publish events so as to avoid the risk of processing - // old space publish events whose corresponding finish events are no longer in the stream. - // It's unlikely that spaces would last longer than this constant so it should be safe to assume - // that the space whose publish event is older than this age is finished. 
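// For example, with the 11-hour cutoff below: a publish event stamped 10 hours ago still marks
// the space as live, while one stamped 12 hours ago is ignored, because its finish event may
// already have aged out of the stream.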
- protected static final long MAX_PUBLISH_EVENTS_AGE_MS = - Duration.fromHours(11).inMillis(); - - private final AudioSpaceTable audioSpaceTable; - private final Clock clock; - - public AudioSpaceEventsStreamIndexer( - KafkaConsumer kafkaConsumer, - AudioSpaceTable audioSpaceTable, - Clock clock) throws MissingKafkaTopicException { - super(kafkaConsumer, AUDIO_SPACE_EVENTS_TOPIC); - this.audioSpaceTable = audioSpaceTable; - this.clock = clock; - } - - @Override - protected void validateAndIndexRecord(ConsumerRecord record) { - AudioSpaceBaseEvent baseEvent = record.value(); - - if (baseEvent != null && baseEvent.isSetBroadcast_id() && baseEvent.isSetEvent_metadata()) { - AudioSpaceEvent event = baseEvent.getEvent_metadata(); - String spaceId = baseEvent.getBroadcast_id(); - if (event != null && event.isSet(AudioSpaceEvent._Fields.SPACE_PUBLISH_EVENT)) { - long publishEventAgeMs = clock.nowMillis() - baseEvent.getTime_stamp_millis(); - if (publishEventAgeMs < MAX_PUBLISH_EVENTS_AGE_MS) { - audioSpaceTable.audioSpaceStarts(spaceId); - } - } else if (event != null && event.isSet(AudioSpaceEvent._Fields.SPACE_END_EVENT)) { - audioSpaceTable.audioSpaceFinishes(spaceId); - } - } - } - - @VisibleForTesting - public AudioSpaceTable getAudioSpaceTable() { - return audioSpaceTable; - } - - void printSummary() { - LOG.info(audioSpaceTable.toString()); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.docx b/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.docx new file mode 100644 index 000000000..740e7c03c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.java b/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.java deleted file mode 100644 index a7d29d2c9..000000000 --- a/src/java/com/twitter/search/earlybird/partition/AudioSpaceTable.java +++ /dev/null @@ -1,150 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.ArrayDeque; -import java.util.Queue; -import java.util.Set; -import java.util.concurrent.ConcurrentSkipListSet; - -import com.twitter.common.collections.Pair; -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.util.Duration; -import com.twitter.util.Time; - -public class AudioSpaceTable { - private static final String STATS_PREFIX = "audio_space_"; - private static final Duration AUDIO_EVENT_EXPIRATION_DURATION = - Duration.fromHours(12); - - private final Set<String> startedSpaces; - private final Set<String> finishedSpaces; - /** - * timestampedSpaceEvents contains both start and finish events. - * This is to aid the case in which we receive only one or the other event for a spaceId -- start or finish; - * without this, we could potentially never purge from the sets.
- */ - private final Queue> timestampedSpaceEvents; - private final Clock clock; - - private final SearchRateCounter audioSpaceStarts = - SearchRateCounter.export(STATS_PREFIX + "stream_starts"); - private final SearchRateCounter audioSpaceFinishes = - SearchRateCounter.export(STATS_PREFIX + "stream_finishes"); - private final SearchRateCounter isRunningCalls = - SearchRateCounter.export(STATS_PREFIX + "is_running_calls"); - private final SearchRateCounter audioSpaceDuplicateStarts = - SearchRateCounter.export(STATS_PREFIX + "duplicate_start_events"); - private final SearchRateCounter audioSpaceDuplicateFinishes = - SearchRateCounter.export(STATS_PREFIX + "duplicate_finish_events"); - private final SearchRateCounter startsProcessedAfterCorrespondingFinishes = - SearchRateCounter.export(STATS_PREFIX + "starts_processed_after_corresponding_finishes"); - private final SearchRateCounter finishesProcessedWithoutCorrespondingStarts = - SearchRateCounter.export(STATS_PREFIX + "finishes_processed_without_corresponding_starts"); - - public AudioSpaceTable(Clock clock) { - // We read and write from different threads, so we need a thread-safe set implementation. - startedSpaces = new ConcurrentSkipListSet<>(); - finishedSpaces = new ConcurrentSkipListSet<>(); - timestampedSpaceEvents = new ArrayDeque<>(); - this.clock = clock; - SearchCustomGauge.export(STATS_PREFIX + "live", this::getNumberOfLiveAudioSpaces); - SearchCustomGauge.export(STATS_PREFIX + "retained_starts", startedSpaces::size); - SearchCustomGauge.export(STATS_PREFIX + "retained_finishes", finishedSpaces::size); - } - - private int getNumberOfLiveAudioSpaces() { - // This call is a bit expensive, but I logged it and it's getting called once a minute, at - // the beginning of the minute, so it's fine. - int count = 0; - for (String startedSpace : startedSpaces) { - count += finishedSpaces.contains(startedSpace) ? 0 : 1; - } - return count; - } - - /** - * We keep spaces that have started in the last 12 hours. - * This is called on every start space event received, and cleans up - * the retained spaces so memory usage does not become too high - */ - private void purgeOldSpaces() { - Pair oldest = timestampedSpaceEvents.peek(); - Time now = Time.fromMilliseconds(clock.nowMillis()); - while (oldest != null) { - Duration durationSinceInsert = now.minus(oldest.getFirst()); - if (durationSinceInsert.compareTo(AUDIO_EVENT_EXPIRATION_DURATION) > 0) { - // This event has expired, so we purge it and move on to the next. 
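// Events were appended in arrival order, so the queue is sorted by timestamp and we can stop at
// the first entry that has not expired yet. Purging removes the space id from both the started
// and finished sets, regardless of which event type enqueued it.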
- String oldSpaceId = oldest.getSecond(); - startedSpaces.remove(oldSpaceId); - finishedSpaces.remove(oldSpaceId); - oldest = timestampedSpaceEvents.poll(); - } else { - // Oldest event is not old enough so quit purging - break; - } - } - } - - /** - * Record AudioSpace start event - */ - public void audioSpaceStarts(String spaceId) { - audioSpaceStarts.increment(); - boolean spaceSeenBefore = !startedSpaces.add(spaceId); - if (spaceSeenBefore) { - audioSpaceDuplicateStarts.increment(); - } - - if (finishedSpaces.contains(spaceId)) { - startsProcessedAfterCorrespondingFinishes.increment(); - } - - timestampedSpaceEvents.add(new Pair(Time.fromMilliseconds(clock.nowMillis()), spaceId)); - purgeOldSpaces(); - } - - /** - * Record AudioSpace finish event - */ - public void audioSpaceFinishes(String spaceId) { - audioSpaceFinishes.increment(); - boolean spaceSeenBefore = !finishedSpaces.add(spaceId); - if (spaceSeenBefore) { - audioSpaceDuplicateFinishes.increment(); - } - - if (!startedSpaces.contains(spaceId)) { - finishesProcessedWithoutCorrespondingStarts.increment(); - } - - timestampedSpaceEvents.add(new Pair(Time.fromMilliseconds(clock.nowMillis()), spaceId)); - purgeOldSpaces(); - } - - public boolean isRunning(String spaceId) { - isRunningCalls.increment(); - return startedSpaces.contains(spaceId) && !finishedSpaces.contains(spaceId); - } - - /** - * Print stats on this AudioSpaceTable - * @return Stats string - */ - public String toString() { - return "AudioSpaceTable: Starts: " + audioSpaceStarts.getCounter().get() - + ", Finishes: " + audioSpaceFinishes.getCounter().get() - + ", Retained starts: " + startedSpaces.size() - + ", Retained finishes: " + finishedSpaces.size() - + ", Currently live: " + getNumberOfLiveAudioSpaces(); - } - - public Set getStartedSpaces() { - return startedSpaces; - } - - public Set getFinishedSpaces() { - return finishedSpaces; - } - -} diff --git a/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.docx b/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.docx new file mode 100644 index 000000000..735ce58d2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.java b/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.java deleted file mode 100644 index da3fd9536..000000000 --- a/src/java/com/twitter/search/earlybird/partition/BalancingKafkaConsumer.java +++ /dev/null @@ -1,117 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.time.Duration; -import java.util.Arrays; -import java.util.Collections; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.TopicPartition; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchRateCounter; - -/** - * BalancingKafkaConsumer is designed to read from the tweets and updates streams in proportion to - * the rates that those streams are written to, i.e. both topics should have nearly the same amount - * of lag. This is important because if one stream gets too far ahead of the other, we could end up - * in a situation where: - * 1. 
If the tweet stream is ahead of the updates stream, we couldn't apply an update because a - * segment has been optimized, and one of those fields became frozen. - * 2. If the updates stream is ahead of the tweet stream, we might drop updates because they are - * more than a minute old, but the tweets might still not be indexed. - * - * Also see 'Consumption Flow Control' in - * https://kafka.apache.org/23/javadoc/index.html?org/apache/kafka/clients/consumer/KafkaConsumer.html - */ -public class BalancingKafkaConsumer { - // If one of the topic-partitions lags the other by more than 10 seconds, - // it's worth it to pause the faster one and let the slower one catch up. - private static final long BALANCE_THRESHOLD_MS = Duration.ofSeconds(10).toMillis(); - private final KafkaConsumer kafkaConsumer; - private final TopicPartition tweetTopic; - private final TopicPartition updateTopic; - private final SearchRateCounter tweetsPaused; - private final SearchRateCounter updatesPaused; - private final SearchRateCounter resumed; - - private long tweetTimestamp = 0; - private long updateTimestamp = 0; - private long pausedAt = 0; - private boolean paused = false; - - public BalancingKafkaConsumer( - KafkaConsumer kafkaConsumer, - TopicPartition tweetTopic, - TopicPartition updateTopic - ) { - this.kafkaConsumer = kafkaConsumer; - this.tweetTopic = tweetTopic; - this.updateTopic = updateTopic; - - String prefix = "balancing_kafka_"; - String suffix = "_topic_paused"; - - tweetsPaused = SearchRateCounter.export(prefix + tweetTopic.topic() + suffix); - updatesPaused = SearchRateCounter.export(prefix + updateTopic.topic() + suffix); - resumed = SearchRateCounter.export(prefix + "topics_resumed"); - } - - /** - * Calls poll on the underlying consumer and pauses topics as necessary. - */ - public ConsumerRecords poll(Duration timeout) { - ConsumerRecords records = kafkaConsumer.poll(timeout); - topicFlowControl(records); - return records; - } - - private void topicFlowControl(ConsumerRecords records) { - for (ConsumerRecord record : records) { - long timestamp = record.timestamp(); - - if (updateTopic.topic().equals(record.topic())) { - updateTimestamp = Math.max(updateTimestamp, timestamp); - } else if (tweetTopic.topic().equals(record.topic())) { - tweetTimestamp = Math.max(tweetTimestamp, timestamp); - } else { - throw new IllegalStateException( - "Unexpected partition " + record.topic() + " in BalancingKafkaConsumer"); - } - } - - if (paused) { - // If we paused and one of the streams is still below the pausedAt point, we want to continue - // reading from just the lagging stream. - if (tweetTimestamp >= pausedAt && updateTimestamp >= pausedAt) { - // We caught up, resume reading from both topics. - paused = false; - kafkaConsumer.resume(Arrays.asList(tweetTopic, updateTopic)); - resumed.increment(); - } - } else { - long difference = Math.abs(tweetTimestamp - updateTimestamp); - - if (difference < BALANCE_THRESHOLD_MS) { - // The streams have approximately the same lag, so no need to pause anything. - return; - } - // The difference is too great, one of the streams is lagging behind the other so we need to - // pause one topic so the other can catch up. 
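// Worked example: if the newest tweet record seen is stamped t=100s and the newest update is
// stamped t=85s, the 15s gap exceeds BALANCE_THRESHOLD_MS (10s), so the tweet topic is paused and
// pausedAt is set to 100s; both topics are resumed once the update stream also reaches 100s.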
- paused = true; - pausedAt = Math.max(updateTimestamp, tweetTimestamp); - if (tweetTimestamp > updateTimestamp) { - kafkaConsumer.pause(Collections.singleton(tweetTopic)); - tweetsPaused.increment(); - } else { - kafkaConsumer.pause(Collections.singleton(updateTopic)); - updatesPaused.increment(); - } - } - } - - public void close() { - kafkaConsumer.close(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.docx b/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.docx new file mode 100644 index 000000000..336d783c9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.java b/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.java deleted file mode 100644 index 38ff55c08..000000000 --- a/src/java/com/twitter/search/earlybird/partition/CompleteSegmentManager.java +++ /dev/null @@ -1,349 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.Iterator; -import java.util.List; -import java.util.function.Supplier; - -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.segment.SegmentDataProvider; - -/** - * CompleteSegmentManager is used to parallelize indexing of complete (not partial) segments - * on startup. It also populates the fields used by the PartitionManager. - */ -public class CompleteSegmentManager { - private static final Logger LOG = LoggerFactory.getLogger(CompleteSegmentManager.class); - - private static final String INDEX_COMPLETED_SEGMENTS = - "indexing, optimizing and flushing complete segments"; - private static final String LOAD_COMPLETED_SEGMENTS = "loading complete segments"; - private static final String INDEX_UPDATES_FOR_COMPLETED_SEGMENTS = - "indexing updates for complete segments"; - private static final String BUILD_MULTI_SEGMENT_TERM_DICT = - "build multi segment term dictionaries"; - - // Max number of segments being loaded / indexed concurrently. - private final int maxConcurrentSegmentIndexers = - EarlybirdProperty.MAX_CONCURRENT_SEGMENT_INDEXERS.get(3); - - // The state we are building. 
- protected final SegmentDataProvider segmentDataProvider; - private final InstrumentedQueue retryQueue; - - private final UserUpdatesStreamIndexer userUpdatesStreamIndexer; - private final UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer; - - private final SegmentManager segmentManager; - private final ZooKeeperTryLockFactory zkTryLockFactory; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final Clock clock; - private MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - private final SegmentSyncConfig segmentSyncConfig; - - private final CriticalExceptionHandler criticalExceptionHandler; - - private boolean interrupted = false; - - public CompleteSegmentManager( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - SegmentDataProvider segmentDataProvider, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - SegmentManager segmentManager, - InstrumentedQueue retryQueue, - SearchIndexingMetricSet searchIndexingMetricSet, - Clock clock, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - SegmentSyncConfig segmentSyncConfig, - CriticalExceptionHandler criticalExceptionHandler) { - this.zkTryLockFactory = zooKeeperTryLockFactory; - this.segmentDataProvider = segmentDataProvider; - this.userUpdatesStreamIndexer = userUpdatesStreamIndexer; - this.userScrubGeoEventStreamIndexer = userScrubGeoEventStreamIndexer; - this.segmentManager = segmentManager; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.clock = clock; - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.segmentSyncConfig = segmentSyncConfig; - this.retryQueue = retryQueue; - this.criticalExceptionHandler = criticalExceptionHandler; - } - - /** - * Indexes all user events. - */ - public void indexUserEvents() { - LOG.info("Loading/indexing user events."); - StartupUserEventIndexer startupUserEventIndexer = new StartupUserEventIndexer( - searchIndexingMetricSet, - userUpdatesStreamIndexer, - userScrubGeoEventStreamIndexer, - segmentManager, - clock - ); - - startupUserEventIndexer.indexAllEvents(); - LOG.info("Finished loading/indexing user events."); - } - - /** - * Loads or indexes from scratch all complete segments. - * - * @param segmentsToIndexProvider A supplier that provides the list of all complete segments. - */ - public void indexCompleteSegments( - Supplier> segmentsToIndexProvider) throws Exception { - List segmentIndexers = Lists.newArrayList(); - - EarlybirdStatus.beginEvent( - INDEX_COMPLETED_SEGMENTS, searchIndexingMetricSet.startupInIndexCompletedSegments); - while (!interrupted && !Thread.currentThread().isInterrupted()) { - try { - // Get the refreshed list of local segment databases. - segmentManager.updateSegments(segmentDataProvider.newSegmentList()); - Iterator segmentsToIndex = segmentsToIndexProvider.get().iterator(); - - // Start up to max concurrent segment indexers. - segmentIndexers.clear(); - while (segmentsToIndex.hasNext() && segmentIndexers.size() < maxConcurrentSegmentIndexers) { - SegmentInfo nextSegment = segmentsToIndex.next(); - if (!nextSegment.isComplete()) { - Thread thread = new Thread(new SingleSegmentIndexer(nextSegment), - "startup-segment-indexer-" + nextSegment.getSegmentName()); - thread.start(); - segmentIndexers.add(thread); - } - } - - // No remaining indexer threads, we're done. 
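// Each pass of the outer loop refreshes the segment list, starts at most
// maxConcurrentSegmentIndexers threads for segments that are not yet complete, and joins them
// before starting the next batch; an empty batch means every local segment has been indexed.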
- if (segmentIndexers.size() == 0) { - LOG.info("Finished indexing complete segments"); - EarlybirdStatus.endEvent( - INDEX_COMPLETED_SEGMENTS, searchIndexingMetricSet.startupInIndexCompletedSegments); - break; - } - - // Wait for threads to complete fully. - LOG.info("Started {} indexing threads", segmentIndexers.size()); - for (Thread thread : segmentIndexers) { - thread.join(); - } - LOG.info("Joined all {} indexing threads", segmentIndexers.size()); - } catch (IOException e) { - LOG.error("IOException in SegmentStartupManager loop", e); - } catch (InterruptedException e) { - interrupted = true; - LOG.error("Interrupted joining segment indexer thread", e); - } - } - } - - /** - * Loads all given complete segments. - * - * @param completeSegments The list of all complete segments to be loaded. - */ - public void loadCompleteSegments(List completeSegments) throws Exception { - if (!interrupted && !Thread.currentThread().isInterrupted()) { - LOG.info("Starting to load {} complete segments.", completeSegments.size()); - EarlybirdStatus.beginEvent( - LOAD_COMPLETED_SEGMENTS, searchIndexingMetricSet.startupInLoadCompletedSegments); - - List segmentThreads = Lists.newArrayList(); - List segmentsToBeLoaded = Lists.newArrayList(); - for (SegmentInfo segmentInfo : completeSegments) { - if (segmentInfo.isEnabled()) { - segmentsToBeLoaded.add(segmentInfo); - Thread segmentLoaderThread = new Thread( - () -> new SegmentLoader(segmentSyncConfig, criticalExceptionHandler) - .load(segmentInfo), - "startup-segment-loader-" + segmentInfo.getSegmentName()); - segmentThreads.add(segmentLoaderThread); - segmentLoaderThread.start(); - } else { - LOG.info("Will not load segment {} because it's disabled.", segmentInfo.getSegmentName()); - } - } - - for (Thread segmentLoaderThread : segmentThreads) { - segmentLoaderThread.join(); - } - - for (SegmentInfo segmentInfo : segmentsToBeLoaded) { - if (!segmentInfo.getSyncInfo().isLoaded()) { - // Throw an exception if a segment could not be loaded: We do not want earlybirds to - // startup with missing segments. - throw new RuntimeException("Could not load segment " + segmentInfo.getSegmentName()); - } - } - - LOG.info("Loaded all complete segments, starting indexing all updates."); - EarlybirdStatus.beginEvent( - INDEX_UPDATES_FOR_COMPLETED_SEGMENTS, - searchIndexingMetricSet.startupInIndexUpdatesForCompletedSegments); - - // Index all updates for all complete segments until we're fully caught up. 
- if (!EarlybirdCluster.isArchive(segmentManager.getEarlybirdIndexConfig().getCluster())) { - segmentThreads.clear(); - for (SegmentInfo segmentInfo : completeSegments) { - if (segmentInfo.isEnabled()) { - Thread segmentUpdatesThread = new Thread( - () -> new SimpleUpdateIndexer( - segmentDataProvider.getSegmentDataReaderSet(), - searchIndexingMetricSet, - retryQueue, - criticalExceptionHandler).indexAllUpdates(segmentInfo), - "startup-complete-segment-update-indexer-" + segmentInfo.getSegmentName()); - segmentThreads.add(segmentUpdatesThread); - segmentUpdatesThread.start(); - } else { - LOG.info("Will not index updates for segment {} because it's disabled.", - segmentInfo.getSegmentName()); - } - } - - for (Thread segmentUpdatesThread : segmentThreads) { - segmentUpdatesThread.join(); - } - } - LOG.info("Indexed updates for all complete segments."); - EarlybirdStatus.endEvent( - INDEX_UPDATES_FOR_COMPLETED_SEGMENTS, - searchIndexingMetricSet.startupInIndexUpdatesForCompletedSegments); - - EarlybirdStatus.endEvent( - LOAD_COMPLETED_SEGMENTS, searchIndexingMetricSet.startupInLoadCompletedSegments); - } - } - - /** - * Builds the term dictionary that spans all earlybird segments. Some fields share the term - * dictionary across segments as an optimization. - */ - public void buildMultiSegmentTermDictionary() { - EarlybirdStatus.beginEvent( - BUILD_MULTI_SEGMENT_TERM_DICT, - searchIndexingMetricSet.startupInMultiSegmentTermDictionaryUpdates); - if (!interrupted && !Thread.currentThread().isInterrupted()) { - LOG.info("Building multi segment term dictionaries."); - boolean built = multiSegmentTermDictionaryManager.buildDictionary(); - LOG.info("Done building multi segment term dictionaries, result: {}", built); - } - EarlybirdStatus.endEvent( - BUILD_MULTI_SEGMENT_TERM_DICT, - searchIndexingMetricSet.startupInMultiSegmentTermDictionaryUpdates); - } - - /** - * Warms up the data in the given segments. The warm up will usually make sure that all necessary - * is loaded in RAM and all relevant data structures are created before the segments starts - * serving real requests. - * - * @param segments The list of segments to warm up. - */ - public final void warmSegments(Iterable segments) throws InterruptedException { - int threadId = 1; - Iterator it = segments.iterator(); - - try { - List segmentWarmers = Lists.newLinkedList(); - while (it.hasNext()) { - - segmentWarmers.clear(); - while (it.hasNext() && segmentWarmers.size() < maxConcurrentSegmentIndexers) { - final SegmentInfo segment = it.next(); - Thread t = new Thread(() -> - new SegmentWarmer(criticalExceptionHandler).warmSegmentIfNecessary(segment), - "startup-warmer-" + threadId++); - - t.start(); - segmentWarmers.add(t); - } - - for (Thread t : segmentWarmers) { - t.join(); - } - } - } catch (InterruptedException e) { - LOG.error("Interrupted segment warmer thread", e); - Thread.currentThread().interrupt(); - throw e; - } - } - - /** - * Indexes a complete segment. - */ - private class SingleSegmentIndexer implements Runnable { - private final SegmentInfo segmentInfo; - - public SingleSegmentIndexer(SegmentInfo segmentInfo) { - this.segmentInfo = segmentInfo; - } - - @Override - public void run() { - // 0) Check if the segment can be loaded. This might copy the segment from HDFS. 
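// If the download succeeds, the segment is marked complete and steps 1-5 below are skipped;
// indexing from the raw tweet data only happens when no flushed copy exists in HDFS.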
- if (new SegmentLoader(segmentSyncConfig, criticalExceptionHandler) - .downloadSegment(segmentInfo)) { - LOG.info("Will not index segment {} because it was downloaded from HDFS.", - segmentInfo.getSegmentName()); - segmentInfo.setComplete(true); - return; - } - - LOG.info("SingleSegmentIndexer starting for segment: " + segmentInfo); - - // 1) Index all tweets in this segment. - RecordReader tweetReader; - try { - tweetReader = segmentDataProvider.getSegmentDataReaderSet().newDocumentReader(segmentInfo); - if (tweetReader != null) { - tweetReader.setExhaustStream(true); - } - } catch (Exception e) { - throw new RuntimeException("Could not create tweet reader for segment: " + segmentInfo, e); - } - - new SimpleSegmentIndexer(tweetReader, searchIndexingMetricSet).indexSegment(segmentInfo); - - if (!segmentInfo.isComplete() || segmentInfo.isIndexing()) { - throw new RuntimeException("Segment does not appear to be complete: " + segmentInfo); - } - - // 2) Index all updates in this segment (archive earlybirds don't have updates). - if (!EarlybirdCluster.isArchive(segmentManager.getEarlybirdIndexConfig().getCluster())) { - new SimpleUpdateIndexer( - segmentDataProvider.getSegmentDataReaderSet(), - searchIndexingMetricSet, - retryQueue, - criticalExceptionHandler).indexAllUpdates(segmentInfo); - } - - // 3) Optimize the segment. - SegmentOptimizer.optimize(segmentInfo); - - // 4) Flush to HDFS if necessary. - new SegmentHdfsFlusher(zkTryLockFactory, segmentSyncConfig) - .flushSegmentToDiskAndHDFS(segmentInfo); - - // 5) Unload the segment from memory. - segmentInfo.getIndexSegment().close(); - } - } - -} diff --git a/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.docx b/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.docx new file mode 100644 index 000000000..f42a37187 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.java b/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.java deleted file mode 100644 index 946160448..000000000 --- a/src/java/com/twitter/search/earlybird/partition/DynamicPartitionConfig.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; - -/** - * Keeps track of an up-to-date PartitionConfig. The PartitionConfig may be periodically reloaded - * from ZooKeeper. If you need a consistent view of the current partition configuration, make sure - * to grab a reference to a single PartitionConfig using getCurrentPartitionConfig() and reuse that - * object. - */ -public class DynamicPartitionConfig { - private static final Logger LOG = LoggerFactory.getLogger(DynamicPartitionConfig.class); - private static final SearchCounter FAILED_UPDATE_COUNTER_NAME = - SearchCounter.export("dynamic_partition_config_failed_update"); - private static final SearchCounter SUCCESSFUL_UPDATE_COUNTER = - SearchCounter.export("dynamic_partition_config_successful_update"); - // We assume that DynamicPartitionConfig is practically a singleton in Earlybird app. 
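// (Hence the gauge below is exported as a static: if several instances existed, they would
// overwrite each other's reported value.)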
- private static final SearchLongGauge NUM_REPLICAS_IN_HASH_PARTITION = - SearchLongGauge.export("dynamic_partition_config_num_replicas_in_hash_partition"); - - private final PartitionConfig curPartitionConfig; - - public DynamicPartitionConfig(PartitionConfig initialConfig) { - this.curPartitionConfig = initialConfig; - NUM_REPLICAS_IN_HASH_PARTITION.set(initialConfig.getNumReplicasInHashPartition()); - } - - public PartitionConfig getCurrentPartitionConfig() { - return curPartitionConfig; - } - - /** - * Verifies that the new partition config is compatible with the old one, and if it is, updates - * the number of replicas per partition based on the new partition config. - */ - public void setCurrentPartitionConfig(PartitionConfig partitionConfig) { - Preconditions.checkNotNull(partitionConfig); - // For now, we only allow the number of replicas in this partition to be dynamically updated. - // Ensure that the only things that have changed between the previous - if (curPartitionConfig.getClusterName().equals(partitionConfig.getClusterName()) - && (curPartitionConfig.getMaxEnabledLocalSegments() - == partitionConfig.getMaxEnabledLocalSegments()) - && (curPartitionConfig.getNumPartitions() == partitionConfig.getNumPartitions()) - && (curPartitionConfig.getTierStartDate().equals(partitionConfig.getTierStartDate())) - && (curPartitionConfig.getTierEndDate().equals(partitionConfig.getTierEndDate())) - && (curPartitionConfig.getTierName().equals(partitionConfig.getTierName()))) { - - if (curPartitionConfig.getNumReplicasInHashPartition() - != partitionConfig.getNumReplicasInHashPartition()) { - SUCCESSFUL_UPDATE_COUNTER.increment(); - curPartitionConfig.setNumReplicasInHashPartition( - partitionConfig.getNumReplicasInHashPartition()); - NUM_REPLICAS_IN_HASH_PARTITION.set(partitionConfig.getNumReplicasInHashPartition()); - } - } else { - FAILED_UPDATE_COUNTER_NAME.increment(); - LOG.warn( - "Attempted to update partition config with inconsistent layout.\n" - + "Current: " + curPartitionConfig.getPartitionConfigDescription() + "\n" - + "New: " + partitionConfig.getPartitionConfigDescription()); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.docx b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.docx new file mode 100644 index 000000000..94a731e76 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.java b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.java deleted file mode 100644 index cda9ea682..000000000 --- a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndex.java +++ /dev/null @@ -1,65 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -public class EarlybirdIndex { - private final List segmentInfoList; - - public static final int MAX_NUM_OF_NON_OPTIMIZED_SEGMENTS = 2; - - // The Kafka offsets for the tweet create stream and the tweet update stream. Indexing should - // start from these offsets when it resumes. 
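A minimal sketch of how the recorded offsets could be used when indexing resumes (the helper, consumer and topic-partition arguments are hypothetical; the actual resume wiring lives in the startup code, not in this class):

// Hypothetical helper: position the Kafka consumers at the offsets stored in a flushed index.
static void seekToFlushedOffsets(KafkaConsumer<?, ?> consumer,
                                 TopicPartition tweetPartition,
                                 TopicPartition updatePartition,
                                 EarlybirdIndex index) {
  consumer.seek(tweetPartition, index.getTweetOffset());
  consumer.seek(updatePartition, index.getUpdateOffset());
}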
- private final long tweetOffset; - private final long updateOffset; - private final long maxIndexedTweetId; - - public EarlybirdIndex( - List segmentInfoList, - long tweetOffset, - long updateOffset, - long maxIndexedTweetId - ) { - List segmentInfos = new ArrayList<>(segmentInfoList); - Collections.sort(segmentInfos); - this.segmentInfoList = segmentInfos; - this.tweetOffset = tweetOffset; - this.updateOffset = updateOffset; - this.maxIndexedTweetId = maxIndexedTweetId; - } - - public EarlybirdIndex(List segmentInfoList, long tweetOffset, long updateOffset) { - this(segmentInfoList, tweetOffset, updateOffset, -1); - } - - public List getSegmentInfoList() { - return segmentInfoList; - } - - public long getTweetOffset() { - return tweetOffset; - } - - public long getUpdateOffset() { - return updateOffset; - } - - public long getMaxIndexedTweetId() { - return maxIndexedTweetId; - } - - /** - * Returns the number of non-optimized segments in this index. - * @return the number of non-optimized segments in this index. - */ - public int numOfNonOptimizedSegments() { - int numNonOptimized = 0; - for (SegmentInfo segmentInfo : segmentInfoList) { - if (!segmentInfo.isOptimized()) { - numNonOptimized++; - } - } - return numNonOptimized; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.docx b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.docx new file mode 100644 index 000000000..dba9e5a6a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.java b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.java deleted file mode 100644 index 3804b5cd7..000000000 --- a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexFlusher.java +++ /dev/null @@ -1,371 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.text.DateFormat; -import java.text.ParseException; -import java.text.SimpleDateFormat; -import java.time.Duration; -import java.util.ArrayList; -import java.util.Date; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.TimeoutException; - -import scala.runtime.BoxedUnit; - -import com.google.common.base.Preconditions; - -import org.apache.commons.compress.utils.Lists; -import org.apache.commons.lang.RandomStringUtils; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.schema.earlybird.FlushVersion; -import com.twitter.search.common.util.io.flushable.DataSerializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.util.ActionLogger; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionInterface; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionLockFailed; -import com.twitter.search.earlybird.util.ParallelUtil; - -/** - * Flushes an EarlybirdIndex to HDFS, so that when Earlybird starts, it can read the index from - * HDFS instead of indexing from 
scratch. - * - * The path looks like: - * /smf1/rt2/user/search/earlybird/loadtest/realtime/indexes/flush_version_158/partition_8/index_2020_02_25_02 - */ -public class EarlybirdIndexFlusher { - public enum FlushAttemptResult { - CHECKED_RECENTLY, - FOUND_INDEX, - FLUSH_ATTEMPT_MADE, - FAILED_LOCK_ATTEMPT, - HADOOP_TIMEOUT - } - - @FunctionalInterface - public interface PostFlushOperation { - /** - * Run this after we finish flushing an index, before we rejoin the serverset. - */ - void execute(); - } - - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdIndexFlusher.class); - - private static final SearchCounter FLUSH_SUCCESS_COUNTER = - SearchCounter.export("successfully_flushed_index"); - - public static final String TWEET_KAFKA_OFFSET = "tweet_kafka_offset"; - public static final String UPDATE_KAFKA_OFFSET = "update_kafka_offset"; - public static final String FLUSHED_FROM_REPLICA = "flushed_from_replica"; - public static final String SEGMENTS = "segments"; - public static final String TIMESLICE_ID = "timeslice_id"; - - public static final String DATA_SUFFIX = ".data"; - public static final String INFO_SUFFIX = ".info"; - public static final String INDEX_INFO = "earlybird_index.info"; - - private static final String INDEX_PATH_FORMAT = "%s/flush_version_%d/partition_%d"; - public static final DateFormat INDEX_DATE_SUFFIX = new SimpleDateFormat("yyyy_MM_dd_HH"); - public static final String INDEX_PREFIX = "index_"; - public static final String TMP_PREFIX = "tmp_"; - - // Check if we need to flush every five minutes. - private static final long FLUSH_CHECK_PERIOD = Duration.ofMinutes(5).toMillis(); - - // Make sure we don't keep more than 3 copies of the index in HDFS, so that we don't run out of - // HDFS space. - private static final int INDEX_COPIES = 3; - - private static final NonPagingAssert FLUSHING_TOO_MANY_NON_OPTIMIZED_SEGMENTS = - new NonPagingAssert("flushing_too_many_non_optimized_segments"); - - private final CoordinatedEarlybirdActionInterface actionCoordinator; - private final FileSystem fileSystem; - private final Path indexPath; - private final Clock clock; - private final SegmentManager segmentManager; - private final int replicaId; - private final TimeLimitedHadoopExistsCall timeLimitedHadoopExistsCall; - private final OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock; - - private long checkedAt = 0; - - public EarlybirdIndexFlusher( - CoordinatedEarlybirdActionInterface actionCoordinator, - FileSystem fileSystem, - String indexHDFSPath, - SegmentManager segmentManager, - PartitionConfig partitionConfig, - Clock clock, - TimeLimitedHadoopExistsCall timeLimitedHadoopExistsCall, - OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock - ) { - this.actionCoordinator = actionCoordinator; - this.fileSystem = fileSystem; - this.indexPath = buildPathToIndexes(indexHDFSPath, partitionConfig); - this.segmentManager = segmentManager; - this.clock = clock; - this.replicaId = partitionConfig.getHostPositionWithinHashPartition(); - this.timeLimitedHadoopExistsCall = timeLimitedHadoopExistsCall; - this.optimizationAndFlushingCoordinationLock = optimizationAndFlushingCoordinationLock; - } - - /** - * Periodically checks if an index needs to be uploaded to HDFS, and uploads it if necessary. - * Skips flush if unable to acquire the optimizationAndFlushingCoordinationLock. 
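// Outcome summary, matching the FlushAttemptResult values defined above: CHECKED_RECENTLY when
// called again within the five-minute check period, FAILED_LOCK_ATTEMPT when the
// optimization/flushing coordination lock (or the coordinated-action lock) cannot be taken,
// FOUND_INDEX when this hour's index already exists in HDFS, HADOOP_TIMEOUT when the existence
// check times out, and FLUSH_ATTEMPT_MADE otherwise.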
- */ - public FlushAttemptResult flushIfNecessary( - long tweetOffset, - long updateOffset, - PostFlushOperation postFlushOperation) throws Exception { - long now = clock.nowMillis(); - if (now - checkedAt < FLUSH_CHECK_PERIOD) { - return FlushAttemptResult.CHECKED_RECENTLY; - } - - checkedAt = now; - - // Try to aqcuire lock to ensure that we are not in the gc_before_optimization or the - // post_optimization_rebuilds step of optimization. If the lock is not available, then skip - // flushing. - if (!optimizationAndFlushingCoordinationLock.tryLock()) { - return FlushAttemptResult.FAILED_LOCK_ATTEMPT; - } - // Acquired the lock, so wrap the flush in a try/finally block to ensure we release the lock - try { - Path flushPath = pathForHour(); - - try { - // If this doesn't execute on time, it will throw an exception and this function - // finishes its execution. - boolean result = timeLimitedHadoopExistsCall.exists(flushPath); - - if (result) { - return FlushAttemptResult.FOUND_INDEX; - } - } catch (TimeoutException e) { - LOG.warn("Timeout while calling hadoop", e); - return FlushAttemptResult.HADOOP_TIMEOUT; - } - - boolean flushedIndex = false; - try { - // this function returns a boolean. - actionCoordinator.execute("index_flushing", isCoordinated -> - flushIndex(flushPath, isCoordinated, tweetOffset, updateOffset, postFlushOperation)); - flushedIndex = true; - } catch (CoordinatedEarlybirdActionLockFailed e) { - // This only happens when we fail to grab the lock, which is fine because another Earlybird - // is already working on flushing this index, so we don't need to. - LOG.debug("Failed to grab lock", e); - } - - if (flushedIndex) { - // We don't return with a guarantee that we actually flushed something. It's possible - // that the .execute() function above was not able to leave the server set to flush. - return FlushAttemptResult.FLUSH_ATTEMPT_MADE; - } else { - return FlushAttemptResult.FAILED_LOCK_ATTEMPT; - } - } finally { - optimizationAndFlushingCoordinationLock.unlock(); - } - } - - /** - * Create a subpath to the directory with many indexes in it. Will have an index for each hour. - */ - public static Path buildPathToIndexes(String root, PartitionConfig partitionConfig) { - return new Path(String.format( - INDEX_PATH_FORMAT, - root, - FlushVersion.CURRENT_FLUSH_VERSION.getVersionNumber(), - partitionConfig.getIndexingHashPartitionID())); - } - - - /** - * Returns a sorted map from the unix time in millis an index was flushed to the path of an index. - * The last element will be the path of the most recent index. 
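// For example, a directory named index_2020_02_25_02 has its suffix parsed with the
// yyyy_MM_dd_HH format above, so its key is the epoch-millis value for 2020-02-25 02:00; the
// map's natural key ordering then puts the most recent index last.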
- */ - public static SortedMap<Long, Path> getIndexPathsByTime( - Path indexPath, - FileSystem fileSystem - ) throws IOException, ParseException { - LOG.info("Getting index paths from file system: {}", fileSystem.getUri().toASCIIString()); - - SortedMap<Long, Path> pathByTime = new TreeMap<>(); - Path globPattern = indexPath.suffix("/" + EarlybirdIndexFlusher.INDEX_PREFIX + "*"); - LOG.info("Lookup glob pattern: {}", globPattern); - - for (FileStatus indexDir : fileSystem.globStatus(globPattern)) { - String name = new File(indexDir.getPath().toString()).getName(); - String dateString = name.substring(EarlybirdIndexFlusher.INDEX_PREFIX.length()); - Date date = EarlybirdIndexFlusher.INDEX_DATE_SUFFIX.parse(dateString); - pathByTime.put(date.getTime(), indexDir.getPath()); - } - LOG.info("Found {} files matching the pattern.", pathByTime.size()); - - return pathByTime; - } - - private boolean flushIndex( - Path flushPath, - boolean isCoordinated, - long tweetOffset, - long updateOffset, - PostFlushOperation postFlushOperation - ) throws Exception { - Preconditions.checkState(isCoordinated); - - if (fileSystem.exists(flushPath)) { - return false; - } - - LOG.info("Starting index flush"); - - // In case the process is killed suddenly, we wouldn't be able to clean up the temporary - // directory, and we don't want other processes to reuse it, so add some randomness. - Path tmpPath = indexPath.suffix("/" + TMP_PREFIX + RandomStringUtils.randomAlphabetic(8)); - boolean creationSucceed = fileSystem.mkdirs(tmpPath); - if (!creationSucceed) { - throw new IOException("Couldn't create HDFS directory at " + flushPath); - } - - LOG.info("Temp path: {}", tmpPath); - try { - ArrayList<SegmentInfo> segmentInfos = Lists.newArrayList(segmentManager.getSegmentInfos( - SegmentManager.Filter.Enabled, SegmentManager.Order.NEW_TO_OLD).iterator()); - segmentManager.logState("Before flushing"); - EarlybirdIndex index = new EarlybirdIndex(segmentInfos, tweetOffset, updateOffset); - ActionLogger.run( - "Flushing index to " + tmpPath, - () -> flushIndex(tmpPath, index)); - } catch (Exception e) { - LOG.error("Exception while flushing index. Rethrowing."); - - if (fileSystem.delete(tmpPath, true)) { - LOG.info("Successfully deleted temp output"); - } else { - LOG.error("Couldn't delete temp output"); - } - - throw e; - } - - // We flush it to a temporary directory, then rename the temporary directory so that the - // change is atomic, and other Earlybirds will either see the old indexes or the new, complete - // index, but never an in-progress index.
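// HDFS rename is atomic, and readers glob only for index_* directories (see getIndexPathsByTime
// above), so a half-written tmp_* directory is never visible to them, even if the process dies
// before the rename or before cleanup.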
- boolean renameSucceeded = fileSystem.rename(tmpPath, flushPath); - if (!renameSucceeded) { - throw new IOException("Couldn't rename HDFS from " + tmpPath + " to " + flushPath); - } - LOG.info("Flushed index to {}", flushPath); - - cleanupOldIndexes(); - - FLUSH_SUCCESS_COUNTER.increment(); - - LOG.info("Executing post flush operation..."); - postFlushOperation.execute(); - - return true; - } - - private void cleanupOldIndexes() throws Exception { - LOG.info("Looking up whether we need to clean up old indexes..."); - SortedMap pathsByTime = - EarlybirdIndexFlusher.getIndexPathsByTime(indexPath, fileSystem); - - while (pathsByTime.size() > INDEX_COPIES) { - Long key = pathsByTime.firstKey(); - Path oldestHourPath = pathsByTime.remove(key); - LOG.info("Deleting old index at path '{}'.", oldestHourPath); - - if (fileSystem.delete(oldestHourPath, true)) { - LOG.info("Successfully deleted old index"); - } else { - LOG.error("Couldn't delete old index"); - } - } - } - - private Path pathForHour() { - Date date = new Date(clock.nowMillis()); - String time = INDEX_DATE_SUFFIX.format(date); - return indexPath.suffix("/" + INDEX_PREFIX + time); - } - - private void flushIndex(Path flushPath, EarlybirdIndex index) throws Exception { - int numOfNonOptimized = index.numOfNonOptimizedSegments(); - if (numOfNonOptimized > EarlybirdIndex.MAX_NUM_OF_NON_OPTIMIZED_SEGMENTS) { - LOG.error( - "Found {} non-optimized segments when flushing to disk!", numOfNonOptimized); - FLUSHING_TOO_MANY_NON_OPTIMIZED_SEGMENTS.assertFailed(); - } - - int numSegments = index.getSegmentInfoList().size(); - int flushingThreadPoolSize = numSegments; - - if (Config.environmentIsTest()) { - // SEARCH-33763: Limit the thread pool size for tests to avoid using too much memory on scoot. - flushingThreadPoolSize = 2; - } - - LOG.info("Flushing index using a thread pool size of {}", flushingThreadPoolSize); - - ParallelUtil.parmap("flush-index", flushingThreadPoolSize, si -> ActionLogger.call( - "Flushing segment " + si.getSegmentName(), - () -> flushSegment(flushPath, si)), index.getSegmentInfoList()); - - FlushInfo indexInfo = new FlushInfo(); - indexInfo.addLongProperty(UPDATE_KAFKA_OFFSET, index.getUpdateOffset()); - indexInfo.addLongProperty(TWEET_KAFKA_OFFSET, index.getTweetOffset()); - indexInfo.addIntProperty(FLUSHED_FROM_REPLICA, replicaId); - - FlushInfo segmentFlushInfos = indexInfo.newSubProperties(SEGMENTS); - for (SegmentInfo segmentInfo : index.getSegmentInfoList()) { - FlushInfo segmentFlushInfo = segmentFlushInfos.newSubProperties(segmentInfo.getSegmentName()); - segmentFlushInfo.addLongProperty(TIMESLICE_ID, segmentInfo.getTimeSliceID()); - } - - Path indexInfoPath = flushPath.suffix("/" + INDEX_INFO); - try (FSDataOutputStream infoOutputStream = fileSystem.create(indexInfoPath)) { - OutputStreamWriter infoFileWriter = new OutputStreamWriter(infoOutputStream); - FlushInfo.flushAsYaml(indexInfo, infoFileWriter); - } - } - - private BoxedUnit flushSegment(Path flushPath, SegmentInfo segmentInfo) throws Exception { - Path segmentPrefix = flushPath.suffix("/" + segmentInfo.getSegmentName()); - Path segmentPath = segmentPrefix.suffix(DATA_SUFFIX); - - FlushInfo flushInfo = new FlushInfo(); - - try (FSDataOutputStream outputStream = fileSystem.create(segmentPath)) { - DataSerializer out = new DataSerializer(segmentPath.toString(), outputStream); - segmentInfo.getIndexSegment().flush(flushInfo, out); - } - - Path infoPath = segmentPrefix.suffix(INFO_SUFFIX); - - try (FSDataOutputStream infoOutputStream = 
fileSystem.create(infoPath)) { - OutputStreamWriter infoFileWriter = new OutputStreamWriter(infoOutputStream); - FlushInfo.flushAsYaml(flushInfo, infoFileWriter); - } - return BoxedUnit.UNIT; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.docx b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.docx new file mode 100644 index 000000000..37e7f927f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.java b/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.java deleted file mode 100644 index 1806bd106..000000000 --- a/src/java/com/twitter/search/earlybird/partition/EarlybirdIndexLoader.java +++ /dev/null @@ -1,224 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.time.Duration; -import java.util.List; -import java.util.Optional; -import java.util.SortedMap; - -import com.google.common.base.Stopwatch; - -import org.apache.commons.compress.utils.Lists; -import org.apache.hadoop.fs.FSDataInputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.partitioning.base.TimeSlice; -import com.twitter.search.common.util.io.flushable.DataDeserializer; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import com.twitter.search.earlybird.util.ActionLogger; -import com.twitter.search.earlybird.util.ParallelUtil; - -/** - * Loads an index from HDFS, if possible, or indexes all tweets from scratch using a - * FreshStartupHandler. - */ -public class EarlybirdIndexLoader { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdIndexLoader.class); - - public static final String ENV_FOR_TESTS = "test_env"; - - // To determine whether we should or should not load the most recent index from HDFS if available. - public static final long INDEX_FRESHNESS_THRESHOLD_MILLIS = Duration.ofDays(1).toMillis(); - - private static final NonPagingAssert LOADING_TOO_MANY_NON_OPTIMIZED_SEGMENTS = - new NonPagingAssert("loading_too_many_non_optimized_segments"); - - private final FileSystem fileSystem; - private final Path indexPath; - private final PartitionConfig partitionConfig; - private final EarlybirdSegmentFactory earlybirdSegmentFactory; - private final SegmentSyncConfig segmentSyncConfig; - private final Clock clock; - // Aurora environment we're running in: "prod", "loadtest", "staging2" etc. 
etc - private final String environment; - - public EarlybirdIndexLoader( - FileSystem fileSystem, - String indexHDFSPath, - String environment, - PartitionConfig partitionConfig, - EarlybirdSegmentFactory earlybirdSegmentFactory, - SegmentSyncConfig segmentSyncConfig, - Clock clock - ) { - this.fileSystem = fileSystem; - this.partitionConfig = partitionConfig; - this.earlybirdSegmentFactory = earlybirdSegmentFactory; - this.segmentSyncConfig = segmentSyncConfig; - this.indexPath = EarlybirdIndexFlusher.buildPathToIndexes(indexHDFSPath, partitionConfig); - this.clock = clock; - this.environment = environment; - } - - /** - * Tries to load an index from HDFS for this FlushVersion/Partition/Cluster. Returns an empty - * option if there is no index found. - */ - public Optional loadIndex() { - try { - Optional loadedIndex = - ActionLogger.call("Load index from HDFS.", this::loadFromHDFS); - - if (loadedIndex.isPresent()) { - EarlybirdIndex index = loadedIndex.get(); - int numOfNonOptimized = index.numOfNonOptimizedSegments(); - if (numOfNonOptimized > EarlybirdIndex.MAX_NUM_OF_NON_OPTIMIZED_SEGMENTS) { - // We should never have too many unoptimized segments. If this happens we likely have a - // bug somewhere that caused another Earlybird to flush too many unoptimized segments. - // Use NonPagingAssert to alert the oncall if this happens so they can look into it. - LOG.error("Found {} non-optimized segments when loading from disk!", numOfNonOptimized); - LOADING_TOO_MANY_NON_OPTIMIZED_SEGMENTS.assertFailed(); - - // If there are too many unoptimized segments, optimize the older ones until there are - // only MAX_NUM_OF_NON_OPTIMIZED_SEGMENTS left in the unoptimized state. The segment info - // list is always in order, so we will never try to optimize the most recent segments - // here. - int numSegmentsToOptimize = - numOfNonOptimized - EarlybirdIndex.MAX_NUM_OF_NON_OPTIMIZED_SEGMENTS; - LOG.info("Will try to optimize {} segments", numSegmentsToOptimize); - for (SegmentInfo segmentInfo : index.getSegmentInfoList()) { - if (numSegmentsToOptimize > 0 && !segmentInfo.isOptimized()) { - Stopwatch optimizationStopwatch = Stopwatch.createStarted(); - LOG.info("Starting to optimize segment: {}", segmentInfo.getSegmentName()); - segmentInfo.getIndexSegment().optimizeIndexes(); - numSegmentsToOptimize--; - LOG.info("Optimization of segment {} finished in {}.", - segmentInfo.getSegmentName(), optimizationStopwatch); - } - } - } - - int newNumOfNonOptimized = index.numOfNonOptimizedSegments(); - LOG.info("Loaded {} segments. 
{} are unoptimized.", - index.getSegmentInfoList().size(), - newNumOfNonOptimized); - - return loadedIndex; - } - } catch (Throwable e) { - LOG.error("Error loading index from HDFS, will index from scratch.", e); - } - - return Optional.empty(); - } - - private Optional loadFromHDFS() throws Exception { - SortedMap pathsByTime = - EarlybirdIndexFlusher.getIndexPathsByTime(indexPath, fileSystem); - - if (pathsByTime.isEmpty()) { - LOG.info("Could not load index from HDFS (path: {}), will index from scratch.", indexPath); - return Optional.empty(); - } - - long mostRecentIndexTimeMillis = pathsByTime.lastKey(); - Path mostRecentIndexPath = pathsByTime.get(mostRecentIndexTimeMillis); - - if (clock.nowMillis() - mostRecentIndexTimeMillis > INDEX_FRESHNESS_THRESHOLD_MILLIS) { - LOG.info("Most recent index in HDFS (path: {}) is old, will do a fresh startup.", - mostRecentIndexPath); - return Optional.empty(); - } - - EarlybirdIndex index = ActionLogger.call( - "loading index from " + mostRecentIndexPath, - () -> loadIndex(mostRecentIndexPath)); - - return Optional.of(index); - } - - private EarlybirdIndex loadIndex(Path flushPath) throws Exception { - Path indexInfoPath = flushPath.suffix("/" + EarlybirdIndexFlusher.INDEX_INFO); - - FlushInfo indexInfo; - try (FSDataInputStream infoInputStream = fileSystem.open(indexInfoPath)) { - indexInfo = FlushInfo.loadFromYaml(infoInputStream); - } - - FlushInfo segmentsFlushInfo = indexInfo.getSubProperties(EarlybirdIndexFlusher.SEGMENTS); - List segmentNames = Lists.newArrayList(segmentsFlushInfo.getKeyIterator()); - - // This should only happen if you're running in stagingN and loading a prod index through - // the read_index_from_prod_location flag. In this case, we point to a directory that has - // a lot more than the number of segments we want in staging and we trim this list to the - // desired number. 
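[Editor's note] The trimming that the comment above describes, and that the check below implements, keeps only the newest N segment names from an oldest-to-newest list. A tiny standalone sketch, with an illustrative class name:

import java.util.List;

/** Tiny sketch of trimming an oldest-to-newest list of segment names down to the newest N. */
final class TrimToNewestSketch {
  private TrimToNewestSketch() {
  }

  static List<String> trimToNewest(List<String> segmentNames, int maxSegments) {
    if (segmentNames.size() <= maxSegments) {
      return segmentNames; // already within the budget, nothing to trim
    }
    // Keep only the tail of the list, i.e. the most recent segments.
    return segmentNames.subList(segmentNames.size() - maxSegments, segmentNames.size());
  }
}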
- if (environment.matches("staging\\d")) { - if (segmentNames.size() > partitionConfig.getMaxEnabledLocalSegments()) { - LOG.info("Trimming list of loaded segments from size {} to size {}.", - segmentNames.size(), partitionConfig.getMaxEnabledLocalSegments()); - segmentNames = segmentNames.subList( - segmentNames.size() - partitionConfig.getMaxEnabledLocalSegments(), - segmentNames.size()); - } - } - - List segmentInfoList = ParallelUtil.parmap("load-index", name -> { - FlushInfo subProperties = segmentsFlushInfo.getSubProperties(name); - long timesliceID = subProperties.getLongProperty(EarlybirdIndexFlusher.TIMESLICE_ID); - return ActionLogger.call( - "loading segment " + name, - () -> loadSegment(flushPath, name, timesliceID)); - }, segmentNames); - - return new EarlybirdIndex( - segmentInfoList, - indexInfo.getLongProperty(EarlybirdIndexFlusher.TWEET_KAFKA_OFFSET), - indexInfo.getLongProperty(EarlybirdIndexFlusher.UPDATE_KAFKA_OFFSET)); - } - - private SegmentInfo loadSegment( - Path flushPath, - String segmentName, - long timesliceID - ) throws IOException { - Path segmentPrefix = flushPath.suffix("/" + segmentName); - Path segmentPath = segmentPrefix.suffix(EarlybirdIndexFlusher.DATA_SUFFIX); - - TimeSlice timeSlice = new TimeSlice( - timesliceID, - EarlybirdConfig.getMaxSegmentSize(), - partitionConfig.getIndexingHashPartitionID(), - partitionConfig.getNumPartitions()); - - SegmentInfo segmentInfo = new SegmentInfo( - timeSlice.getSegment(), - earlybirdSegmentFactory, - segmentSyncConfig); - - Path infoPath = segmentPrefix.suffix(EarlybirdIndexFlusher.INFO_SUFFIX); - FlushInfo flushInfo; - try (FSDataInputStream infoInputStream = fileSystem.open(infoPath)) { - flushInfo = FlushInfo.loadFromYaml(infoInputStream); - } - - FSDataInputStream inputStream = fileSystem.open(segmentPath); - - // It's significantly slower to read from the FSDataInputStream on demand, so we - // use a buffered reader to pre-read bigger chunks. 
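[Editor's note] The buffering that the comment above motivates is shown in the code that follows; as a reusable helper it is simply the FSDataInputStream wrapped in a BufferedInputStream with a large buffer. The helper class below is illustrative only.

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

/** Illustrative helper: open an HDFS file behind a large read buffer for fast sequential loads. */
final class BufferedHdfsReadSketch {
  private static final int BUFFER_SIZE = 1 << 22; // 4MB, amortizes HDFS round trips

  private BufferedHdfsReadSketch() {
  }

  static InputStream openBuffered(FileSystem fs, Path path) throws IOException {
    return new BufferedInputStream(fs.open(path), BUFFER_SIZE);
  }
}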
- int bufferSize = 1 << 22; // 4MB - BufferedInputStream bufferedInputStream = new BufferedInputStream(inputStream, bufferSize); - - DataDeserializer in = new DataDeserializer(bufferedInputStream, segmentName); - segmentInfo.getIndexSegment().load(in, flushInfo); - - return segmentInfo; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.docx b/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.docx new file mode 100644 index 000000000..5e325dbab Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.java b/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.java deleted file mode 100644 index def6b8939..000000000 --- a/src/java/com/twitter/search/earlybird/partition/EarlybirdKafkaConsumer.java +++ /dev/null @@ -1,281 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.Closeable; -import java.time.Duration; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.collect.ImmutableList; - -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.ApiException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.util.LogFormatUtil; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.CaughtUpMonitor; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.WrappedKafkaApiException; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; - -/** - * Reads TVEs from Kafka and writes them to a PartitionWriter. 
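[Editor's note] As the constructor further below shows, this consumer assigns itself the tweet and update partitions directly rather than subscribing with a consumer group. A self-contained sketch of that setup with the stock Kafka client; the topic names, partition numbers, and deserializers are placeholders, not values from this codebase.

import java.util.Arrays;
import java.util.Properties;

import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

/** Sketch: a consumer that assigns itself fixed partitions instead of joining a consumer group. */
final class AssignedConsumerSketch {
  private AssignedConsumerSketch() {
  }

  static KafkaConsumer<byte[], byte[]> newAssignedConsumer(String bootstrapServers) {
    Properties props = new Properties();
    props.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
    props.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class.getName());
    props.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG,
        ByteArrayDeserializer.class.getName());

    // No subscribe()/group rebalancing: the partitions are picked explicitly, so this process
    // always reads exactly its own tweet and update streams.
    KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props);
    TopicPartition tweetTopic = new TopicPartition("tweet_events", 0);   // placeholder name/partition
    TopicPartition updateTopic = new TopicPartition("update_events", 0); // placeholder name/partition
    consumer.assign(Arrays.asList(updateTopic, tweetTopic));
    return consumer;
  }
}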
- */ -public class EarlybirdKafkaConsumer implements Closeable { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdKafkaConsumer.class); - - private static final Duration POLL_TIMEOUT = Duration.ofSeconds(1); - private static final String STATS_PREFIX = "earlybird_kafka_consumer_"; - - // See SEARCH-31827 - private static final SearchCounter INGESTING_DONE = - SearchCounter.export(STATS_PREFIX + "ingesting_done"); - private static final SearchRateCounter POLL_LOOP_EXCEPTIONS = - SearchRateCounter.export(STATS_PREFIX + "poll_loop_exceptions"); - private static final SearchRateCounter FLUSHING_EXCEPTIONS = - SearchRateCounter.export(STATS_PREFIX + "flushing_exceptions"); - - private static final SearchTimerStats TIMED_POLLS = - SearchTimerStats.export(STATS_PREFIX + "timed_polls"); - private static final SearchTimerStats TIMED_INDEX_EVENTS = - SearchTimerStats.export(STATS_PREFIX + "timed_index_events"); - - private final AtomicBoolean running = new AtomicBoolean(true); - private final BalancingKafkaConsumer balancingKafkaConsumer; - private final PartitionWriter partitionWriter; - protected final TopicPartition tweetTopic; - protected final TopicPartition updateTopic; - private final KafkaConsumer underlyingKafkaConsumer; - private final CriticalExceptionHandler criticalExceptionHandler; - private final EarlybirdIndexFlusher earlybirdIndexFlusher; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private boolean finishedIngestUntilCurrent; - private final CaughtUpMonitor indexCaughtUpMonitor; - - protected class ConsumeBatchResult { - private boolean isCaughtUp; - private long readRecordsCount; - - public ConsumeBatchResult(boolean isCaughtUp, long readRecordsCount) { - this.isCaughtUp = isCaughtUp; - this.readRecordsCount = readRecordsCount; - } - - public boolean isCaughtUp() { - return isCaughtUp; - } - - public long getReadRecordsCount() { - return readRecordsCount; - } - } - - public EarlybirdKafkaConsumer( - KafkaConsumer underlyingKafkaConsumer, - SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler, - PartitionWriter partitionWriter, - TopicPartition tweetTopic, - TopicPartition updateTopic, - EarlybirdIndexFlusher earlybirdIndexFlusher, - CaughtUpMonitor kafkaIndexCaughtUpMonitor - ) { - this.partitionWriter = partitionWriter; - this.underlyingKafkaConsumer = underlyingKafkaConsumer; - this.criticalExceptionHandler = criticalExceptionHandler; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.tweetTopic = tweetTopic; - this.updateTopic = updateTopic; - this.earlybirdIndexFlusher = earlybirdIndexFlusher; - - LOG.info("Reading from Kafka topics: tweetTopic={}, updateTopic={}", tweetTopic, updateTopic); - underlyingKafkaConsumer.assign(ImmutableList.of(updateTopic, tweetTopic)); - - this.balancingKafkaConsumer = - new BalancingKafkaConsumer(underlyingKafkaConsumer, tweetTopic, updateTopic); - this.finishedIngestUntilCurrent = false; - this.indexCaughtUpMonitor = kafkaIndexCaughtUpMonitor; - } - - /** - * Run the consumer, indexing from Kafka. - */ - @VisibleForTesting - public void run() { - while (isRunning()) { - ConsumeBatchResult result = consumeBatch(true); - indexCaughtUpMonitor.setAndNotify(result.isCaughtUp()); - } - } - - /** - * Reads from Kafka, starting at the given offsets, and applies the events until we are caught up - * with the current streams. 
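[Editor's note] A condensed sketch of the first steps of ingestUntilCurrent below: seek both partitions to the offsets recovered from the loaded index, then use endOffsets to estimate how many records remain. Error handling and the WrappedKafkaApiException wrapping are omitted; the class and method names are illustrative.

import java.util.Arrays;
import java.util.Map;

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

/** Sketch of the start of ingestUntilCurrent: seek to saved offsets and estimate the remaining lag. */
final class SeekAndLagSketch {
  private SeekAndLagSketch() {
  }

  static void seekAndReportLag(
      KafkaConsumer<?, ?> consumer,
      TopicPartition tweetTopic, long tweetOffset,
      TopicPartition updateTopic, long updateOffset) {
    // Resume exactly where the loaded index left off.
    consumer.seek(tweetTopic, tweetOffset);
    consumer.seek(updateTopic, updateOffset);

    // endOffsets tells us how far away "current" is for each partition.
    Map<TopicPartition, Long> endOffsets =
        consumer.endOffsets(Arrays.asList(tweetTopic, updateTopic));
    System.out.printf("Records until current: tweets=%,d, updates=%,d%n",
        endOffsets.get(tweetTopic) - tweetOffset,
        endOffsets.get(updateTopic) - updateOffset);
  }
}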
- */ - public void ingestUntilCurrent(long tweetOffset, long updateOffset) { - Preconditions.checkState(!finishedIngestUntilCurrent); - Stopwatch stopwatch = Stopwatch.createStarted(); - LOG.info("Ingest until current: seeking to Kafka offset {} for tweets and {} for updates.", - tweetOffset, updateOffset); - - try { - underlyingKafkaConsumer.seek(tweetTopic, tweetOffset); - underlyingKafkaConsumer.seek(updateTopic, updateOffset); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException("Can't seek to tweet and update offsets", - kafkaApiException); - } - - Map endOffsets; - try { - endOffsets = underlyingKafkaConsumer.endOffsets(ImmutableList.of(tweetTopic, updateTopic)); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException("Can't find end offsets", - kafkaApiException); - } - - if (endOffsets.size() > 0) { - LOG.info(String.format("Records until current: tweets=%,d, updates=%,d", - endOffsets.get(tweetTopic) - tweetOffset + 1, - endOffsets.get(updateTopic) - updateOffset + 1)); - } - - consumeBatchesUntilCurrent(true); - - LOG.info("ingestUntilCurrent finished in {}.", stopwatch); - - partitionWriter.logState(); - INGESTING_DONE.increment(); - finishedIngestUntilCurrent = true; - } - - /** - * Consume tweets and updates from streams until we're up to date. - * - * @return total number of read records. - */ - private long consumeBatchesUntilCurrent(boolean flushingEnabled) { - long totalRecordsRead = 0; - long batchesConsumed = 0; - - while (isRunning()) { - ConsumeBatchResult result = consumeBatch(flushingEnabled); - batchesConsumed++; - totalRecordsRead += result.getReadRecordsCount(); - if (isCurrent(result.isCaughtUp())) { - break; - } - } - - LOG.info("Processed batches: {}", batchesConsumed); - - return totalRecordsRead; - } - - // This method is overriden in MockEarlybirdKafkaConsumer. - public boolean isCurrent(boolean current) { - return current; - } - - /** - * We don't index during flushing, so after the flush is done, the index is stale. - * We need to get to current, before we rejoin the serverset so that upon rejoining we're - * not serving a stale index. - */ - @VisibleForTesting - void getToCurrentPostFlush() { - LOG.info("Getting to current post flush"); - Stopwatch stopwatch = Stopwatch.createStarted(); - - long totalRecordsRead = consumeBatchesUntilCurrent(false); - - LOG.info("Post flush, became current in: {}, after reading {} records.", - stopwatch, LogFormatUtil.formatInt(totalRecordsRead)); - } - - /* - * @return true if we are current after indexing this batch. - */ - @VisibleForTesting - protected ConsumeBatchResult consumeBatch(boolean flushingEnabled) { - long readRecordsCount = 0; - boolean isCaughtUp = false; - - try { - // Poll. - SearchTimer pollTimer = TIMED_POLLS.startNewTimer(); - ConsumerRecords records = - balancingKafkaConsumer.poll(POLL_TIMEOUT); - readRecordsCount += records.count(); - TIMED_POLLS.stopTimerAndIncrement(pollTimer); - - // Index. - SearchTimer indexTimer = TIMED_INDEX_EVENTS.startNewTimer(); - isCaughtUp = partitionWriter.indexBatch(records); - TIMED_INDEX_EVENTS.stopTimerAndIncrement(indexTimer); - } catch (Exception ex) { - POLL_LOOP_EXCEPTIONS.increment(); - LOG.error("Exception in poll loop", ex); - } - - try { - // Possibly flush the index. 
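[Editor's note] The flush gating that follows only fires once the index has caught up; at that point the consumer positions are captured so they can be stored with the flushed index. A simplified sketch is below; the Flusher interface is a stand-in for EarlybirdIndexFlusher.flushIfNecessary, whose real signature also takes a post-flush operation.

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.common.TopicPartition;

/** Sketch of the flush gate: only a caught-up index snapshots its offsets and asks to be flushed. */
final class FlushGateSketch {
  /** Stand-in for EarlybirdIndexFlusher; the real call also takes a post-flush operation. */
  interface Flusher {
    void flushIfNecessary(long tweetOffset, long updateOffset) throws Exception;
  }

  private FlushGateSketch() {
  }

  static void maybeFlush(
      boolean isCaughtUp,
      KafkaConsumer<?, ?> consumer,
      TopicPartition tweetTopic,
      TopicPartition updateTopic,
      Flusher flusher) throws Exception {
    if (!isCaughtUp) {
      return; // flushing a lagging index would persist offsets that are already stale
    }
    // These positions are stored with the flushed index; a later restart seeks back to them.
    long tweetOffset = consumer.position(tweetTopic);
    long updateOffset = consumer.position(updateTopic);
    flusher.flushIfNecessary(tweetOffset, updateOffset);
  }
}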
- if (isCaughtUp && flushingEnabled) { - long tweetOffset = 0; - long updateOffset = 0; - - try { - tweetOffset = underlyingKafkaConsumer.position(tweetTopic); - updateOffset = underlyingKafkaConsumer.position(updateTopic); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException("can't get topic positions", kafkaApiException); - } - - EarlybirdIndexFlusher.FlushAttemptResult flushAttemptResult = - earlybirdIndexFlusher.flushIfNecessary( - tweetOffset, updateOffset, this::getToCurrentPostFlush); - - if (flushAttemptResult == EarlybirdIndexFlusher.FlushAttemptResult.FLUSH_ATTEMPT_MADE) { - // Viz might show this as a fairly high number, so we're printing it here to confirm - // the value on the server. - LOG.info("Finished flushing. Index freshness in ms: {}", - LogFormatUtil.formatInt(searchIndexingMetricSet.getIndexFreshnessInMillis())); - } - - if (!finishedIngestUntilCurrent) { - LOG.info("Became current on startup. Tried to flush with result: {}", - flushAttemptResult); - } - } - } catch (Exception ex) { - FLUSHING_EXCEPTIONS.increment(); - LOG.error("Exception while flushing", ex); - } - - return new ConsumeBatchResult(isCaughtUp, readRecordsCount); - } - - public boolean isRunning() { - return running.get() && EarlybirdStatus.getStatusCode() != EarlybirdStatusCode.STOPPING; - } - - public void prepareAfterStartingWithIndex(long maxIndexedTweetId) { - partitionWriter.prepareAfterStartingWithIndex(maxIndexedTweetId); - } - - public void close() { - balancingKafkaConsumer.close(); - running.set(false); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.docx b/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.docx new file mode 100644 index 000000000..0a7ada50d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.java b/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.java deleted file mode 100644 index e0a2d125d..000000000 --- a/src/java/com/twitter/search/earlybird/partition/EarlybirdStartup.java +++ /dev/null @@ -1,17 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.Closeable; - -import com.twitter.search.earlybird.exception.EarlybirdStartupException; - -/** - * Handles starting and indexing data for an Earlybird. - */ -@FunctionalInterface -public interface EarlybirdStartup { - /** - * Handles indexing Tweets, Tweet Updates and user updates. Blocks until current, and forks a - * thread to keep the index current. - */ - Closeable start() throws EarlybirdStartupException; -} diff --git a/src/java/com/twitter/search/earlybird/partition/FlowControlException.docx b/src/java/com/twitter/search/earlybird/partition/FlowControlException.docx new file mode 100644 index 000000000..a767b57d3 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/FlowControlException.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/FlowControlException.java b/src/java/com/twitter/search/earlybird/partition/FlowControlException.java deleted file mode 100644 index f7a6bced5..000000000 --- a/src/java/com/twitter/search/earlybird/partition/FlowControlException.java +++ /dev/null @@ -1,16 +0,0 @@ -package com.twitter.search.earlybird.partition; - -/** - * Exception used to cause a ScheduledExecutorService to stop executing. Used when the - * success condition of the class has been achieved. 
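[Editor's note] FlowControlException relies on a documented property of ScheduledExecutorService: if a periodic task throws, its subsequent executions are suppressed. A toy, standalone illustration of stopping a repeating task that way; a plain RuntimeException stands in for FlowControlException here.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

/** Toy illustration: stop a repeating task by throwing once its success condition is met. */
public final class FlowControlSketch {
  public static void main(String[] args) throws InterruptedException {
    ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    AtomicInteger attempts = new AtomicInteger();

    executor.scheduleAtFixedRate(() -> {
      int n = attempts.incrementAndGet();
      System.out.println("attempt " + n);
      if (n == 3) {
        // Success condition reached: throwing suppresses all further executions of this task
        // (this RuntimeException plays the role of FlowControlException).
        throw new RuntimeException("done, stop rescheduling");
      }
    }, 0, 100, TimeUnit.MILLISECONDS);

    Thread.sleep(1000);
    executor.shutdown(); // the task has already stopped itself after the third run
  }
}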
- */ -public class FlowControlException extends RuntimeException { - - public FlowControlException() { - super(); - } - - public FlowControlException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/HdfsUtil.docx b/src/java/com/twitter/search/earlybird/partition/HdfsUtil.docx new file mode 100644 index 000000000..5a86f5021 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/HdfsUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/HdfsUtil.java b/src/java/com/twitter/search/earlybird/partition/HdfsUtil.java deleted file mode 100644 index 5c393df50..000000000 --- a/src/java/com/twitter/search/earlybird/partition/HdfsUtil.java +++ /dev/null @@ -1,30 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; - -public final class HdfsUtil { - private HdfsUtil() { - } - - public static FileSystem getHdfsFileSystem() throws IOException { - Configuration config = new Configuration(); - // Since earlybird uses hdfs from different threads, and closes the FileSystem from - // them independently, we want each thread to have its own, new FileSystem. - return FileSystem.newInstance(config); - } - - /** - * Checks if the given segment is present on HDFS - */ - public static boolean segmentExistsOnHdfs(FileSystem fs, SegmentInfo segmentInfo) - throws IOException { - String hdfsBaseDirPrefix = segmentInfo.getSyncInfo().getHdfsUploadDirPrefix(); - FileStatus[] statuses = fs.globStatus(new Path(hdfsBaseDirPrefix)); - return statuses != null && statuses.length > 0; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.docx b/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.docx new file mode 100644 index 000000000..9d19eca05 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.java b/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.java deleted file mode 100644 index c4b32ea25..000000000 --- a/src/java/com/twitter/search/earlybird/partition/ISegmentWriter.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; - -public interface ISegmentWriter { - enum Result { - SUCCESS, - FAILURE_RETRYABLE, - FAILURE_NOT_RETRYABLE, - } - - /** - * Indexes the given ThriftVersionedEvents instance (adds it to the segment associated with this - * SegmentWriter instance). - */ - Result indexThriftVersionedEvents(ThriftVersionedEvents tve) throws IOException; - - /** - * Returns the segment info for this segment writer. 
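[Editor's note] The Result values above distinguish retryable from non-retryable indexing failures. One plausible caller-side use is a bounded retry loop; the helper below is hypothetical (it assumes it lives in the same package as ISegmentWriter) and is an illustration of the enum's semantics, not how Earlybird's writers actually consume it.

import java.io.IOException;

import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents;

/** Hypothetical helper: bounded retries around a segment writer when the failure is retryable. */
final class RetryingIndexSketch {
  private RetryingIndexSketch() {
  }

  static ISegmentWriter.Result indexWithRetries(
      ISegmentWriter writer, ThriftVersionedEvents tve, int maxAttempts) throws IOException {
    ISegmentWriter.Result result = ISegmentWriter.Result.FAILURE_RETRYABLE;
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
      result = writer.indexThriftVersionedEvents(tve);
      if (result != ISegmentWriter.Result.FAILURE_RETRYABLE) {
        return result; // SUCCESS or FAILURE_NOT_RETRYABLE: no point retrying
      }
    }
    return result; // still retryable after maxAttempts; the caller decides what to do next
  }
}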
- */ - SegmentInfo getSegmentInfo(); -} diff --git a/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.docx b/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.docx new file mode 100644 index 000000000..cb72f5332 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.java b/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.java deleted file mode 100644 index 16235722d..000000000 --- a/src/java/com/twitter/search/earlybird/partition/IndexingResultCounts.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.earlybird.partition; - -/** - * Helper class used to store counts to be logged. - */ -public class IndexingResultCounts { - private int indexingCalls; - private int failureRetriable; - private int failureNotRetriable; - private int indexingSuccess; - - public IndexingResultCounts() { - } - - /** - * Updates the internal counts with a single result. - */ - public void countResult(ISegmentWriter.Result result) { - indexingCalls++; - if (result == ISegmentWriter.Result.FAILURE_NOT_RETRYABLE) { - failureNotRetriable++; - } else if (result == ISegmentWriter.Result.FAILURE_RETRYABLE) { - failureRetriable++; - } else if (result == ISegmentWriter.Result.SUCCESS) { - indexingSuccess++; - } - } - - int getIndexingCalls() { - return indexingCalls; - } - - int getFailureRetriable() { - return failureRetriable; - } - - int getFailureNotRetriable() { - return failureNotRetriable; - } - - int getIndexingSuccess() { - return indexingSuccess; - } - - @Override - public String toString() { - return String.format("[calls: %,d, success: %,d, fail not-retryable: %,d, fail retryable: %,d]", - indexingCalls, indexingSuccess, failureNotRetriable, failureRetriable); - } -} - diff --git a/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.docx b/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.docx new file mode 100644 index 000000000..353d1a2ab Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.java b/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.java deleted file mode 100644 index 2f72a2c75..000000000 --- a/src/java/com/twitter/search/earlybird/partition/InstrumentedQueue.java +++ /dev/null @@ -1,51 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.concurrent.ConcurrentLinkedDeque; -import java.util.concurrent.atomic.AtomicLong; - -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; - -/** - * A queue with metrics on size, enqueue rate and dequeue rate. - */ -public class InstrumentedQueue { - private final SearchRateCounter enqueueRate; - private final SearchRateCounter dequeueRate; - private final AtomicLong queueSize = new AtomicLong(); - - private final ConcurrentLinkedDeque queue; - - public InstrumentedQueue(String statsPrefix) { - SearchLongGauge.export(statsPrefix + "_size", queueSize); - enqueueRate = SearchRateCounter.export(statsPrefix + "_enqueue"); - dequeueRate = SearchRateCounter.export(statsPrefix + "_dequeue"); - - queue = new ConcurrentLinkedDeque<>(); - } - - /** - * Adds a new element to the queue. 
- */ - public void add(T tve) { - queue.add(tve); - enqueueRate.increment(); - queueSize.incrementAndGet(); - } - - /** - * Returns the first element in the queue. If the queue is empty, {@code null} is returned. - */ - public T poll() { - T tve = queue.poll(); - if (tve != null) { - dequeueRate.increment(); - queueSize.decrementAndGet(); - } - return tve; - } - - public long getQueueSize() { - return queueSize.get(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/KafkaStartup.docx b/src/java/com/twitter/search/earlybird/partition/KafkaStartup.docx new file mode 100644 index 000000000..8e795c26b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/KafkaStartup.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/KafkaStartup.java b/src/java/com/twitter/search/earlybird/partition/KafkaStartup.java deleted file mode 100644 index e413125d7..000000000 --- a/src/java/com/twitter/search/earlybird/partition/KafkaStartup.java +++ /dev/null @@ -1,328 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.Closeable; -import java.util.Optional; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Stopwatch; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.config.Config; -import com.twitter.search.common.decider.SearchDecider; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.partition.freshstartup.FreshStartupHandler; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.search.queryparser.query.QueryParserException; - -/** - * Handles starting an Earlybird from Kafka topics. - * - * Currently very unoptimized -- future versions will implement parallel indexing and loading - * serialized data from HDFS. See http://go/removing-dl-tdd. 
- */ -public class KafkaStartup implements EarlybirdStartup { - private static final Logger LOG = LoggerFactory.getLogger(KafkaStartup.class); - - private final EarlybirdKafkaConsumer earlybirdKafkaConsumer; - private final StartupUserEventIndexer startupUserEventIndexer; - private final QueryCacheManager queryCacheManager; - private final SegmentManager segmentManager; - private final EarlybirdIndexLoader earlybirdIndexLoader; - private final FreshStartupHandler freshStartupHandler; - private final UserUpdatesStreamIndexer userUpdatesStreamIndexer; - private final UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final SearchLongGauge loadedIndex; - private final SearchLongGauge freshStartup; - private final MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - private final AudioSpaceEventsStreamIndexer audioSpaceEventsStreamIndexer; - private final CriticalExceptionHandler earlybirdExceptionHandler; - private final SearchDecider decider; - - private static final String FRESH_STARTUP = "fresh startup"; - private static final String INGEST_UNTIL_CURRENT = "ingest until current"; - private static final String LOAD_FLUSHED_INDEX = "load flushed index"; - private static final String SETUP_QUERY_CACHE = "setting up query cache"; - private static final String USER_UPDATES_STARTUP = "user updates startup"; - private static final String AUDIO_SPACES_STARTUP = "audio spaces startup"; - private static final String BUILD_MULTI_SEGMENT_TERM_DICTIONARY = - "build multi segment term dictionary"; - - public KafkaStartup( - SegmentManager segmentManager, - EarlybirdKafkaConsumer earlybirdKafkaConsumer, - StartupUserEventIndexer startupUserEventIndexer, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - AudioSpaceEventsStreamIndexer audioSpaceEventsStreamIndexer, - QueryCacheManager queryCacheManager, - EarlybirdIndexLoader earlybirdIndexLoader, - FreshStartupHandler freshStartupHandler, - SearchIndexingMetricSet searchIndexingMetricSet, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - CriticalExceptionHandler earlybirdExceptionHandler, - SearchDecider decider - ) { - this.segmentManager = segmentManager; - this.earlybirdKafkaConsumer = earlybirdKafkaConsumer; - this.startupUserEventIndexer = startupUserEventIndexer; - this.queryCacheManager = queryCacheManager; - this.earlybirdIndexLoader = earlybirdIndexLoader; - this.freshStartupHandler = freshStartupHandler; - this.userUpdatesStreamIndexer = userUpdatesStreamIndexer; - this.userScrubGeoEventStreamIndexer = userScrubGeoEventStreamIndexer; - this.audioSpaceEventsStreamIndexer = audioSpaceEventsStreamIndexer; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.loadedIndex = SearchLongGauge.export("kafka_startup_loaded_index"); - this.freshStartup = SearchLongGauge.export("fresh_startup"); - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.earlybirdExceptionHandler = earlybirdExceptionHandler; - this.decider = decider; - freshStartup.set(0); - } - - private void userEventsStartup() { - LOG.info("Start indexing user events."); - - startupUserEventIndexer.indexAllEvents(); - - LOG.info("Finished loading/indexing user events."); - - // User updates are now current, keep them current by continuing to index from the stream. 
- LOG.info("Starting to run UserUpdatesStreamIndexer"); - new Thread(userUpdatesStreamIndexer::run, "userupdates-stream-indexer").start(); - - if (EarlybirdConfig.consumeUserScrubGeoEvents()) { - // User scrub geo events are now current, - // keep them current by continuing to index from the stream. - LOG.info("Starting to run UserScrubGeoEventsStreamIndexer"); - new Thread(userScrubGeoEventStreamIndexer::run, - "userScrubGeoEvents-stream-indexer").start(); - } - } - - private void loadAudioSpaceEvents() { - LOG.info("Index audio space events..."); - EarlybirdStatus.beginEvent(AUDIO_SPACES_STARTUP, - searchIndexingMetricSet.startupInAudioSpaceEventIndexer); - - if (audioSpaceEventsStreamIndexer == null) { - LOG.error("Null audioSpaceEventsStreamIndexer"); - return; - } - - if (decider.isAvailable("enable_reading_audio_space_events")) { - Stopwatch stopwatch = Stopwatch.createStarted(); - audioSpaceEventsStreamIndexer.seekToBeginning(); - audioSpaceEventsStreamIndexer.readRecordsUntilCurrent(); - LOG.info("Finished reading audio spaces in {}", stopwatch); - audioSpaceEventsStreamIndexer.printSummary(); - - new Thread(audioSpaceEventsStreamIndexer::run, - "audioSpaceEvents-stream-indexer").start(); - } else { - LOG.info("Reading audio space events not enabled"); - } - - EarlybirdStatus.endEvent(AUDIO_SPACES_STARTUP, - searchIndexingMetricSet.startupInAudioSpaceEventIndexer); - } - - private void tweetsAndUpdatesStartup() throws EarlybirdStartupException { - LOG.info("Index tweets and updates..."); - EarlybirdStatus.beginEvent(LOAD_FLUSHED_INDEX, - searchIndexingMetricSet.startupInLoadFlushedIndex); - EarlybirdIndex index; - - // Set when you want to get a server from starting to ready quickly for development - // purposes. - boolean fastDevStartup = EarlybirdConfig.getBool("fast_dev_startup"); - - Optional optIndex = Optional.empty(); - if (!fastDevStartup) { - optIndex = earlybirdIndexLoader.loadIndex(); - } - - if (optIndex.isPresent()) { - loadedIndex.set(1); - LOG.info("Loaded an index."); - index = optIndex.get(); - EarlybirdStatus.endEvent(LOAD_FLUSHED_INDEX, - searchIndexingMetricSet.startupInLoadFlushedIndex); - } else { - LOG.info("Didn't load an index, indexing from scratch."); - freshStartup.set(1); - boolean parallelIndexFromScratch = EarlybirdConfig.getBool( - "parallel_index_from_scratch"); - LOG.info("parallel_index_from_scratch: {}", parallelIndexFromScratch); - EarlybirdStatus.beginEvent(FRESH_STARTUP, - searchIndexingMetricSet.startupInFreshStartup); - try { - if (fastDevStartup) { - index = freshStartupHandler.fastIndexFromScratchForDevelopment(); - } else if (parallelIndexFromScratch) { - index = freshStartupHandler.parallelIndexFromScratch(); - } else { - index = freshStartupHandler.indexFromScratch(); - } - } catch (Exception ex) { - throw new EarlybirdStartupException(ex); - } finally { - EarlybirdStatus.endEvent(FRESH_STARTUP, - searchIndexingMetricSet.startupInFreshStartup); - } - } - - LOG.info("Index has {} segments.", index.getSegmentInfoList().size()); - if (index.getSegmentInfoList().size() > 0) { - LOG.info("Inserting segments into SegmentManager"); - for (SegmentInfo segmentInfo : index.getSegmentInfoList()) { - segmentManager.putSegmentInfo(segmentInfo); - } - - earlybirdKafkaConsumer.prepareAfterStartingWithIndex( - index.getMaxIndexedTweetId() - ); - } - - // Build the Multi segment term dictionary before catching up on indexing to ensure that the - // segments won't roll and delete the oldest segment while a multi segment term dictionary that - // includes 
that segment is being built. - buildMultiSegmentTermDictionary(); - - segmentManager.logState("Starting ingestUntilCurrent"); - LOG.info("partial updates indexed: {}", segmentManager.getNumPartialUpdates()); - EarlybirdStatus.beginEvent(INGEST_UNTIL_CURRENT, - searchIndexingMetricSet.startupInIngestUntilCurrent); - - earlybirdKafkaConsumer.ingestUntilCurrent(index.getTweetOffset(), index.getUpdateOffset()); - - validateSegments(); - segmentManager.logState("ingestUntilCurrent is done"); - LOG.info("partial updates indexed: {}", segmentManager.getNumPartialUpdates()); - EarlybirdStatus.endEvent(INGEST_UNTIL_CURRENT, - searchIndexingMetricSet.startupInIngestUntilCurrent); - new Thread(earlybirdKafkaConsumer::run, "earlybird-kafka-consumer").start(); - } - - protected void validateSegments() throws EarlybirdStartupException { - if (!Config.environmentIsTest()) { - // Unfortunately, many tests start Earlybirds with 0 indexed documents, so we disable this - // check in tests. - validateSegmentsForNonTest(); - } - } - - protected void validateSegmentsForNonTest() throws EarlybirdStartupException { - // SEARCH-24123: Prevent Earlybird from starting if there are no indexed documents. - if (segmentManager.getNumIndexedDocuments() == 0) { - throw new EarlybirdStartupException("Earlybird has zero indexed documents."); - } - } - - private void queryCacheStartup() throws EarlybirdStartupException { - EarlybirdStatus.beginEvent(SETUP_QUERY_CACHE, - searchIndexingMetricSet.startupInQueryCacheUpdates); - try { - queryCacheManager.setupTasksIfNeeded(segmentManager); - } catch (QueryParserException e) { - LOG.error("Exception when setting up query cache tasks"); - throw new EarlybirdStartupException(e); - } - - queryCacheManager.waitUntilAllQueryCachesAreBuilt(); - - // Print the sizes of the query caches so that we can see that they're built. - Iterable segmentInfos = - segmentManager.getSegmentInfos(SegmentManager.Filter.All, SegmentManager.Order.OLD_TO_NEW); - segmentManager.logState("After building query caches"); - for (SegmentInfo segmentInfo : segmentInfos) { - LOG.info("Segment: {}, Total cardinality: {}", segmentInfo.getSegmentName(), - segmentInfo.getIndexSegment().getQueryCachesCardinality()); - } - - // We're done building the query caches for all segments, and the earlybird is ready to become - // current. Restrict all future query cache task runs to one single core, to make sure our - // searcher threads are not impacted. - queryCacheManager.setWorkerPoolSizeAfterStartup(); - EarlybirdStatus.endEvent(SETUP_QUERY_CACHE, - searchIndexingMetricSet.startupInQueryCacheUpdates); - } - - /** - * Closes all currently running Indexers. - */ - @VisibleForTesting - public void shutdownIndexing() { - LOG.info("Shutting down KafkaStartup."); - - earlybirdKafkaConsumer.close(); - userUpdatesStreamIndexer.close(); - userScrubGeoEventStreamIndexer.close(); - // Note that the QueryCacheManager is shut down in EarlybirdServer::shutdown. 
- } - - private void buildMultiSegmentTermDictionary() { - EarlybirdStatus.beginEvent(BUILD_MULTI_SEGMENT_TERM_DICTIONARY, - searchIndexingMetricSet.startupInMultiSegmentTermDictionaryUpdates); - Stopwatch stopwatch = Stopwatch.createStarted(); - LOG.info("Building multi segment term dictionary"); - multiSegmentTermDictionaryManager.buildDictionary(); - LOG.info("Done with building multi segment term dictionary in {}", stopwatch); - EarlybirdStatus.endEvent(BUILD_MULTI_SEGMENT_TERM_DICTIONARY, - searchIndexingMetricSet.startupInMultiSegmentTermDictionaryUpdates); - } - - private void parallelIndexingStartup() throws EarlybirdStartupException { - Thread userEventsThread = new Thread(this::userEventsStartup, "index-user-events-startup"); - Thread tweetsAndUpdatesThread = new Thread(() -> { - try { - tweetsAndUpdatesStartup(); - } catch (EarlybirdStartupException e) { - earlybirdExceptionHandler.handle(this, e); - } - }, "index-tweets-and-updates-startup"); - Thread audioSpaceEventsThread = new Thread(this::loadAudioSpaceEvents, - "index-audio-space-events-startup"); - userEventsThread.start(); - tweetsAndUpdatesThread.start(); - audioSpaceEventsThread.start(); - - try { - userEventsThread.join(); - } catch (InterruptedException e) { - throw new EarlybirdStartupException("Interrupted while indexing user events"); - } - try { - tweetsAndUpdatesThread.join(); - } catch (InterruptedException e) { - throw new EarlybirdStartupException("Interrupted while indexing tweets and updates"); - } - try { - audioSpaceEventsThread.join(); - } catch (InterruptedException e) { - throw new EarlybirdStartupException("Interrupted while indexing audio space events"); - } - } - - /** - * Does startups and starts indexing. Returns when the earlybird - * is current. - */ - @Override - public Closeable start() throws EarlybirdStartupException { - parallelIndexingStartup(); - queryCacheStartup(); - - EarlybirdStatus.setStatus(EarlybirdStatusCode.CURRENT); - - return this::shutdownIndexing; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.docx b/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.docx new file mode 100644 index 000000000..3acfea50a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.java b/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.java deleted file mode 100644 index a1abba74b..000000000 --- a/src/java/com/twitter/search/earlybird/partition/MultiSegmentTermDictionaryManager.java +++ /dev/null @@ -1,314 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.Collections; -import java.util.List; -import java.util.Map; -import java.util.concurrent.TimeUnit; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import 
com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.core.earlybird.index.inverted.InvertedIndex; -import com.twitter.search.core.earlybird.index.inverted.MultiSegmentTermDictionary; -import com.twitter.search.core.earlybird.index.inverted.MultiSegmentTermDictionaryWithFastutil; -import com.twitter.search.core.earlybird.index.inverted.OptimizedMemoryIndex; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.index.EarlybirdSegment; -import com.twitter.search.earlybird.partition.SegmentManager.Filter; -import com.twitter.search.earlybird.partition.SegmentManager.Order; - -/** - * Manages MultiSegmentTermDictionary's for specific fields on this earlybird. Only manages them - * for optimized segments, and should only regenerate new dictionaries when the list of optimized - * segments changes. See SEARCH-10836 - */ -public class MultiSegmentTermDictionaryManager { - private static final Logger LOG = - LoggerFactory.getLogger(MultiSegmentTermDictionaryManager.class); - - @VisibleForTesting - public static final SearchTimerStats TERM_DICTIONARY_CREATION_STATS = - SearchTimerStats.export("multi_segment_term_dictionary_manager_build_dictionary", - TimeUnit.MILLISECONDS, false); - - public static final MultiSegmentTermDictionaryManager NOOP_INSTANCE = - new MultiSegmentTermDictionaryManager( - new Config(Collections.emptyList()), null, null, null, null) { - @Override - public boolean buildDictionary() { - return false; - } - }; - - private static final String MANAGER_DISABLED_DECIDER_KEY_PREFIX = - "multi_segment_term_dictionary_manager_disabled_in_"; - - public static class Config { - private final ImmutableList fieldNames; - - public Config(List fieldNames) { - Preconditions.checkNotNull(fieldNames); - this.fieldNames = ImmutableList.copyOf(fieldNames); - } - - public List managedFieldNames() { - return fieldNames; - } - - public boolean isEnabled() { - return EarlybirdConfig.getBool("multi_segment_term_dictionary_enabled", false); - } - } - - @VisibleForTesting - public static String getManagerDisabledDeciderName(EarlybirdCluster earlybirdCluster) { - return MANAGER_DISABLED_DECIDER_KEY_PREFIX + earlybirdCluster.name().toLowerCase(); - } - - private static final class FieldStats { - private final SearchTimerStats buildTime; - private final SearchLongGauge numTerms; - private final SearchLongGauge numTermEntries; - - private FieldStats(SearchStatsReceiver statsReceiver, String fieldName) { - Preconditions.checkNotNull(fieldName); - Preconditions.checkNotNull(statsReceiver); - - String timerName = String.format( - "multi_segment_term_dictionary_manager_field_%s_build_dictionary", fieldName); - this.buildTime = statsReceiver.getTimerStats( - timerName, TimeUnit.MILLISECONDS, false, false, false); - - String numTermsName = String.format( - "multi_segment_term_dictionary_manager_field_%s_num_terms", fieldName); - this.numTerms = statsReceiver.getLongGauge(numTermsName); - - String numTermEntriesName = String.format( - "multi_segment_term_dictionary_manager_field_%s_num_term_entries", fieldName); - this.numTermEntries = statsReceiver.getLongGauge(numTermEntriesName); - } - } - - private final Config config; - @Nullable private final SegmentManager segmentManager; - @Nullable private final Decider decider; - @Nullable private final EarlybirdCluster earlybirdCluster; - private final ImmutableMap fieldTimerStats; - // A per-field map of multi-segment term 
dictionaries. Each key is a field. The values are the - // multi-segment term dictionaries for that field. - private volatile ImmutableMap multiSegmentTermDictionaryMap; - private List previousSegmentsToMerge; - - public MultiSegmentTermDictionaryManager( - Config config, - SegmentManager segmentManager, - SearchStatsReceiver statsReceiver, - Decider decider, - EarlybirdCluster earlybirdCluster) { - this.config = config; - this.segmentManager = segmentManager; - this.decider = decider; - this.earlybirdCluster = earlybirdCluster; - - this.multiSegmentTermDictionaryMap = ImmutableMap.of(); - this.previousSegmentsToMerge = Lists.newArrayList(); - - ImmutableMap.Builder builder = ImmutableMap.builder(); - if (statsReceiver != null) { - for (String fieldName : config.managedFieldNames()) { - builder.put(fieldName, new FieldStats(statsReceiver, fieldName)); - } - } - this.fieldTimerStats = builder.build(); - } - - /** - * Return the most recently built MultiSegmentTermDictionary for the given field. - * Will return null if the field is not supported by this manager. - */ - @Nullable - public MultiSegmentTermDictionary getMultiSegmentTermDictionary(String fieldName) { - return this.multiSegmentTermDictionaryMap.get(fieldName); - } - - /** - * Build new versions of multi-segment term dictionaries if the manager is enabled, and new - * segments are available. - * @return true if the manager actually ran, and generated new versions of multi-segment term - * dictionaries. - * - * We synchronize this method because it would be a logic error to modify the variables from - * multiple threads simultaneously, and it is possible for two segments to finish optimizing at - * the same time and try to run it. - */ - public synchronized boolean buildDictionary() { - if (!config.isEnabled()) { - return false; - } - - Preconditions.checkNotNull(decider); - Preconditions.checkNotNull(earlybirdCluster); - if (DeciderUtil.isAvailableForRandomRecipient(decider, - getManagerDisabledDeciderName(earlybirdCluster))) { - LOG.info("Multi segment term dictionary manager is disabled via decider for cluster {}.", - earlybirdCluster); - this.multiSegmentTermDictionaryMap = ImmutableMap.of(); - this.previousSegmentsToMerge = Lists.newArrayList(); - return false; - } - - List segmentsToMerge = getSegmentsToMerge(); - - if (differentFromPreviousList(segmentsToMerge)) { - long start = System.currentTimeMillis(); - try { - this.multiSegmentTermDictionaryMap = createNewDictionaries(segmentsToMerge); - this.previousSegmentsToMerge = segmentsToMerge; - return true; - } catch (IOException e) { - LOG.error("Unable to build multi segment term dictionaries", e); - return false; - } finally { - long elapsed = System.currentTimeMillis() - start; - TERM_DICTIONARY_CREATION_STATS.timerIncrement(elapsed); - } - } else { - LOG.warn("No-op for buildDictionary()"); - return false; - } - } - - /** - * Only merge terms from enabled and optimized segments. No need to look at non-enabled segments, - * and we also don't want to use un-optimized segments as their term dictionaries are still - * changing. 
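[Editor's note] buildDictionary above rebuilds only when the list of optimized segments has changed since the last run; differentFromPreviousList carries that check. Reduced to its essentials, with segment names standing in for SegmentInfo ordering and comparison, the pattern looks like this sketch:

import java.util.ArrayList;
import java.util.List;

/** Sketch: rebuild a derived structure only when the input segment list actually changed. */
final class RebuildOnChangeSketch {
  private List<String> previousSegments = new ArrayList<>();

  /** Segment names stand in for SegmentInfo; the list is expected oldest-to-newest. */
  synchronized boolean rebuildIfChanged(List<String> currentSegments, Runnable rebuild) {
    if (currentSegments.equals(previousSegments)) {
      return false; // same optimized segments as last time: keep the existing dictionaries
    }
    rebuild.run(); // e.g. merge the per-field, per-segment term dictionaries again
    previousSegments = new ArrayList<>(currentSegments);
    return true;
  }
}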
- */ - private List getSegmentsToMerge() { - Iterable segmentInfos = - segmentManager.getSegmentInfos(Filter.Enabled, Order.OLD_TO_NEW); - - List segmentsToMerge = Lists.newArrayList(); - for (SegmentInfo segmentInfo : segmentInfos) { - if (segmentInfo.getIndexSegment().isOptimized()) { - segmentsToMerge.add(segmentInfo); - } - } - return segmentsToMerge; - } - - private boolean differentFromPreviousList(List segmentsToMerge) { - // there is a potentially different approach here to only check if the - // segmentsToMerge is subsumed by the previousSegmentsToMerge list, and not recompute - // the multi segment term dictionary if so. - // There is a case where a new segment is added, the previously current segment is not yet - // optimized, but the oldest segment is dropped. With this impl, we will recompute to remove - // the dropped segment, however, we will recompute soon again when the - // "previously current segment" is actually optimized. We can potentially delay the first - // merging before the optimization. - if (this.previousSegmentsToMerge.size() == segmentsToMerge.size()) { - for (int i = 0; i < this.previousSegmentsToMerge.size(); i++) { - if (previousSegmentsToMerge.get(i).compareTo(segmentsToMerge.get(i)) != 0) { - return true; - } - } - return false; - } - return true; - } - - /** - * Rebuild the term dictionaries from scratch for all the managed fields. - * Returning a brand new map here with all the fields' term dictionaries so that we can isolate - * failures to build, and only replace the entire map of all the fields are built successfully. - */ - private ImmutableMap createNewDictionaries( - List segments) throws IOException { - - Map map = Maps.newHashMap(); - - for (String field : config.managedFieldNames()) { - LOG.info("Merging term dictionaries for field {}", field); - - List indexesToMerge = findFieldIndexesToMerge(segments, field); - - if (indexesToMerge.isEmpty()) { - LOG.info("No indexes to merge for field {}", field); - } else { - long start = System.currentTimeMillis(); - - MultiSegmentTermDictionary multiSegmentTermDictionary = - mergeDictionaries(field, indexesToMerge); - - map.put(field, multiSegmentTermDictionary); - - long elapsed = System.currentTimeMillis() - start; - LOG.info("Done merging term dictionary for field {}, for {} segments in {}ms", - field, indexesToMerge.size(), elapsed); - - FieldStats fieldStats = fieldTimerStats.get(field); - fieldStats.buildTime.timerIncrement(elapsed); - fieldStats.numTerms.set(multiSegmentTermDictionary.getNumTerms()); - fieldStats.numTermEntries.set(multiSegmentTermDictionary.getNumTermEntries()); - } - } - return ImmutableMap.copyOf(map); - } - - private List findFieldIndexesToMerge( - List segments, String field) throws IOException { - - List indexesToMerge = Lists.newArrayList(); - - for (SegmentInfo segment : segments) { - EarlybirdSegment indexSegment = segment.getIndexSegment(); - Preconditions.checkState(indexSegment.isOptimized(), - "Expect segment to be optimized: %s", segment); - - InvertedIndex fieldIndex = Preconditions.checkNotNull(indexSegment.getIndexReader()) - .getSegmentData().getFieldIndex(field); - - // See SEARCH-11952 - // We will only have a InvertedIndex/OptimizedMemoryIndex here - // in the in-memory non-lucene-based indexes, and not in the archive. We can somewhat - // reasonably extend this to work with the archive by making the dictionaries work with - // TermsEnum's directly instead of OptimizedMemoryIndex's. Leaving this as a further - // extension for now. 
- if (fieldIndex != null) { - if (fieldIndex instanceof OptimizedMemoryIndex) { - indexesToMerge.add((OptimizedMemoryIndex) fieldIndex); - } else { - LOG.info("Found field index for field {} in segment {} of type {}", - field, segment, fieldIndex.getClass()); - } - } else { - LOG.info("Found null field index for field {} in segment {}", field, segment); - } - } - LOG.info("Found good fields for {} out of {} segments", indexesToMerge.size(), - segments.size()); - - return indexesToMerge; - } - - private MultiSegmentTermDictionary mergeDictionaries( - String field, - List indexes) { - // May change this if we get a better implementation in the future. - return new MultiSegmentTermDictionaryWithFastutil(field, indexes); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.docx b/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.docx new file mode 100644 index 000000000..fd6450b8d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.java b/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.java deleted file mode 100644 index bf39bb653..000000000 --- a/src/java/com/twitter/search/earlybird/partition/OptimizationAndFlushingCoordinationLock.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.concurrent.locks.ReentrantLock; - -import com.google.common.annotations.VisibleForTesting; - -/** - * Lock used to ensure that flushing does not occur concurrently with the gc_before_optimization - * and post_optimization_rebuilds actions - see where we call the "lock" method of this class. - * - * Both coordinated actions include a full GC in them, for reasons described in that part - * of the code. After the GC, they wait until indexing has caught up before rejoining the serverset. - * - * If we flush concurrently with these actions, we can pause indexing for a while and waiting - * until we're caught up can take some time, which can affect the memory state negatively. - * For example, the first GC (before optimization) we do so that we have a clean state of memory - * before optimization. - * - * The other reason we lock before executing the actions is because if we have flushing that's - * currently running, once it finishes, we will rejoin the serverset and that can be followed by - * a stop-the-world GC from the actions, which will affect our success rate. 
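[Editor's note] The coordination described above amounts to a shared ReentrantLock: the optimization/GC path blocks on lock() until any in-flight flush finishes, and a flusher could use tryLock() to skip a cycle rather than wait. A condensed sketch with hypothetical method names:

import java.util.concurrent.locks.ReentrantLock;

/** Condensed sketch of coordinating flushing with the optimization/GC actions via one lock. */
final class FlushOptimizeCoordinationSketch {
  private final ReentrantLock lock = new ReentrantLock();

  /** Optimization side: wait for any in-flight flush to finish, then run the GC/optimization action. */
  void runOptimizationAction(Runnable action) {
    lock.lock();
    try {
      action.run();
    } finally {
      lock.unlock();
    }
  }

  /** Flushing side: skip this flush cycle if an optimization action currently holds the lock. */
  boolean tryFlush(Runnable flush) {
    if (!lock.tryLock()) {
      return false; // try again on the next flush check
    }
    try {
      flush.run();
      return true;
    } finally {
      lock.unlock();
    }
  }
}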
- */ -public class OptimizationAndFlushingCoordinationLock { - private final ReentrantLock lock; - - public OptimizationAndFlushingCoordinationLock() { - this.lock = new ReentrantLock(); - } - - public void lock() { - lock.lock(); - } - - public void unlock() { - lock.unlock(); - } - - public boolean tryLock() { - return lock.tryLock(); - } - - @VisibleForTesting - public boolean hasQueuedThreads() { - return lock.hasQueuedThreads(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.docx b/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.docx new file mode 100644 index 000000000..a4e2225f5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.java b/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.java deleted file mode 100644 index 732c2bbe8..000000000 --- a/src/java/com/twitter/search/earlybird/partition/OptimizingSegmentWriter.java +++ /dev/null @@ -1,210 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.atomic.AtomicReference; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.base.Verify; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.util.GCUtil; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.CaughtUpMonitor; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegment; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionInterface; -import com.twitter.util.Future; -import com.twitter.util.Promise; - -/** - * This class optimizes a segment without blocking reads or writes. - * - * In steady state operation (Indexing or Optimized), it delegates operations directly to a - * SegmentWriter. - * - * Optimization is naturally a copying operation -- we don't need to mutate anything internally. - * We need to be able to apply updates to the unoptimized segment while we are creating - * the optimized segment. We also need to be able to apply these updates to the optimized segment, - * but we can't apply updates while a segment is being optimized, because document IDs will be - * changing internally and posting lists could be any state. To deal with this, we queue updates - * that occur during optimization, and then apply them as the last step of optimization. At that - * point, the segment will be optimized and up to date, so we can swap the unoptimized segment for - * the optimized one. 
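[Editor's note] The javadoc above is the heart of this class: updates arriving during the copy-optimization are parked in a queue, then drained into the optimized copy (once outside the lock to absorb the bulk, once under the lock to catch stragglers), and finally the writers are swapped. A generic sketch of that hand-off with simplified placeholder types, not the class's actual implementation:

import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.function.Function;

/** Generic sketch: optimize a copy in the background while queueing writes, then drain and swap. */
final class CopyOptimizeSketch<W, E> {
  /** Stand-in for "apply one event to one writer" (e.g. indexing one update). */
  interface Applier<W, E> {
    void apply(W writer, E event);
  }

  private final ConcurrentLinkedQueue<E> queued = new ConcurrentLinkedQueue<>();
  private final Object lock = new Object();
  private W writer;
  private boolean optimizing = false;

  CopyOptimizeSketch(W initialWriter) {
    this.writer = initialWriter;
  }

  /** Indexing path: while an optimization is running, events are queued instead of applied. */
  void index(E event, Applier<W, E> applier) {
    synchronized (lock) {
      if (optimizing) {
        queued.add(event);
      } else {
        applier.apply(writer, event);
      }
    }
  }

  /** Optimization path: build the optimized writer, drain the queued events into it, then swap. */
  void optimize(Function<W, W> makeOptimized, Applier<W, E> applier) {
    W current;
    synchronized (lock) {
      optimizing = true;
      current = writer;
    }
    W optimized = makeOptimized.apply(current); // slow copy; new writes keep queueing meanwhile
    drain(optimized, applier);                  // absorb the bulk of the backlog without the lock
    synchronized (lock) {
      drain(optimized, applier);                // catch the stragglers, then swap atomically
      writer = optimized;
      optimizing = false;
    }
  }

  private void drain(W target, Applier<W, E> applier) {
    E event;
    while ((event = queued.poll()) != null) {
      applier.apply(target, event);
    }
  }
}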
- */ -public class OptimizingSegmentWriter implements ISegmentWriter { - private static final Logger LOG = LoggerFactory.getLogger(OptimizingSegmentWriter.class); - - private final AtomicReference state = new AtomicReference<>(State.Indexing); - private final ConcurrentLinkedQueue queuedEvents = - new ConcurrentLinkedQueue<>(); - - private final CriticalExceptionHandler criticalExceptionHandler; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final String segmentName; - private final Promise optimizationPromise = new Promise<>(); - - // We use the lock to ensure that the optimizing thread and the writer thread do not attempt - // to call indexThriftVersionedEvents on the underlying writer simultaneously. - private final Object lock = new Object(); - // The reference to the current writer. Protected by lock. - private final AtomicReference segmentWriterReference; - - private final CaughtUpMonitor indexCaughtUpMonitor; - - /** - * The state flow: - * Indexing -> Optimizing -> - * ONE OF: - * - Optimized - * - FailedToOptimize - */ - @VisibleForTesting - enum State { - Indexing, - Optimizing, - FailedToOptimize, - Optimized, - } - - public OptimizingSegmentWriter( - SegmentWriter segmentWriter, - CriticalExceptionHandler criticalExceptionHandler, - SearchIndexingMetricSet searchIndexingMetricSet, - CaughtUpMonitor indexCaughtUpMonitor - ) { - Preconditions.checkState(!segmentWriter.getSegmentInfo().isOptimized()); - segmentWriterReference = new AtomicReference<>(segmentWriter); - - this.criticalExceptionHandler = criticalExceptionHandler; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.segmentName = segmentWriter.getSegmentInfo().getSegmentName(); - this.indexCaughtUpMonitor = indexCaughtUpMonitor; - } - - /** - * Start optimizing this segment in the background. Returns a Future that will complete when - * the optimization is complete. - * Acquires the optimizationAndFlushingCoordinationLock before attempting to optimize. - */ - public Future startOptimization( - CoordinatedEarlybirdActionInterface gcAction, - OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock) { - new Thread(() -> { - // Acquire lock to ensure that flushing is not in progress. If the lock is not available, - // then wait until it is. - LOG.info("Acquire coordination lock before beginning gc_before_optimization action."); - try { - optimizationAndFlushingCoordinationLock.lock(); - LOG.info("Successfully acquired coordination lock for gc_before_optimization action."); - gcAction.retryActionUntilRan("gc before optimization", () -> { - LOG.info("Run GC before optimization"); - GCUtil.runGC(); - // Wait for indexing to catch up before gcAction rejoins the serverset. We only need to do - // this if the host has already finished startup. - if (EarlybirdStatus.hasStarted()) { - indexCaughtUpMonitor.resetAndWaitUntilCaughtUp(); - } - }); - } finally { - LOG.info("Finished gc_before_optimization action. 
" - + "Releasing coordination lock and beginning optimization."); - optimizationAndFlushingCoordinationLock.unlock(); - } - - transition(State.Indexing, State.Optimizing); - - SegmentInfo unoptimizedSegmentInfo = null; - try { - unoptimizedSegmentInfo = segmentWriterReference.get().getSegmentInfo(); - Preconditions.checkState(!unoptimizedSegmentInfo.isOptimized()); - - Stopwatch stopwatch = Stopwatch.createStarted(); - LOG.info("Started optimizing segment data {}.", segmentName); - EarlybirdSegment optimizedSegment = - unoptimizedSegmentInfo.getIndexSegment().makeOptimizedSegment(); - LOG.info("Finished optimizing segment data {} in {}.", segmentName, stopwatch); - - SegmentInfo newSegmentInfo = unoptimizedSegmentInfo - .copyWithEarlybirdSegment(optimizedSegment); - - SegmentWriter optimizedWriter = - new SegmentWriter(newSegmentInfo, searchIndexingMetricSet.updateFreshness); - Verify.verify(optimizedWriter.getSegmentInfo().isOptimized()); - - // We want to apply all updates to the new segment twice, because this first call may apply - // many thousands of updates and take a while to complete. - applyAllPendingUpdates(optimizedWriter); - - // We try to do as little as possible while holding the lock, so the writer can continue - // to make progress. First we apply all the updates that have been queued up before we - // grabbed the lock, then we need to swap the new writer for the old one. - synchronized (lock) { - applyAllPendingUpdates(optimizedWriter); - segmentWriterReference.getAndSet(optimizedWriter); - transition(State.Optimizing, State.Optimized); - } - - if (!unoptimizedSegmentInfo.isEnabled()) { - LOG.info("Disabling segment: {}", unoptimizedSegmentInfo.getSegmentName()); - newSegmentInfo.setIsEnabled(false); - } - - optimizationPromise.setValue(newSegmentInfo); - } catch (Throwable e) { - if (unoptimizedSegmentInfo != null) { - unoptimizedSegmentInfo.setFailedOptimize(); - } - - transition(State.Optimizing, State.FailedToOptimize); - optimizationPromise.setException(e); - } - }, "optimizing-segment-writer").start(); - - return optimizationPromise; - } - - private void applyAllPendingUpdates(SegmentWriter segmentWriter) throws IOException { - LOG.info("Applying {} queued updates to segment {}.", queuedEvents.size(), segmentName); - // More events can be enqueued while this method is running, so we track the total applied too. 
- long eventCount = 0; - Stopwatch stopwatch = Stopwatch.createStarted(); - ThriftVersionedEvents update; - while ((update = queuedEvents.poll()) != null) { - segmentWriter.indexThriftVersionedEvents(update); - eventCount++; - } - LOG.info("Applied {} queued updates to segment {} in {}.", - eventCount, segmentName, stopwatch); - } - - @Override - public Result indexThriftVersionedEvents(ThriftVersionedEvents tve) throws IOException { - synchronized (lock) { - if (state.get() == State.Optimizing) { - queuedEvents.add(tve); - } - return segmentWriterReference.get().indexThriftVersionedEvents(tve); - } - } - - @Override - public SegmentInfo getSegmentInfo() { - return segmentWriterReference.get().getSegmentInfo(); - } - - private void transition(State from, State to) { - Preconditions.checkState(state.compareAndSet(from, to)); - LOG.info("Transitioned from {} to {} for segment {}.", from, to, segmentName); - } - - @VisibleForTesting - public Future getOptimizationPromise() { - return optimizationPromise; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfig.docx b/src/java/com/twitter/search/earlybird/partition/PartitionConfig.docx new file mode 100644 index 000000000..2f7833fa7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfig.java b/src/java/com/twitter/search/earlybird/partition/PartitionConfig.java deleted file mode 100644 index 5d8280ca6..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionConfig.java +++ /dev/null @@ -1,171 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.Date; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.commons.lang3.builder.ToStringBuilder; - -import com.twitter.search.common.config.Config; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.config.TierConfig; - -public class PartitionConfig { - // Which sub-cluster this host belongs to - private final String tierName; - - // Which cluster this host belongs to - private final String clusterName; - - public static final String DEFAULT_TIER_NAME = "all"; - - // the date range of the timeslices this tier will load. The start date is inclusive, while - // the end date is exclusive. 
- private final Date tierStartDate; - private final Date tierEndDate; - - private final int indexingHashPartitionID; // Hash Partition ID assigned for this EB - private final int maxEnabledLocalSegments; // Number of segments to keep - // The position of this host in the ordered list of hosts serving this hash partition - private final int hostPositionWithinHashPartition; - private volatile int numReplicasInHashPartition; - - private final int numPartitions; // Total number of partitions in the current cluster - - public PartitionConfig( - int indexingHashPartitionID, - int maxEnabledLocalSegments, - int hostPositionWithinHashPartition, - int numReplicasInHashPartition, - int numPartitions) { - this(DEFAULT_TIER_NAME, - TierConfig.DEFAULT_TIER_START_DATE, - TierConfig.DEFAULT_TIER_END_DATE, - indexingHashPartitionID, - maxEnabledLocalSegments, - hostPositionWithinHashPartition, - numReplicasInHashPartition, - numPartitions); - } - - public PartitionConfig(String tierName, - Date tierStartDate, - Date tierEndDate, - int indexingHashPartitionID, - int maxEnabledLocalSegments, - int hostPositionWithinHashPartition, - int numReplicasInHashPartition, - int numPartitions) { - this(tierName, tierStartDate, tierEndDate, indexingHashPartitionID, maxEnabledLocalSegments, - hostPositionWithinHashPartition, numReplicasInHashPartition, Config.getEnvironment(), - numPartitions); - } - - public PartitionConfig(String tierName, - Date tierStartDate, - Date tierEndDate, - int indexingHashPartitionID, - int maxEnabledLocalSegments, - int hostPositionWithinHashPartition, - int numReplicasInHashPartition, - String clusterName, - int numPartitions) { - this.tierName = Preconditions.checkNotNull(tierName); - this.clusterName = Preconditions.checkNotNull(clusterName); - this.tierStartDate = Preconditions.checkNotNull(tierStartDate); - this.tierEndDate = Preconditions.checkNotNull(tierEndDate); - this.indexingHashPartitionID = indexingHashPartitionID; - this.maxEnabledLocalSegments = maxEnabledLocalSegments; - this.hostPositionWithinHashPartition = hostPositionWithinHashPartition; - this.numReplicasInHashPartition = numReplicasInHashPartition; - this.numPartitions = numPartitions; - } - - public String getTierName() { - return tierName; - } - - public String getClusterName() { - return clusterName; - } - - public Date getTierStartDate() { - return tierStartDate; - } - - public Date getTierEndDate() { - return tierEndDate; - } - - public int getIndexingHashPartitionID() { - return indexingHashPartitionID; - } - - public int getMaxEnabledLocalSegments() { - return maxEnabledLocalSegments; - } - - public int getHostPositionWithinHashPartition() { - return hostPositionWithinHashPartition; - } - - public int getNumReplicasInHashPartition() { - return numReplicasInHashPartition; - } - - /** - * The number of ways the Tweet and/or user data is partitioned (or sharded) in this Earlybird, in - * this tier. 
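As a hedged illustration of the parameters above, here is a hand-built PartitionConfig using the eight-argument constructor shown earlier; all numbers are made up, and real deployments obtain these values from the partition config loader rather than from literals.

```java
import java.util.Date;

// Hand-constructed example for a single host; the values are illustrative only.
public final class PartitionConfigExample {
  public static void main(String[] args) {
    PartitionConfig config = new PartitionConfig(
        PartitionConfig.DEFAULT_TIER_NAME, // tierName ("all")
        new Date(0L),                      // tierStartDate, inclusive
        new Date(Long.MAX_VALUE),          // tierEndDate, exclusive
        3,                                 // indexingHashPartitionID served by this host
        20,                                // maxEnabledLocalSegments to keep
        0,                                 // hostPositionWithinHashPartition
        2,                                 // numReplicasInHashPartition
        10);                               // numPartitions in the cluster
    System.out.println(config.getPartitionConfigDescription());
  }
}
```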
- */ - public int getNumPartitions() { - return numPartitions; - } - - public String getPartitionConfigDescription() { - return ToStringBuilder.reflectionToString(this); - } - - public void setNumReplicasInHashPartition(int numReplicas) { - numReplicasInHashPartition = numReplicas; - } - - public static final int DEFAULT_NUM_SERVING_TIMESLICES_FOR_TEST = 18; - public static PartitionConfig getPartitionConfigForTests() { - return getPartitionConfigForTests( - TierConfig.DEFAULT_TIER_START_DATE, - TierConfig.DEFAULT_TIER_END_DATE); - } - - public static PartitionConfig getPartitionConfigForTests(Date tierStartDate, Date tierEndDate) { - return getPartitionConfigForTests( - DEFAULT_NUM_SERVING_TIMESLICES_FOR_TEST, tierStartDate, tierEndDate, 1); - } - - /** - * Returns a PartitionConfig instance configured for tests. - * - * @param numServingTimeslices The number of timeslices that should be served. - * @param tierStartDate The tier's start date. Used only in the full archive earlybirds. - * @param tierEndDate The tier's end date. Used only by in the full archive earlybirds. - * @param numReplicasInHashPartition The number of replicas for each partition. - * @return A PartitionConfig instance configured for tests. - */ - @VisibleForTesting - public static PartitionConfig getPartitionConfigForTests( - int numServingTimeslices, - Date tierStartDate, - Date tierEndDate, - int numReplicasInHashPartition) { - return new PartitionConfig( - EarlybirdConfig.getString("sub_tiers_for_tests", "test"), - tierStartDate, - tierEndDate, - EarlybirdConfig.getInt("hash_partition_for_tests", -1), - numServingTimeslices, - 0, // hostPositionWithinHashPartition - numReplicasInHashPartition, - EarlybirdConfig.getInt("num_partitions_for_tests", -1) - ); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.docx b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.docx new file mode 100644 index 000000000..7378b6025 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.java b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.java deleted file mode 100644 index fabae4595..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoader.java +++ /dev/null @@ -1,45 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; - -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.aurora.AuroraInstanceKey; -import com.twitter.search.common.aurora.AuroraSchedulerClient; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.factory.PartitionConfigUtil; - -public final class PartitionConfigLoader { - private static final Logger LOG = LoggerFactory.getLogger(PartitionConfigLoader.class); - - private PartitionConfigLoader() { - // this never gets called - } - - /** - * Load partition information from the command line arguments and Aurora scheduler. 
- * - * @return The new PartitionConfig object for this host - */ - public static PartitionConfig getPartitionInfoForMesosConfig( - AuroraSchedulerClient schedulerClient) throws PartitionConfigLoadingException { - AuroraInstanceKey instanceKey = - Preconditions.checkNotNull(EarlybirdConfig.getAuroraInstanceKey()); - int numTasks; - - try { - numTasks = schedulerClient.getActiveTasks( - instanceKey.getRole(), instanceKey.getEnv(), instanceKey.getJobName()).size(); - LOG.info("Found {} active tasks", numTasks); - } catch (IOException e) { - // This can happen when Aurora Scheduler is holding a conclave to elect a new leader. - LOG.warn("Failed to get tasks from Aurora scheduler.", e); - throw new PartitionConfigLoadingException("Failed to get tasks from Aurora scheduler."); - } - - return PartitionConfigUtil.initPartitionConfigForAurora(numTasks); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.docx b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.docx new file mode 100644 index 000000000..bb986ca78 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.java b/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.java deleted file mode 100644 index fa39ce361..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionConfigLoadingException.java +++ /dev/null @@ -1,12 +0,0 @@ -package com.twitter.search.earlybird.partition; - -/** - * An exception thrown when the earlybird layout could not be loaded, or when a host cannot find - * itself in the layout, and the layout has errors (which might be the reason why the host could not - * find itself in the layout). 
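Since this exception usually signals a transient condition (for example, the Aurora scheduler being briefly unreachable), a caller would typically retry the load. A hypothetical sketch, with an illustrative attempt count and sleep interval that are not taken from the real startup path:

```java
import com.twitter.search.common.aurora.AuroraSchedulerClient;

// Hypothetical retry wrapper around the loader shown above.
public final class PartitionConfigLoadExample {
  public static PartitionConfig loadWithRetries(AuroraSchedulerClient schedulerClient,
                                                int maxAttempts)
      throws PartitionConfigLoadingException, InterruptedException {
    PartitionConfigLoadingException lastFailure = null;
    for (int attempt = 1; attempt <= maxAttempts; attempt++) {
      try {
        return PartitionConfigLoader.getPartitionInfoForMesosConfig(schedulerClient);
      } catch (PartitionConfigLoadingException e) {
        lastFailure = e;      // often transient: the scheduler may be briefly unreachable
        Thread.sleep(5_000L); // wait before asking the scheduler again
      }
    }
    throw lastFailure;
  }
}
```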
- */ -public class PartitionConfigLoadingException extends Exception { - public PartitionConfigLoadingException(String message) { - super(message); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionManager.docx b/src/java/com/twitter/search/earlybird/partition/PartitionManager.docx new file mode 100644 index 000000000..b08750f88 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionManager.java b/src/java/com/twitter/search/earlybird/partition/PartitionManager.java deleted file mode 100644 index c71d3d602..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionManager.java +++ /dev/null @@ -1,254 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.config.Config; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.segment.SegmentDataProvider; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.search.earlybird.util.OneTaskScheduledExecutorManager; -import com.twitter.search.earlybird.util.PeriodicActionParams; -import com.twitter.search.earlybird.util.ShutdownWaitTimeParams; -import com.twitter.search.queryparser.query.QueryParserException; - -/** - * PartitionManager is responsible for indexing data for a partition, including Tweets and Users. 
- */ -public abstract class PartitionManager extends OneTaskScheduledExecutorManager { - private static final Logger LOG = LoggerFactory.getLogger(PartitionManager.class); - - private static final SearchCounter IGNORED_EXCEPTIONS = - SearchCounter.export("partition_manager_ignored_exceptions"); - - private static final String PARTITION_MANAGER_THREAD_NAME = "PartitionManager"; - private static final boolean THREAD_IS_DAEMON = true; - protected static final String INDEX_CURRENT_SEGMENT = "indexing the current segment"; - protected static final String SETUP_QUERY_CACHE = "setting up query cache"; - - protected final SegmentManager segmentManager; - protected final QueryCacheManager queryCacheManager; - // Should be updated by info read from ZK - protected final DynamicPartitionConfig dynamicPartitionConfig; - - private final SearchIndexingMetricSet searchIndexingMetricSet; - - private boolean partitionManagerFirstLoop = true; - - public PartitionManager(QueryCacheManager queryCacheManager, - SegmentManager segmentManager, - DynamicPartitionConfig dynamicPartitionConfig, - ScheduledExecutorServiceFactory executorServiceFactory, - SearchIndexingMetricSet searchIndexingMetricSet, - SearchStatsReceiver searchStatsReceiver, - CriticalExceptionHandler criticalExceptionHandler) { - super( - executorServiceFactory, - PARTITION_MANAGER_THREAD_NAME, - THREAD_IS_DAEMON, - PeriodicActionParams.withFixedDelay( - EarlybirdConfig.getInt("time_slice_roll_check_interval_ms", 500), - TimeUnit.MILLISECONDS), - ShutdownWaitTimeParams.indefinitely(), - searchStatsReceiver, - criticalExceptionHandler); - - this.segmentManager = segmentManager; - this.queryCacheManager = queryCacheManager; - this.dynamicPartitionConfig = dynamicPartitionConfig; - this.searchIndexingMetricSet = searchIndexingMetricSet; - } - - /** - * Runs the partition manager. - */ - public final void runImpl() { - if (partitionManagerFirstLoop) { - try { - testHookBeforeStartUp(); - startUp(); - validateSegments(); - segmentManager.logState("After startUp"); - } catch (Throwable t) { - criticalExceptionHandler.handle(this, t); - shutDownIndexing(); - throw new RuntimeException("PartitionManager unhandled exception, stopping scheduler", t); - } - } - - try { - testHookAfterSleep(); - indexingLoop(partitionManagerFirstLoop); - } catch (InterruptedException e) { - LOG.warn("PartitionManager thread interrupted, stoping scheduler", e); - shutDownIndexing(); - throw new RuntimeException("PartitionManager thread interrupted", e); - } catch (Exception e) { - LOG.error("Exception in indexing PartitionManager loop", e); - IGNORED_EXCEPTIONS.increment(); - } catch (Throwable t) { - LOG.error("Unhandled exception in indexing PartitionManager loop", t); - criticalExceptionHandler.handle(this, t); - shutDownIndexing(); - throw new RuntimeException("PartitionManager unhandled exception, stopping scheduler", t); - } finally { - partitionManagerFirstLoop = false; - } - } - - /** - * Returns the SegmentDataProvider instance that will be used to fetch the information for all - * segments. - */ - public abstract SegmentDataProvider getSegmentDataProvider(); - - /** - * Starts up this partition manager. - */ - protected abstract void startUp() throws Exception; - - /** - * Runs one indexing iteration. - * - * @param firstLoop Determines if this is the first time the indexing loop is running. - */ - protected abstract void indexingLoop(boolean firstLoop) throws Exception; - - /** - * Shuts down all indexing. 
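A skeletal, hypothetical subclass may help show which hooks a concrete partition manager supplies. It assumes the same package and imports as this class; real subclasses contain far more logic in startUp() and indexingLoop().

```java
// Skeletal, hypothetical subclass; not one of the real Earlybird partition managers.
public class MinimalPartitionManager extends PartitionManager {
  private final SegmentDataProvider segmentDataProvider;
  private volatile boolean caughtUp = false;

  public MinimalPartitionManager(QueryCacheManager queryCacheManager,
                                 SegmentManager segmentManager,
                                 DynamicPartitionConfig dynamicPartitionConfig,
                                 ScheduledExecutorServiceFactory executorServiceFactory,
                                 SearchIndexingMetricSet searchIndexingMetricSet,
                                 SearchStatsReceiver searchStatsReceiver,
                                 CriticalExceptionHandler criticalExceptionHandler,
                                 SegmentDataProvider segmentDataProvider) {
    super(queryCacheManager, segmentManager, dynamicPartitionConfig, executorServiceFactory,
        searchIndexingMetricSet, searchStatsReceiver, criticalExceptionHandler);
    this.segmentDataProvider = segmentDataProvider;
  }

  @Override
  public SegmentDataProvider getSegmentDataProvider() {
    return segmentDataProvider;
  }

  @Override
  protected void startUp() throws Exception {
    // Create or load the segments this partition serves before the periodic loop starts.
  }

  @Override
  protected void indexingLoop(boolean firstLoop) throws Exception {
    // Index newly available data; once everything is indexed, report the server as current.
    if (!caughtUp) {
      caughtUp = true;
      becomeCurrent();
    }
  }

  @Override
  protected void shutDownIndexing() {
    // Release indexing resources here.
  }

  @Override
  public boolean isCaughtUpForTests() {
    return caughtUp;
  }
}
```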
- */ - protected abstract void shutDownIndexing(); - - @Override - public void shutdownComponent() { - shutDownIndexing(); - } - - /** - * Notifies all other threads that the partition manager has become current (ie. has indexed all - * available events). - */ - public void becomeCurrent() { - LOG.info("PartitionManager became current"); - if (EarlybirdStatus.isStarting()) { - EarlybirdStatus.setStatus(EarlybirdStatusCode.CURRENT); - } else { - LOG.warn("Could not set statusCode to CURRENT from " + EarlybirdStatus.getStatusCode()); - } - - // Now that we're done starting up, set the query cache thread pool size to one. - queryCacheManager.setWorkerPoolSizeAfterStartup(); - } - - protected void setupQueryCacheIfNeeded() throws QueryParserException { - queryCacheManager.setupTasksIfNeeded(segmentManager); - } - - // Only for tests, used for testing exception handling - private static TestHook testHookBeforeStartUp; - private static TestHook testHookAfterSleep; - - private static void testHookBeforeStartUp() throws Exception { - if (Config.environmentIsTest() && testHookBeforeStartUp != null) { - testHookBeforeStartUp.run(); - } - } - - private static void testHookAfterSleep() throws Exception { - if (Config.environmentIsTest() && testHookAfterSleep != null) { - testHookAfterSleep.run(); - } - } - - @Override - protected void runOneIteration() { - try { - runImpl(); - } catch (Throwable t) { - LOG.error("Unhandled exception in PartitionManager loop", t); - throw new RuntimeException(t.getMessage()); - } - } - - public SearchIndexingMetricSet getSearchIndexingMetricSet() { - return searchIndexingMetricSet; - } - - /** - * Allows tests to run code before the partition manager starts up. - * - * @param testHook The code to run before the start up. - */ - @VisibleForTesting - public static void setTestHookBeforeStartUp(TestHook testHook) { - if (Config.environmentIsTest()) { - testHookBeforeStartUp = testHook; - } else { - throw new RuntimeException("Trying to set startup test hook in non-test code!!"); - } - } - - /** - * Allows tests to run code before the indexing loop. - * - * @param testHook The code to run before the indexing loop. - */ - @VisibleForTesting - public static void setTestHookAfterSleep(TestHook testHook) { - if (Config.environmentIsTest()) { - testHookAfterSleep = testHook; - } else { - throw new RuntimeException("Trying to set test hook in non-test code!!"); - } - } - - /** - * An interface that allows tests to run code at various points in the PartitionManager's - * lyfecycle. - */ - @VisibleForTesting - public interface TestHook { - /** - * Defines the code that should be run. - */ - void run() throws Exception; - } - - /** - * Allows tests to determine if this partition manager is all caught up. - * - * @return {@code true} if this partition manager is caught up, {@code false} otherwise. - */ - @VisibleForTesting - public abstract boolean isCaughtUpForTests(); - - @VisibleForTesting - protected void validateSegments() throws EarlybirdStartupException { - // This is necessary because many tests rely on starting partition manager but not indexing any - // tweets. However, we do not want Earlybirds to start in production if they are not serving any - // tweets. (SEARCH-24238) - if (Config.environmentIsTest()) { - return; - } - validateSegmentsForNonTest(); - } - - @VisibleForTesting - protected void validateSegmentsForNonTest() throws EarlybirdStartupException { - // Subclasses can override this and provide additional checks. 
- if (segmentManager.getNumIndexedDocuments() == 0) { - throw new EarlybirdStartupException("Earlybird has zero indexed documents."); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.docx b/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.docx new file mode 100644 index 000000000..1f64b23ff Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.java b/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.java deleted file mode 100644 index f25bdd1bd..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionManagerStartup.java +++ /dev/null @@ -1,57 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.Closeable; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.earlybird.EarlybirdServer; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; - -/** - * Handles starting and indexing data for a partition, using a PartitionManager. - */ -public class PartitionManagerStartup implements EarlybirdStartup { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdServer.class); - - private final Clock clock; - private final PartitionManager partitionManager; - - public PartitionManagerStartup( - Clock clock, - PartitionManager partitionManager - ) { - this.clock = clock; - this.partitionManager = partitionManager; - } - - @Override - public Closeable start() throws EarlybirdStartupException { - partitionManager.schedule(); - - int count = 0; - - while (EarlybirdStatus.getStatusCode() != EarlybirdStatusCode.CURRENT) { - if (EarlybirdStatus.getStatusCode() == EarlybirdStatusCode.STOPPING) { - return partitionManager; - } - - try { - clock.waitFor(1000); - } catch (InterruptedException e) { - LOG.info("Sleep interrupted, quitting earlybird"); - throw new EarlybirdStartupException("Sleep interrupted"); - } - - // Log every 120 seconds. 
- if (count++ % 120 == 0) { - LOG.info("Thrift port closed until Earlybird, both indexing and query cache, is current"); - } - } - - return partitionManager; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionWriter.docx b/src/java/com/twitter/search/earlybird/partition/PartitionWriter.docx new file mode 100644 index 000000000..1dcfbedfd Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/PartitionWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/PartitionWriter.java b/src/java/com/twitter/search/earlybird/partition/PartitionWriter.java deleted file mode 100644 index 797acd002..000000000 --- a/src/java/com/twitter/search/earlybird/partition/PartitionWriter.java +++ /dev/null @@ -1,109 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.time.Duration; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.common_internal.text.version.PenguinVersion; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; - -/** - * PartitionWriter writes Tweet events and Tweet update events to an Earlybird index. It is - * responsible for creating new segments, adding Tweets to the correct segment, and applying updates - * to the correct segment. - */ -public class PartitionWriter { - private static final Logger LOG = LoggerFactory.getLogger(PartitionWriter.class); - private static final String STATS_PREFIX = "partition_writer_"; - - private static final SearchRateCounter MISSING_PENGUIN_VERSION = - SearchRateCounter.export(STATS_PREFIX + "missing_penguin_version"); - private static final Duration CAUGHT_UP_FRESHNESS = Duration.ofSeconds(5); - private static final SearchRateCounter EVENTS_CONSUMED = - SearchRateCounter.export(STATS_PREFIX + "events_consumed"); - - private final PenguinVersion penguinVersion; - private final TweetUpdateHandler updateHandler; - private final TweetCreateHandler createHandler; - private final Clock clock; - private final CriticalExceptionHandler criticalExceptionHandler; - - - - public PartitionWriter( - TweetCreateHandler tweetCreateHandler, - TweetUpdateHandler tweetUpdateHandler, - CriticalExceptionHandler criticalExceptionHandler, - PenguinVersion penguinVersion, - Clock clock - ) { - LOG.info("Creating PartitionWriter."); - this.createHandler = tweetCreateHandler; - this.updateHandler = tweetUpdateHandler; - this.criticalExceptionHandler = criticalExceptionHandler; - this.penguinVersion = penguinVersion; - this.clock = clock; - } - - /** - * Index a batch of TVE records. 
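Before the method itself, a tiny worked example of the caught-up check it performs; the ages are hypothetical stand-ins for what SnowflakeIdParser.getTweetAgeInMs would return, and the threshold mirrors CAUGHT_UP_FRESHNESS above.

```java
public final class CaughtUpCheckExample {
  public static void main(String[] args) {
    long caughtUpFreshnessMs = 5_000L;               // mirrors CAUGHT_UP_FRESHNESS (5 seconds)
    long[] tweetAgesMs = {42_000L, 9_500L, 3_200L};  // hypothetical ages of tweets in one batch

    long minTweetAgeMs = Long.MAX_VALUE;
    for (long ageMs : tweetAgesMs) {
      minTweetAgeMs = Math.min(minTweetAgeMs, ageMs);
    }
    // The batch counts as caught up because its youngest tweet (3.2s old) is inside the window.
    System.out.println("caught up: " + (minTweetAgeMs < caughtUpFreshnessMs)); // prints true
  }
}
```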
- */ - public boolean indexBatch(Iterable> records) - throws Exception { - long minTweetAge = Long.MAX_VALUE; - for (ConsumerRecord record : records) { - ThriftVersionedEvents tve = record.value(); - indexTVE(tve); - EVENTS_CONSUMED.increment(); - long tweetAgeInMs = SnowflakeIdParser.getTweetAgeInMs(clock.nowMillis(), tve.getId()); - minTweetAge = Math.min(tweetAgeInMs, minTweetAge); - } - - return minTweetAge < CAUGHT_UP_FRESHNESS.toMillis(); - } - - /** - * Index a ThriftVersionedEvents struct. - */ - @VisibleForTesting - public void indexTVE(ThriftVersionedEvents tve) throws IOException { - ThriftIndexingEvent tie = tve.getVersionedEvents().get(penguinVersion.getByteValue()); - if (tie == null) { - LOG.error("Could not find a ThriftIndexingEvent for PenguinVersion {} in " - + "ThriftVersionedEvents: {}", penguinVersion, tve); - MISSING_PENGUIN_VERSION.increment(); - return; - } - - // An `INSERT` event is used for new Tweets. These are generated from Tweet Create Events from - // TweetyPie. - if (tie.getEventType() == ThriftIndexingEventType.INSERT) { - createHandler.handleTweetCreate(tve); - updateHandler.retryPendingUpdates(tve.getId()); - } else { - updateHandler.handleTweetUpdate(tve, false); - } - } - - public void prepareAfterStartingWithIndex(long maxIndexedTweetId) { - createHandler.prepareAfterStartingWithIndex(maxIndexedTweetId); - } - - void logState() { - LOG.info("PartitionWriter state:"); - LOG.info(String.format(" Events indexed: %,d", EVENTS_CONSUMED.getCount())); - createHandler.logState(); - updateHandler.logState(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.docx b/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.docx new file mode 100644 index 000000000..835c0a024 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.java b/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.java deleted file mode 100644 index 9edeb56d2..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SearchIndexingMetricSet.java +++ /dev/null @@ -1,208 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.EnumMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.earlybird.util.ScheduledExecutorManager; - -/** - * Collection of common metrics used in the indexing, and related code. - * We create a set/holder for them as we want to create all counters only one time, and these - * counters can be used by both SimpleUpdateIndexer, PartitionIndexer, EarlybirdSegment, and others. - */ -public class SearchIndexingMetricSet { - /** - * A proxy for the creation time of the "freshest" tweet that we have in the index. - * It is used in computing the index freshness stat "earlybird_index_freshness_millis". - * - In the realtme clusters, this should match the creation time of highestStatusId. - * - In the archive clusters, this should match the timestamp of the latest indexed day. 
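A small sketch of how this gauge feeds the derived stat; the value is made up, and the actual computation appears in getIndexFreshnessInMillis() at the bottom of this class.

```java
public final class IndexFreshnessExample {
  public static void main(String[] args) {
    // Example gauge value: the newest indexed tweet was created 1.5 seconds ago.
    long freshestTweetTimeMillis = System.currentTimeMillis() - 1_500L;

    // earlybird_index_freshness_millis is simply "now minus that gauge"; the per-event-type
    // index_freshness_<type>_age_millis gauges follow the same now-minus-timestamp pattern.
    long indexFreshnessMillis = System.currentTimeMillis() - freshestTweetTimeMillis;
    System.out.println("index freshness ms: " + indexFreshnessMillis); // roughly 1500
  }
}
```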
- */ - public final SearchLongGauge freshestTweetTimeMillis; - - /** The highest indexed tweet ID. Used to compute index freshness. */ - public final SearchLongGauge highestStatusId; - - /** - * The current timeslice's ID. We can compare this to indexer's exported current timeslice ID to - * identify stuck timeslice rolls. - */ - public final SearchLongGauge currentTimesliceId; - - /** The number of archive timeslices that we failed to process. */ - public final SearchCounter archiveTimeSliceBuildFailedCounter; - - /** The number of times we checked a segment's size on disk. */ - public final SearchCounter segmentSizeCheckCount; - - /** The number of segments that have reached their max size. */ - public final SearchCounter maxSegmentSizeReachedCounter; - - /** The number of indexed tweets and the aggregate indexing latencies in microseconds. */ - public final SearchTimerStats statusStats; - /** The number of applied updates and the aggregate indexing latencies in microseconds. */ - public final SearchTimerStats updateStats; - /** The number of retried updates and the aggregate indexing latencies in microseconds. */ - public final SearchTimerStats updateRetryStats; - /** The number of applied user updates and the aggregate indexing latencies in microseconds. */ - public final SearchTimerStats userUpdateIndexingStats; - /** The number of applied userGeoScrubEvents and the aggregate indexing latencies in - * microseconds. */ - public final SearchTimerStats userScrubGeoIndexingStats; - /** The number of updates attempted on missing tweets. */ - public final SearchRateCounter updateOnMissingTweetCounter; - /** The number of updates dropped. */ - public final SearchRateCounter droppedUpdateEvent; - - /** The latencies in microseconds of the PartitionIndexer loop. */ - public final SearchTimerStats partitionIndexerRunLoopCounter; - /** The latencies in microseconds of the PartitionIndexer.indexFromReaders() calls. */ - public final SearchTimerStats partitionIndexerIndexFromReadersCounter; - /** The number of invocations of the PartitionIndexer task. */ - public final SearchCounter partitionIndexerIterationCounter; - - /** The number of unsorted updates handled by SimpleUpdateIndexer. */ - public final SearchCounter simpleUpdateIndexerUnsortedUpdateCounter; - /** The number of unsorted updates with the wrong segment handled by SimpleUpdateIndexer. */ - public final SearchCounter simpleUpdateIndexerUnsortedUpdateWithWrongSegmentCounter; - - /** The number of invocations of the SimpleUserUpdateIndexer task. */ - public final SearchCounter simpleUserUpdateIndexerIterationCounter; - - /** The number of exceptions encountered by SimpleSegmentIndexer while indexing a segment. */ - public final SearchCounter simpleSegmentIndexerExceptionCounter; - - /** - * A map from TIE update type to the creation time of the updated tweet in milliseconds of the - * freshest update we have indexed. - */ - public final EnumMap updateFreshness = - new EnumMap<>(ThriftIndexingEventType.class); - - public final SearchStatsReceiver searchStatsReceiver; - - public static class StartupMetric { - // Switched from 0 to 1 during the event. - private SearchLongGauge duringGauge; - // Switched from 0 to time it takes, in milliseconds. 
- private SearchLongGauge durationMillisGauge; - - StartupMetric(String name) { - this.duringGauge = SearchLongGauge.export(name); - this.durationMillisGauge = SearchLongGauge.export("duration_of_" + name); - } - - public void begin() { - duringGauge.set(1); - } - - public void end(long durationInMillis) { - duringGauge.set(0); - durationMillisGauge.set(durationInMillis); - } - } - - public final StartupMetric startupInProgress; - public final StartupMetric startupInIndexCompletedSegments; - public final StartupMetric startupInLoadCompletedSegments; - public final StartupMetric startupInIndexUpdatesForCompletedSegments; - public final StartupMetric startupInCurrentSegment; - public final StartupMetric startupInUserUpdates; - public final StartupMetric startupInQueryCacheUpdates; - public final StartupMetric startupInMultiSegmentTermDictionaryUpdates; - public final StartupMetric startupInWarmUp; - - // Kafka metrics - public final StartupMetric startupInLoadFlushedIndex; - public final StartupMetric startupInFreshStartup; - public final StartupMetric startupInIngestUntilCurrent; - public final StartupMetric startupInUserUpdatesStartup; - public final StartupMetric startupInUserEventIndexer; - public final StartupMetric startupInAudioSpaceEventIndexer; - - public SearchIndexingMetricSet(SearchStatsReceiver searchStatsReceiver) { - this.freshestTweetTimeMillis = searchStatsReceiver.getLongGauge( - "earlybird_freshest_tweet_timestamp_millis"); - this.highestStatusId = searchStatsReceiver.getLongGauge("highest_indexed_status_id"); - this.currentTimesliceId = searchStatsReceiver.getLongGauge("earlybird_current_timeslice_id"); - this.archiveTimeSliceBuildFailedCounter = searchStatsReceiver.getCounter( - "archive_time_slice_build_failed"); - this.segmentSizeCheckCount = searchStatsReceiver.getCounter("segment_size_check_count"); - this.maxSegmentSizeReachedCounter = searchStatsReceiver.getCounter("max_segment_reached"); - - this.statusStats = searchStatsReceiver.getTimerStats( - "index_status", TimeUnit.MICROSECONDS, false, false, false); - this.updateStats = searchStatsReceiver.getTimerStats( - "updates", TimeUnit.MICROSECONDS, false, false, false); - this.updateRetryStats = searchStatsReceiver.getTimerStats( - "update_retries", TimeUnit.MICROSECONDS, false, false, false); - this.userUpdateIndexingStats = searchStatsReceiver.getTimerStats( - "user_updates", TimeUnit.MICROSECONDS, false, false, false); - this.userScrubGeoIndexingStats = searchStatsReceiver.getTimerStats( - "user_scrub_geo", TimeUnit.MICROSECONDS, false, false, false); - this.updateOnMissingTweetCounter = searchStatsReceiver.getRateCounter( - "index_update_on_missing_tweet"); - this.droppedUpdateEvent = searchStatsReceiver.getRateCounter("dropped_update_event"); - - this.partitionIndexerRunLoopCounter = searchStatsReceiver.getTimerStats( - "partition_indexer_run_loop", TimeUnit.MICROSECONDS, false, true, false); - this.partitionIndexerIndexFromReadersCounter = searchStatsReceiver.getTimerStats( - "partition_indexer_indexFromReaders", TimeUnit.MICROSECONDS, false, true, false); - this.partitionIndexerIterationCounter = searchStatsReceiver.getCounter( - ScheduledExecutorManager.SCHEDULED_EXECUTOR_TASK_PREFIX + "PartitionIndexer"); - - this.simpleUpdateIndexerUnsortedUpdateCounter = searchStatsReceiver.getCounter( - "simple_update_indexer_unsorted_update_count"); - this.simpleUpdateIndexerUnsortedUpdateWithWrongSegmentCounter = searchStatsReceiver.getCounter( - "simple_update_indexer_unsorted_update_with_wrong_segment_count"); - - 
this.simpleUserUpdateIndexerIterationCounter = searchStatsReceiver.getCounter( - ScheduledExecutorManager.SCHEDULED_EXECUTOR_TASK_PREFIX + "SimpleUserUpdateIndexer"); - - this.simpleSegmentIndexerExceptionCounter = searchStatsReceiver.getCounter( - "exception_while_indexing_segment"); - - for (ThriftIndexingEventType type : ThriftIndexingEventType.values()) { - AtomicLong freshness = new AtomicLong(0); - updateFreshness.put(type, freshness); - String statName = ("index_freshness_" + type + "_age_millis").toLowerCase(); - searchStatsReceiver.getCustomGauge(statName, - () -> System.currentTimeMillis() - freshness.get()); - } - - this.startupInProgress = new StartupMetric("startup_in_progress"); - this.startupInIndexCompletedSegments = new StartupMetric("startup_in_index_completed_segments"); - this.startupInLoadCompletedSegments = new StartupMetric("startup_in_load_completed_segments"); - this.startupInIndexUpdatesForCompletedSegments = - new StartupMetric("startup_in_index_updates_for_completed_segments"); - this.startupInCurrentSegment = new StartupMetric("startup_in_current_segment"); - this.startupInUserUpdates = new StartupMetric("startup_in_user_updates"); - this.startupInQueryCacheUpdates = new StartupMetric("startup_in_query_cache_updates"); - this.startupInMultiSegmentTermDictionaryUpdates = - new StartupMetric("startup_in_multi_segment_dictionary_updates"); - this.startupInWarmUp = new StartupMetric("startup_in_warm_up"); - - this.startupInLoadFlushedIndex = new StartupMetric("startup_in_load_flushed_index"); - this.startupInFreshStartup = new StartupMetric("startup_in_fresh_startup"); - this.startupInIngestUntilCurrent = new StartupMetric("startup_in_ingest_until_current"); - this.startupInUserUpdatesStartup = new StartupMetric("startup_in_user_updates_startup"); - this.startupInUserEventIndexer = new StartupMetric("startup_in_user_events_indexer"); - this.startupInAudioSpaceEventIndexer = - new StartupMetric("startup_in_audio_space_events_indexer"); - - searchStatsReceiver.getCustomGauge("earlybird_index_freshness_millis", - this::getIndexFreshnessInMillis); - - this.searchStatsReceiver = searchStatsReceiver; - } - - long getIndexFreshnessInMillis() { - return System.currentTimeMillis() - freshestTweetTimeMillis.get(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.docx b/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.docx new file mode 100644 index 000000000..4931338cc Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.java b/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.java deleted file mode 100644 index c048bfa6e..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentHdfsFlusher.java +++ /dev/null @@ -1,247 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.File; -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -import org.apache.commons.io.FileUtils; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.base.Command; -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.search.common.database.DatabaseConfig; -import 
com.twitter.search.common.metrics.Timer; -import com.twitter.search.common.util.io.flushable.PersistentFile; -import com.twitter.search.common.util.zktrylock.TryLock; -import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory; - -/** - * Flush segments to disk and upload them to HDFS. - */ -public class SegmentHdfsFlusher { - private static final Logger LOG = LoggerFactory.getLogger(SegmentHdfsFlusher.class); - private static final Amount HDFS_UPLOADER_TRY_LOCK_NODE_EXPIRATION_TIME_MILLIS = - Amount.of(1L, Time.HOURS); - - private final SegmentSyncConfig sync; - private final boolean holdLockWhileUploading; - private final ZooKeeperTryLockFactory zkTryLockFactory; - - public SegmentHdfsFlusher(ZooKeeperTryLockFactory zooKeeperTryLockFactory, - SegmentSyncConfig sync, - boolean holdLockWhileUploading) { - this.zkTryLockFactory = zooKeeperTryLockFactory; - this.sync = sync; - this.holdLockWhileUploading = holdLockWhileUploading; - } - - public SegmentHdfsFlusher( - ZooKeeperTryLockFactory zooKeeperTryLockFactory, - SegmentSyncConfig sync) { - this(zooKeeperTryLockFactory, sync, true); - } - - private boolean shouldFlushSegment(SegmentInfo segmentInfo) { - return segmentInfo.isEnabled() - && !segmentInfo.getSyncInfo().isFlushed() - && segmentInfo.isComplete() - && segmentInfo.isOptimized() - && !segmentInfo.isFailedOptimize() - && !segmentInfo.getSyncInfo().isLoaded(); - } - - /** - * Flushes a segment to local disk and to HDFS. - */ - public boolean flushSegmentToDiskAndHDFS(SegmentInfo segmentInfo) { - if (!shouldFlushSegment(segmentInfo)) { - return false; - } - try { - if (segmentInfo.isIndexing()) { - LOG.error("Tried to flush current segment!"); - return false; - } - - // Check-and-set the beingUploaded flag from false to true. If the CAS fails, it means the - // segment is being flushed already, or being deleted. In this case, we can just return false. - if (!segmentInfo.casBeingUploaded(false, true)) { - LOG.warn("Tried to flush a segment that's being flushed or deleted."); - return false; - } - - // At this point, the above CAS must have returned false. This mean the beingUploaded flag - // was false, and set to true now. We can proceed with flushing the segment. - try { - checkAndFlushSegmentToHdfs(segmentInfo); - } finally { - segmentInfo.setBeingUploaded(false); - } - return true; - } catch (Exception e) { - LOG.error("Exception while flushing IndexSegment to " - + segmentInfo.getSyncInfo().getHdfsFlushDir(), e); - return false; - } - } - - /** - * First try to acquire a lock in Zookeeper for this segment, so multiple Earlybirds in the same - * partition don't flush or upload the segment at the same time. When the lock is acquired, check - * for the segment in HDFS. If the data already exists, don't flush to disk. - */ - private void checkAndFlushSegmentToHdfs(final SegmentInfo segment) { - LOG.info("Checking and flushing segment {}", segment); - - try { - // Always flush the segment locally. - Directory dir = FSDirectory.open(createFlushDir(segment).toPath()); - segment.flush(dir); - LOG.info("Completed local flush of segment {}. 
Flush to HDFS enabled: {}", - segment, sync.isFlushToHdfsEnabled()); - } catch (IOException e) { - LOG.error("Failed to flush segment " + segment + " locally", e); - return; - } - - if (!holdLockWhileUploading) { - flushToHdfsIfNecessary(segment); - } else { - TryLock lock = zkTryLockFactory.createTryLock( - DatabaseConfig.getLocalHostname(), - sync.getZooKeeperSyncFullPath(), - sync.getVersionedName(segment.getSegment()), - HDFS_UPLOADER_TRY_LOCK_NODE_EXPIRATION_TIME_MILLIS - ); - - boolean gotLock = lock.tryWithLock((Command) () -> flushToHdfsIfNecessary(segment)); - if (!gotLock) { - LOG.info("Failed to get zk upload lock for segment {}", segment); - } - } - } - - /** - * Check whether the segment has already been flushed to HDFS. If not, flush the segment to disk - * and upload the files to HDFS. - * - * If the ZK lock isn't used, there is a race between the existence check and the upload (in - * which another Earlybird can sneak in and upload the segment), so we will potentially upload - * the same segment from different hosts. Thus, the Earlybird hostname is part of the segment's - * path on HDFS. - */ - private void flushToHdfsIfNecessary(SegmentInfo segmentInfo) { - Timer timer = new Timer(TimeUnit.MILLISECONDS); - String status = "flushed"; - try (FileSystem fs = HdfsUtil.getHdfsFileSystem()) { - // If we can't load segments from HDFS, don't bother checking HDFS for the segment - if (sync.isSegmentLoadFromHdfsEnabled() - && (segmentInfo.getSyncInfo().isFlushed() - || HdfsUtil.segmentExistsOnHdfs(fs, segmentInfo))) { - status = "existing"; - } else if (sync.isFlushToHdfsEnabled()) { - copyLocalFilesToHdfs(fs, segmentInfo); - status = "uploaded"; - } - - // whether we uploaded, or someone else did, this segment should now be on HDFS. If - // uploading to HDFS is disabled, we still consider it complete. - segmentInfo.getSyncInfo().setFlushed(true); - } catch (IOException e) { - LOG.error("Failed copying segment {} to HDFS after {} ms", segmentInfo, timer.stop(), e); - status = "exception"; - } finally { - if (timer.running()) { - timer.stop(); - } - LOG.info("Flush of segment {} to HDFS completed in {} milliseconds. Status: {}", - segmentInfo, timer.getElapsed(), status); - } - } - - /** - * Copy local segment files to HDFS. Files are first copied into a temporary directory - * in the form _ and when all the files are written out to HDFS, - * the dir is renamed to _, where it is accessible to other Earlybirds. - */ - private void copyLocalFilesToHdfs(FileSystem fs, SegmentInfo segment) throws IOException { - String hdfsTempBaseDir = segment.getSyncInfo().getHdfsTempFlushDir(); - - // If the temp dir already exists on HDFS, a prior flush must have been interrupted. - // Delete it and start fresh. - removeHdfsTempDir(fs, hdfsTempBaseDir); - - for (String fileName : sync.getAllSyncFileNames(segment)) { - String hdfsFileName = hdfsTempBaseDir + "/" + fileName; - String localBaseDir = segment.getSyncInfo().getLocalSyncDir(); - String localFileName = localBaseDir + "/" + fileName; - - LOG.debug("About to start copying {} to HDFS, from {} to {}", - fileName, localFileName, hdfsFileName); - Timer timer = new Timer(TimeUnit.MILLISECONDS); - fs.copyFromLocalFile(new Path(localFileName), new Path(hdfsFileName)); - LOG.debug("Completed copying {} to HDFS, from {} to {}, in {} ms", - fileName, localFileName, hdfsFileName, timer.stop()); - } - - // now let's rename the dir into its proper form. 
- String hdfsBaseDir = segment.getSyncInfo().getHdfsFlushDir(); - if (fs.rename(new Path(hdfsTempBaseDir), new Path(hdfsBaseDir))) { - LOG.info("Renamed segment dir on HDFS from {} to {}", hdfsTempBaseDir, hdfsBaseDir); - } else { - String errorMessage = String.format("Failed to rename segment dir on HDFS from %s to %s", - hdfsTempBaseDir, hdfsBaseDir); - LOG.error(errorMessage); - - removeHdfsTempDir(fs, hdfsTempBaseDir); - - // Throw an IOException so the calling code knows that the copy failed - throw new IOException(errorMessage); - } - } - - private void removeHdfsTempDir(FileSystem fs, String tempDir) throws IOException { - Path tempDirPath = new Path(tempDir); - if (fs.exists(tempDirPath)) { - LOG.info("Found existing temporary flush dir {} on HDFS, removing", tempDir); - if (!fs.delete(tempDirPath, true /* recursive */)) { - LOG.error("Failed to delete temp dir {}", tempDir); - } - } - } - - // Create or replace the local flush directory - private File createFlushDir(SegmentInfo segmentInfo) throws IOException { - final String flushDirStr = segmentInfo.getSyncInfo().getLocalSyncDir(); - - File flushDir = new File(flushDirStr); - if (flushDir.exists()) { - // Delete just the flushed persistent files if they are there. - // We may also have the lucene on-disk indexed in the same dir here, - // that we do not want to delete. - for (String persistentFile : sync.getPersistentFileNames(segmentInfo)) { - for (String fileName : PersistentFile.getAllFileNames(persistentFile)) { - File file = new File(flushDir, fileName); - if (file.exists()) { - LOG.info("Deleting incomplete flush file {}", file.getAbsolutePath()); - FileUtils.forceDelete(file); - } - } - } - return flushDir; - } - - // Try to create the flush directory - if (!flushDir.mkdirs()) { - throw new IOException("Not able to create segment flush directory \"" + flushDirStr + "\""); - } - - return flushDir; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.docx b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.docx new file mode 100644 index 000000000..aacd05457 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.java b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.java deleted file mode 100644 index ead096fbb..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStats.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.Optional; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; - -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; - -public class SegmentIndexStats { - private EarlybirdIndexSegmentData segmentData; - - private final AtomicLong indexSizeOnDiskInBytes = new AtomicLong(0); - private final AtomicInteger partialUpdateCount = new AtomicInteger(0); - private final AtomicInteger outOfOrderUpdateCount = new AtomicInteger(0); - - private Optional savedStatusCount = Optional.empty(); - private Optional savedDeletesCount = Optional.empty(); - - public void setSegmentData(EarlybirdIndexSegmentData segmentData) { - this.segmentData = segmentData; - } - - /** - * We'd like to be able to return the last counts after we unload a segment from memory. 
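A hedged usage sketch of that behavior, with the backing EarlybirdIndexSegmentData supplied by the caller; the helper name is hypothetical.

```java
// Hypothetical helper demonstrating that the counts survive unloading the segment data.
public final class SavedCountsExample {
  static void demonstrate(EarlybirdIndexSegmentData segmentData) {
    SegmentIndexStats stats = new SegmentIndexStats();
    stats.setSegmentData(segmentData);
    int liveStatusCount = stats.getStatusCount();  // served via segmentData.numDocs()
    int liveDeleteCount = stats.getDeleteCount();  // served via segmentData.getDeletedDocs()

    stats.unsetSegmentDataAndSaveCounts();         // segment data dropped from memory

    // The same values are now answered from the saved Optionals rather than segmentData.
    assert liveStatusCount == stats.getStatusCount();
    assert liveDeleteCount == stats.getDeleteCount();
  }
}
```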
- */ - public void unsetSegmentDataAndSaveCounts() { - savedStatusCount = Optional.of(getStatusCount()); - savedDeletesCount = Optional.of(getDeleteCount()); - segmentData = null; - } - - /** - * Returns the number of deletes processed by this segment. - */ - public int getDeleteCount() { - if (segmentData != null) { - return segmentData.getDeletedDocs().numDeletions(); - } else { - return savedDeletesCount.orElse(0); - } - } - - /** - * Return the number of documents in this segment. - */ - public int getStatusCount() { - if (segmentData != null) { - return segmentData.numDocs(); - } else { - return savedStatusCount.orElse(0); - } - } - - public long getIndexSizeOnDiskInBytes() { - return indexSizeOnDiskInBytes.get(); - } - - public void setIndexSizeOnDiskInBytes(long value) { - indexSizeOnDiskInBytes.set(value); - } - - public int getPartialUpdateCount() { - return partialUpdateCount.get(); - } - - public void incrementPartialUpdateCount() { - partialUpdateCount.incrementAndGet(); - } - - public void setPartialUpdateCount(int value) { - partialUpdateCount.set(value); - } - - public int getOutOfOrderUpdateCount() { - return outOfOrderUpdateCount.get(); - } - - public void incrementOutOfOrderUpdateCount() { - outOfOrderUpdateCount.incrementAndGet(); - } - - public void setOutOfOrderUpdateCount(int value) { - outOfOrderUpdateCount.set(value); - } - - @Override - public String toString() { - StringBuilder sb = new StringBuilder(); - sb.append("Indexed ").append(getStatusCount()).append(" documents, "); - sb.append(getDeleteCount()).append(" deletes, "); - sb.append(getPartialUpdateCount()).append(" partial updates, "); - sb.append(getOutOfOrderUpdateCount()).append(" out of order updates. "); - sb.append("Index size: ").append(getIndexSizeOnDiskInBytes()); - return sb.toString(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.docx b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.docx new file mode 100644 index 000000000..9112b1808 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.java b/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.java deleted file mode 100644 index dca484e4c..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentIndexStatsExporter.java +++ /dev/null @@ -1,85 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.twitter.common.base.Supplier; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchMetric; -import com.twitter.search.common.metrics.SearchMetricsRegistry; - -/** - * Exporting per-segment stats collected in {@link SegmentIndexStats}. - * - * This class tries to reuse stat prefixes of "segment_stats_[0-N]_*" where N is the number - * of segments managed by this earlybird. - * For example, stats prefixed with "segment_stats_0_*" always represent the most recent segment. - * As we add more segments (and drop older ones), the same "segment_stats_*" stats end up exporting - * data for different underlying segments. - * - * This is done as an alternative to exporting stats that have the timesliceId in them, which - * would avoid the need for reusing the same stat names, but would create an ever-increasing set - * of unique stats exported by earlybirds. 
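To make the naming scheme concrete, here is a hedged sketch of exporting the newest segment at index 0 and the gauge names that result; the suffixes come from the export method below, and the wrapper method is hypothetical.

```java
// Illustrative only: export the newest segment at index 0. This registers (or repoints) gauges:
//   segment_stats_0_status_count
//   segment_stats_0_delete_count
//   segment_stats_0_partial_update_count
//   segment_stats_0_out_of_order_update_count
//   segment_stats_0_segment_size_bytes
//   segment_stats_0_timeslice_id
// After a segment roll, calling export() again with the new newest segment and the same index
// makes these unchanged stat names report the new segment's counters.
public final class StatsExportExample {
  static void exportNewest(SegmentInfo newestSegmentInfo) {
    SegmentIndexStatsExporter.export(newestSegmentInfo, 0);
  }
}
```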
- */ -public final class SegmentIndexStatsExporter { - private static final class StatReader extends SearchMetric { - private volatile Supplier counter = () -> 0; - - private StatReader(String name) { - super(name); - } - - @Override - public Long read() { - return counter.get().longValue(); - } - - @Override - public void reset() { - counter = () -> 0; - } - } - - private SegmentIndexStatsExporter() { - } - - private static final String NAME_PREFIX = "segment_stats_"; - - /** - * Exports stats for some counts for the given segment: - * - status_count: number of tweets indexed - * - delete_count: number of deletes indexed - * - partial_update_count: number of partial updates indexed - * - out_of_order_update_count: number of out of order updates indexed - * - segment_size_bytes: the segment size in bytes - * - * @param segmentInfo The segment for which these stats should be exported. - * @param segmentIndex The index of this segment in the list of all segments. - */ - public static void export(SegmentInfo segmentInfo, int segmentIndex) { - exportStat(segmentIndex, "status_count", - () -> segmentInfo.getIndexStats().getStatusCount()); - exportStat(segmentIndex, "delete_count", - () -> segmentInfo.getIndexStats().getDeleteCount()); - exportStat(segmentIndex, "partial_update_count", - () -> segmentInfo.getIndexStats().getPartialUpdateCount()); - exportStat(segmentIndex, "out_of_order_update_count", - () -> segmentInfo.getIndexStats().getOutOfOrderUpdateCount()); - exportStat(segmentIndex, "segment_size_bytes", - () -> segmentInfo.getIndexStats().getIndexSizeOnDiskInBytes()); - - SearchLongGauge timeSliceIdStat = - SearchLongGauge.export(NAME_PREFIX + segmentIndex + "_timeslice_id"); - timeSliceIdStat.set(segmentInfo.getTimeSliceID()); - } - - private static void exportStat(final int segmentIndex, - final String nameSuffix, - Supplier counter) { - final String name = getName(segmentIndex, nameSuffix); - StatReader statReader = SearchMetricsRegistry.registerOrGet( - () -> new StatReader(name), name, StatReader.class); - statReader.counter = counter; - } - - private static String getName(final int segmentIndex, final String nameSuffix) { - return NAME_PREFIX + segmentIndex + "_" + nameSuffix; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentInfo.docx b/src/java/com/twitter/search/earlybird/partition/SegmentInfo.docx new file mode 100644 index 000000000..519f7157c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentInfo.java b/src/java/com/twitter/search/earlybird/partition/SegmentInfo.java deleted file mode 100644 index 684cf2bbe..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentInfo.java +++ /dev/null @@ -1,428 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.File; -import java.io.IOException; -import java.io.OutputStreamWriter; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.concurrent.atomic.AtomicLong; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.commons.io.FileUtils; -import org.apache.lucene.store.Directory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.partitioning.base.TimeSlice; -import 
com.twitter.search.common.schema.earlybird.FlushVersion; -import com.twitter.search.common.util.LogFormatUtil; -import com.twitter.search.common.util.io.flushable.FlushInfo; -import com.twitter.search.common.util.io.flushable.PersistentFile; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.index.EarlybirdSegment; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; - -public class SegmentInfo implements Comparable { - private static final Logger LOG = LoggerFactory.getLogger(SegmentInfo.class); - - private static final String UPDATE_STREAM_OFFSET_TIMESTAMP = "updateStreamOffsetTimestamp"; - public static final int INVALID_ID = -1; - - // Delay before deleting a segment - private final long timeToWaitBeforeClosingMillis = EarlybirdConfig.getLong( - "defer_index_closing_time_millis", 600000L); - // How many times deletions are retired. - private final AtomicInteger deletionRetries = new AtomicInteger(5); - - // Base segment information, including database name, minStatusId. - private final Segment segment; - - // Bits managed by various SegmentProcessors and PartitionManager. - private volatile boolean isEnabled = true; // True if the segment is enabled. - private volatile boolean isIndexing = false; // True during indexing. - private volatile boolean isComplete = false; // True when indexing is complete. - private volatile boolean isClosed = false; // True if indexSegment is closed. - private volatile boolean wasIndexed = false; // True if the segment was indexed from scratch. - private volatile boolean failedOptimize = false; // optimize attempt failed. - private AtomicBoolean beingUploaded = new AtomicBoolean(); // segment is being copied to HDFS - - private final SegmentSyncInfo segmentSyncInfo; - private final EarlybirdIndexConfig earlybirdIndexConfig; - - private final EarlybirdSegment indexSegment; - - private final AtomicLong updatesStreamOffsetTimestamp = new AtomicLong(0); - - public SegmentInfo(Segment segment, - EarlybirdSegmentFactory earlybirdSegmentFactory, - SegmentSyncConfig syncConfig) throws IOException { - this(segment, earlybirdSegmentFactory, new SegmentSyncInfo(syncConfig, segment)); - } - - @VisibleForTesting - public SegmentInfo(Segment segment, - EarlybirdSegmentFactory earlybirdSegmentFactory, - SegmentSyncInfo segmentSyncInfo) throws IOException { - this(earlybirdSegmentFactory.newEarlybirdSegment(segment, segmentSyncInfo), - segmentSyncInfo, - segment, - earlybirdSegmentFactory.getEarlybirdIndexConfig()); - } - - public SegmentInfo( - EarlybirdSegment earlybirdSegment, - SegmentSyncInfo segmentSyncInfo, - Segment segment, - EarlybirdIndexConfig earlybirdIndexConfig - ) { - this.indexSegment = earlybirdSegment; - this.segmentSyncInfo = segmentSyncInfo; - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.segment = segment; - } - - public EarlybirdSegment getIndexSegment() { - return indexSegment; - } - - public SegmentIndexStats getIndexStats() { - return indexSegment.getIndexStats(); - } - - public EarlybirdIndexConfig getEarlybirdIndexConfig() { - return earlybirdIndexConfig; - } - - public long getTimeSliceID() { - return segment.getTimeSliceID(); - } - - public String getSegmentName() { - return segment.getSegmentName(); - } - - public int getNumPartitions() { - return segment.getNumHashPartitions(); - } - - public boolean isEnabled() { - return isEnabled; - } - - public void setIsEnabled(boolean isEnabled) { - this.isEnabled 
= isEnabled; - } - - public boolean isOptimized() { - return indexSegment.isOptimized(); - } - - public boolean wasIndexed() { - return wasIndexed; - } - - public void setWasIndexed(boolean wasIndexed) { - this.wasIndexed = wasIndexed; - } - - public boolean isFailedOptimize() { - return failedOptimize; - } - - public void setFailedOptimize() { - this.failedOptimize = true; - } - - public boolean isIndexing() { - return isIndexing; - } - - public void setIndexing(boolean indexing) { - this.isIndexing = indexing; - } - - public boolean isComplete() { - return isComplete; - } - - public boolean isClosed() { - return isClosed; - } - - public boolean isBeingUploaded() { - return beingUploaded.get(); - } - - public void setBeingUploaded(boolean beingUploaded) { - this.beingUploaded.set(beingUploaded); - } - - public boolean casBeingUploaded(boolean expectation, boolean updateValue) { - return beingUploaded.compareAndSet(expectation, updateValue); - } - - @VisibleForTesting - public void setComplete(boolean complete) { - this.isComplete = complete; - } - - public boolean needsIndexing() { - return isEnabled && !isIndexing && !isComplete; - } - - @Override - public int compareTo(SegmentInfo other) { - return Long.compare(getTimeSliceID(), other.getTimeSliceID()); - } - - @Override - public boolean equals(Object obj) { - return obj instanceof SegmentInfo && compareTo((SegmentInfo) obj) == 0; - } - - @Override - public int hashCode() { - return new Long(getTimeSliceID()).hashCode(); - } - - public long getUpdatesStreamOffsetTimestamp() { - return updatesStreamOffsetTimestamp.get(); - } - - public void setUpdatesStreamOffsetTimestamp(long timestamp) { - updatesStreamOffsetTimestamp.set(timestamp); - } - - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - builder.append(getSegmentName()).append(" ["); - builder.append(isEnabled ? "enabled, " : "disabled, "); - - if (isIndexing) { - builder.append("indexing, "); - } - - if (isComplete) { - builder.append("complete, "); - } - - if (isOptimized()) { - builder.append("optimized, "); - } - - if (wasIndexed) { - builder.append("wasIndexed, "); - } - - builder.append("IndexSync:"); - this.segmentSyncInfo.addDebugInfo(builder); - - return builder.append("]").toString(); - } - - public Segment getSegment() { - return segment; - } - - /** - * Delete the index segment directory corresponding to this segment info. Return true if deleted - * successfully; otherwise, false. - */ - public boolean deleteLocalIndexedSegmentDirectoryImmediately() { - if (isClosed) { - LOG.info("SegmentInfo is already closed: " + toString()); - return true; - } - - Preconditions.checkNotNull(indexSegment, "indexSegment should never be null."); - isClosed = true; - indexSegment.destroyImmediately(); - - SegmentSyncConfig sync = getSyncInfo().getSegmentSyncConfig(); - try { - String dirToClear = sync.getLocalSyncDirName(segment); - FileUtils.forceDelete(new File(dirToClear)); - LOG.info("Deleted segment directory: " + toString()); - return true; - } catch (IOException e) { - LOG.error("Cannot clean up segment directory for segment: " + toString(), e); - return false; - } - } - - /** - * Delete the index segment directory after some configured delay. - * Note that we don't delete segments that are being uploaded. - * If a segment is being uploaded when we try to delete, close() retries the deletion later. 
- */ - public void deleteIndexSegmentDirectoryAfterDelay() { - LOG.info("Scheduling SegmentInfo for deletion: " + toString()); - getEarlybirdIndexConfig().getResourceCloser().closeResourceQuietlyAfterDelay( - timeToWaitBeforeClosingMillis, () -> { - // Atomically check and set the being uploaded flag, if it is not set. - if (beingUploaded.compareAndSet(false, true)) { - // If successfully set the flag to true, we can delete immediately - setIsEnabled(false); - deleteLocalIndexedSegmentDirectoryImmediately(); - LOG.info("Deleted index segment dir for segment: " - + getSegment().getSegmentName()); - } else { - // If the flag is already true (compareAndSet fails), we need to reschedule. - if (deletionRetries.decrementAndGet() > 0) { - LOG.warn("Segment is being uploaded, will retry deletion later. SegmentInfo: " - + getSegment().getSegmentName()); - deleteIndexSegmentDirectoryAfterDelay(); - } else { - LOG.warn("Failed to cleanup index segment dir for segment: " - + getSegment().getSegmentName()); - } - } - }); - } - - public SegmentSyncInfo getSyncInfo() { - return segmentSyncInfo; - } - - public FlushVersion getFlushVersion() { - return FlushVersion.CURRENT_FLUSH_VERSION; - } - - public String getZkNodeName() { - return getSegmentName() + getFlushVersion().getVersionFileExtension(); - } - - static String getSyncDirName(String parentDir, String dbName, String version) { - return parentDir + "/" + dbName + version; - } - - /** - * Parses the segment name from the name of the flushed directory. - */ - public static String getSegmentNameFromFlushedDir(String flushedDir) { - String segmentName = null; - String[] fields = flushedDir.split("/"); - if (fields.length > 0) { - segmentName = fields[fields.length - 1]; - segmentName = segmentName.replaceAll(FlushVersion.DELIMITER + ".*", ""); - } - return segmentName; - } - - /** - * Flushes this segment to the given directory. - * - * @param dir The directory to flush the segment to. - * @throws IOException If the segment could not be flushed. - */ - public void flush(Directory dir) throws IOException { - LOG.info("Flushing segment: {}", getSegmentName()); - try (PersistentFile.Writer writer = PersistentFile.getWriter(dir, getSegmentName())) { - FlushInfo flushInfo = new FlushInfo(); - flushInfo.addLongProperty(UPDATE_STREAM_OFFSET_TIMESTAMP, getUpdatesStreamOffsetTimestamp()); - getIndexSegment().flush(flushInfo, writer.getDataSerializer()); - - OutputStreamWriter infoFileWriter = new OutputStreamWriter(writer.getInfoFileOutputStream()); - FlushInfo.flushAsYaml(flushInfo, infoFileWriter); - } - } - - /** - * Makes a new SegmentInfo out of the current segment info, except that we switch the underlying - * segment. - */ - public SegmentInfo copyWithEarlybirdSegment(EarlybirdSegment optimizedSegment) { - // Take everything from the current segment info that doesn't change for the new segment - // info and rebuild everything that can change. - TimeSlice newTimeSlice = new TimeSlice( - getTimeSliceID(), - EarlybirdConfig.getMaxSegmentSize(), - segment.getHashPartitionID(), - segment.getNumHashPartitions() - ); - Segment newSegment = newTimeSlice.getSegment(); - - return new SegmentInfo( - optimizedSegment, - new SegmentSyncInfo( - segmentSyncInfo.getSegmentSyncConfig(), - newSegment), - newSegment, - earlybirdIndexConfig - ); - } - - /** - * Loads the segment from the given directory. - * - * @param dir The directory to load the segment from. - * @throws IOException If the segment could not be loaded. 
- */ - public void load(Directory dir) throws IOException { - LOG.info("Loading segment: {}", getSegmentName()); - try (PersistentFile.Reader reader = PersistentFile.getReader(dir, getSegmentName())) { - FlushInfo flushInfo = FlushInfo.loadFromYaml(reader.getInfoInputStream()); - setUpdatesStreamOffsetTimestamp(flushInfo.getLongProperty(UPDATE_STREAM_OFFSET_TIMESTAMP)); - getIndexSegment().load(reader.getDataInputStream(), flushInfo); - } - } - - private String getShortStatus() { - if (!isEnabled()) { - return "disabled"; - } - - if (isIndexing()) { - return "indexing"; - } - - if (isComplete()) { - return "indexed"; - } - - return "pending"; - } - - /** - * Get a string to be shown in admin commands which shows the query caches' sizes for this - * segment. - */ - public String getQueryCachesData() { - StringBuilder out = new StringBuilder(); - out.append("Segment: " + getSegmentName() + "\n"); - out.append("Total documents: " + LogFormatUtil.formatInt( - getIndexStats().getStatusCount()) + "\n"); - out.append("Query caches:\n"); - for (Pair data : indexSegment.getQueryCachesData()) { - out.append(" " + data.getFirst()); - out.append(": "); - out.append(LogFormatUtil.formatInt(data.getSecond())); - out.append("\n"); - } - return out.toString(); - } - - public String getSegmentMetadata() { - return "status: " + getShortStatus() + "\n" - + "id: " + getTimeSliceID() + "\n" - + "name: " + getSegmentName() + "\n" - + "statusCount: " + getIndexStats().getStatusCount() + "\n" - + "deleteCount: " + getIndexStats().getDeleteCount() + "\n" - + "partialUpdateCount: " + getIndexStats().getPartialUpdateCount() + "\n" - + "outOfOrderUpdateCount: " + getIndexStats().getOutOfOrderUpdateCount() + "\n" - + "isEnabled: " + isEnabled() + "\n" - + "isIndexing: " + isIndexing() + "\n" - + "isComplete: " + isComplete() + "\n" - + "isFlushed: " + getSyncInfo().isFlushed() + "\n" - + "isOptimized: " + isOptimized() + "\n" - + "isLoaded: " + getSyncInfo().isLoaded() + "\n" - + "wasIndexed: " + wasIndexed() + "\n" - + "queryCachesCardinality: " + indexSegment.getQueryCachesCardinality() + "\n"; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentLoader.docx b/src/java/com/twitter/search/earlybird/partition/SegmentLoader.docx new file mode 100644 index 000000000..7581b65a6 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentLoader.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentLoader.java b/src/java/com/twitter/search/earlybird/partition/SegmentLoader.java deleted file mode 100644 index f36233073..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentLoader.java +++ /dev/null @@ -1,300 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.File; -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -import org.apache.commons.io.FileUtils; -import org.apache.commons.io.IOUtils; -import org.apache.hadoop.fs.FileStatus; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.Timer; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.util.io.flushable.PersistentFile; -import 
com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.FlushVersionMismatchException; -import com.twitter.search.earlybird.stats.SegmentSyncStats; - -public class SegmentLoader { - private static final Logger LOG = LoggerFactory.getLogger(SegmentLoader.class); - private static final SegmentSyncStats SEGMENT_LOAD_FROM_HDFS_STATS = - new SegmentSyncStats("load_from_hdfs"); - - private final CriticalExceptionHandler criticalExceptionHandler; - private final SegmentSyncConfig segmentSyncConfig; - - private final Clock clock; - - public SegmentLoader(SegmentSyncConfig sync, - CriticalExceptionHandler criticalExceptionHandler) { - this(sync, criticalExceptionHandler, Clock.SYSTEM_CLOCK); - } - - public SegmentLoader(SegmentSyncConfig sync, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - this.criticalExceptionHandler = criticalExceptionHandler; - this.segmentSyncConfig = sync; - this.clock = clock; - } - - public boolean load(SegmentInfo segmentInfo) { - return downloadSegment(segmentInfo) && loadSegmentFromDisk(segmentInfo); - } - - /** - * Determines if the Earlybird should attempt to download the given segment from HDFS. This - * returns true if the segment is not already present on local disk, and the segment does exist - * on HDFS. - */ - public boolean shouldDownloadSegmentWhileInServerSet(SegmentInfo segmentInfo) { - if (isValidSegmentOnDisk(segmentInfo)) { - return false; - } - try (FileSystem fs = HdfsUtil.getHdfsFileSystem()) { - return HdfsUtil.segmentExistsOnHdfs(fs, segmentInfo); - } catch (IOException e) { - LOG.error("Failed to check HDFS for segment " + segmentInfo, e); - return false; - } - } - - /** - * Verifies if the data for the given segment is present on the local disk, and if it's not, - * downloads it from HDFS. - */ - public boolean downloadSegment(SegmentInfo segmentInfo) { - if (!segmentInfo.isEnabled()) { - LOG.debug("Segment is disabled: " + segmentInfo); - return false; - } - - if (segmentInfo.isIndexing() || segmentInfo.getSyncInfo().isLoaded()) { - LOG.debug("Cannot load indexing or loaded segment: " + segmentInfo); - return false; - } - - // Return whether the appropriate version is on disk, and if not, download it from HDFS. - return isValidSegmentOnDisk(segmentInfo) || checkSegmentOnHdfsAndCopyLocally(segmentInfo); - } - - /** - * Loads the data for the given segment from the local disk. 
- */ - public boolean loadSegmentFromDisk(SegmentInfo segmentInfo) { - if (segmentInfo.isIndexing()) { - LOG.error("Tried to load current segment!"); - return false; - } - - segmentInfo.setIndexing(true); - try { - File flushDir = new File(segmentInfo.getSyncInfo().getLocalSyncDir()); - Directory loadDir = FSDirectory.open(flushDir.toPath()); - - segmentInfo.load(loadDir); - - if (!verifySegmentStatusCountLargeEnough(segmentInfo)) { - SearchRateCounter.export( - "segment_loader_failed_too_few_tweets_in_segment_" + segmentInfo.getSegmentName()) - .increment(); - return false; - } - - segmentInfo.setIndexing(false); - segmentInfo.setComplete(true); - segmentInfo.getSyncInfo().setLoaded(true); - return true; - } catch (FlushVersionMismatchException e) { - handleException(segmentInfo, e); - // If earlybird is in starting state, handler will terminate it - criticalExceptionHandler.handle(this, e); - } catch (Exception e) { - handleException(segmentInfo, e); - } - - SearchRateCounter.export("segment_loader_failed_" + segmentInfo.getSegmentName()).increment(); - return false; - } - - // Check to see if the segment exists on disk, and its checksum passes. - private boolean isValidSegmentOnDisk(SegmentInfo segment) { - String loadDirStr = segment.getSyncInfo().getLocalSyncDir(); - File loadDir = new File(loadDirStr); - - if (!loadDir.exists()) { - return false; - } - - for (String persistentFileName : segmentSyncConfig.getPersistentFileNames(segment)) { - if (!verifyInfoChecksum(loadDir, persistentFileName)) { - return false; - } - } - - return true; - } - - private static boolean verifyInfoChecksum(File loadDir, String databaseName) { - if (checksumFileExists(loadDir, databaseName)) { - try { - Directory dir = FSDirectory.open(loadDir.toPath()); - PersistentFile.Reader reader = PersistentFile.getReader(dir, databaseName); - try { - reader.verifyInfoChecksum(); - return true; - } finally { - IOUtils.closeQuietly(reader); - IOUtils.closeQuietly(dir); - } - } catch (PersistentFile.CorruptFileException e) { - LOG.error("Failed checksum verification.", e); - } catch (IOException e) { - LOG.error("Error while trying to read checksum file", e); - } - } - return false; - } - - // Check that the loaded segment's status count is higher than the configured threshold - private boolean verifySegmentStatusCountLargeEnough(SegmentInfo segmentInfo) { - long segmentStatusCount = segmentInfo.getIndexStats().getStatusCount(); - if (segmentStatusCount > segmentSyncConfig.getMinSegmentStatusCountThreshold()) { - return true; - } else if (segmentInfo.getEarlybirdIndexConfig().isIndexStoredOnDisk() - && couldBeMostRecentArchiveSegment(segmentInfo)) { - // The most recent archive earlybird segment is expected to be incomplete - LOG.info("Segment status count (" + segmentStatusCount + ") is below the threshold of " - + segmentSyncConfig.getMinSegmentStatusCountThreshold() - + ", but this is expected because the most recent segment is expected to be incomplete: " - + segmentInfo); - return true; - } else { - // The segment status count is small so the segment is likely incomplete. 
- LOG.error("Segment status count (" + segmentStatusCount + ") is below the threshold of " - + segmentSyncConfig.getMinSegmentStatusCountThreshold() + ": " + segmentInfo); - segmentInfo.setIndexing(false); - segmentInfo.getSyncInfo().setLoaded(false); - - // Remove segment from local disk - if (!segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately()) { - LOG.error("Failed to cleanup unloadable segment directory."); - } - - return false; - } - } - - // Check if this segment could be the most recent archive earlybird segment (would be on the - // latest tier). Archive segments tend to span around 12 days, so using a conservative threshold - // of 20 days. - private boolean couldBeMostRecentArchiveSegment(SegmentInfo segmentInfo) { - long timesliceAgeMs = - SnowflakeIdParser.getTweetAgeInMs(clock.nowMillis(), segmentInfo.getTimeSliceID()); - return (timesliceAgeMs / 1000 / 60 / 60 / 24) <= 20; - } - - /** - * Check to see if the segment exists on hdfs. Will look for the correct segment version - * uploaded by any of the hosts. - * If the segment exists on hdfs, the segment will be copied from hdfs to the local file - * system, and we will verify the checksum against the copied version. - * @return true iff the segment was copied to local disk, and the checksum is verified. - */ - private boolean checkSegmentOnHdfsAndCopyLocally(SegmentInfo segment) { - if (!segmentSyncConfig.isSegmentLoadFromHdfsEnabled()) { - return isValidSegmentOnDisk(segment); - } - - LOG.info("About to start downloading segment from hdfs: " + segment); - Timer timer = new Timer(TimeUnit.MILLISECONDS); - String status = null; - String localBaseDir = segment.getSyncInfo().getLocalSyncDir(); - FileSystem fs = null; - try { - fs = HdfsUtil.getHdfsFileSystem(); - - String hdfsBaseDirPrefix = segment.getSyncInfo().getHdfsSyncDirPrefix(); - FileStatus[] statuses = fs.globStatus(new Path(hdfsBaseDirPrefix)); - if (statuses != null && statuses.length > 0) { - Path hdfsSyncPath = statuses[0].getPath(); - copySegmentFilesFromHdfs(segment, segmentSyncConfig, fs, hdfsSyncPath); - status = "loaded"; - } else { - LOG.info("No segments found in hdfs under: " + hdfsBaseDirPrefix); - status = "notloaded"; - } - fs.close(); - } catch (IOException ex) { - LOG.error("Failed copying segment from hdfs: " + segment + " after: " - + timer.stop() + " ms", ex); - status = "exception"; - SEGMENT_LOAD_FROM_HDFS_STATS.recordError(); - try { - FileUtils.deleteDirectory(new File(localBaseDir)); - } catch (IOException e) { - LOG.error("Error cleaning up local segment directory: " + segment, e); - } - } finally { - timer.stop(); - SEGMENT_LOAD_FROM_HDFS_STATS.actionComplete(timer); - LOG.info("Download from hdfs completed in " - + timer.getElapsed() + " milliseconds: " + segment + " status: " + status); - IOUtils.closeQuietly(fs); - } - - // now check to see if we have successfully copied the segment - return isValidSegmentOnDisk(segment); - } - - private static void copySegmentFilesFromHdfs(SegmentInfo segment, - SegmentSyncConfig syncConfig, - FileSystem fs, - Path hdfsSyncPath) throws IOException { - String localBaseDir = segment.getSyncInfo().getLocalSyncDir(); - File localBaseDirFile = new File(localBaseDir); - FileUtils.deleteQuietly(localBaseDirFile); - if (localBaseDirFile.exists()) { - LOG.warn("Cannot delete the existing path: " + localBaseDir); - } - for (String fileName : syncConfig.getAllSyncFileNames(segment)) { - Path hdfsFilePath = new Path(hdfsSyncPath, fileName); - String localFileName = localBaseDir + "/" + fileName; - 
LOG.debug("About to start loading from hdfs: " + fileName + " from: " - + hdfsFilePath + " to: " + localFileName); - - Timer timer = new Timer(TimeUnit.MILLISECONDS); - fs.copyToLocalFile(hdfsFilePath, new Path(localFileName)); - LOG.debug("Loaded segment file from hdfs: " + fileName + " from: " - + hdfsFilePath + " to: " + localFileName + " in: " + timer.stop() + " ms."); - } - - LOG.info("Finished downloading segments from " + hdfsSyncPath); - } - - private static boolean checksumFileExists(File loadDir, String databaseName) { - String checksumFileName = PersistentFile.genChecksumFileName(databaseName); - File checksumFile = new File(loadDir, checksumFileName); - - return checksumFile.exists(); - } - - private void handleException(SegmentInfo segmentInfo, Exception e) { - LOG.error("Exception while loading IndexSegment from " - + segmentInfo.getSyncInfo().getLocalSyncDir(), e); - - segmentInfo.setIndexing(false); - segmentInfo.getSyncInfo().setLoaded(false); - if (!segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately()) { - LOG.error("Failed to cleanup unloadable segment directory."); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentManager.docx b/src/java/com/twitter/search/earlybird/partition/SegmentManager.docx new file mode 100644 index 000000000..16a0ecee9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentManager.java b/src/java/com/twitter/search/earlybird/partition/SegmentManager.java deleted file mode 100644 index 69736da8d..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentManager.java +++ /dev/null @@ -1,822 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.ConcurrentSkipListMap; -import java.util.stream.Collectors; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Predicate; -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.partitioning.base.TimeSlice; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.common.CaughtUpMonitor; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserUpdate; -import com.twitter.search.earlybird.common.userupdates.UserUpdatesChecker; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.EarlybirdSegmentFactory; -import 
com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher; -import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher; -import com.twitter.search.earlybird.search.EarlybirdMultiSegmentSearcher; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.EarlybirdResponseCode; -import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent; - -public class SegmentManager { - private static final Logger LOG = LoggerFactory.getLogger(SegmentManager.class); - private final Clock clock; - private static final String STATS_PREFIX = "segment_manager_"; - private static final SearchLongGauge SEGMENT_COUNT_STATS = - SearchLongGauge.export(STATS_PREFIX + "total_segments"); - private static final SearchCounter OPTIMIZED_SEGMENTS = - SearchCounter.export(STATS_PREFIX + "optimized_segments"); - private static final SearchCounter UNOPTIMIZED_SEGMENTS = - SearchCounter.export(STATS_PREFIX + "unoptimized_segments"); - - public enum Filter { - All(info -> true), - Enabled(SegmentInfo::isEnabled), - NeedsIndexing(SegmentInfo::needsIndexing), - Complete(SegmentInfo::isComplete); - - private final Predicate predicate; - - Filter(Predicate predicate) { - this.predicate = predicate; - } - - private static final Map NAME_INDEX = - Maps.newHashMapWithExpectedSize(Filter.values().length); - - static { - for (Filter filter : Filter.values()) { - NAME_INDEX.put(filter.name().toLowerCase(), filter); - } - } - - /** - * Parses the filter from the given string, based on the filter name. - */ - public static Filter fromStringIgnoreCase(String str) { - if (str == null) { - return null; - } - - return NAME_INDEX.get(str.toLowerCase()); - } - } - - public enum Order { - OLD_TO_NEW, - NEW_TO_OLD, - } - - /** - * A listener that gets notified when the list of segments changes. - */ - public interface SegmentUpdateListener { - /** - * Called with the new list of segments when it changes. - * - * @param segments The new list of segments. 
- */ - void update(Collection segments, String message); - } - - private final List updateListeners = - Collections.synchronizedList(Lists.newLinkedList()); - - private final ConcurrentSkipListMap segmentWriters = - new ConcurrentSkipListMap<>(); - - private final Set badTimesliceIds = new HashSet<>(); - - private final int maxEnabledSegments; - private final int maxSegmentSize; - private final EarlybirdSegmentFactory earlybirdSegmentFactory; - private final UserTable userTable; - private final UserScrubGeoMap userScrubGeoMap; - private final EarlybirdIndexConfig earlybirdIndexConfig; - private final DynamicPartitionConfig dynamicPartitionConfig; - private final UserUpdatesChecker userUpdatesChecker; - private final SegmentSyncConfig segmentSyncConfig; - private final EarlybirdSearcherStats searcherStats; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final CriticalExceptionHandler criticalExceptionHandler; - private final CaughtUpMonitor indexCaughtUpMonitor; - - public SegmentManager( - DynamicPartitionConfig dynamicPartitionConfig, - EarlybirdIndexConfig earlybirdIndexConfig, - SearchIndexingMetricSet searchIndexingMetricSet, - EarlybirdSearcherStats searcherStats, - SearchStatsReceiver earlybirdStatsReceiver, - UserUpdatesChecker userUpdatesChecker, - SegmentSyncConfig segmentSyncConfig, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - Clock clock, - int maxSegmentSize, - CriticalExceptionHandler criticalExceptionHandler, - CaughtUpMonitor indexCaughtUpMonitor) { - - PartitionConfig curPartitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - - this.userTable = userTable; - this.userScrubGeoMap = userScrubGeoMap; - - this.earlybirdSegmentFactory = new EarlybirdSegmentFactory( - earlybirdIndexConfig, - searchIndexingMetricSet, - searcherStats, - clock); - this.earlybirdIndexConfig = earlybirdIndexConfig; - this.maxEnabledSegments = curPartitionConfig.getMaxEnabledLocalSegments(); - this.dynamicPartitionConfig = dynamicPartitionConfig; - this.userUpdatesChecker = userUpdatesChecker; - this.segmentSyncConfig = segmentSyncConfig; - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.searcherStats = searcherStats; - this.clock = clock; - this.maxSegmentSize = maxSegmentSize; - this.criticalExceptionHandler = criticalExceptionHandler; - this.indexCaughtUpMonitor = indexCaughtUpMonitor; - - earlybirdStatsReceiver.getCustomGauge("total_loaded_segments", - segmentWriters::size); - earlybirdStatsReceiver.getCustomGauge("total_indexed_documents", - this::getNumIndexedDocuments); - earlybirdStatsReceiver.getCustomGauge("total_segment_size_bytes", - this::getTotalSegmentSizeOnDisk); - earlybirdStatsReceiver.getCustomGauge("earlybird_index_depth_millis", - this::getIndexDepthMillis); - } - - /** - * Logs the current state of this segment manager. - * - * @param label A label that should identify the segment manager. 
- */ - public void logState(String label) { - StringBuilder sb = new StringBuilder(); - sb.append("State of SegmentManager (" + label + "):\n"); - sb.append("Number of segments: " + segmentWriters.size()); - boolean hasSegments = false; - for (Map.Entry entry : this.segmentWriters.entrySet()) { - SegmentInfo segmentInfo = entry.getValue().getSegmentInfo(); - hasSegments = true; - - sb.append(String.format("\nSegment (%s): isClosed: %5s, isComplete: %5s, " - + "isEnabled: %5s, isIndexing: %5s, isOptimized: %5s, wasIndexed: %5s", - segmentInfo.getSegmentName(), - segmentInfo.isClosed(), - segmentInfo.isComplete(), - segmentInfo.isEnabled(), - segmentInfo.isIndexing(), - segmentInfo.isOptimized(), - segmentInfo.wasIndexed() - )); - - sb.append(String.format(" | Index stats: %s", segmentInfo.getIndexStats().toString())); - } - if (!hasSegments) { - sb.append(" No segments."); - } - LOG.info(sb.toString()); - } - - - public PartitionConfig getPartitionConfig() { - return dynamicPartitionConfig.getCurrentPartitionConfig(); - } - - public int getMaxEnabledSegments() { - return maxEnabledSegments; - } - - public EarlybirdSegmentFactory getEarlybirdSegmentFactory() { - return earlybirdSegmentFactory; - } - - public EarlybirdIndexConfig getEarlybirdIndexConfig() { - return earlybirdIndexConfig; - } - - public UserTable getUserTable() { - return userTable; - } - - public UserScrubGeoMap getUserScrubGeoMap() { - return userScrubGeoMap; - } - - @VisibleForTesting - public void reset() { - segmentWriters.clear(); - } - - /** - * Returns the list of all segments that match the given filter, in the given order. - */ - public Iterable getSegmentInfos(Filter filter, Order order) { - Comparator comparator; - - if (order == Order.OLD_TO_NEW) { - comparator = Comparator.naturalOrder(); - } else { - comparator = Comparator.reverseOrder(); - } - - return () -> segmentWriters.values().stream() - .map(ISegmentWriter::getSegmentInfo) - .filter(filter.predicate::apply) - .sorted(comparator) - .iterator(); - } - - private void createAndPutSegmentInfo(Segment segment) throws IOException { - LOG.info("Creating new SegmentInfo for segment " + segment.getSegmentName()); - putSegmentInfo(new SegmentInfo(segment, earlybirdSegmentFactory, segmentSyncConfig)); - } - - /** - * Updates the list of segments managed by this manager, based on the given list. - */ - public void updateSegments(List segmentsList) throws IOException { - // Truncate to the amount of segments we want to keep enabled. - List truncatedSegmentList = - SegmentManager.truncateSegmentList(segmentsList, maxEnabledSegments); - - final long newestTimeSliceID = getNewestTimeSliceID(); - final Set segmentsToDisable = new HashSet<>(segmentWriters.keySet()); - - for (Segment segment : truncatedSegmentList) { - final long timeSliceID = segment.getTimeSliceID(); - segmentsToDisable.remove(timeSliceID); - - // On the first loop iteration of the first call to updateSegments(), newestTimeSliceID should - // be set to -1, so the condition should be false. After that, all segments should either be - // newer than the latest process segment, or if we're replacing an old segment, it should have - // a SegmentInfo instance associated with it. - if (timeSliceID <= newestTimeSliceID) { - ISegmentWriter segmentWriter = segmentWriters.get(timeSliceID); - // Old time slice ID. It should have a SegmentInfo instance associated with it. - if (segmentWriter == null) { - if (!badTimesliceIds.contains(timeSliceID)) { - // We're dealing with a bad timeslice. 
Log an error, but do it only once per timeslice. - LOG.error("The SegmentInfo instance associated with an old timeSliceID should never be " - + "null. TimeSliceID: {}", timeSliceID); - badTimesliceIds.add(timeSliceID); - } - } else if (segmentWriter.getSegmentInfo().isClosed()) { - // If the SegmentInfo was closed, create a new one. - LOG.info("SegmentInfo for segment {} is closed.", segment.getSegmentName()); - createAndPutSegmentInfo(segment); - } - } else { - // New time slice ID: create a SegmentInfo instance for it. - createAndPutSegmentInfo(segment); - } - } - - // Anything we didn't see locally can be disabled. - for (Long segmentID : segmentsToDisable) { - disableSegment(segmentID); - } - - // Update segment stats and other exported variables. - updateStats(); - } - - /** - * Re-export stats after a segment has changed, or the set of segments has changed. - */ - public void updateStats() { - // Update the partition count stats. - SEGMENT_COUNT_STATS.set(segmentWriters.size()); - - OPTIMIZED_SEGMENTS.reset(); - UNOPTIMIZED_SEGMENTS.reset(); - for (ISegmentWriter writer : segmentWriters.values()) { - if (writer.getSegmentInfo().isOptimized()) { - OPTIMIZED_SEGMENTS.increment(); - } else { - UNOPTIMIZED_SEGMENTS.increment(); - } - } - } - - private long getIndexDepthMillis() { - long oldestTimeSliceID = getOldestEnabledTimeSliceID(); - if (oldestTimeSliceID == SegmentInfo.INVALID_ID) { - return 0; - } else { - // Compute timestamp from timesliceId, which is also a snowflake tweetId - long timestamp = SnowflakeIdParser.getTimestampFromTweetId(oldestTimeSliceID); - // Set current index depth in milliseconds - long indexDepthInMillis = System.currentTimeMillis() - timestamp; - // Index depth should never be negative. - if (indexDepthInMillis < 0) { - LOG.warn("Negative index depth. Large time skew on this Earlybird?"); - return 0; - } else { - return indexDepthInMillis; - } - } - } - - private void updateExportedSegmentStats() { - int index = 0; - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.Enabled, Order.NEW_TO_OLD)) { - SegmentIndexStatsExporter.export(segmentInfo, index++); - } - } - - // Marks the SegmentInfo object matching this time slice as disabled. - private void disableSegment(long timeSliceID) { - SegmentInfo info = getSegmentInfo(timeSliceID); - if (info == null) { - LOG.warn("Tried to disable missing segment " + timeSliceID); - return; - } - info.setIsEnabled(false); - LOG.info("Disabled segment " + info); - } - - public long getNewestTimeSliceID() { - final Iterator segments = getSegmentInfos(Filter.All, Order.NEW_TO_OLD).iterator(); - return segments.hasNext() ? segments.next().getTimeSliceID() : SegmentInfo.INVALID_ID; - } - - /** - * Returns the timeslice ID of the oldest enabled segment. - */ - public long getOldestEnabledTimeSliceID() { - if (segmentWriters.size() == 0) { - return SegmentInfo.INVALID_ID; - } - ISegmentWriter segmentWriter = segmentWriters.firstEntry().getValue(); - return segmentWriter.getSegmentInfo().getTimeSliceID(); - } - - /** - * Returns the SegmentInfo for the given timeSliceID. - */ - public final SegmentInfo getSegmentInfo(long timeSliceID) { - ISegmentWriter segmentWriter = segmentWriters.get(timeSliceID); - return segmentWriter == null ? null : segmentWriter.getSegmentInfo(); - } - - /** - * Returns the segment info for the segment that should contain the given tweet ID. 
- */ - public final SegmentInfo getSegmentInfoFromStatusID(long tweetID) { - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.All, Order.NEW_TO_OLD)) { - if (tweetID >= segmentInfo.getTimeSliceID()) { - return segmentInfo; - } - } - - return null; - } - - /** - * Removes the segment associated with the given timeslice ID from the segment manager. This will - * also take care of all required clean up related to the segment being removed, such as closing - * its writer. - */ - public boolean removeSegmentInfo(long timeSliceID) { - if (timeSliceID == getNewestTimeSliceID()) { - throw new RuntimeException("Cannot drop segment of current time-slice " + timeSliceID); - } - - ISegmentWriter removed = segmentWriters.get(timeSliceID); - if (removed == null) { - return false; - } - - LOG.info("Removing segment {}", removed.getSegmentInfo()); - Preconditions.checkState(!removed.getSegmentInfo().isEnabled()); - removed.getSegmentInfo().getIndexSegment().close(); - segmentWriters.remove(timeSliceID); - - String segmentName = removed.getSegmentInfo().getSegmentName(); - updateAllListeners("Removed segment " + segmentName); - LOG.info("Removed segment " + segmentName); - updateExportedSegmentStats(); - updateStats(); - return true; - } - - /** - * Add the given SegmentWriter into the segmentWriters map. - * If a segment with the same timesliceID already exists in the map, the old one is replaced - * with the new one; this should only happen in the archive. - * - * The replaced segment is destroyed after a delay to allow in-flight requests to finish. - */ - public ISegmentWriter putSegmentInfo(SegmentInfo info) { - ISegmentWriter usedSegmentWriter; - - SegmentWriter segmentWriter - = new SegmentWriter(info, searchIndexingMetricSet.updateFreshness); - - if (!info.isOptimized()) { - LOG.info("Inserting an optimizing segment writer for segment: {}", - info.getSegmentName()); - - usedSegmentWriter = new OptimizingSegmentWriter( - segmentWriter, - criticalExceptionHandler, - searchIndexingMetricSet, - indexCaughtUpMonitor); - } else { - usedSegmentWriter = segmentWriter; - } - - putSegmentWriter(usedSegmentWriter); - return usedSegmentWriter; - } - - private void putSegmentWriter(ISegmentWriter segmentWriter) { - SegmentInfo newSegmentInfo = segmentWriter.getSegmentInfo(); - SegmentInfo oldSegmentInfo = getSegmentInfo(newSegmentInfo.getTimeSliceID()); - - // Some sanity checks. - if (oldSegmentInfo != null) { - // This map is thread safe, so this put can be considered atomic. - segmentWriters.put(newSegmentInfo.getTimeSliceID(), segmentWriter); - LOG.info("Replaced SegmentInfo with a new one in segmentWriters map. 
" - + "Old SegmentInfo: {} New SegmentInfo: {}", oldSegmentInfo, newSegmentInfo); - - if (!oldSegmentInfo.isClosed()) { - oldSegmentInfo.deleteIndexSegmentDirectoryAfterDelay(); - } - } else { - long newestTimeSliceID = getNewestTimeSliceID(); - if (newestTimeSliceID != SegmentInfo.INVALID_ID - && newestTimeSliceID > newSegmentInfo.getTimeSliceID()) { - LOG.error("Not adding out-of-order segment " + newSegmentInfo); - return; - } - - segmentWriters.put(newSegmentInfo.getTimeSliceID(), segmentWriter); - LOG.info("Added segment " + newSegmentInfo); - } - - updateAllListeners("Added segment " + newSegmentInfo.getTimeSliceID()); - updateExportedSegmentStats(); - updateStats(); - } - - private SegmentInfo createSegmentInfo(long timesliceID) throws IOException { - PartitionConfig partitionConfig = dynamicPartitionConfig.getCurrentPartitionConfig(); - - TimeSlice timeSlice = new TimeSlice( - timesliceID, - maxSegmentSize, - partitionConfig.getIndexingHashPartitionID(), - partitionConfig.getNumPartitions()); - - SegmentInfo segmentInfo = - new SegmentInfo(timeSlice.getSegment(), earlybirdSegmentFactory, segmentSyncConfig); - - return segmentInfo; - } - - /** - * Create a new optimizing segment writer and add it to the map. - */ - public OptimizingSegmentWriter createAndPutOptimizingSegmentWriter( - long timesliceID) throws IOException { - SegmentInfo segmentInfo = createSegmentInfo(timesliceID); - - OptimizingSegmentWriter writer = new OptimizingSegmentWriter( - new SegmentWriter(segmentInfo, searchIndexingMetricSet.updateFreshness), - criticalExceptionHandler, - searchIndexingMetricSet, - indexCaughtUpMonitor); - - putSegmentWriter(writer); - return writer; - } - - /** - * Create a new segment writer. - */ - public SegmentWriter createSegmentWriter(long timesliceID) throws IOException { - SegmentInfo segmentInfo = createSegmentInfo(timesliceID); - - SegmentWriter writer = new SegmentWriter( - segmentInfo, searchIndexingMetricSet.updateFreshness); - - return writer; - } - - private void updateAllListeners(String message) { - List segmentInfos = segmentWriters.values().stream() - .map(ISegmentWriter::getSegmentInfo) - .collect(Collectors.toList()); - for (SegmentUpdateListener listener : updateListeners) { - try { - listener.update(segmentInfos, message); - } catch (Exception e) { - LOG.warn("SegmentManager: Unable to call update() on listener.", e); - } - } - } - - // Returns true if the map contains a SegmentInfo matching the given time slice. - public final boolean hasSegmentInfo(long timeSliceID) { - return segmentWriters.containsKey(timeSliceID); - } - - public void addUpdateListener(SegmentUpdateListener listener) { - updateListeners.add(listener); - } - - /** - * Look up the segment containing the given status id. - * If found, its timeslice id is returned. - * If none found, -1 is returned. - */ - public long lookupTimeSliceID(long statusID) throws IOException { - SegmentInfo segmentInfo = getSegmentInfoForID(statusID); - if (segmentInfo == null) { - return -1; - } - if (!segmentInfo.getIndexSegment().hasDocument(statusID)) { - return -1; - } - - return segmentInfo.getTimeSliceID(); - } - - /** - * Truncates the given segment list to the specified number of segments, by keeping the newest - * segments. - */ - @VisibleForTesting - public static List truncateSegmentList(List segmentList, int maxNumSegments) { - // Maybe cut-off the beginning of the sorted list of IDs. 
- if (maxNumSegments > 0 && maxNumSegments < segmentList.size()) { - return segmentList.subList(segmentList.size() - maxNumSegments, segmentList.size()); - } else { - return segmentList; - } - } - - @VisibleForTesting - public void setOffensive(long userID, boolean offensive) { - userTable.setOffensive(userID, offensive); - } - - @VisibleForTesting - public void setAntisocial(long userID, boolean antisocial) { - userTable.setAntisocial(userID, antisocial); - } - - /** - * Returns a searcher for all segments. - */ - public EarlybirdMultiSegmentSearcher getMultiSearcher(ImmutableSchemaInterface schemaSnapshot) - throws IOException { - return new EarlybirdMultiSegmentSearcher( - schemaSnapshot, - getSearchers(schemaSnapshot, Filter.All, Order.NEW_TO_OLD), - searcherStats, - clock); - } - - /** - * Returns a new searcher for the given segment. - */ - @Nullable - public EarlybirdLuceneSearcher getSearcher( - Segment segment, - ImmutableSchemaInterface schemaSnapshot) throws IOException { - return getSearcher(segment.getTimeSliceID(), schemaSnapshot); - } - - /** - * Get max tweet id across all enabled segments. - * @return max tweet id or -1 if none found - */ - public long getMaxTweetIdFromEnabledSegments() { - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.Enabled, Order.NEW_TO_OLD)) { - long maxTweetId = segmentInfo.getIndexSegment().getMaxTweetId(); - if (maxTweetId != -1) { - return maxTweetId; - } - } - - return -1; - } - - /** - * Create a tweet index searcher on the segment represented by the timeslice id. For production - * search session, the schema snapshot should be always passed in to make sure that the schema - * usage inside scoring is consistent. - * - * For non-production usage, like one-off debugging search, you can use the function call without - * the schema snapshot. - * - * @param timeSliceID the timeslice id, which represents the index segment - * @param schemaSnapshot the schema snapshot - * @return the tweet index searcher - */ - @Nullable - public EarlybirdSingleSegmentSearcher getSearcher( - long timeSliceID, - ImmutableSchemaInterface schemaSnapshot) throws IOException { - SegmentInfo segmentInfo = getSegmentInfo(timeSliceID); - if (segmentInfo == null) { - return null; - } - return segmentInfo.getIndexSegment().getSearcher(userTable, schemaSnapshot); - } - - /** - * Returns a new searcher for the segment with the given timeslice ID. If the given timeslice ID - * does not correspond to any active segment, {@code null} is returned. - * - * @param timeSliceID The segment's timeslice ID. - * @return A new searcher for the segment with the given timeslice ID. 
- */ - @Nullable - public EarlybirdSingleSegmentSearcher getSearcher(long timeSliceID) throws IOException { - SegmentInfo segmentInfo = getSegmentInfo(timeSliceID); - if (segmentInfo == null) { - return null; - } - return segmentInfo.getIndexSegment().getSearcher(userTable); - } - - @Nullable - public EarlybirdResponseCode checkSegment(Segment segment) { - return checkSegmentInternal(getSegmentInfo(segment.getTimeSliceID())); - } - - private static EarlybirdResponseCode checkSegmentInternal(SegmentInfo info) { - if (info == null) { - return EarlybirdResponseCode.PARTITION_NOT_FOUND; - } else if (info.isEnabled()) { - return EarlybirdResponseCode.SUCCESS; - } else { - return EarlybirdResponseCode.PARTITION_DISABLED; - } - } - - private List getSearchers( - ImmutableSchemaInterface schemaSnapshot, - Filter filter, - Order order) throws IOException { - List searchers = Lists.newArrayList(); - for (SegmentInfo segmentInfo : getSegmentInfos(filter, order)) { - EarlybirdSingleSegmentSearcher searcher = - segmentInfo.getIndexSegment().getSearcher(userTable, schemaSnapshot); - if (searcher != null) { - searchers.add(searcher); - } - } - return searchers; - } - - /** - * Gets metadata for segments for debugging purposes. - */ - public List getSegmentMetadata() { - List segmentMetadata = new ArrayList<>(); - for (SegmentInfo segment : getSegmentInfos(Filter.All, Order.OLD_TO_NEW)) { - segmentMetadata.add(segment.getSegmentMetadata()); - } - return segmentMetadata; - } - - /** - * Gets info for query caches to be displayed in an admin page. - */ - public String getQueryCachesData() { - StringBuilder output = new StringBuilder(); - for (SegmentInfo segment : getSegmentInfos(Filter.All, Order.OLD_TO_NEW)) { - output.append(segment.getQueryCachesData() + "\n"); - } - return output.toString(); - } - - /** - * Index the given user update. Returns false if the given update is skipped. - */ - public boolean indexUserUpdate(UserUpdate userUpdate) { - return userTable.indexUserUpdate(userUpdatesChecker, userUpdate); - } - - /** - * Index the given UserScrubGeoEvent. - * @param userScrubGeoEvent - */ - public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) { - userScrubGeoMap.indexUserScrubGeoEvent(userScrubGeoEvent); - } - - /** - * Return how many documents this segment manager has indexed in all of its enabled segments. - */ - public long getNumIndexedDocuments() { - // Order here doesn't matter, we just want all enabled segments, and allocate - // as little as needed. - long indexedDocs = 0; - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.Enabled, Order.OLD_TO_NEW)) { - indexedDocs += segmentInfo.getIndexSegment().getIndexStats().getStatusCount(); - } - return indexedDocs; - } - - /** - * Return how many partial updates this segment manager has applied - * in all of its enabled segments. - */ - public long getNumPartialUpdates() { - long partialUpdates = 0; - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.Enabled, Order.OLD_TO_NEW)) { - partialUpdates += segmentInfo.getIndexSegment().getIndexStats().getPartialUpdateCount(); - } - return partialUpdates; - } - - /** - * Returns the segment info for the segment containing the given tweet ID. - */ - public SegmentInfo getSegmentInfoForID(long tweetID) { - ISegmentWriter segmentWriter = getSegmentWriterForID(tweetID); - return segmentWriter == null ? null : segmentWriter.getSegmentInfo(); - } - - /** - * Returns the segment writer for the segment containing the given tweet ID. 
- */ - @Nullable - public ISegmentWriter getSegmentWriterForID(long tweetID) { - Map.Entry entry = segmentWriters.floorEntry(tweetID); - return entry == null ? null : entry.getValue(); - } - - /** - * Remove old segments until we have less than or equal to the number of max enabled segments. - */ - public void removeExcessSegments() { - int removedSegmentCount = 0; - while (segmentWriters.size() > getMaxEnabledSegments()) { - long timesliceID = getOldestEnabledTimeSliceID(); - disableSegment(timesliceID); - removeSegmentInfo(timesliceID); - removedSegmentCount += 1; - } - LOG.info("Segment manager removed {} excess segments", removedSegmentCount); - } - - /** - * Returns total index size on disk across all enabled segments in this segment manager. - */ - private long getTotalSegmentSizeOnDisk() { - long totalIndexSize = 0; - for (SegmentInfo segmentInfo : getSegmentInfos(Filter.Enabled, Order.OLD_TO_NEW)) { - totalIndexSize += segmentInfo.getIndexSegment().getIndexStats().getIndexSizeOnDiskInBytes(); - } - return totalIndexSize; - } - - @VisibleForTesting - ISegmentWriter getSegmentWriterWithoutCreationForTests(long timesliceID) { - return segmentWriters.get(timesliceID); - } - - @VisibleForTesting - ArrayList getTimeSliceIdsForTests() { - return new ArrayList(segmentWriters.keySet()); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.docx b/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.docx new file mode 100644 index 000000000..099b7ac88 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.java b/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.java deleted file mode 100644 index d06d0fffc..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentOptimizer.java +++ /dev/null @@ -1,60 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.EarlybirdStatus; - -public final class SegmentOptimizer { - private static final Logger LOG = LoggerFactory.getLogger(SegmentOptimizer.class); - - private static final String OPTIMIZING_SEGMENT_EVENT_PATTERN = "optimizing segment %s"; - private static final String OPTIMIZING_SEGMENT_GAUGE_PATTERN = "optimizing_segment_%s"; - - private SegmentOptimizer() { - } - - /** - * Optimize a segment. Returns whether optimization was successful. - */ - public static boolean optimize(SegmentInfo segmentInfo) { - try { - return optimizeThrowing(segmentInfo); - } catch (Exception e) { - // This is a bad situation, as earlybird can't run with too many un-optimized - // segments in memory. 
- LOG.error("Exception while optimizing segment " + segmentInfo.getSegmentName() + ": ", e); - segmentInfo.setFailedOptimize(); - return false; - } - } - - public static boolean needsOptimization(SegmentInfo segmentInfo) { - return segmentInfo.isComplete() && !segmentInfo.isOptimized() - && !segmentInfo.isFailedOptimize() && !segmentInfo.isIndexing(); - } - - private static boolean optimizeThrowing(SegmentInfo segmentInfo) throws IOException { - if (!needsOptimization(segmentInfo)) { - return false; - } - - String gaugeName = - String.format(OPTIMIZING_SEGMENT_GAUGE_PATTERN, segmentInfo.getSegmentName()); - SearchIndexingMetricSet.StartupMetric metric = - new SearchIndexingMetricSet.StartupMetric(gaugeName); - - String eventName = - String.format(OPTIMIZING_SEGMENT_EVENT_PATTERN, segmentInfo.getSegmentName()); - EarlybirdStatus.beginEvent(eventName, metric); - try { - segmentInfo.getIndexSegment().optimizeIndexes(); - } finally { - EarlybirdStatus.endEvent(eventName, metric); - } - - return true; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.docx b/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.docx new file mode 100644 index 000000000..b4d38904f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.java b/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.java deleted file mode 100644 index d7e9bd82e..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentSyncConfig.java +++ /dev/null @@ -1,218 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Collections; -import java.util.Date; -import java.util.Optional; -import java.util.concurrent.TimeUnit; - -import com.twitter.search.common.database.DatabaseConfig; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.schema.earlybird.FlushVersion; -import com.twitter.search.common.util.io.flushable.PersistentFile; -import com.twitter.search.earlybird.archive.ArchiveSegment; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.util.ScrubGenUtil; -import com.twitter.util.TwitterDateFormat; - -/** - * Encapsulates config information related to reading and writing segments to local filesystem or - * HDFS. - */ -public class SegmentSyncConfig { - public static final String LUCENE_DIR_PREFIX = "lucene_"; - - private final Optional scrubGen; - - public SegmentSyncConfig(Optional scrubGen) { - this.scrubGen = scrubGen; - String scrubGenStat = scrubGen.orElse("unset"); - SearchLongGauge.export("scrub_gen_" + scrubGenStat).set(1); - if (scrubGen.isPresent()) { - // Export a stat for the number of days between the scrub gen date and now - SearchCustomGauge.export("scrub_gen_age_in_days", () -> { - long scrubGenMillis = ScrubGenUtil.parseScrubGenToDate(scrubGen.get()).getTime(); - return TimeUnit.MILLISECONDS.toDays(System.currentTimeMillis() - scrubGenMillis); - }); - } - } - - /** - * Returns the file extension to be used for the current flush version. 
- */ - public String getVersionFileExtension() { - return FlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension(); - } - - /** - * Returns the threshold for how large a segment's status count must be at load time to be - * considered valid. - */ - public int getMinSegmentStatusCountThreshold() { - double minSegmentTweetCountProportionThreshold = - EarlybirdConfig.getDouble("min_segment_tweet_count_percentage_threshold", 0) / 100; - return (int) (EarlybirdConfig.getMaxSegmentSize() * minSegmentTweetCountProportionThreshold); - } - - /** - * Determines if this earlybird is allowed to flush segments to HDFS. - */ - public boolean isFlushToHdfsEnabled() { - return EarlybirdProperty.SEGMENT_FLUSH_TO_HDFS_ENABLED.get(false) - // Flush to HDFS is always disabled if FlushVersion is not official. - && FlushVersion.CURRENT_FLUSH_VERSION.isOfficial(); - } - - /** - * Determines if this earlybird is allowed to load segments from HDFS. - */ - public boolean isSegmentLoadFromHdfsEnabled() { - return EarlybirdProperty.SEGMENT_LOAD_FROM_HDFS_ENABLED.get(false); - } - - /** - * Determines if this earlybird is allowed to delete flushed segments. - */ - public boolean isDeleteFlushedSegmentsEnabled() { - return EarlybirdConfig.getBool("segment_dropper_delete_flushed", true); - } - - /** - * Returns the root of the segment directory on the local disk. - */ - public String getLocalSegmentSyncRootDir() { - return EarlybirdConfig.getString("segment_sync_dir", "partitions") - + getScrubGenFlushDirSuffix(); - } - - /** - * Returns the root of the segment directory on HDFS. - */ - public String getHdfsSegmentSyncRootDir() { - return EarlybirdProperty.HDFS_SEGMENT_SYNC_DIR.get("partitions") - + getScrubGenFlushDirSuffix(); - } - - /** - * Returns the HDFS root directory where all segments should be uploaded. - */ - public String getHdfsSegmentUploadRootDir() { - String hdfsSegmentUploadDir = EarlybirdProperty.HDFS_SEGMENT_UPLOAD_DIR.get(null); - return hdfsSegmentUploadDir != null - ? hdfsSegmentUploadDir + getScrubGenFlushDirSuffix() - : getHdfsSegmentSyncRootDir(); - } - - /** - * Returns the ZooKeeper path used for segment sync'ing. - */ - public String getZooKeeperSyncFullPath() { - return EarlybirdProperty.ZK_APP_ROOT.get() + "/" - + EarlybirdConfig.getString("segment_flush_sync_relative_path", "segment_flush_sync"); - } - - /** - * Returns the list of directories that should be persisted for this segment. - */ - public Collection getPersistentFileNames(SegmentInfo segment) { - return Collections.singleton(segment.getSegmentName()); - } - - /** - * Returns the list of all files that should be sync'ed for this segment. - */ - public Collection getAllSyncFileNames(SegmentInfo segment) { - Collection allFileNames = PersistentFile.getAllFileNames(segment.getSegmentName()); - if (segment.getEarlybirdIndexConfig().isIndexStoredOnDisk()) { - allFileNames = new ArrayList<>(allFileNames); - // Just the file name, not the full path - allFileNames.add(getLocalLuceneSyncDirFileName(segment.getSegment())); - } - return allFileNames; - } - - /** - * Returns the local sync directory for the given segment. - */ - public String getLocalSyncDirName(Segment segment) { - return getLocalSegmentSyncRootDir() + "/" + segment.getSegmentName() - + getVersionFileExtension(); - } - - /** - * Returns the local Lucene directory for the given segment. 
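To make the directory layout concrete, here is a standalone sketch of how these names compose: a configured root, an optional /scrubbed/&lt;gen&gt; suffix, then the segment name plus the flush-version extension, with on-disk archive segments additionally getting a lucene_&lt;end date&gt; subdirectory. All concrete values, including the "_v_158"-style extension, are hypothetical.

import java.util.Optional;

// Standalone sketch with hypothetical values; mirrors the string concatenation above.
public final class SyncDirNamingDemo {
  static String scrubGenSuffix(Optional<String> scrubGen) {
    return scrubGen.map(s -> "/scrubbed/" + s).orElse("");
  }

  static String localSyncDirName(String rootDir, Optional<String> scrubGen,
                                 String segmentName, String versionExtension) {
    return rootDir + scrubGenSuffix(scrubGen) + "/" + segmentName + versionExtension;
  }

  public static void main(String[] args) {
    String syncDir = localSyncDirName(
        "partitions", Optional.of("20221001"), "timeslice_1234_p_0_of_1", "_v_158");
    // partitions/scrubbed/20221001/timeslice_1234_p_0_of_1_v_158
    System.out.println(syncDir);
    // Archive segments keep their Lucene index in a dated subdirectory of the sync dir.
    System.out.println(syncDir + "/lucene_20221001");
  }
}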
- */ - public String getLocalLuceneSyncDirName(Segment segment) { - return getLocalSyncDirName(segment) + "/" + getLocalLuceneSyncDirFileName(segment); - } - - /** - * Returns the name (not the path) of the Lucene directory for the given segment. - */ - private String getLocalLuceneSyncDirFileName(Segment segment) { - if (segment instanceof ArchiveSegment) { - Date endDate = ((ArchiveSegment) segment).getDataEndDate(); - String endDateString = TwitterDateFormat.apply("yyyyMMdd").format(endDate); - return LUCENE_DIR_PREFIX + endDateString; - } else { - return LUCENE_DIR_PREFIX + "realtime"; - } - } - - /** - * Returns the HDFS sync directory for the given segment. - */ - public String getHdfsSyncDirNamePrefix(Segment segment) { - return getHdfsSegmentSyncRootDir() + "/" + segment.getSegmentName() - + getVersionFileExtension() + "*"; - } - - /** - * Returns the prefix of the HDFS directory where the files for this segment should be uploaded. - */ - public String getHdfsUploadDirNamePrefix(Segment segment) { - return getHdfsSegmentUploadRootDir() + "/" + segment.getSegmentName() - + getVersionFileExtension() + "*"; - } - - /** - * Returns the HDFS directory where the files for this segment should be uploaded. - */ - public String getHdfsFlushDirName(Segment segment) { - return getHdfsSegmentUploadRootDir() + "/" + segment.getSegmentName() - + getVersionFileExtension() + "_" + DatabaseConfig.getLocalHostname(); - } - - /** - * Returns a temp HDFS directory to be used for this segment. - */ - public String getHdfsTempFlushDirName(Segment segment) { - return getHdfsSegmentUploadRootDir() + "/temp_" - + DatabaseConfig.getLocalHostname() + "_" + segment.getSegmentName() - + getVersionFileExtension(); - } - - /** - * Concatenates the name of this segment with the flush version extension. - */ - public String getVersionedName(Segment segment) { - return segment.getSegmentName() + getVersionFileExtension(); - } - - private String getScrubGenFlushDirSuffix() { - return scrubGen - .map(s -> "/scrubbed/" + s) - .orElse(""); - } - - /** - * Returns the scrub gen set for this earlybird. - */ - public Optional getScrubGen() { - return scrubGen; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.docx b/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.docx new file mode 100644 index 000000000..1365e5fe9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.java b/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.java deleted file mode 100644 index f204882ca..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentSyncInfo.java +++ /dev/null @@ -1,113 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.google.common.annotations.VisibleForTesting; - -import com.twitter.search.common.partitioning.base.Segment; - -/** - * Representation for segment sync state, the local and hdfs file locations, as well as the - * current in-memory sync states maintained by earlybirds. - */ -public class SegmentSyncInfo { - // Is this segment loaded from disk? - private volatile boolean loaded = false; - // Has this segment been flushed to disk, and uploaded to HDFS if uploading is enabled? 
- private volatile boolean flushed = false; - // Time when the segment was flushed to local disk - private volatile long flushTimeMillis = 0; - - private final Segment segment; - private final SegmentSyncConfig syncConfig; - private final String localSyncDir; - private final String hdfsFlushDir; - private final String hdfsSyncDirPrefix; - private final String hdfsUploadDirPrefix; - private final String hdfsTempFlushDir; - - @VisibleForTesting - public SegmentSyncInfo(SegmentSyncConfig syncConfig, Segment segment) { - this.segment = segment; - this.syncConfig = syncConfig; - this.localSyncDir = syncConfig.getLocalSyncDirName(segment); - this.hdfsSyncDirPrefix = syncConfig.getHdfsSyncDirNamePrefix(segment); - this.hdfsUploadDirPrefix = syncConfig.getHdfsUploadDirNamePrefix(segment); - this.hdfsFlushDir = syncConfig.getHdfsFlushDirName(segment); - this.hdfsTempFlushDir = syncConfig.getHdfsTempFlushDirName(segment); - } - - public boolean isLoaded() { - return loaded; - } - - public boolean isFlushed() { - return flushed; - } - - public long getFlushTimeMillis() { - return flushTimeMillis; - } - - public String getLocalSyncDir() { - return localSyncDir; - } - - public SegmentSyncConfig getSegmentSyncConfig() { - return syncConfig; - } - - public String getLocalLuceneSyncDir() { - // For archive search this name depends on the end date of the segment, which can change, - // so we cannot pre-compute this in the constructor. - // This should only be used in the on-disk archive. - return syncConfig.getLocalLuceneSyncDirName(segment); - } - - public String getHdfsFlushDir() { - return hdfsFlushDir; - } - - public String getHdfsSyncDirPrefix() { - return hdfsSyncDirPrefix; - } - - public String getHdfsUploadDirPrefix() { - return hdfsUploadDirPrefix; - } - - public String getHdfsTempFlushDir() { - return hdfsTempFlushDir; - } - - public void setLoaded(boolean isLoaded) { - this.loaded = isLoaded; - } - - /** - * Stores the flushing state for this segment. - */ - public void setFlushed(boolean isFlushed) { - if (isFlushed) { - this.flushTimeMillis = System.currentTimeMillis(); - } - this.flushed = isFlushed; - } - - /** - * Adds debug information about the loaded and flushed status of this segment to the given - * StringBuilder. 
- */ - public void addDebugInfo(StringBuilder builder) { - builder.append("["); - int startLength = builder.length(); - if (loaded) { - builder.append("loaded, "); - } - if (flushed) { - builder.append("flushed, "); - } - if (startLength < builder.length()) { - builder.setLength(builder.length() - 2); - } - builder.append("]"); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentVulture.docx b/src/java/com/twitter/search/earlybird/partition/SegmentVulture.docx new file mode 100644 index 000000000..fa9035321 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentVulture.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentVulture.java b/src/java/com/twitter/search/earlybird/partition/SegmentVulture.java deleted file mode 100644 index 8a07b7f80..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentVulture.java +++ /dev/null @@ -1,380 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; - -import javax.annotation.Nonnull; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Sets; - -import org.apache.commons.io.FileUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.partitioning.base.Segment; -import com.twitter.search.common.schema.earlybird.FlushVersion; -import com.twitter.search.earlybird.archive.ArchiveSearchPartitionManager; -import com.twitter.search.earlybird.archive.ArchiveTimeSlicer; -import com.twitter.search.earlybird.archive.ArchiveTimeSlicer.ArchiveTimeSlice; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.factory.EarlybirdIndexConfigUtil; - -/** - * This class removes older flush version segments. - * Considering that we almost never increase status flush versions, old statuses are not cleaned up - * automatically. - */ -public final class SegmentVulture { - private static final Logger LOG = LoggerFactory.getLogger(SegmentVulture.class); - @VisibleForTesting // Not final for testing. - protected static int numIndexFlushVersionsToKeep = - EarlybirdConfig.getInt("number_of_flush_versions_to_keep", 2); - - private SegmentVulture() { - // this never gets called - } - - /** - * Delete old build generations, keep currentGeneration. - */ - @VisibleForTesting - static void removeOldBuildGenerations(String rootDirPath, String currentGeneration) { - File rootDir = new File(rootDirPath); - - if (!rootDir.exists() || !rootDir.isDirectory()) { - LOG.error("Root directory is invalid: " + rootDirPath); - return; - } - - File[] buildGenerations = rootDir.listFiles(); - - for (File generation : buildGenerations) { - if (generation.getName().equals(currentGeneration)) { - LOG.info("Skipping current generation: " + generation.getAbsoluteFile()); - continue; - } - - try { - FileUtils.deleteDirectory(generation); - LOG.info("Deleted old build generation: " + generation.getAbsolutePath()); - } catch (IOException e) { - LOG.error("Failed to delete old build generation at: " + generation.getAbsolutePath(), e); - } - } - LOG.info("Successfully deleted all old generations"); - } - - /** - * Delete all the timeslice data outside the serving range. 
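The cleanup routines that follow all key off information encoded in segment directory names: a timeslice ID plus the partition layout. The real parsing lives in Segment/SegmentInfo; this standalone sketch assumes a timeslice_&lt;id&gt;_p_&lt;i&gt;_of_&lt;n&gt; naming convention purely for illustration.

import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Standalone sketch; the timeslice_<id>_p_<i>_of_<n> naming is an assumption here.
public final class SegmentNameParsingDemo {
  private static final Pattern NAME = Pattern.compile("timeslice_(\\d+)_p_(\\d+)_of_(\\d+)");

  public static void main(String[] args) {
    Matcher m = NAME.matcher("timeslice_1234_p_0_of_2");
    if (m.matches()) {
      long timesliceId = Long.parseLong(m.group(1));
      int partition = Integer.parseInt(m.group(2));
      int numPartitions = Integer.parseInt(m.group(3));
      // Keep the segment only if the timeslice is inside the serving range and the
      // partition layout matches this host; everything else is deleted to reclaim disk.
      System.out.println("timeslice " + timesliceId + ", partition " + partition
          + " of " + numPartitions);
    }
  }
}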
- */ - @VisibleForTesting - static void removeArchiveTimesliceOutsideServingRange(PartitionConfig partitionConfig, - ArchiveTimeSlicer timeSlicer, SegmentSyncConfig segmentSyncConfig) { - try { - long servingStartTimesliceId = Long.MAX_VALUE; - long servingEndTimesliceId = 0; - int partitionID = partitionConfig.getIndexingHashPartitionID(); - List timeSliceList = timeSlicer.getTimeSlicesInTierRange(); - for (ArchiveTimeSlice timeSlice : timeSliceList) { - if (timeSlice.getMinStatusID(partitionID) < servingStartTimesliceId) { - servingStartTimesliceId = timeSlice.getMinStatusID(partitionID); - } - if (timeSlice.getMaxStatusID(partitionID) > servingEndTimesliceId) { - servingEndTimesliceId = timeSlice.getMaxStatusID(partitionID); - } - } - LOG.info("Got the serving range: [" + servingStartTimesliceId + ", " - + servingEndTimesliceId + "], " + "[" + partitionConfig.getTierStartDate() + ", " - + partitionConfig.getTierEndDate() + ") for tier: " + partitionConfig.getTierName()); - - // The tier configuration does not have valid serving range: do not do anything. - if (servingEndTimesliceId <= servingStartTimesliceId) { - LOG.error("Invalid serving range [" + partitionConfig.getTierStartDate() + ", " - + partitionConfig.getTierEndDate() + "] for tier: " + partitionConfig.getTierName()); - return; - } - - int numDeleted = 0; - File[] segments = getSegmentsOnRootDir(segmentSyncConfig); - for (File segment : segments) { - String segmentName = SegmentInfo.getSegmentNameFromFlushedDir(segment.getName()); - if (segmentName == null) { - LOG.error("Invalid directory for segments: " + segment.getAbsolutePath()); - continue; - } - long timesliceId = Segment.getTimeSliceIdFromName(segmentName); - if (timesliceId < 0) { - LOG.error("Unknown dir/file found: " + segment.getAbsolutePath()); - continue; - } - - if (timesliceId < servingStartTimesliceId || timesliceId > servingEndTimesliceId) { - LOG.info(segment.getAbsolutePath() + " will be deleted for outside serving Range[" - + partitionConfig.getTierStartDate() + ", " + partitionConfig.getTierEndDate() + ")"); - if (deleteSegment(segment)) { - numDeleted++; - } - } - } - LOG.info("Deleted " + numDeleted + " segments out of " + segments.length + " segments"); - } catch (IOException e) { - LOG.error("Can not timeslice based on the document data: ", e); - throw new RuntimeException(e); - } - } - - /** - * Deleted segments from other partitions. When boxes are moved between - * partitions, segments from other partitions may stay, we will have to - * delete them. - */ - @VisibleForTesting - static void removeIndexesFromOtherPartitions(int myPartition, int numPartitions, - SegmentSyncConfig segmentSyncConfig) { - File[] segments = getSegmentsOnRootDir(segmentSyncConfig); - int numDeleted = 0; - for (File segment : segments) { - int segmentNumPartitions = Segment.numPartitionsFromName(segment.getName()); - int segmentPartition = Segment.getPartitionFromName(segment.getName()); - - if (segmentNumPartitions < 0 || segmentPartition < 0) { // Not a segment file, ignoring - LOG.info("Unknown dir/file found: " + segment.getAbsolutePath()); - continue; - } - - if (segmentNumPartitions != numPartitions || segmentPartition != myPartition) { - if (deleteSegment(segment)) { - numDeleted++; - } - } - } - LOG.info("Deleted " + numDeleted + " segments out of " + segments.length + " segments"); - } - - /** - * Delete flushed segments of older flush versions. 
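The flush-version cleanup that follows boils down to: collect every flush version present on disk, keep the newest N, and delete the rest. A standalone sketch of that selection, assuming a "_v_&lt;version&gt;" delimiter purely for illustration:

import java.util.TreeSet;

// Standalone sketch; the "_v_" delimiter and the directory names are assumptions here.
public final class FlushVersionCleanupDemo {
  public static void main(String[] args) {
    String[] dirNames = {"segment_a_v_156", "segment_a_v_157", "segment_b_v_158", "not_a_segment"};

    TreeSet<Integer> versions = new TreeSet<>();
    for (String name : dirNames) {
      String[] parts = name.split("_v_");
      if (parts.length != 2) {
        continue; // no flush version encoded in this name
      }
      try {
        versions.add(Integer.parseInt(parts[1]));
      } catch (NumberFormatException e) {
        // malformed version suffix; ignore it
      }
    }

    int versionsToKeep = 2;
    TreeSet<Integer> keep = new TreeSet<>();
    for (Integer version : versions.descendingSet()) {
      if (keep.size() == versionsToKeep) {
        break;
      }
      keep.add(version);
    }
    System.out.println("versions on disk: " + versions); // [156, 157, 158]
    System.out.println("versions to keep: " + keep);     // [157, 158]; the rest get deleted
  }
}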
- */ - @VisibleForTesting - static void removeOldFlushVersionIndexes(int currentFlushVersion, - SegmentSyncConfig segmentSyncConfig) { - SortedSet indexFlushVersions = - listFlushVersions(segmentSyncConfig, currentFlushVersion); - - if (indexFlushVersions == null - || indexFlushVersions.size() <= numIndexFlushVersionsToKeep) { - return; - } - - Set suffixesToKeep = Sets.newHashSetWithExpectedSize(numIndexFlushVersionsToKeep); - int flushVersionsToKeep = numIndexFlushVersionsToKeep; - while (flushVersionsToKeep > 0 && !indexFlushVersions.isEmpty()) { - Integer oldestFlushVersion = indexFlushVersions.last(); - String flushFileExtension = FlushVersion.getVersionFileExtension(oldestFlushVersion); - if (flushFileExtension != null) { - suffixesToKeep.add(flushFileExtension); - flushVersionsToKeep--; - } else { - LOG.warn("Found unknown flush versions: " + oldestFlushVersion - + " Segments with this flush version will be deleted to recover disk space."); - } - indexFlushVersions.remove(oldestFlushVersion); - } - - String segmentSyncRootDir = segmentSyncConfig.getLocalSegmentSyncRootDir(); - File dir = new File(segmentSyncRootDir); - File[] segments = dir.listFiles(); - - for (File segment : segments) { - boolean keepSegment = false; - for (String suffix : suffixesToKeep) { - if (segment.getName().endsWith(suffix)) { - keepSegment = true; - break; - } - } - if (!keepSegment) { - try { - FileUtils.deleteDirectory(segment); - LOG.info("Deleted old flushed segment: " + segment.getAbsolutePath()); - } catch (IOException e) { - LOG.error("Failed to delete old flushed segment.", e); - } - } - } - } - - private static File[] getSegmentsOnRootDir(SegmentSyncConfig segmentSyncConfig) { - String segmentSyncRootDir = segmentSyncConfig.getLocalSegmentSyncRootDir(); - File dir = new File(segmentSyncRootDir); - File[] segments = dir.listFiles(); - if (segments == null) { - return new File[0]; - } else { - return segments; - } - } - - private static boolean deleteSegment(File segment) { - try { - FileUtils.deleteDirectory(segment); - LOG.info("Deleted segment from other partition: " + segment.getAbsolutePath()); - return true; - } catch (IOException e) { - LOG.error("Failed to delete segment from other partition.", e); - return false; - } - } - - // Returns FlushVersions found on disk. - // Current FlushVersion is always added into the list, even if segments are not found on disk, - // because they may not have appeared yet. - @Nonnull - @VisibleForTesting - static SortedSet listFlushVersions(SegmentSyncConfig sync, int currentFlushVersion) { - TreeSet flushVersions = Sets.newTreeSet(); - - // Always add current flush version. - // It is possible that on startup when this is run, the current flush version - // segments have not appeared yet. 
- flushVersions.add(currentFlushVersion); - - String segmentSyncRootDir = sync.getLocalSegmentSyncRootDir(); - File dir = new File(segmentSyncRootDir); - if (!dir.exists()) { - LOG.info("segmentSyncRootDir [" + segmentSyncRootDir - + "] does not exist"); - return flushVersions; - } - if (!dir.isDirectory()) { - LOG.error("segmentSyncRootDir [" + segmentSyncRootDir - + "] does not point to a directory"); - return flushVersions; - } - if (!dir.canRead()) { - LOG.error("No permission to read from segmentSyncRootDir [" - + segmentSyncRootDir + "]"); - return flushVersions; - } - if (!dir.canWrite()) { - LOG.error("No permission to write to segmentSyncRootDir [" - + segmentSyncRootDir + "]"); - return flushVersions; - } - - File[] segments = dir.listFiles(); - for (File segment : segments) { - String name = segment.getName(); - if (!name.contains(FlushVersion.DELIMITER)) { - // This is a not a segment with a FlushVersion, skip. - LOG.info("Found segment directory without a flush version: " + name); - continue; - } - String[] nameSplits = name.split(FlushVersion.DELIMITER); - if (nameSplits.length != 2) { - LOG.warn("Found segment with bad name: " + segment.getAbsolutePath()); - continue; - } - - // Second half contains flush version - try { - int flushVersion = Integer.parseInt(nameSplits[1]); - flushVersions.add(flushVersion); - } catch (NumberFormatException e) { - LOG.warn("Bad flush version number in segment name: " + segment.getAbsolutePath()); - } - } - return flushVersions; - } - - /** - * Removes old segments in the current build gen. - */ - @VisibleForTesting - static void removeOldSegments(SegmentSyncConfig sync) { - if (!sync.getScrubGen().isPresent()) { - return; - } - - File currentScrubGenSegmentDir = new File(sync.getLocalSegmentSyncRootDir()); - - // The unscrubbed segment root directory, used for rebuilds and for segments created before - // we introduced scrub gens. The getLocalSegmentSyncRootDir should be something like: - // $unscrubbedSegmentDir/scrubbed/$scrub_gen/, - // get unscrubbedSegmentDir from string name here in case scrubbed dir does not exist yet - File unscrubbedSegmentDir = new File(sync.getLocalSegmentSyncRootDir().split("scrubbed")[0]); - if (!unscrubbedSegmentDir.exists()) { - // For a new host that swapped in, it might not have flushed_segment dir yet. - // return directly in that case. - LOG.info(unscrubbedSegmentDir.getAbsoluteFile() + "does not exist, nothing to remove."); - return; - } - Preconditions.checkArgument(unscrubbedSegmentDir.exists()); - for (File file : unscrubbedSegmentDir.listFiles()) { - if (file.getName().matches("scrubbed")) { - continue; - } - try { - LOG.info("Deleting old unscrubbed segment: " + file.getAbsolutePath()); - FileUtils.deleteDirectory(file); - } catch (IOException e) { - LOG.error("Failed to delete directory: " + file.getPath(), e); - } - } - - // Delete all segments from previous scrub generations. - File allScrubbedSegmentsDir = currentScrubGenSegmentDir.getParentFile(); - if (allScrubbedSegmentsDir.exists()) { - for (File file : allScrubbedSegmentsDir.listFiles()) { - if (file.getPath().equals(currentScrubGenSegmentDir.getPath())) { - continue; - } - try { - LOG.info("Deleting old scrubbed segment: " + file.getAbsolutePath()); - FileUtils.deleteDirectory(file); - } catch (IOException e) { - LOG.error("Failed to delete directory: " + file.getPath(), e); - } - } - } - } - - /** - * Removes the data for all unused segments from the local disk. 
This includes: - * - data for old segments - * - data for segments belonging to another partition - * - data for segments belonging to a different flush version. - */ - public static void removeUnusedSegments( - PartitionManager partitionManager, - PartitionConfig partitionConfig, - int schemaMajorVersion, - SegmentSyncConfig segmentSyncConfig) { - - if (EarlybirdIndexConfigUtil.isArchiveSearch()) { - removeOldBuildGenerations( - EarlybirdConfig.getString("root_dir"), - EarlybirdConfig.getString("offline_segment_build_gen") - ); - removeOldSegments(segmentSyncConfig); - - Preconditions.checkState(partitionManager instanceof ArchiveSearchPartitionManager); - removeArchiveTimesliceOutsideServingRange( - partitionConfig, - ((ArchiveSearchPartitionManager) partitionManager).getTimeSlicer(), segmentSyncConfig); - } - - // Remove segments from other partitions - removeIndexesFromOtherPartitions( - partitionConfig.getIndexingHashPartitionID(), - partitionConfig.getNumPartitions(), segmentSyncConfig); - - // Remove old flushed segments - removeOldFlushVersionIndexes(schemaMajorVersion, segmentSyncConfig); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.docx b/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.docx new file mode 100644 index 000000000..cb5c58a56 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.java b/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.java deleted file mode 100644 index 7d59e5618..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentWarmer.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; - -public class SegmentWarmer { - private static final Logger LOG = LoggerFactory.getLogger(SegmentWarmer.class); - - private final CriticalExceptionHandler criticalExceptionHandler; - - public SegmentWarmer(CriticalExceptionHandler criticalExceptionHandler) { - this.criticalExceptionHandler = criticalExceptionHandler; - } - - private boolean shouldWarmSegment(SegmentInfo segmentInfo) { - return segmentInfo.isEnabled() - && segmentInfo.isComplete() - && segmentInfo.isOptimized() - && !segmentInfo.isIndexing(); - } - - /** - * Warms a segment if it is ready to be warmed. Only has an affect on Archive Lucene segments. - */ - public boolean warmSegmentIfNecessary(SegmentInfo segmentInfo) { - if (!shouldWarmSegment(segmentInfo)) { - return false; - } - try { - segmentInfo.getIndexSegment().warmSegment(); - return true; - } catch (IOException e) { - // This is a bad situation, as earlybird can't search a segment that hasn't been warmed up - // So we delete the bad segment, and restart the earlybird if it's in starting phrase, - // otherwise alert. - LOG.error("Failed to warmup segment " + segmentInfo.getSegmentName() - + ". 
Will destroy local unreadable segment.", e); - segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately(); - - criticalExceptionHandler.handle(this, e); - - return false; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentWriter.docx b/src/java/com/twitter/search/earlybird/partition/SegmentWriter.docx new file mode 100644 index 000000000..7e04205db Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SegmentWriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SegmentWriter.java b/src/java/com/twitter/search/earlybird/partition/SegmentWriter.java deleted file mode 100644 index 46103840f..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SegmentWriter.java +++ /dev/null @@ -1,239 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.EnumMap; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; - -import com.google.common.collect.HashBasedTable; -import com.google.common.collect.Table; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.Percentile; -import com.twitter.search.common.metrics.PercentileUtil; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent; -import com.twitter.search.common.schema.thriftjava.ThriftIndexingEventType; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.document.DocumentFactory; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.index.EarlybirdSegment; -import com.twitter.util.Time; - -public class SegmentWriter implements ISegmentWriter { - - // helper, used for collecting stats - enum FailureReason { - FAILED_INSERT, - FAILED_FOR_TWEET_IN_INDEX, - FAILED_FOR_COMPLETE_SEGMENT - } - - private static final String STAT_PREFIX = "segment_writer_"; - private static final String EVENT_COUNTER = STAT_PREFIX + "%s_%s_segment_%s"; - private static final String EVENT_COUNTER_ALL_SEGMENTS = STAT_PREFIX + "%s_%s_all_segments"; - private static final String EVENT_TIMERS = STAT_PREFIX + "%s_timing"; - private static final String DROPPED_UPDATES_FOR_DISABLED_SEGMENTS = - STAT_PREFIX + "%s_dropped_updates_for_disabled_segments"; - private static final String INDEXING_LATENCY = - STAT_PREFIX + "%s_indexing_latency_ms"; - - private final byte penguinVersion; - private final DocumentFactory updateFactory; - private final DocumentFactory documentFactory; - private final SearchRateCounter missingPenguinVersion; - private final EarlybirdSegment earlybirdSegment; - private final SegmentInfo segmentInfo; - // Stores per segment counters for each (indexing event type, result) pair - // Example stat name - // "segment_writer_partial_update_success_segment_twttr_search_test_start_%d_p_0_of_1" - private final Table statsForUpdateType = - HashBasedTable.create(); - // Stores aggregated counters for each (indexing event type, result) pair across all segments - // Example stat name - // "segment_writer_partial_update_success_all_segments" - private final Table - aggregateStatsForUpdateType = HashBasedTable.create(); - // Stores per segment counters for each (indexing event 
type, non-retryable failure reason) pair - // Example stat name - // "segment_writer_partial_update_failed_for_tweet_in_index_segment_twttr_search_t_%d_p_0_of_1" - private final Table - failureStatsForUpdateType = HashBasedTable.create(); - // Stores aggregated counters for each (indexing event type, non-retryable failure reason) pair - // Example stat name - // "segment_writer_partial_update_failed_for_tweet_in_index_all_segments" - private final Table - aggregateFailureStatsForUpdateType = HashBasedTable.create(); - private final EnumMap eventTimers = - new EnumMap<>(ThriftIndexingEventType.class); - private final EnumMap - droppedUpdatesForDisabledSegments = new EnumMap<>(ThriftIndexingEventType.class); - // We pass this stat from the SearchIndexingMetricSet so that we can share the atomic longs - // between all SegmentWriters and export the largest freshness value across all segments. - private final EnumMap updateFreshness; - private final EnumMap> indexingLatency = - new EnumMap<>(ThriftIndexingEventType.class); - - public SegmentWriter( - SegmentInfo segmentInfo, - EnumMap updateFreshness - ) { - this.segmentInfo = segmentInfo; - this.updateFreshness = updateFreshness; - this.earlybirdSegment = segmentInfo.getIndexSegment(); - this.penguinVersion = EarlybirdConfig.getPenguinVersionByte(); - this.updateFactory = segmentInfo.getEarlybirdIndexConfig().createUpdateFactory(); - this.documentFactory = segmentInfo.getEarlybirdIndexConfig().createDocumentFactory(); - - String segmentName = segmentInfo.getSegmentName(); - for (ThriftIndexingEventType type : ThriftIndexingEventType.values()) { - for (Result result : Result.values()) { - String stat = String.format(EVENT_COUNTER, type, result, segmentName).toLowerCase(); - statsForUpdateType.put(type, result, SearchRateCounter.export(stat)); - - String aggregateStat = - String.format(EVENT_COUNTER_ALL_SEGMENTS, type, result).toLowerCase(); - aggregateStatsForUpdateType.put(type, result, SearchRateCounter.export(aggregateStat)); - } - - for (FailureReason reason : FailureReason.values()) { - String stat = String.format(EVENT_COUNTER, type, reason, segmentName).toLowerCase(); - failureStatsForUpdateType.put(type, reason, SearchRateCounter.export(stat)); - - String aggregateStat = - String.format(EVENT_COUNTER_ALL_SEGMENTS, type, reason).toLowerCase(); - aggregateFailureStatsForUpdateType.put( - type, reason, SearchRateCounter.export(aggregateStat)); - } - - eventTimers.put(type, SearchTimerStats.export( - String.format(EVENT_TIMERS, type).toLowerCase(), - TimeUnit.MICROSECONDS, - false)); - droppedUpdatesForDisabledSegments.put( - type, - SearchRateCounter.export( - String.format(DROPPED_UPDATES_FOR_DISABLED_SEGMENTS, type).toLowerCase())); - indexingLatency.put( - type, - PercentileUtil.createPercentile( - String.format(INDEXING_LATENCY, type).toLowerCase())); - } - - this.missingPenguinVersion = SearchRateCounter.export( - "documents_without_current_penguin_version_" + penguinVersion + "_" + segmentName); - } - - @Override - public synchronized Result indexThriftVersionedEvents(ThriftVersionedEvents tve) - throws IOException { - if (!tve.getVersionedEvents().containsKey(penguinVersion)) { - missingPenguinVersion.increment(); - return Result.FAILURE_NOT_RETRYABLE; - } - - ThriftIndexingEvent tie = tve.getVersionedEvents().get(penguinVersion); - ThriftIndexingEventType eventType = tie.getEventType(); - - if (!segmentInfo.isEnabled()) { - droppedUpdatesForDisabledSegments.get(eventType).increment(); - return Result.SUCCESS; - } - - 
SearchTimerStats timerStats = eventTimers.get(eventType); - SearchTimer timer = timerStats.startNewTimer(); - - long tweetId = tve.getId(); - Result result = tryApplyIndexingEvent(tweetId, tie); - - if (result == Result.SUCCESS) { - long tweetAgeInMs = SnowflakeIdParser.getTimestampFromTweetId(tweetId); - - AtomicLong freshness = updateFreshness.get(tie.getEventType()); - // Note that this is racy at startup because we don't do an atomic swap, but it will be - // approximately accurate, and this stat doesn't matter until we are current. - if (freshness.get() < tweetAgeInMs) { - freshness.set(tweetAgeInMs); - } - - if (tie.isSetCreateTimeMillis()) { - long age = Time.now().inMillis() - tie.getCreateTimeMillis(); - indexingLatency.get(tie.getEventType()).record(age); - } - } - - statsForUpdateType.get(eventType, result).increment(); - aggregateStatsForUpdateType.get(eventType, result).increment(); - timerStats.stopTimerAndIncrement(timer); - - return result; - } - - public SegmentInfo getSegmentInfo() { - return segmentInfo; - } - - public boolean hasTweet(long tweetId) throws IOException { - return earlybirdSegment.hasDocument(tweetId); - } - - private Result tryApplyIndexingEvent(long tweetId, ThriftIndexingEvent tie) throws IOException { - if (applyIndexingEvent(tie, tweetId)) { - return Result.SUCCESS; - } - - if (tie.getEventType() == ThriftIndexingEventType.INSERT) { - // We don't retry inserts - incrementFailureStats(tie, FailureReason.FAILED_INSERT); - return Result.FAILURE_NOT_RETRYABLE; - } - - if (earlybirdSegment.hasDocument(tweetId)) { - // An update fails to be applied for a tweet that is in the index. - incrementFailureStats(tie, FailureReason.FAILED_FOR_TWEET_IN_INDEX); - return Result.FAILURE_NOT_RETRYABLE; - } - - if (segmentInfo.isComplete()) { - // An update is directed at a tweet that is not in the segment (hasDocument(tweetId) failed), - // and the segment is complete (i.e. there will never be new tweets for this segment). 
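Distilled into a standalone sketch, the retry decision applied in these branches is: inserts are never retried, an update for a tweet that is already indexed is never retried, an update against a complete segment is never retried, and anything else may succeed once the tweet arrives. The enum and method below are illustrative, not the production Result type.

// Standalone sketch of the retry decision; names are illustrative only.
public final class UpdateRetryPolicyDemo {
  enum Outcome { FAILURE_RETRYABLE, FAILURE_NOT_RETRYABLE }

  static Outcome classifyFailedEvent(boolean isInsert, boolean tweetInIndex, boolean segmentComplete) {
    if (isInsert) {
      return Outcome.FAILURE_NOT_RETRYABLE;  // inserts are never retried
    }
    if (tweetInIndex) {
      return Outcome.FAILURE_NOT_RETRYABLE;  // the tweet exists, so the update itself is bad
    }
    if (segmentComplete) {
      return Outcome.FAILURE_NOT_RETRYABLE;  // the tweet can never arrive in a complete segment
    }
    return Outcome.FAILURE_RETRYABLE;        // the tweet may still arrive; retry later
  }

  public static void main(String[] args) {
    System.out.println(classifyFailedEvent(false, false, false)); // FAILURE_RETRYABLE
    System.out.println(classifyFailedEvent(false, false, true));  // FAILURE_NOT_RETRYABLE
  }
}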
- incrementFailureStats(tie, FailureReason.FAILED_FOR_COMPLETE_SEGMENT); - return Result.FAILURE_NOT_RETRYABLE; - } - - // The tweet may arrive later for this event, so it's possible a later try will succeed - return Result.FAILURE_RETRYABLE; - } - - private void incrementFailureStats(ThriftIndexingEvent tie, FailureReason failureReason) { - failureStatsForUpdateType.get(tie.getEventType(), failureReason).increment(); - aggregateFailureStatsForUpdateType.get(tie.getEventType(), failureReason).increment(); - } - - private boolean applyIndexingEvent(ThriftIndexingEvent tie, long tweetId) throws IOException { - switch (tie.getEventType()) { - case OUT_OF_ORDER_APPEND: - return earlybirdSegment.appendOutOfOrder(updateFactory.newDocument(tie), tweetId); - case PARTIAL_UPDATE: - return earlybirdSegment.applyPartialUpdate(tie); - case DELETE: - return earlybirdSegment.delete(tweetId); - case INSERT: - earlybirdSegment.addDocument(buildInsertDocument(tie, tweetId)); - return true; - default: - throw new IllegalArgumentException("Unexpected update type: " + tie.getEventType()); - } - } - - private TweetDocument buildInsertDocument(ThriftIndexingEvent tie, long tweetId) { - return new TweetDocument( - tweetId, - segmentInfo.getTimeSliceID(), - tie.getCreateTimeMillis(), - documentFactory.newDocument(tie)); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.docx b/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.docx new file mode 100644 index 000000000..5da184c01 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.java b/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.java deleted file mode 100644 index c96f7373c..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SimpleSegmentIndexer.java +++ /dev/null @@ -1,191 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -import javax.annotation.Nullable; - -import com.google.common.base.Stopwatch; - - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.document.TweetDocument; -import com.twitter.search.earlybird.index.EarlybirdSegment; - -/** - * SimpleSegmentIndex indexes all Tweets for a *complete* segment. It does not index any updates or - * deletes. - */ -public class SimpleSegmentIndexer { - private static final Logger LOG = LoggerFactory.getLogger(SimpleSegmentIndexer.class); - - /** - * If not null, this segment is appended at the end after indexing finishes. - */ - @Nullable - private final SegmentInfo segmentToAppend; - - private final RecordReader tweetReader; - private final SearchIndexingMetricSet partitionIndexingMetricSet; - - // Segment we are indexing. - private EarlybirdSegment indexingSegment; - - // Total number of statuses indexed in this segment. 
- private long segmentSize = 0; - - public SimpleSegmentIndexer( - RecordReader tweetReader, - SearchIndexingMetricSet partitionIndexingMetricSet) { - this(tweetReader, partitionIndexingMetricSet, null); - } - - public SimpleSegmentIndexer(RecordReader tweetReader, - SearchIndexingMetricSet partitionIndexingMetricSet, - @Nullable SegmentInfo segmentToAppend) { - this.tweetReader = tweetReader; - this.segmentToAppend = segmentToAppend; - this.partitionIndexingMetricSet = partitionIndexingMetricSet; - } - - private boolean shouldIndexSegment(SegmentInfo segmentInfo) { - if (!segmentInfo.isEnabled()) { - return false; - } - - if (segmentToAppend != null) { - return true; - } - - return !segmentInfo.isComplete() - && !segmentInfo.isIndexing() - && !segmentInfo.getSyncInfo().isLoaded(); - } - - /** - * Indexes all tweets for a complete segment. - */ - public boolean indexSegment(SegmentInfo segmentInfo) { - LOG.info("Indexing segment " + segmentInfo.getSegmentName()); - if (!shouldIndexSegment(segmentInfo)) { - return false; - } - - // If we're starting to index, we're not complete, will become complete if we - // were successful here. - segmentInfo.setComplete(false); - - try { - segmentInfo.setIndexing(true); - indexingSegment = segmentInfo.getIndexSegment(); - - // if we're updating the segment, then we'll index only the new available days - // and then append the lucene index from the old segment - // If segmentToAppend is not null, it means we are updating a segment. - if (indexingSegment.tryToLoadExistingIndex()) { - segmentInfo.getSyncInfo().setLoaded(true); - LOG.info("Loaded existing index for " + segmentInfo + ", not indexing."); - } else { - indexingLoop(); - if (segmentToAppend != null) { - indexingSegment.append(segmentToAppend.getIndexSegment()); - } - } - - segmentInfo.setIndexing(false); - segmentInfo.setComplete(true); - segmentInfo.setWasIndexed(true); - LOG.info("Successfully indexed segment " + segmentInfo.getSegmentName()); - return true; - } catch (Exception e) { - LOG.error("Exception while indexing IndexSegment " + segmentInfo - + " after " + indexingSegment.getIndexStats().getStatusCount() + " documents.", e); - partitionIndexingMetricSet.simpleSegmentIndexerExceptionCounter.increment(); - - LOG.warn("Failed to load a new day into full archive. Cleaning up segment: " - + indexingSegment.getSegmentName()); - - // Clean up the lucene dir if it exists. Earlybird will retry loading the new day again later. - if (!segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately()) { - LOG.error("Failed to clean up index segment folder after indexing failures."); - } - - return false; - } finally { - if (tweetReader != null) { - tweetReader.stop(); - } - segmentInfo.setIndexing(false); - } - } - - // Indexes a document if available. Returns true if index was updated. - protected boolean indexDocument(TweetDocument tweetDocument) throws IOException { - if (tweetDocument == null) { - return false; - } - - SearchTimer timer = partitionIndexingMetricSet.statusStats.startNewTimer(); - indexingSegment.addDocument(tweetDocument); - partitionIndexingMetricSet.statusStats.stopTimerAndIncrement(timer); - segmentSize++; - return true; - } - - /** - * Indexes all tweets for this segment, until no more tweets are available. - * - * @throws InterruptedException If the thread is interrupted while indexing tweets. - * @throws IOException If there's a problem reading or indexing tweets. 
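A simplified standalone sketch of the loop documented here, with a toy reader standing in for RecordReader: index documents until the reader is exhausted, back off briefly when nothing is ready yet, and stop early once a configured maximum segment size is reached. The cap value is hypothetical; in production it comes from EarlybirdConfig.getMaxSegmentSize().

// Standalone sketch of the read/index loop; the "reader" is a toy stand-in.
public final class IndexingLoopDemo {
  private static int remaining = 25;

  // Stand-in for RecordReader.readNext(): occasionally returns null to mimic
  // "no document available yet".
  static String readNext() {
    if (remaining <= 0 || Math.random() < 0.1) {
      return null;
    }
    remaining--;
    return "tweet-" + remaining;
  }

  static boolean exhausted() {
    return remaining <= 0;
  }

  public static void main(String[] args) throws InterruptedException {
    int maxSegmentSize = 10; // hypothetical cap
    long segmentSize = 0;

    while (!exhausted() && !Thread.currentThread().isInterrupted()) {
      String doc = readNext();
      if (doc == null) {
        Thread.sleep(10); // no document waiting; take a nap
        continue;
      }
      segmentSize++; // "index" the document
      if (segmentSize >= maxSegmentSize) {
        System.out.println("Reached max segment size " + segmentSize + ", stopping indexer");
        break;
      }
    }
    System.out.println("Indexed " + segmentSize + " documents");
  }
}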
- */ - public void indexingLoop() throws InterruptedException, IOException { - Stopwatch stopwatch = Stopwatch.createStarted(); - - Stopwatch readingStopwatch = Stopwatch.createUnstarted(); - Stopwatch indexingStopwatch = Stopwatch.createUnstarted(); - - int indexedDocumentsCount = 0; - SearchLongGauge timeToIndexSegment = SearchLongGauge.export("time_to_index_segment"); - timeToIndexSegment.set(0); - if (tweetReader != null) { - while (!tweetReader.isExhausted() && !Thread.currentThread().isInterrupted()) { - readingStopwatch.start(); - TweetDocument tweetDocument = tweetReader.readNext(); - readingStopwatch.stop(); - - indexingStopwatch.start(); - boolean documentIndexed = indexDocument(tweetDocument); - indexingStopwatch.stop(); - - if (!documentIndexed) { - // No documents waiting to be indexed. Take a nap. - Thread.sleep(10); - } else { - indexedDocumentsCount++; - } - - if (segmentSize >= EarlybirdConfig.getMaxSegmentSize()) { - LOG.error("Reached max segment size " + segmentSize + ", stopping indexer"); - partitionIndexingMetricSet.maxSegmentSizeReachedCounter.increment(); - tweetReader.stop(); - break; - } - } - } - - timeToIndexSegment.set(stopwatch.elapsed(TimeUnit.MILLISECONDS)); - - LOG.info("SimpleSegmentIndexer finished: {}. Documents: {}", - indexingSegment.getSegmentName(), indexedDocumentsCount); - LOG.info("Time taken: {}, Reading time: {}, Indexing time: {}", - stopwatch, readingStopwatch, indexingStopwatch); - LOG.info("Total Memory: {}, Free Memory: {}", - Runtime.getRuntime().totalMemory(), Runtime.getRuntime().freeMemory()); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.docx b/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.docx new file mode 100644 index 000000000..95b2c6338 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.java b/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.java deleted file mode 100644 index 7b4e72281..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SimpleStreamIndexer.java +++ /dev/null @@ -1,187 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.time.Duration; -import java.util.List; -import java.util.Map; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.stream.Collectors; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Verify; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.consumer.OffsetAndTimestamp; -import org.apache.kafka.common.PartitionInfo; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.WakeupException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.exception.MissingKafkaTopicException; - -/** - * Abstract base class for processing events from Kafka with the goal of indexing them and - * keeping Earlybirds up to date with the latest events. Indexing is defined by the - * implementation. 
- * - * NOTE: {@link EarlybirdKafkaConsumer} (tweet/tweet events consumer) is doing this in its - * own way, we might merge in the future. - * - * @param (Long) - * @param (Event/Thrift type to be consumed) - */ -public abstract class SimpleStreamIndexer { - private static final Logger LOG = LoggerFactory.getLogger(SimpleStreamIndexer.class); - - private static final Duration POLL_TIMEOUT = Duration.ofMillis(250); - private static final Duration CAUGHT_UP_FRESHNESS = Duration.ofSeconds(5); - - protected static final int MAX_POLL_RECORDS = 1000; - - private final SearchCounter numPollErrors; - protected SearchRateCounter indexingSuccesses; - protected SearchRateCounter indexingFailures; - - protected List topicPartitionList; - protected final KafkaConsumer kafkaConsumer; - private final AtomicBoolean running = new AtomicBoolean(true); - private final String topic; - - private boolean isCaughtUp = false; - - /** - * Create a simple stream indexer. - * - * @throws MissingKafkaTopicException - this shouldn't happen, but in case some - * external stream is not present, we want to have the caller decide how to - * handle it. Some missing streams might be fatal, for others it might not be - * justified to block startup. There's no point in constructing this object if - * a stream is missing, so we don't allow that to happen. - */ - public SimpleStreamIndexer(KafkaConsumer kafkaConsumer, - String topic) throws MissingKafkaTopicException { - this.kafkaConsumer = kafkaConsumer; - this.topic = topic; - List partitionInfos = this.kafkaConsumer.partitionsFor(topic); - - if (partitionInfos == null) { - LOG.error("Ooops, no partitions for {}", topic); - NonPagingAssert.assertFailed("missing_topic_" + topic); - throw new MissingKafkaTopicException(topic); - } - LOG.info("Discovered {} partitions for topic: {}", partitionInfos.size(), topic); - - numPollErrors = SearchCounter.export("stream_indexer_poll_errors_" + topic); - - this.topicPartitionList = partitionInfos - .stream() - .map(info -> new TopicPartition(topic, info.partition())) - .collect(Collectors.toList()); - this.kafkaConsumer.assign(topicPartitionList); - } - - /** - * Consume updates on startup until current (eg. until we've seen a record within 5 seconds - * of current time.) - */ - public void readRecordsUntilCurrent() { - do { - ConsumerRecords records = poll(); - - for (ConsumerRecord record : records) { - if (record.timestamp() > System.currentTimeMillis() - CAUGHT_UP_FRESHNESS.toMillis()) { - isCaughtUp = true; - } - validateAndIndexRecord(record); - } - } while (!isCaughtUp()); - } - - /** - * Run the consumer, indexing record values directly into their respective structures. - */ - public void run() { - try { - while (running.get()) { - for (ConsumerRecord record : poll()) { - validateAndIndexRecord(record); - } - } - } catch (WakeupException e) { - if (running.get()) { - LOG.error("Caught wakeup exception while running", e); - } - } finally { - kafkaConsumer.close(); - LOG.info("Consumer closed."); - } - } - - public boolean isCaughtUp() { - return isCaughtUp; - } - - /** - * For every partition in the topic, seek to an offset that has a timestamp greater - * than or equal to the given timestamp. 
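The seek-to-timestamp step implemented below uses the standard Kafka consumer API. A standalone sketch against a hypothetical broker and topic: offsetsForTimes returns, for each partition, the earliest offset whose record timestamp is at or after the requested time, or null when no such record exists (the production code treats null as an error).

import java.time.Duration;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.stream.Collectors;

import org.apache.kafka.clients.consumer.KafkaConsumer;
import org.apache.kafka.clients.consumer.OffsetAndTimestamp;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.ByteArrayDeserializer;

// Standalone sketch; the broker address and topic name are hypothetical.
public final class SeekToTimestampDemo {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.put("bootstrap.servers", "localhost:9092"); // hypothetical broker
    props.put("group.id", "seek-demo");
    props.put("key.deserializer", ByteArrayDeserializer.class.getName());
    props.put("value.deserializer", ByteArrayDeserializer.class.getName());

    long timestamp = System.currentTimeMillis() - Duration.ofHours(1).toMillis();
    try (KafkaConsumer<byte[], byte[]> consumer = new KafkaConsumer<>(props)) {
      List<TopicPartition> partitions = consumer.partitionsFor("user_updates") // hypothetical topic
          .stream()
          .map(info -> new TopicPartition(info.topic(), info.partition()))
          .collect(Collectors.toList());
      consumer.assign(partitions);

      Map<TopicPartition, Long> query = new HashMap<>();
      for (TopicPartition tp : partitions) {
        query.put(tp, timestamp);
      }

      Map<TopicPartition, OffsetAndTimestamp> offsets = consumer.offsetsForTimes(query);
      offsets.forEach((tp, offsetAndTimestamp) -> {
        if (offsetAndTimestamp == null) {
          // No record at or after the timestamp for this partition.
          System.out.println("no offset found for " + tp);
        } else {
          consumer.seek(tp, offsetAndTimestamp.offset());
        }
      });
    }
  }
}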
- * @param timestamp - */ - public void seekToTimestamp(Long timestamp) { - Map partitionTimestampMap = topicPartitionList.stream() - .collect(Collectors.toMap(tp -> tp, tp -> timestamp)); - Map partitionOffsetMap = - kafkaConsumer.offsetsForTimes(partitionTimestampMap); - - partitionOffsetMap.forEach((tp, offsetAndTimestamp) -> { - Verify.verify(offsetAndTimestamp != null, - "Couldn't find records after timestamp: " + timestamp); - - kafkaConsumer.seek(tp, offsetAndTimestamp.offset()); - }); - } - - /** - * Seeks the kafka consumer to the beginning. - */ - public void seekToBeginning() { - kafkaConsumer.seekToBeginning(topicPartitionList); - } - - /** - * Polls and returns at most MAX_POLL_RECORDS records. - * @return - */ - @VisibleForTesting - protected ConsumerRecords poll() { - ConsumerRecords records; - try { - records = kafkaConsumer.poll(POLL_TIMEOUT); - } catch (Exception e) { - records = ConsumerRecords.empty(); - if (e instanceof WakeupException) { - throw e; - } else { - LOG.warn("Error polling from {} kafka topic.", topic, e); - numPollErrors.increment(); - } - } - return records; - } - - protected abstract void validateAndIndexRecord(ConsumerRecord record); - - // Shutdown hook which can be called from a seperate thread. Calling consumer.wakeup() interrupts - // the running indexer and causes it to first stop polling for new records before gracefully - // closing the consumer. - public void close() { - LOG.info("Shutting down stream indexer for topic {}", topic); - running.set(false); - kafkaConsumer.wakeup(); - } -} - diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.docx b/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.docx new file mode 100644 index 000000000..d7b45c19e Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.java b/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.java deleted file mode 100644 index 30d8d3e3f..000000000 --- a/src/java/com/twitter/search/earlybird/partition/SimpleUpdateIndexer.java +++ /dev/null @@ -1,140 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.Optional; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.util.io.dl.DLRecordTimestampUtil; -import com.twitter.search.common.util.io.recordreader.RecordReader; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.segment.SegmentDataReaderSet; - -/** - * Indexes all updates for a complete segment at startup. 
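Background for the tweet-ID range checks and freshness bookkeeping in these indexers: tweet IDs are Snowflake IDs, so their top bits encode a millisecond timestamp, which is what makes comparing IDs against timeslice boundaries meaningful. The constants below are the publicly documented Snowflake layout, not values taken from this codebase.

import java.time.Instant;

// Background sketch: Twitter epoch plus a 22-bit shift (worker + sequence bits).
public final class SnowflakeTimestampDemo {
  private static final long TWITTER_EPOCH_MS = 1288834974657L;
  private static final int TIMESTAMP_SHIFT = 22;

  static long timestampMsFromId(long tweetId) {
    return (tweetId >>> TIMESTAMP_SHIFT) + TWITTER_EPOCH_MS;
  }

  public static void main(String[] args) {
    long nowMs = System.currentTimeMillis();
    long syntheticId = (nowMs - TWITTER_EPOCH_MS) << TIMESTAMP_SHIFT; // an ID minted "now"
    System.out.println(Instant.ofEpochMilli(timestampMsFromId(syntheticId)));
    System.out.println(Instant.ofEpochMilli(nowMs)); // same millisecond
  }
}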
- */ -public class SimpleUpdateIndexer { - private static final Logger LOG = LoggerFactory.getLogger(SimpleUpdateIndexer.class); - - private final SegmentDataReaderSet readerSet; - private final SearchIndexingMetricSet partitionIndexingMetricSet; - private final InstrumentedQueue retryQueue; - private final CriticalExceptionHandler criticalExceptionHandler; - - public SimpleUpdateIndexer(SegmentDataReaderSet readerSet, - SearchIndexingMetricSet partitionIndexingMetricSet, - InstrumentedQueue retryQueue, - CriticalExceptionHandler criticalExceptionHandler) { - this.readerSet = readerSet; - this.partitionIndexingMetricSet = partitionIndexingMetricSet; - this.retryQueue = retryQueue; - this.criticalExceptionHandler = criticalExceptionHandler; - } - - /** - * Indexes all updates for the given segment. - */ - public void indexAllUpdates(SegmentInfo segmentInfo) { - Preconditions.checkState( - segmentInfo.isEnabled() && segmentInfo.isComplete() && !segmentInfo.isIndexing()); - - try { - readerSet.attachUpdateReaders(segmentInfo); - } catch (IOException e) { - throw new RuntimeException("Could not attach readers for segment: " + segmentInfo, e); - } - - RecordReader reader = - readerSet.getUpdateEventsReaderForSegment(segmentInfo); - if (reader == null) { - return; - } - - LOG.info("Got updates reader (starting timestamp = {}) for segment {}: {}", - DLRecordTimestampUtil.recordIDToTimestamp(reader.getOffset()), - segmentInfo.getSegmentName(), - reader); - - // The segment is complete (we check this in indexAllUpdates()), so we can safely get - // the smallest and largest tweet IDs in this segment. - long lowestTweetId = segmentInfo.getIndexSegment().getLowestTweetId(); - long highestTweetId = segmentInfo.getIndexSegment().getHighestTweetId(); - Preconditions.checkArgument( - lowestTweetId > 0, - "Could not get the lowest tweet ID in segment " + segmentInfo.getSegmentName()); - Preconditions.checkArgument( - highestTweetId > 0, - "Could not get the highest tweet ID in segment " + segmentInfo.getSegmentName()); - - SegmentWriter segmentWriter = - new SegmentWriter(segmentInfo, partitionIndexingMetricSet.updateFreshness); - - LOG.info("Starting to index updates for segment: {}", segmentInfo.getSegmentName()); - Stopwatch stopwatch = Stopwatch.createStarted(); - - while (!Thread.currentThread().isInterrupted() && !reader.isCaughtUp()) { - applyUpdate(segmentInfo, reader, segmentWriter, lowestTweetId, highestTweetId); - } - - LOG.info("Finished indexing updates for segment {} in {} seconds.", - segmentInfo.getSegmentName(), - stopwatch.elapsed(TimeUnit.SECONDS)); - } - - private void applyUpdate(SegmentInfo segmentInfo, - RecordReader reader, - SegmentWriter segmentWriter, - long lowestTweetId, - long highestTweetId) { - ThriftVersionedEvents update; - try { - update = reader.readNext(); - } catch (IOException e) { - LOG.error("Exception while reading update for segment: " + segmentInfo.getSegmentName(), e); - criticalExceptionHandler.handle(this, e); - return; - } - if (update == null) { - LOG.warn("Update is not available but reader was not caught up. Segment: {}", - segmentInfo.getSegmentName()); - return; - } - - try { - // If the indexer put this update in the wrong timeslice, add it to the retry queue, and - // let PartitionIndexer retry it (it has logic to apply it to the correct segment). 
- if ((update.getId() < lowestTweetId) || (update.getId() > highestTweetId)) { - retryQueue.add(update); - return; - } - - // At this point, we are updating a segment that has every tweet it will ever have, - // (the segment is complete), so there is no point queueing an update to retry it. - SearchTimer timer = partitionIndexingMetricSet.updateStats.startNewTimer(); - segmentWriter.indexThriftVersionedEvents(update); - partitionIndexingMetricSet.updateStats.stopTimerAndIncrement(timer); - - updateUpdatesStreamTimestamp(segmentInfo); - } catch (IOException e) { - LOG.error("Exception while indexing updates for segment: " + segmentInfo.getSegmentName(), e); - criticalExceptionHandler.handle(this, e); - } - } - - private void updateUpdatesStreamTimestamp(SegmentInfo segmentInfo) { - Optional offset = readerSet.getUpdateEventsStreamOffsetForSegment(segmentInfo); - if (!offset.isPresent()) { - LOG.info("Unable to get updates stream offset for segment: {}", segmentInfo.getSegmentName()); - } else { - long offsetTimeMillis = DLRecordTimestampUtil.recordIDToTimestamp(offset.get()); - segmentInfo.setUpdatesStreamOffsetTimestamp(offsetTimeMillis); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.docx b/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.docx new file mode 100644 index 000000000..dce1cbd41 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.java b/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.java deleted file mode 100644 index 5a468f3a6..000000000 --- a/src/java/com/twitter/search/earlybird/partition/StartupUserEventIndexer.java +++ /dev/null @@ -1,236 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.sql.Timestamp; -import java.text.DateFormat; -import java.text.SimpleDateFormat; -import java.time.Duration; -import java.util.Date; -import java.util.Optional; - -import com.google.common.annotations.VisibleForTesting; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserTableBuilderFromSnapshot; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.factory.EarlybirdIndexConfigUtil; - -/** - * Indexer class responsible for getting the the {@link UserTable} and {@link UserScrubGeoMap} - * indexed up until the current moment. 
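[Editor's note] The routing rule in applyUpdate above reduces to a range check on the tweet ID; a tiny sketch for reference, with hypothetical names (the real code hands out-of-range updates to the retry queue so PartitionIndexer can apply them to the correct segment):

final class UpdateRoutingSketch {
  // An update belongs to a complete segment only if its tweet ID falls inside the segment's
  // [lowestTweetId, highestTweetId] range; anything else was assigned to the wrong timeslice.
  static boolean belongsToSegment(long tweetId, long lowestTweetId, long highestTweetId) {
    return tweetId >= lowestTweetId && tweetId <= highestTweetId;
  }
}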
- */ -public class StartupUserEventIndexer { - private static final Logger LOG = LoggerFactory.getLogger(StartupUserEventIndexer.class); - private static final String LOAD_USER_UPDATE_SNAPSHOT = - "loading user update snapshot"; - private static final String INDEX_ALL_USER_EVENTS = - "indexing all user events"; - private static final NonPagingAssert FAILED_USER_TABLE_HDFS_LOAD - = new NonPagingAssert("failed_user_table_hdfs_load"); - - private static final long MAX_RETRY_MILLIS_FOR_SEEK_TO_TIMESTAMP = - Duration.ofMinutes(1).toMillis(); - private static final long SLEEP_MILLIS_BETWEEN_RETRIES_FOR_SEEK_TO_TIMESTAMP = - Duration.ofSeconds(1).toMillis(); - - private static final long MILLIS_IN_FOURTEEN_DAYS = 1209600000; - private static final long MILLIS_IN_ONE_DAY = 86400000; - - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final UserUpdatesStreamIndexer userUpdatesStreamIndexer; - private final UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer; - private final SegmentManager segmentManager; - private final Clock clock; - - public StartupUserEventIndexer( - SearchIndexingMetricSet searchIndexingMetricSet, - UserUpdatesStreamIndexer userUpdatesStreamIndexer, - UserScrubGeoEventStreamIndexer userScrubGeoEventStreamIndexer, - SegmentManager segmentManager, - Clock clock) { - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.userUpdatesStreamIndexer = userUpdatesStreamIndexer; - this.userScrubGeoEventStreamIndexer = userScrubGeoEventStreamIndexer; - this.segmentManager = segmentManager; - this.clock = clock; - } - - /** - * Index all user events. - */ - public void indexAllEvents() { - EarlybirdStatus.beginEvent( - INDEX_ALL_USER_EVENTS, searchIndexingMetricSet.startupInUserEventIndexer); - - indexUserUpdates(); - if (EarlybirdConfig.consumeUserScrubGeoEvents()) { - indexUserScrubGeoEvents(); - } - - EarlybirdStatus.endEvent( - INDEX_ALL_USER_EVENTS, searchIndexingMetricSet.startupInUserEventIndexer); - } - - /** - * Index user updates until current. - */ - public void indexUserUpdates() { - EarlybirdStatus.beginEvent( - LOAD_USER_UPDATE_SNAPSHOT, searchIndexingMetricSet.startupInUserUpdates); - - Optional userTable = buildUserTable(); - if (userTable.isPresent()) { - segmentManager.getUserTable().setTable(userTable.get()); - LOG.info("Set new user table."); - - if (!seekToTimestampWithRetriesIfNecessary( - userTable.get().getLastRecordTimestamp(), - userUpdatesStreamIndexer)) { - LOG.error("User Updates stream indexer unable to seek to timestamp. " - + "Will seek to beginning."); - userUpdatesStreamIndexer.seekToBeginning(); - } - } else { - LOG.info("Failed to load user update snapshot. Will reindex user updates from scratch."); - FAILED_USER_TABLE_HDFS_LOAD.assertFailed(); - userUpdatesStreamIndexer.seekToBeginning(); - } - - userUpdatesStreamIndexer.readRecordsUntilCurrent(); - LOG.info("Finished catching up on user updates via Kafka"); - - EarlybirdStatus.endEvent( - LOAD_USER_UPDATE_SNAPSHOT, searchIndexingMetricSet.startupInUserUpdates); - } - - /** - * Index UserScrubGeoEvents until current. 
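[Editor's note] The startup sequence above — prefer the user-table snapshot, seek the stream to the snapshot's last record timestamp, and fall back to replaying from the beginning — can be summarized independently of the internal types. The sketch below uses hypothetical Snapshot and StreamIndexer interfaces as stand-ins for UserTableBuilderFromSnapshot and SimpleStreamIndexer; it illustrates the control flow only, not the actual implementation.

import java.util.Optional;

final class SnapshotThenStreamCatchUpSketch {
  interface Snapshot { long lastRecordTimestampMs(); }

  interface StreamIndexer {
    void seekToTimestamp(long timestampMs) throws Exception;
    void seekToBeginning();
    void readRecordsUntilCurrent();
  }

  // Start from the snapshot's last record timestamp when a snapshot is available, otherwise
  // (or when seeking fails) replay the stream from the beginning, then catch up to current.
  static void catchUp(Optional<Snapshot> snapshot, StreamIndexer indexer) {
    if (snapshot.isPresent()) {
      try {
        indexer.seekToTimestamp(snapshot.get().lastRecordTimestampMs());
      } catch (Exception e) {
        // Seeking failed (e.g. Kafka metadata unavailable); fall back to a full replay.
        indexer.seekToBeginning();
      }
    } else {
      indexer.seekToBeginning();
    }
    indexer.readRecordsUntilCurrent();
  }
}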
- */ - public void indexUserScrubGeoEvents() { - seekUserScrubGeoEventKafkaConsumer(); - - SearchTimer timer = new SearchTimer(); - timer.start(); - userScrubGeoEventStreamIndexer.readRecordsUntilCurrent(); - timer.stop(); - - LOG.info("Finished catching up on user scrub geo events via Kafka"); - LOG.info("UserScrubGeoMap contains {} users and finished in {} milliseconds", - segmentManager.getUserScrubGeoMap().getNumUsersInMap(), timer.getElapsed()); - } - - /** - * Seeks UserScrubGeoEventKafkaConsumer using timestamp derived from - * getTimestampForUserScrubGeoEventKafkaConsumer(). - */ - @VisibleForTesting - public void seekUserScrubGeoEventKafkaConsumer() { - long seekTimestamp = getTimestampForUserScrubGeoEventKafkaConsumer(); - if (seekTimestamp == -1) { - userScrubGeoEventStreamIndexer.seekToBeginning(); - } else { - if (!seekToTimestampWithRetriesIfNecessary(seekTimestamp, userScrubGeoEventStreamIndexer)) { - LOG.error("User Scrub Geo stream indexer unable to seek to timestamp. " - + "Will seek to beginning."); - userScrubGeoEventStreamIndexer.seekToBeginning(); - } - } - } - - /** - * Get timestamp to seek UserScrubGeoEventKafkaConsumer to. - * @return - */ - public long getTimestampForUserScrubGeoEventKafkaConsumer() { - if (EarlybirdIndexConfigUtil.isArchiveSearch()) { - return getTimestampForArchive(); - } else { - return getTimestampForRealtime(); - } - } - - /** - * For archive: grab scrub gen from config file and convert date into a timestamp. Add buffer of - * one day. We need all UserScrubGeoEvents since the date of the current scrub gen. - * - * See go/realtime-geo-filtering - * @return - */ - public long getTimestampForArchive() { - try { - String scrubGenString = EarlybirdProperty.EARLYBIRD_SCRUB_GEN.get(); - - DateFormat dateFormat = new SimpleDateFormat("yyyyMMdd"); - Date date = dateFormat.parse(scrubGenString); - return new Timestamp(date.getTime()).getTime() - MILLIS_IN_ONE_DAY; - - } catch (Exception e) { - LOG.error("Could not derive timestamp from scrub gen. " - + "Will seek User Scrub Geo Kafka consumer to beginning of topic"); - } - return -1; - } - - /** - * For realtime/protected: Compute the timestamp 14 days from the current time. This will account - * for all events that have occurred during the lifecylce of the current index. - * - * See go/realtime-geo-filtering - */ - public long getTimestampForRealtime() { - return System.currentTimeMillis() - MILLIS_IN_FOURTEEN_DAYS; - } - - private boolean seekToTimestampWithRetriesIfNecessary( - long lastRecordTimestamp, - SimpleStreamIndexer streamIndexer) { - long initialTimeMillis = clock.nowMillis(); - int numFailures = 0; - while (shouldTrySeekToTimestamp(initialTimeMillis, numFailures)) { - try { - streamIndexer.seekToTimestamp(lastRecordTimestamp); - LOG.info("Seeked consumer to timestamp {} after {} failures", - lastRecordTimestamp, numFailures); - return true; - } catch (Exception e) { - numFailures++; - LOG.info("Caught exception when seeking to timestamp. Num failures: {}. Exception: {}", - numFailures, e); - // Sleep before attempting to retry - try { - clock.waitFor(SLEEP_MILLIS_BETWEEN_RETRIES_FOR_SEEK_TO_TIMESTAMP); - } catch (InterruptedException interruptedException) { - LOG.warn("Interrupted while sleeping between seekToTimestamp retries", - interruptedException); - // Preserve interrupt status. 
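[Editor's note] The two timestamp rules above are simple date arithmetic: for the archive, one day before the configured scrub generation (a yyyyMMdd string), and for realtime, fourteen days before now, matching the MILLIS_IN_ONE_DAY and MILLIS_IN_FOURTEEN_DAYS constants. A minimal sketch with a hypothetical class name, using -1 as the "seek to the beginning of the topic" sentinel that the calling code expects:

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.concurrent.TimeUnit;

final class SeekTimestampSketch {
  // Archive: seek to one day before the scrub gen date so no UserScrubGeoEvent from that
  // generation is missed. Returns -1 when the date cannot be parsed.
  static long archiveSeekTimestamp(String scrubGenYyyyMmDd) {
    try {
      long scrubGenMillis = new SimpleDateFormat("yyyyMMdd").parse(scrubGenYyyyMmDd).getTime();
      return scrubGenMillis - TimeUnit.DAYS.toMillis(1);
    } catch (ParseException e) {
      return -1;
    }
  }

  // Realtime: a realtime index holds roughly the last two weeks of tweets, so events older than
  // 14 days cannot affect any document still in the index.
  static long realtimeSeekTimestamp(long nowMillis) {
    return nowMillis - TimeUnit.DAYS.toMillis(14);
  }
}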
- Thread.currentThread().interrupt(); - break; - } - } - } - // Failed to seek to timestamp - return false; - } - - private boolean shouldTrySeekToTimestamp(long initialTimeMillis, int numFailures) { - if (numFailures == 0) { - // no attempts have been made yet, so we should try to seek to timestamp - return true; - } else { - return clock.nowMillis() - initialTimeMillis < MAX_RETRY_MILLIS_FOR_SEEK_TO_TIMESTAMP; - } - } - - protected Optional buildUserTable() { - UserTableBuilderFromSnapshot builder = new UserTableBuilderFromSnapshot(); - return builder.build(segmentManager.getUserTable().getUserIdFilter()); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.docx b/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.docx new file mode 100644 index 000000000..55bb912c7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.java b/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.java deleted file mode 100644 index 3175e89e1..000000000 --- a/src/java/com/twitter/search/earlybird/partition/StatusBatchFlushVersion.java +++ /dev/null @@ -1,41 +0,0 @@ -package com.twitter.search.earlybird.partition; - -/** - * Keeps track of versioning for flushed status batch data. - */ -public enum StatusBatchFlushVersion { - - VERSION_0("Initial version of status batch flushing", true), - VERSION_1("Switching to use field groups (contains changes to PartitionedBatch)", true), - VERSION_2("Removing support for per-partition _SUCCESS markers", true), - /* Put the semi colon on a separate line to avoid polluting git blame history */; - - public static final StatusBatchFlushVersion CURRENT_FLUSH_VERSION = - StatusBatchFlushVersion.values()[StatusBatchFlushVersion.values().length - 1]; - - public static final String DELIMITER = "_v_"; - - private final String description; - private final boolean isOfficial; - - private StatusBatchFlushVersion(String description, boolean official) { - this.description = description; - isOfficial = official; - } - - public int getVersionNumber() { - return this.ordinal(); - } - - public String getVersionFileExtension() { - return DELIMITER + ordinal(); - } - - public boolean isOfficial() { - return isOfficial; - } - - public String getDescription() { - return description; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.docx b/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.docx new file mode 100644 index 000000000..1a758e2e2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.java b/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.java deleted file mode 100644 index e3781bac7..000000000 --- a/src/java/com/twitter/search/earlybird/partition/TimeLimitedHadoopExistsCall.java +++ /dev/null @@ -1,90 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.concurrent.Callable; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; - -import com.google.common.util.concurrent.SimpleTimeLimiter; -import com.google.common.util.concurrent.TimeLimiter; - -import org.apache.hadoop.fs.FileSystem; -import 
org.apache.hadoop.fs.Path; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; - -/** - * Abstracts details of making time limited calls to hadoop. - * - * During IM-3556 we discovered that hadoop API calls can take a long time (seconds, minutes) - * if the Hadoop cluster is in a bad state. Our code was generally not prepared for that and - * this caused various issues. This class is a fix on top of the Hadoop API's exists call and - * it introduces a timeout. - * - * The main motivation for having this as an external class is for testability. - */ -public class TimeLimitedHadoopExistsCall { - private final TimeLimiter hadoopCallsTimeLimiter; - private final FileSystem fileSystem; - private final int timeLimitInSeconds; - - private static final SearchTimerStats EXISTS_CALLS_TIMER = - SearchTimerStats.export("hadoop_exists_calls"); - - private static final SearchCounter EXISTS_CALLS_EXCEPTION = - SearchCounter.export("hadoop_exists_calls_exception"); - - public TimeLimitedHadoopExistsCall(FileSystem fileSystem) { - // This time varies. Sometimes it's very quick, sometimes it takes some amount of seconds. - // Do a rate on hadoop_exists_calls_latency_ms to see for yourself. - this(fileSystem, 30); - } - - public TimeLimitedHadoopExistsCall(FileSystem fileSystem, int timeLimitInSeconds) { - // We do hadoop calls once every "FLUSH_CHECK_PERIOD" minutes. If a call takes - // a long time (say 10 minutes), we'll use a new thread for the next call, to give it - // a chance to complete. - // - // Let's say every call takes 2 hours. After 5 calls, the 6th call won't be able - // to take a thread out of the thread pool and it will time out. That's fair, we don't - // want to keep sending requests to Hadoop if the situation is so dire. - ExecutorService executorService = Executors.newFixedThreadPool(5); - this.hadoopCallsTimeLimiter = SimpleTimeLimiter.create(executorService); - this.fileSystem = fileSystem; - this.timeLimitInSeconds = timeLimitInSeconds; - } - - - protected boolean hadoopExistsCall(Path path) throws IOException { - SearchTimer timer = EXISTS_CALLS_TIMER.startNewTimer(); - boolean res = fileSystem.exists(path); - EXISTS_CALLS_TIMER.stopTimerAndIncrement(timer); - return res; - } - - /** - * Checks if a path exists on Hadoop. - * - * @return true if the path exists. - * @throws Exception see exceptions thrown by callWithTimeout - */ - boolean exists(Path path) throws Exception { - try { - boolean result = hadoopCallsTimeLimiter.callWithTimeout(new Callable() { - @Override - public Boolean call() throws Exception { - return hadoopExistsCall(path); - } - }, timeLimitInSeconds, TimeUnit.SECONDS); - - return result; - } catch (Exception ex) { - EXISTS_CALLS_EXCEPTION.increment(); - // No need to print and rethrow, it will be printed when caught upstream.
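[Editor's note] As a usage-level illustration of the same idea, the sketch below wraps a blocking FileSystem.exists call in a Guava TimeLimiter so a hung HDFS call cannot block the caller indefinitely. It assumes Guava 22+ (SimpleTimeLimiter.create) and the standard Hadoop FileSystem API; the path, pool size, and timeout are placeholders.

import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;

import com.google.common.util.concurrent.SimpleTimeLimiter;
import com.google.common.util.concurrent.TimeLimiter;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

final class TimeLimitedExistsSketch {
  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    ExecutorService pool = Executors.newFixedThreadPool(5);
    TimeLimiter limiter = SimpleTimeLimiter.create(pool);

    // callWithTimeout runs the callable on the pool and throws TimeoutException if it does not
    // finish within the limit, so a hung NameNode call cannot block the calling thread forever.
    boolean exists = limiter.callWithTimeout(
        () -> fs.exists(new Path("/tmp/some/segment/_SUCCESS")),
        30, TimeUnit.SECONDS);
    System.out.println("exists = " + exists);
    pool.shutdown();
  }
}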
- throw ex; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.docx b/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.docx new file mode 100644 index 000000000..ccab82955 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.java b/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.java deleted file mode 100644 index e47f75a09..000000000 --- a/src/java/com/twitter/search/earlybird/partition/TweetCreateHandler.java +++ /dev/null @@ -1,526 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.Iterator; - -import scala.runtime.BoxedUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.base.Verify; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.config.Config; -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.util.GCUtil; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.CaughtUpMonitor; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.index.OutOfOrderRealtimeTweetIDMapper; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.util.CoordinatedEarlybirdActionInterface; -import com.twitter.util.Await; -import com.twitter.util.Duration; -import com.twitter.util.Future; -import com.twitter.util.TimeoutException; - -/** - * This class handles incoming new Tweets. It is responsible for creating segments for the incoming - * Tweets when necessary, triggering optimization on those segments, and writing Tweets to the - * correct segment. - */ -public class TweetCreateHandler { - private static final Logger LOG = LoggerFactory.getLogger(TweetCreateHandler.class); - - public static final long LATE_TWEET_TIME_BUFFER_MS = Duration.fromMinutes(1).inMilliseconds(); - - private static final String STATS_PREFIX = "tweet_create_handler_"; - - // To get a better idea of which of these succeeded and so on, see stats in SegmentManager. 
- private IndexingResultCounts indexingResultCounts; - private static final SearchRateCounter TWEETS_IN_WRONG_SEGMENT = - SearchRateCounter.export(STATS_PREFIX + "tweets_in_wrong_segment"); - private static final SearchRateCounter SEGMENTS_CLOSED_EARLY = - SearchRateCounter.export(STATS_PREFIX + "segments_closed_early"); - private static final SearchRateCounter INSERTED_IN_CURRENT_SEGMENT = - SearchRateCounter.export(STATS_PREFIX + "inserted_in_current_segment"); - private static final SearchRateCounter INSERTED_IN_PREVIOUS_SEGMENT = - SearchRateCounter.export(STATS_PREFIX + "inserted_in_previous_segment"); - private static final NewSegmentStats NEW_SEGMENT_STATS = new NewSegmentStats(); - private static final SearchCounter CREATED_SEGMENTS = - SearchCounter.export(STATS_PREFIX + "created_segments"); - private static final SearchRateCounter INCOMING_TWEETS = - SearchRateCounter.export(STATS_PREFIX + "incoming_tweets"); - private static final SearchRateCounter INDEXING_SUCCESS = - SearchRateCounter.export(STATS_PREFIX + "indexing_success"); - private static final SearchRateCounter INDEXING_FAILURE = - SearchRateCounter.export(STATS_PREFIX + "indexing_failure"); - - // Various stats and logging around creation of new segments, put in this - // class so that the code is not watered down too much by this. - private static class NewSegmentStats { - private static final String NEW_SEGMENT_STATS_PREFIX = - STATS_PREFIX + "new_segment_"; - - private static final SearchCounter START_NEW_AFTER_REACHING_LIMIT = - SearchCounter.export(NEW_SEGMENT_STATS_PREFIX + "start_after_reaching_limit"); - private static final SearchCounter START_NEW_AFTER_EXCEEDING_MAX_ID = - SearchCounter.export(NEW_SEGMENT_STATS_PREFIX + "start_after_exceeding_max_id"); - private static final SearchCounter TIMESLICE_SET_TO_CURRENT_ID = - SearchCounter.export(NEW_SEGMENT_STATS_PREFIX + "timeslice_set_to_current_id"); - private static final SearchCounter TIMESLICE_SET_TO_MAX_ID = - SearchCounter.export(NEW_SEGMENT_STATS_PREFIX + "timeslice_set_to_max_id"); - private static final SearchLongGauge TIMESPAN_BETWEEN_MAX_AND_CURRENT = - SearchLongGauge.export(NEW_SEGMENT_STATS_PREFIX + "timespan_between_id_and_max"); - - void recordCreateNewSegment() { - CREATED_SEGMENTS.increment(); - } - - void recordStartAfterReachingTweetsLimit(int numDocs, int numDocsCutoff, - int maxSegmentSize, int lateTweetBuffer) { - START_NEW_AFTER_REACHING_LIMIT.increment(); - LOG.info(String.format( - "Will create new segment: numDocs=%,d, numDocsCutoff=%,d" - + " | maxSegmentSize=%,d, lateTweetBuffer=%,d", - numDocs, numDocsCutoff, maxSegmentSize, lateTweetBuffer)); - } - - void recordStartAfterExceedingLargestValidTweetId(long tweetId, long largestValidTweetId) { - START_NEW_AFTER_EXCEEDING_MAX_ID.increment(); - LOG.info(String.format( - "Will create new segment: tweetDd=%,d, largestValidTweetID for segment=%,d", - tweetId, largestValidTweetId)); - } - - void recordSettingTimesliceToCurrentTweet(long tweetID) { - TIMESLICE_SET_TO_CURRENT_ID.increment(); - LOG.info("Creating new segment: tweet that triggered it has the largest id we've seen. " - + " id={}", tweetID); - } - - void recordSettingTimesliceToMaxTweetId(long tweetID, long maxTweetID) { - TIMESLICE_SET_TO_MAX_ID.increment(); - LOG.info("Creating new segment: tweet that triggered it doesn't have the largest id" - + " we've seen. 
tweetId={}, maxTweetId={}", - tweetID, maxTweetID); - long timeDifference = - SnowflakeIdParser.getTimeDifferenceBetweenTweetIDs(maxTweetID, tweetID); - LOG.info("Time difference between max seen and last seen: {} ms", timeDifference); - TIMESPAN_BETWEEN_MAX_AND_CURRENT.set(timeDifference); - } - - void wrapNewSegmentCreation(long tweetID, long maxTweetID, - long currentSegmentTimesliceBoundary, - long largestValidTweetIDForCurrentSegment) { - long timeDifferenceStartToMax = SnowflakeIdParser.getTimeDifferenceBetweenTweetIDs( - largestValidTweetIDForCurrentSegment, - currentSegmentTimesliceBoundary); - LOG.info("Time between timeslice boundary and largest valid tweet id: {} ms", - timeDifferenceStartToMax); - - LOG.info("Created new segment: (tweetId={}, maxTweetId={}, maxTweetId-tweetId={} " - + " | currentSegmentTimesliceBoundary={}, largestValidTweetIDForSegment={})", - tweetID, maxTweetID, maxTweetID - tweetID, currentSegmentTimesliceBoundary, - largestValidTweetIDForCurrentSegment); - } - } - - - private final SegmentManager segmentManager; - private final MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - private final int maxSegmentSize; - private final int lateTweetBuffer; - - private long maxTweetID = Long.MIN_VALUE; - - private long largestValidTweetIDForCurrentSegment; - private long currentSegmentTimesliceBoundary; - private OptimizingSegmentWriter currentSegment; - private OptimizingSegmentWriter previousSegment; - private final QueryCacheManager queryCacheManager; - private final CriticalExceptionHandler criticalExceptionHandler; - private final SearchIndexingMetricSet searchIndexingMetricSet; - private final CoordinatedEarlybirdActionInterface postOptimizationRebuildsAction; - private final CoordinatedEarlybirdActionInterface gcAction; - private final CaughtUpMonitor indexCaughtUpMonitor; - private final OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock; - - public TweetCreateHandler( - SegmentManager segmentManager, - SearchIndexingMetricSet searchIndexingMetricSet, - CriticalExceptionHandler criticalExceptionHandler, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - QueryCacheManager queryCacheManager, - CoordinatedEarlybirdActionInterface postOptimizationRebuildsAction, - CoordinatedEarlybirdActionInterface gcAction, - int lateTweetBuffer, - int maxSegmentSize, - CaughtUpMonitor indexCaughtUpMonitor, - OptimizationAndFlushingCoordinationLock optimizationAndFlushingCoordinationLock - ) { - this.segmentManager = segmentManager; - this.criticalExceptionHandler = criticalExceptionHandler; - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.queryCacheManager = queryCacheManager; - this.indexingResultCounts = new IndexingResultCounts(); - this.searchIndexingMetricSet = searchIndexingMetricSet; - this.postOptimizationRebuildsAction = postOptimizationRebuildsAction; - this.gcAction = gcAction; - this.indexCaughtUpMonitor = indexCaughtUpMonitor; - - Preconditions.checkState(lateTweetBuffer < maxSegmentSize); - this.lateTweetBuffer = lateTweetBuffer; - this.maxSegmentSize = maxSegmentSize; - this.optimizationAndFlushingCoordinationLock = optimizationAndFlushingCoordinationLock; - } - - void prepareAfterStartingWithIndex(long maxIndexedTweetId) { - LOG.info("Preparing after starting with an index."); - - Iterator segmentInfosIterator = - segmentManager - .getSegmentInfos(SegmentManager.Filter.All, SegmentManager.Order.NEW_TO_OLD) - .iterator(); - - // Setup the last segment. 
- Verify.verify(segmentInfosIterator.hasNext(), "at least one segment expected"); - ISegmentWriter lastWriter = segmentManager.getSegmentWriterForID( - segmentInfosIterator.next().getTimeSliceID()); - Verify.verify(lastWriter != null); - - LOG.info("TweetCreateHandler found last writer: {}", lastWriter.getSegmentInfo().toString()); - this.currentSegmentTimesliceBoundary = lastWriter.getSegmentInfo().getTimeSliceID(); - this.largestValidTweetIDForCurrentSegment = - OutOfOrderRealtimeTweetIDMapper.calculateMaxTweetID(currentSegmentTimesliceBoundary); - this.currentSegment = (OptimizingSegmentWriter) lastWriter; - - if (maxIndexedTweetId == -1) { - maxTweetID = lastWriter.getSegmentInfo().getIndexSegment().getMaxTweetId(); - LOG.info("Max tweet id = {}", maxTweetID); - } else { - // See SEARCH-31032 - maxTweetID = maxIndexedTweetId; - } - - // If we have a previous segment that's not optimized, set it up too, we still need to pick - // it up for optimization and we might still be able to add tweets to it. - if (segmentInfosIterator.hasNext()) { - SegmentInfo previousSegmentInfo = segmentInfosIterator.next(); - if (!previousSegmentInfo.isOptimized()) { - ISegmentWriter previousSegmentWriter = segmentManager.getSegmentWriterForID( - previousSegmentInfo.getTimeSliceID()); - - if (previousSegmentWriter != null) { - LOG.info("Picked previous segment"); - this.previousSegment = (OptimizingSegmentWriter) previousSegmentWriter; - } else { - // Should not happen. - LOG.error("Not found previous segment writer"); - } - } else { - LOG.info("Previous segment info is optimized"); - } - } else { - LOG.info("Previous segment info not found, we only have one segment"); - } - } - - private void updateIndexFreshness() { - searchIndexingMetricSet.highestStatusId.set(maxTweetID); - - long tweetTimestamp = SnowflakeIdParser.getTimestampFromTweetId( - searchIndexingMetricSet.highestStatusId.get()); - searchIndexingMetricSet.freshestTweetTimeMillis.set(tweetTimestamp); - } - - /** - * Index a new TVE representing a Tweet create event. - */ - public void handleTweetCreate(ThriftVersionedEvents tve) throws IOException { - INCOMING_TWEETS.increment(); - long id = tve.getId(); - maxTweetID = Math.max(id, maxTweetID); - - updateIndexFreshness(); - - boolean shouldCreateNewSegment = false; - - if (currentSegment == null) { - shouldCreateNewSegment = true; - LOG.info("Will create new segment: current segment is null"); - } else { - int numDocs = currentSegment.getSegmentInfo().getIndexSegment().getNumDocs(); - int numDocsCutoff = maxSegmentSize - lateTweetBuffer; - if (numDocs >= numDocsCutoff) { - NEW_SEGMENT_STATS.recordStartAfterReachingTweetsLimit(numDocs, numDocsCutoff, - maxSegmentSize, lateTweetBuffer); - shouldCreateNewSegment = true; - } else if (id > largestValidTweetIDForCurrentSegment) { - NEW_SEGMENT_STATS.recordStartAfterExceedingLargestValidTweetId(id, - largestValidTweetIDForCurrentSegment); - shouldCreateNewSegment = true; - } - } - - if (shouldCreateNewSegment) { - createNewSegment(id); - } - - if (previousSegment != null) { - // Inserts and some updates can't be applied to an optimized segment, so we want to wait at - // least LATE_TWEET_TIME_BUFFER between when we created the new segment and when we optimize - // the previous segment, in case there are late tweets. 
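[Editor's note] updateIndexFreshness and the segment-boundary bookkeeping above both lean on the fact that snowflake tweet IDs embed their creation time: the millisecond timestamp, relative to Twitter's snowflake epoch (November 2010), sits in the bits above the low 22 worker/sequence bits. A minimal sketch of that arithmetic, assuming the widely documented epoch constant; the internal SnowflakeIdParser may handle pre-snowflake IDs and validation that this sketch omits.

final class SnowflakeTimestampSketch {
  // Widely documented snowflake epoch, in milliseconds since the Unix epoch.
  private static final long TWITTER_EPOCH_MS = 1288834974657L;

  // Creation time of a (post-2010) tweet ID, in milliseconds since the Unix epoch.
  static long timestampMillis(long tweetId) {
    return (tweetId >> 22) + TWITTER_EPOCH_MS;
  }

  // Millisecond gap between two tweet IDs, as used for the freshness and boundary-lag stats.
  static long timeDifferenceMillis(long laterTweetId, long earlierTweetId) {
    return timestampMillis(laterTweetId) - timestampMillis(earlierTweetId);
  }
}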
- // We leave a large (150k, typically) buffer in the segment so that we don't have to close - // the previousSegment before LATE_TWEET_TIME_BUFFER has passed, but if we index - // lateTweetBuffer Tweets before optimizing, then we must optimize, - // so that we don't insert more than max segment size tweets into the previous segment. - long relativeTweetAgeMs = - SnowflakeIdParser.getTimeDifferenceBetweenTweetIDs(id, currentSegmentTimesliceBoundary); - - boolean needToOptimize = false; - int numDocs = previousSegment.getSegmentInfo().getIndexSegment().getNumDocs(); - String previousSegmentName = previousSegment.getSegmentInfo().getSegmentName(); - if (numDocs >= maxSegmentSize) { - LOG.info(String.format("Previous segment (%s) reached maxSegmentSize, need to optimize it." - + " numDocs=%,d, maxSegmentSize=%,d", previousSegmentName, numDocs, maxSegmentSize)); - needToOptimize = true; - } else if (relativeTweetAgeMs > LATE_TWEET_TIME_BUFFER_MS) { - LOG.info(String.format("Previous segment (%s) is old enough, we can optimize it." - + " Got tweet past time buffer of %,d ms by: %,d ms", previousSegmentName, - LATE_TWEET_TIME_BUFFER_MS, relativeTweetAgeMs - LATE_TWEET_TIME_BUFFER_MS)); - needToOptimize = true; - } - - if (needToOptimize) { - optimizePreviousSegment(); - } - } - - ISegmentWriter segmentWriter; - if (id >= currentSegmentTimesliceBoundary) { - INSERTED_IN_CURRENT_SEGMENT.increment(); - segmentWriter = currentSegment; - } else if (previousSegment != null) { - INSERTED_IN_PREVIOUS_SEGMENT.increment(); - segmentWriter = previousSegment; - } else { - TWEETS_IN_WRONG_SEGMENT.increment(); - LOG.info("Inserting TVE ({}) into the current segment ({}) even though it should have gone " - + "in a previous segment.", id, currentSegmentTimesliceBoundary); - segmentWriter = currentSegment; - } - - SearchTimer timer = searchIndexingMetricSet.statusStats.startNewTimer(); - ISegmentWriter.Result result = segmentWriter.indexThriftVersionedEvents(tve); - searchIndexingMetricSet.statusStats.stopTimerAndIncrement(timer); - - if (result == ISegmentWriter.Result.SUCCESS) { - INDEXING_SUCCESS.increment(); - } else { - INDEXING_FAILURE.increment(); - } - - indexingResultCounts.countResult(result); - } - - /** - * Many tests need to verify behavior with segments optimized & unoptimized, so we need to expose - * this. - */ - @VisibleForTesting - public Future optimizePreviousSegment() { - String segmentName = previousSegment.getSegmentInfo().getSegmentName(); - previousSegment.getSegmentInfo().setIndexing(false); - LOG.info("Optimizing previous segment: {}", segmentName); - segmentManager.logState("Starting optimization for segment: " + segmentName); - - Future future = previousSegment - .startOptimization(gcAction, optimizationAndFlushingCoordinationLock) - .map(this::postOptimizationSteps) - .onFailure(t -> { - criticalExceptionHandler.handle(this, t); - return BoxedUnit.UNIT; - }); - - waitForOptimizationIfInTest(future); - - previousSegment = null; - return future; - } - - /** - * In tests, it's easier if when a segment starts optimizing, we know that it will finish - * optimizing. This way we have no race condition where we're surprised that something that - * started optimizing is not ready. - * - * In prod we don't have this problem. Segments run for 10 hours and optimization is 20 minutes - * so there's no need for extra synchronization. 
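[Editor's note] Condensed from the logic above, the segment-roll and tweet-routing decisions are two small predicates; the sketch below restates them with hypothetical names for reference, while the real handler also updates counters and metrics around each branch.

final class SegmentRoutingSketch {
  // Roll rule: start a new segment when the current one is within `lateTweetBuffer` documents of
  // the hard cap (leaving room for late tweets), or when the tweet ID is beyond the largest ID
  // the current timeslice can hold.
  static boolean shouldStartNewSegment(int numDocs, int maxSegmentSize, int lateTweetBuffer,
                                       long tweetId, long largestValidTweetIdForSegment) {
    return numDocs >= maxSegmentSize - lateTweetBuffer || tweetId > largestValidTweetIdForSegment;
  }

  // Routing rule: tweets at or above the current segment's timeslice boundary go to the current
  // segment; older tweets go to the unoptimized previous segment if one is still open, and
  // otherwise are indexed into the current segment even though it is the "wrong" one.
  enum Target { CURRENT, PREVIOUS, CURRENT_WRONG_SEGMENT }

  static Target chooseWriter(long tweetId, long currentSegmentTimesliceBoundary,
                             boolean hasPreviousSegment) {
    if (tweetId >= currentSegmentTimesliceBoundary) {
      return Target.CURRENT;
    }
    return hasPreviousSegment ? Target.PREVIOUS : Target.CURRENT_WRONG_SEGMENT;
  }
}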
- */ - private void waitForOptimizationIfInTest(Future future) { - if (Config.environmentIsTest()) { - try { - Await.ready(future); - LOG.info("Optimizing is done"); - } catch (InterruptedException | TimeoutException ex) { - LOG.info("Exception while optimizing", ex); - } - } - } - - private SegmentInfo postOptimizationSteps(SegmentInfo optimizedSegmentInfo) { - segmentManager.updateStats(); - // See SEARCH-32175 - optimizedSegmentInfo.setComplete(true); - - String segmentName = optimizedSegmentInfo.getSegmentName(); - LOG.info("Finished optimization for segment: " + segmentName); - segmentManager.logState( - "Finished optimization for segment: " + segmentName); - - /* - * Building the multi segment term dictionary causes GC pauses. The reason for this is because - * it's pretty big (possible ~15GB). When it's allocated, we have to copy a lot of data from - * survivor space to old gen. That causes several GC pauses. See SEARCH-33544 - * - * GC pauses are in general not fatal, but since all instances finish a segment at roughly the - * same time, they might happen at the same time and then it's a problem. - * - * Some possible solutions to this problem would be to build this dictionary in some data - * structures that are pre-allocated or to build only the part for the last segment, as - * everything else doesn't change. These solutions are a bit difficult to implement and this - * here is an easy workaround. - * - * Note that we might finish optimizing a segment and then it might take ~60+ minutes until it's - * a particular Earlybird's turn to run this code. The effect of this is going to be that we - * are not going to use the multi segment dictionary for the last two segments, one of which is - * still pretty small. That's not terrible, since right before optimization we're not using - * the dictionary for the last segment anyways, since it's still not optimized. - */ - try { - LOG.info("Acquire coordination lock before beginning post_optimization_rebuilds action."); - optimizationAndFlushingCoordinationLock.lock(); - LOG.info("Successfully acquired coordination lock for post_optimization_rebuilds action."); - postOptimizationRebuildsAction.retryActionUntilRan( - "post optimization rebuilds", () -> { - Stopwatch stopwatch = Stopwatch.createStarted(); - LOG.info("Starting to build multi term dictionary for {}", segmentName); - boolean result = multiSegmentTermDictionaryManager.buildDictionary(); - LOG.info("Done building multi term dictionary for {} in {}, result: {}", - segmentName, stopwatch, result); - queryCacheManager.rebuildQueryCachesAfterSegmentOptimization( - optimizedSegmentInfo); - - // This is a serial full GC and it defragments the memory so things can run smoothly - // until the next segment rolls. What we have observed is that if we don't do that - // later on some earlybirds can have promotion failures on an old gen that hasn't - // reached the initiating occupancy limit and these promotions failures can trigger a - // long (1.5 min) full GC. That usually happens because of fragmentation issues. - GCUtil.runGC(); - // Wait for indexing to catch up before rejoining the serverset. We only need to do - // this if the host has already finished startup. - if (EarlybirdStatus.hasStarted()) { - indexCaughtUpMonitor.resetAndWaitUntilCaughtUp(); - } - }); - } finally { - LOG.info("Finished post_optimization_rebuilds action. 
Releasing coordination lock."); - optimizationAndFlushingCoordinationLock.unlock(); - } - - return optimizedSegmentInfo; - } - - /** - * Many tests rely on precise segment boundaries, so we expose this to allow them to create a - * particular segment. - */ - @VisibleForTesting - public void createNewSegment(long tweetID) throws IOException { - NEW_SEGMENT_STATS.recordCreateNewSegment(); - - if (previousSegment != null) { - // We shouldn't have more than one unoptimized segment, so if we get to this point and the - // previousSegment has not been optimized and set to null, start optimizing it before - // creating the next one. Note that this is a weird case and would only happen if we get - // Tweets with drastically different IDs than we expect, or there is a large amount of time - // where no Tweets are created in this partition. - LOG.error("Creating new segment for Tweet {} when the previous segment {} was not sealed. " - + "Current segment: {}. Documents: {}. largestValidTweetIDForSegment: {}.", - tweetID, - previousSegment.getSegmentInfo().getTimeSliceID(), - currentSegment.getSegmentInfo().getTimeSliceID(), - currentSegment.getSegmentInfo().getIndexSegment().getNumDocs(), - largestValidTweetIDForCurrentSegment); - optimizePreviousSegment(); - SEGMENTS_CLOSED_EARLY.increment(); - } - - previousSegment = currentSegment; - - // We have two cases: - // - // Case 1: - // If the greatest Tweet ID we have seen is tweetID, then when we want to create a new segment - // with that ID, so the Tweet being processed goes into the new segment. - // - // Case 2: - // If the tweetID is bigger than the max tweetID, then this method is being called directly from - // tests, so we didn't update the maxTweetID, so we can create a new segment with the new - // Tweet ID. - // - // Case 3: - // If it's not the greatest Tweet ID we have seen, then we don't want to create a - // segment boundary that is lower than any Tweet IDs in the current segment, because then - // some tweets from the previous segment would be in the wrong segment, so create a segment - // that has a greater ID than any Tweets that we have seen. - // - // Example: - // - We have seen tweets 3, 10, 5, 6. - // - We now see tweet 7 and we decide it's time to create a new segment. - // - The new segment will start at tweet 11. It can't start at tweet 7, because - // tweet 10 will be in the wrong segment. - // - Tweet 7 that we just saw will end up in the previous segment. 
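[Editor's note] The boundary selection described in the comment above fits in one expression; the example IDs below are the ones from that comment, and the class name is hypothetical.

final class SegmentBoundarySketch {
  // The new segment's timeslice boundary must exceed every tweet ID already indexed in the
  // current segment, otherwise some of those tweets would land on the wrong side of it.
  // E.g. after seeing IDs 3, 10, 5, 6, a roll triggered by ID 7 starts the new segment at 11
  // (maxSeen + 1) and tweet 7 stays in the previous segment.
  static long newSegmentBoundary(long triggeringTweetId, long maxTweetIdSeen) {
    return maxTweetIdSeen <= triggeringTweetId ? triggeringTweetId : maxTweetIdSeen + 1;
  }
}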
- if (maxTweetID <= tweetID) { - currentSegmentTimesliceBoundary = tweetID; - NEW_SEGMENT_STATS.recordSettingTimesliceToCurrentTweet(tweetID); - } else { - currentSegmentTimesliceBoundary = maxTweetID + 1; - NEW_SEGMENT_STATS.recordSettingTimesliceToMaxTweetId(tweetID, maxTweetID); - } - currentSegment = segmentManager.createAndPutOptimizingSegmentWriter( - currentSegmentTimesliceBoundary); - - currentSegment.getSegmentInfo().setIndexing(true); - - largestValidTweetIDForCurrentSegment = - OutOfOrderRealtimeTweetIDMapper.calculateMaxTweetID(currentSegmentTimesliceBoundary); - - NEW_SEGMENT_STATS.wrapNewSegmentCreation(tweetID, maxTweetID, - currentSegmentTimesliceBoundary, largestValidTweetIDForCurrentSegment); - - segmentManager.removeExcessSegments(); - } - - void logState() { - LOG.info("TweetCreateHandler:"); - LOG.info(String.format(" tweets sent for indexing: %,d", - indexingResultCounts.getIndexingCalls())); - LOG.info(String.format(" non-retriable failure: %,d", - indexingResultCounts.getFailureNotRetriable())); - LOG.info(String.format(" retriable failure: %,d", - indexingResultCounts.getFailureRetriable())); - LOG.info(String.format(" successfully indexed: %,d", - indexingResultCounts.getIndexingSuccess())); - LOG.info(String.format(" tweets in wrong segment: %,d", TWEETS_IN_WRONG_SEGMENT.getCount())); - LOG.info(String.format(" segments closed early: %,d", SEGMENTS_CLOSED_EARLY.getCount())); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.docx b/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.docx new file mode 100644 index 000000000..3fe79c259 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.java b/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.java deleted file mode 100644 index c4fd7e25c..000000000 --- a/src/java/com/twitter/search/earlybird/partition/TweetUpdateHandler.java +++ /dev/null @@ -1,175 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.SortedMap; -import java.util.TreeMap; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; - -/** - * This class handles incoming updates to Tweets in the index. - * - * Much of the logic deals with retries. It is very common to get an update before we have gotten - * the Tweet that the update should be applied to. In this case, we queue the update for up to a - * minute, so that we give the original Tweet the chance to be written to the index. 
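[Editor's note] The retry bookkeeping this class implements — buffer an update keyed by tweet ID, expire it once it is a minute older than the newest update seen on the stream — can be sketched with a TreeMap, whose headMap gives a cheap time-based cutoff because snowflake IDs are time-ordered. This is a simplified stand-in, not the handler's exact code, which follows below.

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.SortedMap;
import java.util.TreeMap;

final class PendingUpdateBufferSketch<E> {
  private static final long RETRY_WINDOW_MS = 60_000;

  private final SortedMap<Long, List<E>> pending = new TreeMap<>();

  // "Now" is the newest update timestamp seen on the stream, not the wall clock, so replaying an
  // old backlog still gives every update a full minute to meet its tweet.
  private long streamTimeMs = 0;

  void observe(long updateTimestampMs) {
    streamTimeMs = Math.max(streamTimeMs, updateTimestampMs);
  }

  void queue(long tweetId, long tweetTimestampMs, E update) {
    if (streamTimeMs - tweetTimestampMs <= RETRY_WINDOW_MS) {
      pending.computeIfAbsent(tweetId, id -> new ArrayList<>()).add(update);
    }
    // Otherwise the tweet is already outside the retry window: drop the update.
  }

  // Drop everything queued for tweets below the cutoff ID. Because snowflake IDs are
  // time-ordered, an ID generated at (streamTime - RETRY_WINDOW_MS) works as a time cutoff.
  void expireOlderThan(long cutoffTweetId) {
    pending.headMap(cutoffTweetId).clear();
  }

  // Updates to replay once the tweet with this ID has been indexed.
  List<E> drain(long tweetId) {
    List<E> updates = pending.remove(tweetId);
    return updates != null ? updates : Collections.emptyList();
  }
}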
- */ -public class TweetUpdateHandler { - private static final Logger LOG = LoggerFactory.getLogger(TweetUpdateHandler.class); - private static final Logger UPDATES_ERRORS_LOG = - LoggerFactory.getLogger(TweetUpdateHandler.class.getName() + ".UpdatesErrors"); - - private static final String STATS_PREFIX = "tweet_update_handler_"; - - private IndexingResultCounts indexingResultCounts; - private static final SearchRateCounter INCOMING_EVENT = - SearchRateCounter.export(STATS_PREFIX + "incoming_event"); - private static final SearchRateCounter QUEUED_FOR_RETRY = - SearchRateCounter.export(STATS_PREFIX + "queued_for_retry"); - private static final SearchRateCounter DROPPED_OLD_EVENT = - SearchRateCounter.export(STATS_PREFIX + "dropped_old_event"); - private static final SearchRateCounter DROPPED_INCOMING_EVENT = - SearchRateCounter.export(STATS_PREFIX + "dropped_incoming_event"); - private static final SearchRateCounter DROPPED_CLEANUP_EVENT = - SearchRateCounter.export(STATS_PREFIX + "dropped_cleanup_event"); - private static final SearchRateCounter DROPPED_NOT_RETRYABLE_EVENT = - SearchRateCounter.export(STATS_PREFIX + "dropped_not_retryable_event"); - private static final SearchRateCounter PICKED_TO_RETRY = - SearchRateCounter.export(STATS_PREFIX + "picked_to_retry"); - private static final SearchRateCounter INDEXED_EVENT = - SearchRateCounter.export(STATS_PREFIX + "indexed_event"); - - private static final long RETRY_TIME_THRESHOLD_MS = 60_000; // one minute. - - private final SortedMap> pendingUpdates = new TreeMap<>(); - private final SegmentManager segmentManager; - - /** - * At this time we cleaned all updates that are more than RETRY_TIME_THRESHOLD_MS old. - */ - private long lastCleanedUpdatesTime = 0; - - /** - * The time of the most recent Tweet that we have applied an update for. We use this to - * determine when we should give up on retrying an update, instead of using the system clock, - * because we may be processing the stream from a long time ago if we are starting up or if - * there is lag in the Kafka topics and we want to let each update get a fair shot at being - * applied. - */ - private long mostRecentUpdateTime = 0; - - public TweetUpdateHandler(SegmentManager segmentManager) { - this.segmentManager = segmentManager; - this.indexingResultCounts = new IndexingResultCounts(); - } - - /** - * Index an update to a Tweet. - */ - public void handleTweetUpdate(ThriftVersionedEvents tve, boolean isRetry) throws IOException { - if (!isRetry) { - INCOMING_EVENT.increment(); - } - long id = tve.getId(); - - mostRecentUpdateTime = - Math.max(SnowflakeIdParser.getTimestampFromTweetId(id), mostRecentUpdateTime); - cleanStaleUpdates(); - - ISegmentWriter writer = segmentManager.getSegmentWriterForID(id); - if (writer == null) { - if (segmentManager.getNumIndexedDocuments() == 0) { - // If we haven't indexed any tweets at all, then we shouldn't drop this update, because it - // might be applied to a Tweet we haven't indexed yet so queue it up for retry. - queueForRetry(id, tve); - } else { - DROPPED_OLD_EVENT.increment(); - } - return; - } - - SegmentWriter.Result result = writer.indexThriftVersionedEvents(tve); - indexingResultCounts.countResult(result); - - if (result == ISegmentWriter.Result.FAILURE_RETRYABLE) { - // If the tweet hasn't arrived yet. 
- queueForRetry(id, tve); - } else if (result == ISegmentWriter.Result.FAILURE_NOT_RETRYABLE) { - DROPPED_NOT_RETRYABLE_EVENT.increment(); - UPDATES_ERRORS_LOG.warn("Failed to apply update for tweetID {}: {}", id, tve); - } else if (result == ISegmentWriter.Result.SUCCESS) { - INDEXED_EVENT.increment(); - } - } - - private void queueForRetry(long id, ThriftVersionedEvents tve) { - long ageMillis = mostRecentUpdateTime - SnowflakeIdParser.getTimestampFromTweetId(id); - if (ageMillis > RETRY_TIME_THRESHOLD_MS) { - DROPPED_INCOMING_EVENT.increment(); - UPDATES_ERRORS_LOG.warn( - "Giving up retrying update for tweetID {}: {} because the retry time has elapsed", - id, tve); - return; - } - - pendingUpdates.computeIfAbsent(id, i -> new ArrayList<>()).add(tve); - QUEUED_FOR_RETRY.increment(); - } - - // Every time we have processed a minute's worth of updates, remove all pending updates that are - // more than a minute old, relative to the most recent Tweet we have seen. - private void cleanStaleUpdates() { - long oldUpdatesThreshold = mostRecentUpdateTime - RETRY_TIME_THRESHOLD_MS; - if (lastCleanedUpdatesTime < oldUpdatesThreshold) { - SortedMap> droppedUpdates = pendingUpdates - .headMap(SnowflakeIdParser.generateValidStatusId(oldUpdatesThreshold, 0)); - for (List events : droppedUpdates.values()) { - for (ThriftVersionedEvents event : events) { - UPDATES_ERRORS_LOG.warn( - "Giving up retrying update for tweetID {}: {} because the retry time has elapsed", - event.getId(), event); - } - DROPPED_CLEANUP_EVENT.increment(events.size()); - } - droppedUpdates.clear(); - - lastCleanedUpdatesTime = mostRecentUpdateTime; - } - } - - /** - * After we successfully indexed tweetID, if we have any pending updates for that tweetID, try to - * apply them again. - */ - public void retryPendingUpdates(long tweetID) throws IOException { - if (pendingUpdates.containsKey(tweetID)) { - for (ThriftVersionedEvents update : pendingUpdates.remove(tweetID)) { - PICKED_TO_RETRY.increment(); - handleTweetUpdate(update, true); - } - } - } - - void logState() { - LOG.info("TweetUpdateHandler:"); - LOG.info(String.format(" tweets sent for indexing: %,d", - indexingResultCounts.getIndexingCalls())); - LOG.info(String.format(" non-retriable failure: %,d", - indexingResultCounts.getFailureNotRetriable())); - LOG.info(String.format(" retriable failure: %,d", - indexingResultCounts.getFailureRetriable())); - LOG.info(String.format(" successfully indexed: %,d", - indexingResultCounts.getIndexingSuccess())); - LOG.info(String.format(" queued for retry: %,d", QUEUED_FOR_RETRY.getCount())); - LOG.info(String.format(" dropped old events: %,d", DROPPED_OLD_EVENT.getCount())); - LOG.info(String.format(" dropped incoming events: %,d", DROPPED_INCOMING_EVENT.getCount())); - LOG.info(String.format(" dropped cleanup events: %,d", DROPPED_CLEANUP_EVENT.getCount())); - LOG.info(String.format(" picked events to retry: %,d", PICKED_TO_RETRY.getCount())); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.docx b/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.docx new file mode 100644 index 000000000..f139197a7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.java b/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.java deleted file mode 100644 index c78d822ab..000000000 --- 
a/src/java/com/twitter/search/earlybird/partition/UserPartitionUtil.java +++ /dev/null @@ -1,32 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.google.common.base.Predicate; - -import com.twitter.search.common.util.hash.EarlybirdPartitioningFunction; -import com.twitter.search.common.util.hash.GeneralEarlybirdPartitioningFunction; - -public final class UserPartitionUtil { - private UserPartitionUtil() { - } - - /** - * Filter out the users that are not present in this partition. - */ - public static Predicate filterUsersByPartitionPredicate(final PartitionConfig config) { - return new Predicate() { - - private final int partitionID = config.getIndexingHashPartitionID(); - private final int numPartitions = config.getNumPartitions(); - private final EarlybirdPartitioningFunction partitioner = - new GeneralEarlybirdPartitioningFunction(); - - @Override - public boolean apply(Long userId) { - // See SEARCH-6675 - // Right now if the partitioning logic changes in ArchivePartitioning this logic - // needs to be updated too. - return partitioner.getPartition(userId, numPartitions) == partitionID; - } - }; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.docx b/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.docx new file mode 100644 index 000000000..0b3ea1dd0 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.java b/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.java deleted file mode 100644 index 7cd3a28b9..000000000 --- a/src/java/com/twitter/search/earlybird/partition/UserScrubGeoEventStreamIndexer.java +++ /dev/null @@ -1,88 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; -import com.twitter.search.common.util.io.kafka.ThriftDeserializer; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.exception.MissingKafkaTopicException; -import com.twitter.tweetypie.thriftjava.TweetEvent; -import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent; - -public class UserScrubGeoEventStreamIndexer extends SimpleStreamIndexer { - private static final Logger LOG = LoggerFactory.getLogger(UserScrubGeoEventStreamIndexer.class); - - protected static String kafkaClientId = "earlybird_user_scrub_geo_kafka_consumer"; - private static final SearchCounter NUM_MISSING_DATA_ERRORS = - SearchCounter.export("num_user_scrub_geo_event_kafka_consumer_num_missing_data_errors"); - - private final SegmentManager segmentManager; - private final SearchIndexingMetricSet searchIndexingMetricSet; - - public UserScrubGeoEventStreamIndexer(KafkaConsumer kafkaConsumer, - String topic, - SearchIndexingMetricSet searchIndexingMetricSet, - SegmentManager segmentManager) - throws MissingKafkaTopicException { - super(kafkaConsumer, topic); - - this.segmentManager = segmentManager; - this.searchIndexingMetricSet 
= searchIndexingMetricSet; - - indexingSuccesses = SearchRateCounter.export("user_scrub_geo_indexing_successes"); - indexingFailures = SearchRateCounter.export("user_scrub_geo_indexing_failures"); - } - - /** - * Provides UserScrubGeoEvent Kafka Consumer to EarlybirdWireModule. - * @return - */ - public static KafkaConsumer provideKafkaConsumer() { - return FinagleKafkaClientUtils.newKafkaConsumerForAssigning( - EarlybirdProperty.TWEET_EVENTS_KAFKA_PATH.get(), - new ThriftDeserializer<>(TweetEvent.class), - kafkaClientId, - MAX_POLL_RECORDS); - } - - @VisibleForTesting - protected void validateAndIndexRecord(ConsumerRecord record) { - TweetEvent event = record.value(); - UserScrubGeoEvent geoEvent; - try { - geoEvent = event.getData().getUser_scrub_geo_event(); - } catch (Exception e) { - LOG.warn("TweetEventData is null for TweetEvent: " + event.toString()); - indexingFailures.increment(); - return; - } - - if (geoEvent == null) { - LOG.warn("UserScrubGeoEvent is null"); - indexingFailures.increment(); - - } else if (!geoEvent.isSetMax_tweet_id() || !geoEvent.isSetUser_id()) { - // We should not consume an event that does not contain both a maxTweetId & userId since we - // we won't have enough data to properly store them in the map. We should, however, keep - // track of these cases since we don't want to miss out on users who have scrubbed their - // geo data from their tweets when applying the UserScrubGeoFilter. - LOG.warn("UserScrubGeoEvent is missing fields: " + geoEvent.toString()); - indexingFailures.increment(); - NUM_MISSING_DATA_ERRORS.increment(); - - } else { - SearchTimer timer = searchIndexingMetricSet.userScrubGeoIndexingStats.startNewTimer(); - segmentManager.indexUserScrubGeoEvent(geoEvent); - indexingSuccesses.increment(); - searchIndexingMetricSet.userScrubGeoIndexingStats.stopTimerAndIncrement(timer); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.docx b/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.docx new file mode 100644 index 000000000..b2ae945de Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.java b/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.java deleted file mode 100644 index c264c4a82..000000000 --- a/src/java/com/twitter/search/earlybird/partition/UserUpdatesStreamIndexer.java +++ /dev/null @@ -1,89 +0,0 @@ -package com.twitter.search.earlybird.partition; - -import java.util.Date; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.AntisocialUserUpdate; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.util.io.kafka.CompactThriftDeserializer; -import com.twitter.search.common.util.io.kafka.FinagleKafkaClientUtils; -import com.twitter.search.earlybird.common.config.EarlybirdProperty; -import com.twitter.search.earlybird.common.userupdates.UserUpdate; -import com.twitter.search.earlybird.exception.MissingKafkaTopicException; - -public class UserUpdatesStreamIndexer extends SimpleStreamIndexer { - private static final 
Logger LOG = LoggerFactory.getLogger(UserUpdatesStreamIndexer.class); - - private static final SearchCounter NUM_CORRUPT_DATA_ERRORS = - SearchCounter.export("num_user_updates_kafka_consumer_corrupt_data_errors"); - protected static String kafkaClientId = ""; - - private final SegmentManager segmentManager; - private final SearchIndexingMetricSet searchIndexingMetricSet; - - public UserUpdatesStreamIndexer(KafkaConsumer kafkaConsumer, - String topic, - SearchIndexingMetricSet searchIndexingMetricSet, - SegmentManager segmentManager) - throws MissingKafkaTopicException { - super(kafkaConsumer, topic); - this.segmentManager = segmentManager; - this.searchIndexingMetricSet = searchIndexingMetricSet; - - indexingSuccesses = SearchRateCounter.export("user_update_indexing_successes"); - indexingFailures = SearchRateCounter.export("user_update_indexing_failures"); - } - - /** - * Provides user updates kafka consumer to EarlybirdWireModule. - * @return - */ - public static KafkaConsumer provideKafkaConsumer() { - return FinagleKafkaClientUtils.newKafkaConsumerForAssigning( - EarlybirdProperty.KAFKA_PATH.get(), - new CompactThriftDeserializer<>(AntisocialUserUpdate.class), - kafkaClientId, - MAX_POLL_RECORDS); - } - - UserUpdate convertToUserInfoUpdate(AntisocialUserUpdate update) { - return new UserUpdate( - update.getUserID(), - update.getType(), - update.isValue() ? 1 : 0, - new Date(update.getUpdatedAt())); - } - - @VisibleForTesting - protected void validateAndIndexRecord(ConsumerRecord record) { - AntisocialUserUpdate update = record.value(); - if (update == null) { - LOG.warn("null value returned from poll"); - return; - } - if (update.getType() == null) { - LOG.error("User update does not have type set: " + update); - NUM_CORRUPT_DATA_ERRORS.increment(); - return; - } - - SearchTimer timer = searchIndexingMetricSet.userUpdateIndexingStats.startNewTimer(); - boolean isUpdateIndexed = segmentManager.indexUserUpdate( - convertToUserInfoUpdate(update)); - searchIndexingMetricSet.userUpdateIndexingStats.stopTimerAndIncrement(timer); - - if (isUpdateIndexed) { - indexingSuccesses.increment(); - } else { - indexingFailures.increment(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.docx new file mode 100644 index 000000000..15d4f9676 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.java deleted file mode 100644 index 4b54a56b4..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/FreshStartupHandler.java +++ /dev/null @@ -1,439 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.google.common.base.Stopwatch; -import com.google.common.base.Verify; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import 
org.apache.kafka.clients.consumer.OffsetAndTimestamp; -import org.apache.kafka.common.TopicPartition; -import org.apache.kafka.common.errors.ApiException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import static com.twitter.search.common.util.LogFormatUtil.formatInt; - -import com.twitter.search.common.util.GCUtil; -import com.twitter.common.util.Clock; -import com.twitter.search.common.util.LogFormatUtil; -import com.twitter.search.earlybird.common.NonPagingAssert; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.EarlybirdStartupException; -import com.twitter.search.earlybird.exception.WrappedKafkaApiException; -import com.twitter.search.earlybird.factory.EarlybirdKafkaConsumersFactory; -import com.twitter.search.earlybird.partition.EarlybirdIndex; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.util.ParallelUtil; - -/** - * Bootstraps an index by indexing tweets and updates in parallel. - * - * DEVELOPMENT - * =========== - * - * 1. In earlybird-search.yml, set the following values in the "production" section: - * - max_segment_size to 200000 - * - late_tweet_buffer to 10000 - * - * 2. In KafkaStartup, don't load the index, replace the .loadIndex call as instructed - * in the file. - * - * 3. In the aurora configs, set serving_timeslices to a low number (like 5) for staging. - */ -public class FreshStartupHandler { - private static final Logger LOG = LoggerFactory.getLogger(FreshStartupHandler.class); - private static final NonPagingAssert BUILDING_FEWER_THAN_SPECIFIED_SEGMENTS = - new NonPagingAssert("building_fewer_than_specified_segments"); - - private final Clock clock; - private final TopicPartition tweetTopic; - private final TopicPartition updateTopic; - private final SegmentManager segmentManager; - private final int maxSegmentSize; - private final int lateTweetBuffer; - private final EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory; - private final CriticalExceptionHandler criticalExceptionHandler; - - public FreshStartupHandler( - Clock clock, - EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory, - TopicPartition tweetTopic, - TopicPartition updateTopic, - SegmentManager segmentManager, - int maxSegmentSize, - int lateTweetBuffer, - CriticalExceptionHandler criticalExceptionHandler - ) { - this.clock = clock; - this.earlybirdKafkaConsumersFactory = earlybirdKafkaConsumersFactory; - this.tweetTopic = tweetTopic; - this.updateTopic = updateTopic; - this.segmentManager = segmentManager; - this.maxSegmentSize = maxSegmentSize; - this.criticalExceptionHandler = criticalExceptionHandler; - this.lateTweetBuffer = lateTweetBuffer; - } - - /** - * Don't index in parallel, just pass some time back that the EarlybirdKafkaConsumer - * can start indexing from. 
- */ - public EarlybirdIndex indexFromScratch() { - long indexTimePeriod = Duration.ofHours( - EarlybirdConfig.getInt("index_from_scratch_hours", 12) - ).toMillis(); - - return runIndexFromScratch(indexTimePeriod); - } - - public EarlybirdIndex fastIndexFromScratchForDevelopment() { - LOG.info("Running fast index from scratch..."); - return runIndexFromScratch(Duration.ofMinutes(10).toMillis()); - } - - private EarlybirdIndex runIndexFromScratch(long indexTimePeriodMs) { - KafkaConsumer consumerForFindingOffsets = - earlybirdKafkaConsumersFactory.createKafkaConsumer("consumer_for_offsets"); - - long timestamp = clock.nowMillis() - indexTimePeriodMs; - - Map offsets; - try { - offsets = consumerForFindingOffsets - .offsetsForTimes(ImmutableMap.of(tweetTopic, timestamp, updateTopic, timestamp)); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException(kafkaApiException); - } - - return new EarlybirdIndex( - Lists.newArrayList(), - offsets.get(tweetTopic).offset(), - offsets.get(updateTopic).offset()); - } - - - /** - * Index Tweets and updates from scratch, without relying on a serialized index in HDFS. - * - * This function indexes the segments in parallel, limiting the number of segments that - * are currently indexed, due to memory limitations. That's followed by another pass to index - * some updates - see the implementation for more details. - * - * The index this function outputs contains N segments, where the first N-1 are optimized and - * the last one is not. - */ - public EarlybirdIndex parallelIndexFromScratch() throws Exception { - Stopwatch parallelIndexStopwatch = Stopwatch.createStarted(); - - LOG.info("Starting parallel fresh startup."); - LOG.info("Max segment size: {}", maxSegmentSize); - LOG.info("Late tweet buffer size: {}", lateTweetBuffer); - - // Once we finish fresh startup and proceed to indexing from the streams, we'll immediately - // start a new segment, since the output of the fresh startup is full segments. - // - // That's why we index max_segments-1 segments here instead of indexing max_segments segments - // and discarding the first one later. - int numSegments = segmentManager.getMaxEnabledSegments() - 1; - LOG.info("Number of segments to build: {}", numSegments); - - // Find end offsets. - KafkaOffsetPair tweetsOffsetRange = findOffsetRangeForTweetsKafkaTopic(); - - ArrayList segmentBuildInfos = makeSegmentBuildInfos( - numSegments, tweetsOffsetRange); - - segmentManager.logState("Before starting fresh startup"); - - // Index tweets and events. - Stopwatch initialIndexStopwatch = Stopwatch.createStarted(); - - // We index at most `MAX_PARALLEL_INDEXED` (MPI) segments at the same time. If we need to - // produce 20 segments here, we'd need memory for MPI unoptimized and 20-MPI optimized segments. - // - // For back of envelope calculations you can assume optimized segments take ~6GB and unoptimized - // ones ~12GB. - final int MAX_PARALLEL_INDEXED = 8; - - List segmentInfos = ParallelUtil.parmap( - "fresh-startup", - MAX_PARALLEL_INDEXED, - segmentBuildInfo -> indexTweetsAndUpdatesForSegment(segmentBuildInfo, segmentBuildInfos), - segmentBuildInfos - ); - - LOG.info("Finished indexing tweets and updates in {}", initialIndexStopwatch); - - PostOptimizationUpdatesIndexer postOptimizationUpdatesIndexer = - new PostOptimizationUpdatesIndexer( - segmentBuildInfos, - earlybirdKafkaConsumersFactory, - updateTopic); - - postOptimizationUpdatesIndexer.indexRestOfUpdates(); - - // Finished indexing tweets and updates. 
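// Rough memory context for the GC and logMemoryStats() calls a few lines below, restating the
// ballpark figures from the MAX_PARALLEL_INDEXED comment above (~12GB per unoptimized segment,
// ~6GB per optimized one; actual numbers depend on segment size and schema): with
// MAX_PARALLEL_INDEXED = 8 and 20 segments to build, the peak attributable to fresh startup is
// roughly 8 * 12GB (in-flight, unoptimized) + 12 * 6GB (already optimized) = ~168GB of heap.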
- LOG.info("Segment build infos after we're done:"); - for (SegmentBuildInfo segmentBuildInfo : segmentBuildInfos) { - segmentBuildInfo.logState(); - } - - segmentManager.logState("After finishing fresh startup"); - - LOG.info("Collected {} segment infos", segmentInfos.size()); - LOG.info("Segment names:"); - for (SegmentInfo segmentInfo : segmentInfos) { - LOG.info(segmentInfo.getSegmentName()); - } - - SegmentBuildInfo lastSegmentBuildInfo = segmentBuildInfos.get(segmentBuildInfos.size() - 1); - long finishedUpdatesAtOffset = lastSegmentBuildInfo.getUpdateKafkaOffsetPair().getEndOffset(); - long maxIndexedTweetId = lastSegmentBuildInfo.getMaxIndexedTweetId(); - - LOG.info("Max indexed tweet id: {}", maxIndexedTweetId); - LOG.info("Parallel startup finished in {}", parallelIndexStopwatch); - - // verifyConstructedIndex(segmentBuildInfos); - // Run a GC to free up some memory after the fresh startup. - GCUtil.runGC(); - logMemoryStats(); - - return new EarlybirdIndex( - segmentInfos, - tweetsOffsetRange.getEndOffset() + 1, - finishedUpdatesAtOffset + 1, - maxIndexedTweetId - ); - } - - private void logMemoryStats() { - double toGB = 1024 * 1024 * 1024; - double totalMemoryGB = Runtime.getRuntime().totalMemory() / toGB; - double freeMemoryGB = Runtime.getRuntime().freeMemory() / toGB; - LOG.info("Memory stats: Total memory GB: {}, Free memory GB: {}", - totalMemoryGB, freeMemoryGB); - } - - /** - * Prints statistics about the constructed index compared to all tweets in the - * tweets stream. - * - * Only run this for testing and debugging purposes, never in prod environment. - */ - private void verifyConstructedIndex(List segmentBuildInfos) - throws IOException { - LOG.info("Verifying constructed index..."); - // Read every tweet from the offset range that we're constructing an index for. - KafkaConsumer tweetsKafkaConsumer = - earlybirdKafkaConsumersFactory.createKafkaConsumer("tweets_verify"); - try { - tweetsKafkaConsumer.assign(ImmutableList.of(tweetTopic)); - tweetsKafkaConsumer.seek(tweetTopic, segmentBuildInfos.get(0).getTweetStartOffset()); - } catch (ApiException apiException) { - throw new WrappedKafkaApiException(apiException); - } - long finalTweetOffset = segmentBuildInfos.get(segmentBuildInfos.size() - 1).getTweetEndOffset(); - boolean done = false; - Set uniqueTweetIds = new HashSet<>(); - long readTweetsCount = 0; - do { - for (ConsumerRecord record - : tweetsKafkaConsumer.poll(Duration.ofSeconds(1))) { - if (record.offset() > finalTweetOffset) { - done = true; - break; - } - readTweetsCount++; - uniqueTweetIds.add(record.value().getId()); - } - } while (!done); - - LOG.info("Total amount of read tweets: {}", formatInt(readTweetsCount)); - // Might be less, due to duplicates. 
- LOG.info("Unique tweet ids : {}", LogFormatUtil.formatInt(uniqueTweetIds.size())); - - int notFoundInIndex = 0; - for (Long tweetId : uniqueTweetIds) { - boolean found = false; - for (SegmentBuildInfo segmentBuildInfo : segmentBuildInfos) { - if (segmentBuildInfo.getSegmentWriter().hasTweet(tweetId)) { - found = true; - break; - } - } - if (!found) { - notFoundInIndex++; - } - } - - LOG.info("Tweets not found in the index: {}", LogFormatUtil.formatInt(notFoundInIndex)); - - long totalIndexedTweets = 0; - for (SegmentBuildInfo segmentBuildInfo : segmentBuildInfos) { - SegmentInfo si = segmentBuildInfo.getSegmentWriter().getSegmentInfo(); - totalIndexedTweets += si.getIndexStats().getStatusCount(); - } - - LOG.info("Total indexed tweets: {}", formatInt(totalIndexedTweets)); - } - - /** - * Find the end offsets for the tweets Kafka topic this partition is reading - * from. - */ - private KafkaOffsetPair findOffsetRangeForTweetsKafkaTopic() { - KafkaConsumer consumerForFindingOffsets = - earlybirdKafkaConsumersFactory.createKafkaConsumer("consumer_for_end_offsets"); - - Map endOffsets; - Map beginningOffsets; - - try { - endOffsets = consumerForFindingOffsets.endOffsets(ImmutableList.of(tweetTopic)); - beginningOffsets = consumerForFindingOffsets.beginningOffsets(ImmutableList.of(tweetTopic)); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException(kafkaApiException); - } finally { - consumerForFindingOffsets.close(); - } - - long tweetsBeginningOffset = beginningOffsets.get(tweetTopic); - long tweetsEndOffset = endOffsets.get(tweetTopic); - LOG.info(String.format("Tweets beginning offset: %,d", tweetsBeginningOffset)); - LOG.info(String.format("Tweets end offset: %,d", tweetsEndOffset)); - LOG.info(String.format("Total amount of records in the stream: %,d", - tweetsEndOffset - tweetsBeginningOffset + 1)); - - return new KafkaOffsetPair(tweetsBeginningOffset, tweetsEndOffset); - } - - /** - * For each segment, we know what offset it begins at. This function finds the tweet ids - * for these offsets. - */ - private void fillTweetIdsForSegmentStarts(List segmentBuildInfos) - throws EarlybirdStartupException { - KafkaConsumer consumerForTweetIds = - earlybirdKafkaConsumersFactory.createKafkaConsumer("consumer_for_tweet_ids", 1); - consumerForTweetIds.assign(ImmutableList.of(tweetTopic)); - - // Find first tweet ids for each segment. - for (SegmentBuildInfo buildInfo : segmentBuildInfos) { - long tweetOffset = buildInfo.getTweetStartOffset(); - ConsumerRecords records; - try { - consumerForTweetIds.seek(tweetTopic, tweetOffset); - records = consumerForTweetIds.poll(Duration.ofSeconds(1)); - } catch (ApiException kafkaApiException) { - throw new WrappedKafkaApiException(kafkaApiException); - } - - if (records.count() > 0) { - ConsumerRecord recordAtOffset = records.iterator().next(); - if (recordAtOffset.offset() != tweetOffset) { - LOG.error(String.format("We were looking for offset %,d. Found a record at offset %,d", - tweetOffset, recordAtOffset.offset())); - } - - buildInfo.setStartTweetId(recordAtOffset.value().getId()); - } else { - throw new EarlybirdStartupException("Didn't get any tweets back for an offset"); - } - } - - // Check that something weird didn't happen where we end up with segment ids - // which are in non-incresing order. - // Goes from oldest to newest. 
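// In other words: segmentBuildInfos runs from the oldest segment (index 0) to the newest, so the
// start tweet IDs resolved above must be strictly increasing; the loop below fails fast (via
// Verify.verify) if they are not.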
- for (int i = 1; i < segmentBuildInfos.size(); i++) { - long startTweetId = segmentBuildInfos.get(i).getStartTweetId(); - long prevStartTweetId = segmentBuildInfos.get(i - 1).getStartTweetId(); - Verify.verify(prevStartTweetId < startTweetId); - } - } - - /** - * Generate the offsets at which tweets begin and end for each segment that we want - * to create. - */ - private ArrayList makeSegmentBuildInfos( - int numSegments, KafkaOffsetPair tweetsOffsets) throws EarlybirdStartupException { - ArrayList segmentBuildInfos = new ArrayList<>(); - - // If we have 3 segments, the starting tweet offsets are: - // end-3N, end-2N, end-N - int segmentSize = maxSegmentSize - lateTweetBuffer; - LOG.info("Segment size: {}", segmentSize); - - long tweetsInStream = tweetsOffsets.getEndOffset() - tweetsOffsets.getBeginOffset() + 1; - double numBuildableSegments = ((double) tweetsInStream) / segmentSize; - - LOG.info("Number of segments we can build: {}", numBuildableSegments); - - int numSegmentsToBuild = numSegments; - int numBuildableSegmentsInt = (int) numBuildableSegments; - - if (numBuildableSegmentsInt < numSegmentsToBuild) { - // This can happen if we get a low amount of tweets such that the ~10 days of tweets stored in - // Kafka are not enough to build the specified number of segments. - LOG.warn("Building {} segments instead of the specified {} segments because there are not " - + "enough tweets", numSegmentsToBuild, numSegments); - BUILDING_FEWER_THAN_SPECIFIED_SEGMENTS.assertFailed(); - numSegmentsToBuild = numBuildableSegmentsInt; - } - - for (int rewind = numSegmentsToBuild; rewind >= 1; rewind--) { - long tweetStartOffset = (tweetsOffsets.getEndOffset() + 1) - (rewind * segmentSize); - long tweetEndOffset = tweetStartOffset + segmentSize - 1; - - int index = segmentBuildInfos.size(); - - segmentBuildInfos.add(new SegmentBuildInfo( - tweetStartOffset, - tweetEndOffset, - index, - rewind == 1 - )); - } - - Verify.verify(segmentBuildInfos.get(segmentBuildInfos.size() - 1) - .getTweetEndOffset() == tweetsOffsets.getEndOffset()); - - LOG.info("Filling start tweet ids ..."); - fillTweetIdsForSegmentStarts(segmentBuildInfos); - - return segmentBuildInfos; - } - - private SegmentInfo indexTweetsAndUpdatesForSegment( - SegmentBuildInfo segmentBuildInfo, - ArrayList segmentBuildInfos) throws Exception { - - PreOptimizationSegmentIndexer preOptimizationSegmentIndexer = - new PreOptimizationSegmentIndexer( - segmentBuildInfo, - segmentBuildInfos, - this.segmentManager, - this.tweetTopic, - this.updateTopic, - this.earlybirdKafkaConsumersFactory, - this.lateTweetBuffer - ); - - return preOptimizationSegmentIndexer.runIndexing(); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.docx new file mode 100644 index 000000000..6f6977176 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.java deleted file mode 100644 index 9300bfb3e..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/KafkaOffsetPair.java +++ /dev/null @@ -1,23 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -class KafkaOffsetPair { - private final long beginOffset; - private final long endOffset; - - public KafkaOffsetPair(long 
beginOffset, long endOffset) { - this.beginOffset = beginOffset; - this.endOffset = endOffset; - } - - public boolean includes(long offset) { - return beginOffset <= offset && offset <= endOffset; - } - - public long getBeginOffset() { - return beginOffset; - } - - public long getEndOffset() { - return endOffset; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.docx new file mode 100644 index 000000000..f2834477b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.java deleted file mode 100644 index 93e6c9362..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/PostOptimizationUpdatesIndexer.java +++ /dev/null @@ -1,169 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.TimeUnit; - -import com.google.common.base.Stopwatch; -import com.google.common.collect.ImmutableList; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.metrics.SearchTimer; -import com.twitter.search.common.metrics.SearchTimerStats; -import com.twitter.search.earlybird.factory.EarlybirdKafkaConsumersFactory; -import com.twitter.search.earlybird.partition.IndexingResultCounts; - -/** - * Indexes updates for all segments after they have been optimized. Some of the updates have been - * indexed before in the PreOptimizationSegmentIndexer, but the rest are indexed here. 
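 *
 * Routing sketch, based on the loop in indexRestOfUpdates below: each update is applied to the
 * newest segment whose start tweet ID is <= the update's tweet ID (segmentBuildInfos is scanned
 * from newest to oldest), and an update whose offset already falls inside that segment's
 * pre-optimization update offset range is skipped, because PreOptimizationSegmentIndexer applied
 * it before the segment was optimized.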
- */ -class PostOptimizationUpdatesIndexer { - private static final Logger LOG = LoggerFactory.getLogger(PostOptimizationUpdatesIndexer.class); - - private static final String STAT_PREFIX = "post_optimization_"; - private static final String READ_STAT_PREFIX = STAT_PREFIX + "read_updates_for_segment_"; - private static final String APPLIED_STAT_PREFIX = STAT_PREFIX + "applied_updates_for_segment_"; - - private final ArrayList segmentBuildInfos; - private final EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory; - private final TopicPartition updateTopic; - - PostOptimizationUpdatesIndexer( - ArrayList segmentBuildInfos, - EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory, - TopicPartition updateTopic) { - this.segmentBuildInfos = segmentBuildInfos; - this.earlybirdKafkaConsumersFactory = earlybirdKafkaConsumersFactory; - this.updateTopic = updateTopic; - } - - void indexRestOfUpdates() throws IOException { - LOG.info("Indexing rest of updates."); - - long updatesStartOffset = segmentBuildInfos.get(0) - .getUpdateKafkaOffsetPair().getBeginOffset(); - long updatesEndOffset = segmentBuildInfos.get(segmentBuildInfos.size() - 1) - .getUpdateKafkaOffsetPair().getEndOffset(); - - LOG.info(String.format("Total updates to go through: %,d", - updatesEndOffset - updatesStartOffset + 1)); - - KafkaConsumer kafkaConsumer = - earlybirdKafkaConsumersFactory.createKafkaConsumer("index_rest_of_updates"); - kafkaConsumer.assign(ImmutableList.of(updateTopic)); - kafkaConsumer.seek(updateTopic, updatesStartOffset); - - long readEvents = 0; - long foundSegment = 0; - long applied = 0; - - Map perSegmentReadUpdates = new HashMap<>(); - Map perSegmentAppliedUpdates = new HashMap<>(); - Map perSegmentIndexingResultCounts = new HashMap<>(); - - for (int i = 0; i < segmentBuildInfos.size(); i++) { - perSegmentReadUpdates.put(i, SearchRateCounter.export(READ_STAT_PREFIX + i)); - perSegmentAppliedUpdates.put(i, SearchRateCounter.export(APPLIED_STAT_PREFIX + i)); - perSegmentIndexingResultCounts.put(i, new IndexingResultCounts()); - } - - SearchTimerStats pollStats = SearchTimerStats.export( - "final_pass_polls", TimeUnit.NANOSECONDS, false); - SearchTimerStats indexStats = SearchTimerStats.export( - "final_pass_index", TimeUnit.NANOSECONDS, false); - - Stopwatch totalTime = Stopwatch.createStarted(); - - boolean done = false; - do { - // Poll events. - SearchTimer pt = pollStats.startNewTimer(); - ConsumerRecords records = - kafkaConsumer.poll(Duration.ofSeconds(1)); - pollStats.stopTimerAndIncrement(pt); - - // Index events. - SearchTimer it = indexStats.startNewTimer(); - for (ConsumerRecord record : records) { - if (record.offset() >= updatesEndOffset) { - done = true; - } - - readEvents++; - - ThriftVersionedEvents tve = record.value(); - long tweetId = tve.getId(); - - // Find segment to apply to. If we can't find a segment, this is an - // update for an old tweet that's not in the index. - int segmentIndex = -1; - for (int i = segmentBuildInfos.size() - 1; i >= 0; i--) { - if (segmentBuildInfos.get(i).getStartTweetId() <= tweetId) { - segmentIndex = i; - foundSegment++; - break; - } - } - - if (segmentIndex != -1) { - SegmentBuildInfo segmentBuildInfo = segmentBuildInfos.get(segmentIndex); - - perSegmentReadUpdates.get(segmentIndex).increment(); - - // Not already applied? - if (!segmentBuildInfo.getUpdateKafkaOffsetPair().includes(record.offset())) { - applied++; - - // Index the update. - // - // IMPORTANT: Note that there you'll see about 2-3% of updates that - // fail as "retryable". 
This type of failure happens when the update is - // for a tweet that's not found in the index. We found out that we are - // receiving some updates for protected tweets and these are not in the - // realtime index - they are the source of this error. - perSegmentIndexingResultCounts.get(segmentIndex).countResult( - segmentBuildInfo.getSegmentWriter().indexThriftVersionedEvents(tve) - ); - - perSegmentAppliedUpdates.get(segmentIndex).increment(); - } - } - if (record.offset() >= updatesEndOffset) { - break; - } - } - indexStats.stopTimerAndIncrement(it); - - } while (!done); - - LOG.info(String.format("Done in: %s, read %,d events, found segment for %,d, applied %,d", - totalTime, readEvents, foundSegment, applied)); - - LOG.info("Indexing time: {}", indexStats.getElapsedTimeAsString()); - LOG.info("Polling time: {}", pollStats.getElapsedTimeAsString()); - - LOG.info("Per segment indexing result counts:"); - for (int i = 0; i < segmentBuildInfos.size(); i++) { - LOG.info("{} : {}", i, perSegmentIndexingResultCounts.get(i)); - } - - LOG.info("Found and applied per segment:"); - for (int i = 0; i < segmentBuildInfos.size(); i++) { - LOG.info("{}: found: {}, applied: {}", - i, - perSegmentReadUpdates.get(i).getCount(), - perSegmentAppliedUpdates.get(i).getCount()); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.docx new file mode 100644 index 000000000..77dc658b5 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.java deleted file mode 100644 index b7e896248..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/PreOptimizationSegmentIndexer.java +++ /dev/null @@ -1,459 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -import java.io.IOException; -import java.time.Duration; -import java.util.ArrayList; -import java.util.Optional; - -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; - -import org.apache.kafka.clients.consumer.ConsumerRecord; -import org.apache.kafka.clients.consumer.ConsumerRecords; -import org.apache.kafka.clients.consumer.KafkaConsumer; -import org.apache.kafka.clients.consumer.OffsetAndTimestamp; -import org.apache.kafka.common.TopicPartition; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.indexing.thriftjava.ThriftVersionedEvents; -import com.twitter.search.earlybird.factory.EarlybirdKafkaConsumersFactory; -import com.twitter.search.earlybird.partition.IndexingResultCounts; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentWriter; - -/** - * Responsible for indexing the tweets and updates that need to be applied to a single segment - * before it gets optimized and then optimizing the segment (except if it's the last one). - * - * After that, no more tweets are added to the segment and the rest of the updates are added - * in PostOptimizationUpdatesIndexer. 
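 *
 * High-level flow of runIndexing below: index the segment's tweet offset range (plus a small
 * margin on each side) from the tweet stream, translate the min/max record timestamps of the
 * indexed tweets into an update-stream offset range, index that range of updates, and then
 * optimize the segment unless it is the last one.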
- */ -class PreOptimizationSegmentIndexer { - private static final Logger LOG = LoggerFactory.getLogger(PreOptimizationSegmentIndexer.class); - - private SegmentBuildInfo segmentBuildInfo; - private final ArrayList segmentBuildInfos; - private SegmentManager segmentManager; - private final TopicPartition tweetTopic; - private final TopicPartition updateTopic; - private final EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory; - private final long lateTweetBuffer; - - public PreOptimizationSegmentIndexer( - SegmentBuildInfo segmentBuildInfo, - ArrayList segmentBuildInfos, - SegmentManager segmentManager, - TopicPartition tweetTopic, - TopicPartition updateTopic, - EarlybirdKafkaConsumersFactory earlybirdKafkaConsumersFactory, - long lateTweetBuffer) { - this.segmentBuildInfo = segmentBuildInfo; - this.segmentBuildInfos = segmentBuildInfos; - this.segmentManager = segmentManager; - this.tweetTopic = tweetTopic; - this.updateTopic = updateTopic; - this.earlybirdKafkaConsumersFactory = earlybirdKafkaConsumersFactory; - this.lateTweetBuffer = lateTweetBuffer; - } - - SegmentInfo runIndexing() throws IOException { - LOG.info(String.format("Starting segment building for segment %d. " - + "Tweet offset range [ %,d, %,d ]", - segmentBuildInfo.getIndex(), - segmentBuildInfo.getTweetStartOffset(), - segmentBuildInfo.getTweetEndOffset())); - - Optional firstTweetIdInNextSegment = Optional.empty(); - int index = segmentBuildInfo.getIndex(); - if (index + 1 < segmentBuildInfos.size()) { - firstTweetIdInNextSegment = Optional.of( - segmentBuildInfos.get(index + 1).getStartTweetId()); - } - - // Index tweets. - SegmentTweetsIndexingResult tweetIndexingResult = indexSegmentTweetsFromStream( - tweetTopic, - String.format("tweet_consumer_for_segment_%d", segmentBuildInfo.getIndex()), - firstTweetIdInNextSegment - ); - - // Index updates. - KafkaOffsetPair updatesIndexingOffsets = findUpdateStreamOffsetRange(tweetIndexingResult); - - String updatesConsumerClientId = - String.format("update_consumer_for_segment_%d", segmentBuildInfo.getIndex()); - - LOG.info(String.format("Consumer: %s :: Tweets start time: %d, end time: %d ==> " - + "Updates start offset: %,d, end offset: %,d", - updatesConsumerClientId, - tweetIndexingResult.getMinRecordTimestampMs(), - tweetIndexingResult.getMaxRecordTimestampMs(), - updatesIndexingOffsets.getBeginOffset(), - updatesIndexingOffsets.getEndOffset())); - - indexUpdatesFromStream( - updateTopic, - updatesConsumerClientId, - updatesIndexingOffsets.getBeginOffset(), - updatesIndexingOffsets.getEndOffset(), - tweetIndexingResult.getSegmentWriter() - ); - - if (segmentBuildInfo.isLastSegment()) { - /* - * We don't optimize the last segment for a few reasons: - * - * 1. We might have tweets coming next in the stream, which are supposed to end - * up in this segment. - * - * 2. We might have updates coming next in the stream, which need to be applied to - * this segment before it's optimized. - * - * So the segment is kept unoptimized and later we take care of setting up things - * so that PartitionWriter and the tweet create/update handlers can start correctly. 
- */ - LOG.info("Not optimizing the last segment ({})", segmentBuildInfo.getIndex()); - } else { - Stopwatch optimizationStopwatch = Stopwatch.createStarted(); - try { - LOG.info("Starting to optimize segment: {}", segmentBuildInfo.getIndex()); - tweetIndexingResult.getSegmentWriter().getSegmentInfo() - .getIndexSegment().optimizeIndexes(); - } finally { - LOG.info("Optimization of segment {} finished in {}.", - segmentBuildInfo.getIndex(), optimizationStopwatch); - } - } - - segmentBuildInfo.setUpdateKafkaOffsetPair(updatesIndexingOffsets); - segmentBuildInfo.setMaxIndexedTweetId(tweetIndexingResult.getMaxIndexedTweetId()); - segmentBuildInfo.setSegmentWriter(tweetIndexingResult.getSegmentWriter()); - - return tweetIndexingResult.getSegmentWriter().getSegmentInfo(); - } - - private SegmentTweetsIndexingResult indexSegmentTweetsFromStream( - TopicPartition topicPartition, - String consumerClientId, - Optional firstTweetIdInNextSegment) throws IOException { - long startOffset = segmentBuildInfo.getTweetStartOffset(); - long endOffset = segmentBuildInfo.getTweetEndOffset(); - long marginSize = lateTweetBuffer / 2; - - boolean isFirstSegment = segmentBuildInfo.getIndex() == 0; - - long startReadingAtOffset = startOffset; - if (!isFirstSegment) { - startReadingAtOffset -= marginSize; - } else { - LOG.info("Not moving start offset backwards for segment {}.", segmentBuildInfo.getIndex()); - } - - long endReadingAtOffset = endOffset; - if (firstTweetIdInNextSegment.isPresent()) { - endReadingAtOffset += marginSize; - } else { - LOG.info("Not moving end offset forwards for segment {}.", segmentBuildInfo.getIndex()); - } - - KafkaConsumer tweetsKafkaConsumer = - makeKafkaConsumerForIndexing(consumerClientId, - topicPartition, startReadingAtOffset); - - boolean done = false; - long minIndexedTimestampMs = Long.MAX_VALUE; - long maxIndexedTimestampMs = Long.MIN_VALUE; - int indexedEvents = 0; - - Stopwatch stopwatch = Stopwatch.createStarted(); - - LOG.info("Creating segment writer for timeslice ID {}.", segmentBuildInfo.getStartTweetId()); - SegmentWriter segmentWriter = segmentManager.createSegmentWriter( - segmentBuildInfo.getStartTweetId()); - - /* - * We don't have a guarantee that tweets come in sorted order, so when we're building segment - * X', we try to pick some tweets from the previous and next ranges we're going to index. - * - * We also ignore tweets in the beginning and the end of our tweets range, which are picked - * by the previous or following segment. - * - * Segment X Segment X' Segment X'' - * -------------- o ----------------------------------------- o --------------- - * [~~~~~] ^ [~~~~~] [~~~~~] | [~~~~~] - * | | | | | | - * front margin | front padding (size K) back padding | back margin - * | | - * segment boundary at offset B' (1) B'' - * - * (1) This is at a predetermined tweet offset / tweet id. - * - * For segment X', we start to read tweets at offset B'-K and finish reading - * tweets at offset B''+K. K is a constant. - * - * For middle segments X' - * ====================== - * We move some tweets from the front margin and back margin into segment X'. - * Some tweets from the front and back padding are ignored, as they are moved - * into the previous and next segments. - * - * For the first segment - * ===================== - * No front margin, no front padding. We just read from the beginning offset - * and insert everything. - * - * For the last segment - * ==================== - * No back margin, no back padding. We just read until the end. 
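 *
 * A concrete sense of scale, assuming the development values suggested in FreshStartupHandler's
 * javadoc (max_segment_size = 200000, late_tweet_buffer = 10000): segmentSize =
 * 200000 - 10000 = 190000 offsets per segment and K = lateTweetBuffer / 2 = 5000, so a middle
 * segment with boundaries [B', B''] is read from offset B' - 5000 to B'' + 5000, with 5000
 * margin offsets just outside and 5000 padding offsets just inside each boundary.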
- */ - - SkippedPickedCounter frontMargin = new SkippedPickedCounter("front margin"); - SkippedPickedCounter backMargin = new SkippedPickedCounter("back margin"); - SkippedPickedCounter frontPadding = new SkippedPickedCounter("front padding"); - SkippedPickedCounter backPadding = new SkippedPickedCounter("back padding"); - SkippedPickedCounter regular = new SkippedPickedCounter("regular"); - int totalRead = 0; - long maxIndexedTweetId = -1; - - Stopwatch pollTimer = Stopwatch.createUnstarted(); - Stopwatch indexTimer = Stopwatch.createUnstarted(); - - do { - // This can cause an exception, See P33896 - pollTimer.start(); - ConsumerRecords records = - tweetsKafkaConsumer.poll(Duration.ofSeconds(1)); - pollTimer.stop(); - - indexTimer.start(); - for (ConsumerRecord record : records) { - // Done reading? - if (record.offset() >= endReadingAtOffset) { - done = true; - } - - ThriftVersionedEvents tve = record.value(); - boolean indexTweet = false; - SkippedPickedCounter skippedPickedCounter; - - if (record.offset() < segmentBuildInfo.getTweetStartOffset()) { - // Front margin. - skippedPickedCounter = frontMargin; - if (tve.getId() > segmentBuildInfo.getStartTweetId()) { - indexTweet = true; - } - } else if (record.offset() > segmentBuildInfo.getTweetEndOffset()) { - // Back margin. - skippedPickedCounter = backMargin; - if (firstTweetIdInNextSegment.isPresent() - && tve.getId() < firstTweetIdInNextSegment.get()) { - indexTweet = true; - } - } else if (record.offset() < segmentBuildInfo.getTweetStartOffset() + marginSize) { - // Front padding. - skippedPickedCounter = frontPadding; - if (tve.getId() >= segmentBuildInfo.getStartTweetId()) { - indexTweet = true; - } - } else if (firstTweetIdInNextSegment.isPresent() - && record.offset() > segmentBuildInfo.getTweetEndOffset() - marginSize) { - // Back padding. - skippedPickedCounter = backPadding; - if (tve.getId() < firstTweetIdInNextSegment.get()) { - indexTweet = true; - } - } else { - skippedPickedCounter = regular; - // These we just pick. A tweet that came very late can end up in the wrong - // segment, but it's better for it to be present in a segment than dropped. - indexTweet = true; - } - - if (indexTweet) { - skippedPickedCounter.incrementPicked(); - segmentWriter.indexThriftVersionedEvents(tve); - maxIndexedTweetId = Math.max(maxIndexedTweetId, tve.getId()); - indexedEvents++; - - // Note that records don't necessarily have increasing timestamps. - // Why? The timestamps whatever timestamp we picked when creating the record - // in ingesters and there are many ingesters. - minIndexedTimestampMs = Math.min(minIndexedTimestampMs, record.timestamp()); - maxIndexedTimestampMs = Math.max(maxIndexedTimestampMs, record.timestamp()); - } else { - skippedPickedCounter.incrementSkipped(); - } - totalRead++; - - if (record.offset() >= endReadingAtOffset) { - break; - } - } - indexTimer.stop(); - } while (!done); - - tweetsKafkaConsumer.close(); - - SegmentTweetsIndexingResult result = new SegmentTweetsIndexingResult( - minIndexedTimestampMs, maxIndexedTimestampMs, maxIndexedTweetId, segmentWriter); - - LOG.info("Finished indexing {} tweets for {} in {}. Read {} tweets. Result: {}." - + " Time polling: {}, Time indexing: {}.", - indexedEvents, consumerClientId, stopwatch, totalRead, result, - pollTimer, indexTimer); - - // In normal conditions, expect to pick just a few in front and in the back. 
- LOG.info("SkippedPicked ({}) -- {}, {}, {}, {}, {}", - consumerClientId, frontMargin, frontPadding, backPadding, backMargin, regular); - - return result; - } - - - /** - * After indexing all the tweets for a segment, index updates that need to be applied before - * the segment is optimized. - * - * This is required because some updates (URL updates, cards and Named Entities) can only be - * applied to an unoptimized segment. Luckily, all of these updates should arrive close to when - * the Tweet is created. - */ - private KafkaOffsetPair findUpdateStreamOffsetRange( - SegmentTweetsIndexingResult tweetsIndexingResult) { - KafkaConsumer offsetsConsumer = - earlybirdKafkaConsumersFactory.createKafkaConsumer( - "consumer_for_update_offsets_" + segmentBuildInfo.getIndex()); - - // Start one minute before the first indexed tweet. One minute is excessive, but - // we need to start a bit earlier in case the first tweet we indexed came in - // later than some of its updates. - long updatesStartOffset = offsetForTime(offsetsConsumer, updateTopic, - tweetsIndexingResult.getMinRecordTimestampMs() - Duration.ofMinutes(1).toMillis()); - - // Two cases: - // - // 1. If we're not indexing the last segment, end 10 minutes after the last tweet. So for - // example if we resolve an url in a tweet 3 minutes after the tweet is published, - // we'll apply that update before the segment is optimized. 10 minutes is a bit too - // much, but that doesn't matter a whole lot, since we're indexing about ~10 hours of - // updates. - // - // 2. If we're indexing the last segment, end a bit before the last indexed tweet. We might - // have incoming tweets that are a bit late. In fresh startup, we don't have a mechanism - // to store these tweets to be applied when the tweet arrives, as in TweetUpdateHandler, - // so just stop a bit earlier and let TweetCreateHandler and TweetUpdateHandler deal with - // that. - long millisAdjust; - if (segmentBuildInfo.getIndex() == segmentBuildInfos.size() - 1) { - millisAdjust = -Duration.ofMinutes(1).toMillis(); - } else { - millisAdjust = Duration.ofMinutes(10).toMillis(); - } - long updatesEndOffset = offsetForTime(offsetsConsumer, updateTopic, - tweetsIndexingResult.getMaxRecordTimestampMs() + millisAdjust); - - offsetsConsumer.close(); - - return new KafkaOffsetPair(updatesStartOffset, updatesEndOffset); - } - - /** - * Get the earliest offset with a timestamp >= $timestamp. - * - * The guarantee we get is that if we start reading from here on, we will get - * every single message that came in with a timestamp >= $timestamp. - */ - private long offsetForTime(KafkaConsumer kafkaConsumer, - TopicPartition partition, - long timestamp) { - Preconditions.checkNotNull(kafkaConsumer); - Preconditions.checkNotNull(partition); - - OffsetAndTimestamp offsetAndTimestamp = kafkaConsumer - .offsetsForTimes(ImmutableMap.of(partition, timestamp)) - .get(partition); - if (offsetAndTimestamp == null) { - return -1; - } else { - return offsetAndTimestamp.offset(); - } - } - - private void indexUpdatesFromStream( - TopicPartition topicPartition, - String consumerClientId, - long startOffset, - long endOffset, - SegmentWriter segmentWriter) throws IOException { - KafkaConsumer kafkaConsumer = - makeKafkaConsumerForIndexing(consumerClientId, topicPartition, startOffset); - - // Index TVEs. 
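// The loop below mirrors the tweet-indexing loop above: poll in one-second batches, cheaply skip
// events whose tweet ID predates this segment's timeslice ID (they belong to older segments),
// index everything else through the segment writer, and stop once endOffset is reached.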
- boolean done = false; - - Stopwatch pollTimer = Stopwatch.createUnstarted(); - Stopwatch indexTimer = Stopwatch.createUnstarted(); - - SkippedPickedCounter updatesSkippedPicked = new SkippedPickedCounter("streamed_updates"); - IndexingResultCounts indexingResultCounts = new IndexingResultCounts(); - - long segmentTimesliceId = segmentWriter.getSegmentInfo().getTimeSliceID(); - - Stopwatch totalTime = Stopwatch.createStarted(); - - do { - pollTimer.start(); - ConsumerRecords records = - kafkaConsumer.poll(Duration.ofSeconds(1)); - pollTimer.stop(); - - indexTimer.start(); - for (ConsumerRecord record : records) { - if (record.value().getId() < segmentTimesliceId) { - // Doesn't apply to this segment, can be skipped instead of skipping it - // inside the more costly segmentWriter.indexThriftVersionedEvents call. - updatesSkippedPicked.incrementSkipped(); - } else { - if (record.offset() >= endOffset) { - done = true; - } - - updatesSkippedPicked.incrementPicked(); - indexingResultCounts.countResult( - segmentWriter.indexThriftVersionedEvents(record.value())); - } - - if (record.offset() >= endOffset) { - break; - } - } - indexTimer.stop(); - } while (!done); - - // Note that there'll be a decent amount of failed retryable updates. Since we index - // updates in a range that's a bit wider, they can't be applied here. - LOG.info("Client: {}, Finished indexing updates: {}. " - + "Times -- total: {}. polling: {}, indexing: {}. Indexing result counts: {}", - consumerClientId, updatesSkippedPicked, - totalTime, pollTimer, indexTimer, indexingResultCounts); - } - - /** - * Make a consumer that reads from a single partition, starting at some offset. - */ - private KafkaConsumer makeKafkaConsumerForIndexing( - String consumerClientId, - TopicPartition topicPartition, - long offset) { - KafkaConsumer kafkaConsumer = - earlybirdKafkaConsumersFactory.createKafkaConsumer(consumerClientId); - kafkaConsumer.assign(ImmutableList.of(topicPartition)); - kafkaConsumer.seek(topicPartition, offset); - LOG.info("Indexing TVEs. Kafka consumer: {}", consumerClientId); - return kafkaConsumer; - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.docx new file mode 100644 index 000000000..6fcddf3a4 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.java deleted file mode 100644 index 93d8436c7..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentBuildInfo.java +++ /dev/null @@ -1,92 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.earlybird.partition.SegmentWriter; - -// Data collected and produced while building a segment. -class SegmentBuildInfo { - private static final Logger LOG = LoggerFactory.getLogger(SegmentBuildInfo.class); - - // Inclusive boundaries. [start, end]. 
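// Lifecycle of these fields: tweetStartOffset, tweetEndOffset, index and lastSegment are fixed in
// FreshStartupHandler.makeSegmentBuildInfos; startTweetId is filled in by
// fillTweetIdsForSegmentStarts; updateKafkaOffsetPair, maxIndexedTweetId and segmentWriter are
// set by PreOptimizationSegmentIndexer.runIndexing once the segment has been built.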
- private final long tweetStartOffset; - private final long tweetEndOffset; - private final int index; - private final boolean lastSegment; - - private long startTweetId; - private long maxIndexedTweetId; - private KafkaOffsetPair updateKafkaOffsetPair; - private SegmentWriter segmentWriter; - - public SegmentBuildInfo(long tweetStartOffset, - long tweetEndOffset, - int index, - boolean lastSegment) { - this.tweetStartOffset = tweetStartOffset; - this.tweetEndOffset = tweetEndOffset; - this.index = index; - this.lastSegment = lastSegment; - - this.startTweetId = -1; - this.updateKafkaOffsetPair = null; - this.maxIndexedTweetId = -1; - this.segmentWriter = null; - } - - public void setUpdateKafkaOffsetPair(KafkaOffsetPair updateKafkaOffsetPair) { - this.updateKafkaOffsetPair = updateKafkaOffsetPair; - } - - public KafkaOffsetPair getUpdateKafkaOffsetPair() { - return updateKafkaOffsetPair; - } - - public boolean isLastSegment() { - return lastSegment; - } - - public void setStartTweetId(long startTweetId) { - this.startTweetId = startTweetId; - } - - public long getTweetStartOffset() { - return tweetStartOffset; - } - - public long getTweetEndOffset() { - return tweetEndOffset; - } - - public long getStartTweetId() { - return startTweetId; - } - - public int getIndex() { - return index; - } - - public void setMaxIndexedTweetId(long maxIndexedTweetId) { - this.maxIndexedTweetId = maxIndexedTweetId; - } - - public long getMaxIndexedTweetId() { - return maxIndexedTweetId; - } - - public SegmentWriter getSegmentWriter() { - return segmentWriter; - } - - public void setSegmentWriter(SegmentWriter segmentWriter) { - this.segmentWriter = segmentWriter; - } - - public void logState() { - LOG.info("SegmentBuildInfo (index:{})", index); - LOG.info(String.format(" Start offset: %,d", tweetStartOffset)); - LOG.info(String.format(" End offset: %,d", tweetEndOffset)); - LOG.info(String.format(" Start tweet id: %d", startTweetId)); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.docx new file mode 100644 index 000000000..6c93eead0 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.java deleted file mode 100644 index d7a8c1c56..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/SegmentTweetsIndexingResult.java +++ /dev/null @@ -1,46 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -import com.twitter.search.earlybird.partition.SegmentWriter; - -/** - * Data collected and created while indexing tweets for a single segment. 
- */ -class SegmentTweetsIndexingResult { - private final long minRecordTimestampMs; - private final long maxRecordTimestampMs; - private final long maxIndexedTweetId; - private final SegmentWriter segmentWriter; - - public SegmentTweetsIndexingResult(long minRecordTimestampMs, long maxRecordTimestampMs, - long maxIndexedTweetId, - SegmentWriter segmentWriter) { - this.minRecordTimestampMs = minRecordTimestampMs; - this.maxRecordTimestampMs = maxRecordTimestampMs; - this.maxIndexedTweetId = maxIndexedTweetId; - this.segmentWriter = segmentWriter; - } - - public long getMinRecordTimestampMs() { - return minRecordTimestampMs; - } - - public long getMaxRecordTimestampMs() { - return maxRecordTimestampMs; - } - - public SegmentWriter getSegmentWriter() { - return segmentWriter; - } - - public long getMaxIndexedTweetId() { - return maxIndexedTweetId; - } - - @Override - public String toString() { - return String.format("Start time: %d, end time: %d, segment name: %s, max indexed: %d", - minRecordTimestampMs, maxRecordTimestampMs, - segmentWriter.getSegmentInfo().getSegmentName(), - maxIndexedTweetId); - } -} diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.docx b/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.docx new file mode 100644 index 000000000..298a8d1f7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.docx differ diff --git a/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.java b/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.java deleted file mode 100644 index f71d73f34..000000000 --- a/src/java/com/twitter/search/earlybird/partition/freshstartup/SkippedPickedCounter.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.earlybird.partition.freshstartup; - -class SkippedPickedCounter { - private long skipped; - private long picked; - private String name; - - public SkippedPickedCounter(String name) { - this.skipped = 0; - this.picked = 0; - this.name = name; - } - - @Override - public String toString() { - return String.format("[%s - picked: %,d, skipped: %,d]", - name, picked, skipped); - } - - void incrementSkipped() { - skipped++; - } - void incrementPicked() { - picked++; - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.docx b/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.docx new file mode 100644 index 000000000..b79b15bb2 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.java b/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.java deleted file mode 100644 index f0a888430..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/CachedFilterQuery.java +++ /dev/null @@ -1,310 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.io.IOException; -import java.util.Objects; -import java.util.Set; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.ConstantScoreScorer; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.IndexSearcher; -import 
org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.query.DefaultFilterWeight; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.QueryCacheResultForSegment; - -/** - * Query to iterate QueryCache result (the cache) - */ -public final class CachedFilterQuery extends Query { - private static final String STAT_PREFIX = "querycache_serving_"; - private static final SearchCounter REWRITE_CALLS = SearchCounter.export( - STAT_PREFIX + "rewrite_calls"); - private static final SearchCounter NO_CACHE_FOUND = SearchCounter.export( - STAT_PREFIX + "no_cache_found"); - private static final SearchCounter USED_CACHE_AND_FRESH_DOCS = SearchCounter.export( - STAT_PREFIX + "used_cache_and_fresh_docs"); - private static final SearchCounter USED_CACHE_ONLY = SearchCounter.export( - STAT_PREFIX + "used_cache_only"); - - - public static class NoSuchFilterException extends Exception { - NoSuchFilterException(String filterName) { - super("Filter [" + filterName + "] does not exists"); - } - } - - private static class CachedResultQuery extends Query { - private final QueryCacheResultForSegment cachedResult; - - public CachedResultQuery(QueryCacheResultForSegment cachedResult) { - this.cachedResult = cachedResult; - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { - return new DefaultFilterWeight(this) { - @Override - protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) - throws IOException { - return cachedResult.getDocIdSet().iterator(); - } - }; - } - - @Override - public int hashCode() { - return cachedResult == null ? 0 : cachedResult.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof CachedResultQuery)) { - return false; - } - - CachedResultQuery query = (CachedResultQuery) obj; - return Objects.equals(cachedResult, query.cachedResult); - } - - @Override - public String toString(String field) { - return "CACHED_RESULT"; - } - } - - private static class CachedResultAndFreshDocsQuery extends Query { - private final Query cacheLuceneQuery; - private final QueryCacheResultForSegment cachedResult; - - public CachedResultAndFreshDocsQuery( - Query cacheLuceneQuery, QueryCacheResultForSegment cachedResult) { - this.cacheLuceneQuery = cacheLuceneQuery; - this.cachedResult = cachedResult; - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { - return new Weight(this) { - @Override - public void extractTerms(Set terms) { - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - Scorer scorer = scorer(context); - if ((scorer != null) && (scorer.iterator().advance(doc) == doc)) { - return Explanation.match(0f, "Match on id " + doc); - } - return Explanation.match(0f, "No match on id " + doc); - } - - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - Weight luceneWeight; - try { - luceneWeight = cacheLuceneQuery.createWeight(searcher, scoreMode, boost); - } catch (UnsupportedOperationException e) { - // Some queries do not support weights. This is fine, it simply means the query has - // no docs, and means the same thing as a null scorer. 
- return null; - } - - Scorer luceneScorer = luceneWeight.scorer(context); - if (luceneScorer == null) { - return null; - } - - DocIdSetIterator iterator = new CachedResultDocIdSetIterator( - cachedResult.getSmallestDocID(), - luceneScorer.iterator(), - cachedResult.getDocIdSet().iterator()); - return new ConstantScoreScorer(luceneWeight, 0.0f, scoreMode, iterator); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return true; - } - }; - } - - @Override - public int hashCode() { - return (cacheLuceneQuery == null ? 0 : cacheLuceneQuery.hashCode()) * 13 - + (cachedResult == null ? 0 : cachedResult.hashCode()); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof CachedResultAndFreshDocsQuery)) { - return false; - } - - CachedResultAndFreshDocsQuery query = (CachedResultAndFreshDocsQuery) obj; - return Objects.equals(cacheLuceneQuery, query.cacheLuceneQuery) - && Objects.equals(cachedResult, query.cachedResult); - } - - @Override - public String toString(String field) { - return "CACHED_RESULT_AND_FRESH_DOCS"; - } - } - - private static final Query DUMMY_FILTER = wrapFilter(new Query() { - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) { - return new DefaultFilterWeight(this) { - @Override - protected DocIdSetIterator getDocIdSetIterator(LeafReaderContext context) { - return null; - } - }; - } - - @Override - public int hashCode() { - return System.identityHashCode(this); - } - - @Override - public boolean equals(Object obj) { - return this == obj; - } - - @Override - public String toString(String field) { - return "DUMMY_FILTER"; - } - }); - - private final QueryCacheFilter queryCacheFilter; - - // Lucene Query used to fill the cache - private final Query cacheLuceneQuery; - - public static Query getCachedFilterQuery(String filterName, QueryCacheManager queryCacheManager) - throws NoSuchFilterException { - return wrapFilter(new CachedFilterQuery(filterName, queryCacheManager)); - } - - private static Query wrapFilter(Query filter) { - return new BooleanQuery.Builder() - .add(filter, BooleanClause.Occur.FILTER) - .build(); - } - - private CachedFilterQuery(String filterName, QueryCacheManager queryCacheManager) - throws NoSuchFilterException { - queryCacheFilter = queryCacheManager.getFilter(filterName); - if (queryCacheFilter == null) { - throw new NoSuchFilterException(filterName); - } - queryCacheFilter.incrementUsageStat(); - - // retrieve the query that was used to populate the cache - cacheLuceneQuery = queryCacheFilter.getLuceneQuery(); - } - - /** - * Creates a query base on the cache situation - */ - @Override - public Query rewrite(IndexReader reader) { - EarlybirdIndexSegmentAtomicReader twitterReader = (EarlybirdIndexSegmentAtomicReader) reader; - QueryCacheResultForSegment cachedResult = - twitterReader.getSegmentData().getQueryCacheResult(queryCacheFilter.getFilterName()); - REWRITE_CALLS.increment(); - - if (cachedResult == null || cachedResult.getSmallestDocID() == -1) { - // No cached result, or cache has never been updated - // This happens to the newly created segment, between the segment creation and first - // query cache update - NO_CACHE_FOUND.increment(); - - if (queryCacheFilter.getCacheModeOnly()) { - // since this query cache filter allows cache mode only, we return a query that - // matches no doc - return DUMMY_FILTER; - } - - return wrapFilter(cacheLuceneQuery); - } - - if (!queryCacheFilter.getCacheModeOnly() && // is this a cache mode only filter? 
- // the following check is only necessary for the realtime segment, which - // grows. Since we decrement docIds in the realtime segment, a reader - // having a smallestDocID less than the one in the cachedResult indicates - // that the segment/reader has new documents. - cachedResult.getSmallestDocID() > twitterReader.getSmallestDocID()) { - // The segment has more documents than the cached result. IOW, there are new - // documents that are not cached. This happens to latest segment that we're indexing to. - USED_CACHE_AND_FRESH_DOCS.increment(); - return wrapFilter(new CachedResultAndFreshDocsQuery(cacheLuceneQuery, cachedResult)); - } - - // The segment has not grown since the cache was last updated. - // This happens mostly to old segments that we're no longer indexing to. - USED_CACHE_ONLY.increment(); - return wrapFilter(new CachedResultQuery(cachedResult)); - } - - @Override - public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) - throws IOException { - final Weight luceneWeight = cacheLuceneQuery.createWeight(searcher, scoreMode, boost); - - return new Weight(this) { - @Override - public Scorer scorer(LeafReaderContext context) throws IOException { - return luceneWeight.scorer(context); - } - - @Override - public void extractTerms(Set terms) { - luceneWeight.extractTerms(terms); - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) throws IOException { - return luceneWeight.explain(context, doc); - } - - @Override - public boolean isCacheable(LeafReaderContext ctx) { - return luceneWeight.isCacheable(ctx); - } - }; - } - - @Override - public int hashCode() { - return cacheLuceneQuery == null ? 0 : cacheLuceneQuery.hashCode(); - } - - @Override - public boolean equals(Object obj) { - if (!(obj instanceof CachedFilterQuery)) { - return false; - } - - CachedFilterQuery filter = (CachedFilterQuery) obj; - return Objects.equals(cacheLuceneQuery, filter.cacheLuceneQuery); - } - - @Override - public String toString(String s) { - return "CachedFilterQuery[" + queryCacheFilter.getFilterName() + "]"; - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.docx b/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.docx new file mode 100644 index 000000000..16cae8eb8 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.java b/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.java deleted file mode 100644 index 07ad639a3..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/CachedResultDocIdSetIterator.java +++ /dev/null @@ -1,72 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.io.IOException; - -import org.apache.lucene.search.DocIdSetIterator; - -public class CachedResultDocIdSetIterator extends DocIdSetIterator { - // With the realtime index, we grow the doc id negatively. - // Hence the smallest doc id is the ID the latest/newest document in the cache. 
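// A small illustrative example (numbers are made up, not production values): if the cache was
// last refreshed when the newest document had doc ID 1000, cachedSmallestDocID is 1000; documents
// indexed after that refresh get IDs 999, 998, ... and are served by freshDocIdIterator, while
// IDs >= 1000 come from the precomputed cachedDocIdIterator.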
- private final int cachedSmallestDocID; - - // Documents that were indexed after the last cache update - private final DocIdSetIterator freshDocIdIterator; - // Documents that were cached - private final DocIdSetIterator cachedDocIdIterator; - - private int currentDocId; - private boolean initialized = false; - - public CachedResultDocIdSetIterator(int cachedSmallestDocID, - DocIdSetIterator freshDocIdIterator, - DocIdSetIterator cachedDocIdIterator) { - this.cachedSmallestDocID = cachedSmallestDocID; - - this.freshDocIdIterator = freshDocIdIterator; - this.cachedDocIdIterator = cachedDocIdIterator; - this.currentDocId = -1; - } - - @Override - public int docID() { - return currentDocId; - } - - @Override - public int nextDoc() throws IOException { - if (currentDocId < cachedSmallestDocID) { - currentDocId = freshDocIdIterator.nextDoc(); - } else if (currentDocId != NO_MORE_DOCS) { - if (!initialized) { - // the first time we come in here, currentDocId should be pointing to - // something >= cachedMinDocID. We need to go to the doc after cachedMinDocID. - currentDocId = cachedDocIdIterator.advance(currentDocId + 1); - initialized = true; - } else { - currentDocId = cachedDocIdIterator.nextDoc(); - } - } - return currentDocId; - } - - @Override - public int advance(int target) throws IOException { - if (target < cachedSmallestDocID) { - currentDocId = freshDocIdIterator.advance(target); - } else if (currentDocId != NO_MORE_DOCS) { - initialized = true; - currentDocId = cachedDocIdIterator.advance(target); - } - - return currentDocId; - } - - @Override - public long cost() { - if (currentDocId < cachedSmallestDocID) { - return freshDocIdIterator.cost(); - } else { - return cachedDocIdIterator.cost(); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.docx new file mode 100644 index 000000000..9dc998936 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.java deleted file mode 100644 index fdee0ba23..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConfig.java +++ /dev/null @@ -1,101 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.yaml.snakeyaml.TypeDescription; -import org.yaml.snakeyaml.Yaml; -import org.yaml.snakeyaml.constructor.Constructor; - -import com.twitter.search.common.config.Config; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; - -// QueryCacheConfig is not thread safe. 
*Do not* attempt to create multiple QueryCacheConfig -// in different threads -public class QueryCacheConfig { - private static final Logger LOG = LoggerFactory.getLogger(QueryCacheConfig.class); - private static final String DEFAULT_CONFIG_FILE = "querycache.yml"; - private final SearchStatsReceiver statsReceiver; - - private List filters; - - public QueryCacheConfig(SearchStatsReceiver statsReceiver) { - this(locateConfigFile(EarlybirdConfig.getString("query_cache_config_file_name", - DEFAULT_CONFIG_FILE)), statsReceiver); - } - - // package protected constructor for unit test only - QueryCacheConfig(Reader reader, SearchStatsReceiver statsReceiver) { - this.statsReceiver = statsReceiver; - if (reader == null) { - throw new RuntimeException("Query cache config not loaded"); - } - loadConfig(reader); - } - - public List filters() { - return filters; - } - - int getFilterSize() { - return filters.size(); - } - - private static FileReader locateConfigFile(String configFileName) { - File configFile = null; - String dir = Config.locateSearchConfigDir(EarlybirdConfig.EARLYBIRD_CONFIG_DIR, configFileName); - if (dir != null) { - configFile = openConfigFile(dir + "/" + configFileName); - } - if (configFile != null) { - try { - return new FileReader(configFile); - } catch (FileNotFoundException e) { - // This should not happen as the caller should make sure that the file exists before - // calling this function. - LOG.error("Unexpected exception", e); - throw new RuntimeException("Query cache config file not loaded!", e); - } - } - return null; - } - - private static File openConfigFile(String configFilePath) { - File configFile = new File(configFilePath); - if (!configFile.exists()) { - LOG.warn("QueryCache config file [" + configFile + "] not found"); - configFile = null; - } else { - LOG.info("Opened QueryCacheFilter config file [" + configFile + "]"); - } - return configFile; - } - - private void loadConfig(Reader reader) { - TypeDescription qcEntryDescription = new TypeDescription(QueryCacheFilter.class); - Constructor constructor = new Constructor(qcEntryDescription); - Yaml yaml = new Yaml(constructor); - - filters = new ArrayList<>(); - - for (Object data : yaml.loadAll(reader)) { - QueryCacheFilter cacheFilter = (QueryCacheFilter) data; - try { - cacheFilter.sanityCheck(); - } catch (QueryCacheFilter.InvalidEntryException e) { - throw new RuntimeException(e); - } - cacheFilter.createQueryCounter(statsReceiver); - filters.add(cacheFilter); - LOG.info("Loaded filter from config {}", cacheFilter.toString()); - } - LOG.info("Total filters loaded: {}", filters.size()); - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.docx new file mode 100644 index 000000000..3c991aafd Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.java deleted file mode 100644 index 60f1a1f85..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheConversionRules.java +++ /dev/null @@ -1,100 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.util.Arrays; -import java.util.List; -import java.util.Set; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Sets; - -import 
com.twitter.search.common.constants.QueryCacheConstants; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; - -import static com.twitter.search.common.util.RuleBasedConverter.Rule; - -/** - * Rules to convert exclude operators into cached filters and consolidate them. - * NOTE: this is copied from blender/core/parser/service/queryparser/QueryCacheConversionRules.java - * We should remove the blender one once this is in production. - */ -public final class QueryCacheConversionRules { - static final SearchOperator EXCLUDE_ANTISOCIAL = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.ANTISOCIAL); - static final SearchOperator EXCLUDE_SPAM = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.SPAM); - static final SearchOperator EXCLUDE_REPLIES = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.REPLIES); - static final SearchOperator EXCLUDE_NATIVERETWEETS = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.NATIVE_RETWEETS); - - public static final SearchOperator CACHED_EXCLUDE_ANTISOCIAL = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_ANTISOCIAL); - static final SearchOperator CACHED_EXCLUDE_NATIVERETWEETS = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_ANTISOCIAL_AND_NATIVERETWEETS); - static final SearchOperator CACHED_EXCLUDE_SPAM = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_SPAM); - static final SearchOperator CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_SPAM_AND_NATIVERETWEETS); - static final SearchOperator CACHED_EXCLUDE_REPLIES = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_REPLIES); - - private QueryCacheConversionRules() { - } - - public static final List> DEFAULT_RULES = ImmutableList.of( - // basic translation from exclude:filter to cached filter - new Rule<>(new Query[]{EXCLUDE_ANTISOCIAL}, - new Query[]{CACHED_EXCLUDE_ANTISOCIAL}), - - new Rule<>(new Query[]{EXCLUDE_SPAM}, - new Query[]{CACHED_EXCLUDE_SPAM}), - - new Rule<>(new Query[]{EXCLUDE_NATIVERETWEETS}, - new Query[]{CACHED_EXCLUDE_NATIVERETWEETS}), - - new Rule<>(new Query[]{EXCLUDE_REPLIES}, - new Query[]{CACHED_EXCLUDE_REPLIES}), - - // combine two cached filter to a new one - new Rule<>(new Query[]{CACHED_EXCLUDE_SPAM, CACHED_EXCLUDE_NATIVERETWEETS}, - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS}), - - // Remove redundant filters. A cached filter is redundant when it coexist with a - // more strict filter. Note all the filter will filter out antisocial. 
- new Rule<>( - new Query[]{CACHED_EXCLUDE_SPAM, CACHED_EXCLUDE_ANTISOCIAL}, - new Query[]{CACHED_EXCLUDE_SPAM}), - - new Rule<>( - new Query[]{CACHED_EXCLUDE_NATIVERETWEETS, CACHED_EXCLUDE_ANTISOCIAL}, - new Query[]{CACHED_EXCLUDE_NATIVERETWEETS}), - - new Rule<>( - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS, CACHED_EXCLUDE_ANTISOCIAL}, - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS}), - - new Rule<>( - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS, CACHED_EXCLUDE_SPAM}, - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS}), - - new Rule<>( - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS, CACHED_EXCLUDE_NATIVERETWEETS}, - new Query[]{CACHED_EXCLUDE_SPAM_AND_NATIVERETWEETS}) - ); - - public static final List STRIP_ANNOTATIONS_QUERIES; - static { - Set stripAnnotationsQueries = Sets.newHashSet(); - for (Rule rule : DEFAULT_RULES) { - stripAnnotationsQueries.addAll(Arrays.asList(rule.getSources())); - } - STRIP_ANNOTATIONS_QUERIES = ImmutableList.copyOf(stripAnnotationsQueries); - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.docx new file mode 100644 index 000000000..d12fd99c0 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.java deleted file mode 100644 index d2726338e..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheFilter.java +++ /dev/null @@ -1,302 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.util.List; -import java.util.TreeMap; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.search.Query; - -import com.twitter.common.collections.Pair; -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.query.thriftjava.CollectorParams; -import com.twitter.search.common.query.thriftjava.CollectorTerminationParams; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.common.util.text.regex.Regex; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.queryparser.EarlybirdLuceneQueryVisitor; -import com.twitter.search.earlybird.search.SearchRequestInfo; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.queryparser.parser.SerializedQueryParser; -import com.twitter.search.queryparser.query.QueryParserException; - -/** - * The definition of a QueryCache filter/entry, like the name of the filter, the query used - * to populate the cache, update schedule, etc.. - * - * Instances of this class are created by the YAML loader when loading the config file. Most - * members are populated by YAML using setters through reflection. - */ -public class QueryCacheFilter { - // Data structure type supported as cache result holder - public enum ResultSetType { - FixedBitSet, - SparseFixedBitSet - } - - // Fields set directly from YML config file. 
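// For illustration only (hypothetical values; the key names follow the SnakeYAML bean mapping
// onto the setters below, and each filter is its own YAML document in querycache.yml):
//   filterName: exclude_spam
//   query: "<serialized query string>"
//   resultType: FixedBitSet
//   cacheModeOnly: false
//   schedule:
//     - {segment: 0, seconds: 60}
//     - {segment: 3, seconds: 3600}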
- private String filterName; // unique name for cached filter - private String query; // serialized query string - private ResultSetType resultType; - private boolean cacheModeOnly; - private List schedule; - private SearchCounter queries; - - // Fields generated based on config (but not directly). - private volatile Pair queryPair; - private TreeMap scheduleMap; // tree map from index to interval - - public class InvalidEntryException extends Exception { - public InvalidEntryException(String message) { - super("Filter [" + filterName + "]: " + message); - } - } - - public static class UpdateInterval { - // Overrides *all* query cache update frequencies to be this value, in seconds. - private final int overrideSecondsForTests = EarlybirdConfig.getInt( - "override_query_cache_update_frequency", -1); - - // Fields set directly from YML config file. - private int segment; - private long seconds; - - public void setSegment(int segment) { - this.segment = segment; - } - - /** - * Sets the update period in seconds. If the override_query_cache_update_frequency parameter is - * specified in the earlybird configuration, its value is used instead (the value passed to this - * method is ignored). - */ - public void setSeconds(long seconds) { - if (overrideSecondsForTests != -1) { - this.seconds = overrideSecondsForTests; - } else { - this.seconds = seconds; - } - } - - public int getSegment() { - return segment; - } - - public long getSeconds() { - return seconds; - } - } - - public void setFilterName(String filterName) throws InvalidEntryException { - sanityCheckFilterName(filterName); - this.filterName = filterName; - } - - /** - * Sets the driving query for this query cache filter. - */ - public void setQuery(String query) throws InvalidEntryException { - if (query == null || query.isEmpty()) { - throw new InvalidEntryException("Empty query string"); - } - - this.query = query; - } - - /** - * Sets the type of the results that will be generated by this query cache filter. 
- */ - public void setResultType(String resultType) throws InvalidEntryException { - if (ResultSetType.FixedBitSet.toString().equalsIgnoreCase(resultType)) { - this.resultType = ResultSetType.FixedBitSet; - } else if (ResultSetType.SparseFixedBitSet.toString().equalsIgnoreCase(resultType)) { - this.resultType = ResultSetType.SparseFixedBitSet; - } else { - throw new InvalidEntryException("Unregconized result type [" + resultType + "]"); - } - } - - public void setCacheModeOnly(boolean cacheModeOnly) { - this.cacheModeOnly = cacheModeOnly; - } - - public void setSchedule(List schedule) - throws QueryCacheFilter.InvalidEntryException { - sanityCheckSchedule(schedule); - this.schedule = schedule; - this.scheduleMap = createScheduleMap(schedule); - } - - public void createQueryCounter(SearchStatsReceiver statsReceiver) { - queries = statsReceiver.getCounter("cached_filter_" + filterName + "_queries"); - } - - public void incrementUsageStat() { - queries.increment(); - } - - public String getFilterName() { - return filterName; - } - - public String getQueryString() { - return query; - } - - // snakeyaml does not like a getter named getResultType() that does not return a string - public ResultSetType getResultSetType() { - return resultType; - } - - public boolean getCacheModeOnly() { - return cacheModeOnly; - } - - public Query getLuceneQuery() { - return queryPair.getSecond(); - } - - public ThriftSearchQuery getSearchQuery() { - return queryPair.getFirst(); - } - - /** - * Create a new {@link SearchRequestInfo} using {@link #queryPair}. - * - * @return a new {@link SearchRequestInfo} - */ - public SearchRequestInfo createSearchRequestInfo() { - ThriftSearchQuery searchQuery = Preconditions.checkNotNull(queryPair.getFirst()); - Query luceneQuery = Preconditions.checkNotNull(queryPair.getSecond()); - - return new SearchRequestInfo( - searchQuery, luceneQuery, new TerminationTracker(Clock.SYSTEM_CLOCK)); - } - - public void setup( - QueryCacheManager queryCacheManager, - UserTable userTable, - EarlybirdCluster earlybirdCluster) throws QueryParserException { - createQuery(queryCacheManager, userTable, earlybirdCluster); - } - - // index corresponds to 'segment' from the config file. this is the index of the - // segment, starting with the current segment (0) and counting backwards in time. 
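// For example (hypothetical schedule, not from the original config): with entries
// {segment: 0, seconds: 60} and {segment: 5, seconds: 3600}, scheduleMap.floorEntry(index)
// resolves indexes 0..4 to a 60-second interval and indexes 5 and beyond to 3600 seconds,
// i.e. each entry applies from its segment index onward until a later entry overrides it.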
- public Amount getUpdateInterval(int index) { - long seconds = scheduleMap.floorEntry(index).getValue().getSeconds(); - return Amount.of(seconds, Time.SECONDS); - } - - private TreeMap createScheduleMap(List scheduleToUse) { - TreeMap map = new TreeMap<>(); - for (UpdateInterval interval : scheduleToUse) { - map.put(interval.segment, interval); - } - return map; - } - - private void createQuery( - QueryCacheManager queryCacheManager, - UserTable userTable, - EarlybirdCluster earlybirdCluster) throws QueryParserException { - - int maxSegmentSize = EarlybirdConfig.getMaxSegmentSize(); - CollectorParams collectionParams = new CollectorParams(); - collectionParams.setNumResultsToReturn(maxSegmentSize); - CollectorTerminationParams terminationParams = new CollectorTerminationParams(); - terminationParams.setMaxHitsToProcess(maxSegmentSize); - collectionParams.setTerminationParams(terminationParams); - - ThriftSearchQuery searchQuery = new ThriftSearchQuery(); - searchQuery.setMaxHitsPerUser(maxSegmentSize); - searchQuery.setCollectorParams(collectionParams); - searchQuery.setSerializedQuery(query); - - final SerializedQueryParser parser = new SerializedQueryParser( - EarlybirdConfig.getPenguinVersion()); - - Query luceneQuery = parser.parse(query).simplify().accept( - new EarlybirdLuceneQueryVisitor( - queryCacheManager.getIndexConfig().getSchema().getSchemaSnapshot(), - queryCacheManager, - userTable, - queryCacheManager.getUserScrubGeoMap(), - earlybirdCluster, - queryCacheManager.getDecider())); - if (luceneQuery == null) { - throw new QueryParserException("Unable to create lucene query from " + query); - } - - queryPair = new Pair<>(searchQuery, luceneQuery); - } - - private void sanityCheckFilterName(String filter) throws InvalidEntryException { - if (filter == null || filter.isEmpty()) { - throw new InvalidEntryException("Missing filter name"); - } - if (Regex.FILTER_NAME_CHECK.matcher(filter).find()) { - throw new InvalidEntryException( - "Invalid character in filter name. Chars allowed [a-zA-Z_0-9]"); - } - } - - private void sanityCheckSchedule(List intervals) - throws InvalidEntryException { - // Make sure there's at least 1 interval defined - if (intervals == null || intervals.isEmpty()) { - throw new InvalidEntryException("No schedule defined"); - } - - // Make sure the first interval starts with segment 0 - if (intervals.get(0).getSegment() != 0) { - throw new InvalidEntryException( - "The first interval in the schedule must start from segment 0"); - } - - // Make sure segments are defined in order, and no segment is defined more than twice - int prevSegment = intervals.get(0).getSegment(); - for (int i = 1; i < intervals.size(); ++i) { - int currentSegment = intervals.get(i).getSegment(); - if (prevSegment > currentSegment) { - throw new InvalidEntryException("Segment intervals out of order. 
Segment " + prevSegment - + " is defined before segment " + currentSegment); - } - - if (prevSegment == intervals.get(i).getSegment()) { - throw new InvalidEntryException("Segment " + prevSegment + " is defined twice"); - } - - prevSegment = currentSegment; - } - } - - protected void sanityCheck() throws InvalidEntryException { - sanityCheckFilterName(filterName); - if (query == null || query.isEmpty()) { - throw new InvalidEntryException("Missing query"); - } - if (resultType == null) { - throw new InvalidEntryException("Missing result type"); - } - if (schedule == null || schedule.size() == 0) { - throw new InvalidEntryException("Missing update schedule"); - } - if (scheduleMap == null || scheduleMap.size() == 0) { - throw new InvalidEntryException("Missing update schedule map"); - } - } - - @Override - public String toString() { - return "filterName: [" + getFilterName() - + "] query: [" + getQueryString() - + "] result type [" + getResultSetType() - + "] schedule: " + schedule; - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.docx new file mode 100644 index 000000000..c03174827 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.java deleted file mode 100644 index 0795df0cc..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheManager.java +++ /dev/null @@ -1,365 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.util.Collection; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.base.Stopwatch; -import com.google.common.collect.Lists; -import com.google.common.primitives.Longs; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.EarlybirdIndexConfig; -import com.twitter.search.earlybird.EarlybirdStatus; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.partition.SegmentManager; -import com.twitter.search.earlybird.partition.SegmentManager.Filter; -import com.twitter.search.earlybird.partition.SegmentManager.Order; -import com.twitter.search.earlybird.partition.SegmentManager.SegmentUpdateListener; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.EarlybirdStatusCode; -import com.twitter.search.queryparser.query.QueryParserException; - 
-/** - * Main class to manage Earlybird's QueryCache. - * - * Initialize the QueryCache and new segments are notified to the QueryCache subsystem - * through this class. - * - * This class is thread-safe when calling methods that modify the list of tasks that - * we're executing or when we need to traverse all tasks and check something. The way - * thread-safety is achieved here right now is through making methods synchronized. - */ -public class QueryCacheManager implements SegmentUpdateListener { - private static final Logger LOG = LoggerFactory.getLogger(QueryCacheManager.class); - - private static final Amount ZERO_SECONDS = Amount.of(0L, Time.SECONDS); - - private final boolean enabled = EarlybirdConfig.getBool("querycache", false); - - // segments are removed from SegmentInfoMap lazily, and there may be a wait time. - // So, beware that there's short period of time where there's more segments than - // maxEnabledSegments. - private final int maxEnabledSegments; - - private final UserTable userTable; - private final UserScrubGeoMap userScrubGeoMap; - private final EarlybirdIndexConfig indexConfig; - private QueryCacheUpdater updater; - private final Map filters; - private final ScheduledExecutorServiceFactory updaterScheduledExecutorServiceFactory; - - private final SearchStatsReceiver searchStatsReceiver; - - private static final SearchLongGauge NUM_CACHE_ENTRY_STAT = - SearchLongGauge.export("querycache_num_entries"); - - private static final SearchCounter NUM_UPDATE_SEGMENTS_CALLS = - SearchCounter.export("querycache_num_update_segments_calls"); - - private volatile boolean didSetup = false; - - private final EarlybirdSearcherStats searcherStats; - private final Decider decider; - private final CriticalExceptionHandler criticalExceptionHandler; - private final Clock clock; - - public QueryCacheManager( - QueryCacheConfig config, - EarlybirdIndexConfig indexConfig, - int maxEnabledSegments, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - ScheduledExecutorServiceFactory updaterScheduledExecutorServiceFactory, - SearchStatsReceiver searchStatsReceiver, - EarlybirdSearcherStats searcherStats, - Decider decider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - - Preconditions.checkArgument(maxEnabledSegments > 0); - - QueryCacheConfig queryCacheConfig = config; - if (queryCacheConfig == null) { - queryCacheConfig = new QueryCacheConfig(searchStatsReceiver); - } - this.indexConfig = indexConfig; - this.maxEnabledSegments = maxEnabledSegments; - this.userTable = userTable; - this.userScrubGeoMap = userScrubGeoMap; - this.updaterScheduledExecutorServiceFactory = updaterScheduledExecutorServiceFactory; - this.searchStatsReceiver = searchStatsReceiver; - this.searcherStats = searcherStats; - this.filters = new HashMap<>(); - this.decider = decider; - this.criticalExceptionHandler = criticalExceptionHandler; - this.clock = clock; - for (QueryCacheFilter filter : queryCacheConfig.filters()) { - filters.put(filter.getFilterName(), filter); - } - NUM_CACHE_ENTRY_STAT.set(filters.size()); - } - - public EarlybirdIndexConfig getIndexConfig() { - return indexConfig; - } - - public UserScrubGeoMap getUserScrubGeoMap() { - return userScrubGeoMap; - } - - /** Setup all update tasks at once, should only be called after Earlybird has loaded/indexed all - * segments during start-up - * - * Only the first call to the function has effect, subsequent calls are no-ops - */ - public void setupTasksIfNeeded(SegmentManager segmentManager) - throws QueryParserException { - 
setupTasks( - segmentManager.getSegmentInfos(Filter.All, Order.OLD_TO_NEW), - segmentManager.getEarlybirdIndexConfig().getCluster()); - } - - @VisibleForTesting - synchronized void setupTasks( - Iterable newSegments, - EarlybirdCluster earlybirdCluster) throws QueryParserException { - // Setup needs to be done only once after all index caught up. - if (didSetup) { - return; - } - - LOG.info("Setting up {} query cache tasks", filters.values().size()); - - for (QueryCacheFilter filter : filters.values()) { - filter.setup(this, userTable, earlybirdCluster); - } - - if (!enabled()) { - // Note that the definition of disabling the query caches here is "don't compute the caches". - // We still load the queries from the .yml, we still rewrite search queries to use - // cached queries. The reason we are choosing this definition is that it's somewhat simpler - // to implement (no need to turn off rewriting) and because we might get external queries that - // contain cached filters (they're listed in go/searchsyntax). - // - // If we need a stricter definition of turning off query caches, we can implement it too, or - // just tighten this one. - return; - } - - Preconditions.checkState(updater == null); - updater = new QueryCacheUpdater( - filters.values(), - updaterScheduledExecutorServiceFactory, - userTable, - searchStatsReceiver, - searcherStats, - decider, - criticalExceptionHandler, - clock); - - LOG.info("Finished setting up query cache updater."); - - scheduleTasks(newSegments, false); - - didSetup = true; - } - - private void scheduleTasks(Iterable segments, boolean isCurrent) { - List sortedSegments = Lists.newArrayList(segments); - Collections.sort(sortedSegments, (o1, o2) -> { - // sort new to old (o2 and o1 are reversed here) - return Longs.compare(o2.getTimeSliceID(), o1.getTimeSliceID()); - }); - - LOG.info("Scheduling tasks for {} segments.", sortedSegments.size()); - - for (int segmentIndex = 0; segmentIndex < sortedSegments.size(); ++segmentIndex) { - SegmentInfo segmentInfo = sortedSegments.get(segmentIndex); - if (segmentIndex == maxEnabledSegments) { - LOG.warn("Tried to add more segments than MaxEnabledSegments (" + maxEnabledSegments - + "). Removed oldest segment " + segmentInfo.getTimeSliceID()); - continue; - } - addQueryCacheTasksForSegment(segmentInfo, segmentIndex, !isCurrent); - } - } - - /** - * Rebuilds the query cache for the given segment after it was optimized. - */ - public synchronized void rebuildQueryCachesAfterSegmentOptimization( - SegmentInfo optimizedSegment) { - Preconditions.checkState(optimizedSegment.getIndexSegment().isOptimized(), - "Segment " + optimizedSegment.getSegmentName() + " is not optimized."); - - if (!didSetup) { - // Once our indexing is current, we'll just start tasks for all segments, optimized or not. - // Before that event, we don't do anything query cache related. - LOG.info("Haven't done initial setup, returning."); - return; - } - - LOG.info("Rebuilding query caches for optimized segment {}", - optimizedSegment.getSegmentName()); - - // The optimized segment should always be the 1st segment (the current segment has index 0). 
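// (scheduleTasks() sorts segments from newest to oldest, so index 0 is the current, still-growing
// segment; the segment that just finished optimizing sits immediately behind it, hence the
// hard-coded index of 1 passed to addQueryCacheTasksForSegment below.)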
- Stopwatch stopwatch = Stopwatch.createStarted(); - updater.removeAllTasksForSegment(optimizedSegment); - addQueryCacheTasksForSegment(optimizedSegment, 1, true); - - while (!updater.allTasksRanForSegment(optimizedSegment)) { - try { - Thread.sleep(1000); - } catch (InterruptedException e) { - // Ignore - } - } - - LOG.info("Rebuilding all query caches for the optimized segment {} took {}.", - optimizedSegment.getSegmentName(), stopwatch); - } - - /** - * Block until all the tasks inside this manager have ran at least once. - */ - public void waitUntilAllQueryCachesAreBuilt() { - LOG.info("Waiting until all query caches are built..."); - - Stopwatch stopwatch = Stopwatch.createStarted(); - while (!allTasksRan()) { - try { - Thread.sleep(1000); - } catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - } - } - - LOG.info("Ran query cache tasks in: {}", stopwatch); - } - - private void addQueryCacheTasksForSegment( - SegmentInfo segmentInfo, int segmentIndex, boolean scheduleImmediately) { - LOG.info("Adding query cache tasks for segment {}.", segmentInfo.getTimeSliceID()); - double updateIntervalMultiplier = - EarlybirdConfig.getDouble("query_cache_update_interval_multiplier", 1.0); - for (QueryCacheFilter filter : filters.values()) { - Amount updateIntervalFromConfig = filter.getUpdateInterval(segmentIndex); - Amount updateInterval = Amount.of( - (long) (updateIntervalFromConfig.getValue() * updateIntervalMultiplier), - updateIntervalFromConfig.getUnit()); - - Amount initialDelay = scheduleImmediately ? ZERO_SECONDS : updateInterval; - updater.addTask(filter, segmentInfo, updateInterval, initialDelay); - } - } - - /** - * Notify QueryCacheManager of a new list of segments we currently have, so that cache tasks - * can be updated. - * - * @param segments fresh list of all segments - * - * All existing tasks will be canceled/removed/destroyed, new tasks will be created for all - * segments. - */ - @Override - public synchronized void update(Collection segments, String message) { - if (!enabled()) { - return; - } - - // This manager is created right at the beginning of a startup. Before we set it up, - // we'll read tweets and create segments and therefore this method will be called. - // We don't want to start computing query caches during that time, so we just return. - if (!didSetup) { - return; - } - - NUM_UPDATE_SEGMENTS_CALLS.increment(); - - LOG.info("Rescheduling all query cache tasks ({}). Number of segments received = {}.", - message, segments.size()); - updater.clearTasks(); // cancel and remove all scheduled tasks - - // If Earlybird is still starting up, and we get a partition roll, don't delay rebuilding - // the query cache. - boolean isCurrent = EarlybirdStatus.getStatusCode() == EarlybirdStatusCode.CURRENT; - scheduleTasks(segments, isCurrent); - } - - /** - * Determines if all query cache tasks ran at least once (even if they failed). - */ - public synchronized boolean allTasksRan() { - return (!(enabled() && didSetup)) || updater.allTasksRan(); - } - - /** - * Determines if the query cache manager is enabled. - */ - public boolean enabled() { - return enabled; - } - - /** - * Returns the query cache filter with the given name. - */ - public QueryCacheFilter getFilter(String filterName) { - return filters.get(filterName); - } - - /** - * Shuts down the query cache manager. 
- */ - public synchronized void shutdown() throws InterruptedException { - LOG.info("Shutting down QueryCacheManager"); - if (updater != null) { - updater.shutdown(); - updater = null; - } - didSetup = false; // needed for unit test - } - - /** - * After startup, we want only one thread to update the query cache. - */ - public void setWorkerPoolSizeAfterStartup() { - if (this.updater != null) { - this.updater.setWorkerPoolSizeAfterStartup(); - } - } - - public Decider getDecider() { - return this.decider; - } - - ////////////////////////// - // for unit tests only - ////////////////////////// - QueryCacheUpdater getUpdaterForTest() { - return updater; - } - Map getCacheMapForTest() { - return filters; - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.docx new file mode 100644 index 000000000..85c4048ef Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.java deleted file mode 100644 index 5f69f57a5..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheResultCollector.java +++ /dev/null @@ -1,124 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.io.IOException; - -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.util.BitDocIdSet; -import org.apache.lucene.util.BitSet; -import org.apache.lucene.util.FixedBitSet; -import org.apache.lucene.util.SparseFixedBitSet; - -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.core.earlybird.index.QueryCacheResultForSegment; -import com.twitter.search.earlybird.RecentTweetRestriction; -import com.twitter.search.earlybird.search.AbstractResultsCollector; -import com.twitter.search.earlybird.search.SearchRequestInfo; -import com.twitter.search.earlybird.search.SearchResultsInfo; -import com.twitter.search.earlybird.search.queries.SinceUntilFilter; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; - -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - -import static com.twitter.search.core.earlybird.index.TimeMapper.ILLEGAL_TIME; - -/** - * Collector to update the query cache (one segment for a filter) - */ -public class QueryCacheResultCollector - extends AbstractResultsCollector { - private static final int UNSET = -1; - - private final QueryCacheFilter queryCacheFilter; - private final Decider decider; - - private BitSet bitSet; - private long cardinality = 0L; - private int startingDocID = UNSET; - - public QueryCacheResultCollector( - ImmutableSchemaInterface schema, - QueryCacheFilter queryCacheFilter, - EarlybirdSearcherStats searcherStats, - Decider decider, - Clock clock, - int requestDebugMode) { - super(schema, - queryCacheFilter.createSearchRequestInfo(), - clock, - searcherStats, - requestDebugMode); - this.queryCacheFilter = queryCacheFilter; - this.decider = decider; - } - - @Override - public void startSegment() throws IOException { - // The doc IDs in the optimized segments are always in the 0 .. (segmentSize - 1) range, so we - // can use a dense bitset to collect the hits. 
However, unoptimized segments can use any int - // doc IDs, so we have to use a sparse bitset to collect the hits in those segments. - if (currTwitterReader.getSegmentData().isOptimized()) { - switch (queryCacheFilter.getResultSetType()) { - case FixedBitSet: - bitSet = new FixedBitSet(currTwitterReader.maxDoc()); - break; - case SparseFixedBitSet: - bitSet = new SparseFixedBitSet(currTwitterReader.maxDoc()); - break; - default: - throw new IllegalStateException( - "Unknown ResultSetType: " + queryCacheFilter.getResultSetType().name()); - } - } else { - bitSet = new SparseFixedBitSet(currTwitterReader.maxDoc()); - } - - startingDocID = findStartingDocID(); - cardinality = 0; - } - - @Override - protected void doCollect(long tweetID) { - bitSet.set(curDocId); - cardinality++; - } - - @Override - protected SearchResultsInfo doGetResults() { - return new SearchResultsInfo(); - } - - public QueryCacheResultForSegment getCachedResult() { - // Note that BitSet.cardinality takes linear time in the size of the maxDoc, so we track - // cardinality separately. - return new QueryCacheResultForSegment(new BitDocIdSet(bitSet, cardinality), - cardinality, startingDocID); - } - - /** - * We don't want to return results less than 15 seconds older than the most recently indexed tweet, - * as they might not be completely indexed. - * We can't simply use the first hit, as some cached filters might not have any hits, - * e.g. has_engagement in the protected cluster. - * We can't use a clock because streams can lag. - */ - private int findStartingDocID() throws IOException { - int lastTime = currTwitterReader.getSegmentData().getTimeMapper().getLastTime(); - if (lastTime == ILLEGAL_TIME) { - return NO_MORE_DOCS; - } - - int untilTime = RecentTweetRestriction.queryCacheUntilTime(decider, lastTime); - if (untilTime == 0) { - return currTwitterReader.getSmallestDocID(); - } - - return SinceUntilFilter.getUntilQuery(untilTime) - .createWeight(new IndexSearcher(currTwitterReader), ScoreMode.COMPLETE_NO_SCORES, 1.0f) - .scorer(currTwitterReader.getContext()) - .iterator() - .nextDoc(); - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.docx new file mode 100644 index 000000000..ab364a39a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.java deleted file mode 100644 index db2ba8d4b..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdateTask.java +++ /dev/null @@ -1,283 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.io.IOException; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.cache.CacheBuilder; -import com.google.common.cache.CacheLoader; -import com.google.common.cache.LoadingCache; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.metrics.SearchLongGauge; -import com.twitter.search.common.metrics.Timer; -import com.twitter.search.common.search.TerminationTracker; -import 
com.twitter.search.core.earlybird.index.QueryCacheResultForSegment; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.exception.EarlybirdException; -import com.twitter.search.earlybird.index.EarlybirdSegment; -import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.search.SearchResultsInfo; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.util.ScheduledExecutorTask; - -/** - * Each task is responsible for one filter on one segment. We should have a total - * of num_of_filter * num_of_segments tasks - */ -@VisibleForTesting -class QueryCacheUpdateTask extends ScheduledExecutorTask { - private static final Logger LOG = LoggerFactory.getLogger(QueryCacheUpdateTask.class); - - // See OBSERVE-10347 - private static final boolean EXPORT_STATS = - EarlybirdConfig.getBool("export_query_cache_update_task_stats", false); - - private static final LoadingCache TASK_STATS = - CacheBuilder.newBuilder().build(new CacheLoader() { - @Override - public TaskStats load(String statNamePrefix) { - return new TaskStats(statNamePrefix, EXPORT_STATS); - } - }); - - private static final SearchCounter FINISHED_TASKS = SearchCounter.export( - "querycache_finished_tasks"); - - private final QueryCacheFilter filter; - - // Info/data of the segment this task is responsible for - private final SegmentInfo segmentInfo; - - private final UserTable userTable; - - private volatile boolean ranOnce; - private final TaskStats stats; - private Amount lastRunFinishTime; - - // See SEARCH-4346 - private final String filterAndSegment; - - private final Decider decider; - - private static final class TaskStats { - private final SearchLongGauge numHitsStat; - private final SearchLongGauge updateLatencyStat; - private final SearchCounter updateSuccessCountStat; - private final SearchCounter updateFailureCountStat; - - private TaskStats(String statNamePrefix, boolean exportStats) { - // See SEARCH-3698 - numHitsStat = exportStats ? SearchLongGauge.export(statNamePrefix + "numhit") - : new SearchLongGauge(statNamePrefix + "numhit"); - updateLatencyStat = exportStats - ? SearchLongGauge.export(statNamePrefix + "update_latency_ms") - : new SearchLongGauge(statNamePrefix + "update_latency_ms"); - updateSuccessCountStat = exportStats - ? SearchCounter.export(statNamePrefix + "update_success_count") - : SearchCounter.create(statNamePrefix + "update_success_count"); - updateFailureCountStat = exportStats - ? 
SearchCounter.export(statNamePrefix + "update_failure_count") - : SearchCounter.create(statNamePrefix + "update_failure_count"); - } - } - - private final Amount updateInterval; - private final Amount initialDelay; - - private final EarlybirdSearcherStats searcherStats; - private final CriticalExceptionHandler criticalExceptionHandler; - - /** - * Constructor - * @param filter Filter to be used to populate the cache - * @param segmentInfo Segment this task is responsible for - * @param updateInterval Time between successive updates - * @param initialDelay Time before the first update - * @param updateIterationCounter - * @param decider - */ - public QueryCacheUpdateTask(QueryCacheFilter filter, - SegmentInfo segmentInfo, - UserTable userTable, - Amount updateInterval, - Amount initialDelay, - SearchCounter updateIterationCounter, - EarlybirdSearcherStats searcherStats, - Decider decider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - super(updateIterationCounter, clock); - this.filter = filter; - this.segmentInfo = segmentInfo; - this.userTable = userTable; - this.ranOnce = false; - this.updateInterval = updateInterval; - this.initialDelay = initialDelay; - this.stats = setupStats(); - this.filterAndSegment = String.format( - "QueryCacheFilter: %s | Segment: %d", - filter.getFilterName(), segmentInfo.getTimeSliceID()); - this.searcherStats = searcherStats; - this.criticalExceptionHandler = criticalExceptionHandler; - this.decider = decider; - } - - @Override - protected void runOneIteration() { - try { - if (LOG.isDebugEnabled()) { - LOG.debug( - "[{}] Updating with query [{}] for the {} th time.", - filterAndSegment, - filter.getQueryString(), - stats.updateSuccessCountStat.get() + stats.updateFailureCountStat.get() + 1 - ); - if (lastRunFinishTime != null) { - LOG.debug( - "[{}] Last run, {} th time, finished {} secs ago. Should run every {} secs", - filterAndSegment, - stats.updateSuccessCountStat.get() + stats.updateFailureCountStat.get(), - TimeUnit.NANOSECONDS.toSeconds( - System.nanoTime() - lastRunFinishTime.as(Time.NANOSECONDS)), - updateInterval.as(Time.SECONDS) - ); - } - } - - Timer timer = new Timer(TimeUnit.MILLISECONDS); - SearchResultsInfo result = null; - try { - result = update(); - } catch (Exception e) { - String msg = "Failed to update query cache entry [" + filter.getFilterName() - + "] on segment [" + segmentInfo.getTimeSliceID() + "]"; - LOG.warn(msg, e); - } - - long endTime = timer.stop(); - updateStats(result, endTime); - - if (LOG.isDebugEnabled()) { - LOG.debug("[{}] Updated in {} ms, hit {} docs.", - filterAndSegment, endTime, stats.numHitsStat.read()); - } - // Need to catch throwable here instead of exception so we handle errors like OutOfMemory - // See RB=528695 and SEARCH-4402 - } catch (Throwable t) { - String message = String.format("Got unexpected throwable in %s", getClass().getName()); - LOG.error(message, t); - - // Wrap the Throwable in a FatalEarlybirdException to categorize it and ensure it's - // handled as a fatal exception - criticalExceptionHandler.handle(this, - new EarlybirdException(message, t)); - } finally { - // Earlybird won't become CURRENT until all tasks are run at least once. We don't want - // failed "run" (update) to prevent Earlybird from becoming CURRENT. As long as all tasks - // got a chance to run at least once, we are good to go. 
- ranOnce = true; - - lastRunFinishTime = Amount.of(System.nanoTime(), Time.NANOSECONDS); - } - } - - public boolean ranOnce() { - return ranOnce; - } - - private TaskStats setupStats() { - return TASK_STATS.getUnchecked(statNamePrefix()); - } - - private SearchResultsInfo update() throws IOException { - // There's a chance that the EarlybirdSegment of a SegmentInfo to change at any - // time. Therefore, it's not safe to operate segments on the SegmentInfo level. - // On the archive clusters we create a new EarlybirdSegment and then swap it in when there's - // new data instead of appending to an existing EarlybirdSegment. - EarlybirdSegment earlybirdSegment = segmentInfo.getIndexSegment(); - - EarlybirdSingleSegmentSearcher searcher = earlybirdSegment.getSearcher(userTable); - if (searcher == null) { - LOG.warn("Unable to get searcher from TwitterIndexManager for segment [" - + segmentInfo.getTimeSliceID() + "]. Has it been dropped?"); - return null; - } - - QueryCacheResultCollector collector = new QueryCacheResultCollector( - searcher.getSchemaSnapshot(), filter, searcherStats, decider, clock, 0); - searcher.search(filter.getLuceneQuery(), collector); - - QueryCacheResultForSegment cacheResult = collector.getCachedResult(); - searcher.getTwitterIndexReader().getSegmentData().updateQueryCacheResult( - filter.getFilterName(), cacheResult); - - FINISHED_TASKS.increment(); - - if (LOG.isDebugEnabled()) { - TerminationTracker tracker = collector.getSearchRequestInfo().getTerminationTracker(); - LOG.debug( - "[{}] Updating query finished, start time ms is {}, termination reason is {}", - filterAndSegment, - tracker.getLocalStartTimeMillis(), - tracker.getEarlyTerminationState().getTerminationReason()); - } - - return collector.getResults(); - } - - private void updateStats(SearchResultsInfo result, long endTime) { - if (result != null) { - stats.numHitsStat.set(result.getNumHitsProcessed()); - stats.updateSuccessCountStat.increment(); - } else { - stats.updateFailureCountStat.increment(); - } - stats.updateLatencyStat.set(endTime); - } - - @VisibleForTesting - String statNamePrefix() { - // If we use this and try to display in monviz "ts(partition, single_instance, querycache*)", - // the UI shows "Really expensive query" message. We can keep this around for times when we - // want to start things manually and debug. 
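// Illustrative output (hypothetical filter name and timeslice ID): for a filter named
// "exclude_spam" on timeslice 1234567890 this yields "querycache_exclude_spam_1234567890_",
// which the per-task stats in TaskStats then suffix with e.g. "numhit" or "update_latency_ms".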
- return "querycache_" + filter.getFilterName() + "_" + segmentInfo.getTimeSliceID() + "_"; - } - - public long getTimeSliceID() { - return segmentInfo.getTimeSliceID(); - } - - ////////////////////////// - // for unit tests only - ////////////////////////// - @VisibleForTesting - String getFilterNameForTest() { - return filter.getFilterName(); - } - - @VisibleForTesting - Amount getUpdateIntervalForTest() { - return updateInterval; - } - - @VisibleForTesting - Amount getInitialDelayForTest() { - return initialDelay; - } - - @VisibleForTesting - TaskStats getTaskStatsForTest() { - return stats; - } -} diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.docx b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.docx new file mode 100644 index 000000000..d88634950 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.docx differ diff --git a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.java b/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.java deleted file mode 100644 index f76e197ec..000000000 --- a/src/java/com/twitter/search/earlybird/querycache/QueryCacheUpdater.java +++ /dev/null @@ -1,242 +0,0 @@ -package com.twitter.search.earlybird.querycache; - -import java.util.ArrayList; -import java.util.Collection; -import java.util.Iterator; -import java.util.List; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; -import com.google.common.collect.Lists; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.quantity.Amount; -import com.twitter.common.quantity.Time; -import com.twitter.common.util.Clock; -import com.twitter.decider.Decider; -import com.twitter.search.common.concurrent.ScheduledExecutorServiceFactory; -import com.twitter.search.common.metrics.SearchCustomGauge; -import com.twitter.search.common.metrics.SearchStatsReceiver; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.exception.CriticalExceptionHandler; -import com.twitter.search.earlybird.factory.QueryCacheUpdaterScheduledExecutorService; -import com.twitter.search.earlybird.partition.SegmentInfo; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.util.PeriodicActionParams; -import com.twitter.search.earlybird.util.ScheduledExecutorManager; -import com.twitter.search.earlybird.util.ShutdownWaitTimeParams; - -/** - * Class to manage the scheduler service and all the update tasks. Through this - * class, update tasks are created and scheduled, canceled and removed. - * - * This class is not thread-safe. 
- */ -@VisibleForTesting -final class QueryCacheUpdater extends ScheduledExecutorManager { - private static final Logger LOG = LoggerFactory.getLogger(QueryCacheUpdater.class); - - private final List tasks; - private final EarlybirdSearcherStats searcherStats; - private final Decider decider; - private final UserTable userTable; - private final Clock clock; - - @VisibleForTesting - static final class Task { - @VisibleForTesting public final QueryCacheUpdateTask updateTask; - @VisibleForTesting public final ScheduledFuture future; - - private Task(QueryCacheUpdateTask updateTask, ScheduledFuture future) { - this.updateTask = updateTask; - this.future = future; - } - } - - public QueryCacheUpdater(Collection cacheFilters, - ScheduledExecutorServiceFactory updaterScheduledExecutorServiceFactory, - UserTable userTable, - SearchStatsReceiver searchStatsReceiver, - EarlybirdSearcherStats searcherStats, - Decider decider, - CriticalExceptionHandler criticalExceptionHandler, - Clock clock) { - super(updaterScheduledExecutorServiceFactory.build("QueryCacheUpdateThread-%d", true), - ShutdownWaitTimeParams.immediately(), searchStatsReceiver, - criticalExceptionHandler, clock); - Preconditions.checkNotNull(cacheFilters); - Preconditions.checkArgument(getExecutor() instanceof QueryCacheUpdaterScheduledExecutorService, - getExecutor().getClass()); - - this.searcherStats = searcherStats; - this.decider = decider; - this.userTable = userTable; - this.clock = clock; - - shouldLog = false; - // One update task per - tasks = Lists.newArrayListWithCapacity(cacheFilters.size() * 20); - - SearchCustomGauge.export( - "querycache_num_tasks", - tasks::size - ); - } - - /** - * Create an update task and add it to the executor - * - * @param filter The filter the task should execute - * @param segmentInfo The segment that this task would be responsible for - * @param updateInterval time in milliseconds between successive updates - * @param initialDelay Introduce a delay when adding the task to the executor - */ - void addTask(QueryCacheFilter filter, SegmentInfo segmentInfo, - Amount updateInterval, Amount initialDelay) { - String filterName = filter.getFilterName(); - String query = filter.getQueryString(); - - // Create the task. - QueryCacheUpdateTask qcTask = new QueryCacheUpdateTask( - filter, - segmentInfo, - userTable, - updateInterval, - initialDelay, - getIterationCounter(), - searcherStats, - decider, - criticalExceptionHandler, - clock); - - long initialDelayAsMS = initialDelay.as(Time.MILLISECONDS); - long updateIntervalAsMS = updateInterval.as(Time.MILLISECONDS); - Preconditions.checkArgument( - initialDelayAsMS >= initialDelay.getValue(), "initial delay unit granularity too small"); - Preconditions.checkArgument( - updateIntervalAsMS >= updateInterval.getValue(), - "update interval unit granularity too small"); - - // Schedule the task. - ScheduledFuture future = scheduleNewTask(qcTask, - PeriodicActionParams.withIntialWaitAndFixedDelay( - initialDelayAsMS, updateIntervalAsMS, TimeUnit.MILLISECONDS - ) - ); - - tasks.add(new Task(qcTask, future)); - - LOG.debug("Added a task for filter [" + filterName - + "] for segment [" + segmentInfo.getTimeSliceID() - + "] with query [" + query - + "] update interval " + updateInterval + " " - + (initialDelay.getValue() == 0 ? 
"without" : "with " + initialDelay) - + " initial delay"); - - } - - void removeAllTasksForSegment(SegmentInfo segmentInfo) { - int removedTasksCount = 0; - for (Iterator it = tasks.iterator(); it.hasNext();) { - Task task = it.next(); - if (task.updateTask.getTimeSliceID() == segmentInfo.getTimeSliceID()) { - task.future.cancel(true); - it.remove(); - removedTasksCount += 1; - } - } - - LOG.info("Removed {} update tasks for segment {}.", removedTasksCount, - segmentInfo.getTimeSliceID()); - } - - public void clearTasks() { - int totalTasks = tasks.size(); - LOG.info("Removing {} update tasks for all segments.", totalTasks); - for (Task task : tasks) { - task.future.cancel(true); - } - tasks.clear(); - LOG.info("Canceled {} QueryCache update tasks", totalTasks); - } - - // Have all tasks run at least once (even if they failed)? - public boolean allTasksRan() { - boolean allTasksRan = true; - for (Task task : tasks) { - if (!task.updateTask.ranOnce()) { - allTasksRan = false; - break; - } - } - - return allTasksRan; - } - - // Have all tasks for this run at least once (even if they failed)? - public boolean allTasksRanForSegment(SegmentInfo segmentInfo) { - boolean allTasksRanForSegment = true; - for (Task task : tasks) { - if ((task.updateTask.getTimeSliceID() == segmentInfo.getTimeSliceID()) - && !task.updateTask.ranOnce()) { - allTasksRanForSegment = false; - break; - } - } - - return allTasksRanForSegment; - } - - /** - * After startup, we want only one thread to update the query cache. - */ - void setWorkerPoolSizeAfterStartup() { - QueryCacheUpdaterScheduledExecutorService executor = - (QueryCacheUpdaterScheduledExecutorService) getExecutor(); - executor.setWorkerPoolSizeAfterStartup(); - LOG.info("Done setting executor core pool size to one"); - } - - @Override - protected void shutdownComponent() { - clearTasks(); - } - - ////////////////////////// - // for unit tests only - ////////////////////////// - - /** - * Returns the list of all query cache updater tasks. This method should be used only in tests. 
- */ - @VisibleForTesting - List getTasksForTest() { - synchronized (tasks) { - return new ArrayList<>(tasks); - } - } - - @VisibleForTesting - int getTasksSize() { - synchronized (tasks) { - return tasks.size(); - } - } - - @VisibleForTesting - boolean tasksContains(Task task) { - synchronized (tasks) { - return tasks.contains(task); - } - } - - @VisibleForTesting - public ScheduledExecutorService getExecutorForTest() { - return getExecutor(); - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.docx b/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.docx new file mode 100644 index 000000000..1758353fa Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.java b/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.java deleted file mode 100644 index b42ce5cb9..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/DetectAntisocialVisitor.java +++ /dev/null @@ -1,131 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import com.twitter.search.common.constants.QueryCacheConstants; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import com.twitter.search.queryparser.query.search.SearchQueryVisitor; - -/** - * Visitor to detect presence of any antisocial / spam operator in a Query. - * Visitor returns true if any operators it detects were found. - */ -public class DetectAntisocialVisitor extends SearchQueryVisitor { - // True if the query contains any operator to include antisocial tweets. - private boolean includeAntisocial = false; - - // True if the query contains any operator to exclude antisocial/spam tweets. - private boolean excludeAntisocial = false; - - // True if the query contains an antisocial tweets filter. - private boolean filterAntisocial = false; - - public boolean hasIncludeAntisocial() { - return includeAntisocial; - } - - public boolean hasExcludeAntisocial() { - return excludeAntisocial; - } - - public boolean hasFilterAntisocial() { - return filterAntisocial; - } - - public boolean hasAnyAntisocialOperator() { - // Top tweets is considered an antisocial operator due to scoring also excluding - // spam tweets. 
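The flag logic in the visit(SearchOperator) switch below reduces to a small truth table: a negated include behaves like an exclude, a negated exclude behaves like an include, a negated filter behaves like an exclude, and the cached exclude-spam/antisocial filters always set the exclude flag. A compact, self-contained restatement of that mapping follows; the class and method names are hypothetical, not the removed visitor.

    // Hypothetical restatement of the antisocial include/exclude/filter flag
    // rules applied by the visit(SearchOperator) switch below.
    final class AntisocialFlags {
      boolean includeAntisocial;
      boolean excludeAntisocial;
      boolean filterAntisocial;

      enum OpType { INCLUDE, EXCLUDE, FILTER }

      void apply(OpType type, boolean negated) {
        switch (type) {
          case INCLUDE:
            if (negated) { excludeAntisocial = true; } else { includeAntisocial = true; }
            break;
          case EXCLUDE:
            if (negated) { includeAntisocial = true; } else { excludeAntisocial = true; }
            break;
          case FILTER:
            if (negated) { excludeAntisocial = true; } else { filterAntisocial = true; }
            break;
        }
      }

      boolean hasAnyAntisocialOperator() {
        return includeAntisocial || excludeAntisocial || filterAntisocial;
      }
    }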
- return hasIncludeAntisocial() || hasExcludeAntisocial() || hasFilterAntisocial(); - } - - @Override public Boolean visit(Disjunction disjunction) throws QueryParserException { - boolean found = false; - for (com.twitter.search.queryparser.query.Query node : disjunction.getChildren()) { - if (node.accept(this)) { - found = true; - } - } - return found; - } - - @Override public Boolean visit(Conjunction conjunction) throws QueryParserException { - boolean found = false; - for (com.twitter.search.queryparser.query.Query node : conjunction.getChildren()) { - if (node.accept(this)) { - found = true; - } - } - return found; - } - - @Override public Boolean visit(SearchOperator operator) throws QueryParserException { - boolean found = false; - switch (operator.getOperatorType()) { - case INCLUDE: - if (SearchOperatorConstants.ANTISOCIAL.equals(operator.getOperand())) { - if (operator.mustNotOccur()) { - excludeAntisocial = true; - } else { - includeAntisocial = true; - } - found = true; - } - break; - case EXCLUDE: - if (SearchOperatorConstants.ANTISOCIAL.equals(operator.getOperand())) { - if (operator.mustNotOccur()) { - includeAntisocial = true; - } else { - excludeAntisocial = true; - } - found = true; - } - break; - case FILTER: - if (SearchOperatorConstants.ANTISOCIAL.equals(operator.getOperand())) { - if (operator.mustNotOccur()) { - excludeAntisocial = true; - } else { - filterAntisocial = true; - } - found = true; - } - break; - case CACHED_FILTER: - if (QueryCacheConstants.EXCLUDE_SPAM.equals(operator.getOperand()) - || QueryCacheConstants.EXCLUDE_SPAM_AND_NATIVERETWEETS.equals(operator.getOperand()) - || QueryCacheConstants.EXCLUDE_ANTISOCIAL.equals(operator.getOperand()) - || QueryCacheConstants.EXCLUDE_ANTISOCIAL_AND_NATIVERETWEETS - .equals(operator.getOperand())) { - - excludeAntisocial = true; - found = true; - } - break; - default: - break; - } - - return found; - } - - @Override - public Boolean visit(SpecialTerm special) throws QueryParserException { - return false; - } - - @Override - public Boolean visit(Phrase phrase) throws QueryParserException { - return false; - } - - @Override - public Boolean visit(Term term) throws QueryParserException { - return false; - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.docx b/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.docx new file mode 100644 index 000000000..270cfb2ca Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.java b/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.java deleted file mode 100644 index 5c565ce91..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/DetectFieldAnnotationVisitor.java +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.Set; - -import com.google.common.collect.ImmutableSet; - -import com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Operator; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.QueryVisitor; -import 
com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.annotation.FieldNameWithBoost; - -/** - * Detects whether the query tree has certain field annotations. - */ -public class DetectFieldAnnotationVisitor extends QueryVisitor { - private final ImmutableSet fieldNames; - - /** - * This visitor will return true if the query tree has a FIELD annotation with any of the given - * field names. If the set is empty, any FIELD annotation will match. - */ - public DetectFieldAnnotationVisitor(Set fieldNames) { - this.fieldNames = ImmutableSet.copyOf(fieldNames); - } - - /** - * This visitor will return true if the query tree has a FIELD annotation. - */ - public DetectFieldAnnotationVisitor() { - this.fieldNames = ImmutableSet.of(); - } - - @Override - public Boolean visit(Disjunction disjunction) throws QueryParserException { - return visitQuery(disjunction) || visitBooleanQuery(disjunction); - } - - @Override - public Boolean visit(Conjunction conjunction) throws QueryParserException { - return visitQuery(conjunction) || visitBooleanQuery(conjunction); - } - - @Override - public Boolean visit(Phrase phrase) throws QueryParserException { - return visitQuery(phrase); - } - - @Override - public Boolean visit(Term term) throws QueryParserException { - return visitQuery(term); - } - - @Override - public Boolean visit(Operator operator) throws QueryParserException { - return visitQuery(operator); - } - - @Override - public Boolean visit(SpecialTerm special) throws QueryParserException { - return visitQuery(special); - } - - private Boolean visitQuery(Query query) throws QueryParserException { - if (query.hasAnnotations()) { - for (Annotation annotation : query.getAnnotations()) { - if (!Annotation.Type.FIELD.equals(annotation.getType())) { - continue; - } - if (fieldNames.isEmpty()) { - return true; - } - FieldNameWithBoost value = (FieldNameWithBoost) annotation.getValue(); - if (fieldNames.contains(value.getFieldName())) { - return true; - } - } - } - - return false; - } - - private boolean visitBooleanQuery(BooleanQuery query) throws QueryParserException { - for (Query subQuery : query.getChildren()) { - if (subQuery.accept(this)) { - return true; - } - } - - return false; - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.docx b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.docx new file mode 100644 index 000000000..e2f711ec9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.java b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.java deleted file mode 100644 index d78e7d8b1..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdLuceneQueryVisitor.java +++ /dev/null @@ -1,1781 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.Arrays; -import java.util.Collection; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; -import javax.annotation.Nullable; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Functions; -import com.google.common.base.Optional; -import 
com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - -import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.BooleanClause.Occur; -import org.apache.lucene.search.BooleanQuery; -import org.apache.lucene.search.BoostQuery; -import org.apache.lucene.search.MatchNoDocsQuery; -import org.apache.lucene.search.PhraseQuery; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.TermQuery; -import org.locationtech.spatial4j.shape.Point; -import org.locationtech.spatial4j.shape.Rectangle; -import org.locationtech.spatial4j.shape.impl.PointImpl; -import org.locationtech.spatial4j.shape.impl.RectangleImpl; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.decider.Decider; -import com.twitter.search.common.constants.QueryCacheConstants; -import com.twitter.search.common.decider.DeciderUtil; -import com.twitter.search.common.encoding.features.ByteNormalizer; -import com.twitter.search.common.indexing.thriftjava.ThriftGeoLocationSource; -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.query.BoostUtils; -import com.twitter.search.common.query.FieldWeightUtil; -import com.twitter.search.common.query.FilteredQuery; -import com.twitter.search.common.query.HitAttributeHelper; -import com.twitter.search.common.query.MappableField; -import com.twitter.search.common.schema.ImmutableSchema; -import com.twitter.search.common.schema.SchemaUtil; -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentBuilder; -import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil; -import com.twitter.search.common.schema.thriftjava.ThriftCSFType; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.common.search.termination.QueryTimeout; -import com.twitter.search.common.util.analysis.IntTermAttributeImpl; -import com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.common.util.spatial.GeohashChunkImpl; -import com.twitter.search.common.util.text.HighFrequencyTermPairs; -import com.twitter.search.common.util.text.NormalizerHelper; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.querycache.CachedFilterQuery; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.earlybird.search.queries.CSFDisjunctionFilter; -import com.twitter.search.earlybird.search.queries.DocValRangeFilter; -import com.twitter.search.earlybird.search.queries.FeatureValueInAcceptListOrUnsetFilter; -import com.twitter.search.earlybird.search.GeoQuadTreeQueryBuilder; -import com.twitter.search.earlybird.search.queries.MatchAllDocsQuery; 
-import com.twitter.search.earlybird.search.queries.RequiredStatusIDsFilter; -import com.twitter.search.earlybird.search.queries.SinceMaxIDFilter; -import com.twitter.search.earlybird.search.queries.SinceUntilFilter; -import com.twitter.search.earlybird.search.queries.TermQueryWithSafeToString; -import com.twitter.search.earlybird.search.queries.UserFlagsExcludeFilter; -import com.twitter.search.earlybird.search.queries.UserScrubGeoFilter; -import com.twitter.search.earlybird.search.queries.UserIdMultiSegmentQuery; -import com.twitter.search.earlybird.search.relevance.MinFeatureValueFilter; -import com.twitter.search.earlybird.search.relevance.ScoreFilterQuery; -import com.twitter.search.earlybird.search.relevance.scoring.ScoringFunctionProvider; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.QueryNodeUtils; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.annotation.FloatAnnotation; -import com.twitter.search.queryparser.query.search.Link; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; -import com.twitter.search.queryparser.query.search.SearchQueryVisitor; -import com.twitter.search.queryparser.util.GeoCode; -import com.twitter.service.spiderduck.gen.LinkCategory; -import com.twitter.tweetypie.thriftjava.ComposerSource; - -/** - * Visitor for {@link com.twitter.search.queryparser.query.Query}, which produces a Lucene - * Query ({@link Query}). 
- */ -public class EarlybirdLuceneQueryVisitor extends SearchQueryVisitor { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdLuceneQueryVisitor.class); - - @VisibleForTesting - static final String UNSUPPORTED_OPERATOR_PREFIX = "unsupported_query_operator_"; - - private static final String SMILEY_FORMAT_STRING = "__has_%s_smiley"; - private static final String PHRASE_WILDCARD = "*"; - private static final float DEFAULT_FIELD_WEIGHT = 1.0f; - - private static final SearchCounter SINCE_TIME_INVALID_INT_COUNTER = - SearchCounter.export("EarlybirdLuceneQueryVisitor_since_time_invalid_int"); - private static final SearchCounter UNTIL_TIME_INVALID_INT_COUNTER = - SearchCounter.export("EarlybirdLuceneQueryVisitor_until_time_invalid_int"); - - private static final SearchCounter NUM_QUERIES_BELOW_MIN_ENGAGEMENT_THRESHOLD = - SearchCounter.export( - "EarlybirdLuceneQueryVisitor_num_queries_below_min_engagement_threshold"); - private static final SearchCounter NUM_QUERIES_ABOVE_MIN_ENGAGEMENT_THRESHOLD = - SearchCounter.export( - "EarlybirdLuceneQueryVisitor_num_queries_above_min_engagement_threshold"); - - private static final SearchOperator OPERATOR_CACHED_EXCLUDE_ANTISOCIAL_AND_NATIVERETWEETS = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_ANTISOCIAL_AND_NATIVERETWEETS); - - private static final Map> OPERATORS_BY_SAFE_EXCLUDE_OPERAND = - ImmutableMap.of( - SearchOperatorConstants.TWEET_SPAM, ImmutableList.of( - new SearchOperator(SearchOperator.Type.DOCVAL_RANGE_FILTER, - "extended_encoded_tweet_features.label_spam_flag", "0", "1"), - new SearchOperator(SearchOperator.Type.DOCVAL_RANGE_FILTER, - "extended_encoded_tweet_features.label_spam_hi_rcl_flag", "0", "1"), - new SearchOperator(SearchOperator.Type.DOCVAL_RANGE_FILTER, - "extended_encoded_tweet_features.label_dup_content_flag", "0", "1")), - - SearchOperatorConstants.TWEET_ABUSIVE, ImmutableList.of( - new SearchOperator(SearchOperator.Type.DOCVAL_RANGE_FILTER, - "extended_encoded_tweet_features.label_abusive_flag", "0", "1")), - - SearchOperatorConstants.TWEET_UNSAFE, ImmutableList.of( - new SearchOperator(SearchOperator.Type.DOCVAL_RANGE_FILTER, - "extended_encoded_tweet_features.label_nsfw_hi_prc_flag", "0", "1")) - ); - - private static final ImmutableMap DEFAULT_FIELDS = - ImmutableMap.of(EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - new FieldWeightDefault(true, DEFAULT_FIELD_WEIGHT)); - - // All Earlybird fields that should have geo scrubbed tweets filtered out when searched. - // See go/realtime-geo-filtering - @VisibleForTesting - public static final List GEO_FIELDS_TO_BE_SCRUBBED = Arrays.asList( - EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), - EarlybirdFieldConstant.PLACE_FIELD.getFieldName(), - EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName(), - EarlybirdFieldConstant.PLACE_FULL_NAME_FIELD.getFieldName(), - EarlybirdFieldConstant.PLACE_COUNTRY_CODE_FIELD.getFieldName()); - - // Geo scrubbing doesn't remove user profile location, so when using the geo location type filters - // we only need to filter out geo scrubbed tweets for the geo location types other than - // ThriftGeoLocationSource.USER_PROFILE. - // Separately, we also need to filter out geo scrubbed tweets for the place_id filter. 
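Both the field list above and the filter list that follows feed one decision: should a term query against a geo field be wrapped in the user-scrub-geo filter (see isGeoFieldThatShouldBeScrubbed further down in this hunk)? Below is a self-contained sketch of that predicate; the literal field and filter strings are illustrative placeholders, not the exact EarlybirdFieldConstant values.

    import java.util.Set;

    // Illustrative sketch of the geo-scrub decision: internal-field filters are
    // checked against the filter list, everything else against the field list.
    final class GeoScrubCheck {
      static final String INTERNAL_FIELD = "internal";  // placeholder name
      static final Set<String> GEO_FIELDS_TO_BE_SCRUBBED =
          Set.of("geo_hash", "place", "place_id", "place_full_name", "place_country_code");
      static final Set<String> GEO_FILTERS_TO_BE_SCRUBBED =
          Set.of("geotag_source", "tweet_text_source", "place_id_filter");  // placeholder values

      static boolean shouldScrub(String field, String termText) {
        if (INTERNAL_FIELD.equals(field)) {
          // The internal field carries the place_id filter and the geo location
          // type filters; only some of those must exclude geo-scrubbed tweets.
          return GEO_FILTERS_TO_BE_SCRUBBED.contains(termText);
        }
        return GEO_FIELDS_TO_BE_SCRUBBED.contains(field);
      }
    }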
- private static final List GEO_FILTERS_TO_BE_SCRUBBED = Arrays.asList( - EarlybirdFieldConstants.formatGeoType(ThriftGeoLocationSource.GEOTAG), - EarlybirdFieldConstants.formatGeoType(ThriftGeoLocationSource.TWEET_TEXT), - EarlybirdThriftDocumentUtil.formatFilter( - EarlybirdFieldConstant.PLACE_ID_FIELD.getFieldName())); - - // queries whose parents are negated. - // used to decide if a negated query is within a negated parent or not. - private final Set parentNegatedQueries = - Sets.newIdentityHashSet(); - - private final ImmutableSchemaInterface schemaSnapshot; - private final ImmutableMap defaultFieldWeightMap; - private final QueryCacheManager queryCacheManager; - private final UserTable userTable; - private final UserScrubGeoMap userScrubGeoMap; - - @Nullable - private final TerminationTracker terminationTracker; - private final Map mappableFieldMap; - private final MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager; - private final Decider decider; - private final EarlybirdCluster earlybirdCluster; - - private float proximityPhraseWeight = 1.0f; - private int proximityPhraseSlop = 255; - private ImmutableMap enabledFieldWeightMap; - private Set queriedFields; - - // If we need to accumulate and collect per-field and per query node hit attribution information, - // this will have a mapping between the query nodes and their unique ranks, as well as the - // attribute collector. - @Nullable - private HitAttributeHelper hitAttributeHelper; - - @Nullable - private QueryTimeout queryTimeout; - - public EarlybirdLuceneQueryVisitor( - ImmutableSchemaInterface schemaSnapshot, - QueryCacheManager queryCacheManager, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - EarlybirdCluster earlybirdCluster, - Decider decider) { - this(schemaSnapshot, queryCacheManager, userTable, userScrubGeoMap, null, DEFAULT_FIELDS, - Collections.emptyMap(), null, decider, earlybirdCluster, null); - } - - public EarlybirdLuceneQueryVisitor( - ImmutableSchemaInterface schemaSnapshot, - QueryCacheManager queryCacheManager, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - @Nullable TerminationTracker terminationTracker, - Map fieldWeightMap, - Map mappableFieldMap, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - Decider decider, - EarlybirdCluster earlybirdCluster, - QueryTimeout queryTimeout) { - this.schemaSnapshot = schemaSnapshot; - this.defaultFieldWeightMap = ImmutableMap.copyOf(fieldWeightMap); - this.enabledFieldWeightMap = FieldWeightDefault.getOnlyEnabled(defaultFieldWeightMap); - this.queryCacheManager = queryCacheManager; - this.userTable = userTable; - this.userScrubGeoMap = userScrubGeoMap; - this.mappableFieldMap = Preconditions.checkNotNull(mappableFieldMap); - this.terminationTracker = terminationTracker; - this.multiSegmentTermDictionaryManager = multiSegmentTermDictionaryManager; - this.decider = decider; - this.earlybirdCluster = earlybirdCluster; - this.queryTimeout = queryTimeout; - this.queriedFields = new TreeSet<>(); - } - - public ImmutableMap getEnabledFieldWeightMap() { - return enabledFieldWeightMap; - } - - public ImmutableMap getDefaultFieldWeightMap() { - return defaultFieldWeightMap; - } - - public EarlybirdLuceneQueryVisitor setProximityPhraseWeight(float weight) { - this.proximityPhraseWeight = weight; - return this; - } - - public EarlybirdLuceneQueryVisitor setProximityPhraseSlop(int slop) { - this.proximityPhraseSlop = slop; - return this; - } - - public void setFieldHitAttributeHelper(HitAttributeHelper 
newHitAttributeHelper) { - this.hitAttributeHelper = newHitAttributeHelper; - } - - @Override - public final Query visit(Disjunction disjunction) throws QueryParserException { - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - List children = disjunction.getChildren(); - // Do a final round of check, if all nodes under a disjunction are MUST, - // treat them all as DEFAULT (SHOULD in Lucene). - boolean allMust = true; - for (com.twitter.search.queryparser.query.Query child : children) { - if (!child.mustOccur()) { - allMust = false; - break; - } - } - if (allMust) { - children = Lists.transform(children, QueryNodeUtils.MAKE_QUERY_DEFAULT); - } - // Actually converting all children now. - for (com.twitter.search.queryparser.query.Query child : children) { - final Query q = child.accept(this); - if (q != null) { - // if a node is marked with MUSTHAVE annotation, we set it to must even if it's a - // disjunction. - if (child.mustOccur()) { - bqBuilder.add(q, Occur.MUST); - } else { - bqBuilder.add(q, Occur.SHOULD); - } - } - } - - Query bq = bqBuilder.build(); - float boost = (float) getBoostFromAnnotations(disjunction.getAnnotations()); - if (boost >= 0) { - bq = BoostUtils.maybeWrapInBoostQuery(bq, boost); - } - return bq; - } - - @Override - public Query visit(Conjunction conjunction) throws QueryParserException { - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - List children = conjunction.getChildren(); - boolean hasPositiveTerms = false; - for (com.twitter.search.queryparser.query.Query child : children) { - boolean childMustNotOccur = child.mustNotOccur(); - boolean childAdded = addQuery(bqBuilder, child); - if (childAdded && !childMustNotOccur) { - hasPositiveTerms = true; - } - } - if (!children.isEmpty() && !hasPositiveTerms) { - bqBuilder.add(new MatchAllDocsQuery(), Occur.MUST); - } - - Query bq = bqBuilder.build(); - float boost = (float) getBoostFromAnnotations(conjunction.getAnnotations()); - if (boost >= 0) { - bq = BoostUtils.maybeWrapInBoostQuery(bq, boost); - } - return bq; - } - - @Override - public Query visit(Phrase phrase) throws QueryParserException { - return visit(phrase, false); - } - - @Override - public Query visit(Term term) throws QueryParserException { - return finalizeQuery(createTermQueryDisjunction(term), term); - } - - @Override - public Query visit(SpecialTerm special) throws QueryParserException { - String field; - - switch (special.getType()) { - case HASHTAG: - field = EarlybirdFieldConstant.HASHTAGS_FIELD.getFieldName(); - break; - case STOCK: - field = EarlybirdFieldConstant.STOCKS_FIELD.getFieldName(); - break; - case MENTION: - field = EarlybirdFieldConstant.MENTIONS_FIELD.getFieldName(); - break; - default: - field = EarlybirdFieldConstant.TEXT_FIELD.getFieldName(); - } - - String termText = special.getSpecialChar() + special.getValue(); - Query q = createSimpleTermQuery(special, field, termText); - - float boost = (float) getBoostFromAnnotations(special.getAnnotations()); - if (boost >= 0) { - q = BoostUtils.maybeWrapInBoostQuery(q, boost); - } - - return negateQueryIfNodeNegated(special, q); - } - - @Override - public Query visit(Link link) throws QueryParserException { - Query q = createSimpleTermQuery( - link, EarlybirdFieldConstant.LINKS_FIELD.getFieldName(), link.getOperand()); - - float boost = (float) getBoostFromAnnotations(link.getAnnotations()); - if (boost >= 0) { - q = BoostUtils.maybeWrapInBoostQuery(q, boost); - } - - return negateQueryIfNodeNegated(link, q); - } - - @Override - public Query visit(final 
SearchOperator op) throws QueryParserException { - final Query query; - SearchOperator.Type type = op.getOperatorType(); - - switch (type) { - case TO: - query = visitToOperator(op); - break; - - case FROM: - query = visitFromOperator(op); - break; - - case FILTER: - query = visitFilterOperator(op); - break; - - case INCLUDE: - query = visitIncludeOperator(op); - break; - - case EXCLUDE: - query = visitExcludeOperator(op); - break; - - case LANG: - query = visitLangOperator(op); - break; - - case SOURCE: - query = visitSourceOperator(op); - break; - - case SMILEY: - query = visitSmileyOperator(op); - break; - - case DOCVAL_RANGE_FILTER: - query = visitDocValRangeFilterOperator(op); - break; - - case CACHED_FILTER: - query = visitCachedFilterOperator(op); - break; - - case SCORE_FILTER: - query = visitScoredFilterOperator(op); - break; - - case SINCE_TIME: - query = visitSinceTimeOperator(op); - break; - - case UNTIL_TIME: - query = visitUntilTimeOperator(op); - break; - - case SINCE_ID: - query = visitSinceIDOperator(op); - break; - - case MAX_ID: - query = visitMaxIDOperator(op); - break; - - case GEOLOCATION_TYPE: - query = visitGeoLocationTypeOperator(op); - break; - - case GEOCODE: - query = visitGeocodeOperator(op); - break; - - case GEO_BOUNDING_BOX: - query = visitGeoBoundingBoxOperator(op); - break; - - case PLACE: - query = visitPlaceOperator(op); - break; - - case LINK: - // This should never be called - the Link visitor (visitor(Link link)) should be. - query = visitLinkOperator(op); - break; - - case ENTITY_ID: - query = visitEntityIdOperator(op); - break; - - case FROM_USER_ID: - query = visitFromUserIDOperator(op); - break; - - case IN_REPLY_TO_TWEET_ID: - query = visitInReplyToTweetIdOperator(op); - break; - - case IN_REPLY_TO_USER_ID: - query = visitInReplyToUserIdOperator(op); - break; - - case LIKED_BY_USER_ID: - query = visitLikedByUserIdOperator(op); - break; - - case RETWEETED_BY_USER_ID: - query = visitRetweetedByUserIdOperator(op); - break; - - case REPLIED_TO_BY_USER_ID: - query = visitRepliedToByUserIdOperator(op); - break; - - case QUOTED_USER_ID: - query = visitQuotedUserIdOperator(op); - break; - - case QUOTED_TWEET_ID: - query = visitQuotedTweetIdOperator(op); - break; - - case DIRECTED_AT_USER_ID: - query = visitDirectedAtUserIdOperator(op); - break; - - case CONVERSATION_ID: - query = visitConversationIdOperator(op); - break; - - case COMPOSER_SOURCE: - query = visitComposerSourceOperator(op); - break; - - case RETWEETS_OF_TWEET_ID: - query = visitRetweetsOfTweetIdOperator(op); - break; - - case RETWEETS_OF_USER_ID: - query = visitRetweetsOfUserIdOperator(op); - break; - - case LINK_CATEGORY: - query = visitLinkCategoryOperator(op); - break; - - case CARD_NAME: - query = visitCardNameOperator(op); - break; - - case CARD_DOMAIN: - query = visitCardDomainOperator(op); - break; - - case CARD_LANG: - query = visitCardLangOperator(op); - break; - - case HF_TERM_PAIR: - query = visitHFTermPairOperator(op); - break; - - case HF_PHRASE_PAIR: - query = visitHFTermPhrasePairOperator(op); - break; - - case PROXIMITY_GROUP: - Phrase phrase = new Phrase( - Lists.transform(op.getOperands(), - s -> NormalizerHelper.normalizeWithUnknownLocale( - s, EarlybirdConfig.getPenguinVersion()))); - - query = visit(phrase, true); - break; - - case MULTI_TERM_DISJUNCTION: - query = visitMultiTermDisjunction(op); - break; - - case CSF_DISJUNCTION_FILTER: - query = visitCSFDisjunctionFilter(op); - break; - - case SAFETY_EXCLUDE: - query = visitSafetyExclude(op); - break; - - case 
SPACE_ID: - query = visitSpaceId(op); - break; - - case NAMED_ENTITY: - query = visitNamedEntity(op); - break; - - case NAMED_ENTITY_WITH_TYPE: - query = visitNamedEntityWithType(op); - break; - - case MIN_FAVES: - case MIN_QUALITY_SCORE: - case MIN_REPLIES: - case MIN_RETWEETS: - case MIN_REPUTATION: - query = visitMinFeatureValueOperator(type, op); - break; - - case FEATURE_VALUE_IN_ACCEPT_LIST_OR_UNSET: - query = visitFeatureValueInAcceptListOrUnsetFilterOperator(op); - break; - - case NEAR: - case RELATED_TO_TWEET_ID: - case SINCE: - case SITE: - case UNTIL: - case WITHIN: - case WITHIN_TIME: - query = createUnsupportedOperatorQuery(op); - break; - - case NAMED_CSF_DISJUNCTION_FILTER: - case NAMED_MULTI_TERM_DISJUNCTION: - query = logAndThrowQueryParserException( - "Named disjunction operator could not be converted to a disjunction operator."); - break; - - default: - query = logAndThrowQueryParserException("Unknown operator " + op.toString()); - } - - return negateQueryIfNodeNegated(op, query); - } - - protected Query visitToOperator(SearchOperator op) throws QueryParserException { - return createNormalizedTermQuery( - op, EarlybirdFieldConstant.TO_USER_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitFromOperator(SearchOperator op) throws QueryParserException { - return createNormalizedTermQuery( - op, EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitFilterOperator(SearchOperator op) throws QueryParserException { - return visitFilterOperator(op, false); - } - - protected Query visitIncludeOperator(SearchOperator op) throws QueryParserException { - // Include is a bit funny. If we have [include retweets] we are saying - // do include retweets, which is the default. Also conjunctions re-negate - // whatever node we emit from the visitor. - if (!isParentNegated(op) && !nodeIsNegated(op)) { - // positive include - no-op. - return null; - } - return visitFilterOperator(op, false); - } - - protected Query visitExcludeOperator(SearchOperator op) throws QueryParserException { - // Exclude is a bit funny. If we have -[exclude retweets] we are saying - // dont exclude retweets, which is the default. - if (isParentNegated(op) || nodeIsNegated(op)) { - // Negative exclude. Do nothing - parent will not add this to the list of children. - return null; - } else { - // Positive exclude. - return visitFilterOperator(op, true); - } - } - - protected Query visitFilterOperator(SearchOperator op, boolean negate) - throws QueryParserException { - Query q; - boolean negateQuery = negate; - - if (op.getOperand().equals(SearchOperatorConstants.ANTISOCIAL)) { - // Since the object we use to implement these filters is actually an - // EXCLUDE filter, we need to negate it to get it to work as a regular filter. 
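The include/exclude handling above is a pair of double-negation rules: [include x] and -[exclude x] are no-ops (the default behaviour already includes x), while -[include x] and [exclude x] both end up as a filter, with the parent conjunction re-applying the negation where needed. A tiny hypothetical sketch of that decision; the real methods return Lucene queries or null rather than enum labels.

    // Hypothetical sketch of visitIncludeOperator / visitExcludeOperator above.
    // NO_OP corresponds to returning null so the parent drops the child.
    final class IncludeExcludeRules {
      enum Kind { INCLUDE, EXCLUDE }
      enum Decision { NO_OP, EMIT_FILTER, EMIT_NEGATED_FILTER }

      static Decision decide(Kind kind, boolean negatedInQuery) {
        if (kind == Kind.INCLUDE) {
          // [include x] is already the default; only -[include x] emits a filter
          // (which the parent then negates).
          return negatedInQuery ? Decision.EMIT_FILTER : Decision.NO_OP;
        }
        // -[exclude x] cancels out; only [exclude x] emits a negated filter.
        return negatedInQuery ? Decision.NO_OP : Decision.EMIT_NEGATED_FILTER;
      }
    }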
- q = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(userTable, true, false, false); - negateQuery = !negateQuery; - } else if (op.getOperand().equals(SearchOperatorConstants.OFFENSIVE_USER)) { - q = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(userTable, false, true, false); - negateQuery = !negateQuery; - } else if (op.getOperand().equals(SearchOperatorConstants.ANTISOCIAL_OFFENSIVE_USER)) { - q = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(userTable, true, true, false); - negateQuery = !negateQuery; - } else if (op.getOperand().equals(SearchOperatorConstants.PROTECTED)) { - q = UserFlagsExcludeFilter.getUserFlagsExcludeFilter(userTable, false, false, true); - negateQuery = !negateQuery; - } else if (op.getOperand().equals(SearchOperatorConstants.HAS_ENGAGEMENT)) { - return buildHasEngagementsQuery(); - } else if (op.getOperand().equals(SearchOperatorConstants.SAFE_SEARCH_FILTER)) { - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - bqBuilder.add( - createNoScoreTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.IS_OFFENSIVE), - Occur.SHOULD); - - // The following internal field __filter_sensitive_content - // is not currently built by earlybird. - // This means the safe search filter soley operates on the is_offensive bit - bqBuilder.add( - createNoScoreTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdThriftDocumentUtil.formatFilter(SearchOperatorConstants.SENSITIVE_CONTENT)), - Occur.SHOULD); - q = bqBuilder.build(); - negateQuery = !negateQuery; - } else if (op.getOperand().equals(SearchOperatorConstants.RETWEETS)) { - // Special case for filter:retweets - we use the text field search "-rt" - // mostly for legacy reasons. - q = createSimpleTermQuery( - op, - EarlybirdFieldConstant.TEXT_FIELD.getFieldName(), - EarlybirdThriftDocumentBuilder.RETWEET_TERM); - } else if (schemaSnapshot.getFacetFieldByFacetName(op.getOperand()) != null) { - Schema.FieldInfo facetField = schemaSnapshot.getFacetFieldByFacetName(op.getOperand()); - if (facetField.getFieldType().isStoreFacetSkiplist()) { - q = createSimpleTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstant.getFacetSkipFieldName(facetField.getName())); - } else { - // return empty BQ that doesn't match anything - q = new BooleanQuery.Builder().build(); - } - } else if (op.getOperand().equals(SearchOperatorConstants.VINE_LINK)) { - // Temporary special case for filter:vine_link. The filter is called "vine_link", but it - // should use the internal field "__filter_vine". We need this special case because otherwise - // it would look for the non-existing "__filter_vine_link" field. See SEARCH-9390 - q = createNoScoreTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdThriftDocumentUtil.formatFilter("vine")); - } else { - // The default vanilla filters just uses the filter format string and the - // operand text. - q = createNoScoreTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdThriftDocumentUtil.formatFilter(op.getOperand())); - } - // Double check: no filters should have any score contribution. - q = new BoostQuery(q, 0.0f); - return negateQuery ? negateQuery(q) : q; - } - - private Query buildHasEngagementsQuery() { - if (earlybirdCluster == EarlybirdCluster.PROTECTED) { - // Engagements and engagement counts are not indexed on Earlybirds, so there is no need to - // traverse the entire segment with the MinFeatureValueFilter. 
See SEARCH-28120 - return new MatchNoDocsQuery(); - } - - Query favFilter = MinFeatureValueFilter.getMinFeatureValueFilter( - EarlybirdFieldConstant.FAVORITE_COUNT.getFieldName(), 1); - Query retweetFilter = MinFeatureValueFilter.getMinFeatureValueFilter( - EarlybirdFieldConstant.RETWEET_COUNT.getFieldName(), 1); - Query replyFilter = MinFeatureValueFilter.getMinFeatureValueFilter( - EarlybirdFieldConstant.REPLY_COUNT.getFieldName(), 1); - return new BooleanQuery.Builder() - .add(favFilter, Occur.SHOULD) - .add(retweetFilter, Occur.SHOULD) - .add(replyFilter, Occur.SHOULD) - .build(); - } - - protected Query visitLangOperator(SearchOperator op) throws QueryParserException { - return createNoScoreTermQuery( - op, EarlybirdFieldConstant.ISO_LANGUAGE_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitSourceOperator(SearchOperator op) throws QueryParserException { - return createNoScoreTermQuery( - op, EarlybirdFieldConstant.NORMALIZED_SOURCE_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitSmileyOperator(SearchOperator op) throws QueryParserException { - return createSimpleTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - String.format(SMILEY_FORMAT_STRING, op.getOperand())); - } - - protected Query visitDocValRangeFilterOperator(SearchOperator op) throws QueryParserException { - String csfFieldName = op.getOperands().get(0).toLowerCase(); - - ThriftCSFType csfFieldType = schemaSnapshot.getCSFFieldType(csfFieldName); - if (csfFieldType == null) { - throw new QueryParserException("invalid csf field name " + op.getOperands().get(0) - + " used in " + op.serialize()); - } - - try { - if (csfFieldType == ThriftCSFType.DOUBLE - || csfFieldType == ThriftCSFType.FLOAT) { - return DocValRangeFilter.getDocValRangeQuery(csfFieldName, csfFieldType, - Double.parseDouble(op.getOperands().get(1)), - Double.parseDouble(op.getOperands().get(2))); - } else if (csfFieldType == ThriftCSFType.LONG - || csfFieldType == ThriftCSFType.INT - || csfFieldType == ThriftCSFType.BYTE) { - Query query = DocValRangeFilter.getDocValRangeQuery(csfFieldName, csfFieldType, - Long.parseLong(op.getOperands().get(1)), - Long.parseLong(op.getOperands().get(2))); - if (csfFieldName.equals(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName())) { - return wrapQueryInUserScrubGeoFilter(query); - } - return query; - } else { - throw new QueryParserException("invalid ThriftCSFType. 
drop this op: " + op.serialize()); - } - } catch (NumberFormatException e) { - throw new QueryParserException("invalid range numeric type used in " + op.serialize()); - } - } - - protected final Query visitCachedFilterOperator(SearchOperator op) throws QueryParserException { - try { - return CachedFilterQuery.getCachedFilterQuery(op.getOperand(), queryCacheManager); - } catch (CachedFilterQuery.NoSuchFilterException e) { - throw new QueryParserException(e.getMessage(), e); - } - } - - protected final Query visitScoredFilterOperator(SearchOperator op) throws QueryParserException { - final List operands = op.getOperands(); - final String scoreFunction = operands.get(0); - ScoringFunctionProvider.NamedScoringFunctionProvider scoringFunctionProvider = - ScoringFunctionProvider.getScoringFunctionProviderByName(scoreFunction, schemaSnapshot); - if (scoringFunctionProvider == null) { - throw new QueryParserException("Unknown scoring function name [" + scoreFunction - + " ] used as score_filter's operand"); - } - - return ScoreFilterQuery.getScoreFilterQuery( - schemaSnapshot, - scoringFunctionProvider, - Float.parseFloat(operands.get(1)), - Float.parseFloat(operands.get(2))); - } - - protected Query visitSinceTimeOperator(SearchOperator op) { - try { - return SinceUntilFilter.getSinceQuery(Integer.parseInt(op.getOperand())); - } catch (NumberFormatException e) { - LOG.warn("since time is not a valid integer, the date isn't reasonable. drop this op: " - + op.serialize()); - SINCE_TIME_INVALID_INT_COUNTER.increment(); - return null; - } - } - - protected Query visitUntilTimeOperator(SearchOperator op) { - try { - return SinceUntilFilter.getUntilQuery(Integer.parseInt(op.getOperand())); - } catch (NumberFormatException e) { - LOG.warn("until time is not a valid integer, the date isn't reasonable. 
drop this op: " - + op.serialize()); - UNTIL_TIME_INVALID_INT_COUNTER.increment(); - return null; - } - } - - protected Query visitSinceIDOperator(SearchOperator op) { - long id = Long.parseLong(op.getOperand()); - return SinceMaxIDFilter.getSinceIDQuery(id); - } - - protected Query visitMaxIDOperator(SearchOperator op) { - long id = Long.parseLong(op.getOperand()); - return SinceMaxIDFilter.getMaxIDQuery(id); - } - - protected Query visitGeoLocationTypeOperator(SearchOperator op) throws QueryParserException { - String operand = op.getOperand(); - ThriftGeoLocationSource source = ThriftGeoLocationSource.valueOf(operand.toUpperCase()); - // If necessary, this query will be wrapped by the UserScrubGeoFilter within - // the createSimpleTermQuery() helper method - return createNoScoreTermQuery( - op, - EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName(), - EarlybirdFieldConstants.formatGeoType(source)); - } - - protected Query visitGeocodeOperator(SearchOperator op) throws QueryParserException { - return visitGeocodeOrGeocodePrivateOperator(op); - } - - protected Query visitGeoBoundingBoxOperator(SearchOperator op) throws QueryParserException { - Rectangle rectangle = boundingBoxFromSearchOperator(op); - return wrapQueryInUserScrubGeoFilter( - GeoQuadTreeQueryBuilder.buildGeoQuadTreeQuery(rectangle, terminationTracker)); - } - - protected Query visitPlaceOperator(SearchOperator op) throws QueryParserException { - // This query will be wrapped by the UserScrubGeoFilter within the createSimpleTermQuery() - // helper method - return createSimpleTermQuery( - op, EarlybirdFieldConstant.PLACE_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitLinkOperator(SearchOperator op) throws QueryParserException { - // This should never be called - the Link visitor (visitor(Link link)) should be. 
- if (op instanceof Link) { - LOG.warn("Unexpected Link operator " + op.serialize()); - return visit((Link) op); - } else { - throw new QueryParserException("Operator type set to " + op.getOperatorName() - + " but it is not an instance of Link [" + op.toString() + "]"); - } - } - - protected Query visitEntityIdOperator(SearchOperator op) throws QueryParserException { - return createSimpleTermQuery( - op, EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitFromUserIDOperator(SearchOperator op) { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName()); - } - - protected Query visitInReplyToTweetIdOperator(SearchOperator op) { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName()); - } - - protected Query visitInReplyToUserIdOperator(SearchOperator op) { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName()); - } - - protected Query visitLikedByUserIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName()); - } - - protected Query visitRetweetedByUserIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName()); - } - - protected Query visitRepliedToByUserIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName()); - } - - protected Query visitQuotedUserIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName()); - } - - protected Query visitQuotedTweetIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName()); - } - - protected Query visitDirectedAtUserIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery(op, - EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName()); - } - - protected Query visitConversationIdOperator(SearchOperator op) throws QueryParserException { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName()); - } - - protected Query visitComposerSourceOperator(SearchOperator op) throws QueryParserException { - Preconditions.checkNotNull(op.getOperand(), "composer_source requires operand"); - try { - ComposerSource composerSource = ComposerSource.valueOf(op.getOperand().toUpperCase()); - return buildNoScoreIntTermQuery( - op, EarlybirdFieldConstant.COMPOSER_SOURCE, composerSource.getValue()); - } catch (IllegalArgumentException e) { - throw new QueryParserException("Invalid operand for composer_source: " + op.getOperand(), e); - } - } - - protected Query visitRetweetsOfTweetIdOperator(SearchOperator op) { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName()); - } - - protected Query visitRetweetsOfUserIdOperator(SearchOperator op) { - return buildLongTermAttributeQuery( - op, EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName()); - } - - protected Query visitLinkCategoryOperator(SearchOperator op) { - int linkCategory; - try { - linkCategory = LinkCategory.valueOf(op.getOperand()).getValue(); - } 
catch (IllegalArgumentException e) { - linkCategory = Integer.parseInt(op.getOperand()); - } - - String fieldName = EarlybirdFieldConstant.LINK_CATEGORY_FIELD.getFieldName(); - org.apache.lucene.index.Term term = new org.apache.lucene.index.Term( - fieldName, IntTermAttributeImpl.copyIntoNewBytesRef(linkCategory)); - return wrapQuery( - new TermQueryWithSafeToString(term, Integer.toString(linkCategory)), op, fieldName); - } - - protected Query visitCardNameOperator(SearchOperator op) throws QueryParserException { - return createNoScoreTermQuery( - op, EarlybirdFieldConstant.CARD_NAME_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitCardDomainOperator(SearchOperator op) throws QueryParserException { - return createNoScoreTermQuery( - op, EarlybirdFieldConstant.CARD_DOMAIN_FIELD.getFieldName(), op.getOperand()); - } - - protected Query visitCardLangOperator(SearchOperator op) throws QueryParserException { - return createNoScoreTermQuery( - op, EarlybirdFieldConstant.CARD_LANG.getFieldName(), op.getOperand()); - } - - protected Query visitHFTermPairOperator(SearchOperator op) throws QueryParserException { - final List operands = op.getOperands(); - String termPair = HighFrequencyTermPairs.createPair(op.getOperands().get(0), - op.getOperands().get(1)); - Query q = createSimpleTermQuery(op, ImmutableSchema.HF_TERM_PAIRS_FIELD, termPair); - float boost = Float.parseFloat(operands.get(2)); - if (boost >= 0) { - q = BoostUtils.maybeWrapInBoostQuery(q, boost); - } - return q; - } - - protected Query visitHFTermPhrasePairOperator(SearchOperator op) throws QueryParserException { - final List operands = op.getOperands(); - String termPair = HighFrequencyTermPairs.createPhrasePair(op.getOperands().get(0), - op.getOperands().get(1)); - Query q = createSimpleTermQuery(op, ImmutableSchema.HF_PHRASE_PAIRS_FIELD, termPair); - float boost = Float.parseFloat(operands.get(2)); - if (boost >= 0) { - q = BoostUtils.maybeWrapInBoostQuery(q, boost); - } - return q; - } - - private Query logAndThrowQueryParserException(String message) throws QueryParserException { - LOG.error(message); - throw new QueryParserException(message); - } - - private Query logMissingEntriesAndThrowQueryParserException(String field, SearchOperator op) - throws QueryParserException { - return logAndThrowQueryParserException( - String.format("Missing required %s entries for %s", field, op.serialize())); - } - - // previous implementation of this operator allowed insertion of - // operands from the thrift search query. This was reverted to ensure simplicity - // of the api, and to keep the serialized query self contained. - protected final Query visitMultiTermDisjunction(SearchOperator op) throws QueryParserException { - final List operands = op.getOperands(); - final String field = operands.get(0); - - if (isUserIdField(field)) { - List ids = Lists.newArrayList(); - parseLongArgs(operands.subList(1, operands.size()), ids, op); - if (ids.size() > 0) { - // Try to get ranks for ids if exist from hitAttributeHelper. - // Otherwise just pass in a empty list. 
- List ranks; - if (hitAttributeHelper != null - && hitAttributeHelper.getExpandedNodeToRankMap().containsKey(op)) { - ranks = hitAttributeHelper.getExpandedNodeToRankMap().get(op); - } else { - ranks = Lists.newArrayList(); - } - return UserIdMultiSegmentQuery.createIdDisjunctionQuery( - "multi_term_disjunction_" + field, - ids, - field, - schemaSnapshot, - multiSegmentTermDictionaryManager, - decider, - earlybirdCluster, - ranks, - hitAttributeHelper, - queryTimeout); - } else { - return logMissingEntriesAndThrowQueryParserException(field, op); - } - } else if (EarlybirdFieldConstant.ID_FIELD.getFieldName().equals(field)) { - List ids = Lists.newArrayList(); - parseLongArgs(operands.subList(1, operands.size()), ids, op); - if (ids.size() > 0) { - return RequiredStatusIDsFilter.getRequiredStatusIDsQuery(ids); - } else { - return logMissingEntriesAndThrowQueryParserException(field, op); - } - } else if (isTweetIdField(field)) { - List ids = Lists.newArrayList(); - parseLongArgs(operands.subList(1, operands.size()), ids, op); - if (ids.size() > 0) { - BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder(); - int numClauses = 0; - for (long id : ids) { - if (numClauses >= BooleanQuery.getMaxClauseCount()) { - BooleanQuery saved = bqBuilder.build(); - bqBuilder = new BooleanQuery.Builder(); - bqBuilder.add(saved, BooleanClause.Occur.SHOULD); - numClauses = 1; - } - bqBuilder.add(buildLongTermAttributeQuery(op, field, id), Occur.SHOULD); - ++numClauses; - } - return bqBuilder.build(); - } else { - return logMissingEntriesAndThrowQueryParserException(field, op); - } - } else { - return createUnsupportedOperatorQuery(op); - } - } - - protected final Query visitCSFDisjunctionFilter(SearchOperator op) - throws QueryParserException { - List operands = op.getOperands(); - String field = operands.get(0); - - ThriftCSFType csfType = schemaSnapshot.getCSFFieldType(field); - if (csfType == null) { - throw new QueryParserException("Field must be a CSF"); - } - - if (csfType != ThriftCSFType.LONG) { - throw new QueryParserException("csf_disjunction_filter only works with long fields"); - } - - Set values = new HashSet<>(); - parseLongArgs(operands.subList(1, operands.size()), values, op); - - Query query = CSFDisjunctionFilter.getCSFDisjunctionFilter(field, values); - if (field.equals(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName())) { - return wrapQueryInUserScrubGeoFilter(query); - } - return query; - } - - protected Query visitSafetyExclude(SearchOperator op) throws QueryParserException { - // We do not allow negating safety_exclude operator. Note the operator is internal so if we - // get here, it means there's a bug in the query construction side. - if (isParentNegated(op) || nodeIsNegated(op)) { - throw new QueryParserException("Negating safety_exclude operator is not allowed: " + op); - } - - // Convert the safety filter to other operators depending on cluster setting - // The safety filter is interpreted differently on archive because the underlying safety labels - // in extended encoded field are not available on archive. 
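The multi_term_disjunction branch above has to stay under Lucene's maximum boolean clause count, so whenever the growing OR reaches the limit it folds the clauses built so far into a single nested SHOULD clause of a fresh builder and keeps going. The sketch below shows the simpler core idea, partitioning the IDs into bounded groups with plain JDK collections; the names and limit are illustrative, and the removed code nests each full group rather than keeping a flat list.

    import java.util.ArrayList;
    import java.util.List;

    // Sketch of the clause-count capping used by visitMultiTermDisjunction above:
    // no single disjunction group ever exceeds maxClauses entries.
    final class DisjunctionChunker {
      static List<List<Long>> chunk(List<Long> ids, int maxClauses) {
        List<List<Long>> groups = new ArrayList<>();
        List<Long> current = new ArrayList<>();
        for (long id : ids) {
          if (current.size() >= maxClauses) {
            groups.add(current);
            current = new ArrayList<>();
          }
          current.add(id);
        }
        if (!current.isEmpty()) {
          groups.add(current);
        }
        return groups;
      }
    }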
- if (EarlybirdCluster.isArchive(earlybirdCluster)) { - return visit(OPERATOR_CACHED_EXCLUDE_ANTISOCIAL_AND_NATIVERETWEETS); - } else { - List children = Lists.newArrayList(); - for (String filterName : op.getOperands()) { - children.addAll( - OPERATORS_BY_SAFE_EXCLUDE_OPERAND.getOrDefault(filterName, ImmutableList.of())); - } - return visit(new Conjunction(children)); - } - } - - protected Query visitNamedEntity(SearchOperator op) throws QueryParserException { - List operands = op.getOperands(); - Preconditions.checkState(operands.size() == 1, - "named_entity: wrong number of operands"); - - return createDisjunction( - operands.get(0).toLowerCase(), - op, - EarlybirdFieldConstant.NAMED_ENTITY_FROM_TEXT_FIELD, - EarlybirdFieldConstant.NAMED_ENTITY_FROM_URL_FIELD); - } - - protected Query visitSpaceId(SearchOperator op) throws QueryParserException { - List operands = op.getOperands(); - Preconditions.checkState(operands.size() == 1, - "space_id: wrong number of operands"); - - return createSimpleTermQuery( - op, - EarlybirdFieldConstant.SPACE_ID_FIELD.getFieldName(), - op.getOperand() - ); - } - - protected Query visitNamedEntityWithType(SearchOperator op) throws QueryParserException { - List operands = op.getOperands(); - Preconditions.checkState(operands.size() == 2, - "named_entity_with_type: wrong number of operands"); - - String name = operands.get(0); - String type = operands.get(1); - return createDisjunction( - String.format("%s:%s", name, type).toLowerCase(), - op, - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD, - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD); - } - - // Create a disjunction query for a given value in one of the given fields - private Query createDisjunction( - String value, SearchOperator operator, EarlybirdFieldConstant... fields) - throws QueryParserException { - BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); - for (EarlybirdFieldConstant field : fields) { - booleanQueryBuilder.add( - createSimpleTermQuery(operator, field.getFieldName(), value), Occur.SHOULD); - } - return booleanQueryBuilder.build(); - } - - protected Query visitMinFeatureValueOperator(SearchOperator.Type type, SearchOperator op) { - final List operands = op.getOperands(); - - String featureName; - switch (type) { - case MIN_FAVES: - featureName = EarlybirdFieldConstant.FAVORITE_COUNT.getFieldName(); - break; - case MIN_QUALITY_SCORE: - featureName = EarlybirdFieldConstant.PARUS_SCORE.getFieldName(); - break; - case MIN_REPLIES: - featureName = EarlybirdFieldConstant.REPLY_COUNT.getFieldName(); - break; - case MIN_REPUTATION: - featureName = EarlybirdFieldConstant.USER_REPUTATION.getFieldName(); - break; - case MIN_RETWEETS: - featureName = EarlybirdFieldConstant.RETWEET_COUNT.getFieldName(); - break; - default: - throw new IllegalArgumentException("Unknown min feature type " + type); - } - - double operand = Double.parseDouble(operands.get(0)); - - // SEARCH-16751: Because we use QueryCacheConstants.HAS_ENGAGEMENT as a driving query below, we - // won't return tweets with 0 engagements when we handle a query with a [min_X 0] filter (e.g. - // (* cat [min_faves 0] ). Thus we need to return a MatchAllDocsQuery in that case. - if (operand == 0) { - return new MatchAllDocsQuery(); - } - - // Only perform the rewrite if the operator is a min engagement operator. 
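visitMinFeatureValueOperator above is a small dispatch: map the operator to its counter field, short-circuit a zero threshold to a match-all query (a zero threshold must not drop tweets with zero engagements), rewrite the engagement counters (faves, replies, retweets) to the cheaper indexed or cached form, use the cached reputation filters for min_reputation, and fall back to MinFeatureValueFilter otherwise. A condensed, hypothetical sketch of that dispatch; the real code returns Lucene queries rather than plan labels.

    // Condensed sketch of the min_* operator dispatch described above.
    final class MinFeatureDispatch {
      enum Op { MIN_FAVES, MIN_REPLIES, MIN_RETWEETS, MIN_REPUTATION, MIN_QUALITY_SCORE }

      static String plan(Op op, double threshold) {
        if (threshold == 0) {
          // [min_faves 0] and friends must still match zero-engagement tweets.
          return "match_all";
        }
        switch (op) {
          case MIN_FAVES:
          case MIN_REPLIES:
          case MIN_RETWEETS:
            // Engagement counters: indexed min-engagement field above a decided
            // threshold, otherwise a cached driving query plus a value filter.
            return "engagement_rewrite";
          case MIN_REPUTATION:
            // Reputation: cached min-reputation filter as the driving query.
            return "reputation_rewrite";
          default:
            return "min_feature_value_filter";
        }
      }
    }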
- if (isOperatorTypeEngagementFilter(type)) { - return buildQueryForEngagementOperator(op, operands, featureName); - } - - if (type == SearchOperator.Type.MIN_REPUTATION) { - return buildQueryForMinReputationOperator(operands, featureName); - } - - return MinFeatureValueFilter.getMinFeatureValueFilter( - featureName, Double.parseDouble(operands.get(0))); - } - - protected Query visitFeatureValueInAcceptListOrUnsetFilterOperator(SearchOperator op) - throws QueryParserException { - final List operands = op.getOperands(); - final String field = operands.get(0); - - if (isIdCSFField(field)) { - Set ids = Sets.newHashSet(); - parseLongArgs(operands.subList(1, operands.size()), ids, op); - return FeatureValueInAcceptListOrUnsetFilter.getFeatureValueInAcceptListOrUnsetFilter( - field, ids); - } else { - return logAndThrowQueryParserException( - "Invalid CSF field passed to operator " + op.toString()); - } - } - - /** - * Creates a Lucene query for an operator that's not supported by the search service. - * - * NOTE: Developer, if you are writing a class to extends this class, make sure the - * behaviour of this function makes sense for your search service. - * - * @param op The operator that's not supported by the search service. - * @return The Lucene query for this operator - */ - protected Query createUnsupportedOperatorQuery(SearchOperator op) throws QueryParserException { - SearchCounter - .export(UNSUPPORTED_OPERATOR_PREFIX + op.getOperatorType().getOperatorName()) - .increment(); - return visit(op.toPhrase()); - } - - private Query buildNoScoreIntTermQuery( - SearchOperator op, - EarlybirdFieldConstant field, - int termValue) { - org.apache.lucene.index.Term term = new org.apache.lucene.index.Term( - field.getFieldName(), IntTermAttributeImpl.copyIntoNewBytesRef(termValue)); - return wrapQuery( - new TermQueryWithSafeToString(term, Integer.toString(termValue)), op, field.getFieldName()); - } - - private Query buildQueryForMinReputationOperator(List operands, String featureName) { - int operand = (int) Double.parseDouble(operands.get(0)); - // Driving by MinFeatureValueFilter's DocIdSetIterator is very slow, because we have to - // perform an expensive check for all doc IDs in the segment, so we use a cached result to - // drive the query, and use MinFeatureValueFilter as a secondary filter. - String queryCacheFilterName; - if (operand >= 50) { - queryCacheFilterName = QueryCacheConstants.MIN_REPUTATION_50; - } else if (operand >= 36) { - queryCacheFilterName = QueryCacheConstants.MIN_REPUTATION_36; - } else if (operand >= 30) { - queryCacheFilterName = QueryCacheConstants.MIN_REPUTATION_30; - } else { - return MinFeatureValueFilter.getMinFeatureValueFilter(featureName, operand); - } - - try { - Query drivingQuery = CachedFilterQuery.getCachedFilterQuery( - queryCacheFilterName, queryCacheManager); - return new FilteredQuery( - drivingQuery, MinFeatureValueFilter.getDocIdFilterFactory(featureName, operand)); - } catch (Exception e) { - // If the filter is not found, that's OK, it might be our first time running the query cache, - // or there may be no tweets with that high reputation. - return MinFeatureValueFilter.getMinFeatureValueFilter(featureName, operand); - } - } - - private Query buildQueryForEngagementOperator( - SearchOperator op, List operands, String featureName) { - // Engagements and engagement counts are not indexed on Protected Earlybirds, so there is no - // need to traverse the entire segment with the MinFeatureValueFilter. 
SEARCH-28120 - if (earlybirdCluster == EarlybirdCluster.PROTECTED) { - return new MatchNoDocsQuery(); - } - - EarlybirdFieldConstant field = - EarlybirdFieldConstants.CSF_NAME_TO_MIN_ENGAGEMENT_FIELD_MAP.get(featureName); - if (field == null) { - throw new IllegalArgumentException(String.format("Expected the feature to be " - + "FAVORITE_COUNT, REPLY_COUNT, or RETWEET_COUNT. Got %s.", featureName)); - } - int operand = (int) Double.parseDouble(operands.get(0)); - ByteNormalizer normalizer = MinFeatureValueFilter.getMinFeatureValueNormalizer(featureName); - int minValue = normalizer.unsignedByteToInt(normalizer.normalize(operand)); - - // We default to the old behavior of filtering posts instead of consulting the min engagement - // field if the operand is less than some threshold value because it seems, empirically, that - // the old method results in lower query latencies for lower values of the filter operand. - // This threshold can be controlled by the "use_min_engagement_field_threshold" decider. The - // current default value is 90. SEARCH-16102 - int useMinEngagementFieldThreshold = decider.getAvailability( - "use_min_engagement_field_threshold").getOrElse(() -> 0); - if (operand >= useMinEngagementFieldThreshold) { - NUM_QUERIES_ABOVE_MIN_ENGAGEMENT_THRESHOLD.increment(); - } else { - NUM_QUERIES_BELOW_MIN_ENGAGEMENT_THRESHOLD.increment(); - } - if (schemaHasField(field) && operand >= useMinEngagementFieldThreshold) { - return buildNoScoreIntTermQuery(op, field, minValue); - } - // Driving by MinFeatureValueFilter's DocIdSetIterator is very slow, because we have to - // perform an expensive check for all doc IDs in the segment, so we use a cached result to - // drive the query, and use MinFeatureValueFilter as a secondary filter. - try { - Query drivingQuery = minEngagmentsDrivingQuery(op, operand); - return new FilteredQuery( - drivingQuery, MinFeatureValueFilter.getDocIdFilterFactory(featureName, operand)); - } catch (Exception e) { - // If the filter is not found, that's OK, it might be our first time running the query cache, - // or there may be no Tweets with that many engagements (we would only expect this in tests). - return MinFeatureValueFilter.getMinFeatureValueFilter(featureName, operand); - } - } - - private Query minEngagmentsDrivingQuery(SearchOperator operator, int minValue) - throws CachedFilterQuery.NoSuchFilterException, QueryParserException { - // If the min engagements value is large, then many of the hits that have engagement will still - // not match the query, leading to extremely slow queries. Therefore, if there is more than 100 - // engagements, we drive by a more restricted filter. 
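Editor's note: the engagement branch above is essentially a three-way decision: the protected cluster matches nothing, large operands hit the indexed min-engagement field directly, and small operands are driven by a cached filter with the per-document minimum check layered on top (falling back to the full-segment filter on a cache miss). A sketch of that control flow, assuming placeholder helpers for the Earlybird pieces; only MatchNoDocsQuery is the real Lucene class.

import org.apache.lucene.search.MatchNoDocsQuery;
import org.apache.lucene.search.Query;

// Decision flow only; all private helpers are hypothetical stand-ins.
final class MinEngagementDecisionSketch {
  static Query build(boolean isProtectedCluster,
                     boolean schemaHasMinEngagementField,
                     int operand,
                     int useMinEngagementFieldThreshold) {
    // Engagement counts are not indexed on the protected cluster, so nothing can match.
    if (isProtectedCluster) {
      return new MatchNoDocsQuery();
    }
    if (schemaHasMinEngagementField && operand >= useMinEngagementFieldThreshold) {
      // Large operands: a single indexed term on the min-engagement field is cheapest.
      return intTermQuery(operand);
    }
    try {
      // Small operands: drive with a cached "has engagement" style filter and apply
      // the per-document minimum check only to those candidates.
      return filtered(drivingQuery(operand), operand);
    } catch (Exception e) {
      // Cache miss (e.g. first run): fall back to the slow full-segment filter.
      return minFeatureValueFilter(operand);
    }
  }

  private static Query intTermQuery(int v) { return new MatchNoDocsQuery(); }          // stand-in
  private static Query drivingQuery(int v) { return new MatchNoDocsQuery(); }          // stand-in
  private static Query filtered(Query driver, int v) { return driver; }                // stand-in
  private static Query minFeatureValueFilter(int v) { return new MatchNoDocsQuery(); } // stand-in
}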
See SEARCH-33740 - String filter; - if (minValue < 100) { - filter = QueryCacheConstants.HAS_ENGAGEMENT; - } else if (operator.getOperatorType() == SearchOperator.Type.MIN_FAVES) { - filter = QueryCacheConstants.MIN_FAVES_100; - } else if (operator.getOperatorType() == SearchOperator.Type.MIN_REPLIES) { - filter = QueryCacheConstants.MIN_REPLIES_100; - } else if (operator.getOperatorType() == SearchOperator.Type.MIN_RETWEETS) { - filter = QueryCacheConstants.MIN_RETWEETS_100; - } else { - throw new QueryParserException("Missing engagement filter."); - } - return CachedFilterQuery.getCachedFilterQuery(filter, queryCacheManager); - } - - private boolean isOperatorTypeEngagementFilter(SearchOperator.Type type) { - return type == SearchOperator.Type.MIN_FAVES - || type == SearchOperator.Type.MIN_RETWEETS - || type == SearchOperator.Type.MIN_REPLIES; - } - - private boolean schemaHasField(EarlybirdFieldConstant field) { - return schemaSnapshot.hasField(field.getFieldId()); - } - - // Helper functions - private Query createSimpleTermQuery( - com.twitter.search.queryparser.query.Query node, String field, String text) - throws QueryParserException { - Query baseQuery = new TermQuery(createTerm(field, text)); - if (isGeoFieldThatShouldBeScrubbed(field, text)) { - baseQuery = wrapQueryInUserScrubGeoFilter(baseQuery); - } - return wrapQuery(baseQuery, node, field); - } - - private boolean isGeoFieldThatShouldBeScrubbed(String field, String text) { - if (field.equals(EarlybirdFieldConstant.INTERNAL_FIELD.getFieldName())) { - // the internal field is used for the place id filter and the geo location type filters, some - // of which should be scrubbed - return GEO_FILTERS_TO_BE_SCRUBBED.contains(text); - } - return GEO_FIELDS_TO_BE_SCRUBBED.contains(field); - } - - // Like above, but sets boost to 0 to disable scoring component. This should be used - // for filters that do not impact scoring (such as filter:images). - private Query createNoScoreTermQuery(com.twitter.search.queryparser.query.Query node, - String field, String text) - throws QueryParserException { - Query query = createSimpleTermQuery(node, field, text); - return new BoostQuery(query, 0.0f); // No score contribution. - } - - private Query createNormalizedTermQuery(com.twitter.search.queryparser.query.Query node, - String field, String text) - throws QueryParserException { - return createSimpleTermQuery( - node, - field, - NormalizerHelper.normalizeWithUnknownLocale(text, EarlybirdConfig.getPenguinVersion())); - } - - /** - * Get the boost from the annotation list of a query node. - * Right now this is very simple, we simple extract the value of some annotations and ignore all - * others, also, if there are multiple annotations that have values, we only use the first one we - * see in the list (although the rewritten query EB receives should have this). - * NOTE: we use simple weight selection logic here based on the assumption that the annotator - * and rewriter will not produce ambiguous weight information. There should always be only one - * weight-bearing annotation for a specific node. - * - * @param annotations The list of annotations of the query node. - * @return The boost for this query node, 0 if there is no boost, in which case you shouldn't - * apply it at all. 
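Editor's note: createNoScoreTermQuery above relies on a standard Lucene idiom: wrap the term query in a BoostQuery with boost 0 so the clause still restricts matches but contributes nothing to scoring. A runnable sketch of that idiom; the field and term values are made up for illustration.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;

final class NoScoreTermQuerySketch {
  static Query noScoreTerm(String field, String value) {
    Query base = new TermQuery(new Term(field, value));
    return new BoostQuery(base, 0.0f); // still filters, but no score contribution
  }

  public static void main(String[] args) {
    // Illustrative field/value, not the real internal filter terms.
    System.out.println(noScoreTerm("internal", "filter_images"));
    // prints something like: (internal:filter_images)^0.0
  }
}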
- */ - private static double getBoostFromAnnotations(List annotations) { - if (annotations != null) { - for (Annotation anno : annotations) { - switch (anno.getType()) { - case VARIANT: - case SPELLING: - case WEIGHT: - case OPTIONAL: - return ((FloatAnnotation) anno).getValue(); - default: - } - } - } - return -1; - } - - private static double getPhraseProximityFromAnnotations(List annotations) { - if (annotations != null) { - for (Annotation anno : annotations) { - if (anno.getType() == Annotation.Type.PROXIMITY) { - return ((FloatAnnotation) anno).getValue(); - } - } - } - return -1; - } - - private static boolean isOptional(com.twitter.search.queryparser.query.Query node) { - return node.hasAnnotationType(Annotation.Type.OPTIONAL); - } - - private static boolean isProximityGroup(com.twitter.search.queryparser.query.Query node) { - if (node.isTypeOf(com.twitter.search.queryparser.query.Query.QueryType.OPERATOR)) { - SearchOperator op = (SearchOperator) node; - if (op.getOperatorType() == SearchOperator.Type.PROXIMITY_GROUP) { - return true; - } - } - return false; - } - - private final Query simplifyBooleanQuery(BooleanQuery q) { - if (q.clauses() == null || q.clauses().size() != 1) { - return q; - } - - return q.clauses().get(0).getQuery(); - } - - private Query visit(final Phrase phrase, boolean sloppy) throws QueryParserException { - Optional fieldOpt = phrase.getAnnotationOf(Annotation.Type.FIELD); - if (fieldOpt.isPresent()) { - String field = fieldOpt.get().valueToString(); - Schema.FieldInfo fieldInfo = schemaSnapshot.getFieldInfo(field); - if (fieldInfo != null && !fieldInfo.getFieldType().hasPositions()) { - throw new QueryParserException(String.format("Field %s does not support phrase queries " - + "because it does not have position information.", field)); - } - } - BooleanQuery.Builder queryBuilder = new BooleanQuery.Builder(); - Map actualFieldWeights = getFieldWeightMapForNode(phrase); - for (Map.Entry entry : actualFieldWeights.entrySet()) { - PhraseQuery.Builder phraseQueryBuilder = new PhraseQuery.Builder(); - int curPos = 0; - for (String term : phrase.getTerms()) { - if (!term.equals(PHRASE_WILDCARD)) { - phraseQueryBuilder.add(createTerm(entry.getKey(), term), curPos); - curPos++; - } else if (curPos != 0) { //"*" at the beggining of a phrase has no effect/meaning - curPos++; - } - } - - // No actual terms added to query - if (curPos == 0) { - break; - } - int annotatedSloppiness = (int) getPhraseProximityFromAnnotations(phrase.getAnnotations()); - if (annotatedSloppiness > 0) { - phraseQueryBuilder.setSlop(annotatedSloppiness); - } else if (sloppy) { - phraseQueryBuilder.setSlop(proximityPhraseSlop); - } - float fieldWeight = entry.getValue(); - float boost = (float) getBoostFromAnnotations(phrase.getAnnotations()); - Query query = phraseQueryBuilder.build(); - if (boost >= 0) { - query = BoostUtils.maybeWrapInBoostQuery(query, boost * fieldWeight); - } else if (fieldWeight != DEFAULT_FIELD_WEIGHT) { - query = BoostUtils.maybeWrapInBoostQuery(query, fieldWeight); - } else { - query = BoostUtils.maybeWrapInBoostQuery(query, proximityPhraseWeight); - } - Occur occur = actualFieldWeights.size() > 1 ? 
Occur.SHOULD : Occur.MUST; - queryBuilder.add(wrapQuery(query, phrase, entry.getKey()), occur); - } - Query q = simplifyBooleanQuery(queryBuilder.build()); - return negateQueryIfNodeNegated(phrase, q); - } - - private Query wrapQuery( - org.apache.lucene.search.Query query, - com.twitter.search.queryparser.query.Query node, - String fieldName) { - return EarlybirdQueryHelper.maybeWrapWithTimeout( - EarlybirdQueryHelper.maybeWrapWithHitAttributionCollector( - query, node, schemaSnapshot.getFieldInfo(fieldName), hitAttributeHelper), - node, queryTimeout); - } - - private final boolean nodeIsNegated(com.twitter.search.queryparser.query.Query node) { - if (isParentNegated(node)) { - return !node.mustNotOccur(); - } else { - return node.mustNotOccur(); - } - } - - private final Query negateQuery(Query q) { - return new BooleanQuery.Builder() - .add(q, Occur.MUST_NOT) - .add(new MatchAllDocsQuery(), Occur.MUST) - .build(); - } - - // Simple helper to examine node, and negate the lucene query if necessary. - private final Query negateQueryIfNodeNegated(com.twitter.search.queryparser.query.Query node, - Query query) { - if (query == null) { - return null; - } - return nodeIsNegated(node) ? negateQuery(query) : query; - } - - private boolean isParentNegated(com.twitter.search.queryparser.query.Query query) { - return parentNegatedQueries.contains(query); - } - - private org.apache.lucene.index.Term createTerm(String field, String text) - throws QueryParserException { - Schema.FieldInfo fieldInfo = schemaSnapshot.getFieldInfo(field); - if (fieldInfo == null) { - throw new QueryParserException("Unknown field: " + field); - } - - queriedFields.add(field); - - try { - return new org.apache.lucene.index.Term(field, SchemaUtil.toBytesRef(fieldInfo, text)); - } catch (UnsupportedOperationException e) { - throw new QueryParserException(e.getMessage(), e.getCause()); - } - } - - /** - * Get field weight map for a node, combing default values and its annotations. - */ - private Map getFieldWeightMapForNode( - com.twitter.search.queryparser.query.Query query) throws QueryParserException { - return FieldWeightUtil.combineDefaultWithAnnotation( - query, - defaultFieldWeightMap, - enabledFieldWeightMap, - Functions.identity(), - mappableFieldMap, - Functions.identity()); - } - - private boolean addQuery( - BooleanQuery.Builder bqBuilder, - com.twitter.search.queryparser.query.Query child) throws QueryParserException { - Occur occur = Occur.MUST; - if (child.mustNotOccur()) { - // To build a conjunction, we will not rely on the negation in the child visitor. - // Instead we will add the term as MUST_NOT occur. - // Store this in parentNegatedQueries so the child visitor can do the right thing. - occur = Occur.MUST_NOT; - parentNegatedQueries.add(child); - } else if (isOptional(child) || isProximityGroup(child)) { - occur = Occur.SHOULD; - } - - Query q = child.accept(this); - if (q != null) { - bqBuilder.add(q, occur); - return true; - } - return false; - } - - /** - * Constructs a BooleanQuery from a queryparser Query node. 
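Editor's note: the phrase handling above keeps term positions honest when the query contains the "*" placeholder: the wildcard is never added as a term, but an interior wildcard still advances the position counter so the surrounding terms keep their original spacing (a leading "*" is simply ignored). A self-contained sketch of that position bookkeeping using the plain Lucene PhraseQuery API; field name and slop are illustrative.

import org.apache.lucene.index.Term;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;

final class PhrasePositionSketch {
  static Query buildPhrase(String field, java.util.List<String> terms, int slop) {
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    int pos = 0;
    for (String term : terms) {
      if (!"*".equals(term)) {
        builder.add(new Term(field, term), pos);
        pos++;
      } else if (pos != 0) {
        pos++; // a leading "*" is meaningless; an interior "*" just leaves a gap
      }
    }
    builder.setSlop(slop);
    return builder.build();
  }

  public static void main(String[] args) {
    System.out.println(buildPhrase("text", java.util.Arrays.asList("happy", "*", "year"), 0));
    // prints roughly: text:"happy ? year"  (the gap shows up as a hole in the phrase)
  }
}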
- * Adds fields as configured in the fieldWeightMap and specified by termQueryDisjunctionType - * - TermQueryDisjunctionType.ONLY_OPTIONALIZED adds optional fields - * (only resolved_links_text for now), - * - TermQueryDisjunctionType.DROP_OPTIONALIZED adds all other valid fields expect - * resolved_links_text (for now), - * - TermQueryDisjunctionType.NORMAL adds all valid fields - * @param query an instance of com.twitter.search.queryparser.query.Query or - * com.twitter.search.queryparser.query.Term - * @return a BooleanQuery consists of fields from query - */ - private BooleanQuery createTermQueryDisjunction( - com.twitter.search.queryparser.query.Query query) throws QueryParserException { - String normTerm = query.isTypeOf(com.twitter.search.queryparser.query.Query.QueryType.TERM) - ? ((Term) query).getValue() : query.toString(false); - BooleanQuery.Builder booleanQueryBuilder = new BooleanQuery.Builder(); - Map actualFieldWeightMap = getFieldWeightMapForNode(query); - Set fieldsToUse = Sets.newLinkedHashSet(actualFieldWeightMap.keySet()); - Occur occur = fieldsToUse.size() > 1 ? Occur.SHOULD : Occur.MUST; - for (String field : fieldsToUse) { - addTermQueryWithField(booleanQueryBuilder, query, normTerm, field, occur, - actualFieldWeightMap.get(field)); - } - return booleanQueryBuilder.build(); - } - - private void addTermQueryWithField( - BooleanQuery.Builder bqBuilder, - com.twitter.search.queryparser.query.Query term, - String normTerm, - String fieldName, - Occur occur, - float fieldWeight) throws QueryParserException { - float boost = (float) getBoostFromAnnotations(term.getAnnotations()); - Query query = createSimpleTermQuery(term, fieldName, normTerm); - if (boost >= 0) { - query = BoostUtils.maybeWrapInBoostQuery(query, boost * fieldWeight); - } else { - query = BoostUtils.maybeWrapInBoostQuery(query, fieldWeight); - } - bqBuilder.add(query, occur); - } - - private Query finalizeQuery(BooleanQuery bq, Term term) { - Query q = simplifyBooleanQuery(bq); - return negateQueryIfNodeNegated(term, q); - } - - private Rectangle boundingBoxFromSearchOperator(SearchOperator op) throws QueryParserException { - Preconditions.checkArgument(op.getOperatorType() == SearchOperator.Type.GEO_BOUNDING_BOX); - Preconditions.checkNotNull(op.getOperands()); - Preconditions.checkState(op.getOperands().size() == 4); - - List operands = op.getOperands(); - try { - // Unfortunately, we store coordinates as floats in our index, which causes a lot of precision - // loss. On the query side, we have to cast into floats to match. - float minLat = (float) Double.parseDouble(operands.get(0)); - float minLon = (float) Double.parseDouble(operands.get(1)); - float maxLat = (float) Double.parseDouble(operands.get(2)); - float maxLon = (float) Double.parseDouble(operands.get(3)); - - Point lowerLeft = new PointImpl(minLon, minLat, GeohashChunkImpl.getSpatialContext()); - Point upperRight = new PointImpl(maxLon, maxLat, GeohashChunkImpl.getSpatialContext()); - return new RectangleImpl(lowerLeft, upperRight, GeohashChunkImpl.getSpatialContext()); - } catch (NumberFormatException e) { - // consider operator invalid if any of the coordinate cannot be parsed. - throw new QueryParserException("Malformed bounding box operator." 
+ op.serialize()); - } - } - - private Query visitGeocodeOrGeocodePrivateOperator(SearchOperator op) - throws QueryParserException { - - GeoCode geoCode = GeoCode.fromOperator(op); - if (geoCode == null) { - throw new QueryParserException("Invalid GeoCode operator:" + op.serialize()); - } - - return wrapQueryInUserScrubGeoFilter( - GeoQuadTreeQueryBuilder.buildGeoQuadTreeQuery(geoCode, terminationTracker)); - } - - private Query wrapQueryInUserScrubGeoFilter(Query baseQuery) { - if (DeciderUtil.isAvailableForRandomRecipient( - decider, "filter_out_geo_scrubbed_tweets_" + earlybirdCluster.getNameForStats())) { - return new FilteredQuery( - baseQuery, - UserScrubGeoFilter.getDocIdFilterFactory(userScrubGeoMap)); - } else { - return baseQuery; - } - } - - private Query buildLongTermAttributeQuery(SearchOperator op, String fieldName) { - return buildLongTermAttributeQuery(op, fieldName, Long.parseLong(op.getOperand())); - } - - private Query buildLongTermAttributeQuery(SearchOperator op, String fieldName, long argValue) { - org.apache.lucene.index.Term term = new org.apache.lucene.index.Term( - fieldName, LongTermAttributeImpl.copyIntoNewBytesRef(argValue)); - return wrapQuery(new TermQueryWithSafeToString(term, Long.toString(argValue)), op, fieldName); - } - - private static void parseLongArgs(List operands, - Collection arguments, - SearchOperator op) throws QueryParserException { - for (String operand : operands) { - try { - arguments.add(Long.parseLong(operand)); - } catch (NumberFormatException e) { - throw new QueryParserException("Invalid long operand in " + op.serialize(), e); - } - } - } - - private static boolean isUserIdField(String field) { - return EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.IN_REPLY_TO_USER_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.RETWEET_SOURCE_USER_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.LIKED_BY_USER_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.RETWEETED_BY_USER_ID.getFieldName().equals(field) - || EarlybirdFieldConstant.REPLIED_TO_BY_USER_ID.getFieldName().equals(field) - || EarlybirdFieldConstant.QUOTED_USER_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.DIRECTED_AT_USER_ID_FIELD.getFieldName().equals(field); - } - - private static boolean isTweetIdField(String field) { - return EarlybirdFieldConstant.IN_REPLY_TO_TWEET_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.RETWEET_SOURCE_TWEET_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.QUOTED_TWEET_ID_FIELD.getFieldName().equals(field) - || EarlybirdFieldConstant.CONVERSATION_ID_FIELD.getFieldName().equals(field); - } - - private static boolean isIdCSFField(String field) { - return EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF.getFieldName().equals(field); - } - - public Set getQueriedFields() { - return queriedFields; - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.docx b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.docx new file mode 100644 index 000000000..03a6d3a9d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.java b/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.java deleted file mode 100644 index 45c3fe77a..000000000 --- 
a/src/java/com/twitter/search/earlybird/queryparser/EarlybirdQueryHelper.java +++ /dev/null @@ -1,154 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import javax.annotation.Nullable; - -import com.google.common.base.Optional; -import com.google.common.base.Preconditions; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.constants.QueryCacheConstants; -import com.twitter.search.common.query.HitAttributeCollector; -import com.twitter.search.common.query.HitAttributeHelper; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.search.termination.QueryTimeout; -import com.twitter.search.common.search.termination.TerminationQuery; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryNodeUtils; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.annotation.Annotation; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; - -public abstract class EarlybirdQueryHelper { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdQueryHelper.class); - - /** - * Wraps the given query and some clauses to exclude antisocial tweets into a conjunction. - */ - public static Query requireExcludeAntisocial( - Query basicQuery, - QueryCacheManager queryCacheManager) throws QueryParserException { - // Do not set exclude antisocial if they have any other antisocial filters set - Query query = basicQuery; - DetectAntisocialVisitor detectAntisocialVisitor = new DetectAntisocialVisitor(); - query.accept(detectAntisocialVisitor); - if (detectAntisocialVisitor.hasAnyAntisocialOperator()) { - return query; - } - - // No operator found, force antisocial filter. - if (queryCacheManager.enabled()) { - SearchOperator filter = - new SearchOperator(SearchOperator.Type.CACHED_FILTER, - QueryCacheConstants.EXCLUDE_ANTISOCIAL); - - query = QueryNodeUtils.appendAsConjunction(query, filter); - } else { - SearchOperator filter = new SearchOperator(SearchOperator.Type.EXCLUDE, - SearchOperatorConstants.ANTISOCIAL); - - query = QueryNodeUtils.appendAsConjunction(query, filter); - } - return query; - } - - /** - * Wraps the given query into an equivalent query that will also collect hit attribution data. - * - * @param query The original query. - * @param node The query parser node storing this query. - * @param fieldInfo The field in which the given query will be searching. - * @param hitAttributeHelper The helper that will collect all hit attribution data. - * @return An equivalent query that will also collect hit attribution data. - */ - public static final org.apache.lucene.search.Query maybeWrapWithHitAttributionCollector( - org.apache.lucene.search.Query query, - @Nullable com.twitter.search.queryparser.query.Query node, - Schema.FieldInfo fieldInfo, - @Nullable HitAttributeHelper hitAttributeHelper) { - // Prevents lint error for assigning to a function parameter. 
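Editor's note: requireExcludeAntisocial above only appends a filter when the query does not already carry an antisocial operator, and prefers the cached filter whenever the query cache is enabled. A sketch of that branching over serialized query strings; the operator spellings here are illustrative stand-ins for the real queryparser nodes.

final class ExcludeAntisocialSketch {
  static String requireExcludeAntisocial(String serializedQuery,
                                         boolean alreadyHasAntisocialOperator,
                                         boolean queryCacheEnabled) {
    // A query that already filters (or explicitly includes) antisocial tweets is left alone.
    if (alreadyHasAntisocialOperator) {
      return serializedQuery;
    }
    // Otherwise AND in the cheapest available exclusion: the cached filter when the
    // query cache is up, else the plain exclude operator.
    String filter = queryCacheEnabled
        ? "[cached_filter exclude_antisocial]"
        : "[exclude antisocial]";
    return "(* " + serializedQuery + " " + filter + ")";
  }

  public static void main(String[] args) {
    System.out.println(requireExcludeAntisocial("\"cat\"", false, true));
    // -> (* "cat" [cached_filter exclude_antisocial])
  }
}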
- org.apache.lucene.search.Query luceneQuery = query; - if (hitAttributeHelper != null && node != null) { - Optional annotation = node.getAnnotationOf(Annotation.Type.NODE_RANK); - - if (annotation.isPresent()) { - Integer nodeRank = (Integer) annotation.get().getValue(); - luceneQuery = wrapWithHitAttributionCollector( - luceneQuery, - fieldInfo, - nodeRank, - hitAttributeHelper.getFieldRankHitAttributeCollector()); - } - } - - return luceneQuery; - } - - /** - * Wraps the given query into an equivalent query that will also collect hit attribution data. - * - * @param query The original query. - * @param nodeRank The rank of the given query in the overall request query. - * @param fieldInfo The field in which the given query will be searching. - * @param hitAttributeHelper The helper that will collect all hit attribution data. - * @return An equivalent query that will also collect hit attribution data. - */ - public static final org.apache.lucene.search.Query maybeWrapWithHitAttributionCollector( - org.apache.lucene.search.Query query, - int nodeRank, - Schema.FieldInfo fieldInfo, - @Nullable HitAttributeHelper hitAttributeHelper) { - - org.apache.lucene.search.Query luceneQuery = query; - if (hitAttributeHelper != null && nodeRank != -1) { - Preconditions.checkArgument(nodeRank > 0); - luceneQuery = wrapWithHitAttributionCollector( - luceneQuery, fieldInfo, nodeRank, hitAttributeHelper.getFieldRankHitAttributeCollector()); - } - return luceneQuery; - } - - private static final org.apache.lucene.search.Query wrapWithHitAttributionCollector( - org.apache.lucene.search.Query luceneQuery, - Schema.FieldInfo fieldInfo, - int nodeRank, - HitAttributeCollector hitAttributeCollector) { - Preconditions.checkNotNull(fieldInfo, - "Tried collecting hit attribution for unknown field: " + fieldInfo.getName() - + " luceneQuery: " + luceneQuery); - return hitAttributeCollector.newIdentifiableQuery( - luceneQuery, fieldInfo.getFieldId(), nodeRank); - } - - /** - * Returns a query equivalent to the given query, and with the given timeout enforced. - */ - public static org.apache.lucene.search.Query maybeWrapWithTimeout( - org.apache.lucene.search.Query query, - QueryTimeout timeout) { - if (timeout != null) { - return new TerminationQuery(query, timeout); - } - return query; - } - - /** - * Returns a query equivalent to the given query, and with the given timeout enforced. If the - * given query is negated, it is returned without any modifications. - */ - public static org.apache.lucene.search.Query maybeWrapWithTimeout( - org.apache.lucene.search.Query query, - @Nullable com.twitter.search.queryparser.query.Query node, - QueryTimeout timeout) { - // If the node is looking for negation of something, we don't want to include it in node-level - // timeout checks. In general, nodes keep track of the last doc seen, but non-matching docs - // encountered by "must not occur" node do not reflect overall progress in the index. 
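Editor's note: the timeout wrapping above deliberately skips negated nodes, because a MUST_NOT clause iterates non-matching documents and its last-seen doc id says nothing about real progress through the index. A tiny sketch of that policy, with a placeholder standing in for TerminationQuery.

import org.apache.lucene.search.Query;

final class TimeoutWrapSketch {
  static Query maybeWrapWithTimeout(Query query, boolean nodeIsNegated, Object timeout) {
    if (timeout == null || nodeIsNegated) {
      return query; // nothing to enforce, or progress tracking would be misleading
    }
    return wrapWithTimeout(query, timeout);
  }

  private static Query wrapWithTimeout(Query query, Object timeout) {
    return query; // stand-in for new TerminationQuery(query, timeout)
  }
}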
- if (node != null && node.mustNotOccur()) { - return query; - } - return maybeWrapWithTimeout(query, timeout); - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.docx b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.docx new file mode 100644 index 000000000..461be0b29 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.java b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.java deleted file mode 100644 index 83a928185..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairExtractor.java +++ /dev/null @@ -1,211 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.ArrayList; -import java.util.IdentityHashMap; - -import com.google.common.base.Preconditions; - -import com.twitter.search.common.util.text.HighFrequencyTermPairs; -import com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Operator; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.QueryVisitor; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.annotation.Annotation; - -/** - * Iterates over the Query, populating information of an ArrayList of HighFrequencyTermQueryGroup so that - * HighFrequencyTermPairRewriteVisitor can rewrite the query to use hf term pairs. Returns the - * (approximate) number of high frequency terms it has detected. Iff that number is greater than 1 - * it MAY be able to rewrite the query to use the hf_term_pairs field. - * - * The key to HF Term Pair rewriting is understanding which nodes can be combined. This extractor - * accomplishes this job by grouping nodes of the query together. All positive children of a - * conjunction are grouped together, and all negative children of a disjunction are grouped - * together. The end result is a tree of groups, where every child of a single group will have the - * opposite value of isPositive of the parent group. - * - * I'll try to break it down a bit further. Let's assume "a" and "b" are hf terms, and ' - * "[hf_term_pair a b]" represents querying their co-occurence. - * Query (* a b not_hf) can become (* [hf_term_pair a b] not_hf) - * Query (+ -a -b -not_hf) can become (+ -[hf_term_pair a b] -not_hf) - * These two rules represent the bulk of the rewrites that this class makes. - * - * We also keep track of another form of rewrite. A member of a group can be paired up with a member - * of any of its parent groups as long as both groups have the same isPositive value. This - * operation mimics boolean distribution. 
As this is probably better explained with an example: - * Query (* a (+ not_hf (* b not_hf2))) can become (* a (+ not_hf (* [hf_term_pair a b ] not_hf2))) - * Query (+ -a (* not_hf (+ -b not_hf2))) can become (+ -a (* not_hf (+ -[hf_term_pair a b] not_hf2))) - */ -public class HighFrequencyTermPairExtractor extends QueryVisitor { - - private final ArrayList groupList; - private final IdentityHashMap groupIds; - - public HighFrequencyTermPairExtractor(ArrayList groupList, - IdentityHashMap groupIds) { - Preconditions.checkNotNull(groupList); - Preconditions.checkArgument(groupList.isEmpty()); - this.groupList = groupList; - this.groupIds = groupIds; - } - - @Override - public Integer visit(Disjunction disjunction) throws QueryParserException { - return visit((BooleanQuery) disjunction); - } - - @Override - public Integer visit(Conjunction conjunction) throws QueryParserException { - return visit((BooleanQuery) conjunction); - } - - /** - * All positive children under a conjunction (negative children under disjunction) belong in the - * same group as booleanQuery. All other children belong in their own, separate, new groups. - * @param booleanQuery - * @return Number of high frequency terms seen by this node and its children - * @throws QueryParserException - */ - private Integer visit(BooleanQuery booleanQuery) throws QueryParserException { - HighFrequencyTermQueryGroup group = getGroupForQuery(booleanQuery); - int numHits = 0; - - for (Query node : booleanQuery.getChildren()) { - boolean neg = node.mustNotOccur(); - if (node.isTypeOf(Query.QueryType.DISJUNCTION)) { - // Disjunctions, being negative conjunctions, are inherently negative nodes. In terms of - // being in a positive or negative group, we must flip their Occur value. - neg = !neg; - } - - if (booleanQuery.isTypeOf(Query.QueryType.DISJUNCTION) && node.mustOccur()) { - // Potential Example: (* a (+ +b not_c)) => (* (+ +b not_c) [hf_term_pair a b 0.05]) - // Implementation is too difficult and would make this rewriter even MORE complicated for - // a rarely used query. For now, we ignore it completely. We might gain some benefit in the - // future if we decide to create a new extractor and rewriter and rewrite this subquery, and - // that wouldn't complicate things too much. - continue; - } - - if (booleanQuery.isTypeOf(Query.QueryType.CONJUNCTION) != neg) { // Add node to current group - groupIds.put(node, group.groupIdx); - group.numMembers++; - } else { // Create a new group - HighFrequencyTermQueryGroup newGroup = - new HighFrequencyTermQueryGroup(groupList.size(), group.groupIdx, !group.isPositive); - newGroup.numMembers++; - groupIds.put(node, newGroup.groupIdx); - groupList.add(newGroup); - } - numHits += node.accept(this); - } - - return numHits; - } - - @Override - public Integer visit(Phrase phrase) throws QueryParserException { - HighFrequencyTermQueryGroup group = getGroupForQuery(phrase); - - int numFound = 0; - if (!phrase.hasAnnotationType(Annotation.Type.OPTIONAL)) { - boolean canBeRewritten = false; - - // Special case: phrases with exactly 2 terms that are both high frequency can be - // rewritten. In all other cases terms will be treated as pre-used hf term phrases. - if (!phrase.hasAnnotations() && phrase.size() == 2 - && HighFrequencyTermPairs.HF_TERM_SET.contains(phrase.getTerms().get(0)) - && HighFrequencyTermPairs.HF_TERM_SET.contains(phrase.getTerms().get(1))) { - canBeRewritten = true; - } - - // Special case: do not treat phrase containing :prox annotation as a real phrase. 
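Editor's note: the grouping rule described above ("positive children of a conjunction, negative children of a disjunction, stay with the parent") reduces to a single comparison once a disjunction's inherent negation is folded into the child's Occur value. A sketch of that decision, with plain booleans standing in for the query-node checks:

final class HfTermGroupingSketch {
  static boolean joinsParentGroup(boolean parentIsConjunction, boolean childIsNegated,
                                  boolean childIsDisjunction) {
    // Disjunctions are inherently "negative" nodes, so their Occur value is flipped first.
    boolean effectiveNegation = childIsDisjunction ? !childIsNegated : childIsNegated;
    // Positive children of a conjunction and negative children of a disjunction stay
    // in the parent's group; everything else opens a new group with flipped polarity.
    return parentIsConjunction != effectiveNegation;
  }

  public static void main(String[] args) {
    System.out.println(joinsParentGroup(true, false, false));  // positive child of (*)  -> true
    System.out.println(joinsParentGroup(true, true, false));   // negated child of (*)   -> false
    System.out.println(joinsParentGroup(false, true, false));  // negated child of (+)   -> true
  }
}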
- boolean proximityPhrase = phrase.hasAnnotationType(Annotation.Type.PROXIMITY); - - String lastHFToken = null; - for (String token : phrase.getTerms()) { - if (HighFrequencyTermPairs.HF_TERM_SET.contains(token)) { - group.preusedHFTokens.add(token); - if (group.distributiveToken == null) { - group.distributiveToken = token; - } - if (lastHFToken != null && !proximityPhrase) { - if (canBeRewritten) { - group.hfPhrases.add(lastHFToken + " " + token); - } else { - group.preusedHFPhrases.add(lastHFToken + " " + token); - } - } - lastHFToken = token; - numFound++; - } else { - lastHFToken = null; - } - } - } - - return numFound; - } - - @Override - public Integer visit(Term term) throws QueryParserException { - if (groupList.isEmpty()) { // Shortcut for 1 term queries. - return 0; - } - - HighFrequencyTermQueryGroup group = getGroupForQuery(term); - - if (!term.hasAnnotationType(Annotation.Type.OPTIONAL) - && HighFrequencyTermPairs.HF_TERM_SET.contains(term.getValue())) { - if (!term.hasAnnotations()) { - group.hfTokens.add(term.getValue()); - } else { // Should not remove the annotated term. - group.preusedHFTokens.add(term.getValue()); - } - - if (group.distributiveToken == null) { - group.distributiveToken = term.getValue(); - } - return 1; - } - - return 0; - } - - @Override - public Integer visit(Operator operator) throws QueryParserException { - return 0; - } - - @Override - public Integer visit(SpecialTerm special) throws QueryParserException { - return 0; - } - - /** - * Uses the query's visitor data as an index and returns the group it belongs to. If groupList is - * empty, create a new group and set this group's visitor data to be index 0. - * @param query - * @return the group which query belongs to. - */ - private HighFrequencyTermQueryGroup getGroupForQuery(Query query) { - if (groupList.isEmpty()) { - boolean pos = !query.mustNotOccur(); - if (query instanceof Disjunction) { - pos = !pos; - } - HighFrequencyTermQueryGroup group = new HighFrequencyTermQueryGroup(0, pos); - group.numMembers++; - groupList.add(group); - groupIds.put(query, 0); - } - - return groupList.get(groupIds.get(query)); - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.docx b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.docx new file mode 100644 index 000000000..e521d0e7d Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.java b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.java deleted file mode 100644 index a54b46e5e..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermPairRewriteVisitor.java +++ /dev/null @@ -1,477 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.ArrayList; -import java.util.IdentityHashMap; -import java.util.List; -import java.util.Set; - -import javax.annotation.Nullable; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.metrics.SearchRateCounter; -import com.twitter.search.common.util.text.HighFrequencyTermPairs; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.queryparser.parser.SerializedQueryParser; -import 
com.twitter.search.queryparser.query.BooleanQuery; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Operator; -import com.twitter.search.queryparser.query.Phrase; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.QueryNodeUtils; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.query.QueryVisitor; -import com.twitter.search.queryparser.query.SpecialTerm; -import com.twitter.search.queryparser.query.Term; -import com.twitter.search.queryparser.query.search.SearchOperator; - -/** - * Iterates over the Query, modifying it to include high frequency term pairs, replacing - * singular high frequency terms where possible. - * - * Assumes that this will be used IMMEDIATELY after using HighFrequencyTermPairExtractor - * - * There are two primary functions of this visitor: - * 1. Append hf_term_pairs to each group's root node. - * 2. Remove all unnecessary term queries (unnecessary as they are captured by an hf_term_pair) - * - * Every time the visitor finishes visiting a node, HighFrequencyTermQueryGroup.numVisits will be - * incremented for that node's group. When numVisits == numChildren, we know we have just finished - * processing the root of the group. At this point, we must append relevant hf_term_pairs to this - * node. - */ -public class HighFrequencyTermPairRewriteVisitor extends QueryVisitor { - private static final Logger LOG = LoggerFactory.getLogger( - HighFrequencyTermPairRewriteVisitor.class); - private static final SearchRateCounter SEARCH_HF_PAIR_COUNTER = - SearchRateCounter.export("hf_pair_rewrite"); - - private final ArrayList groupList; - private final IdentityHashMap groupIds; - private final boolean allowNegativeOrRewrite; - - /** - * Creates a new HighFrequencyTermPairRewriteVisitor. Should be used only IMMEDIATELY after using - * a HighFrequencyTermPairExtractor - * @param groupList The groups extracted using HighFrequencyTermPairExtractor - * @param groupIds the mapping from query to the HF term query group - */ - public HighFrequencyTermPairRewriteVisitor(ArrayList groupList, - IdentityHashMap groupIds) { - this(groupList, groupIds, true); - } - - /** - * Creates a new HighFrequencyTermPairRewriteVisitor. Should be used only IMMEDIATELY after using - * a HighFrequencyTermPairExtractor - * @param groupList The groups extracted using HighFrequencyTermPairExtractor - * @param groupIds the mapping from query to the HF term query group - * @param allowNegativeOrRewrite whether to allow rewrite for 'or (-terms)' - */ - public HighFrequencyTermPairRewriteVisitor(ArrayList groupList, - IdentityHashMap groupIds, - boolean allowNegativeOrRewrite) { - this.groupList = groupList; - this.groupIds = groupIds; - this.allowNegativeOrRewrite = allowNegativeOrRewrite; - } - - /** - * This method logs successful rewrites, and protects against unsuccessful ones by - * catching all exceptions and restoring the previous query. - */ - public static Query safeRewrite(Query safeQuery, boolean allowNegativeOrRewrite) - throws QueryParserException { - Query query = safeQuery; - - ArrayList groups = Lists.newArrayList(); - IdentityHashMap groupIds = Maps.newIdentityHashMap(); - - // Step 1: extract high frequency term pairs and phrases. 
- try { - int hfTermsFound = query.accept(new HighFrequencyTermPairExtractor(groups, groupIds)); - if (hfTermsFound < 2) { - return query; - } - } catch (Exception e) { - LOG.error("Exception while extracting high frequency term pairs", e); - return query; - } - - // Step 2: rewrite (safely). - String original = query.serialize(); - try { - query = query.accept( - new HighFrequencyTermPairRewriteVisitor(groups, groupIds, allowNegativeOrRewrite)) - .simplify(); - String rewrite = query.serialize(); - if (LOG.isDebugEnabled()) { - LOG.debug("Optimized query: " + original + " -> " + rewrite); - } - SEARCH_HF_PAIR_COUNTER.increment(); - return query; - } catch (Exception e) { - LOG.error("Exception rewriting high frequency term pairs", e); - return new SerializedQueryParser(EarlybirdConfig.getPenguinVersion()).parse(original); - } - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param disjunction query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(Disjunction disjunction) throws QueryParserException { - return visit((BooleanQuery) disjunction); - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param conjunction query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(Conjunction conjunction) throws QueryParserException { - return visit((BooleanQuery) conjunction); - } - - /** - * Applies this visitor to a BooleanQuery. - */ - public Query visit(BooleanQuery booleanQuery) throws QueryParserException { - HighFrequencyTermQueryGroup group = groupList.get(groupIds.get(booleanQuery)); - queryPreprocess(group); - - ArrayList children = Lists.newArrayList(); - for (Query node : booleanQuery.getChildren()) { - if (booleanQuery.isTypeOf(Query.QueryType.DISJUNCTION) && node.mustOccur()) { - // Potential Example: (* a (+ +b not_c)) => (* (+ +b not_c) [hf_term_pair a b 0.05]) - // Implementation is too difficult and would make this rewriter even MORE complicated for - // a rarely used query. For now, we ignore it completely. We might gain some benefit in the - // future if we decide to create a new extractor and rewriter and rewrite this subquery, and - // that wouldn't complicate things too much. - children.add(node); - continue; - } - Query child = node.accept(this); - if (child != null) { - children.add(child); - } - } - - Query newBooleanQuery = booleanQuery.newBuilder().setChildren(children).build(); - - return queryPostprocess(newBooleanQuery, group); - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param phraseToVisit query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(Phrase phraseToVisit) throws QueryParserException { - Phrase phrase = phraseToVisit; - - HighFrequencyTermQueryGroup group = groupList.get(groupIds.get(phrase)); - queryPreprocess(group); - - // Remove all high frequency phrases from the query that do not have any annotations. - // This will cause phrase de-duping, which we probably don't care about. - if (!hasAnnotations(phrase) && ( - group.hfPhrases.contains(phrase.getPhraseValue()) - || group.preusedHFPhrases.contains(phrase.getPhraseValue()))) { - // This term will be appended to the end of the query in the form of a pair. 
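Editor's note: safeRewrite above is a defensive pattern: serialize the query up front, attempt the optimization, and on any exception log and fall back to the original (in the real code by re-parsing the serialized form, since the query object may have been partially rewritten). A stripped-down sketch of that pattern over serialized query strings, with a hypothetical Rewriter interface:

final class SafeRewriteSketch {
  interface Rewriter {
    String rewrite(String serializedQuery) throws Exception;
  }

  static String safeRewrite(String serializedQuery, Rewriter rewriter) {
    String original = serializedQuery; // captured before any mutation
    try {
      return rewriter.rewrite(original);
    } catch (Exception e) {
      // Log and keep serving: the un-rewritten query is always a valid answer.
      System.err.println("Exception rewriting high frequency term pairs: " + e);
      return original;
    }
  }
}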
- phrase = null; - } - - return queryPostprocess(phrase, group); - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param termToVisit query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(Term termToVisit) throws QueryParserException { - Term term = termToVisit; - - HighFrequencyTermQueryGroup group = groupList.get(groupIds.get(term)); - queryPreprocess(group); - - // Remove all high frequency terms from the query that do not have any annotations. This will - // do term de-duping within a group, which may effect scoring, but since these are high df - // terms, they don't have much of an impact anyways. - if (!hasAnnotations(term) - && (group.preusedHFTokens.contains(term.getValue()) - || group.hfTokens.contains(term.getValue()))) { - // This term will be appended to the end of the query in the form of a pair. - term = null; - } - - return queryPostprocess(term, group); - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param operator query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(Operator operator) throws QueryParserException { - HighFrequencyTermQueryGroup group = groupList.get(groupIds.get(operator)); - queryPreprocess(group); - - return queryPostprocess(operator, group); - } - - /** - * The rewritten query to use the hf_term_pair operators. - * - * @param special query node which must have been previously visited by - * HighFrequencyTermPairExtractor and not had its visitor data cleared. - */ - @Override - public Query visit(SpecialTerm special) throws QueryParserException { - HighFrequencyTermQueryGroup group = groupList.get(groupIds.get(special)); - queryPreprocess(group); - - return queryPostprocess(special, group); - } - - /** - * Before visiting a node's children, we must process its group's distributiveToken. This way, a - * node only has to check its grandparent group for a distributiveToken instead of recursing all - * of the way up to the root of the tree. - */ - private void queryPreprocess(HighFrequencyTermQueryGroup group) { - if (group.distributiveToken == null) { - group.distributiveToken = getAncestorDistributiveToken(group); - } - } - - /** - * If the query isn't the root of the group, returns the query. Otherwise, if the query's - * group has at most one hf term, return the query. Otherwise, returns the query with hf_term_pair - * operators created from the group's hf terms appended to it. - */ - private Query queryPostprocess(@Nullable Query query, HighFrequencyTermQueryGroup group) - throws QueryParserException { - - group.numVisits++; - if (group.numMembers == group.numVisits - && (!group.hfTokens.isEmpty() || !group.preusedHFTokens.isEmpty() - || group.hasPhrases())) { - - group.removePreusedTokens(); - String ancestorDistributiveToken = getAncestorDistributiveToken(group); - - // Need at least 2 tokens to perform a pair rewrite. Try to get one - // additional token from ancestors, and if that fails, from phrases. 
- if ((group.hfTokens.size() + group.preusedHFTokens.size()) == 1 - && ancestorDistributiveToken != null) { - group.preusedHFTokens.add(ancestorDistributiveToken); - } - if ((group.hfTokens.size() + group.preusedHFTokens.size()) == 1) { - String tokenFromPhrase = group.getTokenFromPhrase(); - if (tokenFromPhrase != null) { - group.preusedHFTokens.add(tokenFromPhrase); - } - } - - return appendPairs(query, group); - } - - return query; - } - - /** - * Returns the distributiveToken of group's grandparent. - */ - private String getAncestorDistributiveToken(HighFrequencyTermQueryGroup group) { - String ancestorDistributiveToken = null; - if (group.parentGroupIdx >= 0 && groupList.get(group.parentGroupIdx).parentGroupIdx >= 0) { - ancestorDistributiveToken = - groupList.get(groupList.get(group.parentGroupIdx).parentGroupIdx).distributiveToken; - } - return ancestorDistributiveToken; - } - - /** - * Returns the hf_term_pair operators created using the hf terms of the group appended to query. - * - * @param query The query which the new hf_term_pair operators will be appended to. - * @param group The group which this query belongs to. - * @return The hf_term_pair operators created using the hf terms of the group appended to query. - */ - private Query appendPairs(@Nullable Query query, HighFrequencyTermQueryGroup group) - throws QueryParserException { - - BooleanQuery query2 = createQueryFromGroup(group); - - // If either of the queries are null, do not have to worry about combining them. - if (query2 == null) { - return query; - } else if (query == null) { - return query2; - } - - Query newQuery; - - if (query.isTypeOf(Query.QueryType.CONJUNCTION) - || query.isTypeOf(Query.QueryType.DISJUNCTION)) { - // Adding children in this way is safer when its query is a conjunction or disjunction - // ex. Other way: (+ +de -la -the) => (+ (+ +de -la -the) -[hf_term_pair la the 0.005]) - // This way: (+ +de -la -the) => (+ +de -la -the -[hf_term_pair la the 0.005]) - return ((BooleanQuery.Builder) query.newBuilder()).addChildren(query2.getChildren()).build(); - } else if (!group.isPositive) { - // In lucene, [+ (-term1, -term2, ...)] has non-deterministic behavior and the rewrite is not - // efficient from query execution perspective. So, we will not do this rewrite if it is - // configured that way. - if (!allowNegativeOrRewrite) { - return query; - } - - // Negate both queries to combine, and the append as a conjunction, followed by negating - // whole query. Equivalent to appending as a disjunction. - newQuery = QueryNodeUtils.appendAsConjunction( - query.negate(), - query2.negate() - ); - newQuery = newQuery.makeMustNot(); - } else { - newQuery = QueryNodeUtils.appendAsConjunction(query, query2); - newQuery = newQuery.makeDefault(); - } - - return newQuery; - } - - /** - * Creates a conjunction of term_pairs using the sets of hf terms in HighFrequencyTermQueryGroup - * group. If !group.isPositive, will return a disjunction of negated pairs. If there aren't enough - * hfTokens, will return null. 
- */ - private BooleanQuery createQueryFromGroup(HighFrequencyTermQueryGroup group) - throws QueryParserException { - - if (!group.hfTokens.isEmpty() || group.preusedHFTokens.size() > 1 || group.hasPhrases()) { - List terms = createTermPairsForGroup(group.hfTokens, - group.preusedHFTokens, - group.hfPhrases, - group.preusedHFPhrases); - - if (group.isPositive) { - return new Conjunction(terms); - } else { - return new Disjunction(Lists.transform(terms, QueryNodeUtils.NEGATE_QUERY)); - } - } - - return null; - } - - /** - * Creates HF_TERM_PAIR terms out of hfTokens and optHFTokens. Attempts to create the minimal - * amount of tokens necessary. optHFToken pairs should be given a weight of 0.0 and not be scored, - * as they are likely already included in the query in a phrase or an annotated term. - * @param hfTokens - * @param optHFTokens - * @return A list of hf_term_pair operators. - */ - private List createTermPairsForGroup(Set hfTokens, - Set optHFTokens, - Set hfPhrases, - Set optHFPhrases) { - // Handle sets with only one token. - if (optHFTokens.size() == 1 && hfTokens.size() > 0) { - // (* "a not_hf" b c) => (* "a not_hf" [hf_term_pair a b 0.05] [hf_term_pair b c 0.05]) - // optHFTokens: [a] hfTokens: [b, c] => optHFTokens: [] hfTokens: [a, b, c] - hfTokens.addAll(optHFTokens); - optHFTokens.clear(); - } else if (hfTokens.size() == 1 && optHFTokens.size() > 0) { - // (* "a b" not_hf c) => (* "a b" not_hf [hf_term_pair a b 0.0] [hf_term_pair a c 0.005]) - // optHFTokens: [a, b] hfTokens: [c] => optHFTokens: [a, b] hfTokens: [a, c] - String term = optHFTokens.iterator().next(); - hfTokens.add(term); - } - - List terms = createTermPairs(hfTokens, true, HighFrequencyTermPairs.HF_DEFAULT_WEIGHT); - terms.addAll(createTermPairs(optHFTokens, false, 0)); - terms.addAll(createPhrasePairs(hfPhrases, HighFrequencyTermPairs.HF_DEFAULT_WEIGHT)); - terms.addAll(createPhrasePairs(optHFPhrases, 0)); - - return terms; - } - - /** - * Turns a set of hf terms into a list of hf_term_pair operators. Each term will be used at least - * once in as few pairs as possible. - * @param tokens - * @param createSingle If the set contains only one query, the returned list will contain a single - * Term for that query if createSingle is true, and an empty list otherwise. - * @param weight Each term pair will be given a score boost of serializedWeight. - * @return - */ - private static List createTermPairs(Set tokens, boolean createSingle, - double weight) { - - List terms = Lists.newArrayList(); - if (tokens.size() >= 2) { - int tokensLeft = tokens.size(); - String token1 = null; - for (String token2 : tokens) { - if (token1 == null) { - token1 = token2; - } else { - terms.add(createHFTermPair(token1, token2, weight)); - - if (tokensLeft > 2) { // Only reset if there is more than one token remaining. - token1 = null; - } - } - tokensLeft--; - } - } else if (createSingle && !tokens.isEmpty()) { // Only one high frequency token - // Need to add token as a term because it was removed from the query earlier in rewriting. 
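Editor's note: createTermPairs above walks the token set pairing neighbours, and only "closes" a pair when at least two tokens remain, so an odd final token is paired with the first member of the previous pair rather than being dropped. A runnable sketch of that walk, emitting plain "a+b" strings instead of HF_TERM_PAIR operators:

import java.util.ArrayList;
import java.util.Arrays;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

final class HfTermPairingSketch {
  static List<String> pair(Set<String> tokens) {
    List<String> pairs = new ArrayList<>();
    if (tokens.size() < 2) {
      return pairs; // nothing to pair
    }
    int tokensLeft = tokens.size();
    String first = null;
    for (String token : tokens) {
      if (first == null) {
        first = token;
      } else {
        pairs.add(first + "+" + token);
        if (tokensLeft > 2) {
          first = null; // start a fresh pair only while 2+ tokens remain after this one
        }
      }
      tokensLeft--;
    }
    return pairs;
  }

  public static void main(String[] args) {
    System.out.println(pair(new LinkedHashSet<>(Arrays.asList("a", "b", "c"))));
    // -> [a+b, a+c]  (the odd token "c" reuses "a" as its partner)
  }
}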
- Term newTerm = new Term(tokens.iterator().next()); - terms.add(newTerm); - } - - return terms; - } - - private static List createPhrasePairs(Set phrases, double weight) { - List ops = Lists.newArrayList(); - for (String phrase : phrases) { - String[] terms = phrase.split(" "); - assert terms.length == 2; - SearchOperator op = new SearchOperator(SearchOperator.Type.HF_PHRASE_PAIR, - terms[0], terms[1], Double.toString(weight)); - ops.add(op); - } - return ops; - } - - private static SearchOperator createHFTermPair(String token1, String token2, double weight) { - SearchOperator op = new SearchOperator(SearchOperator.Type.HF_TERM_PAIR, - token1, token2, Double.toString(weight)); - return op; - } - - private static boolean hasAnnotations(com.twitter.search.queryparser.query.Query node) { - return node.hasAnnotations(); - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.docx b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.docx new file mode 100644 index 000000000..ee5f7107c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.java b/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.java deleted file mode 100644 index f6b40f868..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/HighFrequencyTermQueryGroup.java +++ /dev/null @@ -1,94 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.ArrayList; -import java.util.List; -import java.util.Set; - -import com.google.common.collect.Sets; - -/** - * Used to store information relevant to processing query groups for HighFrequencyTermPairExtractor - * and HighFrequencyTermPairRewriter - */ -public class HighFrequencyTermQueryGroup { - protected final int groupIdx; - protected final int parentGroupIdx; - // The number of nodes in this group. - protected int numMembers = 0; - // For the rewrite visitor: Incremented once at the end of each of this group's nodes' visits. - protected int numVisits = 0; - - // The set of tokens that should be removed from the query if seen as an individual term and - // rewritten in the query as a hf term pair. - protected final Set hfTokens = Sets.newTreeSet(); - - // Tokens that can be used to restrict searches but should not be scored. They will be given a - // weight of 0. - protected final Set preusedHFTokens = Sets.newTreeSet(); - - // Set of phrases that should be removed from the query if seen as an individual phrase and - // rewritten in the query as a hf term phrase pair. - protected final Set hfPhrases = Sets.newTreeSet(); - - // Phrases that can be used to restrict searches but should not be scored. They will be given a - // weight of 0. - protected final Set preusedHFPhrases = Sets.newTreeSet(); - - // The first found hf_term, or the hf_term of an ancestor with the same isPositive value. - protected String distributiveToken = null; - - // If it is a single node group, isPositive is true iff that node is true. - // Otherwise, isPositive is false iff the root of the group is a disjunction. 
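Editor's note: createPhrasePairs above assumes every stored phrase is exactly two high-frequency terms joined by a space and turns each one into a single pair operator. A small sketch of that conversion; the serialized "[hf_phrase_pair ...]" spelling is illustrative, not the exact operator syntax.

import java.util.ArrayList;
import java.util.List;

final class HfPhrasePairSketch {
  static List<String> createPhrasePairs(List<String> phrases, double weight) {
    List<String> ops = new ArrayList<>();
    for (String phrase : phrases) {
      String[] terms = phrase.split(" ");
      if (terms.length != 2) {
        throw new IllegalArgumentException("expected a two-term phrase: " + phrase);
      }
      // One pair operator per stored two-term phrase, carrying the given weight.
      ops.add("[hf_phrase_pair " + terms[0] + " " + terms[1] + " " + weight + "]");
    }
    return ops;
  }
}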
- protected final boolean isPositive; - - public HighFrequencyTermQueryGroup(int groupIdx, boolean positive) { - this(groupIdx, -1, positive); - } - - public HighFrequencyTermQueryGroup(int groupIdx, int parentGroupIdx, boolean positive) { - this.groupIdx = groupIdx; - this.parentGroupIdx = parentGroupIdx; - isPositive = positive; - } - - public boolean hasPhrases() { - return !hfPhrases.isEmpty() || !preusedHFPhrases.isEmpty(); - } - - protected List tokensFromPhrases() { - if (!hasPhrases()) { - return null; - } - List tokens = new ArrayList<>(); - for (String phrase : hfPhrases) { - for (String term : phrase.split(" ")) { - tokens.add(term); - } - } - for (String phrase : preusedHFPhrases) { - for (String term : phrase.split(" ")) { - tokens.add(term); - } - } - return tokens; - } - - protected void removePreusedTokens() { - hfTokens.removeAll(preusedHFTokens); - List phraseTokens = tokensFromPhrases(); - if (phraseTokens != null) { - hfTokens.removeAll(phraseTokens); - preusedHFTokens.removeAll(phraseTokens); - } - hfPhrases.removeAll(preusedHFPhrases); - } - - protected String getTokenFromPhrase() { - List phraseTokens = tokensFromPhrases(); - if (phraseTokens != null) { - return phraseTokens.get(0); - } else { - return null; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.docx b/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.docx new file mode 100644 index 000000000..9f0ac9df3 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.java b/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.java deleted file mode 100644 index cf749ad20..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/LuceneRelevanceQueryVisitor.java +++ /dev/null @@ -1,69 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.lucene.search.Query; - -import com.twitter.decider.Decider; -import com.twitter.search.common.query.MappableField; -import com.twitter.search.common.schema.base.FieldWeightDefault; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.common.search.termination.QueryTimeout; -import com.twitter.search.earlybird.common.userupdates.UserScrubGeoMap; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.partition.MultiSegmentTermDictionaryManager; -import com.twitter.search.earlybird.querycache.QueryCacheManager; -import com.twitter.search.queryparser.query.search.SearchOperator; - -public class LuceneRelevanceQueryVisitor extends EarlybirdLuceneQueryVisitor { - public LuceneRelevanceQueryVisitor( - ImmutableSchemaInterface schema, - QueryCacheManager queryCacheManager, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - TerminationTracker terminationTracker, - Map fieldWeightMap, - Map mappableFieldMap, - MultiSegmentTermDictionaryManager multiSegmentTermDictionaryManager, - Decider decider, - EarlybirdCluster earlybirdCluster, - QueryTimeout queryTimeout) { - super( - schema, - queryCacheManager, - userTable, - userScrubGeoMap, - terminationTracker, - fieldWeightMap, - mappableFieldMap, 
- multiSegmentTermDictionaryManager, - decider, - earlybirdCluster, - queryTimeout); - } - - @VisibleForTesting - protected LuceneRelevanceQueryVisitor( - ImmutableSchemaInterface schema, - QueryCacheManager queryCacheManager, - UserTable userTable, - UserScrubGeoMap userScrubGeoMap, - EarlybirdCluster earlybirdCluster) { - super(schema, - queryCacheManager, - userTable, - userScrubGeoMap, - earlybirdCluster, - queryCacheManager.getDecider()); - } - - @Override - protected Query visitSinceIDOperator(SearchOperator op) { - // since_id is handled by the blender for relevance queries, so don't filter on it. - return null; - } -} diff --git a/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.docx b/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.docx new file mode 100644 index 000000000..bb639213f Binary files /dev/null and b/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.docx differ diff --git a/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.java b/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.java deleted file mode 100644 index fd35ac61c..000000000 --- a/src/java/com/twitter/search/earlybird/queryparser/ProtectedOperatorQueryRewriter.java +++ /dev/null @@ -1,153 +0,0 @@ -package com.twitter.search.earlybird.queryparser; - -import java.util.List; - -import com.google.common.base.Preconditions; -import com.google.common.collect.ImmutableList; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.queryparser.query.Conjunction; -import com.twitter.search.queryparser.query.Disjunction; -import com.twitter.search.queryparser.query.Query; -import com.twitter.search.queryparser.query.search.SearchOperator; -import com.twitter.search.queryparser.query.search.SearchOperatorConstants; - -public class ProtectedOperatorQueryRewriter { - private static final String ERROR_MESSAGE = "Positive 'protected' operator must be in the root" - + " query node and the root query node must be a Conjunction."; - private static final Query EXCLUDE_PROTECTED_OPERATOR = - new SearchOperator(SearchOperator.Type.EXCLUDE, SearchOperatorConstants.PROTECTED); - - /** - * Rewrite a query with positive 'protected' operator into an equivalent query without the positive - * 'protected' operator. This method assumes the following preconditions hold: - * 1. 'followedUserIds' is not empty - * 2. the query's root node is of type Conjunction - * 3. the query's root node is not negated - * 4. there is one positive 'protected' operator in the root node - * 5. there is only one 'protected' operator in the whole query - * - * Query with '[include protected]' operator is rewritten into a Disjunction of a query with - * protected Tweets only and a query with public Tweets only. - * For example, - * Original query: - * (* "cat" [include protected]) - * with followedUserIds=[1, 7, 12] where 1 and 7 are protected users - * Rewritten query: - * (+ - * (* "cat" [multi_term_disjunction from_user_id 1 7]) - * (* "cat" [exclude protected]) - * ) - * - * Query with '[filter protected]' operator is rewritten with multi_term_disjunction from_user_id - * operator. 
- * For example, - * Original query: - * (* "cat" [filter protected]) - * with followedUserIds=[1, 7, 12] where 1 and 7 are protected users - * Rewritten query: - * (* "cat" [multi_term_disjunction from_user_id 1 7]) - */ - public Query rewrite(Query parsedQuery, List followedUserIds, UserTable userTable) { - Preconditions.checkState(followedUserIds != null && !followedUserIds.isEmpty(), - "'followedUserIds' should not be empty when positive 'protected' operator exists."); - Preconditions.checkState( - parsedQuery.isTypeOf(com.twitter.search.queryparser.query.Query.QueryType.CONJUNCTION), - ERROR_MESSAGE); - Conjunction parsedConjQuery = (Conjunction) parsedQuery; - List children = parsedConjQuery.getChildren(); - int opIndex = findPositiveProtectedOperatorIndex(children); - Preconditions.checkState(opIndex >= 0, ERROR_MESSAGE); - SearchOperator protectedOp = (SearchOperator) children.get(opIndex); - - ImmutableList.Builder otherChildrenBuilder = ImmutableList.builder(); - otherChildrenBuilder.addAll(children.subList(0, opIndex)); - if (opIndex + 1 < children.size()) { - otherChildrenBuilder.addAll(children.subList(opIndex + 1, children.size())); - } - List otherChildren = otherChildrenBuilder.build(); - - List protectedUserIds = getProtectedUserIds(followedUserIds, userTable); - if (protectedOp.getOperatorType() == SearchOperator.Type.FILTER) { - if (protectedUserIds.isEmpty()) { - // match none query - return Disjunction.EMPTY_DISJUNCTION; - } else { - return parsedConjQuery.newBuilder() - .setChildren(otherChildren) - .addChild(createFromUserIdMultiTermDisjunctionQuery(protectedUserIds)) - .build(); - } - } else { - // 'include' or negated 'exclude' operator - // negated 'exclude' is considered the same as 'include' to be consistent with the logic in - // EarlybirdLuceneQueryVisitor - if (protectedUserIds.isEmpty()) { - // return public only query - return parsedConjQuery.newBuilder() - .setChildren(otherChildren) - .addChild(EXCLUDE_PROTECTED_OPERATOR) - .build(); - } else { - // build a disjunction of protected only query and public only query - Query protectedOnlyQuery = parsedConjQuery.newBuilder() - .setChildren(otherChildren) - .addChild(createFromUserIdMultiTermDisjunctionQuery(protectedUserIds)) - .build(); - Query publicOnlyQuery = parsedConjQuery.newBuilder() - .setChildren(otherChildren) - .addChild(EXCLUDE_PROTECTED_OPERATOR) - .build(); - return new Disjunction(protectedOnlyQuery, publicOnlyQuery); - } - } - } - - private Query createFromUserIdMultiTermDisjunctionQuery(List userIds) { - ImmutableList.Builder operandsBuilder = ImmutableList.builder(); - operandsBuilder - .add(EarlybirdFieldConstants.EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName()); - for (Long userId : userIds) { - operandsBuilder.add(userId.toString()); - } - List operands = operandsBuilder.build(); - return new SearchOperator(SearchOperator.Type.MULTI_TERM_DISJUNCTION, operands); - } - - private List getProtectedUserIds(List followedUserIds, UserTable userTable) { - ImmutableList.Builder protectedUserIds = ImmutableList.builder(); - for (Long userId : followedUserIds) { - if (userTable.isSet(userId, UserTable.IS_PROTECTED_BIT)) { - protectedUserIds.add(userId); - } - } - return protectedUserIds.build(); - } - - private int findPositiveProtectedOperatorIndex(List children) { - for (int i = 0; i < children.size(); i++) { - Query child = children.get(i); - if (child instanceof SearchOperator) { - SearchOperator searchOp = (SearchOperator) child; - if 
(SearchOperatorConstants.PROTECTED.equals(searchOp.getOperand()) - && (isNegateExclude(searchOp) || isPositive(searchOp))) { - return i; - } - } - } - - return -1; - } - - private boolean isNegateExclude(SearchOperator searchOp) { - return searchOp.mustNotOccur() - && searchOp.getOperatorType() == SearchOperator.Type.EXCLUDE; - } - - private boolean isPositive(SearchOperator searchOp) { - return !searchOp.mustNotOccur() - && (searchOp.getOperatorType() == SearchOperator.Type.INCLUDE - || searchOp.getOperatorType() == SearchOperator.Type.FILTER); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.docx b/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.docx new file mode 100644 index 000000000..00b666736 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.java b/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.java deleted file mode 100644 index d18fcdfda..000000000 --- a/src/java/com/twitter/search/earlybird/search/AbstractResultsCollector.java +++ /dev/null @@ -1,630 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Optional; -import com.google.common.base.Preconditions; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; - -import org.apache.commons.collections.CollectionUtils; -import org.apache.lucene.index.LeafReader; -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.ScoreMode; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser; -import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures; -import com.twitter.search.common.results.thriftjava.FieldHitAttribution; -import com.twitter.search.common.results.thriftjava.FieldHitList; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.TwitterEarlyTerminationCollector; -import com.twitter.search.common.util.spatial.GeoUtil; -import com.twitter.search.core.earlybird.facets.AbstractFacetCountingArray; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.core.earlybird.index.TimeMapper; -import com.twitter.search.core.earlybird.index.inverted.QueryCostTracker; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher; -import com.twitter.search.earlybird.index.TweetIDMapper; -import com.twitter.search.earlybird.search.facets.FacetLabelCollector; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftFacetLabel; -import 
com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResultGeoLocation; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; -import com.twitter.search.queryparser.util.IdTimeRanges; - -import geo.google.datamodel.GeoCoordinate; - -/** - * Abstract parent class for all results collectors in earlybird. - * This collector should be able to handle both single-segment and - * multi-segment collection. - */ -public abstract class AbstractResultsCollector - extends TwitterEarlyTerminationCollector { - enum IdAndRangeUpdateType { - BEGIN_SEGMENT, - END_SEGMENT, - HIT - } - - // Earlybird used to have a special early termination logic: at segment boundaries - // the collector estimates how much time it'll take to search the next segment. - // If this estimate * 1.5 will cause the request to timeout, the search early terminates. - // That logic is removed in favor of more fine grained checks---now we check timeout - // within a segment, every 2,000,000 docs processed. - private static final int EXPENSIVE_TERMINATION_CHECK_INTERVAL = - EarlybirdConfig.getInt("expensive_termination_check_interval", 2000000); - - private static final long NO_TIME_SLICE_ID = -1; - - protected final R searchRequestInfo; - - // Sometimes maxHitsToProcess can also come from places other than collector params. - // E.g. from searchQuery.getRelevanceOptions(). This provides a way to allow - // subclasses to override the maxHitsToProcess on collector params. - private final long maxHitsToProcessOverride; - - // min and max status id actually considered in the search (may not be a hit) - private long minSearchedStatusID = Long.MAX_VALUE; - private long maxSearchedStatusID = Long.MIN_VALUE; - - private int minSearchedTime = Integer.MAX_VALUE; - private int maxSearchedTime = Integer.MIN_VALUE; - - // per-segment start time. Will be re-started in setNextReader(). - private long segmentStartTime; - - // Current segment being searched. - protected EarlybirdIndexSegmentAtomicReader currTwitterReader; - protected TweetIDMapper tweetIdMapper; - protected TimeMapper timeMapper; - protected long currTimeSliceID = NO_TIME_SLICE_ID; - - private final long queryTime; - - // Time periods, in milliseconds, for which hits are counted. - private final List hitCountsThresholdsMsec; - - // hitCounts[i] is the number of hits that are more recent than hitCountsThresholdsMsec[i] - private final int[] hitCounts; - - private final ImmutableSchemaInterface schema; - - private final EarlybirdSearcherStats searcherStats; - // For collectors that fill in the results' geo locations, this will be used to retrieve the - // documents' lat/lon coordinates. - private GeoCoordinate resultGeoCoordinate; - protected final boolean fillInLatLonForHits; - - protected EarlybirdDocumentFeatures documentFeatures; - protected boolean featuresRequested = false; - - private final FacetLabelCollector facetCollector; - - // debugMode set in request to determine debugging level. 
- private int requestDebugMode; - - // debug info to be returned in earlybird response - protected List debugInfo; - - private int numHitsCollectedPerSegment; - - public AbstractResultsCollector( - ImmutableSchemaInterface schema, - R searchRequestInfo, - Clock clock, - EarlybirdSearcherStats searcherStats, - int requestDebugMode) { - super(searchRequestInfo.getSearchQuery().getCollectorParams(), - searchRequestInfo.getTerminationTracker(), - QueryCostTracker.getTracker(), - EXPENSIVE_TERMINATION_CHECK_INTERVAL, - clock); - - this.schema = schema; - this.searchRequestInfo = searchRequestInfo; - ThriftSearchQuery thriftSearchQuery = searchRequestInfo.getSearchQuery(); - this.maxHitsToProcessOverride = searchRequestInfo.getMaxHitsToProcess(); - this.facetCollector = buildFacetCollector(searchRequestInfo, schema); - - if (searchRequestInfo.getTimestamp() > 0) { - queryTime = searchRequestInfo.getTimestamp(); - } else { - queryTime = System.currentTimeMillis(); - } - hitCountsThresholdsMsec = thriftSearchQuery.getHitCountBuckets(); - hitCounts = hitCountsThresholdsMsec == null || hitCountsThresholdsMsec.size() == 0 - ? null - : new int[hitCountsThresholdsMsec.size()]; - - this.searcherStats = searcherStats; - - Schema.FieldInfo latLonCSFField = - schema.hasField(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName()) - ? schema.getFieldInfo(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName()) - : null; - boolean loadLatLonMapperIntoRam = true; - if (latLonCSFField != null) { - // If the latlon_csf field is explicitly defined, then take the config from the schema. - // If it's not defined, we assume that the latlon mapper is stored in memory. - loadLatLonMapperIntoRam = latLonCSFField.getFieldType().isCsfLoadIntoRam(); - } - // Default to not fill in lat/lon if the lat/lon CSF field is not loaded into RAM - this.fillInLatLonForHits = EarlybirdConfig.getBool("fill_in_lat_lon_for_hits", - loadLatLonMapperIntoRam); - this.requestDebugMode = requestDebugMode; - - if (shouldCollectDetailedDebugInfo()) { - this.debugInfo = new ArrayList<>(); - debugInfo.add("Starting Search"); - } - } - - private static FacetLabelCollector buildFacetCollector( - SearchRequestInfo request, - ImmutableSchemaInterface schema) { - if (CollectionUtils.isEmpty(request.getFacetFieldNames())) { - return null; - } - - // Get all facet field ids requested. - Set requiredFields = Sets.newHashSet(); - for (String fieldName : request.getFacetFieldNames()) { - Schema.FieldInfo field = schema.getFacetFieldByFacetName(fieldName); - if (field != null) { - requiredFields.add(field.getFieldType().getFacetName()); - } - } - - if (requiredFields.size() > 0) { - return new FacetLabelCollector(requiredFields); - } else { - return null; - } - } - - /** - * Subclasses should implement the following methods. - */ - - // Subclasses should process collected hits and construct a final - // AbstractSearchResults object. - protected abstract S doGetResults() throws IOException; - - // Subclasses can override this method to add more collection logic. - protected abstract void doCollect(long tweetID) throws IOException; - - public final ImmutableSchemaInterface getSchema() { - return schema; - } - - // Updates the hit count array - each result only increments the first qualifying bucket. 
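To make the bucketing described in that comment concrete, here is a minimal sketch of the same first-qualifying-bucket logic with made-up threshold values (they are hypothetical, not taken from any real request); getHitCountMap() further below then turns the per-bucket counts into cumulative totals:

    // Sketch only; mirrors the updateHitCounts()/getHitCountMap() behavior with hypothetical numbers.
    long[] hitCountsThresholdsMsec = {60_000L, 3_600_000L}; // hypothetical hitCountBuckets from the request
    int[] hitCounts = new int[hitCountsThresholdsMsec.length];
    long delta = 30_000L; // this hit is 30 seconds older than the query time
    for (int i = 0; i < hitCountsThresholdsMsec.length; i++) {
      if (delta >= 0 && delta < hitCountsThresholdsMsec[i]) {
        hitCounts[i]++; // counted only in the narrowest bucket it fits, then stop
        break;
      }
    }
    // Aggregation later reports cumulative totals:
    // {60000 -> hitCounts[0], 3600000 -> hitCounts[0] + hitCounts[1]}
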
- protected final void updateHitCounts(long statusId) { - if (hitCounts == null) { - return; - } - - long delta = queryTime - SnowflakeIdParser.getTimestampFromTweetId(statusId); - for (int i = 0; i < hitCountsThresholdsMsec.size(); ++i) { - if (delta >= 0 && delta < hitCountsThresholdsMsec.get(i)) { - hitCounts[i]++; - // Increments to the rest of the count array are implied, and aggregated later, since the - // array is sorted. - break; - } - } - } - - private boolean searchedStatusIDsAndTimesInitialized() { - return maxSearchedStatusID != Long.MIN_VALUE; - } - - // Updates the first searched status ID when starting to search a new segment. - private void updateFirstSearchedStatusID() { - // Only try to update the min/max searched ids, if this segment/reader actually has documents - // See SEARCH-4535 - int minDocID = currTwitterReader.getSmallestDocID(); - if (currTwitterReader.hasDocs() && minDocID >= 0 && !searchedStatusIDsAndTimesInitialized()) { - final long firstStatusID = tweetIdMapper.getTweetID(minDocID); - final int firstStatusTime = timeMapper.getTime(minDocID); - if (shouldCollectDetailedDebugInfo()) { - debugInfo.add( - "updateFirstSearchedStatusID. minDocId=" + minDocID + ", firstStatusID=" - + firstStatusID + ", firstStatusTime=" + firstStatusTime); - } - updateIDandTimeRanges(firstStatusID, firstStatusTime, IdAndRangeUpdateType.BEGIN_SEGMENT); - } - } - - public final R getSearchRequestInfo() { - return searchRequestInfo; - } - - public final long getMinSearchedStatusID() { - return minSearchedStatusID; - } - - public final long getMaxSearchedStatusID() { - return maxSearchedStatusID; - } - - public final int getMinSearchedTime() { - return minSearchedTime; - } - - public boolean isSetMinSearchedTime() { - return minSearchedTime != Integer.MAX_VALUE; - } - - public final int getMaxSearchedTime() { - return maxSearchedTime; - } - - @Override - public final long getMaxHitsToProcess() { - return maxHitsToProcessOverride; - } - - // Notifies classes that a new index segment is about to be searched. - @Override - public final void setNextReader(LeafReaderContext context) throws IOException { - super.setNextReader(context); - setNextReader(context.reader()); - } - - /** - * Notifies the collector that a new segment is about to be searched. - * - * It's easier to use this method from tests, because LeafReader is not a final class, so it can - * be mocked (unlike LeafReaderContext). 
- */ - @VisibleForTesting - public final void setNextReader(LeafReader reader) throws IOException { - if (!(reader instanceof EarlybirdIndexSegmentAtomicReader)) { - throw new RuntimeException("IndexReader type not supported: " + reader.getClass()); - } - - currTwitterReader = (EarlybirdIndexSegmentAtomicReader) reader; - documentFeatures = new EarlybirdDocumentFeatures(currTwitterReader); - tweetIdMapper = (TweetIDMapper) currTwitterReader.getSegmentData().getDocIDToTweetIDMapper(); - timeMapper = currTwitterReader.getSegmentData().getTimeMapper(); - currTimeSliceID = currTwitterReader.getSegmentData().getTimeSliceID(); - updateFirstSearchedStatusID(); - if (shouldCollectDetailedDebugInfo()) { - debugInfo.add("Starting search in segment with timeslice ID: " + currTimeSliceID); - } - - segmentStartTime = getClock().nowMillis(); - startSegment(); - } - - protected abstract void startSegment() throws IOException; - - @Override - protected final void doCollect() throws IOException { - documentFeatures.advance(curDocId); - long tweetID = tweetIdMapper.getTweetID(curDocId); - updateIDandTimeRanges(tweetID, timeMapper.getTime(curDocId), IdAndRangeUpdateType.HIT); - doCollect(tweetID); - numHitsCollectedPerSegment++; - } - - protected void collectFeatures(ThriftSearchResultMetadata metadata) throws IOException { - if (featuresRequested) { - ensureExtraMetadataIsSet(metadata); - - metadata.getExtraMetadata().setDirectedAtUserId( - documentFeatures.getFeatureValue(EarlybirdFieldConstant.DIRECTED_AT_USER_ID_CSF)); - metadata.getExtraMetadata().setQuotedTweetId( - documentFeatures.getFeatureValue(EarlybirdFieldConstant.QUOTED_TWEET_ID_CSF)); - metadata.getExtraMetadata().setQuotedUserId( - documentFeatures.getFeatureValue(EarlybirdFieldConstant.QUOTED_USER_ID_CSF)); - - int cardLangValue = - (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_LANG_CSF); - ThriftLanguage thriftLanguage = ThriftLanguage.findByValue(cardLangValue); - metadata.getExtraMetadata().setCardLang(thriftLanguage); - - long cardNumericUri = - (long) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_URI_CSF); - if (cardNumericUri > 0) { - metadata.getExtraMetadata().setCardUri(String.format("card://%s", cardNumericUri)); - } - } - } - - protected void collectIsProtected( - ThriftSearchResultMetadata metadata, EarlybirdCluster cluster, UserTable userTable) - throws IOException { - // 'isUserProtected' field is only set for archive cluster because only archive cluster user - // table has IS_PROTECTED_BIT populated. - // Since this bit is checked after UserFlagsExcludeFilter checked this bit, there is a slight - // chance that this bit is updated in-between. When that happens, it is possible that we will - // see a small number of protected Tweets in the response when we meant to exclude them. 
- if (cluster == EarlybirdCluster.FULL_ARCHIVE) { - ensureExtraMetadataIsSet(metadata); - long userId = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF); - boolean isProtected = userTable.isSet(userId, UserTable.IS_PROTECTED_BIT); - metadata.getExtraMetadata().setIsUserProtected(isProtected); - } - } - - protected void collectExclusiveConversationAuthorId(ThriftSearchResultMetadata metadata) - throws IOException { - if (searchRequestInfo.isCollectExclusiveConversationAuthorId()) { - long exclusiveConversationAuthorId = documentFeatures.getFeatureValue( - EarlybirdFieldConstant.EXCLUSIVE_CONVERSATION_AUTHOR_ID_CSF); - if (exclusiveConversationAuthorId != 0L) { - ensureExtraMetadataIsSet(metadata); - metadata.getExtraMetadata().setExclusiveConversationAuthorId(exclusiveConversationAuthorId); - } - } - } - - // It only makes sense to collectFacets for search types that return individual results (recency, - // relevance and top_tweets), which use the AbstractRelevanceCollector and SearchResultsCollector, - // so this method should only be called from these classes. - protected void collectFacets(ThriftSearchResultMetadata metadata) { - if (currTwitterReader == null) { - return; - } - - AbstractFacetCountingArray facetCountingArray = currTwitterReader.getFacetCountingArray(); - EarlybirdIndexSegmentData segmentData = currTwitterReader.getSegmentData(); - - if (facetCountingArray == null || facetCollector == null) { - return; - } - - facetCollector.resetFacetLabelProviders( - segmentData.getFacetLabelProviders(), - segmentData.getFacetIDMap()); - - facetCountingArray.collectForDocId(curDocId, facetCollector); - - List labels = facetCollector.getLabels(); - if (labels.size() > 0) { - metadata.setFacetLabels(labels); - } - } - - protected void ensureExtraMetadataIsSet(ThriftSearchResultMetadata metadata) { - if (!metadata.isSetExtraMetadata()) { - metadata.setExtraMetadata(new ThriftSearchResultExtraMetadata()); - } - } - - @Override - protected final void doFinishSegment(int lastSearchedDocID) { - if (shouldCollectDetailedDebugInfo()) { - long timeSpentSearchingSegmentInMillis = getClock().nowMillis() - segmentStartTime; - debugInfo.add("Finished segment at doc id: " + lastSearchedDocID); - debugInfo.add("Time spent searching " + currTimeSliceID - + ": " + timeSpentSearchingSegmentInMillis + "ms"); - debugInfo.add("Number of hits collected in segment " + currTimeSliceID + ": " - + numHitsCollectedPerSegment); - } - - if (!currTwitterReader.hasDocs()) { - // Due to race between the reader and the indexing thread, a seemingly empty segment that - // does not have document committed in the posting lists, might already have a document - // inserted into the id/time mappers, which we do not want to take into account. - // If there are no documents in the segment, we don't update searched min/max ids to - // anything. - return; - } else if (lastSearchedDocID == DocIdSetIterator.NO_MORE_DOCS) { - // Segment exhausted. 
- if (shouldCollectDetailedDebugInfo()) { - debugInfo.add("Segment exhausted"); - } - updateIDandTimeRanges(tweetIdMapper.getMinTweetID(), timeMapper.getFirstTime(), - IdAndRangeUpdateType.END_SEGMENT); - } else if (lastSearchedDocID >= 0) { - long lastSearchedTweetID = tweetIdMapper.getTweetID(lastSearchedDocID); - int lastSearchTweetTime = timeMapper.getTime(lastSearchedDocID); - if (shouldCollectDetailedDebugInfo()) { - debugInfo.add("lastSearchedDocId=" + lastSearchedDocID); - } - updateIDandTimeRanges(lastSearchedTweetID, lastSearchTweetTime, - IdAndRangeUpdateType.END_SEGMENT); - } - - numHitsCollectedPerSegment = 0; - } - - private void updateIDandTimeRanges(long tweetID, int time, IdAndRangeUpdateType updateType) { - // We need to update minSearchedStatusID/maxSearchedStatusID and - // minSearchedTime/maxSearchedTime independently: SEARCH-6139 - minSearchedStatusID = Math.min(minSearchedStatusID, tweetID); - maxSearchedStatusID = Math.max(maxSearchedStatusID, tweetID); - if (time > 0) { - minSearchedTime = Math.min(minSearchedTime, time); - maxSearchedTime = Math.max(maxSearchedTime, time); - } - if (shouldCollectVerboseDebugInfo()) { - debugInfo.add( - String.format("call to updateIDandTimeRanges(%d, %d, %s)" - + " set minSearchStatusID=%d, maxSearchedStatusID=%d," - + " minSearchedTime=%d, maxSearchedTime=%d)", - tweetID, time, updateType.toString(), - minSearchedStatusID, maxSearchedStatusID, - minSearchedTime, maxSearchedTime)); - } - } - - /** - * This is called when a segment is skipped but we would want to do accounting - * for minSearchDocId as well as numDocsProcessed. - */ - public void skipSegment(EarlybirdSingleSegmentSearcher searcher) throws IOException { - setNextReader(searcher.getTwitterIndexReader().getContext()); - trackCompleteSegment(DocIdSetIterator.NO_MORE_DOCS); - if (shouldCollectDetailedDebugInfo()) { - debugInfo.add("Skipping segment: " + currTimeSliceID); - } - } - - /** - * Returns the results collected by this collector. - */ - public final S getResults() throws IOException { - // In order to make pagination work, if minSearchedStatusID is greater than the asked max_id. - // We force the minSearchedStatusID to be max_id + 1. - IdTimeRanges idTimeRanges = searchRequestInfo.getIdTimeRanges(); - if (idTimeRanges != null) { - Optional maxIDInclusive = idTimeRanges.getMaxIDInclusive(); - if (maxIDInclusive.isPresent() && minSearchedStatusID > maxIDInclusive.get()) { - searcherStats.numCollectorAdjustedMinSearchedStatusID.increment(); - minSearchedStatusID = maxIDInclusive.get() + 1; - } - } - - S results = doGetResults(); - results.setNumHitsProcessed((int) getNumHitsProcessed()); - results.setNumSearchedSegments(getNumSearchedSegments()); - if (searchedStatusIDsAndTimesInitialized()) { - results.setMaxSearchedStatusID(maxSearchedStatusID); - results.setMinSearchedStatusID(minSearchedStatusID); - results.setMaxSearchedTime(maxSearchedTime); - results.setMinSearchedTime(minSearchedTime); - } - results.setEarlyTerminated(getEarlyTerminationState().isTerminated()); - if (getEarlyTerminationState().isTerminated()) { - results.setEarlyTerminationReason(getEarlyTerminationState().getTerminationReason()); - } - Map counts = getHitCountMap(); - if (counts != null) { - results.hitCounts.putAll(counts); - } - return results; - } - - /** - * Returns a map of timestamps (specified in the query) to the number of hits that are more recent - * that the respective timestamps. 
- */ - public final Map getHitCountMap() { - int total = 0; - if (hitCounts == null) { - return null; - } - Map map = Maps.newHashMap(); - // since the array is incremental, need to aggregate here. - for (int i = 0; i < hitCounts.length; ++i) { - map.put(hitCountsThresholdsMsec.get(i), total += hitCounts[i]); - } - return map; - } - - /** - * Common helper for collecting per-field hit attribution data (if it's available). - * - * @param metadata the metadata to fill for this hit. - */ - protected final void fillHitAttributionMetadata(ThriftSearchResultMetadata metadata) { - if (searchRequestInfo.getHitAttributeHelper() == null) { - return; - } - - Map> hitAttributeMapping = - searchRequestInfo.getHitAttributeHelper().getHitAttribution(curDocId); - Preconditions.checkNotNull(hitAttributeMapping); - - FieldHitAttribution fieldHitAttribution = new FieldHitAttribution(); - for (Map.Entry> entry : hitAttributeMapping.entrySet()) { - FieldHitList fieldHitList = new FieldHitList(); - fieldHitList.setHitFields(entry.getValue()); - - fieldHitAttribution.putToHitMap(entry.getKey(), fieldHitList); - } - metadata.setFieldHitAttribution(fieldHitAttribution); - } - - /** - * Fill the geo location of the given document in metadata, if we have the lat/lon for it. - * For queries that specify a geolocation, this will also have the distance from - * the location specified in the query, and the location of this document. - */ - protected final void fillResultGeoLocation(ThriftSearchResultMetadata metadata) - throws IOException { - Preconditions.checkNotNull(metadata); - if (currTwitterReader != null && fillInLatLonForHits) { - // See if we can have a lat/lon for this doc. - if (resultGeoCoordinate == null) { - resultGeoCoordinate = new GeoCoordinate(); - } - // Only fill if necessary - if (searchRequestInfo.isCollectResultLocation() - && GeoUtil.decodeLatLonFromInt64( - documentFeatures.getFeatureValue(EarlybirdFieldConstant.LAT_LON_CSF_FIELD), - resultGeoCoordinate)) { - ThriftSearchResultGeoLocation resultLocation = new ThriftSearchResultGeoLocation(); - resultLocation.setLatitude(resultGeoCoordinate.getLatitude()); - resultLocation.setLongitude(resultGeoCoordinate.getLongitude()); - metadata.setResultLocation(resultLocation); - } - } - } - - @Override - public ScoreMode scoreMode() { - return ScoreMode.COMPLETE; - } - - private int terminationDocID = -1; - - @Override - protected void collectedEnoughResults() throws IOException { - // We find 'terminationDocID' once we collect enough results, so that we know the point at which - // we can stop searching. We must do this because with the unordered doc ID mapper, tweets - // are not ordered within a millisecond, so we must search the entire millisecond bucket before - // terminating the search, otherwise we could skip over tweets and have an incorrect - // minSearchedStatusID. - if (curDocId != -1 && terminationDocID == -1) { - long tweetId = tweetIdMapper.getTweetID(curDocId); - // We want to find the highest possible doc ID for this tweetId, so pass true. - boolean findMaxDocID = true; - terminationDocID = tweetIdMapper.findDocIdBound(tweetId, - findMaxDocID, - curDocId, - curDocId); - } - } - - @Override - protected boolean shouldTerminate() { - return curDocId >= terminationDocID; - } - - @Override - public List getDebugInfo() { - return debugInfo; - } - - protected boolean shouldCollectDetailedDebugInfo() { - return requestDebugMode >= 5; - } - - // Use this for per-result debug info. 
Useful for queries with no results - // or a very small number of results. - protected boolean shouldCollectVerboseDebugInfo() { - return requestDebugMode >= 6; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.docx b/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.docx new file mode 100644 index 000000000..dbf518016 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.java b/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.java deleted file mode 100644 index fbb95443c..000000000 --- a/src/java/com/twitter/search/earlybird/search/AntiGamingFilter.java +++ /dev/null @@ -1,228 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.Comparator; -import java.util.HashSet; -import java.util.Set; -import java.util.SortedSet; -import java.util.TreeSet; - -import com.google.common.annotations.VisibleForTesting; - -import org.apache.commons.lang.mutable.MutableInt; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreMode; - -import com.twitter.common_internal.collections.RandomAccessPriorityQueue; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.TwitterIndexSearcher; -import com.twitter.search.common.util.analysis.LongTermAttributeImpl; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -public class AntiGamingFilter { - private interface Acceptor { - boolean accept(int internalDocID) throws IOException; - } - - private NumericDocValues userReputation; - private NumericDocValues fromUserIDs; - - private final Query luceneQuery; - - private boolean termsExtracted = false; - private final Set queryTerms; - - // we ignore these user ids for anti-gaming filtering, because they were explicitly queried for - private Set segmentUserIDWhitelist = null; - // we gather the whitelisted userIDs from all segments here - private Set globalUserIDWhitelist = null; - - /** - * Used to track the number of occurrences of a particular user. - */ - private static final class UserCount - implements RandomAccessPriorityQueue.SignatureProvider { - private long userID; - private int count; - - @Override - public Long getSignature() { - return userID; - } - - @Override - public void clear() { - userID = 0; - count = 0; - } - } - - private static final Comparator USER_COUNT_COMPARATOR = - (d1, d2) -> d1.count == d2.count ? Long.compare(d1.userID, d2.userID) : d1.count - d2.count; - - private final RandomAccessPriorityQueue priorityQueue = - new RandomAccessPriorityQueue(1024, USER_COUNT_COMPARATOR) { - @Override - protected UserCount getSentinelObject() { - return new UserCount(); - } - }; - - private final Acceptor acceptor; - private final int maxHitsPerUser; - - /** - * Creates an AntiGamingFilter that either accepts or rejects tweets from all users. - * This method should only be called in tests. - * - * @param alwaysValue Determines if tweets should always be accepted or rejected. - * @return An AntiGamingFilter that either accepts or rejects tweets from all users. 
- */ - @VisibleForTesting - public static AntiGamingFilter newMock(boolean alwaysValue) { - return new AntiGamingFilter(alwaysValue) { - @Override - public void startSegment(EarlybirdIndexSegmentAtomicReader reader) { - } - }; - } - - private AntiGamingFilter(boolean alwaysValue) { - acceptor = internalDocID -> alwaysValue; - maxHitsPerUser = Integer.MAX_VALUE; - termsExtracted = true; - luceneQuery = null; - queryTerms = null; - } - - public AntiGamingFilter(int maxHitsPerUser, int maxTweepCred, Query luceneQuery) { - this.maxHitsPerUser = maxHitsPerUser; - this.luceneQuery = luceneQuery; - - if (maxTweepCred != -1) { - this.acceptor = internalDocID -> { - long userReputationVal = - userReputation.advanceExact(internalDocID) ? userReputation.longValue() : 0L; - return ((byte) userReputationVal > maxTweepCred) || acceptUser(internalDocID); - }; - } else { - this.acceptor = this::acceptUser; - } - - this.queryTerms = new HashSet<>(); - } - - public Set getUserIDWhitelist() { - return globalUserIDWhitelist; - } - - private boolean acceptUser(int internalDocID) throws IOException { - final long fromUserID = getUserId(internalDocID); - final MutableInt freq = new MutableInt(); - // try to increment UserCount for an user already exist in the priority queue. - boolean incremented = priorityQueue.incrementElement( - fromUserID, element -> freq.setValue(++element.count)); - - // If not incremented, it means the user node does not exist in the priority queue yet. - if (!incremented) { - priorityQueue.updateTop(element -> { - element.userID = fromUserID; - element.count = 1; - freq.setValue(element.count); - }); - } - - if (freq.intValue() <= maxHitsPerUser) { - return true; - } else if (segmentUserIDWhitelist == null) { - return false; - } - return segmentUserIDWhitelist.contains(fromUserID); - } - - /** - * Initializes this filter with the new feature source. This method should be called every time an - * earlybird searcher starts searching in a new segment. - * - * @param reader The reader for the new segment. - */ - public void startSegment(EarlybirdIndexSegmentAtomicReader reader) throws IOException { - if (!termsExtracted) { - extractTerms(reader); - } - - fromUserIDs = - reader.getNumericDocValues(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName()); - - // fill the id whitelist for the current segment. initialize lazily. - segmentUserIDWhitelist = null; - - SortedSet sortedFromUserDocIds = new TreeSet<>(); - for (Term t : queryTerms) { - if (t.field().equals(EarlybirdFieldConstant.FROM_USER_ID_FIELD.getFieldName())) { - // Add the operand of the from_user_id operator to the whitelist - long fromUserID = LongTermAttributeImpl.copyBytesRefToLong(t.bytes()); - addUserToWhitelists(fromUserID); - } else if (t.field().equals(EarlybirdFieldConstant.FROM_USER_FIELD.getFieldName())) { - // For a [from X] filter, we need to find a document that has the from_user field set to X, - // and then we need to get the value of the from_user_id field for that document and add it - // to the whitelist. We can get the from_user_id value from the fromUserIDs NumericDocValues - // instance, but we need to traverse it in increasing order of doc IDs. So we add a doc ID - // for each term to a sorted set for now, and then we traverse it in increasing doc ID order - // and add the from_user_id values for those docs to the whitelist. 
- int firstInternalDocID = reader.getNewestDocID(t); - if (firstInternalDocID != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - sortedFromUserDocIds.add(firstInternalDocID); - } - } - } - - for (int fromUserDocId : sortedFromUserDocIds) { - addUserToWhitelists(getUserId(fromUserDocId)); - } - - userReputation = - reader.getNumericDocValues(EarlybirdFieldConstant.USER_REPUTATION.getFieldName()); - - // Reset the fromUserIDs NumericDocValues so that the acceptor can use it to iterate over docs. - fromUserIDs = - reader.getNumericDocValues(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName()); - } - - private void extractTerms(IndexReader reader) throws IOException { - Query query = luceneQuery; - for (Query rewrittenQuery = query.rewrite(reader); rewrittenQuery != query; - rewrittenQuery = query.rewrite(reader)) { - query = rewrittenQuery; - } - - // Create a new TwitterIndexSearcher instance here instead of an IndexSearcher instance, to use - // the TwitterIndexSearcher.collectionStatistics() implementation. - query.createWeight(new TwitterIndexSearcher(reader), ScoreMode.COMPLETE, 1.0f) - .extractTerms(queryTerms); - termsExtracted = true; - } - - public boolean accept(int internalDocID) throws IOException { - return acceptor.accept(internalDocID); - } - - private void addUserToWhitelists(long userID) { - if (this.segmentUserIDWhitelist == null) { - this.segmentUserIDWhitelist = new HashSet<>(); - } - if (this.globalUserIDWhitelist == null) { - this.globalUserIDWhitelist = new HashSet<>(); - } - this.segmentUserIDWhitelist.add(userID); - this.globalUserIDWhitelist.add(userID); - } - - @VisibleForTesting - protected long getUserId(int internalDocId) throws IOException { - return fromUserIDs.advanceExact(internalDocId) ? fromUserIDs.longValue() : 0L; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.docx b/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.docx new file mode 100644 index 000000000..6b21db501 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.java b/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.java deleted file mode 100644 index 14f742eac..000000000 --- a/src/java/com/twitter/search/earlybird/search/EarlybirdLuceneSearcher.java +++ /dev/null @@ -1,98 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.Map; - -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.IndexSearcher; - -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.earlybird.EarlybirdSearcher; -import com.twitter.search.earlybird.search.facets.AbstractFacetTermCollector; -import com.twitter.search.earlybird.search.facets.FacetResultsCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsCollector.TermStatisticsSearchResults; -import com.twitter.search.earlybird.search.facets.TermStatisticsRequestInfo; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; - -public abstract class EarlybirdLuceneSearcher extends IndexSearcher { - public EarlybirdLuceneSearcher(IndexReader r) { - super(r); - } - - 
/** - * Fills facet information for all given search results. - * - * @param collector A collector that knows how collect facet information. - * @param searchResults The search results. - */ - public abstract void fillFacetResults( - AbstractFacetTermCollector collector, ThriftSearchResults searchResults) - throws IOException; - - /** - * Fills metadata for all given facet results. - * - * @param facetResults The facet results. - * @param schema The earlybird schema. - * @param debugMode The debug mode for the request that yielded these results. - */ - public abstract void fillFacetResultMetadata( - Map facetResults, - ImmutableSchemaInterface schema, - byte debugMode) throws IOException; - - /** - * Fills metadata for all given term stats results. - * - * @param termStatsResults The term stats results. - * @param schema The earlybird schema. - * @param debugMode The debug mode for the request that yielded these results. - */ - public abstract void fillTermStatsMetadata( - ThriftTermStatisticsResults termStatsResults, - ImmutableSchemaInterface schema, - byte debugMode) throws IOException; - - /** - * Returns the results for the given term stats request. - * - * @param searchRequestInfo Stores the original term stats request and some other useful request - * information. - * @param searcher The searcher that should be used to execute the request. - * @param requestDebugMode The debug mode for this request. - * @return The term stats results for the given request. - */ - public abstract TermStatisticsSearchResults collectTermStatistics( - TermStatisticsRequestInfo searchRequestInfo, - EarlybirdSearcher searcher, - int requestDebugMode) throws IOException; - - /** - * Writes an explanation for the given hits into the given ThriftSearchResults instance. - * - * @param searchRequestInfo Stores the original request and some other useful request context. - * @param hits The hits. - * @param searchResults The ThriftSearchResults where the explanation for the given hits will be - * stored. - */ - // Writes explanations into the searchResults thrift. 
- public abstract void explainSearchResults(SearchRequestInfo searchRequestInfo, - SimpleSearchResults hits, - ThriftSearchResults searchResults) throws IOException; - - public static class FacetSearchResults extends SearchResultsInfo { - private FacetResultsCollector collector; - - public FacetSearchResults(FacetResultsCollector collector) { - this.collector = collector; - } - - public ThriftFacetFieldResults getFacetResults(String facetName, int topK) { - return collector.getFacetResults(facetName, topK); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.docx b/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.docx new file mode 100644 index 000000000..c34672cab Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.java b/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.java deleted file mode 100644 index 61bd21ef1..000000000 --- a/src/java/com/twitter/search/earlybird/search/EarlybirdMultiSegmentSearcher.java +++ /dev/null @@ -1,254 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashSet; -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.base.Preconditions; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.MultiReader; -import org.apache.lucene.index.Term; -import org.apache.lucene.search.Collector; -import org.apache.lucene.search.Explanation; -import org.apache.lucene.search.Query; -import org.apache.lucene.search.Scorer; -import org.apache.lucene.search.ScoreMode; -import org.apache.lucene.search.Weight; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentData; -import com.twitter.search.earlybird.EarlybirdSearcher; -import com.twitter.search.earlybird.index.EarlybirdSingleSegmentSearcher; -import com.twitter.search.earlybird.index.TweetIDMapper; -import com.twitter.search.earlybird.search.facets.AbstractFacetTermCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsCollector; -import com.twitter.search.earlybird.search.facets.TermStatisticsCollector.TermStatisticsSearchResults; -import com.twitter.search.earlybird.search.facets.TermStatisticsRequestInfo; -import com.twitter.search.earlybird.search.queries.SinceMaxIDFilter; -import com.twitter.search.earlybird.search.queries.SinceUntilFilter; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResults; -import com.twitter.search.earlybird.thrift.ThriftTermStatisticsResults; -import com.twitter.search.queryparser.util.IdTimeRanges; - -public class EarlybirdMultiSegmentSearcher extends EarlybirdLuceneSearcher { - private static final Logger LOG = LoggerFactory.getLogger(EarlybirdMultiSegmentSearcher.class); - - private final ImmutableSchemaInterface schema; - private final Map 
segmentSearchers; - protected final int numSegments; - private final Clock clock; - - // This will prevent us from even considering segments that are out of range. - // It's an important optimization for a certain class of queries. - protected IdTimeRanges idTimeRanges = null; - - private final EarlybirdSearcherStats searcherStats; - - public EarlybirdMultiSegmentSearcher( - ImmutableSchemaInterface schema, - List searchers, - EarlybirdSearcherStats searcherStats, - Clock clock) throws IOException { - // NOTE: We pass in an empty MultiReader to super and retain the list of searchers in this - // class since MultiReader does not allow an aggregate of more than Integer.MAX_VALUE docs, - // which some of our larger archive indexes may have. - super(new MultiReader()); - // segmentSearchers are mapped from time slice IDs to searchers so that we can quickly - // find the correct searcher for a given time slice ID (see fillPayload). - // make sure we maintain order of segments, hence a LinkedHashMap instead of just a HashMap - this.segmentSearchers = new LinkedHashMap<>(); - this.schema = schema; - for (EarlybirdSingleSegmentSearcher searcher : searchers) { - if (searcher != null) { - long timeSliceID = searcher.getTimeSliceID(); - this.segmentSearchers.put(timeSliceID, searcher); - } - } - // initializing this after populating the list. previously initialized before, and - // this may have lead to a race condition, although this doesn't seem possible given - // that segments should be an immutable cloned list. - this.numSegments = segmentSearchers.size(); - - this.searcherStats = searcherStats; - this.clock = clock; - } - - public void setIdTimeRanges(IdTimeRanges idTimeRanges) { - this.idTimeRanges = idTimeRanges; - } - - @Override - protected void search(List unusedLeaves, Weight weight, Collector coll) - throws IOException { - Preconditions.checkState(coll instanceof AbstractResultsCollector); - AbstractResultsCollector collector = (AbstractResultsCollector) coll; - - for (EarlybirdSingleSegmentSearcher segmentSearcher : segmentSearchers.values()) { - if (shouldSkipSegment(segmentSearcher)) { - collector.skipSegment(segmentSearcher); - } else { - segmentSearcher.search(weight.getQuery(), collector); - if (collector.isTerminated()) { - break; - } - } - } - } - - @VisibleForTesting - protected boolean shouldSkipSegment(EarlybirdSingleSegmentSearcher segmentSearcher) { - EarlybirdIndexSegmentData segmentData = - segmentSearcher.getTwitterIndexReader().getSegmentData(); - if (idTimeRanges != null) { - if (!SinceMaxIDFilter.sinceMaxIDsInRange( - (TweetIDMapper) segmentData.getDocIDToTweetIDMapper(), - idTimeRanges.getSinceIDExclusive().or(SinceMaxIDFilter.NO_FILTER), - idTimeRanges.getMaxIDInclusive().or(SinceMaxIDFilter.NO_FILTER)) - || !SinceUntilFilter.sinceUntilTimesInRange( - segmentData.getTimeMapper(), - idTimeRanges.getSinceTimeInclusive().or(SinceUntilFilter.NO_FILTER), - idTimeRanges.getUntilTimeExclusive().or(SinceUntilFilter.NO_FILTER))) { - return true; - } - } - return false; - } - - @Override - public void fillFacetResults( - AbstractFacetTermCollector collector, ThriftSearchResults searchResults) throws IOException { - for (EarlybirdSingleSegmentSearcher segmentSearcher : segmentSearchers.values()) { - segmentSearcher.fillFacetResults(collector, searchResults); - } - } - - @Override - public TermStatisticsSearchResults collectTermStatistics( - TermStatisticsRequestInfo searchRequestInfo, - EarlybirdSearcher searcher, - int requestDebugMode) throws IOException { - 
TermStatisticsCollector collector = new TermStatisticsCollector( - schema, searchRequestInfo, searcherStats, clock, requestDebugMode); - search(collector.getSearchRequestInfo().getLuceneQuery(), collector); - searcher.maybeSetCollectorDebugInfo(collector); - return collector.getResults(); - } - - @Override - public void explainSearchResults(SearchRequestInfo searchRequestInfo, - SimpleSearchResults hits, ThriftSearchResults searchResults) throws IOException { - for (EarlybirdSingleSegmentSearcher segmentSearcher : segmentSearchers.values()) { - // the hits that are getting passed into this method are hits across - // all searched segments. We need to get the per segment hits and - // generate explanations one segment at a time. - List hitsForCurrentSegment = new ArrayList<>(); - Set tweetIdsForCurrentSegment = new HashSet<>(); - List hitResultsForCurrentSegment = new ArrayList<>(); - - for (Hit hit : hits.hits) { - if (hit.getTimeSliceID() == segmentSearcher.getTimeSliceID()) { - hitsForCurrentSegment.add(hit); - tweetIdsForCurrentSegment.add(hit.statusID); - } - } - for (ThriftSearchResult result : searchResults.getResults()) { - if (tweetIdsForCurrentSegment.contains(result.id)) { - hitResultsForCurrentSegment.add(result); - } - } - ThriftSearchResults resultsForSegment = new ThriftSearchResults() - .setResults(hitResultsForCurrentSegment); - - SimpleSearchResults finalHits = new SimpleSearchResults(hitsForCurrentSegment); - segmentSearcher.explainSearchResults(searchRequestInfo, finalHits, resultsForSegment); - } - // We should not see hits that are not associated with an active segment - List hitsWithUnknownSegment = - Arrays.stream(hits.hits()).filter(hit -> !hit.isHasExplanation()) - .collect(Collectors.toList()); - for (Hit hit : hitsWithUnknownSegment) { - LOG.error("Unable to find segment associated with hit: " + hit.toString()); - } - } - - @Override - public void fillFacetResultMetadata(Map facetResults, - ImmutableSchemaInterface documentSchema, byte debugMode) - throws IOException { - for (EarlybirdSingleSegmentSearcher segmentSearcher : segmentSearchers.values()) { - segmentSearcher.fillFacetResultMetadata(facetResults, documentSchema, debugMode); - } - } - - @Override - public void fillTermStatsMetadata(ThriftTermStatisticsResults termStatsResults, - ImmutableSchemaInterface documentSchema, byte debugMode) - throws IOException { - for (EarlybirdSingleSegmentSearcher segmentSearcher : segmentSearchers.values()) { - segmentSearcher.fillTermStatsMetadata(termStatsResults, documentSchema, debugMode); - } - } - - /** - * The searchers for individual segments will rewrite the query as they see fit, so the multi - * segment searcher does not need to rewrite it. In fact, not rewriting the query here improves - * the request latency by ~5%. - */ - @Override - public Query rewrite(Query original) { - return original; - } - - /** - * The searchers for individual segments will create their own weights. This method only creates - * a dummy weight to pass the Lucene query to the search() method of these individual segment - * searchers. - */ - @Override - public Weight createWeight(Query query, ScoreMode scoreMode, float boost) { - return new DummyWeight(query); - } - - /** - * Dummy weight used solely to pass Lucene Query around. 
- */ - private static final class DummyWeight extends Weight { - private DummyWeight(Query luceneQuery) { - super(luceneQuery); - } - - @Override - public Explanation explain(LeafReaderContext context, int doc) { - throw new UnsupportedOperationException(); - } - - @Override - public Scorer scorer(LeafReaderContext context) { - throw new UnsupportedOperationException(); - } - - @Override - public void extractTerms(Set terms) { - throw new UnsupportedOperationException(); - } - - @Override - public boolean isCacheable(LeafReaderContext context) { - return true; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.docx b/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.docx new file mode 100644 index 000000000..647cd7c54 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.java b/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.java deleted file mode 100644 index e0fc9f8cb..000000000 --- a/src/java/com/twitter/search/earlybird/search/GeoQuadTreeQueryBuilder.java +++ /dev/null @@ -1,199 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.LinkedHashSet; -import java.util.Set; - -import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.search.Query; -import org.apache.lucene.spatial.prefix.tree.Cell; -import org.apache.lucene.spatial.prefix.tree.CellIterator; -import org.apache.lucene.util.BytesRef; -import org.locationtech.spatial4j.shape.Rectangle; - -import com.twitter.search.common.query.MultiTermDisjunctionQuery; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.GeoQuadTreeQueryBuilderUtil; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.common.util.spatial.BoundingBox; -import com.twitter.search.common.util.spatial.GeoUtil; -import com.twitter.search.common.util.spatial.GeohashChunkImpl; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.search.queries.GeoTwoPhaseQuery; -import com.twitter.search.earlybird.search.queries.GeoTwoPhaseQuery.SecondPhaseDocAccepter; -import com.twitter.search.queryparser.query.QueryParserException; -import com.twitter.search.queryparser.util.GeoCode; - -import geo.google.datamodel.GeoCoordinate; - -/** - * A class that builds queries to query the quadtree. - */ -public final class GeoQuadTreeQueryBuilder { - private GeoQuadTreeQueryBuilder() { - } - - /** - * Returns a GeoTwoPhaseQuery for the given geocode. - */ - public static Query buildGeoQuadTreeQuery(final GeoCode geocode) { - return buildGeoQuadTreeQuery(geocode, null); - } - - /** - * Returns a GeoTwoPhaseQuery for the given geocode. - * - * @param geocode The geocode. - * @param terminationTracker The tracker that determines when the query needs to terminate. - */ - public static Query buildGeoQuadTreeQuery(GeoCode geocode, - TerminationTracker terminationTracker) { - Query geoHashDisjuntiveQuery = GeoQuadTreeQueryBuilderUtil.buildGeoQuadTreeQuery( - geocode, EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName()); - - // 5. Create post filtering accepter - final SecondPhaseDocAccepter accepter = (geocode.distanceKm != GeoCode.DOUBLE_DISTANCE_NOT_SET) - ? 
new CenterRadiusAccepter(geocode.latitude, geocode.longitude, geocode.distanceKm) - : GeoTwoPhaseQuery.ALL_DOCS_ACCEPTER; - - return new GeoTwoPhaseQuery(geoHashDisjuntiveQuery, accepter, terminationTracker); - } - - /** - * Construct a query as below: - * 1. Compute all quadtree cells that intersect the bounding box. - * 2. Create a disjunction of the geohashes of all the intersecting cells. - * 3. Add a filter to only keep points inside the given bounding box. - */ - public static Query buildGeoQuadTreeQuery(final Rectangle boundingBox, - final TerminationTracker terminationTracker) - throws QueryParserException { - // 1. Locate the main quadtree cell---the cell containing the bounding box's center point whose - // diagonal is just longer than the bounding box's diagonal. - final Cell centerCell = GeohashChunkImpl.getGeoNodeByBoundingBox(boundingBox); - - // 2. Determine quadtree level to search. - int treeLevel = -1; - if (centerCell != null) { - treeLevel = centerCell.getLevel(); - } else { - // This should not happen. - throw new QueryParserException( - "Unable to locate quadtree cell containing the given bounding box. " - + "Bounding box is: " + boundingBox); - } - - // 3. Get all quadtree cells at treeLevel that intersect the given bounding box. - CellIterator intersectingCells = - GeohashChunkImpl.getNodesIntersectingBoundingBox(boundingBox, treeLevel); - - // 4. Construct disjunction query - final Set geoHashSet = new LinkedHashSet<>(); - - // Add center node - geoHashSet.add(centerCell.getTokenBytesNoLeaf(new BytesRef())); - // If there are other nodes intersecting the query bounding box, also add them in. - if (intersectingCells != null) { - while (intersectingCells.hasNext()) { - geoHashSet.add(intersectingCells.next().getTokenBytesNoLeaf(new BytesRef())); - } - } - MultiTermDisjunctionQuery geoHashDisjuntiveQuery = new MultiTermDisjunctionQuery( - EarlybirdFieldConstant.GEO_HASH_FIELD.getFieldName(), geoHashSet); - - // 5. Create post filtering accepter - final GeoDocAccepter accepter = new BoundingBoxAccepter(boundingBox); - - return new GeoTwoPhaseQuery(geoHashDisjuntiveQuery, accepter, terminationTracker); - } - - private abstract static class GeoDocAccepter extends SecondPhaseDocAccepter { - private NumericDocValues latLonDocValues; - private final GeoCoordinate geoCoordReuse = new GeoCoordinate(); - - @Override - public void initialize(LeafReaderContext context) throws IOException { - final EarlybirdIndexSegmentAtomicReader reader = - (EarlybirdIndexSegmentAtomicReader) context.reader(); - latLonDocValues = - reader.getNumericDocValues(EarlybirdFieldConstant.LAT_LON_CSF_FIELD.getFieldName()); - } - - // Decides whether a point should be accepted. - protected abstract boolean acceptPoint(double lat, double lon); - - // Decides whether a document should be accepted based on its geo coordinates. - @Override - public final boolean accept(int internalDocId) throws IOException { - // Cannot obtain valid geo coordinates for the document. Not acceptable. - if (latLonDocValues == null - || !latLonDocValues.advanceExact(internalDocId) - || !GeoUtil.decodeLatLonFromInt64(latLonDocValues.longValue(), geoCoordReuse)) { - return false; - } - - return acceptPoint(geoCoordReuse.getLatitude(), geoCoordReuse.getLongitude()); - } - } - - // Accepts points within a circle defined by a center point and a radius. 
- private static final class CenterRadiusAccepter extends GeoDocAccepter { - private final double centerLat; - private final double centerLon; - private final double radiusKm; - - public CenterRadiusAccepter(double centerLat, double centerLon, double radiusKm) { - this.centerLat = centerLat; - this.centerLon = centerLon; - this.radiusKm = radiusKm; - } - - @Override - protected boolean acceptPoint(double lat, double lon) { - double actualDistance = - BoundingBox.approxDistanceC(centerLat, centerLon, lat, lon); - if (actualDistance < radiusKm) { - return true; - } else if (Double.isNaN(actualDistance)) { - // There seems to be a rare bug in GeoUtils that computes NaN - // for two identical lat/lon pairs on occasion. Check for that here. - if (lat == centerLat && lon == centerLon) { - return true; - } - } - - return false; - } - - @Override - public String toString() { - return String.format("CenterRadiusAccepter(Center: %.4f, %.4f Radius (km): %.4f)", - centerLat, centerLon, radiusKm); - } - } - - // Accepts points within a BoundingBox - private static final class BoundingBoxAccepter extends GeoDocAccepter { - private final Rectangle boundingBox; - - public BoundingBoxAccepter(Rectangle boundingBox) { - this.boundingBox = boundingBox; - } - - @Override - protected boolean acceptPoint(double lat, double lon) { - return GeohashChunkImpl.isPointInBoundingBox(lat, lon, boundingBox); - - } - - @Override - public String toString() { - return String.format("PointInBoundingBoxAccepter((%.4f, %.4f), (%.4f, %.4f), " - + "crossesDateLine=%b)", - boundingBox.getMinY(), boundingBox.getMinX(), - boundingBox.getMaxY(), boundingBox.getMaxX(), - boundingBox.getCrossesDateLine()); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/Hit.docx b/src/java/com/twitter/search/earlybird/search/Hit.docx new file mode 100644 index 000000000..da5989113 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/Hit.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/Hit.java b/src/java/com/twitter/search/earlybird/search/Hit.java deleted file mode 100644 index c8abb5043..000000000 --- a/src/java/com/twitter/search/earlybird/search/Hit.java +++ /dev/null @@ -1,59 +0,0 @@ -package com.twitter.search.earlybird.search; - -import javax.annotation.Nullable; - -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; - -/** - * Class that abstracts a document that matches a query we're processing in Earlybird. - */ -public class Hit implements Comparable { - protected long timeSliceID; - protected long statusID; - private boolean hasExplanation; - - @Nullable - protected ThriftSearchResultMetadata metadata; - - public Hit(long timeSliceID, long statusID) { - this.timeSliceID = timeSliceID; - this.statusID = statusID; - this.metadata = null; - } - - public long getTimeSliceID() { - return timeSliceID; - } - - public long getStatusID() { - return statusID; - } - - @Nullable - public ThriftSearchResultMetadata getMetadata() { - return metadata; - } - - public void setMetadata(ThriftSearchResultMetadata metadata) { - this.metadata = metadata; - } - - @Override - public int compareTo(Hit other) { - return -Long.compare(this.statusID, other.statusID); - } - - @Override - public String toString() { - return "Hit[tweetID=" + statusID + ",timeSliceID=" + timeSliceID - + ",score=" + (metadata == null ? 
"null" : metadata.getScore()) + "]"; - } - - public boolean isHasExplanation() { - return hasExplanation; - } - - public void setHasExplanation(boolean hasExplanation) { - this.hasExplanation = hasExplanation; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.docx b/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.docx new file mode 100644 index 000000000..789f5cb15 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.java b/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.java deleted file mode 100644 index 51fdf7935..000000000 --- a/src/java/com/twitter/search/earlybird/search/SearchRequestInfo.java +++ /dev/null @@ -1,180 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.util.List; -import javax.annotation.Nullable; - -import com.google.common.base.Preconditions; - -import org.apache.lucene.search.Query; - -import com.twitter.search.common.metrics.SearchCounter; -import com.twitter.search.common.query.HitAttributeHelper; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.earlybird.QualityFactor; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; -import com.twitter.search.queryparser.util.IdTimeRanges; - -public class SearchRequestInfo { - private final ThriftSearchQuery searchQuery; - private final Query luceneQuery; - private final boolean collectConversationId; - private final boolean collectResultLocation; - private final boolean getInReplyToStatusId; - private final boolean getReferenceAuthorId; - private final boolean getFromUserId; - private final boolean collectExclusiveConversationAuthorId; - - private final int numResultsRequested; - private final int maxHitsToProcess; - private final List facetFieldNames; - private long timestamp; - - private final TerminationTracker terminationTracker; - - protected final QualityFactor qualityFactor; - - // Set if we want to collect per-field hit attributes for this request. 
- @Nullable - private HitAttributeHelper hitAttributeHelper; - - private IdTimeRanges idTimeRanges; - - private static final int DEFAULT_MAX_HITS = 1000; - - private static final SearchCounter RESET_MAX_HITS_TO_PROCESS_COUNTER = - SearchCounter.export("search_request_info_reset_max_hits_to_process"); - - public SearchRequestInfo( - ThriftSearchQuery searchQuery, - Query luceneQuery, - TerminationTracker terminationTracker) { - this(searchQuery, luceneQuery, terminationTracker, null); - } - - public SearchRequestInfo( - ThriftSearchQuery searchQuery, - Query luceneQuery, - TerminationTracker terminationTracker, - QualityFactor qualityFactor) { - Preconditions.checkNotNull(searchQuery.getCollectorParams()); - Preconditions.checkNotNull(terminationTracker); - - this.searchQuery = searchQuery; - this.luceneQuery = luceneQuery; - this.collectConversationId = searchQuery.isCollectConversationId(); - if (searchQuery.isSetResultMetadataOptions()) { - this.collectResultLocation = searchQuery.getResultMetadataOptions().isGetResultLocation(); - this.getInReplyToStatusId = searchQuery.getResultMetadataOptions().isGetInReplyToStatusId(); - this.getReferenceAuthorId = - searchQuery.getResultMetadataOptions().isGetReferencedTweetAuthorId(); - this.getFromUserId = searchQuery.getResultMetadataOptions().isGetFromUserId(); - this.collectExclusiveConversationAuthorId = - searchQuery.getResultMetadataOptions().isGetExclusiveConversationAuthorId(); - } else { - this.collectResultLocation = false; - this.getInReplyToStatusId = false; - this.getReferenceAuthorId = false; - this.getFromUserId = false; - this.collectExclusiveConversationAuthorId = false; - } - - this.qualityFactor = qualityFactor; - - this.numResultsRequested = searchQuery.getCollectorParams().getNumResultsToReturn(); - this.maxHitsToProcess = calculateMaxHitsToProcess(searchQuery); - this.terminationTracker = terminationTracker; - this.facetFieldNames = searchQuery.getFacetFieldNames(); - } - - /** - * Gets the value to be used as max hits to process for this query. The base class gets it from - * the searchQuery directly, and uses a default if that's not set. - * - * Subclasses can override this to compute a different value for max hits to process. - */ - protected int calculateMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) { - int maxHits = thriftSearchQuery.getCollectorParams().isSetTerminationParams() - ? 
thriftSearchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess() : 0; - - if (maxHits <= 0) { - maxHits = DEFAULT_MAX_HITS; - RESET_MAX_HITS_TO_PROCESS_COUNTER.increment(); - } - return maxHits; - } - - public final ThriftSearchQuery getSearchQuery() { - return this.searchQuery; - } - - public Query getLuceneQuery() { - return luceneQuery; - } - - public final int getNumResultsRequested() { - return numResultsRequested; - } - - public final int getMaxHitsToProcess() { - return maxHitsToProcess; - } - - public boolean isCollectConversationId() { - return collectConversationId; - } - - public boolean isCollectResultLocation() { - return collectResultLocation; - } - - public boolean isGetInReplyToStatusId() { - return getInReplyToStatusId; - } - - public boolean isGetReferenceAuthorId() { - return getReferenceAuthorId; - } - - public boolean isCollectExclusiveConversationAuthorId() { - return collectExclusiveConversationAuthorId; - } - - public final IdTimeRanges getIdTimeRanges() { - return idTimeRanges; - } - - public SearchRequestInfo setIdTimeRanges(IdTimeRanges newIdTimeRanges) { - this.idTimeRanges = newIdTimeRanges; - return this; - } - - public SearchRequestInfo setTimestamp(long newTimestamp) { - this.timestamp = newTimestamp; - return this; - } - - public long getTimestamp() { - return timestamp; - } - - public TerminationTracker getTerminationTracker() { - return this.terminationTracker; - } - - @Nullable - public HitAttributeHelper getHitAttributeHelper() { - return hitAttributeHelper; - } - - public void setHitAttributeHelper(@Nullable HitAttributeHelper hitAttributeHelper) { - this.hitAttributeHelper = hitAttributeHelper; - } - - public List getFacetFieldNames() { - return facetFieldNames; - } - - public boolean isGetFromUserId() { - return getFromUserId; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.docx b/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.docx new file mode 100644 index 000000000..49564c5af Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.java b/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.java deleted file mode 100644 index fff5d1f0f..000000000 --- a/src/java/com/twitter/search/earlybird/search/SearchResultsCollector.java +++ /dev/null @@ -1,188 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.features.thrift.ThriftSearchResultFeatures; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.search.EarlyTerminationState; -import com.twitter.search.common.util.LongIntConverter; -import com.twitter.search.earlybird.common.config.EarlybirdConfig; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; -import 
com.twitter.search.earlybird.thrift.ThriftSearchResultMetadataOptions; -import com.twitter.search.earlybird.thrift.ThriftSearchResultType; - -/** - * This class collects results for Recency queries for delegation to collectors based on query mode - */ -public class SearchResultsCollector - extends AbstractResultsCollector { - private static final EarlyTerminationState TERMINATED_COLLECTED_ENOUGH_RESULTS = - new EarlyTerminationState("terminated_collected_enough_results", true); - - protected final List results; - private final Set requestedFeatureIds; - private final EarlybirdCluster cluster; - private final UserTable userTable; - - public SearchResultsCollector( - ImmutableSchemaInterface schema, - SearchRequestInfo searchRequestInfo, - Clock clock, - EarlybirdSearcherStats searcherStats, - EarlybirdCluster cluster, - UserTable userTable, - int requestDebugMode) { - super(schema, searchRequestInfo, clock, searcherStats, requestDebugMode); - results = new ArrayList<>(); - this.cluster = cluster; - this.userTable = userTable; - - ThriftSearchResultMetadataOptions options = - searchRequestInfo.getSearchQuery().getResultMetadataOptions(); - if (options != null && options.isReturnSearchResultFeatures()) { - requestedFeatureIds = schema.getSearchFeatureSchema().getEntries().keySet(); - } else if (options != null && options.isSetRequestedFeatureIDs()) { - requestedFeatureIds = new HashSet<>(options.getRequestedFeatureIDs()); - } else { - requestedFeatureIds = null; - } - } - - @Override - public void startSegment() throws IOException { - featuresRequested = requestedFeatureIds != null; - } - - @Override - public void doCollect(long tweetID) throws IOException { - Hit hit = new Hit(currTimeSliceID, tweetID); - ThriftSearchResultMetadata metadata = - new ThriftSearchResultMetadata(ThriftSearchResultType.RECENCY) - .setPenguinVersion(EarlybirdConfig.getPenguinVersionByte()); - - // Set tweet language in metadata - ThriftLanguage thriftLanguage = ThriftLanguage.findByValue( - (int) documentFeatures.getFeatureValue(EarlybirdFieldConstant.LANGUAGE)); - metadata.setLanguage(thriftLanguage); - - // Check and collect hit attribution data, if it's available. 
- fillHitAttributionMetadata(metadata); - - // Set the nullcast flag in metadata - metadata.setIsNullcast(documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_NULLCAST_FLAG)); - - if (searchRequestInfo.isCollectConversationId()) { - long conversationId = - documentFeatures.getFeatureValue(EarlybirdFieldConstant.CONVERSATION_ID_CSF); - if (conversationId != 0) { - ensureExtraMetadataIsSet(metadata); - metadata.getExtraMetadata().setConversationId(conversationId); - } - } - - fillResultGeoLocation(metadata); - collectRetweetAndReplyMetadata(metadata); - - long fromUserId = documentFeatures.getFeatureValue(EarlybirdFieldConstant.FROM_USER_ID_CSF); - if (requestedFeatureIds != null) { - ThriftSearchResultFeatures features = documentFeatures.getSearchResultFeatures( - getSchema(), requestedFeatureIds::contains); - ensureExtraMetadataIsSet(metadata); - metadata.getExtraMetadata().setFeatures(features); - metadata.setFromUserId(fromUserId); - if (documentFeatures.isFlagSet(EarlybirdFieldConstant.HAS_CARD_FLAG)) { - metadata.setCardType( - (byte) documentFeatures.getFeatureValue(EarlybirdFieldConstant.CARD_TYPE_CSF_FIELD)); - } - } - if (searchRequestInfo.isGetFromUserId()) { - metadata.setFromUserId(fromUserId); - } - - collectExclusiveConversationAuthorId(metadata); - collectFacets(metadata); - collectFeatures(metadata); - collectIsProtected(metadata, cluster, userTable); - hit.setMetadata(metadata); - results.add(hit); - updateHitCounts(tweetID); - } - - private final void collectRetweetAndReplyMetadata(ThriftSearchResultMetadata metadata) - throws IOException { - if (searchRequestInfo.isGetInReplyToStatusId() || searchRequestInfo.isGetReferenceAuthorId()) { - boolean isRetweet = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_RETWEET_FLAG); - boolean isReply = documentFeatures.isFlagSet(EarlybirdFieldConstant.IS_REPLY_FLAG); - // Set the isRetweet and isReply metadata so that clients who request retweet and reply - // metadata know whether a result is a retweet or reply or neither. - metadata.setIsRetweet(isRetweet); - metadata.setIsReply(isReply); - - // Only store the shared status id if the hit is a reply or a retweet and - // the getInReplyToStatusId flag is set. - if (searchRequestInfo.isGetInReplyToStatusId() && (isReply || isRetweet)) { - long sharedStatusID = - documentFeatures.getFeatureValue(EarlybirdFieldConstant.SHARED_STATUS_ID_CSF); - if (sharedStatusID != 0) { - metadata.setSharedStatusId(sharedStatusID); - } - } - - // Only store the reference tweet author ID if the hit is a reply or a retweet and the - // getReferenceAuthorId flag is set. - if (searchRequestInfo.isGetReferenceAuthorId() && (isReply || isRetweet)) { - // the REFERENCE_AUTHOR_ID_CSF stores the source tweet author id for all retweets - long referenceAuthorId = - documentFeatures.getFeatureValue(EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_CSF); - if (referenceAuthorId != 0) { - metadata.setReferencedTweetAuthorId(referenceAuthorId); - } else if (cluster != EarlybirdCluster.FULL_ARCHIVE) { - // we also store the reference author id for retweets, directed at tweets, and self - // threaded tweets separately on Realtime/Protected Earlybirds. This data will be moved to - // the REFERENCE_AUTHOR_ID_CSF and these fields will be deprecated in SEARCH-34958. 
- referenceAuthorId = LongIntConverter.convertTwoIntToOneLong( - (int) documentFeatures.getFeatureValue( - EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_MOST_SIGNIFICANT_INT), - (int) documentFeatures.getFeatureValue( - EarlybirdFieldConstant.REFERENCE_AUTHOR_ID_LEAST_SIGNIFICANT_INT)); - if (referenceAuthorId > 0) { - metadata.setReferencedTweetAuthorId(referenceAuthorId); - } - } - } - } - } - - /** - * This differs from base class because we check against num results collected instead of - * num hits collected. - */ - @Override - public EarlyTerminationState innerShouldCollectMore() throws IOException { - if (results.size() >= searchRequestInfo.getNumResultsRequested()) { - collectedEnoughResults(); - if (shouldTerminate()) { - return setEarlyTerminationState(TERMINATED_COLLECTED_ENOUGH_RESULTS); - } - } - return EarlyTerminationState.COLLECTING; - } - - @Override - public SimpleSearchResults doGetResults() { - // Sort hits by tweet id. - Collections.sort(results); - return new SimpleSearchResults(results); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.docx b/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.docx new file mode 100644 index 000000000..2bac59e05 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.java b/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.java deleted file mode 100644 index ff19de98d..000000000 --- a/src/java/com/twitter/search/earlybird/search/SearchResultsInfo.java +++ /dev/null @@ -1,99 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.util.Map; - -import com.google.common.collect.Maps; - -import com.twitter.search.earlybird.search.queries.SinceMaxIDFilter; - -public class SearchResultsInfo { - public static final long NO_ID = SinceMaxIDFilter.NO_FILTER; - public static final int NO_TIME = -1; - - private int numHitsProcessed = 0; - private int numSearchedSegments = 0; - - private boolean earlyTerminated = false; - private String earlyTerminationReason = null; - - private long maxSearchedStatusID = NO_ID; - private long minSearchedStatusID = NO_ID; - - private int maxSearchedTime = NO_TIME; - private int minSearchedTime = NO_TIME; - - // Map from time thresholds (in milliseconds) to number of results more recent than this period. 
- protected final Map hitCounts = Maps.newHashMap(); - - public final int getNumHitsProcessed() { - return numHitsProcessed; - } - - public final void setNumHitsProcessed(int numHitsProcessed) { - this.numHitsProcessed = numHitsProcessed; - } - - public final int getNumSearchedSegments() { - return numSearchedSegments; - } - - public final void setNumSearchedSegments(int numSearchedSegments) { - this.numSearchedSegments = numSearchedSegments; - } - - public final long getMaxSearchedStatusID() { - return maxSearchedStatusID; - } - - public final long getMinSearchedStatusID() { - return minSearchedStatusID; - } - - public final int getMaxSearchedTime() { - return maxSearchedTime; - } - - public final int getMinSearchedTime() { - return minSearchedTime; - } - - public boolean isSetSearchedStatusIDs() { - return maxSearchedStatusID != NO_ID && minSearchedStatusID != NO_ID; - } - - public boolean isSetSearchedTimes() { - return maxSearchedTime != NO_TIME && minSearchedTime != NO_TIME; - } - - public void setMaxSearchedStatusID(long maxSearchedStatusID) { - this.maxSearchedStatusID = maxSearchedStatusID; - } - - public void setMinSearchedStatusID(long minSearchedStatusID) { - this.minSearchedStatusID = minSearchedStatusID; - } - - public void setMaxSearchedTime(int maxSearchedTime) { - this.maxSearchedTime = maxSearchedTime; - } - - public void setMinSearchedTime(int minSearchedTime) { - this.minSearchedTime = minSearchedTime; - } - - public void setEarlyTerminated(boolean earlyTerminated) { - this.earlyTerminated = earlyTerminated; - } - - public boolean isEarlyTerminated() { - return earlyTerminated; - } - - public String getEarlyTerminationReason() { - return earlyTerminationReason; - } - - public void setEarlyTerminationReason(String earlyTerminationReason) { - this.earlyTerminationReason = earlyTerminationReason; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.docx b/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.docx new file mode 100644 index 000000000..1ca3d1d5a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.java b/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.java deleted file mode 100644 index e3e0894fd..000000000 --- a/src/java/com/twitter/search/earlybird/search/SimpleSearchResults.java +++ /dev/null @@ -1,35 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.util.List; - -public class SimpleSearchResults extends SearchResultsInfo { - protected Hit[] hits; - protected int numHits; - - public SimpleSearchResults(int size) { - this.hits = new Hit[size]; - this.numHits = 0; - } - - public SimpleSearchResults(List hits) { - this.hits = new Hit[hits.size()]; - this.numHits = hits.size(); - hits.toArray(this.hits); - } - - public Hit[] hits() { - return hits; - } - - public int numHits() { - return numHits; - } - - public void setNumHits(int numHits) { - this.numHits = numHits; - } - - public Hit getHit(int hitIndex) { - return hits[hitIndex]; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SocialFilter.docx b/src/java/com/twitter/search/earlybird/search/SocialFilter.docx new file mode 100644 index 000000000..c81d9be9a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SocialFilter.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SocialFilter.java 
b/src/java/com/twitter/search/earlybird/search/SocialFilter.java deleted file mode 100644 index 9761171bc..000000000 --- a/src/java/com/twitter/search/earlybird/search/SocialFilter.java +++ /dev/null @@ -1,98 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; - -import com.google.common.base.Preconditions; -import com.google.common.primitives.Longs; - -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.common_internal.bloomfilter.BloomFilter; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.thrift.ThriftSocialFilterType; - -/** - * Filter class used by the SearchResultsCollector to filter social tweets - * from the hits. - */ -public class SocialFilter { - private interface Acceptor { - boolean accept(long fromUserLong, byte[] userIDInBytes); - } - - private NumericDocValues fromUserID; - private final Acceptor acceptor; - private final long searcherId; - private final BloomFilter trustedFilter; - private final BloomFilter followFilter; - - private class FollowsAcceptor implements Acceptor { - @Override - public boolean accept(long fromUserLong, byte[] userIdInBytes) { - return followFilter.contains(userIdInBytes); - } - } - - private class TrustedAcceptor implements Acceptor { - @Override - public boolean accept(long fromUserLong, byte[] userIdInBytes) { - return trustedFilter.contains(userIdInBytes); - } - } - - private class AllAcceptor implements Acceptor { - @Override - public boolean accept(long fromUserLong, byte[] userIdInBytes) { - return trustedFilter.contains(userIdInBytes) - || followFilter.contains(userIdInBytes) - || fromUserLong == searcherId; - } - } - - public SocialFilter( - ThriftSocialFilterType socialFilterType, - final long searcherId, - final byte[] trustedFilter, - final byte[] followFilter) throws IOException { - Preconditions.checkNotNull(socialFilterType); - Preconditions.checkNotNull(trustedFilter); - Preconditions.checkNotNull(followFilter); - this.searcherId = searcherId; - this.trustedFilter = new BloomFilter(trustedFilter); - this.followFilter = new BloomFilter(followFilter); - - - switch (socialFilterType) { - case FOLLOWS: - this.acceptor = new FollowsAcceptor(); - break; - case TRUSTED: - this.acceptor = new TrustedAcceptor(); - break; - case ALL: - this.acceptor = new AllAcceptor(); - break; - default: - throw new UnsupportedOperationException("Invalid social filter type passed"); - } - } - - public void startSegment(EarlybirdIndexSegmentAtomicReader indexReader) throws IOException { - fromUserID = - indexReader.getNumericDocValues(EarlybirdFieldConstant.FROM_USER_ID_CSF.getFieldName()); - } - - /** - * Determines if the given doc ID should be accepted. 
- */ - public boolean accept(int internalDocID) throws IOException { - if (!fromUserID.advanceExact(internalDocID)) { - return false; - } - - long fromUserLong = fromUserID.longValue(); - byte[] userIDInBytes = Longs.toByteArray(fromUserLong); - return acceptor.accept(fromUserLong, userIDInBytes); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.docx b/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.docx new file mode 100644 index 000000000..f7843176e Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.java b/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.java deleted file mode 100644 index 170db4faa..000000000 --- a/src/java/com/twitter/search/earlybird/search/SocialSearchResultsCollector.java +++ /dev/null @@ -1,47 +0,0 @@ -package com.twitter.search.earlybird.search; - -import java.io.IOException; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.earlybird.EarlybirdCluster; -import com.twitter.search.earlybird.common.userupdates.UserTable; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; - -/** - * A SearchResultsCollector that only collects hits accepted by the given SocialFilter - * (follows, trusted, or all), when such a filter is provided. - */ -public class SocialSearchResultsCollector extends SearchResultsCollector { - - private final SocialFilter socialFilter; - - public SocialSearchResultsCollector( - ImmutableSchemaInterface schema, - SearchRequestInfo searchRequestInfo, - SocialFilter socialFilter, - EarlybirdSearcherStats searcherStats, - EarlybirdCluster cluster, - UserTable userTable, - int requestDebugMode) { - super(schema, searchRequestInfo, Clock.SYSTEM_CLOCK, searcherStats, cluster, userTable, - requestDebugMode); - this.socialFilter = socialFilter; - } - - @Override - public final void doCollect(long tweetID) throws IOException { - if (socialFilter == null || socialFilter.accept(curDocId)) { - results.add(new Hit(currTimeSliceID, tweetID)); - } - } - - @Override - public void startSegment() throws IOException { - if (socialFilter != null) { - socialFilter.startSegment(currTwitterReader); - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.docx new file mode 100644 index 000000000..a5c13f08c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.java b/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.java deleted file mode 100644 index eb07d3fd1..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/AbstractFacetTermCollector.java +++ /dev/null @@ -1,67 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.Map; -import java.util.Set; - -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetTermCollector; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import 
com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultExtraMetadata; -import com.twitter.search.earlybird.thrift.ThriftSearchResultMetadata; - -public abstract class AbstractFacetTermCollector implements FacetTermCollector { - private Map facetLabelProviders; - private FacetIDMap facetIdMap; - - /** - * Populates the given ThriftSearchResult instance with the results collected by this collector - * and clears all collected results in this collector. - * - * @param result The ThriftSearchResult instance to be populated with the results collected in - * this collector. - */ - public abstract void fillResultAndClear(ThriftSearchResult result); - - public void resetFacetLabelProviders( - Map facetLabelProvidersToReset, FacetIDMap facetIdMapToReset) { - this.facetLabelProviders = facetLabelProvidersToReset; - this.facetIdMap = facetIdMapToReset; - } - - String findFacetName(int fieldId) { - return fieldId < 0 ? null : facetIdMap.getFacetFieldByFacetID(fieldId).getFacetName(); - } - - protected ThriftSearchResultExtraMetadata getExtraMetadata(ThriftSearchResult result) { - ThriftSearchResultMetadata metadata = result.getMetadata(); - if (!metadata.isSetExtraMetadata()) { - metadata.setExtraMetadata(new ThriftSearchResultExtraMetadata()); - } - return metadata.getExtraMetadata(); - } - - protected String getTermFromProvider( - String facetName, long termID, FacetLabelProvider provider) { - return provider.getLabelAccessor().getTermText(termID); - } - - protected String getTermFromFacet(long termID, int fieldID, Set facetsToCollectFrom) { - if (termID == EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND) { - return null; - } - - String facetName = findFacetName(fieldID); - if (!facetsToCollectFrom.contains(facetName)) { - return null; - } - - final FacetLabelProvider provider = facetLabelProviders.get(facetName); - if (provider == null) { - return null; - } - - return getTermFromProvider(facetName, termID, provider); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.docx b/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.docx new file mode 100644 index 000000000..782f7be26 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.java b/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.java deleted file mode 100644 index 729d6ea24..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/DefaultFacetScorer.java +++ /dev/null @@ -1,236 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.io.IOException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.ranking.thriftjava.ThriftFacetEarlybirdSortingMode; -import com.twitter.search.common.ranking.thriftjava.ThriftFacetRankingOptions; -import com.twitter.search.common.relevance.features.EarlybirdDocumentFeatures; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.common.util.lang.ThriftLanguageUtil; -import com.twitter.search.core.earlybird.facets.FacetAccumulator; -import com.twitter.search.core.earlybird.facets.FacetCountIterator; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import 
com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.search.AntiGamingFilter; -import com.twitter.search.earlybird.search.facets.FacetResultsCollector.Accumulator; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; - -public class DefaultFacetScorer extends FacetScorer { - private static final Logger LOG = LoggerFactory.getLogger(FacetScorer.class.getName()); - private static final double DEFAULT_FEATURE_WEIGHT = 0.0; - private static final byte DEFAULT_PENALTY = 1; - - private static final byte DEFAULT_REPUTATION_MIN = 45; - - private final AntiGamingFilter antiGamingFilter; - - // tweepcreds below this value will not be counted at all - private final byte reputationMinFilterThresholdVal; - - // tweepcreds between reputationMinFilterThresholdVal and this value will be counted - // with a score of 1 - private final byte reputationMinScoreVal; - - private final double userRepWeight; - private final double favoritesWeight; - private final double parusWeight; - private final double parusBase; - private final double queryIndependentPenaltyWeight; - - private final ThriftLanguage uiLang; - private final double langEnglishUIBoost; - private final double langEnglishFacetBoost; - private final double langDefaultBoost; - - private final int antigamingPenalty; - private final int offensiveTweetPenalty; - private final int multipleHashtagsOrTrendsPenalty; - - private final int maxScorePerTweet; - private final ThriftFacetEarlybirdSortingMode sortingMode; - - private EarlybirdIndexSegmentAtomicReader reader; - private EarlybirdDocumentFeatures features; - - /** - * Creates a new facet scorer. - */ - public DefaultFacetScorer(ThriftSearchQuery searchQuery, - ThriftFacetRankingOptions rankingOptions, - AntiGamingFilter antiGamingFilter, - ThriftFacetEarlybirdSortingMode sortingMode) { - this.sortingMode = sortingMode; - this.antiGamingFilter = antiGamingFilter; - - maxScorePerTweet = - rankingOptions.isSetMaxScorePerTweet() - ? rankingOptions.getMaxScorePerTweet() - : Integer.MAX_VALUE; - - // filters - reputationMinFilterThresholdVal = - rankingOptions.isSetMinTweepcredFilterThreshold() - ? (byte) (rankingOptions.getMinTweepcredFilterThreshold() & 0xFF) - : DEFAULT_REPUTATION_MIN; - - // weights - // reputationMinScoreVal must be >= reputationMinFilterThresholdVal - reputationMinScoreVal = - (byte) Math.max(rankingOptions.isSetReputationParams() - ? (byte) rankingOptions.getReputationParams().getMin() - : DEFAULT_REPUTATION_MIN, reputationMinFilterThresholdVal); - - parusWeight = - rankingOptions.isSetParusScoreParams() && rankingOptions.getParusScoreParams().isSetWeight() - ? rankingOptions.getParusScoreParams().getWeight() - : DEFAULT_FEATURE_WEIGHT; - // compute this once so that base ** parusScore is backwards-compatible - parusBase = Math.sqrt(1 + parusWeight); - - userRepWeight = - rankingOptions.isSetReputationParams() && rankingOptions.getReputationParams().isSetWeight() - ? rankingOptions.getReputationParams().getWeight() - : DEFAULT_FEATURE_WEIGHT; - - favoritesWeight = - rankingOptions.isSetFavoritesParams() && rankingOptions.getFavoritesParams().isSetWeight() - ? rankingOptions.getFavoritesParams().getWeight() - : DEFAULT_FEATURE_WEIGHT; - - queryIndependentPenaltyWeight = - rankingOptions.isSetQueryIndependentPenaltyWeight() - ? rankingOptions.getQueryIndependentPenaltyWeight() - : DEFAULT_FEATURE_WEIGHT; - - // penalty increment - antigamingPenalty = - rankingOptions.isSetAntigamingPenalty() - ? 
rankingOptions.getAntigamingPenalty() - : DEFAULT_PENALTY; - - offensiveTweetPenalty = - rankingOptions.isSetOffensiveTweetPenalty() - ? rankingOptions.getOffensiveTweetPenalty() - : DEFAULT_PENALTY; - - multipleHashtagsOrTrendsPenalty = - rankingOptions.isSetMultipleHashtagsOrTrendsPenalty() - ? rankingOptions.getMultipleHashtagsOrTrendsPenalty() - : DEFAULT_PENALTY; - - // query information - if (!searchQuery.isSetUiLang() || searchQuery.getUiLang().isEmpty()) { - uiLang = ThriftLanguage.UNKNOWN; - } else { - uiLang = ThriftLanguageUtil.getThriftLanguageOf(searchQuery.getUiLang()); - } - langEnglishUIBoost = rankingOptions.getLangEnglishUIBoost(); - langEnglishFacetBoost = rankingOptions.getLangEnglishFacetBoost(); - langDefaultBoost = rankingOptions.getLangDefaultBoost(); - } - - @Override - protected void startSegment(EarlybirdIndexSegmentAtomicReader segmentReader) throws IOException { - reader = segmentReader; - features = new EarlybirdDocumentFeatures(reader); - if (antiGamingFilter != null) { - antiGamingFilter.startSegment(reader); - } - } - - @Override - public void incrementCounts(Accumulator accumulator, int internalDocID) throws IOException { - FacetCountIterator.IncrementData data = accumulator.accessor.incrementData; - data.accumulators = accumulator.accumulators; - features.advance(internalDocID); - - // Also keep track of the tweet language of tweet themselves. - data.languageId = (int) features.getFeatureValue(EarlybirdFieldConstant.LANGUAGE); - - if (antigamingPenalty > 0 - && antiGamingFilter != null - && !antiGamingFilter.accept(internalDocID)) { - data.weightedCountIncrement = 0; - data.penaltyIncrement = antigamingPenalty; - data.tweepCred = 0; - accumulator.accessor.collect(internalDocID); - return; - } - - if (offensiveTweetPenalty > 0 && features.isFlagSet(EarlybirdFieldConstant.IS_OFFENSIVE_FLAG)) { - data.weightedCountIncrement = 0; - data.penaltyIncrement = offensiveTweetPenalty; - data.tweepCred = 0; - accumulator.accessor.collect(internalDocID); - return; - } - - byte userRep = (byte) features.getFeatureValue(EarlybirdFieldConstant.USER_REPUTATION); - - if (userRep < reputationMinFilterThresholdVal) { - // don't penalize - data.weightedCountIncrement = 0; - data.penaltyIncrement = 0; - data.tweepCred = 0; - accumulator.accessor.collect(internalDocID); - return; - } - - // Other non-terminating penalties - int penalty = 0; - if (multipleHashtagsOrTrendsPenalty > 0 - && features.isFlagSet(EarlybirdFieldConstant.HAS_MULTIPLE_HASHTAGS_OR_TRENDS_FLAG)) { - penalty += multipleHashtagsOrTrendsPenalty; - } - - double parus = 0xFF & (byte) features.getFeatureValue(EarlybirdFieldConstant.PARUS_SCORE); - - double score = Math.pow(1 + userRepWeight, Math.max(0, userRep - reputationMinScoreVal)); - - if (parus > 0) { - score += Math.pow(parusBase, parus); - } - - int favoriteCount = - (int) features.getUnnormalizedFeatureValue(EarlybirdFieldConstant.FAVORITE_COUNT); - if (favoriteCount > 0) { - score += favoriteCount * favoritesWeight; - } - - // Language preferences - int tweetLinkLangId = (int) features.getFeatureValue(EarlybirdFieldConstant.LINK_LANGUAGE); - if (tweetLinkLangId == ThriftLanguage.UNKNOWN.getValue()) { - // fall back to use the tweet language itself. 
- tweetLinkLangId = (int) features.getFeatureValue(EarlybirdFieldConstant.LANGUAGE); - } - if (uiLang != ThriftLanguage.UNKNOWN && uiLang.getValue() != tweetLinkLangId) { - if (uiLang == ThriftLanguage.ENGLISH) { - score *= langEnglishUIBoost; - } else if (tweetLinkLangId == ThriftLanguage.ENGLISH.getValue()) { - score *= langEnglishFacetBoost; - } else { - score *= langDefaultBoost; - } - } - - // make sure a single tweet can't contribute too high a score - if (score > maxScorePerTweet) { - score = maxScorePerTweet; - } - - data.weightedCountIncrement = (int) score; - data.penaltyIncrement = penalty; - data.tweepCred = userRep & 0xFF; - accumulator.accessor.collect(internalDocID); - } - - @Override - public FacetAccumulator getFacetAccumulator(FacetLabelProvider labelProvider) { - return new HashingAndPruningFacetAccumulator(labelProvider, queryIndependentPenaltyWeight, - HashingAndPruningFacetAccumulator.getComparator(sortingMode)); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.docx new file mode 100644 index 000000000..4fc2971a9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.java b/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.java deleted file mode 100644 index 81e07a718..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/EntityAnnotationCollector.java +++ /dev/null @@ -1,48 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.List; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.Lists; -import com.google.common.collect.Sets; - -import org.apache.commons.lang.StringUtils; - -import com.twitter.escherbird.thriftjava.TweetEntityAnnotation; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; - -public class EntityAnnotationCollector extends AbstractFacetTermCollector { - private List annotations = Lists.newArrayList(); - - @Override - public boolean collect(int docID, long termID, int fieldID) { - - String term = getTermFromFacet(termID, fieldID, - Sets.newHashSet(EarlybirdFieldConstant.ENTITY_ID_FIELD.getFieldName())); - if (StringUtils.isEmpty(term)) { - return false; - } - - String[] idParts = term.split("\\."); - - // Only include the full three-part form of the entity ID: "groupId.domainId.entityId" - // Exclude the less-specific forms we index: "domainId.entityId" and "entityId" - if (idParts.length < 3) { - return false; - } - - annotations.add(new TweetEntityAnnotation( - Long.valueOf(idParts[0]), - Long.valueOf(idParts[1]), - Long.valueOf(idParts[2]))); - - return true; - } - - @Override - public void fillResultAndClear(ThriftSearchResult result) { - getExtraMetadata(result).setEntityAnnotations(ImmutableList.copyOf(annotations)); - annotations.clear(); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.docx new file mode 100644 index 000000000..0f4d49466 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.docx differ diff --git 
a/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.java b/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.java deleted file mode 100644 index 65721747f..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/ExpandedUrlCollector.java +++ /dev/null @@ -1,118 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.LinkedHashMap; -import java.util.List; -import java.util.Map; - -import com.google.common.annotations.VisibleForTesting; -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableSet; - -import org.apache.lucene.util.BytesRef; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultUrl; -import com.twitter.service.spiderduck.gen.MediaTypes; - -/** - * A collector for collecting expanded urls from facets. Note that the only thing connecting this - * collector with expanded URLs is the fact that we only store the expanded url in the facet fields. - */ -public class ExpandedUrlCollector extends AbstractFacetTermCollector { - private static final ImmutableSet FACET_CONTAINS_URL = ImmutableSet.of( - EarlybirdFieldConstant.VIDEOS_FACET, - EarlybirdFieldConstant.IMAGES_FACET, - EarlybirdFieldConstant.NEWS_FACET, - EarlybirdFieldConstant.LINKS_FACET, - EarlybirdFieldConstant.TWIMG_FACET); - - private final Map dedupedUrls = new LinkedHashMap<>(); - - - @Override - protected String getTermFromProvider( - String facetName, - long termID, - FacetLabelProvider provider) { - String url = null; - if (EarlybirdFieldConstant.TWIMG_FACET.equals(facetName)) { - // Special case extraction of media url for twimg. - FacetLabelProvider.FacetLabelAccessor photoAccessor = provider.getLabelAccessor(); - BytesRef termPayload = photoAccessor.getTermPayload(termID); - if (termPayload != null) { - url = termPayload.utf8ToString(); - } - } else { - url = provider.getLabelAccessor().getTermText(termID); - } - return url; - } - - @Override - public boolean collect(int docID, long termID, int fieldID) { - - String url = getTermFromFacet(termID, fieldID, FACET_CONTAINS_URL); - if (url == null || url.isEmpty()) { - return false; - } - - ThriftSearchResultUrl resultUrl = new ThriftSearchResultUrl(); - resultUrl.setOriginalUrl(url); - MediaTypes mediaType = getMediaType(findFacetName(fieldID)); - resultUrl.setMediaType(mediaType); - - // Media links will show up twice: - // - once in image/native_image/video/news facets - // - another time in the links facet - // - // For those urls, we only want to return the media version. If it is non-media version, only - // write to map if doesn't exist already, if media version, overwrite any previous entries. - if (mediaType == MediaTypes.UNKNOWN) { - if (!dedupedUrls.containsKey(url)) { - dedupedUrls.put(url, resultUrl); - } - } else { - dedupedUrls.put(url, resultUrl); - } - - return true; - } - - @Override - public void fillResultAndClear(ThriftSearchResult result) { - result.getMetadata().setTweetUrls(getExpandedUrls()); - dedupedUrls.clear(); - } - - @VisibleForTesting - List getExpandedUrls() { - return ImmutableList.copyOf(dedupedUrls.values()); - } - - /** - * Gets the Spiderduck media type for a given facet name. - * - * @param facetName A given facet name. 
- * @return {@code MediaTypes} enum corresponding to the facet name. - */ - private static MediaTypes getMediaType(String facetName) { - if (facetName == null) { - return MediaTypes.UNKNOWN; - } - - switch (facetName) { - case EarlybirdFieldConstant.TWIMG_FACET: - return MediaTypes.NATIVE_IMAGE; - case EarlybirdFieldConstant.IMAGES_FACET: - return MediaTypes.IMAGE; - case EarlybirdFieldConstant.VIDEOS_FACET: - return MediaTypes.VIDEO; - case EarlybirdFieldConstant.NEWS_FACET: - return MediaTypes.NEWS; - default: - return MediaTypes.UNKNOWN; - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.docx new file mode 100644 index 000000000..a4a8afb91 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.java b/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.java deleted file mode 100644 index 76dc918c7..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/ExplainFacetResultsCollector.java +++ /dev/null @@ -1,159 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import com.google.common.collect.Maps; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import com.twitter.common.collections.Pair; -import com.twitter.common.util.Clock; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.search.AntiGamingFilter; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; -import com.twitter.search.earlybird.thrift.ThriftFacetResults; - -public class ExplainFacetResultsCollector extends FacetResultsCollector { - private static final Logger LOG = - LoggerFactory.getLogger(ExplainFacetResultsCollector.class.getName()); - - protected final List> proofs; - protected final Map>> proofAccumulators; - - protected Map facetLabelProviders; - private FacetIDMap facetIDMap; - - /** - * Creates a new facet collector with the ability to provide explanations for the search results. 
- */ - public ExplainFacetResultsCollector( - ImmutableSchemaInterface schema, - FacetSearchRequestInfo searchRequestInfo, - AntiGamingFilter antiGamingFilter, - EarlybirdSearcherStats searcherStats, - Clock clock, - int requestDebugMode) throws IOException { - super(schema, searchRequestInfo, antiGamingFilter, searcherStats, clock, requestDebugMode); - - proofs = new ArrayList<>(128); - - proofAccumulators = Maps.newHashMap(); - for (Schema.FieldInfo facetField : schema.getFacetFields()) { - HashMap> fieldLabelToTweetIdsMap = new HashMap<>(); - proofAccumulators.put(facetField.getFieldType().getFacetName(), fieldLabelToTweetIdsMap); - } - } - - @Override - protected Accumulator newPerSegmentAccumulator(EarlybirdIndexSegmentAtomicReader indexReader) { - Accumulator accumulator = super.newPerSegmentAccumulator(indexReader); - accumulator.accessor.setProofs(proofs); - facetLabelProviders = indexReader.getFacetLabelProviders(); - facetIDMap = indexReader.getFacetIDMap(); - - return accumulator; - } - - @Override - public void doCollect(long tweetID) throws IOException { - proofs.clear(); - - // FacetResultsCollector.doCollect() calls FacetScorer.incrementCounts(), - // FacetResultsCollector.doCollect() creates a FacetResultsCollector.Accumulator, if - // necessary, which contains the accessor (a CompositeFacetIterator) and accumulators - // (FacetAccumulator of each field) - super.doCollect(tweetID); - - for (Pair fieldIdTermIdPair : proofs) { - int fieldID = fieldIdTermIdPair.getFirst(); - long termID = fieldIdTermIdPair.getSecond(); - - // Convert term ID to the term text, a.k.a. facet label - String facetName = facetIDMap.getFacetFieldByFacetID(fieldID).getFacetName(); - if (facetName != null) { - String facetLabel = facetLabelProviders.get(facetName) - .getLabelAccessor().getTermText(termID); - - List tweetIDs = proofAccumulators.get(facetName).get(facetLabel); - if (tweetIDs == null) { - tweetIDs = new ArrayList<>(); - proofAccumulators.get(facetName).put(facetLabel, tweetIDs); - } - - tweetIDs.add(tweetID); - } - } - - // clear it again just to be sure - proofs.clear(); - } - - /** - * Sets explanations for the facet results. - */ - public void setExplanations(ThriftFacetResults facetResults) { - StringBuilder explanation = new StringBuilder(); - - for (Map.Entry facetFieldResultsEntry - : facetResults.getFacetFields().entrySet()) { - String facetName = facetFieldResultsEntry.getKey(); - ThriftFacetFieldResults facetFieldResults = facetFieldResultsEntry.getValue(); - - Map> proofAccumulator = proofAccumulators.get(facetName); - - if (proofAccumulator == null) { - // did not accumulate explanation for this facet type? a bug? - LOG.warn("No explanation accumulated for facet type " + facetName); - continue; - } - - for (ThriftFacetCount facetCount : facetFieldResults.getTopFacets()) { - String facetLabel = facetCount.getFacetLabel(); // a.k.a. term text - ThriftFacetCountMetadata metadata = facetCount.getMetadata(); - - List tweetIDs = proofAccumulator.get(facetLabel); - if (tweetIDs == null) { - // did not accumulate explanation for this facet label? a bug? 
- LOG.warn("No explanation accumulated for " + facetLabel + " of facet type " + facetName); - continue; - } - - explanation.setLength(0); - String oldExplanation = null; - if (metadata.isSetExplanation()) { - // save the old explanation from TwitterInMemoryIndexSearcher.fillTermMetadata() - oldExplanation = metadata.getExplanation(); - // as of 2012/05/29, we have 18 digits tweet IDs - explanation.ensureCapacity(oldExplanation.length() + (18 + 2) + 10); - } else { - // as of 2012/05/29, we have 18 digits tweet IDs - explanation.ensureCapacity(tweetIDs.size() * (18 + 2) + 10); - } - - explanation.append("["); - for (Long tweetID : tweetIDs) { - explanation.append(tweetID) - .append(", "); - } - explanation.setLength(explanation.length() - 2); // remove the last ", " - explanation.append("]\n"); - if (oldExplanation != null) { - explanation.append(oldExplanation); - } - metadata.setExplanation(explanation.toString()); - } - } - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.docx new file mode 100644 index 000000000..c917484c1 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.java b/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.java deleted file mode 100644 index 7ea471582..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/FacetLabelCollector.java +++ /dev/null @@ -1,62 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.ArrayList; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetTermCollector; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.thrift.ThriftFacetLabel; - -/** - * A collector for facet labels of given fields. 
- */ -public class FacetLabelCollector implements FacetTermCollector { - - private final Set<String> requiredFields; - private FacetIDMap facetIDMap; - private Map<String, FacetLabelProvider> facetLabelProviders; - - private final List<ThriftFacetLabel> labels = new ArrayList<>(); - - public FacetLabelCollector(Set<String> requiredFields) { - this.requiredFields = requiredFields; - } - - public void resetFacetLabelProviders(Map<String, FacetLabelProvider> facetLabelProvidersToReset, - FacetIDMap facetIDMapToReset) { - this.facetLabelProviders = facetLabelProvidersToReset; - this.facetIDMap = facetIDMapToReset; - labels.clear(); - } - - @Override - public boolean collect(int docID, long termID, int fieldID) { - String facetName = facetIDMap.getFacetFieldByFacetID(fieldID).getFacetName(); - if (facetName == null || !requiredFields.contains(facetName)) { - return false; - } - if (termID != EarlybirdIndexSegmentAtomicReader.TERM_NOT_FOUND && fieldID >= 0) { - final FacetLabelProvider provider = facetLabelProviders.get(facetName); - if (provider != null) { - FacetLabelProvider.FacetLabelAccessor labelAccessor = provider.getLabelAccessor(); - String label = labelAccessor.getTermText(termID); - int offensiveCount = labelAccessor.getOffensiveCount(termID); - labels.add(new ThriftFacetLabel() - .setFieldName(facetName) - .setLabel(label) - .setOffensiveCount(offensiveCount)); - return true; - } - } - return false; - } - - public List<ThriftFacetLabel> getLabels() { - // Make a copy - return new ArrayList<>(labels); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.docx b/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.docx new file mode 100644 index 000000000..b247932e7 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.java b/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.java deleted file mode 100644 index a32ac2253..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/FacetRankingModule.java +++ /dev/null @@ -1,26 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.ArrayList; -import java.util.List; - -import com.twitter.search.core.earlybird.facets.FacetCountState; -import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; - -public abstract class FacetRankingModule { - public static final List<FacetRankingModule> REGISTERED_RANKING_MODULES = - new ArrayList<>(); - - static { - REGISTERED_RANKING_MODULES.add(new SimpleCountRankingModule()); - } - - /** - * Prepares the {@link com.twitter.search.earlybird.thrift.ThriftFacetFieldResults} - * in {@link FacetCountState} before they're returned. This extension point therefore allows - * post-processing the facet results, e.g. for re-ranking or sorting purposes.
- */ - public abstract void prepareResults( - EarlybirdLuceneSearcher.FacetSearchResults hits, - FacetCountState facetCountState); -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.docx new file mode 100644 index 000000000..db2257a1c Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.java b/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.java deleted file mode 100644 index ba6a920e0..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/FacetResultsCollector.java +++ /dev/null @@ -1,229 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.PriorityQueue; - -import com.google.common.base.Preconditions; - -import com.twitter.common.util.Clock; -import com.twitter.search.common.constants.thriftjava.ThriftLanguage; -import com.twitter.search.common.ranking.thriftjava.ThriftFacetEarlybirdSortingMode; -import com.twitter.search.common.schema.base.ImmutableSchemaInterface; -import com.twitter.search.core.earlybird.facets.DummyFacetAccumulator; -import com.twitter.search.core.earlybird.facets.FacetAccumulator; -import com.twitter.search.core.earlybird.facets.FacetCountIterator; -import com.twitter.search.core.earlybird.facets.FacetIDMap; -import com.twitter.search.core.earlybird.facets.FacetIDMap.FacetField; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.LanguageHistogram; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.search.AbstractResultsCollector; -import com.twitter.search.earlybird.search.AntiGamingFilter; -import com.twitter.search.earlybird.search.EarlybirdLuceneSearcher.FacetSearchResults; -import com.twitter.search.earlybird.stats.EarlybirdSearcherStats; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; - -public class FacetResultsCollector extends - AbstractResultsCollector { - - private final FacetScorer facetScorer; - private final ThriftFacetEarlybirdSortingMode sortingMode; - - static class Accumulator { - protected final FacetAccumulator[] accumulators; - protected final FacetCountIterator accessor; - protected final FacetIDMap facetIDMap; - - Accumulator(FacetAccumulator[] accumulators, - FacetCountIterator accessor, - FacetIDMap facetIDMap) { - this.accumulators = accumulators; - this.accessor = accessor; - this.facetIDMap = facetIDMap; - } - - FacetAccumulator getFacetAccumulator(String facetName) { - FacetField facet = facetIDMap.getFacetFieldByFacetName(facetName); - return accumulators[facet.getFacetId()]; - } - } - - private Accumulator currentAccumulator; - private List segAccumulators; - private final HashingAndPruningFacetAccumulator.FacetComparator facetComparator; - - /** - * Creates a new FacetResultsCollector for the given facet search request. 
- */ - public FacetResultsCollector( - ImmutableSchemaInterface schema, - FacetSearchRequestInfo searchRequestInfo, - AntiGamingFilter antiGamingFilter, - EarlybirdSearcherStats searcherStats, - Clock clock, - int requestDebugInfo) { - super(schema, searchRequestInfo, clock, searcherStats, requestDebugInfo); - - if (searchRequestInfo.rankingOptions != null - && searchRequestInfo.rankingOptions.isSetSortingMode()) { - this.sortingMode = searchRequestInfo.rankingOptions.getSortingMode(); - } else { - this.sortingMode = ThriftFacetEarlybirdSortingMode.SORT_BY_WEIGHTED_COUNT; - } - - this.facetComparator = HashingAndPruningFacetAccumulator.getComparator(sortingMode); - this.facetScorer = createScorer(antiGamingFilter); - this.segAccumulators = new ArrayList<>(); - } - - @Override - public void startSegment() { - currentAccumulator = null; - } - - @Override - public void doCollect(long tweetID) throws IOException { - if (currentAccumulator == null) { - // Lazily create accumulators. Most segment / query / facet combinations have no hits. - currentAccumulator = newPerSegmentAccumulator(currTwitterReader); - segAccumulators.add(currentAccumulator); - facetScorer.startSegment(currTwitterReader); - } - facetScorer.incrementCounts(currentAccumulator, curDocId); - } - - @Override - public FacetSearchResults doGetResults() { - return new FacetSearchResults(this); - } - - /** - * Returns the top-k facet results for the requested facetName. - */ - public ThriftFacetFieldResults getFacetResults(String facetName, int topK) { - int totalCount = 0; - final Map<String, ThriftFacetCount> map = new HashMap<>(); - - LanguageHistogram languageHistogram = new LanguageHistogram(); - - for (Accumulator segAccumulator : segAccumulators) { - FacetAccumulator accumulator = - segAccumulator.getFacetAccumulator(facetName); - Preconditions.checkNotNull(accumulator); - - ThriftFacetFieldResults results = accumulator.getAllFacets(); - if (results == null) { - continue; - } - - totalCount += results.totalCount; - - // merge language histograms from different segments - languageHistogram.addAll(accumulator.getLanguageHistogram()); - - for (ThriftFacetCount facetCount : results.getTopFacets()) { - String label = facetCount.getFacetLabel(); - ThriftFacetCount oldCount = map.get(label); - if (oldCount != null) { - oldCount.setSimpleCount(oldCount.getSimpleCount() + facetCount.getSimpleCount()); - oldCount.setWeightedCount(oldCount.getWeightedCount() + facetCount.getWeightedCount()); - - oldCount.setFacetCount(oldCount.getFacetCount() + facetCount.getFacetCount()); - oldCount.setPenaltyCount(oldCount.getPenaltyCount() + facetCount.getPenaltyCount()); - } else { - map.put(label, facetCount); - } - } - } - - if (map.size() == 0 || totalCount == 0) { - // No results. - return null; - } - - // sort table wrt percentage - PriorityQueue<ThriftFacetCount> pq = - new PriorityQueue<>(map.size(), facetComparator.getThriftComparator(true)); - pq.addAll(map.values()); - - ThriftFacetFieldResults results = new ThriftFacetFieldResults(); - results.setTopFacets(new ArrayList<>()); - results.setTotalCount(totalCount); - - // Store merged language histogram into thrift object - for (Map.Entry entry - : languageHistogram.getLanguageHistogramAsMap().entrySet()) { - results.putToLanguageHistogram(entry.getKey(), entry.getValue()); - } - - // Get top facets.
- for (int i = 0; i < topK && i < map.size(); i++) { - ThriftFacetCount facetCount = pq.poll(); - if (facetCount != null) { - results.addToTopFacets(facetCount); - } - } - return results; - } - - protected FacetScorer createScorer(AntiGamingFilter antiGamingFilter) { - if (searchRequestInfo.rankingOptions != null) { - return new DefaultFacetScorer(searchRequestInfo.getSearchQuery(), - searchRequestInfo.rankingOptions, - antiGamingFilter, - sortingMode); - } else { - return new FacetScorer() { - @Override - protected void startSegment(EarlybirdIndexSegmentAtomicReader reader) { - } - - @Override - public void incrementCounts(Accumulator accumulator, int internalDocID) throws IOException { - accumulator.accessor.incrementData.accumulators = accumulator.accumulators; - accumulator.accessor.incrementData.weightedCountIncrement = 1; - accumulator.accessor.incrementData.penaltyIncrement = 0; - accumulator.accessor.incrementData.languageId = ThriftLanguage.UNKNOWN.getValue(); - accumulator.accessor.collect(internalDocID); - } - - @Override - public FacetAccumulator getFacetAccumulator(FacetLabelProvider labelProvider) { - return new HashingAndPruningFacetAccumulator(labelProvider, facetComparator); - } - }; - } - } - - protected Accumulator newPerSegmentAccumulator(EarlybirdIndexSegmentAtomicReader indexReader) { - final FacetIDMap facetIDMap = indexReader.getFacetIDMap(); - final FacetCountIterator accessor = - indexReader.getFacetCountingArray().getIterator( - indexReader, - getSearchRequestInfo().getFacetCountState(), - TweetSearchFacetCountIteratorFactory.FACTORY); - - final FacetAccumulator[] accumulators = - (FacetAccumulator[]) - new FacetAccumulator[facetIDMap.getNumberOfFacetFields()]; - - Map<String, FacetLabelProvider> labelProviders = indexReader.getFacetLabelProviders(); - for (FacetField f : facetIDMap.getFacetFields()) { - int id = f.getFacetId(); - if (getSearchRequestInfo().getFacetCountState().isCountField(f.getFieldInfo())) { - accumulators[id] = (FacetAccumulator) facetScorer - .getFacetAccumulator(labelProviders.get(f.getFacetName())); - } else { - // Dummy accumulator does nothing. - accumulators[id] = new DummyFacetAccumulator(); - } - } - - return new Accumulator(accumulators, accessor, facetIDMap); - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.docx b/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.docx new file mode 100644 index 000000000..e74e8587b Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.java b/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.java deleted file mode 100644 index 0e8725bac..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/FacetScorer.java +++ /dev/null @@ -1,24 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.io.IOException; - -import com.twitter.search.core.earlybird.facets.FacetAccumulator; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; -import com.twitter.search.earlybird.search.facets.FacetResultsCollector.Accumulator; - -public abstract class FacetScorer { - protected abstract void startSegment(EarlybirdIndexSegmentAtomicReader reader) throws IOException; - - /** - * Increments facet counts for the given document.
- */ - public abstract void incrementCounts(Accumulator accumulator, int internalDocID) - throws IOException; - - /** - * Returns a FacetAccumulator for counting facets. It will use the given FacetLabelProvider - * for facet result labeling. - */ - public abstract FacetAccumulator getFacetAccumulator(FacetLabelProvider labelProvider); -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.docx b/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.docx new file mode 100644 index 000000000..b9242b9a9 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.java b/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.java deleted file mode 100644 index 948b098d2..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/FacetSearchRequestInfo.java +++ /dev/null @@ -1,28 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import org.apache.lucene.search.Query; - -import com.twitter.search.common.ranking.thriftjava.ThriftFacetRankingOptions; -import com.twitter.search.common.search.TerminationTracker; -import com.twitter.search.core.earlybird.facets.FacetCountState; -import com.twitter.search.earlybird.search.SearchRequestInfo; -import com.twitter.search.earlybird.thrift.ThriftSearchQuery; - -public class FacetSearchRequestInfo extends SearchRequestInfo { - protected final FacetCountState facetCountState; - protected final ThriftFacetRankingOptions rankingOptions; - - public FacetSearchRequestInfo(ThriftSearchQuery searchQuery, - ThriftFacetRankingOptions rankingOptions, - Query query, - FacetCountState facetCountState, - TerminationTracker terminationTracker) { - super(searchQuery, query, terminationTracker); - this.facetCountState = facetCountState; - this.rankingOptions = rankingOptions; - } - - public final FacetCountState getFacetCountState() { - return this.facetCountState; - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.docx b/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.docx new file mode 100644 index 000000000..d3473ab50 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.java b/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.java deleted file mode 100644 index 9415b19ee..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/HashingAndPruningFacetAccumulator.java +++ /dev/null @@ -1,492 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.Arrays; -import java.util.Comparator; -import java.util.PriorityQueue; - -import com.twitter.search.common.ranking.thriftjava.ThriftFacetEarlybirdSortingMode; -import com.twitter.search.core.earlybird.facets.FacetAccumulator; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider; -import com.twitter.search.core.earlybird.facets.FacetLabelProvider.FacetLabelAccessor; -import com.twitter.search.core.earlybird.facets.LanguageHistogram; -import com.twitter.search.earlybird.thrift.ThriftFacetCount; -import com.twitter.search.earlybird.thrift.ThriftFacetCountMetadata; -import com.twitter.search.earlybird.thrift.ThriftFacetFieldResults; - -public class 
HashingAndPruningFacetAccumulator extends FacetAccumulator { - private static final int DEFAULT_HASH_SIZE = 4096; - /** - * 4 longs per entry accommodates long termIDs. - * Although entries could be encoded in 3 bytes, 4 ensures that no entry is split - * across cache lines. - */ - protected static final int LONGS_PER_ENTRY = 4; - private static final double LOAD_FACTOR = 0.5; - private static final long BITSHIFT_MAX_TWEEPCRED = 32; - private static final long PENALTY_COUNT_MASK = (1L << BITSHIFT_MAX_TWEEPCRED) - 1; - - protected static final long UNASSIGNED = -1; - - protected LanguageHistogram languageHistogram = new LanguageHistogram(); - - protected static final class HashTable { - protected final long[] hash; - protected final int size; - protected final int maxLoad; - protected final int mask; - - public HashTable(int size) { - hash = new long[LONGS_PER_ENTRY * size]; - Arrays.fill(hash, UNASSIGNED); - this.size = size; - // Ensure alignment to LONGS_PER_ENTRY-byte boundaries - this.mask = LONGS_PER_ENTRY * (size - 1); - this.maxLoad = (int) (size * LOAD_FACTOR); - } - - protected void reset() { - Arrays.fill(hash, UNASSIGNED); - } - - private final Cursor cursor = new Cursor(); - - public int findHashPosition(long termID) { - int code = (new Long(termID)).hashCode(); - int hashPos = code & mask; - - if (cursor.readFromHash(hashPos) && (cursor.termID != termID)) { - final int inc = ((code >> 8) + code) | 1; - do { - code += inc; - hashPos = code & this.mask; - } while (cursor.readFromHash(hashPos) && (cursor.termID != termID)); - } - - return hashPos; - } - - /** - * The cursor can be used to access the different fields of a hash entry. - * Callers should always position the cursor with readFromHash() before - * accessing the members. - */ - private final class Cursor { - private int simpleCount; - private int weightedCount; - private int penaltyCount; - private int maxTweepcred; - private long termID; - - public void writeToHash(int position) { - long payload = (((long) maxTweepcred) << BITSHIFT_MAX_TWEEPCRED) - | ((long) penaltyCount); - - assert itemPenaltyCount(payload) == penaltyCount : payload + ", " - + itemPenaltyCount(payload) + " != " + penaltyCount; - assert itemMaxTweepCred(payload) == maxTweepcred; - - hash[position] = termID; - hash[position + 1] = simpleCount; - hash[position + 2] = weightedCount; - hash[position + 3] = payload; - } - - /** Returns the item ID, or UNASSIGNED */ - public boolean readFromHash(int position) { - long entry = hash[position]; - if (entry == UNASSIGNED) { - termID = UNASSIGNED; - return false; - } - - termID = entry; - - simpleCount = (int) hash[position + 1]; - weightedCount = (int) hash[position + 2]; - long payload = hash[position + 3]; - - penaltyCount = itemPenaltyCount(payload); - maxTweepcred = itemMaxTweepCred(payload); - - return true; - } - } - } - - protected static int itemPenaltyCount(long payload) { - return (int) (payload & PENALTY_COUNT_MASK); - } - - protected static int itemMaxTweepCred(long payload) { - return (int) (payload >>> BITSHIFT_MAX_TWEEPCRED); - } - - protected int numItems; - protected final HashTable hashTable; - protected final long[] sortBuffer; - private FacetLabelProvider facetLabelProvider; - - private int totalSimpleCount; - private int totalWeightedCount; - private int totalPenalty; - - static final double DEFAULT_QUERY_INDEPENDENT_PENALTY_WEIGHT = 1.0; - private final double queryIndependentPenaltyWeight; - - private final FacetComparator facetComparator; - - public 
HashingAndPruningFacetAccumulator(FacetLabelProvider facetLabelProvider, - FacetComparator comparator) { - this(DEFAULT_HASH_SIZE, facetLabelProvider, - DEFAULT_QUERY_INDEPENDENT_PENALTY_WEIGHT, comparator); - } - - public HashingAndPruningFacetAccumulator(FacetLabelProvider facetLabelProvider, - double queryIndependentPenaltyWeight, FacetComparator comparator) { - this(DEFAULT_HASH_SIZE, facetLabelProvider, queryIndependentPenaltyWeight, comparator); - } - - /** - * Creates a new, empty HashingAndPruningFacetAccumulator with the given initial size. - * HashSize will be rounded up to the next power-of-2 value. - */ - public HashingAndPruningFacetAccumulator(int hashSize, FacetLabelProvider facetLabelProvider, - double queryIndependentPenaltyWeight, FacetComparator comparator) { - int powerOfTwoSize = 2; - while (hashSize > powerOfTwoSize) { - powerOfTwoSize *= 2; - } - - this.facetComparator = comparator; - hashTable = new HashTable(powerOfTwoSize); - sortBuffer = new long[LONGS_PER_ENTRY * (int) Math.ceil(LOAD_FACTOR * powerOfTwoSize)]; - this.facetLabelProvider = facetLabelProvider; - this.queryIndependentPenaltyWeight = queryIndependentPenaltyWeight; - } - - @Override - public void reset(FacetLabelProvider facetLabelProviderToReset) { - this.facetLabelProvider = facetLabelProviderToReset; - this.numItems = 0; - this.hashTable.reset(); - this.totalSimpleCount = 0; - this.totalPenalty = 0; - this.totalWeightedCount = 0; - languageHistogram.clear(); - } - - - @Override - public int add(long termID, int weightedCounterIncrement, int penaltyIncrement, int tweepCred) { - int hashPos = hashTable.findHashPosition(termID); - - totalPenalty += penaltyIncrement; - totalSimpleCount++; - totalWeightedCount += weightedCounterIncrement; - - if (hashTable.cursor.termID == UNASSIGNED) { - hashTable.cursor.termID = termID; - hashTable.cursor.simpleCount = 1; - hashTable.cursor.weightedCount = weightedCounterIncrement; - hashTable.cursor.penaltyCount = penaltyIncrement; - hashTable.cursor.maxTweepcred = tweepCred; - hashTable.cursor.writeToHash(hashPos); - - numItems++; - if (numItems >= hashTable.maxLoad) { - prune(); - } - return 1; - } else { - - hashTable.cursor.simpleCount++; - hashTable.cursor.weightedCount += weightedCounterIncrement; - - if (tweepCred > hashTable.cursor.maxTweepcred) { - hashTable.cursor.maxTweepcred = tweepCred; - } - - hashTable.cursor.penaltyCount += penaltyIncrement; - hashTable.cursor.writeToHash(hashPos); - return hashTable.cursor.simpleCount; - } - } - - @Override - public void recordLanguage(int languageId) { - languageHistogram.increment(languageId); - } - - @Override - public LanguageHistogram getLanguageHistogram() { - return languageHistogram; - } - - private void prune() { - copyToSortBuffer(); - hashTable.reset(); - - int targetNumItems = (int) (hashTable.maxLoad >> 1); - - int minCount = 2; - int nextMinCount = Integer.MAX_VALUE; - - final int n = LONGS_PER_ENTRY * numItems; - - while (numItems > targetNumItems) { - for (int i = 0; i < n; i += LONGS_PER_ENTRY) { - long item = sortBuffer[i]; - if (item != UNASSIGNED) { - int count = (int) sortBuffer[i + 1]; - if (count < minCount) { - evict(i); - } else if (count < nextMinCount) { - nextMinCount = count; - } - } - } - if (minCount == nextMinCount) { - minCount++; - } else { - minCount = nextMinCount; - } - nextMinCount = Integer.MAX_VALUE; - } - - // rehash - for (int i = 0; i < n; i += LONGS_PER_ENTRY) { - long item = sortBuffer[i]; - if (item != UNASSIGNED) { - final long termID = item; - int hashPos = 
hashTable.findHashPosition(termID); - for (int j = 0; j < LONGS_PER_ENTRY; ++j) { - hashTable.hash[hashPos + j] = sortBuffer[i + j]; - } - } - } - } - - // overridable for unit test - protected void evict(int index) { - sortBuffer[index] = UNASSIGNED; - numItems--; - } - - @Override - public ThriftFacetFieldResults getAllFacets() { - return getTopFacets(numItems); - } - - @Override - public ThriftFacetFieldResults getTopFacets(final int numRequested) { - int n = numRequested > numItems ? numItems : numRequested; - - if (n == 0) { - return null; - } - - ThriftFacetFieldResults facetResults = new ThriftFacetFieldResults(); - facetResults.setTotalCount(totalSimpleCount); - facetResults.setTotalScore(totalWeightedCount); - facetResults.setTotalPenalty(totalPenalty); - - copyToSortBuffer(); - - // sort table using the facet comparator - PriorityQueue<Item> pq = new PriorityQueue<>(numItems, facetComparator.getComparator(true)); - - for (int i = 0; i < LONGS_PER_ENTRY * numItems; i += LONGS_PER_ENTRY) { - pq.add(new Item(sortBuffer, i)); - } - - FacetLabelAccessor accessor = facetLabelProvider.getLabelAccessor(); - - for (int i = 0; i < n; i++) { - Item item = pq.poll(); - long id = item.getTermId(); - - int penalty = item.getPenaltyCount() + (int) (queryIndependentPenaltyWeight - * accessor.getOffensiveCount(id)); - ThriftFacetCount result = new ThriftFacetCount().setFacetLabel(accessor.getTermText(id)); - result.setPenaltyCount(penalty); - result.setSimpleCount(item.getSimpleCount()); - result.setWeightedCount(item.getWeightedCount()); - result.setMetadata(new ThriftFacetCountMetadata().setMaxTweepCred(item.getMaxTweetCred())); - - result.setFacetCount(result.getWeightedCount()); - facetResults.addToTopFacets(result); - } - - return facetResults; - } - - // Compacts the hashtable entries in place by removing empty hashes. After - // this operation it's no longer a hash table but an array of entries.
- private void copyToSortBuffer() { - int upto = 0; - - for (int i = 0; i < hashTable.hash.length; i += LONGS_PER_ENTRY) { - if (hashTable.hash[i] != UNASSIGNED) { - for (int j = 0; j < LONGS_PER_ENTRY; ++j) { - sortBuffer[upto + j] = hashTable.hash[i + j]; - } - upto += LONGS_PER_ENTRY; - } - } - assert upto == numItems * LONGS_PER_ENTRY; - } - - /** - * Sorts facets in the following order: - * 1) ascending by weightedCount - * 2) if weightedCount equal: ascending by simpleCount - * 3) if weightedCount and simpleCount equal: descending by penaltyCount - */ - public static int compareFacetCounts(int weightedCount1, int simpleCount1, int penaltyCount1, - int weightedCount2, int simpleCount2, int penaltyCount2, - boolean simpleCountPrecedence) { - if (simpleCountPrecedence) { - if (simpleCount1 < simpleCount2) { - return -1; - } else if (simpleCount1 > simpleCount2) { - return 1; - } else { - if (weightedCount1 < weightedCount2) { - return -1; - } else if (weightedCount1 > weightedCount2) { - return 1; - } else { - if (penaltyCount1 < penaltyCount2) { - // descending - return 1; - } else if (penaltyCount1 > penaltyCount2) { - return -1; - } else { - return 0; - } - } - } - } else { - if (weightedCount1 < weightedCount2) { - return -1; - } else if (weightedCount1 > weightedCount2) { - return 1; - } else { - if (simpleCount1 < simpleCount2) { - return -1; - } else if (simpleCount1 > simpleCount2) { - return 1; - } else { - if (penaltyCount1 < penaltyCount2) { - // descending - return 1; - } else if (penaltyCount1 > penaltyCount2) { - return -1; - } else { - return 0; - } - } - } - } - } - - public static final class FacetComparator { - private final Comparator thriftComparator; - private final Comparator comparator; - - private FacetComparator(Comparator thriftComparator, - Comparator comparator) { - this.thriftComparator = thriftComparator; - this.comparator = comparator; - } - - public Comparator getThriftComparator() { - return getThriftComparator(false); - } - - public Comparator getThriftComparator(boolean reverse) { - return reverse ? getReverseComparator(thriftComparator) : thriftComparator; - } - - private Comparator getComparator(boolean reverse) { - return reverse ? getReverseComparator(comparator) : comparator; - } - } - - public static final FacetComparator SIMPLE_COUNT_COMPARATOR = new FacetComparator( - (facet1, facet2) -> compareFacetCounts( - facet1.weightedCount, facet1.simpleCount, facet1.penaltyCount, - facet2.weightedCount, facet2.simpleCount, facet2.penaltyCount, - true), - (facet1, facet2) -> compareFacetCounts( - facet1.getWeightedCount(), facet1.getSimpleCount(), facet1.getPenaltyCount(), - facet2.getWeightedCount(), facet2.getSimpleCount(), facet2.getPenaltyCount(), - true)); - - - public static final FacetComparator WEIGHTED_COUNT_COMPARATOR = new FacetComparator( - (facet1, facet2) -> compareFacetCounts( - facet1.weightedCount, facet1.simpleCount, facet1.penaltyCount, - facet2.weightedCount, facet2.simpleCount, facet2.penaltyCount, - false), - (facet1, facet2) -> compareFacetCounts( - facet1.getWeightedCount(), facet1.getSimpleCount(), facet1.getPenaltyCount(), - facet2.getWeightedCount(), facet2.getSimpleCount(), facet2.getPenaltyCount(), - false)); - - /** - * Returns the appropriate FacetComparator for the specified sortingMode. 
- */ - public static FacetComparator getComparator(ThriftFacetEarlybirdSortingMode sortingMode) { - switch (sortingMode) { - case SORT_BY_WEIGHTED_COUNT: - return WEIGHTED_COUNT_COMPARATOR; - case SORT_BY_SIMPLE_COUNT: - default: - return SIMPLE_COUNT_COMPARATOR; - } - } - - private static <T> Comparator<T> getReverseComparator(final Comparator<T> comparator) { - return (t1, t2) -> -comparator.compare(t1, t2); - } - - static final class Item { - private final long[] data; - private final int offset; - - Item(long[] data, int offset) { - this.data = data; - this.offset = offset; - } - - public long getTermId() { - return data[offset]; - } - - public int getSimpleCount() { - return (int) data[offset + 1]; - } - - public int getWeightedCount() { - return (int) data[offset + 2]; - } - - public int getPenaltyCount() { - return itemPenaltyCount(data[offset + 3]); - } - - public int getMaxTweetCred() { - return itemMaxTweepCred(data[offset + 3]); - } - - @Override public int hashCode() { - return (int) (31 * getTermId()); - } - - @Override public boolean equals(Object o) { - return getTermId() == ((Item) o).getTermId(); - } - - } -} diff --git a/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.docx b/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.docx new file mode 100644 index 000000000..0e8c450c3 Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.java b/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.java deleted file mode 100644 index 2e7be9e30..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/NamedEntityCollector.java +++ /dev/null @@ -1,49 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.util.List; -import java.util.Map; - -import com.google.common.collect.ImmutableList; -import com.google.common.collect.ImmutableMap; -import com.google.common.collect.Lists; - -import org.apache.commons.lang.StringUtils; - -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.earlybird.thrift.NamedEntitySource; -import com.twitter.search.earlybird.thrift.ThriftSearchResult; -import com.twitter.search.earlybird.thrift.ThriftSearchResultNamedEntity; - -public class NamedEntityCollector extends AbstractFacetTermCollector { - private static final Map<String, NamedEntitySource> NAMED_ENTITY_WITH_TYPE_FIELDS = - ImmutableMap.of( - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_TEXT_FIELD.getFieldName(), - NamedEntitySource.TEXT, - EarlybirdFieldConstant.NAMED_ENTITY_WITH_TYPE_FROM_URL_FIELD.getFieldName(), - NamedEntitySource.URL); - - private List<ThriftSearchResultNamedEntity> namedEntities = Lists.newArrayList(); - - @Override - public boolean collect(int docID, long termID, int fieldID) { - - String term = getTermFromFacet(termID, fieldID, NAMED_ENTITY_WITH_TYPE_FIELDS.keySet()); - if (StringUtils.isEmpty(term)) { - return false; - } - - int index = term.lastIndexOf(":"); - namedEntities.add(new ThriftSearchResultNamedEntity( - term.substring(0, index), - term.substring(index + 1), - NAMED_ENTITY_WITH_TYPE_FIELDS.get(findFacetName(fieldID)))); - - return true; - } - - @Override - public void fillResultAndClear(ThriftSearchResult result) { - getExtraMetadata(result).setNamedEntities(ImmutableList.copyOf(namedEntities)); - namedEntities.clear(); - } -} diff --git 
a/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.docx b/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.docx new file mode 100644 index 000000000..a7687876a Binary files /dev/null and b/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.docx differ diff --git a/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.java b/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.java deleted file mode 100644 index 1693b8cf2..000000000 --- a/src/java/com/twitter/search/earlybird/search/facets/RetweetFacetCountIterator.java +++ /dev/null @@ -1,36 +0,0 @@ -package com.twitter.search.earlybird.search.facets; - -import java.io.IOException; - -import org.apache.lucene.index.NumericDocValues; - -import com.twitter.search.common.schema.base.Schema; -import com.twitter.search.common.schema.earlybird.EarlybirdFieldConstants.EarlybirdFieldConstant; -import com.twitter.search.core.earlybird.facets.CSFFacetCountIterator; -import com.twitter.search.core.earlybird.index.EarlybirdIndexSegmentAtomicReader; - -/** - * An iterator for counting retweets. Reads from shared_status_id CSF but doesn't count - * replies. - */ -public class RetweetFacetCountIterator extends CSFFacetCountIterator { - private final NumericDocValues featureReaderIsRetweetFlag; - - public RetweetFacetCountIterator( - EarlybirdIndexSegmentAtomicReader reader, - Schema.FieldInfo facetFieldInfo) throws IOException { - super(reader, facetFieldInfo); - featureReaderIsRetweetFlag = - reader.getNumericDocValues(EarlybirdFieldConstant.IS_RETWEET_FLAG.getFieldName()); - } - - @Override - protected boolean shouldCollect(int internalDocID, long termID) throws IOException { - // termID == 0 means that we didn't set shared_status_csf, so don't collect - // (tweet IDs are all positive) - // Also only collect if this doc is a retweet, not a reply - return termID > 0 - && featureReaderIsRetweetFlag.advanceExact(internalDocID) - && (featureReaderIsRetweetFlag.longValue() != 0); - } -}
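
Editor's note: the deleted HashingAndPruningFacetAccumulator packs each hash-table entry into four longs: the facet term ID, the simple count, the weighted count, and a payload long whose upper 32 bits hold the maximum tweepcred and whose lower 32 bits hold the penalty count (BITSHIFT_MAX_TWEEPCRED = 32, PENALTY_COUNT_MASK = (1L << 32) - 1). The standalone sketch below is not part of the deleted sources; the class and method names are illustrative, and it only demonstrates how such a payload can be packed and unpacked.

    public final class FacetPayloadPackingSketch {
      private static final long BITSHIFT_MAX_TWEEPCRED = 32;
      private static final long PENALTY_COUNT_MASK = (1L << BITSHIFT_MAX_TWEEPCRED) - 1;

      // Pack maxTweepcred into the upper 32 bits and penaltyCount into the lower 32 bits.
      static long pack(int maxTweepcred, int penaltyCount) {
        return (((long) maxTweepcred) << BITSHIFT_MAX_TWEEPCRED) | (penaltyCount & PENALTY_COUNT_MASK);
      }

      // Recover the penalty count from the lower 32 bits of the payload.
      static int penaltyCount(long payload) {
        return (int) (payload & PENALTY_COUNT_MASK);
      }

      // Recover the maximum tweepcred from the upper 32 bits of the payload.
      static int maxTweepcred(long payload) {
        return (int) (payload >>> BITSHIFT_MAX_TWEEPCRED);
      }

      public static void main(String[] args) {
        long payload = pack(87, 3);
        // Prints: maxTweepcred=87 penaltyCount=3
        System.out.println("maxTweepcred=" + maxTweepcred(payload) + " penaltyCount=" + penaltyCount(payload));
      }
    }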
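
Similarly, FacetResultsCollector.getFacetResults() merges the per-segment counts for one facet field by label and then pushes the merged counts into a PriorityQueue with a reversed comparator, so poll() returns the highest-ranked facets first. The sketch below is a simplified illustration of that merge-and-top-K step under assumed types: FacetCount here is a hypothetical stand-in for ThriftFacetCount, and only the simple count is merged.

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.HashMap;
    import java.util.List;
    import java.util.Map;
    import java.util.PriorityQueue;

    public final class FacetMergeSketch {
      // Hypothetical simplified facet count; the deleted code uses ThriftFacetCount.
      static final class FacetCount {
        final String label;
        int simpleCount;
        FacetCount(String label, int simpleCount) { this.label = label; this.simpleCount = simpleCount; }
      }

      // Merge per-segment counts by label, then return the topK facets by simple count.
      static List<FacetCount> topK(List<List<FacetCount>> perSegmentCounts, int topK) {
        Map<String, FacetCount> merged = new HashMap<>();
        for (List<FacetCount> segment : perSegmentCounts) {
          for (FacetCount count : segment) {
            FacetCount old = merged.get(count.label);
            if (old != null) {
              old.simpleCount += count.simpleCount;  // accumulate counts across segments
            } else {
              merged.put(count.label, new FacetCount(count.label, count.simpleCount));
            }
          }
        }
        // Reversed comparator, so the queue polls the largest counts first
        // (analogous to getThriftComparator(true) in the deleted collector).
        PriorityQueue<FacetCount> pq = new PriorityQueue<>(Math.max(1, merged.size()),
            Comparator.comparingInt((FacetCount f) -> f.simpleCount).reversed());
        pq.addAll(merged.values());
        List<FacetCount> top = new ArrayList<>();
        for (int i = 0; i < topK && !pq.isEmpty(); i++) {
          top.add(pq.poll());
        }
        return top;
      }
    }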