Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 08:29:21 +01:00)

commit 8948d714f6 (parent 47a8228a09)
[docx] split commit for file 4200
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Binary file not shown.
@@ -1,279 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;

import org.apache.commons.lang.time.FastDateFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentHdfsFlusher;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentLoader;
import com.twitter.search.earlybird.partition.SegmentOptimizer;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.partition.SimpleSegmentIndexer;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;

/**
 * Given a segment, this class checks if the segment has an index built on HDFS:
 * if not, use SimpleSegmentIndexer to build an index
 * if yes, load the HDFS index, build a new index for the new status data which has dates newer
 * than the HDFS index, then append the loaded HDFS index.
 */
public class ArchiveSegmentUpdater {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentUpdater.class);

  private final SegmentSyncConfig sync;
  private final EarlybirdIndexConfig earlybirdIndexConfig;
  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
  private final SearchIndexingMetricSet searchIndexingMetricSet =
      new SearchIndexingMetricSet(statsReceiver);
  private final EarlybirdSearcherStats searcherStats =
      new EarlybirdSearcherStats(statsReceiver);
  private final SearchRateCounter indexNewSegment =
      new SearchRateCounter("index_new_segment");
  private final SearchRateCounter updateExistingSegment =
      new SearchRateCounter("update_existing_segment");
  private final SearchRateCounter skipExistingSegment =
      new SearchRateCounter("skip_existing_segment");
  private Clock clock;

  public ArchiveSegmentUpdater(ZooKeeperTryLockFactory zooKeeperTryLockFactory,
                               SegmentSyncConfig sync,
                               EarlybirdIndexConfig earlybirdIndexConfig,
                               Clock clock) {
    this.sync = sync;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.clock = clock;
  }

  private boolean canUpdateSegment(SegmentInfo segmentInfo) {
    if (!(segmentInfo.getSegment() instanceof ArchiveSegment)) {
      LOG.info("only ArchiveSegment is available for updating now: "
          + segmentInfo);
      return false;
    }

    if (!segmentInfo.isEnabled()) {
      LOG.debug("Segment is disabled: " + segmentInfo);
      return false;
    }

    if (segmentInfo.isComplete() || segmentInfo.isIndexing()
        || segmentInfo.getSyncInfo().isLoaded()) {
      LOG.debug("Cannot update already indexed segment: " + segmentInfo);
      return false;
    }

    return true;
  }

  /**
   * Given a segment, checks if the segment has an index built on HDFS:
   * if not, use SimpleSegmentIndexer to build an index
   * if yes, load the HDFS index, build a new index for the new status data which has dates newer
   * than the HDFS index, then append the loaded HDFS index.
   *
   * Returns whether the segment was successfully updated.
   */
  public boolean updateSegment(SegmentInfo segmentInfo) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
    if (!canUpdateSegment(segmentInfo)) {
      return false;
    }

    if (segmentInfo.isIndexing()) {
      LOG.error("Segment is already being indexed: " + segmentInfo);
      return false;
    }

    final Date hdfsEndDate = ArchiveHDFSUtils.getSegmentEndDateOnHdfs(sync, segmentInfo);
    if (hdfsEndDate == null) {
      indexNewSegment.increment();
      if (!indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE)) {
        return false;
      }
    } else {
      final Date curEndDate = ((ArchiveSegment) segmentInfo.getSegment()).getDataEndDate();
      if (!hdfsEndDate.before(curEndDate)) {
        skipExistingSegment.increment();
        LOG.info("Segment is up-to-date: " + segmentInfo.getSegment().getTimeSliceID()
            + " Found flushed segment on HDFS with end date: "
            + FastDateFormat.getInstance("yyyyMMdd").format(hdfsEndDate));
        segmentInfo.setComplete(true);
        segmentInfo.getSyncInfo().setFlushed(true);
        return true;
      }

      updateExistingSegment.increment();
      LOG.info("Updating segment: " + segmentInfo.getSegment().getTimeSliceID()
          + "; new endDate will be " + FastDateFormat.getInstance("yyyyMMdd").format(curEndDate));

      if (!updateSegment(segmentInfo, hdfsEndDate)) {
        return false;
      }
    }

    boolean success = SegmentOptimizer.optimize(segmentInfo);
    if (!success) {
      // Clean up the segment dir on local disk
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Error optimizing segment: " + segmentInfo);
      return false;
    }

    // Verify segment before uploading.
    success = ArchiveSegmentVerifier.verifySegment(segmentInfo);
    if (!success) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Segment not uploaded to HDFS because it did not pass verification: " + segmentInfo);
      return false;
    }

    // upload the index to HDFS
    success = new SegmentHdfsFlusher(zkTryLockFactory, sync, false)
        .flushSegmentToDiskAndHDFS(segmentInfo);
    if (success) {
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, false, true);
    } else {
      // Clean up the segment dir on hdfs
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, true, false);
      LOG.info("Error uploading segment to HDFS: " + segmentInfo);
    }
    segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();

    return success;
  }

  /**
   * Build index for the given segmentInfo. Only those statuses passing the dateFilter are indexed.
   */
  private boolean indexSegment(final SegmentInfo segmentInfo, Predicate<Date> dateFilter) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    RecordReader<TweetDocument> documentReader = null;
    try {
      ArchiveSegment archiveSegment = (ArchiveSegment) segmentInfo.getSegment();
      DocumentFactory<ThriftIndexingEvent> documentFactory =
          earlybirdIndexConfig.createDocumentFactory();
      documentReader = archiveSegment.getStatusRecordReader(documentFactory, dateFilter);

      // Read and index the statuses
      boolean success = new SimpleSegmentIndexer(documentReader, searchIndexingMetricSet)
          .indexSegment(segmentInfo);
      if (!success) {
        // Clean up segment dir on local disk
        segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
        LOG.info("Error indexing segment: " + segmentInfo);
      }

      return success;
    } catch (IOException e) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Exception while indexing segment: " + segmentInfo, e);
      return false;
    } finally {
      if (documentReader != null) {
        documentReader.stop();
      }
    }
  }

  /**
   * Load the index built on HDFS for the given segmentInfo, index the new data and append the
   * HDFS index to the new indexed segment
   */
  private boolean updateSegment(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    SegmentInfo hdfsSegmentInfo = loadSegmentFromHdfs(segmentInfo, hdfsEndDate);
    if (hdfsSegmentInfo == null) {
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    boolean success = indexSegment(segmentInfo, input -> {
      // we're updating the segment - only index days after the old end date,
      // and we're sure that the previous days have already been indexed.
      return input.after(hdfsEndDate);
    });
    if (!success) {
      LOG.error("Error indexing new data: " + segmentInfo);
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    // Now, append the index loaded from hdfs
    try {
      segmentInfo.getIndexSegment().append(hdfsSegmentInfo.getIndexSegment());
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Deleted local segment directories with end date " + hdfsEndDate + " : "
          + segmentInfo);
    } catch (IOException e) {
      LOG.warn("Caught IOException while appending segment " + hdfsSegmentInfo.getSegmentName(), e);
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      return false;
    }

    segmentInfo.setComplete(true);
    return true;
  }

  /**
   * Load the index built on HDFS for the given segmentInfo and end date
   */
  private SegmentInfo loadSegmentFromHdfs(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    ArchiveSegment segment = new ArchiveSegment(
        segmentInfo.getTimeSliceID(),
        EarlybirdConfig.getMaxSegmentSize(),
        segmentInfo.getNumPartitions(),
        segmentInfo.getSegment().getHashPartitionID(),
        hdfsEndDate);
    EarlybirdSegmentFactory factory = new EarlybirdSegmentFactory(
        earlybirdIndexConfig,
        searchIndexingMetricSet,
        searcherStats,
        clock);

    SegmentInfo hdfsSegmentInfo;

    try {
      hdfsSegmentInfo = new SegmentInfo(segment, factory, sync);
      CriticalExceptionHandler criticalExceptionHandler =
          new CriticalExceptionHandler();

      boolean success = new SegmentLoader(sync, criticalExceptionHandler)
          .load(hdfsSegmentInfo);
      if (!success) {
        // If not successful, segmentLoader has already cleaned up the local dir.
        LOG.info("Error loading hdfs segment " + hdfsSegmentInfo
            + ", building segment from scratch.");
        hdfsSegmentInfo = null;
      }
    } catch (IOException e) {
      LOG.error("Exception while loading segment from hdfs: " + segmentInfo, e);
      hdfsSegmentInfo = null;
    }

    return hdfsSegmentInfo;
  }
}
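The class above hinges on a single date comparison: no end date on HDFS means a full build, an HDFS end date at or past the segment's current data end date means the segment can be skipped, and anything else means indexing only the newer days and then appending the flushed index. The following is a minimal, self-contained sketch of that decision, written for illustration only; the class, enum, and constants here are not part of the deleted file.

import java.util.Date;
import java.util.function.Predicate;

public final class SegmentUpdateDecisionSketch {
  enum Action { INDEX_ALL, SKIP_UP_TO_DATE, INDEX_NEW_DAYS_AND_APPEND }

  static Action decide(Date hdfsEndDate, Date currentEndDate) {
    if (hdfsEndDate == null) {
      return Action.INDEX_ALL;                        // nothing flushed yet: build from scratch
    }
    if (!hdfsEndDate.before(currentEndDate)) {
      return Action.SKIP_UP_TO_DATE;                  // flushed index already covers the data
    }
    return Action.INDEX_NEW_DAYS_AND_APPEND;          // index the delta, then append the HDFS index
  }

  // Date filter for the "index the delta" case, mirroring the lambda in updateSegment().
  static Predicate<Date> newDaysOnly(Date hdfsEndDate) {
    return day -> day.after(hdfsEndDate);
  }

  public static void main(String[] args) {
    Date hdfsEnd = new Date(1_000_000_000_000L);      // hypothetical flushed end date
    Date currentEnd = new Date(1_000_086_400_000L);   // one day later
    System.out.println(decide(hdfsEnd, currentEnd));  // INDEX_NEW_DAYS_AND_APPEND
    System.out.println(newDaysOnly(hdfsEnd).test(currentEnd)); // true
  }
}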
Binary file not shown.
@@ -1,75 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.partition.SegmentInfo;

public final class ArchiveSegmentVerifier {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentVerifier.class);

  private ArchiveSegmentVerifier() {
  }

  @VisibleForTesting
  static boolean shouldVerifySegment(SegmentInfo segmentInfo) {
    if (segmentInfo.isIndexing()) {
      LOG.warn("ArchiveSegmentVerifier got segment still indexing.");
      return false;
    }

    if (!segmentInfo.isComplete()) {
      LOG.warn("ArchiveSegmentVerifier got incomplete segment.");
      return false;
    }

    if (!segmentInfo.isOptimized()) {
      LOG.warn("ArchiveSegmentVerifier got unoptimized segment.");
      return false;
    }

    return true;
  }

  /**
   * Verifies an archive segment has a sane number of leaves.
   */
  public static boolean verifySegment(SegmentInfo segmentInfo) {
    if (!shouldVerifySegment(segmentInfo)) {
      return false;
    }
    Directory directory = segmentInfo.getIndexSegment().getLuceneDirectory();
    return verifyLuceneIndex(directory);
  }

  private static boolean verifyLuceneIndex(Directory directory) {
    try {
      DirectoryReader indexerReader = DirectoryReader.open(directory);
      List<LeafReaderContext> leaves = indexerReader.getContext().leaves();
      if (leaves.size() != 1) {
        LOG.warn("Lucene index does not have exactly one segment: " + leaves.size() + " != 1. "
            + "Lucene segments should have been merged during optimization.");
        return false;
      }

      LeafReader reader = leaves.get(0).reader();
      if (reader.numDocs() <= 0) {
        LOG.warn("Lucene index has no documents: " + reader);
        return false;
      }
      return true;
    } catch (IOException e) {
      LOG.warn("Found bad lucene index at: " + directory);
      return false;
    }
  }
}
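The verification boils down to two Lucene checks: the optimized index must consist of exactly one leaf (segment), and that leaf must contain at least one document. Below is a standalone sketch of the same checks against an arbitrary on-disk Lucene index, assuming only lucene-core on the classpath; the class name and the path argument are illustrative, not from the deleted file.

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public final class LuceneIndexCheck {
  // Returns true only if the index has exactly one segment (leaf) and at least one document.
  static boolean verify(Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
      List<LeafReaderContext> leaves = reader.leaves();
      if (leaves.size() != 1) {
        return false; // an optimized archive segment should be fully merged
      }
      return leaves.get(0).reader().numDocs() > 0;
    }
  }

  public static void main(String[] args) throws IOException {
    try (Directory dir = FSDirectory.open(Paths.get(args[0]))) {
      System.out.println("index ok: " + verify(dir));
    }
  }
}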
Binary file not shown.
@@ -1,322 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.config.TierConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;

/**
 * Responsible for taking a number of daily status batches and partitioning them into time slices
 * which will be used to build segments.
 *
 * We try to put at most N number of tweets into a time slice.
 */
public class ArchiveTimeSlicer {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveTimeSlicer.class);

  private static final Comparator<TweetDocument> ASCENDING =
      (o1, o2) -> Long.compare(o1.getTweetID(), o2.getTweetID());

  private static final Comparator<TweetDocument> DESCENDING =
      (o1, o2) -> Long.compare(o2.getTweetID(), o1.getTweetID());

  // Represents a number of daily batches which will go into a segment.
  public static final class ArchiveTimeSlice {
    private Date startDate;
    private Date endDate;
    private int statusCount;
    private final DailyStatusBatches directory;
    private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

    // This list is always ordered from the oldest day to the newest day.
    // For the on-disk archive, we reverse the days in getTweetReaders().
    private final List<DailyStatusBatch> batches = Lists.newArrayList();

    private ArchiveTimeSlice(DailyStatusBatches directory,
                             ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
      this.directory = directory;
      this.earlybirdIndexConfig = earlybirdIndexConfig;
    }

    public Date getEndDate() {
      return endDate;
    }

    public int getStatusCount() {
      return statusCount;
    }

    public int getNumHashPartitions() {
      return batches.isEmpty() ? 0 : batches.get(0).getNumHashPartitions();
    }

    /**
     * Returns a reader for reading tweets from this timeslice.
     *
     * @param archiveSegment The segment to which the timeslice belongs.
     * @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
     * @param filter A filter that determines which dates should be read.
     */
    public RecordReader<TweetDocument> getStatusReader(
        ArchiveSegment archiveSegment,
        DocumentFactory<ThriftIndexingEvent> documentFactory,
        Predicate<Date> filter) throws IOException {
      // We no longer support ThriftStatus based document factories.
      Preconditions.checkState(documentFactory instanceof ThriftIndexingEventDocumentFactory);

      final int hashPartitionID = archiveSegment.getHashPartitionID();
      List<RecordReader<TweetDocument>> readers = new ArrayList<>(batches.size());
      List<DailyStatusBatch> orderedForReading = orderBatchesForReading(batches);
      LOG.info("Creating new status reader for hashPartition: "
          + hashPartitionID + " timeslice: " + getDescription());

      for (DailyStatusBatch batch : orderedForReading) {
        if (filter.apply(batch.getDate())) {
          LOG.info("Adding reader for " + batch.getDate() + " " + getDescription());
          PartitionedBatch partitionedBatch = batch.getPartition(hashPartitionID);
          // Don't even try to create a reader if the partition is empty.
          // There does not seem to be any problem in production now, but HDFS FileSystem's javadoc
          // does indicate that listStatus() is allowed to throw a FileNotFoundException if the
          // partition does not exist. This check makes the code more robust against future
          // HDFS FileSystem implementation changes.
          if (partitionedBatch.getStatusCount() > 0) {
            RecordReader<TweetDocument> tweetReaders = partitionedBatch.getTweetReaders(
                archiveSegment,
                directory.getStatusPathToUseForDay(batch.getDate()),
                documentFactory);
            readers.add(tweetReaders);
          }
        } else {
          LOG.info("Filtered reader for " + batch.getDate() + " " + getDescription());
        }
      }

      LOG.info("Creating reader for timeslice: " + getDescription()
          + " with " + readers.size() + " readers");

      return new MergingSortedRecordReader<TweetDocument>(getMergingComparator(), readers);
    }

    private List<DailyStatusBatch> orderBatchesForReading(List<DailyStatusBatch> orderedBatches) {
      // For the index formats using stock lucene, we want the most recent days to be indexed first.
      // In the twitter in-memory optimized indexes, older tweets will be added first, and
      // optimization will reverse the documents to make most recent tweets be first.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? orderedBatches : Lists.reverse(orderedBatches);
    }

    private Comparator<TweetDocument> getMergingComparator() {
      // We always want to retrieve larger tweet ids first.
      // LIFO means that the smaller ids get inserted first --> ASCENDING order.
      // FIFO would mean that we want to first insert the larger ids --> DESCENDING order.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? ASCENDING : DESCENDING;
    }

    /**
     * Returns the smallest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMinStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return 0;
      }

      for (int i = 0; i < batches.size(); i++) {
        long minStatusID = batches.get(i).getPartition(hashPartitionID).getMinStatusID();
        if (minStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return minStatusID;
        }
      }

      return 0;
    }

    /**
     * Returns the highest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMaxStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return Long.MAX_VALUE;
      }

      for (int i = batches.size() - 1; i >= 0; i--) {
        long maxStatusID = batches.get(i).getPartition(hashPartitionID).getMaxStatusID();
        if (maxStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return maxStatusID;
        }
      }

      return Long.MAX_VALUE;
    }

    /**
     * Returns a string with some information for this timeslice.
     */
    public String getDescription() {
      StringBuilder builder = new StringBuilder();
      builder.append("TimeSlice[start date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(startDate));
      builder.append(", end date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(endDate));
      builder.append(", status count=");
      builder.append(statusCount);
      builder.append(", days count=");
      builder.append(batches.size());
      builder.append("]");
      return builder.toString();
    }
  }

  private final int maxSegmentSize;
  private final DailyStatusBatches dailyStatusBatches;
  private final Date tierStartDate;
  private final Date tierEndDate;
  private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

  private List<ArchiveTimeSlice> lastCachedTimeslices = null;

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this(maxSegmentSize, dailyStatusBatches, TierConfig.DEFAULT_TIER_START_DATE,
        TierConfig.DEFAULT_TIER_END_DATE, earlybirdIndexConfig);
  }

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           Date tierStartDate,
                           Date tierEndDate,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this.maxSegmentSize = maxSegmentSize;
    this.dailyStatusBatches = dailyStatusBatches;
    this.tierStartDate = tierStartDate;
    this.tierEndDate = tierEndDate;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
  }

  private boolean cacheIsValid() throws IOException {
    return lastCachedTimeslices != null
        && !lastCachedTimeslices.isEmpty()
        && cacheIsValid(lastCachedTimeslices.get(lastCachedTimeslices.size() - 1).endDate);
  }

  private boolean cacheIsValid(Date lastDate) throws IOException {
    if (lastCachedTimeslices == null || lastCachedTimeslices.isEmpty()) {
      return false;
    }

    // Check if we have a daily batch newer than the last batch used for the newest timeslice.
    Calendar cal = Calendar.getInstance();
    cal.setTime(lastDate);
    cal.add(Calendar.DATE, 1);
    Date nextDate = cal.getTime();

    boolean foundBatch = dailyStatusBatches.hasValidBatchForDay(nextDate);

    LOG.info("Checking cache: Looked for valid batch for day {}. Found: {}",
        DailyStatusBatches.DATE_FORMAT.format(nextDate), foundBatch);

    return !foundBatch;
  }

  private boolean timesliceIsFull(ArchiveTimeSlice timeSlice, DailyStatusBatch batch) {
    return timeSlice.statusCount + batch.getMaxPerPartitionStatusCount() > maxSegmentSize;
  }

  private void doTimeSlicing() throws IOException {
    dailyStatusBatches.refresh();

    lastCachedTimeslices = Lists.newArrayList();
    ArchiveTimeSlice currentTimeSlice = null;

    // Iterate over each day and add it to the current timeslice, until it gets full.
    for (DailyStatusBatch batch : dailyStatusBatches.getStatusBatches()) {
      if (!batch.isValid()) {
        LOG.warn("Skipping hole: " + batch.getDate());
        continue;
      }

      if (currentTimeSlice == null || timesliceIsFull(currentTimeSlice, batch)) {
        if (currentTimeSlice != null) {
          LOG.info("Filled timeslice: " + currentTimeSlice.getDescription());
        }
        currentTimeSlice = new ArchiveTimeSlice(dailyStatusBatches, earlybirdIndexConfig);
        currentTimeSlice.startDate = batch.getDate();
        lastCachedTimeslices.add(currentTimeSlice);
      }

      currentTimeSlice.endDate = batch.getDate();
      currentTimeSlice.statusCount += batch.getMaxPerPartitionStatusCount();
      currentTimeSlice.batches.add(batch);
    }
    LOG.info("Last timeslice: {}", currentTimeSlice.getDescription());

    LOG.info("Done with time slicing. Number of timeslices: {}",
        lastCachedTimeslices.size());
  }

  /**
   * Returns all timeslices for this earlybird.
   */
  public List<ArchiveTimeSlice> getTimeSlices() throws IOException {
    if (cacheIsValid()) {
      return lastCachedTimeslices;
    }

    LOG.info("Cache is outdated. Loading new daily batches now...");

    doTimeSlicing();

    return lastCachedTimeslices != null ? Collections.unmodifiableList(lastCachedTimeslices) : null;
  }

  /**
   * Returns the timeslices that overlap the tier start/end date range, if one is specified.
   */
  public List<ArchiveTimeSlice> getTimeSlicesInTierRange() throws IOException {
    List<ArchiveTimeSlice> timeSlices = getTimeSlices();
    if (tierStartDate == TierConfig.DEFAULT_TIER_START_DATE
        && tierEndDate == TierConfig.DEFAULT_TIER_END_DATE) {
      return timeSlices;
    }

    List<ArchiveTimeSlice> filteredTimeSlice = Lists.newArrayList();
    for (ArchiveTimeSlice timeSlice : timeSlices) {
      if (timeSlice.startDate.before(tierEndDate) && !timeSlice.endDate.before(tierStartDate)) {
        filteredTimeSlice.add(timeSlice);
      }
    }

    return filteredTimeSlice;
  }

  @VisibleForTesting
  protected DailyStatusBatches getDailyStatusBatches() {
    return dailyStatusBatches;
  }
}
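doTimeSlicing() above is a greedy packing loop: walk the daily batches in date order, skip invalid days ("holes"), and start a new timeslice whenever the next day's per-partition status count would push the current slice over maxSegmentSize. A self-contained sketch of that loop follows; Day and Slice are illustrative stand-ins, not Earlybird classes.

import java.util.ArrayList;
import java.util.List;

public final class TimeSlicingSketch {
  record Day(String date, boolean valid, int maxPerPartitionStatusCount) { }

  static final class Slice {
    final List<Day> days = new ArrayList<>();
    int statusCount;
  }

  static List<Slice> slice(List<Day> days, int maxSegmentSize) {
    List<Slice> slices = new ArrayList<>();
    Slice current = null;
    for (Day day : days) {
      if (!day.valid()) {
        continue;                                    // "Skipping hole" in the original
      }
      if (current == null
          || current.statusCount + day.maxPerPartitionStatusCount() > maxSegmentSize) {
        current = new Slice();                       // previous slice is full: start a new one
        slices.add(current);
      }
      current.days.add(day);
      current.statusCount += day.maxPerPartitionStatusCount();
    }
    return slices;
  }

  public static void main(String[] args) {
    List<Day> days = List.of(
        new Day("20210101", true, 600),
        new Day("20210102", false, 0),               // hole, skipped
        new Day("20210103", true, 300),
        new Day("20210104", true, 700));
    System.out.println(slice(days, 1000).size());    // 2 slices: {0101, 0103} and {0104}
  }
}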
Binary file not shown.
@@ -1,166 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.JsonParseException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a day's worth of statuses (tweets) for multiple hash partitions.
 *
 * Note that what this class contains is not the data, but metadata.
 *
 * A day of tweets will come from:
 *   - A scrubgen, if it has happened before the scrubgen date.
 *   - Our daily jobs pipeline, if it has happened after that.
 *
 * This class checks that the _SUCCESS file exists in the "statuses" subdirectory and extracts the
 * status count, min status id and max status id.
 */
public class DailyStatusBatch implements Comparable<DailyStatusBatch> {
  private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatch.class);

  public static final long EMPTY_BATCH_STATUS_ID = -1;
  private static final String PARTITION_FORMAT = "p_%d_of_%d";
  private static final String SUCCESS_FILE_NAME = "_SUCCESS";

  private final Map<Integer, PartitionedBatch> hashPartitionToStatuses = Maps.newHashMap();

  private final Date date;
  private final int numHashPartitions;
  private final boolean hasSuccessFiles;

  public DailyStatusBatch(Date date, int numHashPartitions, Path statusPath, FileSystem hdfs) {
    this.date = date;
    this.numHashPartitions = numHashPartitions;
    this.hasSuccessFiles = checkForSuccessFile(hdfs, date, statusPath);
  }

  public Date getDate() {
    return date;
  }

  /**
   * Check for the presence of the _SUCCESS file for the given day's path on HDFS for the statuses
   * field group.
   */
  private boolean checkForSuccessFile(FileSystem hdfs, Date inputDate, Path statusPath) {
    Path dayPath = new Path(statusPath, ArchiveHDFSUtils.dateToPath(inputDate, "/"));
    Path successFilePath = new Path(dayPath, SUCCESS_FILE_NAME);
    try {
      return hdfs.getFileStatus(successFilePath).isFile();
    } catch (IOException e) {
      LOG.error("Could not verify existence of the _SUCCESS file. Assuming it doesn't exist.", e);
    }
    return false;
  }

  /**
   * Loads the data for this day for the given partition.
   */
  public PartitionedBatch addPartition(FileSystem hdfs, Path dayPath, int hashPartitionID)
      throws IOException {
    String partitionDir = String.format(PARTITION_FORMAT, hashPartitionID, numHashPartitions);
    Path path = new Path(dayPath, partitionDir);
    PartitionedBatch batch =
        new PartitionedBatch(path, hashPartitionID, numHashPartitions, date);
    batch.load(hdfs);
    hashPartitionToStatuses.put(hashPartitionID, batch);
    return batch;
  }

  public PartitionedBatch getPartition(int hashPartitionID) {
    return hashPartitionToStatuses.get(hashPartitionID);
  }

  /**
   * Returns the greatest status count in all partitions belonging to this batch.
   */
  public int getMaxPerPartitionStatusCount() {
    int maxPerPartitionStatusCount = 0;
    for (PartitionedBatch batch : hashPartitionToStatuses.values()) {
      maxPerPartitionStatusCount = Math.max(batch.getStatusCount(), maxPerPartitionStatusCount);
    }
    return maxPerPartitionStatusCount;
  }

  public int getNumHashPartitions() {
    return numHashPartitions;
  }

  @VisibleForTesting
  boolean hasSuccessFiles() {
    return hasSuccessFiles;
  }

  /**
   * Returns true if the _status_counts files could be found in each hash partition subfolder
   * that belongs to this timeslice AND the _SUCCESS file can be found at the root folder for
   * the day.
   */
  public boolean isValid() {
    // make sure we have data for all hash partitions
    for (int i = 0; i < numHashPartitions; i++) {
      PartitionedBatch day = hashPartitionToStatuses.get(i);
      if (day == null || !day.hasStatusCount() || day.isDisallowedEmptyPartition()) {
        return false;
      }
    }
    return hasSuccessFiles;
  }

  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("DailyStatusBatch[date=").append(date)
        .append(",valid=").append(isValid())
        .append(",hasSuccessFiles=").append(hasSuccessFiles)
        .append(",numHashPartitions=").append(numHashPartitions)
        .append("]:\n");
    for (int i = 0; i < numHashPartitions; i++) {
      builder.append('\t').append(hashPartitionToStatuses.get(i).toString()).append('\n');
    }
    return builder.toString();
  }

  @Override
  public int compareTo(DailyStatusBatch o) {
    return date.compareTo(o.date);
  }

  /**
   * Serialize this DailyStatusBatch to a JSON string.
   */
  public String serializeToJson() {
    return serializeToJson(new Gson());
  }

  @VisibleForTesting
  String serializeToJson(Gson gson) {
    return gson.toJson(this);
  }

  /**
   * Given a JSON string, parse its fields and construct a daily status batch.
   * @param batchStr the JSON string representation of a daily status batch.
   * @return the constructed daily status batch; if the string is of invalid format, null will be
   *         returned.
   */
  static DailyStatusBatch deserializeFromJson(String batchStr) {
    try {
      return new Gson().fromJson(batchStr, DailyStatusBatch.class);
    } catch (JsonParseException e) {
      LOG.error("Error parsing json string: " + batchStr, e);
      return null;
    }
  }
}
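serializeToJson() and deserializeFromJson() rely on Gson's default reflective mapping, with one JSON object per line in the batch summary file that DailyStatusBatches reads back. A minimal sketch of that round trip on a stand-in class follows; BatchSummary and its fields are illustrative only (the real DailyStatusBatch carries Hadoop types).

import com.google.gson.Gson;
import com.google.gson.JsonParseException;

public final class JsonRoundTripSketch {
  static final class BatchSummary {
    String date;
    int numHashPartitions;
    boolean hasSuccessFiles;
  }

  public static void main(String[] args) {
    Gson gson = new Gson();

    BatchSummary summary = new BatchSummary();
    summary.date = "20210101";
    summary.numHashPartitions = 12;
    summary.hasSuccessFiles = true;

    // One JSON object per line is the shape loadStatusBatchesFromHdfs() expects to read back.
    String line = gson.toJson(summary);
    System.out.println(line);

    try {
      BatchSummary parsed = gson.fromJson(line, BatchSummary.class);
      System.out.println(parsed.numHashPartitions);  // 12
    } catch (JsonParseException e) {
      // deserializeFromJson() returns null in this case; here we just report it.
      System.err.println("invalid line: " + line);
    }
  }
}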
Binary file not shown.
@@ -1,702 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.NavigableMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Maps;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.LineRecordFileReader;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.StatusBatchFlushVersion;

/**
 * Provides access to preprocessed statuses (tweets) to be indexed by archive search earlybirds.
 *
 * These tweets can be coming from a scrub gen or from the output of the daily jobs.
 */
public class DailyStatusBatches {
  private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatches.class);

  // Maximum time to spend on obtaining daily status batches by computing or loading from HDFS
  private static final Amount<Long, Time> MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_max_initial_load_time_minutes"),
          Time.MINUTES);
  // Time to wait before trying again when obtaining daily status batches fails
  private static final Amount<Long, Time> DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_waiting_time_minutes"),
          Time.MINUTES);
  private static final String DAILY_STATUS_BATCHES_SYNC_PATH =
      EarlybirdProperty.ZK_APP_ROOT.get() + "/daily_batches_sync";
  private static final String DAILY_BATCHES_ZK_LOCK = "daily_batches_zk_lock";
  private static final Amount<Long, Time> DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_zk_lock_expiration_minutes"),
          Time.MINUTES);

  static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance("yyyyMMdd");

  // before this date, there was no twitter
  private static final Date FIRST_TWITTER_DAY = DateUtil.toDate(2006, 2, 1);

  private static final String STATUS_BATCHES_PREFIX = "status_batches";

  private final String rootDir =
      EarlybirdConfig.getString("hdfs_offline_segment_sync_dir", "top_archive_statuses");

  private final String buildGen =
      EarlybirdConfig.getString("offline_segment_build_gen", "bg_1");

  public static final String STATUS_SUBDIR_NAME = "statuses";
  public static final String LAYOUT_SUBDIR_NAME = "layouts";
  public static final String SCRUB_GEN_SUFFIX_PATTERN = "scrubbed/%s";

  private static final String INTERMEDIATE_COUNTS_SUBDIR_NAME = "counts";
  private static final String SUCCESS_FILE_NAME = "_SUCCESS";
  private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)");
  private static final Date FIRST_TWEET_DAY = DateUtil.toDate(2006, 3, 21);

  private final Path rootPath = new Path(rootDir);
  private final Path buildGenPath = new Path(rootPath, buildGen);
  private final Path statusPath = new Path(buildGenPath, STATUS_SUBDIR_NAME);

  private final NavigableMap<Date, DailyStatusBatch> statusBatches = Maps.newTreeMap();

  private Date firstValidDay = null;
  private Date lastValidDay = null;

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final Date scrubGenDay;
  private long numberOfDaysWithValidScrubGenData;

  public DailyStatusBatches(
      ZooKeeperTryLockFactory zooKeeperTryLockFactory, Date scrubGenDay) throws IOException {
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.scrubGenDay = scrubGenDay;

    FileSystem hdfs = null;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
      verifyDirectory(hdfs);
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  @VisibleForTesting
  public Date getScrubGenDay() {
    return scrubGenDay;
  }

  public Collection<DailyStatusBatch> getStatusBatches() {
    return statusBatches.values();
  }

  /**
   * Resets the state of the directory.
   */
  private void resetDirectory() {
    statusBatches.clear();
    firstValidDay = null;
    lastValidDay = null;
  }

  /**
   * Indicates whether the directory has been initialized.
   */
  private boolean isInitialized() {
    return lastValidDay != null;
  }

  /**
   * Load the daily status batches from HDFS; return true if one or more batches could be loaded.
   **/
  private boolean refreshByLoadingHDFSStatusBatches(final FileSystem fs) throws IOException {
    // first find the latest valid end date of statuses
    final Date lastValidStatusDay = getLastValidInputDateFromNow(fs);
    if (lastValidStatusDay != null) {
      if (hasStatusBatchesOnHdfs(fs, lastValidStatusDay)) {
        if (loadStatusBatchesFromHdfs(fs, lastValidStatusDay)) {
          return true;
        }
      }
    }

    resetDirectory();
    return false;
  }

  /**
   * Checks the directory for new data and loads any newly available daily batches, computing or
   * loading all batches on the first call.
   */
  public void refresh() throws IOException {
    final FileSystem hdfs = HdfsUtil.getHdfsFileSystem();

    final Stopwatch stopwatch = Stopwatch.createStarted();
    try {
      if (!isInitialized()) {
        if (initializeDailyStatusBatches(hdfs, stopwatch)) {
          LOG.info("Successfully obtained daily status batches after {}", stopwatch);
        } else {
          String errMsg = "Failed to load or compute daily status batches after "
              + stopwatch.toString();
          LOG.error(errMsg);
          throw new IOException(errMsg);
        }
      } else {
        loadNewDailyBatches(hdfs);
      }
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  private boolean initializeDailyStatusBatches(final FileSystem hdfs,
                                               final Stopwatch stopwatch) throws IOException {
    long timeSpentOnDailyBatches = 0L;
    long maxAllowedTimeMs = MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES.as(Time.MILLISECONDS);
    long waitingTimeMs = DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES.as(Time.MILLISECONDS);
    boolean firstLoop = true;
    LOG.info("Starting to load or compute daily status batches for the first time.");
    while (timeSpentOnDailyBatches <= maxAllowedTimeMs && !Thread.currentThread().isInterrupted()) {
      if (!firstLoop) {
        try {
          LOG.info("Sleeping " + waitingTimeMs
              + " millis before trying to obtain daily batches again");
          Thread.sleep(waitingTimeMs);
        } catch (InterruptedException e) {
          LOG.warn("Interrupted while waiting to load daily batches", e);
          Thread.currentThread().interrupt();
          break;
        }
      }

      if (isStatusBatchLoadingEnabled() && refreshByLoadingHDFSStatusBatches(hdfs)) {
        LOG.info("Successfully loaded daily status batches after {}", stopwatch);
        return true;
      }

      final AtomicBoolean successRef = new AtomicBoolean(false);
      if (computeDailyBatchesWithZKLock(hdfs, successRef, stopwatch)) {
        return successRef.get();
      }

      timeSpentOnDailyBatches = stopwatch.elapsed(TimeUnit.MILLISECONDS);
      firstLoop = false;
    }

    return false;
  }

  private boolean computeDailyBatchesWithZKLock(final FileSystem hdfs,
                                                final AtomicBoolean successRef,
                                                final Stopwatch stopwatch) throws IOException {
    // Using a global lock to coordinate among earlybirds and segment builders so that only
    // one instance would hit the HDFS name node to query the daily status directories
    TryLock lock = zkTryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        DAILY_STATUS_BATCHES_SYNC_PATH,
        DAILY_BATCHES_ZK_LOCK,
        DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES);

    return lock.tryWithLock(() -> {
      LOG.info("Obtained ZK lock to compute daily status batches after {}", stopwatch);
      successRef.set(initialLoadDailyBatchInfos(hdfs));
      if (successRef.get()) {
        LOG.info("Successfully computed daily status batches after {}", stopwatch);
        if (isStatusBatchFlushingEnabled()) {
          LOG.info("Starting to store daily status batches to HDFS");
          if (storeStatusBatchesToHdfs(hdfs, lastValidDay)) {
            LOG.info("Successfully stored daily status batches to HDFS");
          } else {
            LOG.warn("Failed storing daily status batches to HDFS");
          }
        }
      } else {
        LOG.info("Failed loading daily status info");
      }
    });
  }

  private void verifyDirectory(FileSystem hdfs) throws IOException {
    if (!hdfs.exists(rootPath)) {
      throw new IOException("Root dir '" + rootPath + "' does not exist.");
    }

    if (!hdfs.exists(buildGenPath)) {
      throw new IOException("Build gen dir '" + buildGenPath + "' does not exist.");
    }

    if (!hdfs.exists(statusPath)) {
      throw new IOException("Status dir '" + statusPath + "' does not exist.");
    }
  }

  private void loadNewDailyBatches(FileSystem hdfs) throws IOException {
    Preconditions.checkNotNull(lastValidDay);

    Calendar day = Calendar.getInstance();
    day.setTime(lastValidDay);
    day.add(Calendar.DATE, 1);

    while (loadDay(hdfs, day.getTime()) != null) {
      lastValidDay = day.getTime();
      day.add(Calendar.DATE, 1);
    }
  }

  private boolean initialLoadDailyBatchInfos(FileSystem hdfs) throws IOException {
    LOG.info("Starting to build timeslice map from scratch.");

    final Date lastValidStatusDay = getLastValidInputDateFromNow(hdfs);

    if (lastValidStatusDay == null) {
      LOG.warn("No data found in " + statusPath + " and scrubbed path");
      return false;
    }
    int mostRecentYear = DateUtil.getCalendar(lastValidStatusDay).get(Calendar.YEAR);
    for (int year = 2006; year <= mostRecentYear; ++year) {
      // construct path to avoid hdfs.listStatus() calls
      Calendar day = Calendar.getInstance();
      day.set(year, Calendar.JANUARY, 1, 0, 0, 0);
      day.set(Calendar.MILLISECOND, 0);

      Calendar yearEnd = Calendar.getInstance();
      yearEnd.set(year, Calendar.DECEMBER, 31, 0, 0, 0);
      yearEnd.set(Calendar.MILLISECOND, 0);

      if (lastValidDay != null) {
        // We're updating.
        if (lastValidDay.after(yearEnd.getTime())) {
          // This year was already loaded.
          continue;
        }
        if (lastValidDay.after(day.getTime())) {
          // Start one day after last valid date.
          day.setTime(lastValidDay);
          day.add(Calendar.DATE, 1);
        }
      }

      for (; !day.after(yearEnd); day.add(Calendar.DATE, 1)) {
        loadDay(hdfs, day.getTime());
      }
    }

    boolean updated = false;
    numberOfDaysWithValidScrubGenData = 0;

    // Iterate batches in sorted order.
    for (DailyStatusBatch batch : statusBatches.values()) {
      if (!batch.isValid()) {
        break;
      }
      if (batch.getDate().before(scrubGenDay)) {
        numberOfDaysWithValidScrubGenData++;
      }
      if (firstValidDay == null) {
        firstValidDay = batch.getDate();
      }
      if (lastValidDay == null || lastValidDay.before(batch.getDate())) {
        lastValidDay = batch.getDate();
        updated = true;
      }
    }

    LOG.info("Number of statusBatches: {}", statusBatches.size());
    return updated;
  }

  private static String filesToString(FileStatus[] files) {
    if (files == null) {
      return "null";
    }
    StringBuilder b = new StringBuilder();
    for (FileStatus s : files) {
      b.append(s.getPath().toString()).append(", ");
    }
    return b.toString();
  }

  @VisibleForTesting
  protected DailyStatusBatch loadDay(FileSystem hdfs, Date day) throws IOException {
    Path dayPath = new Path(getStatusPathToUseForDay(day), ArchiveHDFSUtils.dateToPath(day, "/"));
    LOG.debug("Looking for batch in " + dayPath.toString());
    DailyStatusBatch result = this.statusBatches.get(day);
    if (result != null) {
      return result;
    }

    final FileStatus[] files;
    try {
      files = hdfs.listStatus(dayPath);
      LOG.debug("Files found: " + filesToString(files));
    } catch (FileNotFoundException e) {
      LOG.debug("loadDay() called, but directory does not exist for day: " + day
          + " in: " + dayPath);
      return null;
    }

    if (files != null && files.length > 0) {
      for (FileStatus file : files) {
        Matcher matcher = HASH_PARTITION_PATTERN.matcher(file.getPath().getName());
        if (matcher.matches()) {
          int numHashPartitions = Integer.parseInt(matcher.group(2));
          result = new DailyStatusBatch(
              day, numHashPartitions, getStatusPathToUseForDay(day), hdfs);

          for (int partitionID = 0; partitionID < numHashPartitions; partitionID++) {
            result.addPartition(hdfs, dayPath, partitionID);
          }

          if (result.isValid()) {
            statusBatches.put(day, result);
            return result;
          } else {
            LOG.info("Invalid batch found for day: " + day + ", batch: " + result);
          }
        } else {
          // skip logging the intermediate count subdirectories or _SUCCESS files.
          if (!INTERMEDIATE_COUNTS_SUBDIR_NAME.equals(file.getPath().getName())
              && !SUCCESS_FILE_NAME.equals(file.getPath().getName())) {
            LOG.warn("Path does not match hash partition pattern: " + file.getPath());
          }
        }
      }
    } else {
      LOG.warn("No data found for day: " + day + " in: " + dayPath
          + " files null: " + (files == null));
    }

    return null;
  }

  /**
   * Determines if this directory has a valid batch for the given day.
   */
  public boolean hasValidBatchForDay(Date day) throws IOException {
    FileSystem hdfs = null;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
      return hasValidBatchForDay(hdfs, day);
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  private boolean hasValidBatchForDay(FileSystem fs, Date day) throws IOException {
    DailyStatusBatch batch = loadDay(fs, day);

    return batch != null && batch.isValid();
  }

  @VisibleForTesting
  Date getFirstValidDay() {
    return firstValidDay;
  }

  @VisibleForTesting
  Date getLastValidDay() {
    return lastValidDay;
  }

  private Date getLastValidInputDateFromNow(FileSystem hdfs) throws IOException {
    Calendar cal = Calendar.getInstance();
    cal.setTime(new Date()); // current date
    return getLastValidInputDate(hdfs, cal);
  }

  /**
   * Starting from the current date, probe backwards until we find a valid input date.
   */
  @VisibleForTesting
  Date getLastValidInputDate(FileSystem hdfs, Calendar cal) throws IOException {
    cal.set(Calendar.MILLISECOND, 0);
    cal.set(Calendar.HOUR_OF_DAY, 0);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    Date lastValidInputDate = cal.getTime();
    LOG.info("Probing backwards for last valid data date from " + lastValidInputDate);
    while (lastValidInputDate.after(FIRST_TWITTER_DAY)) {
      if (hasValidBatchForDay(hdfs, lastValidInputDate)) {
        LOG.info("Found latest valid data on date " + lastValidInputDate);
        LOG.info("  Used path: {}", getStatusPathToUseForDay(lastValidInputDate));
        return lastValidInputDate;
      }
      cal.add(Calendar.DATE, -1);
      lastValidInputDate = cal.getTime();
    }

    return null;
  }

  /**
   * Check if the daily status batches are already on HDFS.
   */
  @VisibleForTesting
  boolean hasStatusBatchesOnHdfs(FileSystem fs, Date lastDataDay) {
    String hdfsFileName = getHdfsStatusBatchSyncFileName(lastDataDay);
    try {
      return fs.exists(new Path(hdfsFileName));
    } catch (IOException ex) {
      LOG.error("Failed checking status batch file on HDFS: " + hdfsFileName, ex);
      return false;
    }
  }

  /**
   * Load the daily status batches from HDFS by first copying the file from HDFS to local disk
   * and then reading from the local disk.
   *
   * @param day the latest day of valid statuses.
   * @return true if the loading is successful.
   */
  @VisibleForTesting
  boolean loadStatusBatchesFromHdfs(FileSystem fs, Date day) {
    // set the directory state to initial state
    resetDirectory();

    String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
    String fileLocalPath = getLocalStatusBatchSyncFileName(day);

    LOG.info("Using " + fileHdfsPath + " as the HDFS batch summary load path.");
    LOG.info("Using " + fileLocalPath + " as the local batch summary sync path.");

    LineRecordFileReader lineReader = null;
    try {
      fs.copyToLocalFile(new Path(fileHdfsPath), new Path(fileLocalPath));

      lineReader = new LineRecordFileReader(fileLocalPath);
      String batchLine;
      while ((batchLine = lineReader.readNext()) != null) {
        DailyStatusBatch batch = DailyStatusBatch.deserializeFromJson(batchLine);
        if (batch == null) {
          LOG.error("Invalid daily status batch constructed from line: " + batchLine);
          resetDirectory();
          return false;
        }
        Date date = batch.getDate();
        if (firstValidDay == null || firstValidDay.after(date)) {
          firstValidDay = date;
        }
        if (lastValidDay == null || lastValidDay.before(date)) {
          lastValidDay = date;
        }
        statusBatches.put(date, batch);
      }
      LOG.info("Loaded {} status batches from HDFS: {}",
          statusBatches.size(), fileHdfsPath);
      LOG.info("First entry: {}", statusBatches.firstEntry().getValue().toString());
      LOG.info("Last entry: {}", statusBatches.lastEntry().getValue().toString());

      return true;
    } catch (IOException ex) {
      LOG.error("Failed loading time slices from HDFS: " + fileHdfsPath, ex);
      resetDirectory();
      return false;
    } finally {
      if (lineReader != null) {
        lineReader.stop();
      }
    }
  }

  /**
   * Flush the daily status batches to local disk and then upload to HDFS.
   */
  private boolean storeStatusBatchesToHdfs(FileSystem fs, Date day) {
    Preconditions.checkNotNull(lastValidDay);

    if (!StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.isOfficial()) {
      LOG.info("Status batch flush version is not official, no batches will be flushed to HDFS");
      return true;
    }

    String fileLocalPath = getLocalStatusBatchSyncFileName(day);
|
|
||||||
|
|
||||||
// Flush to local disk
|
|
||||||
File outputFile = null;
|
|
||||||
FileWriter fileWriter = null;
|
|
||||||
try {
|
|
||||||
LOG.info("Flushing daily status batches into: " + fileLocalPath);
|
|
||||||
outputFile = new File(fileLocalPath);
|
|
||||||
outputFile.getParentFile().mkdirs();
|
|
||||||
if (!outputFile.getParentFile().exists()) {
|
|
||||||
LOG.error("Cannot create directory: " + outputFile.getParentFile().toString());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
fileWriter = new FileWriter(outputFile, false);
|
|
||||||
for (Date date : statusBatches.keySet()) {
|
|
||||||
fileWriter.write(statusBatches.get(date).serializeToJson());
|
|
||||||
fileWriter.write("\n");
|
|
||||||
}
|
|
||||||
fileWriter.flush();
|
|
||||||
|
|
||||||
// Upload the file to HDFS
|
|
||||||
return uploadStatusBatchesToHdfs(fs, day);
|
|
||||||
} catch (IOException e) {
|
|
||||||
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
|
|
||||||
LOG.error("Failed storing status batches to HDFS: " + fileHdfsPath, e);
|
|
||||||
return false;
|
|
||||||
} finally {
|
|
||||||
try {
|
|
||||||
if (fileWriter != null) {
|
|
||||||
fileWriter.close();
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.error("Error to close fileWrite.", e);
|
|
||||||
}
|
|
||||||
if (outputFile != null) {
|
|
||||||
// Delete the local file
|
|
||||||
outputFile.delete();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Upload the status batches to HDFS.
|
|
||||||
*/
|
|
||||||
@VisibleForTesting
|
|
||||||
boolean uploadStatusBatchesToHdfs(FileSystem fs, Date day) {
|
|
||||||
String localFileName = getLocalStatusBatchSyncFileName(day);
|
|
||||||
String hdfsFileName = getHdfsStatusBatchSyncFileName(day);
|
|
||||||
|
|
||||||
LOG.info("Using " + hdfsFileName + " as the HDFS batch summary upload path.");
|
|
||||||
LOG.info("Using " + localFileName + " as the local batch summary sync path.");
|
|
||||||
|
|
||||||
try {
|
|
||||||
Path hdfsFilePath = new Path(hdfsFileName);
|
|
||||||
if (fs.exists(hdfsFilePath)) {
|
|
||||||
LOG.warn("Found status batch file on HDFS: " + hdfsFileName);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
String hdfsTempName = getHdfsStatusBatchTempSyncFileName(day);
|
|
||||||
Path hdfsTempPath = new Path(hdfsTempName);
|
|
||||||
if (fs.exists(hdfsTempPath)) {
|
|
||||||
LOG.info("Found existing temporary status batch file on HDFS, removing: " + hdfsTempName);
|
|
||||||
if (!fs.delete(hdfsTempPath, false)) {
|
|
||||||
LOG.error("Failed to delete temporary file: " + hdfsTempName);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fs.copyFromLocalFile(new Path(localFileName), hdfsTempPath);
|
|
||||||
|
|
||||||
if (fs.rename(hdfsTempPath, hdfsFilePath)) {
|
|
||||||
LOG.debug("Renamed " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
LOG.error("Failed to rename " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} catch (IOException ex) {
|
|
||||||
LOG.error("Failed uploading status batch file to HDFS: " + hdfsFileName, ex);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isStatusBatchFlushingEnabled() {
|
|
||||||
return EarlybirdProperty.ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED.get(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isStatusBatchLoadingEnabled() {
|
|
||||||
return EarlybirdConfig.getBool("archive_daily_status_batch_loading_enabled", false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String getVersionFileExtension() {
|
|
||||||
return StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
String getStatusBatchSyncRootDir() {
|
|
||||||
return EarlybirdConfig.getString("archive_daily_status_batch_sync_dir",
|
|
||||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
String getLocalStatusBatchSyncFileName(Date day) {
|
|
||||||
return getStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
|
||||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
String getHdfsStatusBatchSyncRootDir() {
|
|
||||||
return EarlybirdConfig.getString("hdfs_archive_daily_status_batch_sync_dir",
|
|
||||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
String getHdfsStatusBatchSyncFileName(Date day) {
|
|
||||||
return getHdfsStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
|
||||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHdfsStatusBatchTempSyncFileName(Date day) {
|
|
||||||
return getHdfsStatusBatchSyncRootDir() + "/" + DatabaseConfig.getLocalHostname() + "_"
|
|
||||||
+ STATUS_BATCHES_PREFIX + "_" + DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String scrubGenSuffix() {
|
|
||||||
return String.format(SCRUB_GEN_SUFFIX_PATTERN, DATE_FORMAT.format(scrubGenDay));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the path to the directory that stores the statuses for the given day.
|
|
||||||
*/
|
|
||||||
public Path getStatusPathToUseForDay(Date day) {
|
|
||||||
if (!day.before(scrubGenDay)) {
|
|
||||||
return statusPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
String suffix = scrubGenSuffix();
|
|
||||||
Preconditions.checkArgument(!suffix.isEmpty());
|
|
||||||
Path scrubPath = new Path(buildGenPath, suffix);
|
|
||||||
return new Path(scrubPath, STATUS_SUBDIR_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the data for the specified scrub gen was fully built, by checking the number of
|
|
||||||
* days for which data was built against the expected number of days extracted from the specified
|
|
||||||
* scrub gen date.
|
|
||||||
*/
|
|
||||||
public boolean isScrubGenDataFullyBuilt(FileSystem hdfs) throws IOException {
|
|
||||||
initialLoadDailyBatchInfos(hdfs);
|
|
||||||
if (numberOfDaysWithValidScrubGenData == 0) {
|
|
||||||
LOG.warn("numberOfDaysWithValidScrubGenData is 0");
|
|
||||||
}
|
|
||||||
long expectedDays = getDiffBetweenDays(scrubGenDay);
|
|
||||||
return expectedDays == numberOfDaysWithValidScrubGenData;
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
long getDiffBetweenDays(Date day) {
|
|
||||||
long diff = day.getTime() - FIRST_TWEET_DAY.getTime();
|
|
||||||
return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
|
|
||||||
}
|
|
||||||
}
|
|
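The uploadStatusBatchesToHdfs method above publishes the batch summary with a copy-to-temp-then-rename pattern so that readers never observe a partially written file. Below is a minimal, hedged sketch of that pattern against the stock Hadoop FileSystem API; the class name and all paths are made-up placeholders, not values used by this code.

// Hedged sketch: copy a local summary file to a temporary HDFS path, then
// rename it into place. Paths here are illustrative placeholders only.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class AtomicHdfsUploadSketch {
  public static boolean upload(FileSystem fs, Path local, Path finalPath, Path tempPath)
      throws java.io.IOException {
    if (fs.exists(finalPath)) {
      return true; // someone else already published this file
    }
    if (fs.exists(tempPath) && !fs.delete(tempPath, false)) {
      return false; // stale temp file we could not clean up
    }
    fs.copyFromLocalFile(local, tempPath);
    // the rename is the "commit": readers only ever see a complete file
    return fs.rename(tempPath, finalPath);
  }

  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    upload(fs,
        new Path("/tmp/status_batches_local"),          // placeholder local path
        new Path("/user/example/status_batches"),       // placeholder final HDFS path
        new Path("/user/example/_tmp_status_batches")); // placeholder temp HDFS path
  }
}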
Binary file not shown.
@ -1,333 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.config.Config;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.EmptyRecordReader;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.TransformingRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.HdfsUtil;

/**
 * A batch of pre-processed tweets for a single hash partition from a particular day.
 */
public class PartitionedBatch {
  private static final Logger LOG = LoggerFactory.getLogger(PartitionedBatch.class);
  private static final Date START_DATE_INCLUSIVE = DateUtil.toDate(2006, 03, 21);
  private static final String STATUS_COUNT_FILE_PREFIX = "_status_count_";
  private static final Pattern STATUS_COUNT_FILE_PATTERN =
      Pattern.compile(STATUS_COUNT_FILE_PREFIX + "(\\d+)_minid_(\\d+)_maxid_(\\d+)");
  private static final int MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS =
      EarlybirdConfig.getInt("archive_max_out_of_order_tolerance_hours", 12);
  private static final int READER_INIT_IOEXCEPTION_RETRIES = 20;
  private static final PathFilter LZO_DATA_FILES_FILTER = file -> file.getName().endsWith(".lzo");
  private static final PathFilter TXT_DATA_FILES_FILTER = file -> file.getName().endsWith(".txt");

  private static final Comparator<ThriftIndexingEvent> DESC_THRIFT_INDEXING_EVENT_COMPARATOR =
      (o1, o2) -> ComparisonChain.start()
          .compare(o2.getSortId(), o1.getSortId())
          .compare(o2.getUid(), o1.getUid())
          .result();

  // Number of archive tweets skipped because they are too out-of-order.
  private static final SearchCounter OUT_OF_ORDER_STATUSES_SKIPPED =
      SearchCounter.export("out_of_order_archive_statuses_skipped");

  @VisibleForTesting
  protected static final long MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS =
      TimeUnit.HOURS.toMillis(MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS);

  private final Date date;
  private final Path path;
  private int statusCount;
  private long minStatusID;
  private long maxStatusID;
  private final int hashPartitionID;
  private boolean hasStatusCountFile;
  private final int numHashPartitions;

  @VisibleForTesting
  public PartitionedBatch(
      Path path,
      int hashPartitionID,
      int numHashPartitions,
      Date date) {
    this.path = path;
    this.hashPartitionID = hashPartitionID;
    this.numHashPartitions = numHashPartitions;
    this.date = date;
  }

  /**
   * Loads all the information (tweet count, etc.) for this partition and day from HDFS.
   */
  public void load(FileSystem hdfs) throws IOException {
    FileStatus[] dailyBatchFiles = null;
    try {
      // listStatus() javadoc says it throws FileNotFoundException when path does not exist.
      // However, the actual implementations return null or an empty array instead.
      // We handle all 3 cases: null, empty array, or FileNotFoundException.
      dailyBatchFiles = hdfs.listStatus(path);
    } catch (FileNotFoundException e) {
      // don't do anything here and the day will be handled as empty.
    }

    if (dailyBatchFiles != null && dailyBatchFiles.length > 0) {
      for (FileStatus file : dailyBatchFiles) {
        String fileName = file.getPath().getName();
        if (fileName.equals(STATUS_COUNT_FILE_PREFIX)) {
          // zero tweets in this partition - this can happen for early days in 2006
          handleEmptyPartition();
        } else {
          Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName);
          if (matcher.matches()) {
            try {
              statusCount = Integer.parseInt(matcher.group(1));
              // Only adjustMinStatusId in production. For tests, this makes the tests harder to
              // understand.
              minStatusID = Config.environmentIsTest() ? Long.parseLong(matcher.group(2))
                  : adjustMinStatusId(Long.parseLong(matcher.group(2)), date);
              maxStatusID = Long.parseLong(matcher.group(3));
              hasStatusCountFile = true;
            } catch (NumberFormatException e) {
              // invalid file - ignore
              LOG.warn("Could not parse status count file name.", e);
            }
          }
        }
      }
    } else {
      // Partition folder does not exist. This case can happen for early days of twitter
      // where some partitions are empty. Set us to having a status count file; the validity of
      // the parent DailyStatusBatch will still be determined by whether there was a _SUCCESS file
      // in the day root.
      handleEmptyPartition();

      if (date.after(getEarliestDenseDay())) {
        LOG.error("Unexpected empty directory {} for {}", path, date);
      }
    }
  }

  private void handleEmptyPartition() {
    statusCount = 0;
    minStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
    maxStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
    hasStatusCountFile = true;
  }

  /**
   * Sometimes tweets are out-of-order (e.g. a tweet from Sep 2012 got into a
   * batch in July 2013). See SEARCH-1750 for more details.
   * This adjusts the minStatusID if it is badly out-of-order.
   */
  @VisibleForTesting
  protected static long adjustMinStatusId(long minStatusID, Date date) {
    long dateTime = date.getTime();
    // If the daily batch is for a day before we started using snowflake IDs, never adjust.
    if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(dateTime)) {
      return minStatusID;
    }

    long earliestStartTime = dateTime - MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS;
    long minStatusTime = SnowflakeIdParser.getTimestampFromTweetId(minStatusID);
    if (minStatusTime < earliestStartTime) {
      long newMinId = SnowflakeIdParser.generateValidStatusId(earliestStartTime, 0);
      LOG.info("Daily batch for " + date + " has badly out of order tweet: " + minStatusID
          + ". The minStatusID for this batch is adjusted to " + newMinId);
      return newMinId;
    } else {
      return minStatusID;
    }
  }

  /**
   * Returns a reader that reads tweets from the given directory.
   *
   * @param archiveSegment Determines the timeslice ID of all read tweets.
   * @param tweetsPath The path to the directory where the tweets for this day are stored.
   * @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
   */
  public RecordReader<TweetDocument> getTweetReaders(
      ArchiveSegment archiveSegment,
      Path tweetsPath,
      DocumentFactory<ThriftIndexingEvent> documentFactory) throws IOException {
    RecordReader<TweetDocument> tweetDocumentReader =
        new TransformingRecordReader<>(
            createTweetReader(tweetsPath), new Function<ThriftIndexingEvent, TweetDocument>() {
              @Override
              public TweetDocument apply(ThriftIndexingEvent event) {
                return new TweetDocument(
                    event.getSortId(),
                    archiveSegment.getTimeSliceID(),
                    EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
                    documentFactory.newDocument(event)
                );
              }
            });

    tweetDocumentReader.setExhaustStream(true);
    return tweetDocumentReader;
  }

  private RecordReader<ThriftIndexingEvent> createTweetReader(Path tweetsPath) throws IOException {
    if (date.before(START_DATE_INCLUSIVE)) {
      return new EmptyRecordReader<>();
    }

    List<RecordReader<ThriftIndexingEvent>> readers = Lists.newArrayList();
    FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
    try {
      Path dayPath = new Path(tweetsPath, ArchiveHDFSUtils.dateToPath(date, "/"));
      Path partitionPath =
          new Path(dayPath, String.format("p_%d_of_%d", hashPartitionID, numHashPartitions));
      PathFilter pathFilter =
          Config.environmentIsTest() ? TXT_DATA_FILES_FILTER : LZO_DATA_FILES_FILTER;
      FileStatus[] files = hdfs.listStatus(partitionPath, pathFilter);
      for (FileStatus fileStatus : files) {
        String fileStatusPath = fileStatus.getPath().toString().replaceAll("file:/", "/");
        RecordReader<ThriftIndexingEvent> reader = createRecordReaderWithRetries(fileStatusPath);
        readers.add(reader);
      }
    } finally {
      IOUtils.closeQuietly(hdfs);
    }

    if (readers.isEmpty()) {
      return new EmptyRecordReader<>();
    }

    return new MergingSortedRecordReader<>(DESC_THRIFT_INDEXING_EVENT_COMPARATOR, readers);
  }

  private RecordReader<ThriftIndexingEvent> createRecordReaderWithRetries(String filePath)
      throws IOException {
    Predicate<ThriftIndexingEvent> recordFilter = getRecordFilter();
    int numTries = 0;
    while (true) {
      try {
        ++numTries;
        return new LzoThriftBlockFileReader<>(filePath, ThriftIndexingEvent.class, recordFilter);
      } catch (IOException e) {
        if (numTries < READER_INIT_IOEXCEPTION_RETRIES) {
          LOG.warn("Failed to open LzoThriftBlockFileReader for " + filePath + ". Will retry.", e);
        } else {
          LOG.error("Failed to open LzoThriftBlockFileReader for " + filePath
              + " after too many retries.", e);
          throw e;
        }
      }
    }
  }

  private Predicate<ThriftIndexingEvent> getRecordFilter() {
    return Config.environmentIsTest() ? null : input -> {
      if (input == null) {
        return false;
      }
      // We only guard against status IDs that are too small, because it is possible
      // for a very old tweet to get into today's batch, but not possible for a very
      // large ID (a future tweet ID that is not yet published) to get in today's
      // batch, unless tweet ID generation messed up.
      long statusId = input.getSortId();
      boolean keep = statusId >= minStatusID;
      if (!keep) {
        LOG.debug("Out of order documentId: {} minStatusID: {} Date: {} Path: {}",
            statusId, minStatusID, date, path);
        OUT_OF_ORDER_STATUSES_SKIPPED.increment();
      }
      return keep;
    };
  }

  /**
   * Returns the number of statuses in this batch.
   */
  public int getStatusCount() {
    return statusCount;
  }

  /**
   * Whether the _status_count file was found in this folder.
   */
  public boolean hasStatusCount() {
    return hasStatusCountFile;
  }

  public long getMinStatusID() {
    return minStatusID;
  }

  public long getMaxStatusID() {
    return maxStatusID;
  }

  public Date getDate() {
    return date;
  }

  public Path getPath() {
    return path;
  }

  /**
   * Checks whether the partition is empty and whether that is disallowed (an empty partition can
   * only happen before 2010). An empty partition means that the directory was missing when the
   * scan happened.
   *
   * @return true if the partition has no documents and that is not allowed for its date.
   */
  public boolean isDisallowedEmptyPartition() {
    return hasStatusCountFile
        && statusCount == 0
        && minStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
        && maxStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
        && date.after(getEarliestDenseDay());
  }

  @Override
  public String toString() {
    return "PartitionedBatch[hashPartitionId=" + hashPartitionID
        + ",numHashPartitions=" + numHashPartitions
        + ",date=" + date
        + ",path=" + path
        + ",hasStatusCountFile=" + hasStatusCountFile
        + ",statusCount=" + statusCount + "]";
  }

  private Date getEarliestDenseDay() {
    return EarlybirdConfig.getDate("archive_search_earliest_dense_day");
  }
}
@ -1,64 +0,0 @@
java_library(
    name = "segment_builder_lib",
    sources = ["**/*.java"],
    platform = "java8",
    tags = [
        "bazel-compatible",
        "bazel-only",
    ],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "decider/src/main/scala",
        "finatra/inject/inject-core/src/main/scala",
        "finatra/inject/inject-server/src/main/scala/com/twitter/inject/server",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/quantity",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/common_internal/text/version",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/database",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/base",
        "src/java/com/twitter/search/common/partitioning/zookeeper",
        "src/java/com/twitter/search/common/schema",
        "src/java/com/twitter/search/common/schema/base",
        "src/java/com/twitter/search/common/util:closeresourceutil",
        "src/java/com/twitter/search/common/util:gcutil",
        "src/java/com/twitter/search/common/util:kerberos",
        "src/java/com/twitter/search/common/util/date",
        "src/java/com/twitter/search/common/util/io:flushable",
        "src/java/com/twitter/search/common/util/zktrylock",
        "src/java/com/twitter/search/common/util/zookeeper",
        "src/java/com/twitter/search/earlybird:earlybird-lib",
        "src/java/com/twitter/search/earlybird/common",
        "src/java/com/twitter/search/earlybird/common/config",
        "src/java/com/twitter/search/earlybird/common/userupdates",
        "util/util-core:scala",
    ],
)

# Using hadoop_binary target can automatically exclude hadoop related jars in the built jar
# and load in the right jars based on hadoop config.
hadoop_binary(
    name = "segment_builder_binary",
    basename = "segment_builder",
    main = "com.twitter.search.earlybird.archive.segmentbuilder.SegmentBuilderMain",
    platform = "java8",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible",
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [
        ":segment_builder_lib",
        "src/java/com/twitter/search/common/logging:search-log4j",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,29 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class BuiltAndFinalizedSegment extends SegmentBuilderSegment {
  public BuiltAndFinalizedSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  @Override
  public SegmentBuilderSegment handle() throws SegmentInfoConstructionException,
      SegmentUpdaterException {

    throw new IllegalStateException("Should not handle a BuiltAndFinalizedSegment.");
  }

  @Override
  public boolean isBuilt() {
    return true;
  }
}
Binary file not shown.
@ -1,101 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.base.Stopwatch;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.util.GCUtil;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveSegmentUpdater;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class NotYetBuiltSegment extends SegmentBuilderSegment {
  private static final Logger LOG = LoggerFactory.getLogger(NotYetBuiltSegment.class);

  public NotYetBuiltSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  /**
   * 1. Grab the ZK lock for this segment.
   * 2a. if the lock fails, another host is updating; return the SOMEONE_ELSE_IS_BUILDING state.
   * 2b. if the lock succeeds, check again if the updated segment exists on HDFS.
   * 3a. if so, just move on.
   * 3b. if not, update the segment.
   * In both cases, we need to check if the segment can now be marked as BUILT_AND_FINALIZED.
   */
  @Override
  public SegmentBuilderSegment handle()
      throws SegmentUpdaterException, SegmentInfoConstructionException {
    LOG.info("Handling a not yet built segment: {}", this.getSegmentName());
    Stopwatch stopwatch = Stopwatch.createStarted();
    TryLock lock = getZooKeeperTryLock();

    // The tryWithLock closure can only access variables from the enclosing scope that are final.
    // However, we would like to pass the update result back out, so here we use an
    // AtomicBoolean reference instead of a Boolean.
    final AtomicBoolean successRef = new AtomicBoolean(false);
    boolean gotLock = lock.tryWithLock(() -> {
      ArchiveSegmentUpdater updater = new ArchiveSegmentUpdater(
          segmentConfig.getTryLockFactory(),
          sync,
          segmentConfig.getEarlybirdIndexConfig(),
          Clock.SYSTEM_CLOCK);

      boolean success = updater.updateSegment(segmentInfo);
      successRef.set(success);
    });

    if (!gotLock) {
      LOG.info("cannot acquire zookeeper lock for: " + segmentInfo);
      return new SomeoneElseIsBuildingSegment(
          segmentInfo,
          segmentConfig,
          earlybirdSegmentFactory,
          alreadyRetriedCount,
          sync);
    }

    // 1. we want to make sure the heap is clean right after building a segment so that it's ready
    //    for us to start allocations for a new segment
    //    — I think we've had cases where we were seeing OOM's while building
    // 2. the thing that I think it helps with is compaction (vs just organically running CMS)
    //    — which would clean up the heap, but may leave it in a fragmented state
    //    — and running a Full GC is supposed to compact the remaining tenured space.
    GCUtil.runGC();

    if (successRef.get()) {
      LOG.info("Indexing segment {} took {}", segmentInfo, stopwatch);
      LOG.info("Finished building {}", segmentInfo.getSegment().getSegmentName());
      return new BuiltAndFinalizedSegment(
          segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
    } else {
      int alreadyTried = alreadyRetriedCount + 1;
      String errMsg = "failed updating segment for: " + segmentInfo
          + " for " + alreadyTried + " times";
      LOG.error(errMsg);
      if (alreadyTried < segmentConfig.getMaxRetriesOnFailure()) {
        return new NotYetBuiltSegment(
            createNewSegmentInfo(segmentInfo),
            segmentConfig,
            earlybirdSegmentFactory,
            alreadyTried,
            sync);
      } else {
        throw new SegmentUpdaterException(errMsg);
      }
    }
  }
}
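handle() above smuggles the result of updateSegment() out of the tryWithLock lambda through an AtomicBoolean, because a lambda can only read effectively final locals. The following is a small, self-contained illustration of that capture pattern; runCritical is a hypothetical stand-in for TryLock.tryWithLock, not part of the real API.

// Hedged sketch of the AtomicBoolean capture pattern used in handle() above.
import java.util.concurrent.atomic.AtomicBoolean;

public final class LambdaResultCaptureSketch {
  // Made-up stand-in for TryLock.tryWithLock: runs the action and reports
  // whether the "lock" was acquired (always true in this sketch).
  static boolean runCritical(Runnable action) {
    action.run();
    return true;
  }

  public static void main(String[] args) {
    final AtomicBoolean successRef = new AtomicBoolean(false);
    boolean gotLock = runCritical(() -> {
      boolean success = doWork();   // result computed inside the lambda
      successRef.set(success);      // passed back out via the AtomicBoolean
    });
    System.out.println("gotLock=" + gotLock + " success=" + successRef.get());
  }

  private static boolean doWork() {
    return true; // placeholder for ArchiveSegmentUpdater.updateSegment(...)
  }
}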
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.HashMap;
import java.util.Map;

import com.twitter.common.util.Clock;

/**
 * A class that prevents handling a given segment more than once every hdfsCheckIntervalMillis.
 */
public class RateLimitingSegmentHandler {
  private final long hdfsCheckIntervalMillis;
  private final Clock clock;
  private final Map<String, Long> segmentNameToLastUpdatedTimeMillis = new HashMap<>();

  RateLimitingSegmentHandler(long hdfsCheckIntervalMillis, Clock clock) {
    this.hdfsCheckIntervalMillis = hdfsCheckIntervalMillis;
    this.clock = clock;
  }

  SegmentBuilderSegment processSegment(SegmentBuilderSegment segment)
      throws SegmentUpdaterException, SegmentInfoConstructionException {

    String segmentName = segment.getSegmentName();

    Long lastUpdatedMillis = segmentNameToLastUpdatedTimeMillis.get(segmentName);
    if (lastUpdatedMillis == null) {
      lastUpdatedMillis = 0L;
    }

    long nowMillis = clock.nowMillis();
    if (nowMillis - lastUpdatedMillis < hdfsCheckIntervalMillis) {
      return segment;
    }
    segmentNameToLastUpdatedTimeMillis.put(segmentName, nowMillis);

    return segment.handle();
  }
}
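For reference, a compact sketch of the per-key interval gating that RateLimitingSegmentHandler implements above: calls for the same key inside the interval are skipped, and the first call after the interval goes through. This is an assumption-level illustration that uses plain long timestamps rather than the real Clock class.

// Hedged sketch of per-key interval gating, mirroring processSegment() above.
import java.util.HashMap;
import java.util.Map;

public final class IntervalGateSketch {
  private final long intervalMillis;
  private final Map<String, Long> lastRunMillis = new HashMap<>();

  IntervalGateSketch(long intervalMillis) {
    this.intervalMillis = intervalMillis;
  }

  /** Returns true if the caller should do the expensive work for this key now. */
  boolean shouldRun(String key, long nowMillis) {
    long last = lastRunMillis.getOrDefault(key, 0L);
    if (nowMillis - last < intervalMillis) {
      return false; // checked too recently, skip
    }
    lastRunMillis.put(key, nowMillis);
    return true;
  }

  public static void main(String[] args) {
    IntervalGateSketch gate = new IntervalGateSketch(10_000);
    System.out.println(gate.shouldRun("segment_a", 0));      // true
    System.out.println(gate.shouldRun("segment_a", 5_000));  // false, inside interval
    System.out.println(gate.shouldRun("segment_a", 12_000)); // true again
  }
}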
Binary file not shown.
@ -1,540 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;
import com.google.inject.Inject;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.partitioning.zookeeper.SearchZkClient;
import com.twitter.search.common.util.Kerberos;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.archive.ArchiveTimeSlicer;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;

/**
 * This class provides the core logic to build segment indices offline.
 * For each server, it coordinates via zookeeper to pick the next segment, build the indices for it
 * and upload them to HDFS. A state machine is used to handle the build state transitions. There
 * are three states:
 *   NOT_BUILD_YET: a segment that needs to be built
 *   SOMEONE_ELSE_IS_BUILDING: another server is building the segment.
 *   BUILT_AND_FINALIZED: the indices of this segment have already been built.
 */
public class SegmentBuilder {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilder.class);

  private final boolean onlyRunOnce;
  private final int waitBetweenLoopsMins;
  private final int startUpBatchSize;
  private final int instance;
  private final int waitBetweenSegmentsSecs;
  private final int waitBeforeQuitMins;

  // When multiple segment builders start simultaneously, they might overwhelm the HDFS name node
  // and zookeeper. So, we let some instances sleep for a while before they start to avoid these
  // issues.
  private final long startUpSleepMins;

  // If there are no more segments to build, wait this interval before checking again.
  private final long processWaitingInterval = TimeUnit.MINUTES.toMillis(10);

  // The hash partitions for which segments will be built.
  private final ImmutableList<Integer> hashPartitions;

  private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
  private final SearchIndexingMetricSet searchIndexingMetricSet =
      new SearchIndexingMetricSet(statsReceiver);
  private final EarlybirdSearcherStats searcherStats =
      new EarlybirdSearcherStats(statsReceiver);

  private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final RateLimitingSegmentHandler segmentHandler;
  private final Clock clock;
  private final int numSegmentBuilderPartitions;
  private final int myPartitionId;
  private final SegmentConfig segmentConfig;
  private final EarlybirdSegmentFactory segmentFactory;
  private final SegmentBuilderCoordinator segmentBuilderCoordinator;
  private final SegmentSyncConfig segmentSyncConfig;
  private final Random random = new Random();

  private static final double SLEEP_RANDOMIZATION_RATIO = .2;

  // Stats
  // The flush version used to build segments
  private static final SearchLongGauge CURRENT_FLUSH_VERSION =
      SearchLongGauge.export("current_flush_version");

  // Accumulated number and time in seconds spent on building segments locally
  private static SearchCounter segmentsBuiltLocally =
      SearchCounter.export("segments_built_locally");
  private static SearchCounter timeSpentOnSuccessfulBuildSecs =
      SearchCounter.export("time_spent_on_successful_build_secs");

  // The total number of segments to be built
  private static final SearchLongGauge SEGMENTS_TO_BUILD =
      SearchLongGauge.export("segments_to_build");

  // How many segments failed locally
  private static final SearchCounter FAILED_SEGMENTS =
      SearchCounter.export("failed_segments");

  @Inject
  protected SegmentBuilder(@Flag("onlyRunOnce") boolean onlyRunOnceFlag,
                           @Flag("waitBetweenLoopsMins") int waitBetweenLoopsMinsFlag,
                           @Flag("startup_batch_size") int startUpBatchSizeFlag,
                           @Flag("instance") int instanceFlag,
                           @Flag("segmentZkLockExpirationHours")
                               int segmentZkLockExpirationHoursFlag,
                           @Flag("startupSleepMins") long startupSleepMinsFlag,
                           @Flag("maxRetriesOnFailure") int maxRetriesOnFailureFlag,
                           @Flag("hash_partitions") List<Integer> hashPartitionsFlag,
                           @Flag("numSegmentBuilderPartitions") int numSegmentBuilderPartitionsFlag,
                           @Flag("waitBetweenSegmentsSecs") int waitBetweenSegmentsSecsFlag,
                           @Flag("waitBeforeQuitMins") int waitBeforeQuitMinsFlag,
                           @Flag("scrubGen") String scrubGen,
                           Decider decider) {
    this(onlyRunOnceFlag,
        waitBetweenLoopsMinsFlag,
        startUpBatchSizeFlag,
        instanceFlag,
        segmentZkLockExpirationHoursFlag,
        startupSleepMinsFlag,
        hashPartitionsFlag,
        maxRetriesOnFailureFlag,
        waitBetweenSegmentsSecsFlag,
        waitBeforeQuitMinsFlag,
        SearchZkClient.getSZooKeeperClient().createZooKeeperTryLockFactory(),
        new RateLimitingSegmentHandler(TimeUnit.MINUTES.toMillis(10), Clock.SYSTEM_CLOCK),
        Clock.SYSTEM_CLOCK,
        numSegmentBuilderPartitionsFlag,
        decider,
        getSyncConfig(scrubGen));
  }

  @VisibleForTesting
  protected SegmentBuilder(boolean onlyRunOnceFlag,
                           int waitBetweenLoopsMinsFlag,
                           int startUpBatchSizeFlag,
                           int instanceFlag,
                           int segmentZkLockExpirationHoursFlag,
                           long startupSleepMinsFlag,
                           List<Integer> hashPartitions,
                           int maxRetriesOnFailure,
                           int waitBetweenSegmentsSecsFlag,
                           int waitBeforeQuitMinsFlag,
                           ZooKeeperTryLockFactory zooKeeperTryLockFactory,
                           RateLimitingSegmentHandler segmentHandler,
                           Clock clock,
                           int numSegmentBuilderPartitions,
                           Decider decider,
                           SegmentSyncConfig syncConfig) {
    LOG.info("Creating SegmentBuilder");
    LOG.info("Penguin version in use: " + EarlybirdConfig.getPenguinVersion());

    // Set command line flag values
    this.onlyRunOnce = onlyRunOnceFlag;
    this.waitBetweenLoopsMins = waitBetweenLoopsMinsFlag;
    this.startUpBatchSize = startUpBatchSizeFlag;
    this.instance = instanceFlag;
    this.waitBetweenSegmentsSecs = waitBetweenSegmentsSecsFlag;
    this.waitBeforeQuitMins = waitBeforeQuitMinsFlag;

    this.segmentHandler = segmentHandler;
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.segmentSyncConfig = syncConfig;
    this.startUpSleepMins = startupSleepMinsFlag;

    if (!hashPartitions.isEmpty()) {
      this.hashPartitions = ImmutableList.copyOf(hashPartitions);
    } else {
      this.hashPartitions = null;
    }

    Amount<Long, Time> segmentZKLockExpirationTime = Amount.of((long)
        segmentZkLockExpirationHoursFlag, Time.HOURS);

    this.earlybirdIndexConfig =
        new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet,
            new CriticalExceptionHandler());

    this.segmentConfig = new SegmentConfig(
        earlybirdIndexConfig,
        segmentZKLockExpirationTime,
        maxRetriesOnFailure,
        zkTryLockFactory);
    this.segmentFactory = new EarlybirdSegmentFactory(
        earlybirdIndexConfig,
        searchIndexingMetricSet,
        searcherStats,
        clock);
    this.segmentBuilderCoordinator = new SegmentBuilderCoordinator(
        zkTryLockFactory, syncConfig, clock);

    this.clock = clock;

    this.numSegmentBuilderPartitions = numSegmentBuilderPartitions;
    this.myPartitionId = instance % numSegmentBuilderPartitions;
    SearchLongGauge.export("segment_builder_partition_id_" + myPartitionId).set(1);

    CURRENT_FLUSH_VERSION.set(earlybirdIndexConfig.getSchema().getMajorVersionNumber());
  }

  void run() {
    LOG.info("Config values: {}", EarlybirdConfig.allValuesAsString());

    // Sleep some time uninterruptibly before getting started so that if multiple instances are
    // running, the HDFS name node and zookeeper won't be overwhelmed.
    // Say we have 100 instances (the instance arg will have a value from 0 - 99), our
    // STARTUP_BATCH_SIZE_ARG is 20 and startUpSleepMins is 3 mins. Then the first 20 instances
    // will not sleep, but start immediately. Instances 20 - 39 will sleep 3 mins and then
    // start to run, instances 40 - 59 will sleep 6 mins, instances 60 - 79 will
    // sleep 9 mins and then start to run, and so forth.
    long sleepTime = instance / startUpBatchSize * startUpSleepMins;
    LOG.info("Instance={}, Start up batch size={}", instance, startUpBatchSize);
    LOG.info("Sleeping {} minutes to avoid overwhelming the HDFS name node and ZooKeeper.",
        sleepTime);
    Uninterruptibles.sleepUninterruptibly(sleepTime, TimeUnit.MINUTES);

    // Kinit here.
    Kerberos.kinit(
        EarlybirdConfig.getString("kerberos_user", ""),
        EarlybirdConfig.getString("kerberos_keytab_path", "")
    );

    long waitBetweenLoopsMs = TimeUnit.MINUTES.toMillis(waitBetweenLoopsMins);
    if (onlyRunOnce) {
      LOG.info("This segment builder will run the full rebuild of all the segments");
    } else {
      LOG.info("This segment builder will incrementally check for new data and rebuild "
          + "current segments as needed.");
      LOG.info("The waiting interval between two new data checks is: "
          + waitBetweenLoopsMs + " ms.");
    }

    boolean scrubGenPresent = segmentSyncConfig.getScrubGen().isPresent();
    LOG.info("Scrub gen present: {}", scrubGenPresent);
    boolean scrubGenDataFullyBuilt = segmentBuilderCoordinator.isScrubGenDataFullyBuilt(instance);
    LOG.info("Scrub gen data fully built: {}", scrubGenDataFullyBuilt);

    if (!scrubGenPresent || scrubGenDataFullyBuilt) {
      LOG.info("Starting segment building loop...");
      while (!Thread.currentThread().isInterrupted()) {
        try {
          indexingLoop();
          if (onlyRunOnce) {
            LOG.info("only run once is true, breaking");
            break;
          }
          clock.waitFor(waitBetweenLoopsMs);
        } catch (InterruptedException e) {
          LOG.info("Interrupted, quitting segment builder");
          Thread.currentThread().interrupt();
        } catch (SegmentInfoConstructionException e) {
          LOG.error("Error creating new segmentInfo, quitting segment builder: ", e);
          break;
        } catch (SegmentUpdaterException e) {
          FAILED_SEGMENTS.increment();
          // Before the segment builder quits, sleep for WAIT_BEFORE_QUIT_MINS minutes so that the
          // FAILED_SEGMENTS stat can be exported.
          try {
            clock.waitFor(TimeUnit.MINUTES.toMillis(waitBeforeQuitMins));
          } catch (InterruptedException ex) {
            LOG.info("Interrupted, quitting segment builder");
            Thread.currentThread().interrupt();
          }
          LOG.error("SegmentUpdater processing segment error, quitting segment builder: ", e);
          break;
        }
      }
    } else {
      LOG.info("Cannot build the segments for scrub gen yet.");
    }
  }

  // The run loop is factored out here for unit testing.
  @VisibleForTesting
  void indexingLoop()
      throws SegmentInfoConstructionException, InterruptedException, SegmentUpdaterException {
    // This map contains all the segments to be processed; if a segment is built, it will be
    // removed from the map.
    Map<String, SegmentBuilderSegment> buildableSegmentInfoMap;
    try {
      buildableSegmentInfoMap = createSegmentInfoMap();
      printSegmentInfoMap(buildableSegmentInfoMap);
    } catch (IOException e) {
      LOG.error("Error creating segmentInfoMap: ", e);
      return;
    }

    while (!buildableSegmentInfoMap.isEmpty()) {
      boolean hasBuiltSegment = processSegments(buildableSegmentInfoMap);

      if (!hasBuiltSegment) {
        // If we did not build a segment, wait before checking again. If we successfully built
        // one, there is no need to sleep, since building a segment already takes a long time.
        clock.waitFor(processWaitingInterval);
      }
    }
  }

  // Actual shutdown.
  protected void doShutdown() {
    LOG.info("doShutdown()...");
    try {
      earlybirdIndexConfig.getResourceCloser().shutdownExecutor();
    } catch (InterruptedException e) {
      LOG.error("Interrupted during shutdown. ", e);
    }

    LOG.info("Segment builder stopped!");
  }

  private List<ArchiveTimeSlicer.ArchiveTimeSlice> createTimeSlices() throws IOException {
    Preconditions.checkState(segmentSyncConfig.getScrubGen().isPresent());
    Date scrubGen = ScrubGenUtil.parseScrubGenToDate(segmentSyncConfig.getScrubGen().get());

    final DailyStatusBatches dailyStatusBatches =
        new DailyStatusBatches(zkTryLockFactory, scrubGen);
    final ArchiveTimeSlicer archiveTimeSlicer = new ArchiveTimeSlicer(
        EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, earlybirdIndexConfig);

    Stopwatch stopwatch = Stopwatch.createStarted();
    List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = archiveTimeSlicer.getTimeSlices();

    if (timeSlices == null) {
      LOG.error("Failed to load timeslice map after {}", stopwatch);
      return Collections.emptyList();
    }

    LOG.info("Took {} to get timeslices", stopwatch);
    return timeSlices;
  }

  private static class TimeSliceAndHashPartition implements Comparable<TimeSliceAndHashPartition> {
    public final ArchiveTimeSlicer.ArchiveTimeSlice timeSlice;
    public final Integer hashPartition;

    public TimeSliceAndHashPartition(
        ArchiveTimeSlicer.ArchiveTimeSlice timeSlice,
        Integer hashPartition) {
      this.timeSlice = timeSlice;
      this.hashPartition = hashPartition;
    }

    @Override
    public int compareTo(TimeSliceAndHashPartition o) {
      Integer myHashPartition = this.hashPartition;
      Integer otherHashPartition = o.hashPartition;

      long myTimeSliceId = this.timeSlice.getMinStatusID(myHashPartition);
      long otherTimeSliceId = o.timeSlice.getMinStatusID(otherHashPartition);

      return ComparisonChain.start()
          .compare(myHashPartition, otherHashPartition)
          .compare(myTimeSliceId, otherTimeSliceId)
          .result();
    }
  }

  /**
   * For all the timeslices, create the corresponding SegmentInfo and store it in a map.
   */
  @VisibleForTesting
  Map<String, SegmentBuilderSegment> createSegmentInfoMap() throws IOException {
    final List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = createTimeSlices();

    List<TimeSliceAndHashPartition> timeSlicePairs = createPairs(timeSlices);
    // Export how many segments should be built
    SEGMENTS_TO_BUILD.set(timeSlicePairs.size());
    LOG.info("Total number of segments to be built across all segment builders: {}",
        timeSlicePairs.size());

    List<TimeSliceAndHashPartition> mySegments = getSegmentsForMyPartition(timeSlicePairs);

    Map<String, SegmentBuilderSegment> segmentInfoMap = new HashMap<>();
    for (TimeSliceAndHashPartition mySegment : mySegments) {
      ArchiveSegment segment = new ArchiveSegment(mySegment.timeSlice, mySegment.hashPartition,
          EarlybirdConfig.getMaxSegmentSize());
      SegmentInfo segmentInfo = new SegmentInfo(segment, segmentFactory, segmentSyncConfig);

      segmentInfoMap.put(segmentInfo.getSegment().getSegmentName(), new NotYetBuiltSegment(
          segmentInfo, segmentConfig, segmentFactory, 0, segmentSyncConfig));
    }

    return segmentInfoMap;
  }

  private List<TimeSliceAndHashPartition> createPairs(
      List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices) {

    List<TimeSliceAndHashPartition> timeSlicePairs = new ArrayList<>();

    for (ArchiveTimeSlicer.ArchiveTimeSlice slice : timeSlices) {
      List<Integer> localPartitions = hashPartitions;
      if (localPartitions == null) {
        localPartitions = range(slice.getNumHashPartitions());
      }

      for (Integer partition : localPartitions) {
        timeSlicePairs.add(new TimeSliceAndHashPartition(slice, partition));
      }
    }
    return timeSlicePairs;
  }

  private List<TimeSliceAndHashPartition> getSegmentsForMyPartition(
      List<TimeSliceAndHashPartition> timeSlicePairs) {

    Collections.sort(timeSlicePairs);

    List<TimeSliceAndHashPartition> myTimeSlices = new ArrayList<>();
    for (int i = myPartitionId; i < timeSlicePairs.size(); i += numSegmentBuilderPartitions) {
      myTimeSlices.add(timeSlicePairs.get(i));
    }

    LOG.info("Getting segments to be built for partition: {}", myPartitionId);
    LOG.info("Total number of partitions: {}", numSegmentBuilderPartitions);
    LOG.info("Number of segments picked: {}", myTimeSlices.size());
    return myTimeSlices;
  }

  /**
   * Print out the segmentInfo map for debugging.
   */
  private void printSegmentInfoMap(Map<String, SegmentBuilderSegment> segmentInfoMap) {
    LOG.info("SegmentInfoMap: ");
    for (Map.Entry<String, SegmentBuilderSegment> entry : segmentInfoMap.entrySet()) {
      LOG.info(entry.getValue().toString());
    }
    LOG.info("Total SegmentInfoMap size: " + segmentInfoMap.size() + ". done.");
  }

  /**
   * Build indices or refresh state for the segments in the specified segmentInfoMap, which only
   * contains the segments that need to be built or are building. When a segment has not been
   * built, it is built here. If built successfully, it will be removed from the map; otherwise,
   * its state will be updated in the map.
   *
   * Returns true iff this process has built a segment.
   */
  @VisibleForTesting
  boolean processSegments(Map<String, SegmentBuilderSegment> segmentInfoMap)
      throws SegmentInfoConstructionException, SegmentUpdaterException, InterruptedException {

    boolean hasBuiltSegment = false;

    Iterator<Map.Entry<String, SegmentBuilderSegment>> iter =
        segmentInfoMap.entrySet().iterator();
    while (iter.hasNext()) {
      Map.Entry<String, SegmentBuilderSegment> entry = iter.next();
      SegmentBuilderSegment originalSegment = entry.getValue();

      LOG.info("About to process segment: {}", originalSegment.getSegmentName());
      long startMillis = System.currentTimeMillis();
      SegmentBuilderSegment updatedSegment = segmentHandler.processSegment(originalSegment);

      if (updatedSegment.isBuilt()) {
        iter.remove();
        hasBuiltSegment = true;

        if (originalSegment instanceof NotYetBuiltSegment) {
          // Record the total time spent on successfully building a segment, used to compute the
          // average segment building time.
          long timeSpent = System.currentTimeMillis() - startMillis;
          segmentsBuiltLocally.increment();
          timeSpentOnSuccessfulBuildSecs.add(timeSpent / 1000);
        }
      } else {
        entry.setValue(updatedSegment);
      }

      clock.waitFor(getSegmentSleepTime());
    }

    return hasBuiltSegment;
  }

  private long getSegmentSleepTime() {
    // The Hadoop name node can handle only about 200 requests/sec before it gets overloaded.
    // Updating the state of a node that has been built takes about 1 second. In the worst case
    // scenario with 800 segment builders, we end up with about 800 requests/sec. Adding a 10
    // second sleep lowers the worst case to about 80 requests/sec.

    long sleepMillis = TimeUnit.SECONDS.toMillis(waitBetweenSegmentsSecs);

    // Use randomization so that we can't get all segment builders hitting it at the exact same time

    int lowerSleepBoundMillis = (int) (sleepMillis * (1.0 - SLEEP_RANDOMIZATION_RATIO));
|
|
||||||
int upperSleepBoundMillis = (int) (sleepMillis * (1.0 + SLEEP_RANDOMIZATION_RATIO));
|
|
||||||
return randRange(lowerSleepBoundMillis, upperSleepBoundMillis);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a pseudo-random number between min and max, inclusive.
|
|
||||||
*/
|
|
||||||
private int randRange(int min, int max) {
|
|
||||||
return random.nextInt((max - min) + 1) + min;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns list of integers 0, 1, 2, ..., count-1.
|
|
||||||
*/
|
|
||||||
private static List<Integer> range(int count) {
|
|
||||||
List<Integer> nums = new ArrayList<>(count);
|
|
||||||
|
|
||||||
for (int i = 0; i < count; i++) {
|
|
||||||
nums.add(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return nums;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static SegmentSyncConfig getSyncConfig(String scrubGen) {
|
|
||||||
if (scrubGen == null || scrubGen.isEmpty()) {
|
|
||||||
throw new RuntimeException(
|
|
||||||
"Scrub gen expected, but could not get it from the arguments.");
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG.info("Scrub gen: " + scrubGen);
|
|
||||||
return new SegmentSyncConfig(Optional.of(scrubGen));
|
|
||||||
}
|
|
||||||
}
|
|
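The stride-based selection in getSegmentsForMyPartition above is what spreads work across builder instances: after the global sort, instance k takes items k, k+N, k+2N, ... for N builder partitions, so any two instances differ by at most one segment. A minimal standalone sketch of the same idea, with hypothetical item counts and instance ids that are not part of the original class:

import java.util.ArrayList;
import java.util.List;

class StridePartitionExample {
  // Returns the sublist of work items assigned to builder `instanceId` out of `numInstances`.
  static <T> List<T> itemsForInstance(List<T> sortedItems, int instanceId, int numInstances) {
    List<T> mine = new ArrayList<>();
    for (int i = instanceId; i < sortedItems.size(); i += numInstances) {
      mine.add(sortedItems.get(i));
    }
    return mine;
  }

  public static void main(String[] args) {
    List<Integer> items = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      items.add(i);
    }
    // With 10 items and 4 instances, instance 2 gets items 2 and 6.
    System.out.println(itemsForInstance(items, 2, 4)); // prints [2, 6]
  }
}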
Binary file not shown.
@ -1,109 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.Collection;

import com.google.common.collect.ImmutableList;
import com.google.inject.Module;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.app.Flaggable;
import com.twitter.inject.server.AbstractTwitterServer;
import com.twitter.util.Future;
import com.twitter.util.Time;

public class SegmentBuilderApp extends AbstractTwitterServer {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderApp.class);

  public SegmentBuilderApp() {
    createFlag("onlyRunOnce",
        true,
        "whether to stop segment builder after one loop",
        Flaggable.ofBoolean());

    createFlag("waitBetweenLoopsMins",
        60,
        "how many minutes to wait between building loops",
        Flaggable.ofInt());

    createFlag("startup_batch_size",
        30,
        "How many instances can start and read timeslice info from HDFS at the same time. "
            + "If you don't know what this parameter is, please do not change this parameter.",
        Flaggable.ofInt());

    createFlag("instance",
        20,
        "the job instance number",
        Flaggable.ofInt());

    createFlag("segmentZkLockExpirationHours",
        0,
        "max hours to hold the zookeeper lock while building segment",
        Flaggable.ofInt());

    createFlag("startupSleepMins",
        2L,
        "sleep multiplier of startupSleepMins before job runs",
        Flaggable.ofLong());

    createFlag("maxRetriesOnFailure",
        3,
        "how many times we should try to rebuild a segment when failure happens",
        Flaggable.ofInt());

    createFlag("hash_partitions",
        ImmutableList.of(),
        "comma separated hash partition ids, e.g., 0,1,3,4. "
            + "If not specified, all the partitions will be built.",
        Flaggable.ofJavaList(Flaggable.ofInt()));

    createFlag("numSegmentBuilderPartitions",
        100,
        "Number of partitions for dividing up all segment builder work",
        Flaggable.ofInt());

    createFlag("waitBetweenSegmentsSecs",
        10,
        "Time to sleep between processing segments.",
        Flaggable.ofInt());

    createFlag("waitBeforeQuitMins",
        2,
        "How many minutes to sleep before quitting.",
        Flaggable.ofInt());

    createFlag("scrubGen",
        "",
        "Scrub gen for which segment builders should be run.",
        Flaggable.ofString());
  }

  @Override
  public void start() {
    SegmentBuilder segmentBuilder = injector().instance(SegmentBuilder.class);
    closeOnExit((Time time) -> {
      segmentBuilder.doShutdown();
      return Future.Unit();
    });

    LOG.info("Starting run()");
    segmentBuilder.run();
    LOG.info("run() complete");

    // Now shutdown
    shutdown();
  }

  protected void shutdown() {
    LOG.info("Calling close() to initiate shutdown");
    close();
  }

  @Override
  public Collection<Module> javaModules() {
    return ImmutableList.of(new SegmentBuilderModule());
  }
}
Binary file not shown.
@ -1,200 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;
import java.util.Date;
import java.util.Optional;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.util.Duration;

/**
 * Coordinates between segment builders for the scrubbing pipeline.
 * When the segment builders are running, all of them will try to find an HDFS file indicating
 * whether the data is ready. If the file does not exist, only one of them will go through the
 * files and see if the scrubbing pipeline has generated all data for this scrub gen.
 *
 * If the instance that got the lock found all the data, it still exits, because otherwise we
 * would have one single segmentbuilder instance trying to build all segments, which is not what
 * we want. But since the file now exists, the next time all segmentbuilder instances are
 * scheduled, they will all find the file and will start building segments.
 */
class SegmentBuilderCoordinator {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderCoordinator.class);

  private static final Amount<Long, Time> ZK_LOCK_EXPIRATION_MIN = Amount.of(5L, Time.MINUTES);
  private static final String SEGMENT_BUILDER_SYNC_NODE = "scrub_gen_data_sync";
  private static final String SEGMENT_BUILDER_SYNC_ZK_PATH =
      EarlybirdProperty.ZK_APP_ROOT.get() + "/segment_builder_sync";
  private static final String DATA_FULLY_BUILT_FILE = "_data_fully_built";
  static final int FIRST_INSTANCE = 0;

  private static final long NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS =
      Duration.fromHours(1).inMillis();

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final SegmentSyncConfig syncConfig;
  private final Optional<Date> scrubGenDayOpt;
  private final Optional<String> scrubGenOpt;
  private final Clock clock;

  SegmentBuilderCoordinator(
      ZooKeeperTryLockFactory zkTryLockFactory, SegmentSyncConfig syncConfig, Clock clock) {
    this.zkTryLockFactory = zkTryLockFactory;
    this.syncConfig = syncConfig;
    this.scrubGenOpt = syncConfig.getScrubGen();
    this.scrubGenDayOpt = scrubGenOpt.map(ScrubGenUtil::parseScrubGenToDate);
    this.clock = clock;
  }

  public boolean isScrubGenDataFullyBuilt(int instanceNumber) {
    // Only segment builders that take a scrub gen should use isPartitioningOutputReady to
    // coordinate.
    Preconditions.checkArgument(scrubGenDayOpt.isPresent());

    final FileSystem hdfs;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
    } catch (IOException e) {
      LOG.error("Could not create HDFS file system.", e);
      return false;
    }

    return isScrubGenDataFullyBuilt(
        instanceNumber,
        scrubGenDayOpt.get(),
        NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS,
        hdfs
    );
  }

  @VisibleForTesting
  boolean isScrubGenDataFullyBuilt(
      int instanceNumber,
      Date scrubGenDay,
      long nonFirstInstanceSleepBeforeRetryDuration,
      FileSystem hdfs) {
    // Check if the "scrub gen has been fully built" file exists.
    if (checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
      return true;
    }

    // If it doesn't exist, let the first instance see if the scrub gen has been fully built and
    // create the file.
    if (instanceNumber == FIRST_INSTANCE) {
      // We were missing some data on HDFS for this scrub gen in the previous run,
      // but we might've gotten more data in the meantime, so check again.
      // Only allow instance 0 to do this, mainly for 2 reasons:
      // 1) Since instances are scheduled in batches, it's possible that an instance from a later
      //    batch finds the fully built file in HDFS and starts processing. We would end up doing
      //    work with only some of the instances.
      // 2) If we sleep before we release the lock, it's hard to estimate how long an instance
      //    will be scheduled.
      // To keep things deterministic, we simplify a bit and only allow instance 0 to check and
      // write the data-is-fully-built file to HDFS.
      try {
        checkIfScrubGenDataIsFullyBuilt(hdfs, scrubGenDay);
      } catch (IOException e) {
        LOG.error("Failed to grab lock and check scrub gen data.", e);
      }
    } else {
      // For all other instances, sleep for a bit to give the first instance time to check if the
      // scrub gen has been fully built and create the file, then check again.
      try {
        LOG.info(
            "Sleeping for {} ms before re-checking if scrub gen has been fully built file exists",
            nonFirstInstanceSleepBeforeRetryDuration);
        clock.waitFor(nonFirstInstanceSleepBeforeRetryDuration);
        return checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs);
      } catch (InterruptedException e) {
        LOG.warn("Interrupted when sleeping before re-checking if scrub gen has been fully built "
            + "file exists", e);
      }
    }

    // If hasSuccessFileToHdfs returns false, then this should always return false in the end;
    // the next run will find the success file for this scrub gen and move forward.
    return false;
  }

  private void checkIfScrubGenDataIsFullyBuilt(
      FileSystem hdfs, Date scrubGenDay) throws IOException {
    // Build the lock, try to acquire it, and check the data on HDFS
    TryLock lock = zkTryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        SEGMENT_BUILDER_SYNC_ZK_PATH,
        SEGMENT_BUILDER_SYNC_NODE,
        ZK_LOCK_EXPIRATION_MIN);
    Preconditions.checkState(scrubGenOpt.isPresent());
    String scrubGen = scrubGenOpt.get();

    lock.tryWithLock(() -> {
      LOG.info(String.format(
          "Obtained ZK lock to check if data for scrub gen %s is ready.", scrubGen));
      final DailyStatusBatches directory =
          new DailyStatusBatches(zkTryLockFactory, scrubGenDay);
      if (directory.isScrubGenDataFullyBuilt(hdfs)
          && createScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
        LOG.info(String.format("All data for scrub gen %s is ready.", scrubGen));
      } else {
        LOG.info(String.format("Data for scrub gen %s is not ready yet.", scrubGen));
      }
    });
  }

  private boolean createScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
    Path path = getScrubGenDataFullyBuiltFilePath();
    try {
      fs.mkdirs(new Path(statusReadyHDFSPath()));
      if (fs.createNewFile(path)) {
        LOG.info("Successfully created file " + path + " on HDFS.");
        return true;
      } else {
        LOG.warn("Failed to create file " + path + " on HDFS.");
      }
    } catch (IOException e) {
      LOG.error("Failed to create file on HDFS " + path.toString(), e);
    }
    return false;
  }

  private boolean checkHaveScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
    Path path = getScrubGenDataFullyBuiltFilePath();
    try {
      boolean ret = fs.exists(path);
      LOG.info("Checking if file exists showing scrubgen is fully built.");
      LOG.info("Path checked: {}, Exist check: {}", path, ret);
      return ret;
    } catch (IOException e) {
      LOG.error("Failed to check file on HDFS " + path.toString(), e);
      return false;
    }
  }

  @VisibleForTesting
  Path getScrubGenDataFullyBuiltFilePath() {
    return new Path(statusReadyHDFSPath(), DATA_FULLY_BUILT_FILE);
  }

  @VisibleForTesting
  String statusReadyHDFSPath() {
    return syncConfig.getHdfsSegmentSyncRootDir() + "/segment_builder_sync";
  }
}
Binary file not shown.
@ -1,10 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

public final class SegmentBuilderMain {

  private SegmentBuilderMain() { }

  public static void main(String[] args) {
    new SegmentBuilderApp().main(args);
  }
}
Binary file not shown.
@ -1,58 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.File;

import com.google.inject.Provides;
import com.google.inject.Singleton;

import com.twitter.app.Flaggable;
import com.twitter.decider.Decider;
import com.twitter.inject.TwitterModule;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.config.LoggerConfiguration;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.EarlybirdDecider;

public class SegmentBuilderModule extends TwitterModule {

  private static final String CONFIG_FILE_FLAG_NAME = "config_file";
  private static final String SEGMENT_LOG_DIR_FLAG_NAME = "segment_log_dir";

  public SegmentBuilderModule() {
    createFlag(CONFIG_FILE_FLAG_NAME,
        new File("earlybird-search.yml"),
        "specify config file",
        Flaggable.ofFile());

    createFlag(SEGMENT_LOG_DIR_FLAG_NAME,
        "",
        "override log dir from config file",
        Flaggable.ofString());
  }

  /**
   * Initializes the Earlybird config and the log configuration, and returns an EarlybirdDecider
   * object, which will be injected into the SegmentBuilder instance.
   *
   * @param configFile The config file to use to initialize EarlybirdConfig
   * @param segmentLogDir If not empty, used to override the log directory from the config file
   * @return An initialized EarlybirdDecider
   */
  @Provides
  @Singleton
  public Decider provideDecider(@Flag(CONFIG_FILE_FLAG_NAME) File configFile,
                                @Flag(SEGMENT_LOG_DIR_FLAG_NAME) String segmentLogDir) {
    // By default Guice will build singletons eagerly:
    // https://github.com/google/guice/wiki/Scopes#eager-singletons
    // So in order to ensure that the EarlybirdConfig and LoggerConfiguration initializations occur
    // before the EarlybirdDecider initialization, we place them here.
    EarlybirdConfig.init(configFile.getName());
    if (!segmentLogDir.isEmpty()) {
      EarlybirdConfig.overrideLogDir(segmentLogDir);
    }
    new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), EarlybirdConfig.getLogDir())
        .configure();

    return EarlybirdDecider.initialize();
  }
}
Binary file not shown.
@ -1,100 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;

import com.google.common.base.Preconditions;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public abstract class SegmentBuilderSegment {
  protected final SegmentInfo segmentInfo;
  protected final SegmentConfig segmentConfig;
  protected final EarlybirdSegmentFactory earlybirdSegmentFactory;
  protected final int alreadyRetriedCount;
  protected final SegmentSyncConfig sync;

  public SegmentBuilderSegment(SegmentInfo segmentInfo,
                               SegmentConfig segmentConfig,
                               EarlybirdSegmentFactory earlybirdSegmentFactory,
                               int alreadyRetriedCount,
                               SegmentSyncConfig segmentSyncConfig) {
    this.segmentConfig = segmentConfig;
    this.earlybirdSegmentFactory = earlybirdSegmentFactory;
    this.alreadyRetriedCount = alreadyRetriedCount;
    this.sync = segmentSyncConfig;
    Preconditions.checkState(segmentInfo.getSegment() instanceof ArchiveSegment);
    this.segmentInfo = Preconditions.checkNotNull(segmentInfo);
  }

  public SegmentInfo getSegmentInfo() {
    return segmentInfo;
  }

  public String getSegmentName() {
    return segmentInfo.getSegmentName();
  }

  public int getAlreadyRetriedCount() {
    return alreadyRetriedCount;
  }

  /**
   * Handle the segment, potentially transitioning to a new state.
   * @return The state after handling.
   */
  public abstract SegmentBuilderSegment handle()
      throws SegmentInfoConstructionException, SegmentUpdaterException;

  public boolean isBuilt() {
    return false;
  }

  @Override
  public String toString() {
    return "SegmentBuilderSegment{"
        + "segmentInfo=" + segmentInfo
        + ", state=" + this.getClass().getSimpleName()
        + ", alreadyRetriedCount=" + alreadyRetriedCount + '}';
  }

  /**
   * Given a SegmentInfo, create a new one with the same time slice and partitionID but clean
   * internal state.
   */
  protected SegmentInfo createNewSegmentInfo(SegmentInfo oldSegmentInfo)
      throws SegmentInfoConstructionException {
    Preconditions.checkArgument(oldSegmentInfo.getSegment() instanceof ArchiveSegment);
    ArchiveSegment archiveSegment = (ArchiveSegment) oldSegmentInfo.getSegment();

    try {
      ArchiveSegment segment = new ArchiveSegment(archiveSegment.getArchiveTimeSlice(),
          archiveSegment.getHashPartitionID(), EarlybirdConfig.getMaxSegmentSize());

      return new SegmentInfo(segment, earlybirdSegmentFactory, sync);
    } catch (IOException e) {
      throw new SegmentInfoConstructionException("Error creating new segments", e);
    }
  }

  protected TryLock getZooKeeperTryLock() {
    ZooKeeperTryLockFactory tryLockFactory = segmentConfig.getTryLockFactory();
    String zkRootPath = sync.getZooKeeperSyncFullPath();
    String nodeName = segmentInfo.getZkNodeName();
    Amount<Long, Time> expirationTime = segmentConfig.getSegmentZKLockExpirationTime();

    return tryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        zkRootPath,
        nodeName,
        expirationTime);
  }
}
Binary file not shown.
@ -1,41 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;

public class SegmentConfig {
  private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
  private final Amount<Long, Time> segmentZKLockExpirationTime;
  private final int maxRetriesOnFailure;
  private final ZooKeeperTryLockFactory tryLockFactory;

  public SegmentConfig(
      ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig,
      Amount<Long, Time> segmentZKLockExpirationTime,
      int maxRetriesOnFailure,
      ZooKeeperTryLockFactory tryLockFactory) {

    this.earlybirdIndexConfig = earlybirdIndexConfig;
    this.segmentZKLockExpirationTime = segmentZKLockExpirationTime;
    this.maxRetriesOnFailure = maxRetriesOnFailure;
    this.tryLockFactory = tryLockFactory;
  }

  public ArchiveOnDiskEarlybirdIndexConfig getEarlybirdIndexConfig() {
    return earlybirdIndexConfig;
  }

  public Amount<Long, Time> getSegmentZKLockExpirationTime() {
    return segmentZKLockExpirationTime;
  }

  public int getMaxRetriesOnFailure() {
    return maxRetriesOnFailure;
  }

  public ZooKeeperTryLockFactory getTryLockFactory() {
    return tryLockFactory;
  }
}
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;

/**
 * Used if exceptions are thrown while creating a new SegmentInfo during the indexing loop.
 */
class SegmentInfoConstructionException extends Exception {
  SegmentInfoConstructionException(String msg, IOException e) {
    super(msg, e);
  }
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.google.common.annotations.VisibleForTesting;

/**
 * Used when SegmentUpdater fails processing segments.
 */
@VisibleForTesting
class SegmentUpdaterException extends Exception {
  SegmentUpdaterException(String msg) {
    super(msg);
  }
}
Binary file not shown.
@ -1,69 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.annotations.VisibleForTesting;

import com.twitter.common.base.Command;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveHDFSUtils;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class SomeoneElseIsBuildingSegment extends SegmentBuilderSegment {
  public SomeoneElseIsBuildingSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  /**
   * This method refreshes the local state of a segment.
   * 1. Try to grab the ZK lock.
   * 2a. If we got the lock, the segment is not being built; mark the segment as NOT_BUILT_YET.
   * 2b. Otherwise, the segment is being built; keep the SOMEONE_ELSE_IS_BUILDING state.
   */
  @Override
  public SegmentBuilderSegment handle()
      throws SegmentInfoConstructionException, SegmentUpdaterException {

    TryLock lock = getZooKeeperTryLock();

    final AtomicBoolean alreadyBuilt = new AtomicBoolean(false);
    boolean gotLock = lock.tryWithLock((Command) () -> {
      // The segment might have already been built by others
      if (segmentExistsOnHdfs()) {
        alreadyBuilt.set(true);
      }
    });

    if (!gotLock) {
      return this;
    }

    if (alreadyBuilt.get()) {
      return new BuiltAndFinalizedSegment(
          segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
    } else {
      // When a segment fails to build, its state might not be clean. So, it is necessary to
      // create a new SegmentInfo with a clean state.
      SegmentInfo newSegmentInfo = createNewSegmentInfo(segmentInfo);
      return new NotYetBuiltSegment(
          newSegmentInfo,
          segmentConfig,
          earlybirdSegmentFactory,
          alreadyRetriedCount + 1,
          sync);
    }
  }

  @VisibleForTesting
  boolean segmentExistsOnHdfs() {
    return ArchiveHDFSUtils.hasSegmentIndicesOnHDFS(sync, segmentInfo);
  }
}
@ -1,37 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/twitter/elephantbird:core",
        "3rdparty/jvm/commons-codec",
        "3rdparty/jvm/commons-httpclient",
        "3rdparty/jvm/geo/google:geoGoogle",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "decider/src/main/scala",
        "finagle/finagle-core/src/main",
        "finagle/finagle-thrift/src/main/java",
        "finagle/finagle-thrift/src/main/scala",
        "scrooge/scrooge-core/src/main/scala",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/optional",
        "src/java/com/twitter/search/common/decider",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util:finagleutil",
        "src/java/com/twitter/search/common/util/earlybird",
        "src/java/com/twitter/search/common/util/thrift:thrift-utils",
        "src/java/com/twitter/search/queryparser/query:core-query-nodes",
        "src/thrift/com/twitter/context:twitter-context-scala",
        "src/thrift/com/twitter/search:earlybird-java",
        "src/thrift/com/twitter/search/common:caching-java",
        "src/thrift/com/twitter/search/common:constants-java",
        "src/thrift/com/twitter/search/common:query-java",
        "strato/src/main/scala/com/twitter/strato/opcontext",
        "twitter-context/src/main/scala",
        "util/util-core:scala",
    ],
)
BIN
src/java/com/twitter/search/earlybird/common/BUILD.docx
Normal file
Binary file not shown.
@ -1,120 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class Base64RequestResponseForLogging {
  private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class);
  private static final Logger FAILED_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".FailedRequests");
  private static final Logger RANDOM_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".RandomRequests");
  private static final Logger SLOW_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".SlowRequests");

  private enum LogType {
    FAILED,
    RANDOM,
    SLOW,
  };

  private final LogType logtype;
  private final String logLine;
  private final EarlybirdRequest request;
  private final EarlybirdResponse response;
  private final Base64 base64 = new Base64();

  // TSerializer is not threadsafe, so create a new one for each request
  private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());

  private Base64RequestResponseForLogging(
      LogType logType, String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    this.logtype = logType;
    this.logLine = logLine;
    this.request = request;
    this.response = response;
  }

  public static Base64RequestResponseForLogging randomRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.RANDOM, logLine, request, response);
  }

  public static Base64RequestResponseForLogging failedRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.FAILED, logLine, request, response);
  }

  public static Base64RequestResponseForLogging slowRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.SLOW, logLine, request, response);
  }

  private String asBase64(EarlybirdRequest clearedRequest) {
    try {
      // The purpose of this log is to make it easy to re-issue requests in formz to reproduce
      // issues. If queries are re-issued as is they will be treated as late-arriving queries and
      // dropped due to the clientRequestTimeMs being set to the original query time. For ease of
      // use purposes we clear clientRequestTimeMs and log it out separately for the rare case it
      // is needed.
      clearedRequest.unsetClientRequestTimeMs();
      return base64.encodeToString(serializer.serialize(clearedRequest));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize request for logging.", e);
      return "failed_to_serialize";
    }
  }

  private String asBase64(EarlybirdResponse earlybirdResponse) {
    try {
      return base64.encodeToString(serializer.serialize(earlybirdResponse));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize response for logging.", e);
      return "failed_to_serialize";
    }
  }

  private String getFormattedMessage() {
    String base64Request = asBase64(
        EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request));
    String base64Response = asBase64(response);
    return logLine + ", clientRequestTimeMs: " + request.getClientRequestTimeMs()
        + ", " + base64Request + ", " + base64Response;
  }

  /**
   * Logs the Base64-encoded request and response to the success or failure log.
   */
  public void log() {
    // Do the serializing/concatting this way so it happens on the background thread for
    // async logging
    Object logObject = new Object() {
      @Override
      public String toString() {
        return getFormattedMessage();
      }
    };

    switch (logtype) {
      case FAILED:
        FAILED_REQUEST_LOG.info("{}", logObject);
        break;
      case RANDOM:
        RANDOM_REQUEST_LOG.info("{}", logObject);
        break;
      case SLOW:
        SLOW_REQUEST_LOG.info("{}", logObject);
        break;
      default:
        // Not logging anything for other log types.
        break;
    }
  }
}
Binary file not shown.
@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.concurrent.atomic.AtomicBoolean;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchCustomGauge;

/**
 * A monitor which enforces the condition that a single thread's work is caught up, and allows
 * other threads to wait to be notified when the work is complete. An AtomicBoolean ensures the
 * current status is visible to all threads.
 */
public class CaughtUpMonitor {
  private static final Logger LOG = LoggerFactory.getLogger(CaughtUpMonitor.class);

  protected final AtomicBoolean isCaughtUp = new AtomicBoolean(false);

  public CaughtUpMonitor(String statPrefix) {
    SearchCustomGauge.export(statPrefix + "_is_caught_up", () -> isCaughtUp() ? 1 : 0);
  }

  public boolean isCaughtUp() {
    return isCaughtUp.get();
  }

  /**
   * Set caught up state, and notify waiting threads if caught up.
   */
  public synchronized void setAndNotify(boolean caughtUp) {
    isCaughtUp.set(caughtUp);
    if (caughtUp) {
      // Readers are caught up, notify waiting threads
      notifyAll();
    }
  }

  /**
   * Wait using Object.wait() until caught up or until the thread is interrupted.
   */
  public synchronized void resetAndWaitUntilCaughtUp() {
    LOG.info("Waiting to catch up.");
    // Explicitly set isCaughtUp to false before waiting
    isCaughtUp.set(false);
    try {
      while (!isCaughtUp()) {
        wait();
      }
    } catch (InterruptedException e) {
      LOG.error("{} was interrupted while waiting to catch up", Thread.currentThread());
    }
    LOG.info("Caught up.");
  }
}
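A brief usage sketch of the monitor above (the thread setup and the sleep are hypothetical, not from the original sources): one thread blocks in resetAndWaitUntilCaughtUp until a worker calls setAndNotify(true). Note that the waiter is expected to reset before the worker signals; the sleep below keeps that ordering simple for the demo.

public class CaughtUpMonitorExample {
  public static void main(String[] args) throws InterruptedException {
    CaughtUpMonitor monitor = new CaughtUpMonitor("indexing"); // exports indexing_is_caught_up

    // Worker thread: simulates catching up on a backlog, then notifies any waiters.
    Thread worker = new Thread(() -> {
      try {
        Thread.sleep(1000); // stand-in for real indexing work
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
      monitor.setAndNotify(true);
    }, "worker");
    worker.start();

    // Main thread blocks here until the worker reports that it has caught up.
    monitor.resetAndWaitUntilCaughtUp();
    worker.join();
  }
}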
BIN
src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx
Normal file
Binary file not shown.
@ -1,85 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.Optional;

import com.twitter.common.optional.Optionals;
import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.strato.opcontext.Attribution;
import com.twitter.strato.opcontext.HttpEndpoint;

public final class ClientIdUtil {
  // Blenders should always set the EarlybirdRequest.clientId field. It should be set to the Finagle
  // client ID of the client that caused the blender to send this request to the roots. If the
  // Finagle ID of the blender's client cannot be determined, it will be set to "unknown" (see
  // com.twitter.search.common.util.FinagleUtil.UNKNOWN_CLIENT_NAME). However, other services that
  // send requests to roots might not set EarlybirdRequest.clientId.
  //
  // So an "unset" clientId means: EarlybirdRequest.clientId was null.
  // An "unknown" clientId means: the client that sent us the request
  // tried setting EarlybirdRequest.clientId, but couldn't figure out a good value for it.
  public static final String UNSET_CLIENT_ID = "unset";

  private static final String CLIENT_ID_FOR_UNKNOWN_CLIENTS = "unknown_client_id";

  private static final String CLIENT_ID_PREFIX = "client_id_";

  private static final String FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN =
      "finagle_id_%s_and_client_id_%s";

  private static final String CLIENT_ID_AND_REQUEST_TYPE = "client_id_%s_and_type_%s";

  private ClientIdUtil() {
  }

  /** Returns the ID of the client that initiated this request or UNSET_CLIENT_ID if not set. */
  public static String getClientIdFromRequest(EarlybirdRequest request) {
    return Optional
        .ofNullable(request.getClientId())
        .map(String::toLowerCase)
        .orElse(UNSET_CLIENT_ID);
  }

  /**
   * Returns the Strato http endpoint attribution as an Optional.
   */
  public static Optional<String> getClientIdFromHttpEndpointAttribution() {
    return Optionals
        .optional(Attribution.httpEndpoint())
        .map(HttpEndpoint::name)
        .map(String::toLowerCase);
  }

  /** Formats the given clientId into a string that can be used for stats. */
  public static String formatClientId(String clientId) {
    return CLIENT_ID_PREFIX + clientId;
  }

  /**
   * Formats the given Finagle clientId and the given clientId into a single string that can be used
   * for stats, or other purposes where the two IDs need to be combined.
   */
  public static String formatFinagleClientIdAndClientId(String finagleClientId, String clientId) {
    return String.format(FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN, finagleClientId, clientId);
  }

  /**
   * Formats the given clientId and requestType into a single string that can be used
   * for stats or other purposes.
   */
  public static String formatClientIdAndRequestType(
      String clientId, String requestType) {
    return String.format(CLIENT_ID_AND_REQUEST_TYPE, clientId, requestType);
  }

  /**
   * Format the quota client id
   */
  public static String getQuotaClientId(String clientId) {
    if (FinagleUtil.UNKNOWN_CLIENT_NAME.equals(clientId) || UNSET_CLIENT_ID.equals(clientId)) {
      return CLIENT_ID_FOR_UNKNOWN_CLIENTS;
    }

    return clientId;
  }
}
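A short sketch of the stat-key formats produced by the helpers above; the "macaw-search" and "blender" values are hypothetical callers used only for illustration.

public class ClientIdUtilExample {
  public static void main(String[] args) {
    String clientId = "macaw-search"; // hypothetical caller
    System.out.println(ClientIdUtil.formatClientId(clientId));
    // client_id_macaw-search
    System.out.println(ClientIdUtil.formatClientIdAndRequestType(clientId, "search"));
    // client_id_macaw-search_and_type_search
    System.out.println(ClientIdUtil.formatFinagleClientIdAndClientId("blender", clientId));
    // finagle_id_blender_and_client_id_macaw-search
    System.out.println(ClientIdUtil.getQuotaClientId("unset"));
    // unknown_client_id
  }
}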
Binary file not shown.
@ -1,365 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.EnumMap;
import java.util.Map;

import scala.Option;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;

import com.twitter.context.TwitterContext;
import com.twitter.context.thriftscala.Viewer;
import com.twitter.decider.Decider;
import com.twitter.finagle.thrift.ClientId;
import com.twitter.finagle.thrift.ClientId$;
import com.twitter.search.TwitterContextPermit;
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.logging.RPCLogger;
import com.twitter.search.common.metrics.FailureRatioCounter;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
import com.twitter.search.common.util.earlybird.ThriftSearchResultUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;

import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil.responseConsideredFailed;

public class EarlybirdRequestLogger extends RPCLogger {
  protected enum ExtraFields {
    QUERY_MAX_HITS_TO_PROCESS,
    COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
    RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
    NUM_HITS_PROCESSED,
    QUERY_COST,
    CPU_TOTAL,
    QUERY_SOURCE,
    CLIENT_ID,
    FINAGLE_CLIENT_ID
  }

  protected enum ShardOnlyExtraFields {
    NUM_SEARCHED_SEGMENTS,
    SCORING_TIME_NANOS
  }

  protected enum RootOnlyExtraFields {
    CACHING_ALLOWED,
    DEBUG_MODE,
    CACHE_HIT,
    USER_AGENT,
    // See JIRA APPSEC-2303 for IP addresses logging
  }

  private static final String LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY =
      "log_full_request_details_on_error";
  private static final String LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
      "log_full_request_details_random_fraction";
  private static final String LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
      "log_full_slow_request_details_random_fraction";
  private static final String SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY =
      "slow_request_latency_threshold_ms";

  private final Decider decider;
  private final boolean enableLogUnknownClientRequests;

  private static final Map<ThriftQuerySource, FailureRatioCounter>
      FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE = preBuildFailureRatioCounters();
  private static final FailureRatioCounter NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER =
      new FailureRatioCounter("earlybird_logger", "query_source", "not_set");

  static EarlybirdRequestLogger buildForRoot(
      String loggerName, int latencyWarnThreshold, Decider decider) {

    return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
        decider, true, RPCLogger.Fields.values(), ExtraFields.values(),
        RootOnlyExtraFields.values());
  }

  static EarlybirdRequestLogger buildForShard(
      String loggerName, int latencyWarnThreshold, Decider decider) {

    return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
        decider, false, RPCLogger.Fields.values(), ExtraFields.values(),
        ShardOnlyExtraFields.values());
  }

  @VisibleForTesting
  EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider) {
    this(loggerName, latencyWarnThreshold, decider, false, RPCLogger.Fields.values(),
        ExtraFields.values(), RootOnlyExtraFields.values(), ShardOnlyExtraFields.values());
  }

  private EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider,
      boolean enableLogUnknownClientRequests, Enum[]... fieldEnums) {
    super(loggerName, fieldEnums);
    this.decider = decider;
    this.enableLogUnknownClientRequests = enableLogUnknownClientRequests;
    setLatencyWarnThreshold(latencyWarnThreshold);
  }

  /**
   * Logs the given earlybird request and response.
   *
   * @param request The earlybird request.
   * @param response The earlybird response.
   * @param timer The time it took to process this request.
   */
  public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
    try {
      LogEntry entry = newLogEntry();

      setRequestLogEntries(entry, request);
      setResponseLogEntries(entry, response);
      if (timer != null) {
        entry.setField(ExtraFields.CPU_TOTAL, Long.toString(timer.getElapsedCpuTotal()));
      }

      boolean wasError = response != null && responseConsideredFailed(response.getResponseCode());

      long responseTime = response != null ? response.getResponseTime() : 0L;

      String logLine = writeLogLine(entry, responseTime, wasError);

      // This code path is called for pre/post logging.
      // Prevent the same request showing up twice by only logging on post logging.
      if (response != null && DeciderUtil.isAvailableForRandomRecipient(
          decider, LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
        Base64RequestResponseForLogging.randomRequest(logLine, request, response).log();
      }

      // Unknown client request logging only applies to pre-logging.
      if (enableLogUnknownClientRequests && response == null) {
        UnknownClientRequestForLogging unknownClientRequestLogger =
            UnknownClientRequestForLogging.unknownClientRequest(logLine, request);
        if (unknownClientRequestLogger != null) {
          unknownClientRequestLogger.log();
        }
      }

      if (wasError
          && DeciderUtil.isAvailableForRandomRecipient(
              decider, LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY)) {
        new RequestResponseForLogging(request, response).logFailedRequest();
        Base64RequestResponseForLogging.failedRequest(logLine, request, response).log();
      }

      boolean wasSlow = response != null
          && responseTime >= DeciderUtil.getAvailability(
              decider, SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY);
      if (wasSlow
          && DeciderUtil.isAvailableForRandomRecipient(
              decider, LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
        Base64RequestResponseForLogging.slowRequest(logLine, request, response).log();
      }

      FailureRatioCounter failureRatioCounter =
          FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE.get(request.getQuerySource());
      if (failureRatioCounter != null) {
        failureRatioCounter.requestFinished(!wasError);
      } else {
        NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER.requestFinished(!wasError);
      }

    } catch (Exception e) {
      LOG.error("Exception building log entry ", e);
    }
  }

  private void setRequestLogEntries(LogEntry entry, EarlybirdRequest request) {
    entry.setField(Fields.CLIENT_HOST, request.getClientHost());
    entry.setField(Fields.CLIENT_REQUEST_ID, request.getClientRequestID());
    entry.setField(Fields.REQUEST_TYPE, requestTypeForLog(request));

    if (request.isSetSearchQuery()) {
      ThriftSearchQuery searchQuery = request.getSearchQuery();
      entry.setField(Fields.QUERY, searchQuery.getSerializedQuery());

      if (searchQuery.isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.QUERY_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getMaxHitsToProcess()));
      }

      if (searchQuery.isSetCollectorParams()
          && searchQuery.getCollectorParams().isSetTerminationParams()
          && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getCollectorParams().getTerminationParams()
                .getMaxHitsToProcess()));
      }

      if (searchQuery.isSetRelevanceOptions()
          && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getRelevanceOptions().getMaxHitsToProcess()));
      }
    }

    entry.setField(Fields.NUM_REQUESTED, Integer.toString(numRequestedForLog(request)));

    if (request.isSetQuerySource()) {
      entry.setField(ExtraFields.QUERY_SOURCE, request.getQuerySource().name());
    }

    if (request.isSetClientId()) {
      entry.setField(ExtraFields.CLIENT_ID, request.getClientId());
    }

    entry.setField(RootOnlyExtraFields.CACHING_ALLOWED,
        Boolean.toString(EarlybirdRequestUtil.isCachingAllowed(request)));

    entry.setField(RootOnlyExtraFields.DEBUG_MODE, Byte.toString(request.getDebugMode()));

    Option<ClientId> clientIdOption = ClientId$.MODULE$.current();
    if (clientIdOption.isDefined()) {
      entry.setField(ExtraFields.FINAGLE_CLIENT_ID, clientIdOption.get().name());
    }

    setLogEntriesFromTwitterContext(entry);
  }

  @VisibleForTesting
  Option<Viewer> getTwitterContext() {
    return TwitterContext.acquire(TwitterContextPermit.get()).apply();
  }

  private void setLogEntriesFromTwitterContext(LogEntry entry) {
    Option<Viewer> viewerOption = getTwitterContext();
    if (viewerOption.nonEmpty()) {
      Viewer viewer = viewerOption.get();

      if (viewer.userAgent().nonEmpty()) {
        String userAgent = viewer.userAgent().get();

        // We only replace the comma in the user-agent with %2C to make it easily parseable,
        // especially with command line tools like cut/sed/awk.
        userAgent = userAgent.replace(",", "%2C");

        entry.setField(RootOnlyExtraFields.USER_AGENT, userAgent);
      }
    }
  }

  private void setResponseLogEntries(LogEntry entry, EarlybirdResponse response) {
    if (response != null) {
      entry.setField(Fields.NUM_RETURNED, Integer.toString(numResultsForLog(response)));
      entry.setField(Fields.RESPONSE_CODE, String.valueOf(response.getResponseCode()));
      entry.setField(Fields.RESPONSE_TIME_MICROS, Long.toString(response.getResponseTimeMicros()));
      if (response.isSetSearchResults()) {
        entry.setField(ExtraFields.NUM_HITS_PROCESSED,
            Integer.toString(response.getSearchResults().getNumHitsProcessed()));
        entry.setField(ExtraFields.QUERY_COST,
            Double.toString(response.getSearchResults().getQueryCost()));
        if (response.getSearchResults().isSetScoringTimeNanos()) {
          entry.setField(ShardOnlyExtraFields.SCORING_TIME_NANOS,
              Long.toString(response.getSearchResults().getScoringTimeNanos()));
        }
      }
      if (response.isSetCacheHit()) {
        entry.setField(RootOnlyExtraFields.CACHE_HIT, String.valueOf(response.isCacheHit()));
      }
      if (response.isSetNumSearchedSegments()) {
        entry.setField(ShardOnlyExtraFields.NUM_SEARCHED_SEGMENTS,
            Integer.toString(response.getNumSearchedSegments()));
      }
    }
  }

  private static int numRequestedForLog(EarlybirdRequest request) {
    int num = 0;
    if (request.isSetFacetRequest() && request.getFacetRequest().isSetFacetFields()) {
      for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
        num += field.getNumResults();
      }
    } else if (request.isSetTermStatisticsRequest()) {
      num = request.getTermStatisticsRequest().getTermRequestsSize();
    } else if (request.isSetSearchQuery()) {
      num = request.getSearchQuery().isSetCollectorParams()
          ? request.getSearchQuery().getCollectorParams().getNumResultsToReturn() : 0;
      if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
        num = Math.max(num, request.getSearchQuery().getSearchStatusIdsSize());
      }
    }
    return num;
  }

  /**
   * Returns the number of results in the given response. If the response is a term stats response,
   * then the returned value will be the number of term results. If the response is a facet
   * response, then the returned value will be the number of facet results. Otherwise, the returned
   * value will be the number of search results.
   */
  public static int numResultsForLog(EarlybirdResponse response) {
    if (response == null) {
      return 0;
    } else if (response.isSetFacetResults()) {
      return ThriftSearchResultUtil.numFacetResults(response.getFacetResults());
    } else if (response.isSetTermStatisticsResults()) {
      return response.getTermStatisticsResults().getTermResultsSize();
    } else {
      return ThriftSearchResultUtil.numResults(response.getSearchResults());
    }
  }

  private static String requestTypeForLog(EarlybirdRequest request) {
    StringBuilder requestType = new StringBuilder(64);
    if (request.isSetFacetRequest()) {
      requestType.append("FACETS");
      int numFields = request.getFacetRequest().getFacetFieldsSize();
if (numFields > 0) {
|
|
||||||
// For 1 or 2 fields, just put them in the request type. For more, just log the number.
|
|
||||||
if (numFields <= 2) {
|
|
||||||
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
|
|
||||||
requestType.append(":").append(field.getFieldName().toUpperCase());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
requestType.append(":MULTI-").append(numFields);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (request.isSetTermStatisticsRequest()) {
|
|
||||||
ThriftTermStatisticsRequest termStatsRequest = request.getTermStatisticsRequest();
|
|
||||||
requestType.append("TERMSTATS-")
|
|
||||||
.append(termStatsRequest.getTermRequestsSize());
|
|
||||||
|
|
||||||
ThriftHistogramSettings histoSettings = termStatsRequest.getHistogramSettings();
|
|
||||||
if (histoSettings != null) {
|
|
||||||
String binSizeVal = String.valueOf(TermStatisticsUtil.determineBinSize(histoSettings));
|
|
||||||
String numBinsVal = String.valueOf(histoSettings.getNumBins());
|
|
||||||
requestType.append(":NUMBINS-").append(numBinsVal).append(":BINSIZE-").append(binSizeVal);
|
|
||||||
}
|
|
||||||
} else if (request.isSetSearchQuery()) {
|
|
||||||
requestType.append("SEARCH:");
|
|
||||||
requestType.append(request.getSearchQuery().getRankingMode().name());
|
|
||||||
// Denote when a from user id is present.
|
|
||||||
if (request.getSearchQuery().isSetFromUserIDFilter64()) {
|
|
||||||
requestType.append(":NETWORK-")
|
|
||||||
.append(request.getSearchQuery().getFromUserIDFilter64Size());
|
|
||||||
}
|
|
||||||
// Denote when required status ids are present.
|
|
||||||
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
|
|
||||||
requestType.append(":IDS-").append(request.getSearchQuery().getSearchStatusIdsSize());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return requestType.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Map<ThriftQuerySource, FailureRatioCounter> preBuildFailureRatioCounters() {
|
|
||||||
Map<ThriftQuerySource, FailureRatioCounter> counterByQuerySource =
|
|
||||||
new EnumMap<>(ThriftQuerySource.class);
|
|
||||||
|
|
||||||
for (ThriftQuerySource thriftQuerySource : ThriftQuerySource.values()) {
|
|
||||||
FailureRatioCounter counter = new FailureRatioCounter("earlybird_logger", "query_source",
|
|
||||||
thriftQuerySource.toString());
|
|
||||||
counterByQuerySource.put(thriftQuerySource, counter);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Maps.immutableEnumMap(counterByQuerySource);
|
|
||||||
}
|
|
||||||
}
@@ -1,37 +0,0 @@
package com.twitter.search.earlybird.common;

import com.twitter.decider.Decider;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class EarlybirdRequestPostLogger {
  private final EarlybirdRequestLogger logger;

  public static EarlybirdRequestPostLogger buildForRoot(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
        EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPostLogger(requestLogger);
  }

  public static EarlybirdRequestPostLogger buildForShard(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
        EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPostLogger(requestLogger);
  }

  private EarlybirdRequestPostLogger(EarlybirdRequestLogger logger) {
    this.logger = logger;
  }

  public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
    EarlybirdRequestUtil.updateHitsCounters(request);
    logger.logRequest(request, response, timer);
  }
}
@@ -1,32 +0,0 @@
package com.twitter.search.earlybird.common;

import com.twitter.decider.Decider;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

public final class EarlybirdRequestPreLogger {
  private final EarlybirdRequestLogger logger;

  public static EarlybirdRequestPreLogger buildForRoot(Decider decider) {
    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
        EarlybirdRequestPreLogger.class.getName(), Integer.MAX_VALUE, decider);

    return new EarlybirdRequestPreLogger(requestLogger);
  }

  public static EarlybirdRequestPreLogger buildForShard(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
        EarlybirdRequestPreLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPreLogger(requestLogger);
  }

  private EarlybirdRequestPreLogger(EarlybirdRequestLogger logger) {
    this.logger = logger;
  }

  public void logRequest(EarlybirdRequest request) {
    logger.logRequest(request, null, null);
  }
}
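
As an illustration of how the pre- and post-loggers above are typically wired together, here is a minimal sketch. It is not part of the deleted files; the surrounding class, the 2000 ms latency-warn threshold, and the placeholder handle() body are assumptions, and it presumes the Earlybird thrift classes and Decider are on the classpath.

import com.twitter.decider.Decider;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.earlybird.common.EarlybirdRequestPostLogger;
import com.twitter.search.earlybird.common.EarlybirdRequestPreLogger;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class RequestLoggingSketch {
  private final EarlybirdRequestPreLogger preLogger;
  private final EarlybirdRequestPostLogger postLogger;

  public RequestLoggingSketch(Decider decider) {
    // A root server logs every incoming request up front, and logs it again together with the
    // response and timing once the request has been served. The 2000 ms threshold is illustrative.
    this.preLogger = EarlybirdRequestPreLogger.buildForRoot(decider);
    this.postLogger = EarlybirdRequestPostLogger.buildForRoot(2000, decider);
  }

  public EarlybirdResponse serve(EarlybirdRequest request, Timer timer) {
    preLogger.logRequest(request);
    EarlybirdResponse response = handle(request); // application-specific handling, not shown
    postLogger.logRequest(request, response, timer);
    return response;
  }

  private EarlybirdResponse handle(EarlybirdRequest request) {
    return new EarlybirdResponse(); // placeholder response for the sketch
  }
}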
@@ -1,244 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchMovingAverage;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;

public final class EarlybirdRequestUtil {
  // This logger is set up to log to a separate set of log files (request_info) and use an
  // async logger so as to not block the searcher thread. See search/earlybird/config/log4j.xml
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdRequestUtil.class);

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_NUM_RESULTS_STAT =
      SearchMovingAverage.export("requested_num_results");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_max_hits_to_process");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_collector_params_max_hits_to_process");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_relevance_options_max_hits_to_process");

  @VisibleForTesting
  static final SearchCounter REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT =
      SearchCounter.export("requested_max_hits_to_process_are_different");

  private static final SearchRateCounter REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT =
      SearchRateCounter.export("request_with_more_than_2k_num_result");
  private static final SearchRateCounter REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT =
      SearchRateCounter.export("request_with_more_than_4k_num_result");

  // Stats for tracking clock skew between earlybird and the client-specified request timestamp.
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_ABS =
      SearchTimerStats.export("client_clock_diff_abs", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_POS =
      SearchTimerStats.export("client_clock_diff_pos", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_NEG =
      SearchTimerStats.export("client_clock_diff_neg", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchRateCounter CLIENT_CLOCK_DIFF_MISSING =
      SearchRateCounter.export("client_clock_diff_missing");

  private static final int MAX_NUM_RESULTS = 4000;
  private static final int OLD_MAX_NUM_RESULTS = 2000;

  private EarlybirdRequestUtil() {
  }

  /**
   * Logs and fixes some potentially excessive values in the given request.
   */
  public static void logAndFixExcessiveValues(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    if (searchQuery != null) {
      int maxHitsToProcess = 0;
      int numResultsToReturn = 0;

      if (searchQuery.isSetCollectorParams()) {
        numResultsToReturn = searchQuery.getCollectorParams().getNumResultsToReturn();

        if (searchQuery.getCollectorParams().isSetTerminationParams()) {
          maxHitsToProcess =
              searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
        }
      }

      if (maxHitsToProcess > 50000) {
        LOG.warn("Excessive max hits in " + request.toString());
      }

      // We used to limit number of results to 2000. These two counters help us track if we receive
      // too many requests with large number of results set.
      String warningMessageTemplate = "Exceed %d num result in %s";
      if (numResultsToReturn > MAX_NUM_RESULTS) {
        LOG.warn(String.format(warningMessageTemplate, MAX_NUM_RESULTS, request.toString()));
        REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT.increment();
        searchQuery.getCollectorParams().setNumResultsToReturn(MAX_NUM_RESULTS);
      } else if (numResultsToReturn > OLD_MAX_NUM_RESULTS) {
        LOG.warn(String.format(warningMessageTemplate, OLD_MAX_NUM_RESULTS, request.toString()));
        REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT.increment();
      }

      ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions();
      if (options != null) {
        if (options.getMaxHitsToProcess() > 50000) {
          LOG.warn("Excessive max hits in " + request.toString());
        }
      }
    }
  }

  /**
   * Sets {@code request.searchQuery.collectorParams} if they are not already set.
   */
  public static void checkAndSetCollectorParams(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    if (searchQuery == null) {
      return;
    }

    if (!searchQuery.isSetCollectorParams()) {
      searchQuery.setCollectorParams(new CollectorParams());
    }
    if (!searchQuery.getCollectorParams().isSetNumResultsToReturn()) {
      searchQuery.getCollectorParams().setNumResultsToReturn(searchQuery.getNumResults());
    }
    if (!searchQuery.getCollectorParams().isSetTerminationParams()) {
      CollectorTerminationParams terminationParams = new CollectorTerminationParams();
      if (request.isSetTimeoutMs()) {
        terminationParams.setTimeoutMs(request.getTimeoutMs());
      }
      if (request.isSetMaxQueryCost()) {
        terminationParams.setMaxQueryCost(request.getMaxQueryCost());
      }
      searchQuery.getCollectorParams().setTerminationParams(terminationParams);
    }
    setMaxHitsToProcess(searchQuery);
  }

  // Earlybirds will only look for maxHitsToProcess in CollectorParameters.TerminationParameters.
  // The priority for setting CollectorParameters.TerminationParameters.maxHitsToProcess is:
  // 1. Collector parameters
  // 2. RelevanceParameters
  // 3. ThriftQuery.maxHitsToProcess
  private static void setMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
    CollectorTerminationParams terminationParams = thriftSearchQuery
        .getCollectorParams().getTerminationParams();
    if (!terminationParams.isSetMaxHitsToProcess()) {
      if (thriftSearchQuery.isSetRelevanceOptions()
          && thriftSearchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
        terminationParams.setMaxHitsToProcess(
            thriftSearchQuery.getRelevanceOptions().getMaxHitsToProcess());
      } else {
        terminationParams.setMaxHitsToProcess(thriftSearchQuery.getMaxHitsToProcess());
      }
    }
  }

  /**
   * Creates a copy of the given request and unsets the binary fields to make the logged line for
   * this request look nicer.
   */
  public static EarlybirdRequest copyAndClearUnnecessaryValuesForLogging(EarlybirdRequest request) {
    EarlybirdRequest copiedRequest = request.deepCopy();

    if (copiedRequest.isSetSearchQuery()) {
      // These fields are very large and the binary data doesn't play well with formz
      copiedRequest.getSearchQuery().unsetTrustedFilter();
      copiedRequest.getSearchQuery().unsetDirectFollowFilter();
    }

    return copiedRequest;
  }

  /**
   * Updates some hit-related stats based on the parameters in the given request.
   */
  public static void updateHitsCounters(EarlybirdRequest request) {
    if ((request == null) || !request.isSetSearchQuery()) {
      return;
    }

    ThriftSearchQuery searchQuery = request.getSearchQuery();

    if (searchQuery.isSetNumResults()) {
      REQUESTED_NUM_RESULTS_STAT.addSample(searchQuery.getNumResults());
    }

    if (searchQuery.isSetMaxHitsToProcess()) {
      REQUESTED_MAX_HITS_TO_PROCESS_STAT.addSample(searchQuery.getMaxHitsToProcess());
    }

    Integer collectorParamsMaxHitsToProcess = null;
    if (searchQuery.isSetCollectorParams()
        && searchQuery.getCollectorParams().isSetTerminationParams()
        && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
      collectorParamsMaxHitsToProcess =
          searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
      REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT
          .addSample(collectorParamsMaxHitsToProcess);
    }

    Integer relevanceOptionsMaxHitsToProcess = null;
    if (searchQuery.isSetRelevanceOptions()
        && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
      relevanceOptionsMaxHitsToProcess = searchQuery.getRelevanceOptions().getMaxHitsToProcess();
      REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT
          .addSample(relevanceOptionsMaxHitsToProcess);
    }

    if ((collectorParamsMaxHitsToProcess != null)
        && (relevanceOptionsMaxHitsToProcess != null)
        && (collectorParamsMaxHitsToProcess != relevanceOptionsMaxHitsToProcess)) {
      REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT.increment();
    }
  }

  public static boolean isCachingAllowed(EarlybirdRequest request) {
    return !request.isSetCachingParams() || request.getCachingParams().isCache();
  }

  /**
   * Track the clock difference between this server and its client's specified request time.
   * When there is no clock drift between machines, this will record the inflight time between this
   * server and the client.
   *
   * @param request the incoming earlybird request.
   */
  public static void recordClientClockDiff(EarlybirdRequest request) {
    if (request.isSetClientRequestTimeMs()) {
      final long timeDiff = System.currentTimeMillis() - request.getClientRequestTimeMs();
      final long timeDiffAbs = Math.abs(timeDiff);
      if (timeDiff >= 0) {
        CLIENT_CLOCK_DIFF_POS.timerIncrement(timeDiffAbs);
      } else {
        CLIENT_CLOCK_DIFF_NEG.timerIncrement(timeDiffAbs);
      }
      CLIENT_CLOCK_DIFF_ABS.timerIncrement(timeDiffAbs);
    } else {
      CLIENT_CLOCK_DIFF_MISSING.increment();
    }
  }
}
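
The comment on setMaxHitsToProcess above describes a priority order: collector params, then relevance options, then the query-level field. As an illustration (not from the deleted file), here is a minimal sketch of that defaulting behavior. The class name and the literal values are assumed, and it presumes the usual Thrift-generated setters for these request classes.

import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
import com.twitter.search.earlybird.common.EarlybirdRequestUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;

public final class CollectorParamsDefaultingSketch {
  public static void main(String[] args) {
    // A query with no collector params set, only relevance options and a result count.
    ThriftSearchQuery searchQuery = new ThriftSearchQuery();
    searchQuery.setNumResults(20);
    searchQuery.setRelevanceOptions(new ThriftSearchRelevanceOptions().setMaxHitsToProcess(1000));

    EarlybirdRequest request = new EarlybirdRequest();
    request.setSearchQuery(searchQuery);
    request.setTimeoutMs(800);

    // Fills in collectorParams, numResultsToReturn, terminationParams and maxHitsToProcess.
    EarlybirdRequestUtil.checkAndSetCollectorParams(request);

    CollectorTerminationParams terminationParams =
        searchQuery.getCollectorParams().getTerminationParams();
    // maxHitsToProcess was not set on the collector params, so the relevance-options value wins.
    System.out.println(terminationParams.getMaxHitsToProcess()); // expected: 1000
  }
}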
@@ -1,28 +0,0 @@
package com.twitter.search.earlybird.common;

import javax.inject.Inject;
import javax.inject.Singleton;

import org.apache.thrift.protocol.TProtocolFactory;

import com.twitter.finagle.Service;
import com.twitter.search.common.util.thrift.ThriftToBytesFilter;
import com.twitter.search.earlybird.thrift.EarlybirdService;

@Singleton
public class EarlybirdThriftBackend extends EarlybirdService.ServiceToClient {

  /**
   * Wraps the byte service back into an EarlybirdService.ServiceToClient, which
   * is an EarlybirdService.ServiceIface again.
   */
  @Inject
  public EarlybirdThriftBackend(
      ThriftToBytesFilter thriftToBytesFilter,
      Service<byte[], byte[]> byteService,
      TProtocolFactory protocolFactory) {

    super(thriftToBytesFilter.andThen(byteService), protocolFactory);
  }
}
@@ -1,34 +0,0 @@
package com.twitter.search.earlybird.common;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchRateCounter;

/**
 * When incremented, a non-paging alert will be triggered. Use this to assert for bad conditions
 * that should generally never happen.
 */
public class NonPagingAssert {
  private static final Logger LOG = LoggerFactory.getLogger(NonPagingAssert.class);

  private static final String ASSERT_STAT_PREFIX = "non_paging_assert_";

  private final String name;
  private final SearchRateCounter assertCounter;

  public NonPagingAssert(String name) {
    this.name = name;
    this.assertCounter = SearchRateCounter.export(ASSERT_STAT_PREFIX + name);
  }

  public void assertFailed() {
    LOG.error("NonPagingAssert failed: {}", name);
    assertCounter.increment();
  }

  public static void assertFailed(String name) {
    NonPagingAssert nonPagingAssert = new NonPagingAssert(name);
    nonPagingAssert.assertFailed();
  }
}
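
A minimal usage sketch for NonPagingAssert (not from the deleted files; the assert name and the surrounding class are hypothetical): callers export one named counter up front and bump it whenever the "should never happen" condition is observed, so a non-paging alert can be attached to the exported "non_paging_assert_*" stat.

import com.twitter.search.earlybird.common.NonPagingAssert;

public final class NonPagingAssertUsageSketch {
  // Exported as "non_paging_assert_negative_tweet_id"; the name is illustrative.
  private static final NonPagingAssert NEGATIVE_TWEET_ID_ASSERT =
      new NonPagingAssert("negative_tweet_id");

  public static void recordTweetId(long tweetId) {
    if (tweetId < 0) {
      // Logs an error and increments the rate counter; no page is triggered.
      NEGATIVE_TWEET_ID_ASSERT.assertFailed();
      return;
    }
    // ... normal handling of a valid tweet ID ...
  }

  public static void main(String[] args) {
    recordTweetId(-1L);
  }
}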
@@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TSimpleJSONProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponseForLogging {
  private static final Logger LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class);

  private static final Logger FAILED_REQUEST_LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class.getName() + ".FailedRequests");

  private final EarlybirdRequest request;
  private final EarlybirdResponse response;

  public RequestResponseForLogging(EarlybirdRequest request,
      EarlybirdResponse response) {
    this.request = request;
    this.response = response;
  }

  private String serialize(EarlybirdRequest clearedRequest, EarlybirdResponse theResponse) {
    TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
    try {
      String requestJson = serializer.toString(clearedRequest);
      String responseJson = serializer.toString(theResponse);
      return "{\"request\":" + requestJson + ", \"response\":" + responseJson + "}";
    } catch (TException e) {
      LOG.error("Failed to serialize request/response for logging.", e);
      return "";
    }
  }

  /**
   * Logs the request and response stored in this instance to the failure log file.
   */
  public void logFailedRequest() {
    // Do the serializing/concatenating this way so it happens on the background thread for
    // async logging.
    FAILED_REQUEST_LOG.info("{}", new Object() {
      @Override
      public String toString() {
        return serialize(
            EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request), response);
      }
    });
  }
}
@@ -1,44 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.lucene.search.Query;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponsePair {
  private final EarlybirdRequest request;
  private final EarlybirdResponse response;
  private final org.apache.lucene.search.Query luceneQuery;

  // The serialized query in its final form, after various modifications have been applied to it.
  // As a note, we have some code paths in which this can be null, but I don't really see them
  // triggered in production right now.
  private final com.twitter.search.queryparser.query.Query finalSerializedQuery;

  public RequestResponsePair(
      EarlybirdRequest request,
      com.twitter.search.queryparser.query.Query finalSerializedQuery,
      org.apache.lucene.search.Query luceneQuery,
      EarlybirdResponse response) {
    this.request = request;
    this.luceneQuery = luceneQuery;
    this.response = response;
    this.finalSerializedQuery = finalSerializedQuery;
  }

  public String getFinalSerializedQuery() {
    return finalSerializedQuery != null ? finalSerializedQuery.serialize() : "N/A";
  }

  public EarlybirdRequest getRequest() {
    return request;
  }

  public EarlybirdResponse getResponse() {
    return response;
  }

  public Query getLuceneQuery() {
    return luceneQuery;
  }
}
@@ -1,77 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;

import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

/**
 * This class logs all requests that are missing either the Finagle ID or the client ID.
 */
public final class UnknownClientRequestForLogging {
  private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class);
  private static final Logger LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class.getName() + ".unknownClientRequests");

  private final String logLine;
  private final EarlybirdRequest request;
  private final String clientId;
  private final String finagleId;

  private final Base64 base64 = new Base64();
  private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());

  private UnknownClientRequestForLogging(
      String logLine,
      EarlybirdRequest request,
      String clientId,
      String finagleId) {

    this.logLine = logLine;
    this.request = request;
    this.clientId = clientId;
    this.finagleId = finagleId;
  }

  /**
   * Returns an UnknownClientRequestForLogging instance if a client ID is not set on the given
   * earlybird request. If the request has a client ID set, {@code null} is returned.
   *
   * @param logLine Additional information to propagate to the log file, when logging this request.
   * @param request The earlybird request.
   */
  public static UnknownClientRequestForLogging unknownClientRequest(
      String logLine, EarlybirdRequest request) {
    String clientId = ClientIdUtil.getClientIdFromRequest(request);
    String finagleId = FinagleUtil.getFinagleClientName();

    if (clientId.equals(ClientIdUtil.UNSET_CLIENT_ID)) {
      return new UnknownClientRequestForLogging(logLine, request, clientId, finagleId);
    } else {
      return null;
    }
  }

  private String asBase64() {
    try {
      // Need to make a deepCopy() here, because the request may still be in use (e.g. if we are
      // doing this in the pre-logger), and we should not be modifying crucial fields on the
      // EarlybirdRequest in place.
      EarlybirdRequest clearedRequest = request.deepCopy();
      clearedRequest.unsetClientRequestTimeMs();
      return base64.encodeToString(serializer.serialize(clearedRequest));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize request for logging.", e);
      return "failed_to_serialize";
    }
  }

  public void log() {
    LOG.info("{},{},{},{}", clientId, finagleId, logLine, asBase64());
  }
}
@@ -1,21 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/commons:commons-lang3",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/jvm/org/yaml:snakeyaml",
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/text/version",
        "src/java/com/twitter/search/common/aurora",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/zookeeper",
    ],
)
BIN src/java/com/twitter/search/earlybird/common/config/BUILD.docx (Normal file)
@@ -1,363 +0,0 @@
package com.twitter.search.earlybird.common.config;

import java.util.Date;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

import com.google.common.collect.ImmutableMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.aurora.AuroraInstanceKey;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.config.ConfigFile;
import com.twitter.search.common.config.ConfigurationException;
import com.twitter.search.common.config.SearchPenguinVersionsConfig;

public final class EarlybirdConfig {
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdConfig.class);

  private static final String DEFAULT_CONFIG_FILE = "earlybird-search.yml";
  private static final String LATE_TWEET_BUFFER_KEY = "late_tweet_buffer";

  public static final String EARLYBIRD_ZK_CONFIG_DIR = "/twitter/search/production/earlybird/";
  public static final String EARLYBIRD_CONFIG_DIR = "earlybird/config";

  public static final String USER_SNAPSHOT_BASE_DIR = "user_snapshot_base_dir";

  private static volatile ConfigFile earlybirdConfig = null;
  private static volatile Map<String, Object> overrideValueMap = ImmutableMap.of();

  private static String logDirOverride = null;
  private static AuroraInstanceKey auroraInstanceKey = null;

  private static int adminPort;

  private EarlybirdConfig() { }

  private static final class PenguinVersionHolder {
    private static final PenguinVersion PENGUIN_VERSION_SINGLETON =
        SearchPenguinVersionsConfig.getSingleSupportedVersion(
            EarlybirdProperty.PENGUIN_VERSION.get());
    private static final byte PENGUIN_VERSION_BYTE_VALUE =
        PENGUIN_VERSION_SINGLETON.getByteValue();
  }

  public static byte getPenguinVersionByte() {
    return PenguinVersionHolder.PENGUIN_VERSION_BYTE_VALUE;
  }

  public static PenguinVersion getPenguinVersion() {
    return PenguinVersionHolder.PENGUIN_VERSION_SINGLETON;
  }

  /**
   * Reads the earlybird configuration from the given file.
   */
  public static synchronized void init(@Nullable String configFile) {
    if (earlybirdConfig == null) {
      String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile;
      earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, file);
    }
  }

  public static synchronized void setOverrideValues(Map<String, Object> overrideValues) {
    overrideValueMap = ImmutableMap.copyOf(overrideValues);
  }

  /**
   * Packs all values into a string that can be printed for informational purposes.
   * @return the string.
   */
  public static String allValuesAsString() {
    Map<String, String> stringMap = earlybirdConfig.getStringMap();

    StringBuilder stringBuilder = new StringBuilder();

    stringBuilder.append("Config environment: " + Config.getEnvironment() + "\n\n");
    stringBuilder.append(
        String.format("Values from earlybird-search.yml (total %d):\n", stringMap.size()));

    stringMap.forEach((key, value) -> {
      stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
      if (overrideValueMap.containsKey(key)) {
        stringBuilder.append(String.format(
            " override value: %s\n", overrideValueMap.get(key).toString()));
      }
    });

    stringBuilder.append(String.format(
        "\n\nAll command-line overrides (total: %d):\n", overrideValueMap.size()));
    overrideValueMap.forEach((key, value) -> {
      stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
    });

    return stringBuilder.toString();
  }

  /**
   * Returns the value of the given property as a string. If the property is not set, a runtime
   * exception is thrown.
   */
  public static String getString(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (String) overrideValue;
    }

    try {
      return earlybirdConfig.getString(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config string " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a string.
   */
  public static String getString(String property, String defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (String) overrideValue;
    }

    return earlybirdConfig.getString(property, defaultValue);
  }

  /**
   * Returns the value of the given property as an integer. If the property is not set, a runtime
   * exception is thrown.
   */
  public static int getInt(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (int) overrideValue;
    }

    try {
      return earlybirdConfig.getInt(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config int " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as an integer.
   */
  public static int getInt(String property, int defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (int) overrideValue;
    }

    return earlybirdConfig.getInt(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a double.
   */
  public static double getDouble(String property, double defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (double) overrideValue;
    }

    return earlybirdConfig.getDouble(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a long. If the property is not set, a runtime
   * exception is thrown.
   */
  public static long getLong(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (long) overrideValue;
    }

    try {
      return earlybirdConfig.getLong(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config long " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a long.
   */
  public static long getLong(String property, long defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (long) overrideValue;
    }

    return earlybirdConfig.getLong(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a boolean. If the property is not set, a runtime
   * exception is thrown.
   */
  public static boolean getBool(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (boolean) overrideValue;
    }

    try {
      return earlybirdConfig.getBool(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config boolean " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a boolean.
   */
  public static boolean getBool(String property, boolean defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (boolean) overrideValue;
    }

    return earlybirdConfig.getBool(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a date.
   */
  public static Date getDate(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (Date) overrideValue;
    }

    Date date = (Date) earlybirdConfig.getObject(property, null);
    if (date == null) {
      throw new RuntimeException("Could not get config date: " + property);
    }
    return date;
  }

  /**
   * Returns the value of the given property as a list of strings.
   */
  public static List<String> getListOfStrings(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (List<String>) overrideValue;
    }

    List<String> list = (List<String>) earlybirdConfig.getObject(property, null);
    if (list == null) {
      throw new RuntimeException("Could not get list of strings: " + property);
    }
    return list;
  }

  /**
   * Returns the value of the given property as a map.
   */
  @SuppressWarnings("unchecked")
  public static Map<String, Object> getMap(String property) {
    Map<String, Object> map = (Map<String, Object>) earlybirdConfig.getObject(property, null);
    if (map == null) {
      throw new RuntimeException("Could not find config property: " + property);
    }
    return map;
  }

  public static int getMaxSegmentSize() {
    return EarlybirdConfig.getInt("max_segment_size", 1 << 16);
  }

  /**
   * Returns the log properties file.
   */
  public static String getLogPropertiesFile() {
    try {
      String filename = earlybirdConfig.getString("log_properties_filename");
      return earlybirdConfig.getConfigFilePath(filename);
    } catch (ConfigurationException e) {
      // Print here rather than use LOG - log was probably not initialized yet.
      LOG.error("Fatal error: could not get log properties file", e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the log directory.
   */
  public static String getLogDir() {
    if (logDirOverride != null) {
      return logDirOverride;
    } else {
      return EarlybirdConfig.getString("log_dir");
    }
  }

  public static void overrideLogDir(String logDir) {
    EarlybirdConfig.logDirOverride = logDir;
  }

  public static int getThriftPort() {
    return EarlybirdProperty.THRIFT_PORT.get();
  }

  public static int getWarmUpThriftPort() {
    return EarlybirdProperty.WARMUP_THRIFT_PORT.get();
  }

  public static int getSearcherThreads() {
    return EarlybirdProperty.SEARCHER_THREADS.get();
  }

  public static int getLateTweetBuffer() {
    return getInt(LATE_TWEET_BUFFER_KEY);
  }

  public static int getAdminPort() {
    return adminPort;
  }

  public static void setAdminPort(int adminPort) {
    EarlybirdConfig.adminPort = adminPort;
  }

  public static boolean isRealtimeOrProtected() {
    String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get();
    return earlybirdName.contains("realtime") || earlybirdName.contains("protected");
  }

  public static boolean consumeUserScrubGeoEvents() {
    return EarlybirdProperty.CONSUME_GEO_SCRUB_EVENTS.get();
  }

  @Nullable
  public static AuroraInstanceKey getAuroraInstanceKey() {
    return auroraInstanceKey;
  }

  public static void setAuroraInstanceKey(AuroraInstanceKey auroraInstanceKey) {
    EarlybirdConfig.auroraInstanceKey = auroraInstanceKey;
  }

  public static boolean isAurora() {
    return auroraInstanceKey != null;
  }

  public static void setForTests(String property, Object value) {
    earlybirdConfig.setForTests(DEFAULT_CONFIG_FILE, property, value);
  }

  public static synchronized void clearForTests() {
    earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, DEFAULT_CONFIG_FILE);
  }
}
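
A minimal sketch of how this config class is typically exercised (not from the deleted files; the override key and value are illustrative, and it assumes earlybird-search.yml is available under the earlybird/config directory). It shows that command-line overrides win over values read from the YAML file.

import com.google.common.collect.ImmutableMap;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

public final class EarlybirdConfigUsageSketch {
  public static void main(String[] args) {
    // Load the default config file (earlybird-search.yml) from the earlybird/config directory.
    EarlybirdConfig.init(null);

    // Overrides registered here take precedence over values read from the YAML file.
    EarlybirdConfig.setOverrideValues(
        ImmutableMap.<String, Object>of("max_segment_size", 1 << 17));

    // getMaxSegmentSize() checks the override map first, so this returns 131072 instead of
    // the 1 << 16 default.
    System.out.println("max_segment_size = " + EarlybirdConfig.getMaxSegmentSize());

    // Dump every configured value plus any overrides for debugging.
    System.out.println(EarlybirdConfig.allValuesAsString());
  }
}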
@@ -1,390 +0,0 @@
package com.twitter.search.earlybird.common.config;

import java.lang.reflect.Modifier;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.google.common.collect.ImmutableList;

import com.twitter.app.Flag;
import com.twitter.app.Flaggable;
import com.twitter.app.Flags;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;

/**
 * Stateless class that represents an Earlybird property that can be specified by a command line
 * flag.
 * <p>
 * This is a regular Java class instead of an enum in order to have a generic type.
 *
 * @param <T>
 */
public final class EarlybirdProperty<T> {

  private static final class PropertyType<T> {

    private static final PropertyType<Boolean> BOOLEAN = new PropertyType<>(
        Flaggable.ofJavaBoolean(), EarlybirdConfig::getBool, EarlybirdConfig::getBool);

    private static final PropertyType<Integer> INT = new PropertyType<>(
        Flaggable.ofJavaInteger(), EarlybirdConfig::getInt, EarlybirdConfig::getInt);

    private static final PropertyType<String> STRING = new PropertyType<>(
        Flaggable.ofString(), EarlybirdConfig::getString, EarlybirdConfig::getString);

    private final Flaggable<T> flaggable;
    private final Function<String, T> getter;
    private final BiFunction<String, T, T> getterWithDefault;

    private PropertyType(Flaggable<T> flaggable, Function<String, T> getter,
        BiFunction<String, T, T> getterWithDefault) {
      this.flaggable = flaggable;
      this.getter = getter;
      this.getterWithDefault = getterWithDefault;
    }
  }

  public static final EarlybirdProperty<String> PENGUIN_VERSION =
      new EarlybirdProperty<>(
          "penguin_version",
          "The penguin version to index.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Integer> THRIFT_PORT = new EarlybirdProperty<>(
      "thrift_port",
      "override thrift port from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<Integer> WARMUP_THRIFT_PORT = new EarlybirdProperty<>(
      "warmup_thrift_port",
      "override warmup thrift port from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<Integer> SEARCHER_THREADS = new EarlybirdProperty<>(
      "searcher_threads",
      "override number of searcher threads from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<String> EARLYBIRD_TIER = new EarlybirdProperty<>(
      "earlybird_tier",
      "the earlybird tier (e.g. tier1), used on Aurora",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Integer> REPLICA_ID = new EarlybirdProperty<>(
      "replica_id",
      "the ID in a partition, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> PARTITION_ID = new EarlybirdProperty<>(
      "partition_id",
      "partition ID, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> NUM_PARTITIONS = new EarlybirdProperty<>(
      "num_partitions",
      "number of partitions, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> NUM_INSTANCES = new EarlybirdProperty<>(
      "num_instances",
      "number of instances in the job, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> SERVING_TIMESLICES = new EarlybirdProperty<>(
      "serving_timeslices",
      "number of time slices to serve, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<String> ROLE = new EarlybirdProperty<>(
      "role",
      "Role in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> EARLYBIRD_NAME = new EarlybirdProperty<>(
      "earlybird_name",
      "Name in the service path of Earlybird without hash partition suffix",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> ENV = new EarlybirdProperty<>(
      "env",
      "Environment in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> ZONE = new EarlybirdProperty<>(
      "zone",
      "Zone (data center) in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> DL_URI = new EarlybirdProperty<>(
      "dl_uri",
      "DistributedLog URI for default DL reader",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> USER_UPDATES_DL_URI = new EarlybirdProperty<>(
      "user_updates_dl_uri",
      "DistributedLog URI for user updates DL reader",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> ANTISOCIAL_USERUPDATES_DL_STREAM =
      new EarlybirdProperty<>(
          "antisocial_userupdates_dl_stream",
          "DL stream name for antisocial user updates without DL version suffix",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> ZK_APP_ROOT = new EarlybirdProperty<>(
      "zk_app_root",
      "ZooKeeper base root path for this application",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Boolean> SEGMENT_LOAD_FROM_HDFS_ENABLED =
      new EarlybirdProperty<>(
          "segment_load_from_hdfs_enabled",
          "Whether to load segment data from HDFS",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Boolean> SEGMENT_FLUSH_TO_HDFS_ENABLED =
      new EarlybirdProperty<>(
          "segment_flush_to_hdfs_enabled",
          "Whether to flush segment data to HDFS",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> HDFS_SEGMENT_SYNC_DIR = new EarlybirdProperty<>(
      "hdfs_segment_sync_dir",
      "HDFS directory to sync segment data",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> HDFS_SEGMENT_UPLOAD_DIR = new EarlybirdProperty<>(
      "hdfs_segment_upload_dir",
      "HDFS directory to upload segment data",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<Boolean> ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED =
      new EarlybirdProperty<>(
          "archive_daily_status_batch_flushing_enabled",
          "Whether to enable archive daily status batch flushing",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> HDFS_INDEX_SYNC_DIR = new EarlybirdProperty<>(
      "hdfs_index_sync_dir",
      "HDFS directory to sync index data",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Boolean> READ_INDEX_FROM_PROD_LOCATION =
      new EarlybirdProperty<>(
          "read_index_from_prod_location",
          "Read index from prod to speed up startup on staging / loadtest",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Boolean> USE_DECIDER_OVERLAY = new EarlybirdProperty<>(
      "use_decider_overlay",
      "Whether to use decider overlay",
      PropertyType.BOOLEAN,
      false);

  public static final EarlybirdProperty<String> DECIDER_OVERLAY_CONFIG = new EarlybirdProperty<>(
      "decider_overlay_config",
      "Path to decider overlay config",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<Integer> MAX_CONCURRENT_SEGMENT_INDEXERS =
      new EarlybirdProperty<>(
          "max_concurrent_segment_indexers",
          "Maximum number of segments indexed concurrently",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Boolean> TF_MODELS_ENABLED =
      new EarlybirdProperty<>(
          "tf_models_enabled",
          "Whether tensorflow models should be loaded",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> TF_MODELS_CONFIG_PATH =
      new EarlybirdProperty<>(
          "tf_models_config_path",
          "The configuration path of the yaml file containing the list of tensorflow models to load.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Integer> TF_INTER_OP_THREADS =
      new EarlybirdProperty<>(
          "tf_inter_op_threads",
          "How many tensorflow inter op threads to use. See TF documentation for more information.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Integer> TF_INTRA_OP_THREADS =
      new EarlybirdProperty<>(
          "tf_intra_op_threads",
          "How many tensorflow intra op threads to use. See TF documentation for more information.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Integer> MAX_ALLOWED_REPLICAS_NOT_IN_SERVER_SET =
      new EarlybirdProperty<>(
          "max_allowed_replicas_not_in_server_set",
          "How many replicas are allowed to be missing from the Earlybird server set.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Boolean> CHECK_NUM_REPLICAS_IN_SERVER_SET =
      new EarlybirdProperty<>(
          "check_num_replicas_in_server_set",
          "Whether CoordinatedEarlybirdActions should check the number of alive replicas",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Integer> MAX_QUEUE_SIZE =
      new EarlybirdProperty<>(
          "max_queue_size",
          "Maximum size of searcher worker executor queue. If <= 0 queue is unbounded.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<String> KAFKA_ENV =
      new EarlybirdProperty<>(
          "kafka_env",
          "The environment to use for kafka topics.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> KAFKA_PATH =
      new EarlybirdProperty<>(
          "kafka_path",
          "Wily path to the Search kafka cluster.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> TWEET_EVENTS_KAFKA_PATH =
      new EarlybirdProperty<>(
          "tweet_events_kafka_path",
          "Wily path to the tweet-events kafka cluster.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> USER_UPDATES_KAFKA_TOPIC =
      new EarlybirdProperty<>(
          "user_updates_topic",
          "Name of the Kafka topic that contains user updates.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> USER_SCRUB_GEO_KAFKA_TOPIC =
      new EarlybirdProperty<>(
          "user_scrub_geo_topic",
          "Name of the Kafka topic that contains UserScrubGeoEvents.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> EARLYBIRD_SCRUB_GEN =
      new EarlybirdProperty<>(
          "earlybird_scrub_gen",
          "SCRUB_GEN TO DEPLOY",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Boolean> CONSUME_GEO_SCRUB_EVENTS =
      new EarlybirdProperty<>(
          "consume_geo_scrub_events",
          "Whether to consume user scrub geo events or not",
          PropertyType.BOOLEAN,
          false);

  private static final List<EarlybirdProperty<?>> ALL_PROPERTIES =
      Arrays.stream(EarlybirdProperty.class.getDeclaredFields())
          .filter(field ->
              (field.getModifiers() & Modifier.STATIC) > 0
                  && field.getType() == EarlybirdProperty.class)
|
|
||||||
.map(field -> {
|
|
||||||
try {
|
|
||||||
return (EarlybirdProperty<?>) field.get(EarlybirdProperty.class);
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf));
|
|
||||||
|
|
||||||
public static ServiceIdentifier getServiceIdentifier() {
|
|
||||||
return new ServiceIdentifier(
|
|
||||||
ROLE.get(),
|
|
||||||
EARLYBIRD_NAME.get(),
|
|
||||||
ENV.get(),
|
|
||||||
ZONE.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
private final String name;
|
|
||||||
private final String help;
|
|
||||||
private final PropertyType<T> type;
|
|
||||||
private final boolean requiredOnAurora;
|
|
||||||
private final boolean requiredOnDedicated;
|
|
||||||
|
|
||||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
|
||||||
boolean requiredOnAurora) {
|
|
||||||
this(name, help, type, requiredOnAurora, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
|
||||||
boolean requiredOnAurora, boolean requiredOnDedicated) {
|
|
||||||
this.name = name;
|
|
||||||
this.help = help;
|
|
||||||
this.type = type;
|
|
||||||
this.requiredOnAurora = requiredOnAurora;
|
|
||||||
this.requiredOnDedicated = requiredOnDedicated;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String name() {
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isRequiredOnAurora() {
|
|
||||||
return requiredOnAurora;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isRequiredOnDedicated() {
|
|
||||||
return requiredOnDedicated;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Flag<T> createFlag(Flags flags) {
|
|
||||||
return flags.createMandatory(name, help, null, type.flaggable);
|
|
||||||
}
|
|
||||||
|
|
||||||
public T get() {
|
|
||||||
return type.getter.apply(name);
|
|
||||||
}
|
|
||||||
|
|
||||||
public T get(T devaultValue) {
|
|
||||||
return type.getterWithDefault.apply(name, devaultValue);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static EarlybirdProperty[] values() {
|
|
||||||
return ALL_PROPERTIES.toArray(new EarlybirdProperty[0]);
|
|
||||||
}
|
|
||||||
}
|
|
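A minimal usage sketch, not part of the original file: how a caller might read these properties once the flag wiring (createFlag plus the PropertyType getters) has been set up elsewhere in Earlybird. The method name below is illustrative only.

  // Hypothetical illustration of reading EarlybirdProperty values after flags are bound.
  static void logHdfsSettings() {
    boolean loadFromHdfs = EarlybirdProperty.SEGMENT_LOAD_FROM_HDFS_ENABLED.get(false);
    boolean flushToHdfs = EarlybirdProperty.SEGMENT_FLUSH_TO_HDFS_ENABLED.get(false);
    int maxQueueSize = EarlybirdProperty.MAX_QUEUE_SIZE.get(0);
    System.out.println("loadFromHdfs=" + loadFromHdfs
        + " flushToHdfs=" + flushToHdfs
        + " maxQueueSize=" + maxQueueSize);
  }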
@ -1,45 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/commons-io",
        "3rdparty/jvm/geo/google:geoGoogle",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/lucene:lucene-facet",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/src/jvm/com/twitter/scalding:core",
        "3rdparty/src/jvm/com/twitter/scalding:date",
        "3rdparty/src/jvm/com/twitter/scalding:parquet",
        "decider/src/main/scala",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/snowflakeparser",
        "src/java/com/twitter/search/common/schema/earlybird",
        "src/java/com/twitter/search/common/util/hash",
        "src/java/com/twitter/search/common/util/io",
        "src/java/com/twitter/search/common/util/io:dl-reader-writer",
        "src/java/com/twitter/search/common/util/io:flushable",
        "src/java/com/twitter/search/common/util/io:record-reader-api",
        "src/java/com/twitter/search/earlybird/common/config",
        "src/scala/com/twitter/scalding_internal/error_handling",
        "src/scala/com/twitter/scalding_internal/multiformat",
        "src/scala/com/twitter/scalding_internal/source",
        "src/scala/com/twitter/search/user_table/sources",
        "src/thrift/com/twitter/search/common:indexing-java",
        "src/thrift/com/twitter/tweetypie:events-java",
        "util/util-core:scala",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,100 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;

/**
 * Map of users who have actioned to delete location data from their tweets. User IDs are mapped
 * to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId).
 *
 * ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast
 * while writes are done with a lock. This is ideal since many Earlybird Searcher threads could
 * be reading from the map at once, whereas we will only be adding to the map via Kafka.
 *
 * This map is checked against to filter out tweets that should not be returned to geo queries.
 * See: go/realtime-geo-filtering
 */
public class UserScrubGeoMap {
  // The number of geo events that contain a user ID already present in the map. This count is used
  // to verify the number of users in the map against the number of events consumed from kafka.
  private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT =
      SearchCounter.export("user_scrub_geo_event_existing_user_count");
  public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT =
      SearchTimerStats.export("user_scrub_geo_event_lag",
          TimeUnit.MILLISECONDS,
          false,
          true);
  private ConcurrentHashMap<Long, Long> map;

  public UserScrubGeoMap() {
    map = new ConcurrentHashMap<>();
    SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap);
  }

  /**
   * Ensures that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored
   * in the map for the given user id (if any) before updating the entry for this user.
   * This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents
   * appear in the incoming Kafka stream.
   *
   * @param userScrubGeoEvent the event to apply to the map
   */
  public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) {
    long userId = userScrubGeoEvent.getUser_id();
    long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id();
    long oldMaxTweetId = map.getOrDefault(userId, 0L);
    if (map.containsKey(userId)) {
      USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment();
    }
    map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId));
    USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId));
  }

  /**
   * A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's
   * author.
   * If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed.
   *
   * @param tweetId the tweet being checked
   * @param fromUserId the tweet's author
   * @return whether the tweet is geo scrubbed
   */
  public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) {
    return tweetId <= map.getOrDefault(fromUserId, 0L);
  }

  /**
   * The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the
   * UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp.
   * The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests.
   *
   * @param maxTweetId the max tweet id found in the current event
   * @return the lag in milliseconds
   */
  private long computeEventLag(long maxTweetId) {
    long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId);
    return System.currentTimeMillis() - eventCreatedAtTime;
  }

  public long getNumUsersInMap() {
    return map.size();
  }

  public ConcurrentHashMap<Long, Long> getMap() {
    return map;
  }

  public boolean isEmpty() {
    return map.isEmpty();
  }

  public boolean isSet(long userId) {
    return map.containsKey(userId);
  }
}
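A hedged sketch of how this map is exercised (not from the original sources). The thrift setters on UserScrubGeoEvent are assumed to follow the usual set<FieldName> naming that mirrors the getUser_id/getMax_tweet_id accessors used above.

  UserScrubGeoMap geoMap = new UserScrubGeoMap();
  UserScrubGeoEvent event = new UserScrubGeoEvent();
  event.setUser_id(12L);           // assumed thrift setter
  event.setMax_tweet_id(5000L);    // assumed thrift setter
  geoMap.indexUserScrubGeoEvent(event);
  // Tweets by user 12 at or below the scrubbed max id are filtered from geo queries.
  boolean scrubbed = geoMap.isTweetGeoScrubbed(4999L, 12L);     // true
  boolean notScrubbed = geoMap.isTweetGeoScrubbed(5001L, 12L);  // false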
Binary file not shown.
@ -1,572 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Iterator;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.util.hash.GeneralLongHashFunction;

/**
 * Table containing metadata about users, like NSFW or Antisocial status.
 * Used for result filtering.
 */
public class UserTable {
  private static final Logger LOG = LoggerFactory.getLogger(UserTable.class);

  @VisibleForTesting // Not final for testing.
  protected static long userUpdateTableMaxCapacity = 1L << 30;

  private static final int DEFAULT_INITIAL_CAPACITY = 1024;
  private static final int BYTE_WIDTH = 8;

  private static final String USER_TABLE_CAPACITY = "user_table_capacity";
  private static final String USER_TABLE_SIZE = "user_table_size";
  private static final String
      USER_NUM_USERS_WITH_NO_BITS_SET = "user_table_users_with_no_bits_set";
  private static final String USER_TABLE_ANTISOCIAL_USERS = "user_table_antisocial_users";
  private static final String USER_TABLE_OFFENSIVE_USERS = "user_table_offensive_users";
  private static final String USER_TABLE_NSFW_USERS = "user_table_nsfw_users";
  private static final String USER_TABLE_IS_PROTECTED_USERS = "user_table_is_protected_users";

  /**
   * Number of users filtered.
   */
  private static final SearchRateCounter USER_TABLE_USERS_FILTERED_COUNTER =
      new SearchRateCounter("user_table_users_filtered");

  private SearchLongGauge userTableCapacity;
  private SearchLongGauge userTableSize;
  private SearchLongGauge userTableNumUsersWithNoBitsSet;
  private SearchLongGauge userTableAntisocialUsers;
  private SearchLongGauge userTableOffensiveUsers;
  private SearchLongGauge userTableNsfwUsers;
  private SearchLongGauge userTableIsProtectedUsers;

  private final Predicate<Long> userIdFilter;
  private long lastRecordTimestamp;

  private static final class HashTable {
    private int numUsersInTable;
    private int numUsersWithNoBitsSet;
    // size 8 array contains the number of users who have the bit set at the index (0-7) position
    // e.g. setBitCounts[0] stores the number of users who have the 0 bit set in their bytes
    private long[] setBitCounts;

    private final long[] hash;
    private final byte[] bits;

    private final int hashMask;

    HashTable(int size) {
      this.hash = new long[size];
      this.bits = new byte[size];
      this.hashMask = size - 1;
      this.numUsersInTable = 0;
      this.setBitCounts = new long[BYTE_WIDTH];
    }

    protected int hashSize() {
      return hash.length;
    }

    // If we want to decrease the number of users in the table, we can delete as many users
    // as this table returns, by calling filterTableAndCountValidItems.
    public void setCountOfNumUsersWithNoBitsSet() {
      int count = 0;
      for (int i = 0; i < hash.length; i++) {
        if ((hash[i] > 0) && (bits[i] == 0)) {
          count++;
        }
      }

      numUsersWithNoBitsSet = count;
    }

    public void setSetBitCounts() {
      long[] counts = new long[BYTE_WIDTH];
      for (int i = 0; i < hash.length; i++) {
        if (hash[i] > 0) {
          int tempBits = bits[i] & 0xff;
          int curBitPos = 0;
          while (tempBits != 0) {
            if ((tempBits & 1) != 0) {
              counts[curBitPos]++;
            }
            tempBits = tempBits >>> 1;
            curBitPos++;
          }
        }
      }
      setBitCounts = counts;
    }
  }

  public static final int ANTISOCIAL_BIT = 1;
  public static final int OFFENSIVE_BIT = 1 << 1;
  public static final int NSFW_BIT = 1 << 2;
  public static final int IS_PROTECTED_BIT = 1 << 3;

  public long getLastRecordTimestamp() {
    return this.lastRecordTimestamp;
  }

  public void setLastRecordTimestamp(long lastRecordTimestamp) {
    this.lastRecordTimestamp = lastRecordTimestamp;
  }

  public void setOffensive(long userID, boolean offensive) {
    set(userID, OFFENSIVE_BIT, offensive);
  }

  public void setAntisocial(long userID, boolean antisocial) {
    set(userID, ANTISOCIAL_BIT, antisocial);
  }

  public void setNSFW(long userID, boolean nsfw) {
    set(userID, NSFW_BIT, nsfw);
  }

  public void setIsProtected(long userID, boolean isProtected) {
    set(userID, IS_PROTECTED_BIT, isProtected);
  }

  /**
   * Adds the given user update to this table.
   */
  public boolean indexUserUpdate(UserUpdatesChecker checker, UserUpdate userUpdate) {
    if (checker.skipUserUpdate(userUpdate)) {
      return false;
    }

    switch (userUpdate.updateType) {
      case ANTISOCIAL:
        setAntisocial(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case NSFW:
        setNSFW(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case OFFENSIVE:
        setOffensive(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case PROTECTED:
        setIsProtected(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      default:
        return false;
    }

    return true;
  }

  private final AtomicReference<HashTable> hashTable = new AtomicReference<>();

  private int hashCode(long userID) {
    return (int) GeneralLongHashFunction.hash(userID);
  }

  /**
   * Returns an iterator for user IDs that have at least one of the bits set.
   */
  public Iterator<Long> getFlaggedUserIdIterator() {
    HashTable table = hashTable.get();

    final long[] currUserIdTable = table.hash;
    final byte[] currBitsTable = table.bits;
    return new Iterator<Long>() {
      private int index = findNext(0);

      private int findNext(int index) {
        int startingIndex = index;
        while (startingIndex < currUserIdTable.length) {
          if (currUserIdTable[startingIndex] != 0 && currBitsTable[startingIndex] != 0) {
            break;
          }
          ++startingIndex;
        }
        return startingIndex;
      }

      @Override
      public boolean hasNext() {
        return index < currUserIdTable.length;
      }

      @Override
      public Long next() {
        Long r = currUserIdTable[index];
        index = findNext(index + 1);
        return r;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Constructs a UserTable with a given HashTable instance.
   * Use <code>userIdFilter</code> as a Predicate that returns true for the elements
   * that need to be kept in the table.
   * Use shouldRehash to force a rehashing of the given HashTable.
   */
  private UserTable(HashTable hashTable, Predicate<Long> userIdFilter,
                    boolean shouldRehash) {

    Preconditions.checkNotNull(userIdFilter);

    this.hashTable.set(hashTable);
    this.userIdFilter = userIdFilter;

    exportUserUpdatesTableStats();

    LOG.info("User table num users: {}. Users with no bits set: {}. "
        + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
        this.getNumUsersInTable(),
        this.getNumUsersWithNoBitsSet(),
        this.getSetBitCount(ANTISOCIAL_BIT),
        this.getSetBitCount(OFFENSIVE_BIT),
        this.getSetBitCount(NSFW_BIT),
        this.getSetBitCount(IS_PROTECTED_BIT));

    if (shouldRehash) {
      int filteredTableSize = filterTableAndCountValidItems();
      // Having exactly 100% usage can impact lookup. Maintain the table at under 50% usage.
      int newTableCapacity = computeDesiredHashTableCapacity(filteredTableSize * 2);

      rehash(newTableCapacity);

      LOG.info("User table num users after rehash: {}. Users with no bits set: {}. "
          + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
          this.getNumUsersInTable(),
          this.getNumUsersWithNoBitsSet(),
          this.getSetBitCount(ANTISOCIAL_BIT),
          this.getSetBitCount(OFFENSIVE_BIT),
          this.getSetBitCount(NSFW_BIT),
          this.getSetBitCount(IS_PROTECTED_BIT));
    }
  }

  private UserTable(int initialSize, Predicate<Long> userIdFilter) {
    this(new HashTable(computeDesiredHashTableCapacity(initialSize)), userIdFilter, false);
  }

  @VisibleForTesting
  public UserTable(int initialSize) {
    this(initialSize, userId -> true);
  }

  public static UserTable
      newTableWithDefaultCapacityAndPredicate(Predicate<Long> userIdFilter) {

    return new UserTable(DEFAULT_INITIAL_CAPACITY, userIdFilter);
  }

  public static UserTable newTableNonFilteredWithDefaultCapacity() {
    return newTableWithDefaultCapacityAndPredicate(userId -> true);
  }

  private void exportUserUpdatesTableStats() {
    userTableSize = SearchLongGauge.export(USER_TABLE_SIZE);
    userTableCapacity = SearchLongGauge.export(USER_TABLE_CAPACITY);
    userTableNumUsersWithNoBitsSet = SearchLongGauge.export(
        USER_NUM_USERS_WITH_NO_BITS_SET
    );
    userTableAntisocialUsers = SearchLongGauge.export(USER_TABLE_ANTISOCIAL_USERS);
    userTableOffensiveUsers = SearchLongGauge.export(USER_TABLE_OFFENSIVE_USERS);
    userTableNsfwUsers = SearchLongGauge.export(USER_TABLE_NSFW_USERS);
    userTableIsProtectedUsers = SearchLongGauge.export(USER_TABLE_IS_PROTECTED_USERS);

    LOG.info(
        "Exporting stats for user table. Starting with numUsersInTable={}, usersWithZeroBits={}, "
            + "antisocialUsers={}, offensiveUsers={}, nsfwUsers={}, isProtectedUsers={}.",
        getNumUsersInTable(),
        getNumUsersWithNoBitsSet(),
        getSetBitCount(ANTISOCIAL_BIT),
        getSetBitCount(OFFENSIVE_BIT),
        getSetBitCount(NSFW_BIT),
        getSetBitCount(IS_PROTECTED_BIT));
    updateStats();
  }

  private void updateStats() {
    HashTable table = this.hashTable.get();
    userTableSize.set(table.numUsersInTable);
    userTableNumUsersWithNoBitsSet.set(table.numUsersWithNoBitsSet);
    userTableCapacity.set(table.hashSize());
    userTableAntisocialUsers.set(getSetBitCount(ANTISOCIAL_BIT));
    userTableOffensiveUsers.set(getSetBitCount(OFFENSIVE_BIT));
    userTableNsfwUsers.set(getSetBitCount(NSFW_BIT));
    userTableIsProtectedUsers.set(getSetBitCount(IS_PROTECTED_BIT));
  }

  /**
   * Computes the size of the hash table as the first power of two greater than or equal to
   * initialSize.
   */
  private static int computeDesiredHashTableCapacity(int initialSize) {
    long powerOfTwoSize = 2;
    while (initialSize > powerOfTwoSize) {
      powerOfTwoSize *= 2;
    }
    if (powerOfTwoSize > Integer.MAX_VALUE) {
      LOG.error("Error: powerOfTwoSize overflowed Integer.MAX_VALUE! Initial size: " + initialSize);
      powerOfTwoSize = 1 << 30; // max power of 2
    }

    return (int) powerOfTwoSize;
  }

  public int getNumUsersInTable() {
    return hashTable.get().numUsersInTable;
  }

  /**
   * Gets the number of users who have the bit set at the `userStateBit` position.
   */
  public long getSetBitCount(int userStateBit) {
    int bit = userStateBit;
    int bitPosition = 0;
    while (bit != 0 && (bit & 1) == 0) {
      bit = bit >>> 1;
      bitPosition++;
    }
    return hashTable.get().setBitCounts[bitPosition];
  }

  public Predicate<Long> getUserIdFilter() {
    return userIdFilter::test;
  }

  /**
   * Updates a user flag in this table.
   */
  public final void set(long userID, int bit, boolean value) {
    // if userID is filtered return immediately
    if (!shouldKeepUser(userID)) {
      USER_TABLE_USERS_FILTERED_COUNTER.increment();
      return;
    }

    HashTable table = this.hashTable.get();

    int hashPos = findHashPosition(table, userID);
    long item = table.hash[hashPos];
    byte bits = 0;
    int bitsDiff = 0;

    if (item != 0) {
      byte bitsOriginally = bits = table.bits[hashPos];
      if (value) {
        bits |= bit;
      } else {
        // AND'ing with the inverse map clears the desired bit, but
        // doesn't change any of the other bits
        bits &= ~bit;
      }

      // Find the changed bits after the above operation, it is possible that no bit is changed if
      // the input 'bit' is already set/unset in the table.
      // Since bitwise operators cannot be directly applied on Byte, Byte is promoted into int to
      // apply the operators. When that happens, if the most significant bit of the Byte is set,
      // the promoted int has all significant bits set to 1. 0xff bitmask is applied here to make
      // sure only the last 8 bits are considered.
      bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);

      if (bitsOriginally > 0 && bits == 0) {
        table.numUsersWithNoBitsSet++;
      } else if (bitsOriginally == 0 && bits > 0) {
        table.numUsersWithNoBitsSet--;
      }
    } else {
      if (!value) {
        // no need to add this user, since all bits would be false anyway
        return;
      }

      // New user entry.
      if (table.numUsersInTable + 1 >= (table.hashSize() >> 1)
          && table.hashSize() != userUpdateTableMaxCapacity) {
        if (2L * (long) table.hashSize() < userUpdateTableMaxCapacity) {
          rehash(2 * table.hashSize());
          table = this.hashTable.get();
        } else {
          if (table.hashSize() < (int) userUpdateTableMaxCapacity) {
            rehash((int) userUpdateTableMaxCapacity);
            table = this.hashTable.get();
            LOG.warn("User update table size reached Integer.MAX_VALUE, performance will degrade.");
          }
        }

        // Must repeat this operation with the resized hashTable.
        hashPos = findHashPosition(table, userID);
      }

      item = userID;
      bits |= bit;
      bitsDiff = bit & 0xff;

      table.numUsersInTable++;
    }

    table.hash[hashPos] = item;
    table.bits[hashPos] = bits;

    // update setBitCounts for the changed bits after applying the input 'bit'
    int curBitsDiffPos = 0;
    while (bitsDiff != 0) {
      if ((bitsDiff & 1) != 0) {
        if (value) {
          table.setBitCounts[curBitsDiffPos]++;
        } else {
          table.setBitCounts[curBitsDiffPos]--;
        }
      }
      bitsDiff = bitsDiff >>> 1;
      curBitsDiffPos++;
    }

    updateStats();
  }

  public final boolean isSet(long userID, int bits) {
    HashTable table = hashTable.get();
    int hashPos = findHashPosition(table, userID);
    return table.hash[hashPos] != 0 && (table.bits[hashPos] & bits) != 0;
  }

  /**
   * Returns true when the userIdFilter condition is met.
   * If the filter is not present, returns true.
   */
  private boolean shouldKeepUser(long userID) {
    return userIdFilter.test(userID);
  }

  private int findHashPosition(final HashTable table, final long userID) {
    int code = hashCode(userID);
    int hashPos = code & table.hashMask;

    // Locate user in hash
    long item = table.hash[hashPos];

    if (item != 0 && item != userID) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code >> 8) + code) | 1;
      do {
        code += inc;
        hashPos = code & table.hashMask;
        item = table.hash[hashPos];
      } while (item != 0 && item != userID);
    }

    return hashPos;
  }

  /**
   * Applies the filtering predicate and returns the size of the filtered table.
   */
  private synchronized int filterTableAndCountValidItems() {
    final HashTable oldTable = this.hashTable.get();
    int newSize = 0;

    int clearNoItemSet = 0;
    int clearNoBitsSet = 0;
    int clearDontKeepUser = 0;

    for (int i = 0; i < oldTable.hashSize(); i++) {
      final long item = oldTable.hash[i]; // this is the userID
      final byte bits = oldTable.bits[i];

      boolean clearSlot = false;
      if (item == 0) {
        clearSlot = true;
        clearNoItemSet++;
      } else if (bits == 0) {
        clearSlot = true;
        clearNoBitsSet++;
      } else if (!shouldKeepUser(item)) {
        clearSlot = true;
        clearDontKeepUser++;
      }

      if (clearSlot) {
        oldTable.hash[i] = 0;
        oldTable.bits[i] = 0;
      } else {
        newSize += 1;
      }
    }

    oldTable.setCountOfNumUsersWithNoBitsSet();
    oldTable.setSetBitCounts();

    LOG.info("Done filtering table: clearNoItemSet={}, clearNoBitsSet={}, clearDontKeepUser={}",
        clearNoItemSet, clearNoBitsSet, clearDontKeepUser);

    return newSize;
  }

  /**
   * Called when the hash table is too small (> 50% occupied).
   */
  private void rehash(final int newSize) {
    final HashTable oldTable = this.hashTable.get();
    final HashTable newTable = new HashTable(newSize);

    final int newMask = newTable.hashMask;
    final long[] newHash = newTable.hash;
    final byte[] newBits = newTable.bits;

    for (int i = 0; i < oldTable.hashSize(); i++) {
      final long item = oldTable.hash[i];
      final byte bits = oldTable.bits[i];
      if (item != 0 && bits != 0) {
        int code = hashCode(item);

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != 0) {
          final int inc = ((code >> 8) + code) | 1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != 0);
        }
        newHash[hashPos] = item;
        newBits[hashPos] = bits;
        newTable.numUsersInTable++;
      }
    }

    newTable.setCountOfNumUsersWithNoBitsSet();
    newTable.setSetBitCounts();
    this.hashTable.set(newTable);

    updateStats();
  }

  public void setTable(UserTable newTable) {
    hashTable.set(newTable.hashTable.get());
    updateStats();
  }

  @VisibleForTesting
  protected int getHashTableCapacity() {
    return hashTable.get().hashSize();
  }

  @VisibleForTesting
  protected int getNumUsersWithNoBitsSet() {
    return hashTable.get().numUsersWithNoBitsSet;
  }
}
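An illustrative sketch of the bit-flag API above (not part of the original file): the flags are OR-able bit masks, so isSet can test several states in one call.

  UserTable table = UserTable.newTableNonFilteredWithDefaultCapacity();
  table.setNSFW(42L, true);
  table.setAntisocial(43L, true);
  // Check a single flag, or any of several flags at once.
  boolean nsfw = table.isSet(42L, UserTable.NSFW_BIT);                                // true
  boolean flagged = table.isSet(43L, UserTable.NSFW_BIT | UserTable.ANTISOCIAL_BIT);  // true
  long nsfwUsers = table.getSetBitCount(UserTable.NSFW_BIT);                          // 1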
Binary file not shown.
@ -1,263 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.hadoop.HdfsUtils;
import com.twitter.scalding.DateRange;
import com.twitter.scalding.Hours;
import com.twitter.scalding.RichDate;
import com.twitter.search.user_table.sources.MostRecentGoodSafetyUserStateSource;
import com.twitter.search.common.indexing.thriftjava.SafetyUserState;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.util.Duration;
import com.twitter.util.Time;

/**
 * Builds a user table from a user safety snapshot on HDFS.
 */
public class UserTableBuilderFromSnapshot {
  private static final Logger LOG = LoggerFactory.getLogger(UserTableBuilderFromSnapshot.class);

  private static final int MAX_DAYS_TO_CHECK = 7;
  public static final String DATA_DIR = "user_states";
  public static final String METADATA_DIR = "last_updated_ms";

  private final String snapshotBaseDir;

  private String snapshotDataPath;
  private String snapshotMetaDataPath;
  private UserTable userTable;

  private long nsfwCount;
  private long antisocialCount;
  private long isProtectedCount;

  public UserTableBuilderFromSnapshot() {
    snapshotBaseDir =
        EarlybirdConfig.getString(EarlybirdConfig.USER_SNAPSHOT_BASE_DIR, null);

    LOG.info("Configured user snapshot directory: " + snapshotBaseDir);
  }

  private static final class UserUpdate {
    public final long userId;
    @Nullable public final Boolean antisocial;
    @Nullable public final Boolean nsfw;
    @Nullable public final Boolean isProtected;

    private UserUpdate(long userId,
                       @Nullable Boolean antisocial,
                       @Nullable Boolean nsfw,
                       @Nullable Boolean isProtected) {
      this.userId = userId;
      this.antisocial = antisocial;
      this.nsfw = nsfw;
      this.isProtected = isProtected;
    }

    public static UserUpdate fromUserState(SafetyUserState safetyUserState) {
      long userId = safetyUserState.getUserID();
      @Nullable Boolean antisocial = null;
      @Nullable Boolean nsfw = null;
      @Nullable Boolean isProtected = null;

      if (safetyUserState.isIsAntisocial()) {
        antisocial = true;
      }
      if (safetyUserState.isIsNsfw()) {
        nsfw = true;
      }
      if (safetyUserState.isSetIsProtected() && safetyUserState.isIsProtected()) {
        isProtected = true;
      }

      return new UserUpdate(userId, antisocial, nsfw, isProtected);
    }
  }

  /**
   * Builds a user table from an HDFS user snapshot.
   * @return The table, or nothing if something went wrong.
   */
  public Optional<UserTable> build(Predicate<Long> userFilter) {
    userTable = UserTable.newTableWithDefaultCapacityAndPredicate(userFilter);
    nsfwCount = 0;
    antisocialCount = 0;
    isProtectedCount = 0;

    if (snapshotBaseDir == null || snapshotBaseDir.isEmpty()) {
      LOG.info("No snapshot directory. Can't build user table.");
      return Optional.empty();
    }

    LOG.info("Starting to build user table.");

    Stream<UserUpdate> stream = null;

    try {
      setSnapshotPath();

      stream = getUserUpdates();
      stream.forEach(this::insertUser);
    } catch (IOException e) {
      LOG.error("IOException while building table: {}", e.getMessage(), e);

      return Optional.empty();
    } finally {
      if (stream != null) {
        stream.close();
      }
    }

    LOG.info("Built user table with {} users, {} nsfw, {} antisocial and {} protected.",
        userTable.getNumUsersInTable(),
        nsfwCount,
        antisocialCount,
        isProtectedCount);

    try {
      userTable.setLastRecordTimestamp(readTimestampOfLastSeenUpdateFromSnapshot());
    } catch (IOException e) {
      LOG.error("IOException reading timestamp of last update: {}", e.getMessage(), e);
      return Optional.empty();
    }

    LOG.info("Setting last record timestamp to {}.", userTable.getLastRecordTimestamp());

    return Optional.of(userTable);
  }

  private void setSnapshotPath() {
    snapshotDataPath =
        new MostRecentGoodSafetyUserStateSource(
            snapshotBaseDir,
            DATA_DIR,
            METADATA_DIR,
            DateRange.apply(
                RichDate.now().$minus(Hours.apply(MAX_DAYS_TO_CHECK * 24)),
                RichDate.now())
        ).partitionHdfsPaths(new HdfsConfiguration())
            ._1()
            .head()
            .replaceAll("\\*$", "");
    snapshotMetaDataPath = snapshotDataPath.replace(DATA_DIR, METADATA_DIR);

    LOG.info("Snapshot data path: {}", snapshotDataPath);
    LOG.info("Snapshot metadata path: {}", snapshotMetaDataPath);
  }

  private Stream<UserUpdate> getUserUpdates() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    List<String> lzoFiles =
        Arrays.stream(fs.listStatus(new Path(snapshotDataPath),
            path -> path.getName().startsWith("part-")))
            .map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())
                .toString())
            .collect(Collectors.toList());

    final LzoThriftBlockFileReader<SafetyUserState> thriftReader =
        new LzoThriftBlockFileReader<>(lzoFiles, SafetyUserState.class, null);

    Iterator<UserUpdate> iter = new Iterator<UserUpdate>() {
      private SafetyUserState next;

      @Override
      public boolean hasNext() {
        if (next != null) {
          return true;
        }

        do {
          try {
            next = thriftReader.readNext();
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        } while (next == null && !thriftReader.isExhausted());
        return next != null;
      }

      @Override
      public UserUpdate next() {
        if (next != null || hasNext()) {
          UserUpdate userUpdate = UserUpdate.fromUserState(next);
          next = null;
          return userUpdate;
        }
        throw new NoSuchElementException();
      }
    };

    return StreamSupport
        .stream(
            Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL),
            false)
        .onClose(thriftReader::stop);
  }

  private long readTimestampOfLastSeenUpdateFromSnapshot() throws IOException {
    String timestampFile = snapshotMetaDataPath + "part-00000";
    BufferedReader buffer = new BufferedReader(new InputStreamReader(
        HdfsUtils.getInputStreamSupplier(timestampFile).openStream()));

    long timestampMillis = Long.parseLong(buffer.readLine());
    LOG.info("read timestamp {} from HDFS:{}", timestampMillis, timestampFile);

    Time time = Time.fromMilliseconds(timestampMillis)
        .minus(Duration.fromTimeUnit(10, TimeUnit.MINUTES));
    return time.inMilliseconds();
  }

  private void insertUser(UserUpdate userUpdate) {
    if (userUpdate == null) {
      return;
    }

    if (userUpdate.antisocial != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.ANTISOCIAL_BIT,
          userUpdate.antisocial);
      antisocialCount++;
    }

    if (userUpdate.nsfw != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.NSFW_BIT,
          userUpdate.nsfw);
      nsfwCount++;
    }

    if (userUpdate.isProtected != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.IS_PROTECTED_BIT,
          userUpdate.isProtected);
      isProtectedCount++;
    }
  }
}
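A hedged usage sketch (not in the original sources): building the table from the configured HDFS snapshot and falling back to an empty, unfiltered table when the snapshot is unavailable.

  UserTableBuilderFromSnapshot builder = new UserTableBuilderFromSnapshot();
  UserTable userTable = builder
      .build(userId -> true)  // keep every user; a partition-specific predicate is typical
      .orElseGet(UserTable::newTableNonFilteredWithDefaultCapacity);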
Binary file not shown.
@ -1,38 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;

import com.twitter.search.common.indexing.thriftjava.UserUpdateType;

/**
 * Contains an update for a user.
 */
public class UserUpdate {
  public final long twitterUserID;
  public final UserUpdateType updateType;
  public final int updateValue;
  private final Date updatedAt;

  public UserUpdate(long twitterUserID,
                    UserUpdateType updateType,
                    int updateValue,
                    Date updatedAt) {

    this.twitterUserID = twitterUserID;
    this.updateType = updateType;
    this.updateValue = updateValue;
    this.updatedAt = (Date) updatedAt.clone();
  }

  @Override public String toString() {
    return "UserInfoUpdate[userID=" + twitterUserID + ",updateType=" + updateType
        + ",updateValue=" + updateValue + ",updatedAt=" + getUpdatedAt() + "]";
  }

  /**
   * Returns a copy of the updated-at date.
   */
  public Date getUpdatedAt() {
    return (Date) updatedAt.clone();
  }
}
Binary file not shown.
@ -1,70 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;
import java.util.concurrent.TimeUnit;

import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

/**
 * Contains logic for deciding whether to apply a certain user update to the {@link UserTable}.
 */
public class UserUpdatesChecker {
  private final Date antisocialStartDate;
  private final Decider decider;
  private final boolean isFullArchiveCluster;

  public UserUpdatesChecker(Clock clock, Decider decider, EarlybirdCluster cluster) {
    // How many days of antisocial users to keep. A value of -1 means keeping all user updates.
    long antisocialRecordDays =
        EarlybirdConfig.getLong("keep_recent_antisocial_user_updates_days", 30);
    this.antisocialStartDate = antisocialRecordDays > 0
        ? new Date(clock.nowMillis() - TimeUnit.DAYS.toMillis(antisocialRecordDays)) : null;
    this.decider = decider;
    this.isFullArchiveCluster = cluster == EarlybirdCluster.FULL_ARCHIVE;
  }

  /**
   * Decides whether to skip the given UserInfoUpdate.
   */
  public boolean skipUserUpdate(UserUpdate userUpdate) {
    if (userUpdate == null) { // always skip null updates
      return true;
    }

    UserUpdateType type = userUpdate.updateType;

    if (type == UserUpdateType.PROTECTED && skipProtectedUserUpdate()) {
      return true;
    }

    if (type == UserUpdateType.ANTISOCIAL && skipAntisocialUserUpdate(userUpdate)) {
      return true;
    }

    // NSFW users can continue to tweet even after they are marked as NSFW. That means
    // that the snapshot needs to have all NSFW users from the beginning of time. Hence, no NSFW
    // user updates check here.

    // Passed all checks; do not skip this user update.
    return false;
  }

  // Antisocial/suspended users can't tweet after they are suspended. Thus if our index stores
  // tweets from the last 10 days, and they were suspended 60 days ago, we don't need them since
  // there will be no tweets from them. We can save space by not storing info about those users.

  // (For archive, at rebuild time we filter out all suspended users' tweets, so for a user that
  // was suspended before a rebuild, there is no need to use space to store that the user is
  // suspended.)
  private boolean skipAntisocialUserUpdate(UserUpdate userUpdate) {
    return antisocialStartDate != null && userUpdate.getUpdatedAt().before(antisocialStartDate);
  }

  // Skip protected user updates for the realtime and protected clusters.
  private boolean skipProtectedUserUpdate() {
    return !isFullArchiveCluster;
  }
}
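A sketch of how the checker gates updates into the UserTable (illustrative only; the decider and userTable variables are assumed to come from the server's wiring and are not defined here):

  UserUpdatesChecker checker =
      new UserUpdatesChecker(Clock.SYSTEM_CLOCK, decider, EarlybirdCluster.FULL_ARCHIVE);
  UserUpdate update = new UserUpdate(42L, UserUpdateType.NSFW, 1, new Date());
  // indexUserUpdate consults skipUserUpdate() before mutating the table.
  boolean applied = userTable.indexUserUpdate(checker, update);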
@ -1,21 +0,0 @@
java_library(
    sources = ["**/*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/snowflakeparser",
        "src/java/com/twitter/search/common/util/date",
        "src/java/com/twitter/search/common/util/zookeeper",
        "src/java/com/twitter/search/earlybird/common/config",
    ],
)
BIN
src/java/com/twitter/search/earlybird/config/BUILD.docx
Normal file
Binary file not shown.
BIN
src/java/com/twitter/search/earlybird/config/ServingRange.docx
Normal file
Binary file not shown.
@ -1,26 +0,0 @@
package com.twitter.search.earlybird.config;

/**
 * An interface for abstracting a tier's serving range.
 */
public interface ServingRange {
  /**
   * Returns the serving range's lowest tweet ID.
   */
  long getServingRangeSinceId();

  /**
   * Returns the serving range's highest tweet ID.
   */
  long getServingRangeMaxId();

  /**
   * Returns the serving range's earliest time, in seconds since epoch.
   */
  long getServingRangeSinceTimeSecondsFromEpoch();

  /**
   * Returns the serving range's latest time, in seconds since epoch.
   */
  long getServingRangeUntilTimeSecondsFromEpoch();
}
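A minimal illustrative implementation of this interface (not from the repository), fixing a serving window by tweet IDs and timestamps:

  class FixedServingRange implements ServingRange {
    private final long sinceId;
    private final long maxId;
    private final long sinceSeconds;
    private final long untilSeconds;

    FixedServingRange(long sinceId, long maxId, long sinceSeconds, long untilSeconds) {
      this.sinceId = sinceId;
      this.maxId = maxId;
      this.sinceSeconds = sinceSeconds;
      this.untilSeconds = untilSeconds;
    }

    @Override public long getServingRangeSinceId() {
      return sinceId;
    }

    @Override public long getServingRangeMaxId() {
      return maxId;
    }

    @Override public long getServingRangeSinceTimeSecondsFromEpoch() {
      return sinceSeconds;
    }

    @Override public long getServingRangeUntilTimeSecondsFromEpoch() {
      return untilSeconds;
    }
  }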
BIN
src/java/com/twitter/search/earlybird/config/TierConfig.docx
Normal file
Binary file not shown.
@ -1,175 +0,0 @@
|
|||||||
package com.twitter.search.earlybird.config;
|
|
||||||
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
|
||||||
|
|
||||||
import com.twitter.common.util.Clock;
|
|
||||||
import com.twitter.search.common.config.Config;
|
|
||||||
import com.twitter.search.common.config.ConfigFile;
|
|
||||||
import com.twitter.search.common.config.ConfigurationException;
|
|
||||||
import com.twitter.search.common.metrics.SearchLongGauge;
|
|
||||||
import com.twitter.search.common.util.date.DateUtil;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class provides APIs to access the tier configurations for a cluster.
|
|
||||||
* Each tier has tier name, number of partitions, tier start time and end time.
|
|
||||||
*/
|
|
||||||
public final class TierConfig {
|
|
||||||
private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TierConfig.class);
|
|
||||||
|
|
||||||
private static final String DEFAULT_CONFIG_DIR = "common/config";
|
|
||||||
public static final String DEFAULT_TIER_FILE = "earlybird-tiers.yml";
|
|
||||||
|
|
||||||
public static final Date DEFAULT_TIER_START_DATE = DateUtil.toDate(2006, 3, 21);
|
|
||||||
// It's convenient for DEFAULT_TIER_END_DATE to be before ~2100, because then the output of
|
|
||||||
// FieldTermCounter.getHourValue(DEFAULT_TIER_END_END_DATE) can still fit into an integer.
|
|
||||||
public static final Date DEFAULT_TIER_END_DATE = DateUtil.toDate(2099, 1, 1);
|
|
||||||
|
|
||||||
public static final String DEFAULT_TIER_NAME = "all";
|
|
||||||
public static final boolean DEFAULT_ENABLED = true;
|
|
||||||
public static final TierInfo.RequestReadType DEFAULT_READ_TYPE = TierInfo.RequestReadType.LIGHT;
|
|
||||||
|
|
||||||
private static ConfigFile tierConfigFile = null;
|
|
||||||
private static ConfigSource tierConfigSource = null;
|
|
||||||
|
|
||||||
public enum ConfigSource {
|
|
||||||
LOCAL,
|
|
||||||
ZOOKEEPER
|
|
||||||
}
|
|
||||||
|
|
||||||
private TierConfig() { }
|
|
||||||
|
|
||||||
private static synchronized void init() {
|
|
||||||
if (tierConfigFile == null) {
|
|
||||||
tierConfigFile = new ConfigFile(DEFAULT_CONFIG_DIR, DEFAULT_TIER_FILE);
|
|
||||||
tierConfigSource = ConfigSource.LOCAL;
|
|
||||||
SearchLongGauge.export("tier_config_source_" + tierConfigSource.name()).set(1);
|
|
||||||
LOG.info("Tier config file " + DEFAULT_TIER_FILE + " is successfully loaded from bundle.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static ConfigFile getConfigFile() {
|
|
||||||
init();
|
|
||||||
return tierConfigFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String getConfigFileName() {
|
|
||||||
return getConfigFile().getConfigFileName();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return all the tier names specified in the config file.
|
|
||||||
*/
|
|
||||||
public static Set<String> getTierNames() {
|
|
||||||
return Config.getConfig().getMapCopy(getConfigFileName()).keySet();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the value of the given tier config property to the given value.
|
|
||||||
*/
|
|
||||||
public static void setForTests(String property, Object value) {
|
|
||||||
Config.getConfig().setForTests(DEFAULT_TIER_FILE, property, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the config info for the specified tier.
|
|
||||||
*/
|
|
||||||
public static TierInfo getTierInfo(String tierName) {
|
|
||||||
return getTierInfo(tierName, null /* use current environment */);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the config info for the specified tier and environment.
|
|
||||||
*/
|
|
||||||
public static TierInfo getTierInfo(String tierName, @Nullable String environment) {
|
|
||||||
String tierConfigFileType = getConfigFileName();
|
|
||||||
Map<String, Object> tierInfo;
|
|
||||||
try {
|
|
||||||
tierInfo = (Map<String, Object>) Config.getConfig()
|
|
||||||
.getFromEnvironment(environment, tierConfigFileType, tierName);
|
|
||||||
} catch (ConfigurationException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
    if (tierInfo == null) {
      LOG.error("Cannot find tier config for "
          + tierName + " in config file: " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long partitions = (Long) tierInfo.get("number_of_partitions");
    if (partitions == null) {
      LOG.error("No number of partitions is specified for tier "
          + tierName + " in tier config file " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long numTimeslices = (Long) tierInfo.get("serving_timeslices");
    if (numTimeslices == null) {
      LOG.info("No max timeslices specified for tier "
          + tierName + " in tier config file " + tierConfigFileType
          + ", not setting a cap on the number of serving timeslices");
      // NOTE: we use max int32 here because it will ultimately be cast to an int, but the config
      // map expects Longs for all integral types. Using Long.MAX_VALUE leads to max serving
      // timeslices being set to -1 when it is truncated to an int.
      numTimeslices = (long) Integer.MAX_VALUE;
    }

    Date tierStartDate = (Date) tierInfo.get("data_range_start_date_inclusive");
    if (tierStartDate == null) {
      tierStartDate = DEFAULT_TIER_START_DATE;
    }
    Date tierEndDate = (Date) tierInfo.get("data_range_end_date_exclusive");
    if (tierEndDate == null) {
      tierEndDate = DEFAULT_TIER_END_DATE;
    }

    Boolean tierEnabled = (Boolean) tierInfo.get("tier_enabled");
    if (tierEnabled == null) {
      tierEnabled = DEFAULT_ENABLED;
    }

    TierInfo.RequestReadType readType =
        getRequestReadType((String) tierInfo.get("tier_read_type"), DEFAULT_READ_TYPE);
    TierInfo.RequestReadType readTypeOverride =
        getRequestReadType((String) tierInfo.get("tier_read_type_override"), readType);

    return new TierInfo(
        tierName,
        tierStartDate,
        tierEndDate,
        partitions.intValue(),
        numTimeslices.intValue(),
        tierEnabled,
        (String) tierInfo.get("serving_range_since_id_exclusive"),
        (String) tierInfo.get("serving_range_max_id_inclusive"),
        (Date) tierInfo.get("serving_range_start_date_inclusive_override"),
        (Date) tierInfo.get("serving_range_end_date_exclusive_override"),
        readType,
        readTypeOverride,
        Clock.SYSTEM_CLOCK);
  }

  public static synchronized void clear() {
    tierConfigFile = null;
    tierConfigSource = null;
  }

  protected static synchronized ConfigSource getTierConfigSource() {
    return tierConfigSource;
  }

  private static TierInfo.RequestReadType getRequestReadType(
      String readTypeEnumName, TierInfo.RequestReadType defaultReadType) {
    TierInfo.RequestReadType readType = defaultReadType;
    if (readTypeEnumName != null) {
      readType = TierInfo.RequestReadType.valueOf(readTypeEnumName.trim().toUpperCase());
      Preconditions.checkState(readType != null);
    }
    return readType;
  }
}
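A minimal standalone sketch (the class name is hypothetical, not part of the Earlybird sources) of the narrowing-cast behaviour the NOTE in getTierInfo relies on: truncating Long.MAX_VALUE to an int yields -1, so the uncapped case uses (long) Integer.MAX_VALUE, which survives the later intValue() conversion intact.

public final class MaxTimeslicesTruncationDemo {
  public static void main(String[] args) {
    // The low 32 bits of Long.MAX_VALUE are all ones, so the narrowing cast gives -1.
    System.out.println((int) Long.MAX_VALUE);            // -1
    // Integer.MAX_VALUE round-trips through long unchanged.
    System.out.println((int) (long) Integer.MAX_VALUE);  // 2147483647
  }
}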
BIN src/java/com/twitter/search/earlybird/config/TierInfo.docx Normal file
Binary file not shown.
@ -1,180 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;

/**
 * Properties of a single tier.
 */
public class TierInfo implements ServingRange {
  // What I'm seeing historically is that this has been used when adding a new tier. First you
  // add it and send dark traffic to it, then possibly grey and then you launch it by turning on
  // light traffic.
  public static enum RequestReadType {
    // Light read: send request, wait for results, and results are returned
    LIGHT,
    // Dark read: send request, do not wait for results, and results are discarded
    DARK,
    // Grey read: send request, wait for results, but discard after results come back.
    // Same results as dark read; similar latency as light read.
    GREY,
  }

  private final String tierName;
  private final Date dataStartDate;
  private final Date dataEndDate;
  private final int numPartitions;
  private final int maxTimeslices;
  private final TierServingBoundaryEndPoint servingRangeSince;
  private final TierServingBoundaryEndPoint servingRangeMax;
  private final TierServingBoundaryEndPoint servingRangeSinceOverride;
  private final TierServingBoundaryEndPoint servingRangeMaxOverride;

  // These two properties are only used by clients of Earlybird (e.g. roots),
  // but not by Earlybirds.
  private final boolean enabled;
  private final RequestReadType readType;
  private final RequestReadType readTypeOverride;

  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  Date servingStartDateOverride,
                  Date servingEndDateOverride,
                  RequestReadType readType,
                  RequestReadType readTypeOverride,
                  Clock clock) {
    Preconditions.checkArgument(numPartitions > 0);
    Preconditions.checkArgument(maxTimeslices > 0);
    this.tierName = tierName;
    this.dataStartDate = dataStartDate;
    this.dataEndDate = dataEndDate;
    this.numPartitions = numPartitions;
    this.maxTimeslices = maxTimeslices;
    this.enabled = enabled;
    this.readType = readType;
    this.readTypeOverride = readTypeOverride;
    this.servingRangeSince = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(sinceIdString, dataStartDate, clock);
    this.servingRangeMax = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(maxIdString, dataEndDate, clock);
    if (servingStartDateOverride != null) {
      this.servingRangeSinceOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingStartDateOverride, clock);
    } else {
      this.servingRangeSinceOverride = servingRangeSince;
    }

    if (servingEndDateOverride != null) {
      this.servingRangeMaxOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingEndDateOverride, clock);
    } else {
      this.servingRangeMaxOverride = servingRangeMax;
    }
  }

  @VisibleForTesting
  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  RequestReadType readType,
                  Clock clock) {
    // No overrides:
    //   servingRangeSinceOverride == servingRangeSince
    //   servingRangeMaxOverride == servingRangeMax
    //   readTypeOverride == readType
    this(tierName, dataStartDate, dataEndDate, numPartitions, maxTimeslices, enabled, sinceIdString,
        maxIdString, null, null, readType, readType, clock);
  }

  @Override
  public String toString() {
    return tierName;
  }

  public String getTierName() {
    return tierName;
  }

  public Date getDataStartDate() {
    return dataStartDate;
  }

  public Date getDataEndDate() {
    return dataEndDate;
  }

  public int getNumPartitions() {
    return numPartitions;
  }

  public int getMaxTimeslices() {
    return maxTimeslices;
  }

  public TierConfig.ConfigSource getSource() {
    return TierConfig.getTierConfigSource();
  }

  public boolean isEnabled() {
    return enabled;
  }

  public boolean isDarkRead() {
    return readType == RequestReadType.DARK;
  }

  public RequestReadType getReadType() {
    return readType;
  }

  public RequestReadType getReadTypeOverride() {
    return readTypeOverride;
  }

  public long getServingRangeSinceId() {
    return servingRangeSince.getBoundaryTweetId();
  }

  public long getServingRangeMaxId() {
    return servingRangeMax.getBoundaryTweetId();
  }

  long getServingRangeOverrideSinceId() {
    return servingRangeSinceOverride.getBoundaryTweetId();
  }

  long getServingRangeOverrideMaxId() {
    return servingRangeMaxOverride.getBoundaryTweetId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return servingRangeSince.getBoundaryTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return servingRangeMax.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideSinceTimeSecondsFromEpoch() {
    return servingRangeSinceOverride.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideUntilTimeSecondsFromEpoch() {
    return servingRangeMaxOverride.getBoundaryTimeSecondsFromEpoch();
  }
}
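A hedged construction sketch: the tier name, dates, and counts below are invented for illustration and do not come from any real earlybird-tiers.yml. It uses the test-only constructor above, so the serving range is inferred from the data range and there are no overrides (DateUtil and Clock.SYSTEM_CLOCK are the same helpers referenced in TierConfig).

// Illustrative values only.
TierInfo exampleTier = new TierInfo(
    "tier_example_2017",
    DateUtil.toDate(2017, 1, 1),   // data_range_start_date_inclusive
    DateUtil.toDate(2018, 1, 1),   // data_range_end_date_exclusive
    12,                            // number_of_partitions
    4,                             // serving_timeslices
    true,                          // tier_enabled
    null,                          // since id inferred from the data range
    null,                          // max id inferred from the data range
    TierInfo.RequestReadType.LIGHT,
    Clock.SYSTEM_CLOCK);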
BIN src/java/com/twitter/search/earlybird/config/TierInfoSource.docx Normal file
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import javax.inject.Inject;

import com.twitter.search.common.util.zookeeper.ZooKeeperProxy;

public class TierInfoSource {
  private final ZooKeeperProxy zkClient;

  @Inject
  public TierInfoSource(ZooKeeperProxy sZooKeeperClient) {
    this.zkClient = sZooKeeperClient;
  }

  public List<TierInfo> getTierInformation() {
    return getTierInfoWithPrefix("tier");
  }

  public String getConfigFileType() {
    return TierConfig.getConfigFileName();
  }

  private List<TierInfo> getTierInfoWithPrefix(String tierPrefix) {
    Set<String> tierNames = TierConfig.getTierNames();
    List<TierInfo> tierInfos = new ArrayList<>();
    for (String name : tierNames) {
      if (name.startsWith(tierPrefix)) {
        TierInfo tierInfo = TierConfig.getTierInfo(name);
        tierInfos.add(tierInfo);
      }
    }
    return tierInfos;
  }

}
BIN src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx Normal file
Binary file not shown.
@ -1,78 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Comparator;
import java.util.SortedSet;

import com.google.common.base.Preconditions;

public final class TierInfoUtil {
  public static final Comparator<TierInfo> TIER_COMPARATOR = (t1, t2) -> {
    // Reverse sort order based on date.
    return t2.getDataStartDate().compareTo(t1.getDataStartDate());
  };

  private TierInfoUtil() {
  }

  /**
   * Checks that the serving ranges and the override serving ranges of the given tiers do not
   * overlap, and do not have gaps. Dark read tiers are ignored.
   */
  public static void checkTierServingRanges(SortedSet<TierInfo> tierInfos) {
    boolean tierServingRangesOverlap = false;
    boolean tierOverrideServingRangesOverlap = false;
    boolean tierServingRangesHaveGaps = false;
    boolean tierOverrideServingRangesHaveGaps = false;

    TierInfoWrapper previousTierInfoWrapper = null;
    TierInfoWrapper previousOverrideTierInfoWrapper = null;
    for (TierInfo tierInfo : tierInfos) {
      TierInfoWrapper tierInfoWrapper = new TierInfoWrapper(tierInfo, false);
      TierInfoWrapper overrideTierInfoWrapper = new TierInfoWrapper(tierInfo, true);

      // Check only the tiers to which we send light reads.
      if (!tierInfoWrapper.isDarkRead()) {
        if (previousTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousTierInfoWrapper, tierInfoWrapper)) {
            // In case of rebalancing, we may have an overlapping data range while
            // overriding with a good serving range.
            if (previousOverrideTierInfoWrapper == null
                || TierInfoWrapper.servingRangesOverlap(
                    previousOverrideTierInfoWrapper, overrideTierInfoWrapper)) {
              tierServingRangesOverlap = true;
            }
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousTierInfoWrapper, tierInfoWrapper)) {
            tierServingRangesHaveGaps = true;
          }
        }

        previousTierInfoWrapper = tierInfoWrapper;
      }

      if (!overrideTierInfoWrapper.isDarkRead()) {
        if (previousOverrideTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesOverlap = true;
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesHaveGaps = true;
          }
        }

        previousOverrideTierInfoWrapper = overrideTierInfoWrapper;
      }
    }

    Preconditions.checkState(!tierServingRangesOverlap,
        "Serving ranges of light reads tiers must not overlap.");
    Preconditions.checkState(!tierServingRangesHaveGaps,
        "Serving ranges of light reads tiers must not have gaps.");
    Preconditions.checkState(!tierOverrideServingRangesOverlap,
        "Override serving ranges of light reads tiers must not overlap.");
    Preconditions.checkState(!tierOverrideServingRangesHaveGaps,
        "Override serving ranges of light reads tiers must not have gaps.");
  }
}
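A hedged wiring fragment (not from the repository): tierInfos is assumed to be a List<TierInfo> loaded elsewhere, for example via TierInfoSource.getTierInformation().

// TIER_COMPARATOR orders tiers newest data range first; checkTierServingRanges then
// compares consecutive light-read tiers pairwise.
SortedSet<TierInfo> sortedTiers = new TreeSet<>(TierInfoUtil.TIER_COMPARATOR);
sortedTiers.addAll(tierInfos);
// Throws IllegalStateException if light-read serving ranges overlap or leave gaps.
TierInfoUtil.checkTierServingRanges(sortedTiers);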
Binary file not shown.
@ -1,89 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.base.Preconditions;

/**
 * A simple wrapper around TierInfo that returns the "real" or the "overridden" values from the
 * given {@code TierInfo} instance, based on the given {@code useOverrideTierConfig} flag.
 */
public class TierInfoWrapper implements ServingRange {
  private final TierInfo tierInfo;
  private final boolean useOverrideTierConfig;

  public TierInfoWrapper(TierInfo tierInfo, boolean useOverrideTierConfig) {
    this.tierInfo = Preconditions.checkNotNull(tierInfo);
    this.useOverrideTierConfig = useOverrideTierConfig;
  }

  public String getTierName() {
    return tierInfo.getTierName();
  }

  public Date getDataStartDate() {
    return tierInfo.getDataStartDate();
  }

  public Date getDataEndDate() {
    return tierInfo.getDataEndDate();
  }

  public int getNumPartitions() {
    return tierInfo.getNumPartitions();
  }

  public int getMaxTimeslices() {
    return tierInfo.getMaxTimeslices();
  }

  public TierConfig.ConfigSource getSource() {
    return tierInfo.getSource();
  }

  public boolean isEnabled() {
    return tierInfo.isEnabled();
  }

  public boolean isDarkRead() {
    return getReadType() == TierInfo.RequestReadType.DARK;
  }

  public TierInfo.RequestReadType getReadType() {
    return useOverrideTierConfig ? tierInfo.getReadTypeOverride() : tierInfo.getReadType();
  }

  public long getServingRangeSinceId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceId()
        : tierInfo.getServingRangeSinceId();
  }

  public long getServingRangeMaxId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideMaxId()
        : tierInfo.getServingRangeMaxId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceTimeSecondsFromEpoch()
        : tierInfo.getServingRangeSinceTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideUntilTimeSecondsFromEpoch()
        : tierInfo.getServingRangeUntilTimeSecondsFromEpoch();
  }

  public static boolean servingRangesOverlap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() > tier2.getServingRangeSinceId())
        && (tier2.getServingRangeMaxId() > tier1.getServingRangeSinceId());
  }

  public static boolean servingRangesHaveGap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() < tier2.getServingRangeSinceId())
        || (tier2.getServingRangeMaxId() < tier1.getServingRangeSinceId());
  }
}
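A hedged, self-contained mirror of the two static checks above; the class and helper names are hypothetical and only restate the boundary arithmetic with plain IDs, where a tier serves the half-open ID range (sinceId, maxId].

final class ServingRangeMathDemo {
  // since is exclusive and max is inclusive, matching serving_range_since_id_exclusive /
  // serving_range_max_id_inclusive in the tier config.
  static boolean overlap(long since1, long max1, long since2, long max2) {
    return max1 > since2 && max2 > since1;
  }

  static boolean haveGap(long since1, long max1, long since2, long max2) {
    return max1 < since2 || max2 < since1;
  }

  public static void main(String[] args) {
    // full0 serves (5999, 6999], full1 serves (6999, 7999]: they tile with no overlap and no gap.
    System.out.println(overlap(5999, 6999, 6999, 7999)); // false
    System.out.println(haveGap(5999, 6999, 6999, 7999)); // false
  }
}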
Binary file not shown.
@ -1,146 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;

/**
 * The start or end boundary of a tier's serving range.
 * This is used to add since_id and max_id operators onto search queries.
 */
public class TierServingBoundaryEndPoint {
  @VisibleForTesting
  public static final String INFERRED_FROM_DATA_RANGE = "inferred_from_data_range";
  public static final String RELATIVE_TO_CURRENT_TIME_MS = "relative_to_current_time_ms";

  // Either offsetToCurrentTimeMillis is set, or (absoluteTweetId and timeBoundarySecondsFromEpoch)
  // are set.
  @Nullable
  private final Long offsetToCurrentTimeMillis;
  @Nullable
  private final Long absoluteTweetId;
  @Nullable
  private final Long timeBoundarySecondsFromEpoch;
  private final Clock clock;

  TierServingBoundaryEndPoint(Long absoluteTweetId,
                              Long timeBoundarySecondsFromEpoch,
                              Long offsetToCurrentTimeMillis,
                              Clock clock) {
    this.offsetToCurrentTimeMillis = offsetToCurrentTimeMillis;
    this.absoluteTweetId = absoluteTweetId;
    this.timeBoundarySecondsFromEpoch = timeBoundarySecondsFromEpoch;
    this.clock = clock;
  }

  /**
   * Parses the boundary string and constructs a TierServingBoundaryEndPoint instance.
   * @param boundaryString boundary configuration string. Valid values are:
   * <li>
   * "inferred_from_data_range" infers the serving range from the data range. This only works
   * after Nov 2010, when Twitter switched to snowflake IDs.
   * This is the default value.
   * </li>
   * <li>
   * "absolute_tweet_id_and_timestamp_millis:id:timestamp" a tweet ID/timestamp is given
   * explicitly as the serving range boundary.
   * </li>
   * <li>
   * "relative_to_current_time_ms:offset" adds offset onto the current timestamp in millis to
   * compute the serving range.
   * </li>
   *
   * @param boundaryDate the data boundary. This is used in conjunction with
   *                     inferred_from_data_range to determine the serving boundary.
   * @param clock Clock used to obtain the current time, when relative_to_current_time_ms is used.
   *              Tests pass in a FakeClock.
   */
  public static TierServingBoundaryEndPoint newTierServingBoundaryEndPoint(String boundaryString,
                                                                           Date boundaryDate,
                                                                           Clock clock) {
    if (boundaryString == null || boundaryString.trim().equals(INFERRED_FROM_DATA_RANGE)) {
      return inferBoundaryFromDataRange(boundaryDate, clock);
    } else if (boundaryString.trim().startsWith(RELATIVE_TO_CURRENT_TIME_MS)) {
      return getRelativeBoundary(boundaryString, clock);
    } else {
      throw new IllegalStateException("Cannot parse serving range string: " + boundaryString);
    }
  }

  private static TierServingBoundaryEndPoint inferBoundaryFromDataRange(Date boundaryDate,
                                                                        Clock clock) {
    // Infer from the data range.
    // Handle the default start date and end date, in case the dates are not specified in the
    // config.
    if (boundaryDate.equals(TierConfig.DEFAULT_TIER_START_DATE)) {
      return new TierServingBoundaryEndPoint(
          -1L, TierConfig.DEFAULT_TIER_START_DATE.getTime() / 1000, null, clock);
    } else if (boundaryDate.equals(TierConfig.DEFAULT_TIER_END_DATE)) {
      return new TierServingBoundaryEndPoint(
          Long.MAX_VALUE, TierConfig.DEFAULT_TIER_END_DATE.getTime() / 1000, null, clock);
    } else {
      // Convert data start / end dates into since / max ID.
      long boundaryTimeMillis = boundaryDate.getTime();
      if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(boundaryTimeMillis)) {
        throw new IllegalStateException("Serving time range can not be determined, because "
            + boundaryDate + " is before Twitter switched to snowflake tweet IDs.");
      }
      // Earlybird since_id is inclusive and max_id is exclusive. We subtract 1 here.
      // Consider example:
      //   full0: 5000 (inclusive) - 6000 (exclusive)
      //   full1: 6000 (inclusive) - 7000 (exclusive)
      // For tier full0, we should use max_id 5999 instead of 6000.
      // For tier full1, we should use since_id 5999 instead of 6000.
      // Hence we subtract 1 here.
      long adjustedTweetId =
          SnowflakeIdParser.generateValidStatusId(boundaryTimeMillis, 0) - 1;
      Preconditions.checkState(adjustedTweetId >= 0, "boundary tweet ID must be non-negative");
      return new TierServingBoundaryEndPoint(
          adjustedTweetId, boundaryTimeMillis / 1000, null, clock);
    }
  }

  private static TierServingBoundaryEndPoint getRelativeBoundary(String boundaryString,
                                                                 Clock clock) {
    // An offset relative to the current time is given.
    String[] parts = boundaryString.split(":");
    Preconditions.checkState(parts.length == 2);
    long offset = Long.parseLong(parts[1]);
    return new TierServingBoundaryEndPoint(null, null, offset, clock);
  }

  /**
   * Returns the tweet ID for this tier boundary. If the tier boundary was created using a tweet
   * ID, that tweet ID is returned. Otherwise, a tweet ID is derived from the time boundary.
   */
  @VisibleForTesting
  public long getBoundaryTweetId() {
    // If absoluteTweetId is available, use it.
    if (absoluteTweetId != null) {
      return absoluteTweetId;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      long boundaryTime = clock.nowMillis() + offsetToCurrentTimeMillis;
      return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0);
    }
  }

  /**
   * Returns the time boundary for this tier boundary, in seconds since epoch.
   */
  public long getBoundaryTimeSecondsFromEpoch() {
    if (timeBoundarySecondsFromEpoch != null) {
      return timeBoundarySecondsFromEpoch;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      return (clock.nowMillis() + offsetToCurrentTimeMillis) / 1000;
    }
  }
}
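A hedged usage fragment: the date, offset, and variable names are illustrative rather than taken from a real tier config, and DateUtil / Clock.SYSTEM_CLOCK are the helpers referenced in TierConfig.

// Boundary inferred from a data-range date (must fall after the switch to snowflake IDs).
TierServingBoundaryEndPoint fromDate =
    TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
        "inferred_from_data_range", DateUtil.toDate(2017, 1, 1), Clock.SYSTEM_CLOCK);

// Boundary 24 hours behind the current time; the offset is in milliseconds and may be negative.
TierServingBoundaryEndPoint relative =
    TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
        "relative_to_current_time_ms:-86400000", null, Clock.SYSTEM_CLOCK);

long boundaryId = relative.getBoundaryTweetId();                 // snowflake ID ~24h old
long boundarySecs = relative.getBoundaryTimeSecondsFromEpoch();  // seconds since epoch, ~24h ago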