Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 00:25:11 +01:00)

Commit 8948d714f6 (parent 47a8228a09)
[docx] split commit for file 4200
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Binary file not shown.
ArchiveSegmentUpdater.java
@@ -1,279 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;

import org.apache.commons.lang.time.FastDateFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentHdfsFlusher;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentLoader;
import com.twitter.search.earlybird.partition.SegmentOptimizer;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.partition.SimpleSegmentIndexer;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;

/**
 * Given a segment, this class checks if the segment has an index built on HDFS:
 * if not, use SimpleSegmentIndexer to build an index
 * if yes, load the HDFS index, build a new index for the new status data which has dates newer
 * than the HDFS index, then append the loaded HDFS index.
 */
public class ArchiveSegmentUpdater {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentUpdater.class);

  private final SegmentSyncConfig sync;
  private final EarlybirdIndexConfig earlybirdIndexConfig;
  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
  private final SearchIndexingMetricSet searchIndexingMetricSet =
      new SearchIndexingMetricSet(statsReceiver);
  private final EarlybirdSearcherStats searcherStats =
      new EarlybirdSearcherStats(statsReceiver);
  private final SearchRateCounter indexNewSegment =
      new SearchRateCounter("index_new_segment");
  private final SearchRateCounter updateExistingSegment =
      new SearchRateCounter("update_existing_segment");
  private final SearchRateCounter skipExistingSegment =
      new SearchRateCounter("skip_existing_segment");
  private Clock clock;

  public ArchiveSegmentUpdater(ZooKeeperTryLockFactory zooKeeperTryLockFactory,
                               SegmentSyncConfig sync,
                               EarlybirdIndexConfig earlybirdIndexConfig,
                               Clock clock) {
    this.sync = sync;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.clock = clock;
  }

  private boolean canUpdateSegment(SegmentInfo segmentInfo) {
    if (!(segmentInfo.getSegment() instanceof ArchiveSegment)) {
      LOG.info("only ArchiveSegment is available for updating now: "
          + segmentInfo);
      return false;
    }

    if (!segmentInfo.isEnabled()) {
      LOG.debug("Segment is disabled: " + segmentInfo);
      return false;
    }

    if (segmentInfo.isComplete() || segmentInfo.isIndexing()
        || segmentInfo.getSyncInfo().isLoaded()) {
      LOG.debug("Cannot update already indexed segment: " + segmentInfo);
      return false;
    }

    return true;
  }

  /**
   * Given a segment, checks if the segment has an index built on HDFS:
   * if not, use SimpleSegmentIndexer to build an index
   * if yes, load the HDFS index, build a new index for the new status data which has dates newer
   * than the HDFS index, then append the loaded HDFS index.
   *
   * Returns whether the segment was successfully updated.
   */
  public boolean updateSegment(SegmentInfo segmentInfo) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
    if (!canUpdateSegment(segmentInfo)) {
      return false;
    }

    if (segmentInfo.isIndexing()) {
      LOG.error("Segment is already being indexed: " + segmentInfo);
      return false;
    }

    final Date hdfsEndDate = ArchiveHDFSUtils.getSegmentEndDateOnHdfs(sync, segmentInfo);
    if (hdfsEndDate == null) {
      indexNewSegment.increment();
      if (!indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE)) {
        return false;
      }
    } else {
      final Date curEndDate = ((ArchiveSegment) segmentInfo.getSegment()).getDataEndDate();
      if (!hdfsEndDate.before(curEndDate)) {
        skipExistingSegment.increment();
        LOG.info("Segment is up-to-date: " + segmentInfo.getSegment().getTimeSliceID()
            + " Found flushed segment on HDFS with end date: "
            + FastDateFormat.getInstance("yyyyMMdd").format(hdfsEndDate));
        segmentInfo.setComplete(true);
        segmentInfo.getSyncInfo().setFlushed(true);
        return true;
      }

      updateExistingSegment.increment();
      LOG.info("Updating segment: " + segmentInfo.getSegment().getTimeSliceID()
          + "; new endDate will be " + FastDateFormat.getInstance("yyyyMMdd").format(curEndDate));

      if (!updateSegment(segmentInfo, hdfsEndDate)) {
        return false;
      }
    }

    boolean success = SegmentOptimizer.optimize(segmentInfo);
    if (!success) {
      // Clean up the segment dir on local disk
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Error optimizing segment: " + segmentInfo);
      return false;
    }

    // Verify segment before uploading.
    success = ArchiveSegmentVerifier.verifySegment(segmentInfo);
    if (!success) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Segment not uploaded to HDFS because it did not pass verification: " + segmentInfo);
      return false;
    }

    // upload the index to HDFS
    success = new SegmentHdfsFlusher(zkTryLockFactory, sync, false)
        .flushSegmentToDiskAndHDFS(segmentInfo);
    if (success) {
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, false, true);
    } else {
      // Clean up the segment dir on hdfs
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, true, false);
      LOG.info("Error uploading segment to HDFS: " + segmentInfo);
    }
    segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();

    return success;
  }

  /**
   * Build index for the given segmentInfo. Only those statuses passing the dateFilter are indexed.
   */
  private boolean indexSegment(final SegmentInfo segmentInfo, Predicate<Date> dateFilter) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    RecordReader<TweetDocument> documentReader = null;
    try {
      ArchiveSegment archiveSegment = (ArchiveSegment) segmentInfo.getSegment();
      DocumentFactory<ThriftIndexingEvent> documentFactory =
          earlybirdIndexConfig.createDocumentFactory();
      documentReader = archiveSegment.getStatusRecordReader(documentFactory, dateFilter);

      // Read and index the statuses
      boolean success = new SimpleSegmentIndexer(documentReader, searchIndexingMetricSet)
          .indexSegment(segmentInfo);
      if (!success) {
        // Clean up segment dir on local disk
        segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
        LOG.info("Error indexing segment: " + segmentInfo);
      }

      return success;
    } catch (IOException e) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Exception while indexing segment: " + segmentInfo, e);
      return false;
    } finally {
      if (documentReader != null) {
        documentReader.stop();
      }
    }
  }

  /**
   * Load the index built on HDFS for the given segmentInfo, index the new data and append the
   * HDFS index to the new indexed segment
   */
  private boolean updateSegment(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    SegmentInfo hdfsSegmentInfo = loadSegmentFromHdfs(segmentInfo, hdfsEndDate);
    if (hdfsSegmentInfo == null) {
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    boolean success = indexSegment(segmentInfo, input -> {
      // we're updating the segment - only index days after the old end date,
      // and we're sure that the previous days have already been indexed.
      return input.after(hdfsEndDate);
    });
    if (!success) {
      LOG.error("Error indexing new data: " + segmentInfo);
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    // Now, append the index loaded from hdfs
    try {
      segmentInfo.getIndexSegment().append(hdfsSegmentInfo.getIndexSegment());
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Deleted local segment directories with end date " + hdfsEndDate + " : "
          + segmentInfo);
    } catch (IOException e) {
      LOG.warn("Caught IOException while appending segment " + hdfsSegmentInfo.getSegmentName(), e);
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      return false;
    }

    segmentInfo.setComplete(true);
    return true;
  }

  /**
   * Load the index built on HDFS for the given segmentInfo and end date
   */
  private SegmentInfo loadSegmentFromHdfs(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    ArchiveSegment segment = new ArchiveSegment(
        segmentInfo.getTimeSliceID(),
        EarlybirdConfig.getMaxSegmentSize(),
        segmentInfo.getNumPartitions(),
        segmentInfo.getSegment().getHashPartitionID(),
        hdfsEndDate);
    EarlybirdSegmentFactory factory = new EarlybirdSegmentFactory(
        earlybirdIndexConfig,
        searchIndexingMetricSet,
        searcherStats,
        clock);

    SegmentInfo hdfsSegmentInfo;

    try {
      hdfsSegmentInfo = new SegmentInfo(segment, factory, sync);
      CriticalExceptionHandler criticalExceptionHandler =
          new CriticalExceptionHandler();

      boolean success = new SegmentLoader(sync, criticalExceptionHandler)
          .load(hdfsSegmentInfo);
      if (!success) {
        // If not successful, segmentLoader has already cleaned up the local dir.
        LOG.info("Error loading hdfs segment " + hdfsSegmentInfo
            + ", building segment from scratch.");
        hdfsSegmentInfo = null;
      }
    } catch (IOException e) {
      LOG.error("Exception while loading segment from hdfs: " + segmentInfo, e);
      hdfsSegmentInfo = null;
    }

    return hdfsSegmentInfo;
  }
}
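For readers skimming the diff, the branching in updateSegment() above reduces to three cases, and the append path indexes only days strictly after the flushed end date. The following standalone sketch is an illustration only, not part of the repository: UpdateDecisionSketch, decide, and newDaysOnly are made-up names, and plain JDK types (java.util.Date, java.util.function.Predicate) stand in for the Earlybird classes.

import java.util.Date;
import java.util.function.Predicate;

/** Illustrative sketch of the update decision made by ArchiveSegmentUpdater.updateSegment(). */
public final class UpdateDecisionSketch {
  enum Action { INDEX_ALL, SKIP_UP_TO_DATE, INDEX_NEW_DAYS_AND_APPEND }

  /** hdfsEndDate is null when no flushed index exists on HDFS for this segment. */
  static Action decide(Date hdfsEndDate, Date segmentDataEndDate) {
    if (hdfsEndDate == null) {
      return Action.INDEX_ALL;                 // no HDFS index yet: build from scratch
    }
    if (!hdfsEndDate.before(segmentDataEndDate)) {
      return Action.SKIP_UP_TO_DATE;           // flushed index already covers all the data
    }
    return Action.INDEX_NEW_DAYS_AND_APPEND;   // index only days after hdfsEndDate, then append
  }

  /** The date filter used in the append case, mirroring the lambda passed to indexSegment(). */
  static Predicate<Date> newDaysOnly(Date hdfsEndDate) {
    return day -> day.after(hdfsEndDate);
  }
}

The predicate uses after() rather than an inclusive comparison because days up to and including the HDFS end date are already covered by the loaded index.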
Binary file not shown.
ArchiveSegmentVerifier.java
@@ -1,75 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.partition.SegmentInfo;

public final class ArchiveSegmentVerifier {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentVerifier.class);

  private ArchiveSegmentVerifier() {
  }

  @VisibleForTesting
  static boolean shouldVerifySegment(SegmentInfo segmentInfo) {
    if (segmentInfo.isIndexing()) {
      LOG.warn("ArchiveSegmentVerifier got segment still indexing.");
      return false;
    }

    if (!segmentInfo.isComplete()) {
      LOG.warn("ArchiveSegmentVerifier got incomplete segment.");
      return false;
    }

    if (!segmentInfo.isOptimized()) {
      LOG.warn("ArchiveSegmentVerifier got unoptimized segment.");
      return false;
    }

    return true;
  }

  /**
   * Verifies an archive segment has a sane number of leaves.
   */
  public static boolean verifySegment(SegmentInfo segmentInfo) {
    if (!shouldVerifySegment(segmentInfo)) {
      return false;
    }
    Directory directory = segmentInfo.getIndexSegment().getLuceneDirectory();
    return verifyLuceneIndex(directory);
  }

  private static boolean verifyLuceneIndex(Directory directory) {
    try {
      DirectoryReader indexerReader = DirectoryReader.open(directory);
      List<LeafReaderContext> leaves = indexerReader.getContext().leaves();
      if (leaves.size() != 1) {
        LOG.warn("Lucene index does not have exactly one segment: " + leaves.size() + " != 1. "
            + "Lucene segments should have been merged during optimization.");
        return false;
      }

      LeafReader reader = leaves.get(0).reader();
      if (reader.numDocs() <= 0) {
        LOG.warn("Lucene index has no documents: " + reader);
        return false;
      }
      return true;
    } catch (IOException e) {
      LOG.warn("Found bad lucene index at: " + directory);
      return false;
    }
  }
}
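The verification above boils down to one Lucene-level invariant: after optimization the index must consist of exactly one leaf, and that leaf must contain at least one document. A minimal sketch of just that check, using only the standard Lucene API already imported by the deleted file (the class and method names SingleLeafCheckSketch and hasSingleNonEmptyLeaf are illustrative):

import java.io.IOException;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;

/** Illustrative sketch of the invariant enforced by verifyLuceneIndex(). */
final class SingleLeafCheckSketch {
  static boolean hasSingleNonEmptyLeaf(Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
      List<LeafReaderContext> leaves = reader.getContext().leaves();
      // Optimization merges all Lucene segments, so exactly one leaf is expected,
      // and that leaf must hold at least one document.
      return leaves.size() == 1 && leaves.get(0).reader().numDocs() > 0;
    }
  }
}

Unlike the original verifyLuceneIndex(), the sketch closes the DirectoryReader via try-with-resources; the deleted code leaves the reader open.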
Binary file not shown.
ArchiveTimeSlicer.java
@@ -1,322 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.config.TierConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;

/**
 * Responsible for taking a number of daily status batches and partitioning them into time slices
 * which will be used to build segments.
 *
 * We try to put at most N tweets into a time slice.
 */
public class ArchiveTimeSlicer {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveTimeSlicer.class);

  private static final Comparator<TweetDocument> ASCENDING =
      (o1, o2) -> Long.compare(o1.getTweetID(), o2.getTweetID());

  private static final Comparator<TweetDocument> DESCENDING =
      (o1, o2) -> Long.compare(o2.getTweetID(), o1.getTweetID());

  // Represents a number of daily batches which will go into a segment.
  public static final class ArchiveTimeSlice {
    private Date startDate;
    private Date endDate;
    private int statusCount;
    private final DailyStatusBatches directory;
    private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

    // This list is always ordered from the oldest day to the newest day.
    // For the on-disk archive, we reverse the days in getTweetReaders().
    private final List<DailyStatusBatch> batches = Lists.newArrayList();

    private ArchiveTimeSlice(DailyStatusBatches directory,
                             ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
      this.directory = directory;
      this.earlybirdIndexConfig = earlybirdIndexConfig;
    }

    public Date getEndDate() {
      return endDate;
    }

    public int getStatusCount() {
      return statusCount;
    }

    public int getNumHashPartitions() {
      return batches.isEmpty() ? 0 : batches.get(0).getNumHashPartitions();
    }

    /**
     * Returns a reader for reading tweets from this timeslice.
     *
     * @param archiveSegment The segment to which the timeslice belongs.
     * @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
     * @param filter A filter that determines what dates should be read.
     */
    public RecordReader<TweetDocument> getStatusReader(
        ArchiveSegment archiveSegment,
        DocumentFactory<ThriftIndexingEvent> documentFactory,
        Predicate<Date> filter) throws IOException {
      // We no longer support ThriftStatus based document factories.
      Preconditions.checkState(documentFactory instanceof ThriftIndexingEventDocumentFactory);

      final int hashPartitionID = archiveSegment.getHashPartitionID();
      List<RecordReader<TweetDocument>> readers = new ArrayList<>(batches.size());
      List<DailyStatusBatch> orderedForReading = orderBatchesForReading(batches);
      LOG.info("Creating new status reader for hashPartition: "
          + hashPartitionID + " timeslice: " + getDescription());

      for (DailyStatusBatch batch : orderedForReading) {
        if (filter.apply(batch.getDate())) {
          LOG.info("Adding reader for " + batch.getDate() + " " + getDescription());
          PartitionedBatch partitionedBatch = batch.getPartition(hashPartitionID);
          // Don't even try to create a reader if the partition is empty.
          // There does not seem to be any problem in production now, but HDFS FileSystem's javadoc
          // does indicate that listStatus() is allowed to throw a FileNotFoundException if the
          // partition does not exist. This check makes the code more robust against future
          // HDFS FileSystem implementation changes.
          if (partitionedBatch.getStatusCount() > 0) {
            RecordReader<TweetDocument> tweetReaders = partitionedBatch.getTweetReaders(
                archiveSegment,
                directory.getStatusPathToUseForDay(batch.getDate()),
                documentFactory);
            readers.add(tweetReaders);
          }
        } else {
          LOG.info("Filtered reader for " + batch.getDate() + " " + getDescription());
        }
      }

      LOG.info("Creating reader for timeslice: " + getDescription()
          + " with " + readers.size() + " readers");

      return new MergingSortedRecordReader<TweetDocument>(getMergingComparator(), readers);
    }

    private List<DailyStatusBatch> orderBatchesForReading(List<DailyStatusBatch> orderedBatches) {
      // For the index formats using stock lucene, we want the most recent days to be indexed first.
      // In the twitter in-memory optimized indexes, older tweets will be added first, and
      // optimization will reverse the documents to make most recent tweets be first.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? orderedBatches : Lists.reverse(orderedBatches);
    }

    private Comparator<TweetDocument> getMergingComparator() {
      // We always want to retrieve larger tweet ids first.
      // LIFO means that the smaller ids get inserted first --> ASCENDING order.
      // FIFO would mean that we want to first insert the larger ids --> DESCENDING order.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? ASCENDING : DESCENDING;
    }

    /**
     * Returns the smallest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMinStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return 0;
      }

      for (int i = 0; i < batches.size(); i++) {
        long minStatusID = batches.get(i).getPartition(hashPartitionID).getMinStatusID();
        if (minStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return minStatusID;
        }
      }

      return 0;
    }

    /**
     * Returns the highest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMaxStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return Long.MAX_VALUE;
      }

      for (int i = batches.size() - 1; i >= 0; i--) {
        long maxStatusID = batches.get(i).getPartition(hashPartitionID).getMaxStatusID();
        if (maxStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return maxStatusID;
        }
      }

      return Long.MAX_VALUE;
    }

    /**
     * Returns a string with some information for this timeslice.
     */
    public String getDescription() {
      StringBuilder builder = new StringBuilder();
      builder.append("TimeSlice[start date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(startDate));
      builder.append(", end date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(endDate));
      builder.append(", status count=");
      builder.append(statusCount);
      builder.append(", days count=");
      builder.append(batches.size());
      builder.append("]");
      return builder.toString();
    }
  }

  private final int maxSegmentSize;
  private final DailyStatusBatches dailyStatusBatches;
  private final Date tierStartDate;
  private final Date tierEndDate;
  private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

  private List<ArchiveTimeSlice> lastCachedTimeslices = null;

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this(maxSegmentSize, dailyStatusBatches, TierConfig.DEFAULT_TIER_START_DATE,
        TierConfig.DEFAULT_TIER_END_DATE, earlybirdIndexConfig);
  }

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           Date tierStartDate,
                           Date tierEndDate,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this.maxSegmentSize = maxSegmentSize;
    this.dailyStatusBatches = dailyStatusBatches;
    this.tierStartDate = tierStartDate;
    this.tierEndDate = tierEndDate;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
  }

  private boolean cacheIsValid() throws IOException {
    return lastCachedTimeslices != null
        && !lastCachedTimeslices.isEmpty()
        && cacheIsValid(lastCachedTimeslices.get(lastCachedTimeslices.size() - 1).endDate);
  }

  private boolean cacheIsValid(Date lastDate) throws IOException {
    if (lastCachedTimeslices == null || lastCachedTimeslices.isEmpty()) {
      return false;
    }

    // Check if we have a daily batch newer than the last batch used for the newest timeslice.
    Calendar cal = Calendar.getInstance();
    cal.setTime(lastDate);
    cal.add(Calendar.DATE, 1);
    Date nextDate = cal.getTime();

    boolean foundBatch = dailyStatusBatches.hasValidBatchForDay(nextDate);

    LOG.info("Checking cache: Looked for valid batch for day {}. Found: {}",
        DailyStatusBatches.DATE_FORMAT.format(nextDate), foundBatch);

    return !foundBatch;
  }

  private boolean timesliceIsFull(ArchiveTimeSlice timeSlice, DailyStatusBatch batch) {
    return timeSlice.statusCount + batch.getMaxPerPartitionStatusCount() > maxSegmentSize;
  }

  private void doTimeSlicing() throws IOException {
    dailyStatusBatches.refresh();

    lastCachedTimeslices = Lists.newArrayList();
    ArchiveTimeSlice currentTimeSlice = null;

    // Iterate over each day and add it to the current timeslice, until it gets full.
    for (DailyStatusBatch batch : dailyStatusBatches.getStatusBatches()) {
      if (!batch.isValid()) {
        LOG.warn("Skipping hole: " + batch.getDate());
        continue;
      }

      if (currentTimeSlice == null || timesliceIsFull(currentTimeSlice, batch)) {
        if (currentTimeSlice != null) {
          LOG.info("Filled timeslice: " + currentTimeSlice.getDescription());
        }
        currentTimeSlice = new ArchiveTimeSlice(dailyStatusBatches, earlybirdIndexConfig);
        currentTimeSlice.startDate = batch.getDate();
        lastCachedTimeslices.add(currentTimeSlice);
      }

      currentTimeSlice.endDate = batch.getDate();
      currentTimeSlice.statusCount += batch.getMaxPerPartitionStatusCount();
      currentTimeSlice.batches.add(batch);
    }
    LOG.info("Last timeslice: {}", currentTimeSlice.getDescription());

    LOG.info("Done with time slicing. Number of timeslices: {}",
        lastCachedTimeslices.size());
  }

  /**
   * Returns all timeslices for this earlybird.
   */
  public List<ArchiveTimeSlice> getTimeSlices() throws IOException {
    if (cacheIsValid()) {
      return lastCachedTimeslices;
    }

    LOG.info("Cache is outdated. Loading new daily batches now...");

    doTimeSlicing();

    return lastCachedTimeslices != null ? Collections.unmodifiableList(lastCachedTimeslices) : null;
  }

  /**
   * Returns the timeslices that overlap the tier start/end date ranges, if they are specified.
   */
  public List<ArchiveTimeSlice> getTimeSlicesInTierRange() throws IOException {
    List<ArchiveTimeSlice> timeSlices = getTimeSlices();
    if (tierStartDate == TierConfig.DEFAULT_TIER_START_DATE
        && tierEndDate == TierConfig.DEFAULT_TIER_END_DATE) {
      return timeSlices;
    }

    List<ArchiveTimeSlice> filteredTimeSlice = Lists.newArrayList();
    for (ArchiveTimeSlice timeSlice : timeSlices) {
      if (timeSlice.startDate.before(tierEndDate) && !timeSlice.endDate.before(tierStartDate)) {
        filteredTimeSlice.add(timeSlice);
      }
    }

    return filteredTimeSlice;
  }

  @VisibleForTesting
  protected DailyStatusBatches getDailyStatusBatches() {
    return dailyStatusBatches;
  }
}
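The core of doTimeSlicing() above is a greedy packing loop: walk the daily batches in date order, skip invalid days, and start a new slice whenever adding the next day's per-partition status count would push the current slice past maxSegmentSize. A self-contained sketch of that loop using plain integers for the per-day counts (TimeSlicingSketch and its names are illustrative, not from the codebase; the invalid-day skip is omitted):

import java.util.ArrayList;
import java.util.List;

/** Illustrative sketch of the greedy packing performed by ArchiveTimeSlicer.doTimeSlicing(). */
final class TimeSlicingSketch {
  /** Packs per-day counts into slices of at most maxSegmentSize statuses each. */
  static List<List<Integer>> slice(List<Integer> dailyCounts, int maxSegmentSize) {
    List<List<Integer>> slices = new ArrayList<>();
    List<Integer> current = null;
    int currentCount = 0;
    for (int count : dailyCounts) {
      if (current == null || currentCount + count > maxSegmentSize) {
        current = new ArrayList<>();    // start a new time slice
        currentCount = 0;
        slices.add(current);
      }
      current.add(count);
      currentCount += count;
    }
    return slices;
  }

  public static void main(String[] args) {
    // Days with 40, 70, 30, 90, 20 statuses and a 100-status cap -> [[40], [70, 30], [90], [20]]
    System.out.println(slice(List.of(40, 70, 30, 90, 20), 100));
  }
}

As in the original, a day is never split across slices, so a slice can stay well under the cap when the next day's batch is large.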
Binary file not shown.
DailyStatusBatch.java
@@ -1,166 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.JsonParseException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a day's worth of statuses (tweets) for multiple hash partitions.
 *
 * Note that what this class contains is not the data, but metadata.
 *
 * A day of tweets will come from:
 *   - A scrubgen, if it has happened before the scrubgen date.
 *   - Our daily jobs pipeline, if it has happened after that.
 *
 * This class checks that the _SUCCESS file exists in the "statuses" subdirectory and extracts the
 * status count, min status id and max status id.
 */
public class DailyStatusBatch implements Comparable<DailyStatusBatch> {
  private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatch.class);

  public static final long EMPTY_BATCH_STATUS_ID = -1;
  private static final String PARTITION_FORMAT = "p_%d_of_%d";
  private static final String SUCCESS_FILE_NAME = "_SUCCESS";

  private final Map<Integer, PartitionedBatch> hashPartitionToStatuses = Maps.newHashMap();

  private final Date date;
  private final int numHashPartitions;
  private final boolean hasSuccessFiles;

  public DailyStatusBatch(Date date, int numHashPartitions, Path statusPath, FileSystem hdfs) {
    this.date = date;
    this.numHashPartitions = numHashPartitions;
    this.hasSuccessFiles = checkForSuccessFile(hdfs, date, statusPath);
  }

  public Date getDate() {
    return date;
  }

  /**
   * Check for the presence of the _SUCCESS file for the given day's path on HDFS for the statuses
   * field group.
   */
  private boolean checkForSuccessFile(FileSystem hdfs, Date inputDate, Path statusPath) {
    Path dayPath = new Path(statusPath, ArchiveHDFSUtils.dateToPath(inputDate, "/"));
    Path successFilePath = new Path(dayPath, SUCCESS_FILE_NAME);
    try {
      return hdfs.getFileStatus(successFilePath).isFile();
    } catch (IOException e) {
      LOG.error("Could not verify existence of the _SUCCESS file. Assuming it doesn't exist.", e);
    }
    return false;
  }

  /**
   * Loads the data for this day for the given partition.
   */
  public PartitionedBatch addPartition(FileSystem hdfs, Path dayPath, int hashPartitionID)
      throws IOException {
    String partitionDir = String.format(PARTITION_FORMAT, hashPartitionID, numHashPartitions);
    Path path = new Path(dayPath, partitionDir);
    PartitionedBatch batch =
        new PartitionedBatch(path, hashPartitionID, numHashPartitions, date);
    batch.load(hdfs);
    hashPartitionToStatuses.put(hashPartitionID, batch);
    return batch;
  }

  public PartitionedBatch getPartition(int hashPartitionID) {
    return hashPartitionToStatuses.get(hashPartitionID);
  }

  /**
   * Returns the greatest status count in all partitions belonging to this batch.
   */
  public int getMaxPerPartitionStatusCount() {
    int maxPerPartitionStatusCount = 0;
    for (PartitionedBatch batch : hashPartitionToStatuses.values()) {
      maxPerPartitionStatusCount = Math.max(batch.getStatusCount(), maxPerPartitionStatusCount);
    }
    return maxPerPartitionStatusCount;
  }

  public int getNumHashPartitions() {
    return numHashPartitions;
  }

  @VisibleForTesting
  boolean hasSuccessFiles() {
    return hasSuccessFiles;
  }

  /**
   * Returns true if the _status_counts files could be found in each hash partition subfolder that
   * belongs to this timeslice AND the _SUCCESS file can be found at the root folder for the day.
   */
  public boolean isValid() {
    // make sure we have data for all hash partitions
    for (int i = 0; i < numHashPartitions; i++) {
      PartitionedBatch day = hashPartitionToStatuses.get(i);
      if (day == null || !day.hasStatusCount() || day.isDisallowedEmptyPartition()) {
        return false;
      }
    }
    return hasSuccessFiles;
  }

  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("DailyStatusBatch[date=").append(date)
        .append(",valid=").append(isValid())
        .append(",hasSuccessFiles=").append(hasSuccessFiles)
        .append(",numHashPartitions=").append(numHashPartitions)
        .append("]:\n");
    for (int i = 0; i < numHashPartitions; i++) {
      builder.append('\t').append(hashPartitionToStatuses.get(i).toString()).append('\n');
    }
    return builder.toString();
  }

  @Override
  public int compareTo(DailyStatusBatch o) {
    return date.compareTo(o.date);
  }

  /**
   * Serialize this DailyStatusBatch to a json string.
   */
  public String serializeToJson() {
    return serializeToJson(new Gson());
  }

  @VisibleForTesting
  String serializeToJson(Gson gson) {
    return gson.toJson(this);
  }

  /**
   * Given a json string, parse its fields and construct a daily status batch.
   * @param batchStr the json string representation of a daily status batch.
   * @return the daily status batch constructed; if the string is of invalid format, null will be
   *         returned.
   */
  static DailyStatusBatch deserializeFromJson(String batchStr) {
    try {
      return new Gson().fromJson(batchStr, DailyStatusBatch.class);
    } catch (JsonParseException e) {
      LOG.error("Error parsing json string: " + batchStr, e);
      return null;
    }
  }
}
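serializeToJson() and deserializeFromJson() above are a plain Gson round-trip; DailyStatusBatches (next file) writes one such JSON line per day into the batch summary file it syncs to HDFS and parses it back line by line. A minimal sketch of the round-trip, assuming only Gson on the classpath (BatchJsonSketch and BatchSummary are hypothetical stand-ins with a few scalar fields, not the real classes):

import com.google.gson.Gson;
import com.google.gson.JsonParseException;

/** Illustrative sketch of the Gson serialization used by DailyStatusBatch. */
final class BatchJsonSketch {
  /** Hypothetical stand-in carrying a subset of the batch's metadata fields. */
  static final class BatchSummary {
    long date;                // example: epoch millis of the day
    int numHashPartitions;
    boolean hasSuccessFiles;
  }

  public static void main(String[] args) {
    Gson gson = new Gson();
    BatchSummary batch = new BatchSummary();
    batch.date = 1167609600000L;
    batch.numHashPartitions = 12;
    batch.hasSuccessFiles = true;

    String line = gson.toJson(batch);       // one line per day in the sync file
    try {
      BatchSummary parsed = gson.fromJson(line, BatchSummary.class);
      System.out.println(line + " -> partitions=" + parsed.numHashPartitions);
    } catch (JsonParseException e) {
      // deserializeFromJson() logs malformed lines and returns null instead of throwing
      System.err.println("Bad line: " + line);
    }
  }
}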
Binary file not shown.
@ -1,702 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileWriter;
|
||||
import java.io.IOException;
|
||||
import java.util.Calendar;
|
||||
import java.util.Collection;
|
||||
import java.util.Date;
|
||||
import java.util.NavigableMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.commons.lang3.time.FastDateFormat;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.common.quantity.Amount;
|
||||
import com.twitter.common.quantity.Time;
|
||||
import com.twitter.search.common.database.DatabaseConfig;
|
||||
import com.twitter.search.common.util.date.DateUtil;
|
||||
import com.twitter.search.common.util.io.LineRecordFileReader;
|
||||
import com.twitter.search.common.util.zktrylock.TryLock;
|
||||
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
|
||||
import com.twitter.search.earlybird.partition.HdfsUtil;
|
||||
import com.twitter.search.earlybird.partition.StatusBatchFlushVersion;
|
||||
|
||||
/**
|
||||
* Provides access to preprocessed statuses (tweets) to be indexed by archive search earlybirds.
|
||||
*
|
||||
* These tweets can be coming from a scrub gen or from the output of the daily jobs.
|
||||
*/
|
||||
public class DailyStatusBatches {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatches.class);
|
||||
|
||||
// Maximum time to spend on obtaining daily status batches by computing or loading from HDFS
|
||||
private static final Amount<Long, Time> MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES =
|
||||
Amount.of(EarlybirdConfig.getLong("daily_status_batches_max_initial_load_time_minutes"),
|
||||
Time.MINUTES);
|
||||
// Time to wait before trying again when obtaining daily status batches fails
|
||||
private static final Amount<Long, Time> DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES =
|
||||
Amount.of(EarlybirdConfig.getLong("daily_status_batches_waiting_time_minutes"),
|
||||
Time.MINUTES);
|
||||
private static final String DAILY_STATUS_BATCHES_SYNC_PATH =
|
||||
EarlybirdProperty.ZK_APP_ROOT.get() + "/daily_batches_sync";
|
||||
private static final String DAILY_BATCHES_ZK_LOCK = "daily_batches_zk_lock";
|
||||
private static final Amount<Long, Time> DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES =
|
||||
Amount.of(EarlybirdConfig.getLong("daily_status_batches_zk_lock_expiration_minutes"),
|
||||
Time.MINUTES);
|
||||
|
||||
static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance("yyyyMMdd");
|
||||
|
||||
// before this date, there was no twitter
|
||||
private static final Date FIRST_TWITTER_DAY = DateUtil.toDate(2006, 2, 1);
|
||||
|
||||
private static final String STATUS_BATCHES_PREFIX = "status_batches";
|
||||
|
||||
private final String rootDir =
|
||||
EarlybirdConfig.getString("hdfs_offline_segment_sync_dir", "top_archive_statuses");
|
||||
|
||||
private final String buildGen =
|
||||
EarlybirdConfig.getString("offline_segment_build_gen", "bg_1");
|
||||
|
||||
public static final String STATUS_SUBDIR_NAME = "statuses";
|
||||
public static final String LAYOUT_SUBDIR_NAME = "layouts";
|
||||
public static final String SCRUB_GEN_SUFFIX_PATTERN = "scrubbed/%s";
|
||||
|
||||
private static final String INTERMEDIATE_COUNTS_SUBDIR_NAME = "counts";
|
||||
private static final String SUCCESS_FILE_NAME = "_SUCCESS";
|
||||
private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)");
|
||||
private static final Date FIRST_TWEET_DAY = DateUtil.toDate(2006, 3, 21);
|
||||
|
||||
private final Path rootPath = new Path(rootDir);
|
||||
private final Path buildGenPath = new Path(rootPath, buildGen);
|
||||
private final Path statusPath = new Path(buildGenPath, STATUS_SUBDIR_NAME);
|
||||
|
||||
private final NavigableMap<Date, DailyStatusBatch> statusBatches = Maps.newTreeMap();
|
||||
|
||||
private Date firstValidDay = null;
|
||||
private Date lastValidDay = null;
|
||||
|
||||
private final ZooKeeperTryLockFactory zkTryLockFactory;
|
||||
private final Date scrubGenDay;
|
||||
private long numberOfDaysWithValidScrubGenData;
|
||||
|
||||
public DailyStatusBatches(
|
||||
ZooKeeperTryLockFactory zooKeeperTryLockFactory, Date scrubGenDay) throws IOException {
|
||||
this.zkTryLockFactory = zooKeeperTryLockFactory;
|
||||
this.scrubGenDay = scrubGenDay;
|
||||
|
||||
FileSystem hdfs = null;
|
||||
try {
|
||||
hdfs = HdfsUtil.getHdfsFileSystem();
|
||||
verifyDirectory(hdfs);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(hdfs);
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public Date getScrubGenDay() {
|
||||
return scrubGenDay;
|
||||
}
|
||||
|
||||
public Collection<DailyStatusBatch> getStatusBatches() {
|
||||
return statusBatches.values();
|
||||
}
|
||||
|
||||
/**
|
||||
* Reset the states of the directory
|
||||
*/
|
||||
private void resetDirectory() {
|
||||
statusBatches.clear();
|
||||
firstValidDay = null;
|
||||
lastValidDay = null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Indicate whether the directory has been initialized
|
||||
*/
|
||||
private boolean isInitialized() {
|
||||
return lastValidDay != null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the daily status batches from HDFS; return true if one or more batches could be loaded.
|
||||
**/
|
||||
private boolean refreshByLoadingHDFSStatusBatches(final FileSystem fs) throws IOException {
|
||||
// first find the latest valid end date of statuses
|
||||
final Date lastValidStatusDay = getLastValidInputDateFromNow(fs);
|
||||
if (lastValidStatusDay != null) {
|
||||
if (hasStatusBatchesOnHdfs(fs, lastValidStatusDay)) {
|
||||
if (loadStatusBatchesFromHdfs(fs, lastValidStatusDay)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resetDirectory();
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks the directory for new data and returns true, if one or more new batches could be loaded.
|
||||
*/
|
||||
public void refresh() throws IOException {
|
||||
final FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
|
||||
|
||||
final Stopwatch stopwatch = Stopwatch.createStarted();
|
||||
try {
|
||||
if (!isInitialized()) {
|
||||
if (initializeDailyStatusBatches(hdfs, stopwatch)) {
|
||||
LOG.info("Successfully obtained daily status batches after {}", stopwatch);
|
||||
} else {
|
||||
String errMsg = "Failed to load or compute daily status batches after "
|
||||
+ stopwatch.toString();
|
||||
LOG.error(errMsg);
|
||||
throw new IOException(errMsg);
|
||||
}
|
||||
} else {
|
||||
loadNewDailyBatches(hdfs);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeQuietly(hdfs);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean initializeDailyStatusBatches(final FileSystem hdfs,
|
||||
final Stopwatch stopwatch) throws IOException {
|
||||
long timeSpentOnDailyBatches = 0L;
|
||||
long maxAllowedTimeMs = MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES.as(Time.MILLISECONDS);
|
||||
long waitingTimeMs = DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES.as(Time.MILLISECONDS);
|
||||
boolean firstLoop = true;
|
||||
LOG.info("Starting to load or compute daily status batches for the first time.");
|
||||
while (timeSpentOnDailyBatches <= maxAllowedTimeMs && !Thread.currentThread().isInterrupted()) {
|
||||
if (!firstLoop) {
|
||||
try {
|
||||
LOG.info("Sleeping " + waitingTimeMs
|
||||
+ " millis before trying to obtain daily batches again");
|
||||
Thread.sleep(waitingTimeMs);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.warn("Interrupted while waiting to load daily batches", e);
|
||||
Thread.currentThread().interrupt();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (isStatusBatchLoadingEnabled() && refreshByLoadingHDFSStatusBatches(hdfs)) {
|
||||
LOG.info("Successfully loaded daily status batches after {}", stopwatch);
|
||||
return true;
|
||||
}
|
||||
|
||||
final AtomicBoolean successRef = new AtomicBoolean(false);
|
||||
if (computeDailyBatchesWithZKLock(hdfs, successRef, stopwatch)) {
|
||||
return successRef.get();
|
||||
}
|
||||
|
||||
timeSpentOnDailyBatches = stopwatch.elapsed(TimeUnit.MILLISECONDS);
|
||||
firstLoop = false;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean computeDailyBatchesWithZKLock(final FileSystem hdfs,
|
||||
final AtomicBoolean successRef,
|
||||
final Stopwatch stopwatch) throws IOException {
|
||||
// Using a global lock to coordinate among earlybirds and segment builders so that only
|
||||
// one instance would hit the HDFS name node to query the daily status directories
|
||||
TryLock lock = zkTryLockFactory.createTryLock(
|
||||
DatabaseConfig.getLocalHostname(),
|
||||
DAILY_STATUS_BATCHES_SYNC_PATH,
|
||||
DAILY_BATCHES_ZK_LOCK,
|
||||
DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES);
|
||||
|
||||
return lock.tryWithLock(() -> {
|
||||
LOG.info("Obtained ZK lock to compute daily status batches after {}", stopwatch);
|
||||
successRef.set(initialLoadDailyBatchInfos(hdfs));
|
||||
if (successRef.get()) {
|
||||
LOG.info("Successfully computed daily status batches after {}", stopwatch);
|
||||
if (isStatusBatchFlushingEnabled()) {
|
||||
LOG.info("Starting to store daily status batches to HDFS");
|
||||
if (storeStatusBatchesToHdfs(hdfs, lastValidDay)) {
|
||||
LOG.info("Successfully stored daily status batches to HDFS");
|
||||
} else {
|
||||
LOG.warn("Failed storing daily status batches to HDFS");
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG.info("Failed loading daily status info");
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private void verifyDirectory(FileSystem hdfs) throws IOException {
|
||||
if (!hdfs.exists(rootPath)) {
|
||||
throw new IOException("Root dir '" + rootPath + "' does not exist.");
|
||||
}
|
||||
|
||||
if (!hdfs.exists(buildGenPath)) {
|
||||
throw new IOException("Build gen dir '" + buildGenPath + "' does not exist.");
|
||||
}
|
||||
|
||||
if (!hdfs.exists(statusPath)) {
|
||||
throw new IOException("Status dir '" + statusPath + "' does not exist.");
|
||||
}
|
||||
}
|
||||
|
||||
private void loadNewDailyBatches(FileSystem hdfs) throws IOException {
|
||||
Preconditions.checkNotNull(lastValidDay);
|
||||
|
||||
Calendar day = Calendar.getInstance();
|
||||
day.setTime(lastValidDay);
|
||||
day.add(Calendar.DATE, 1);
|
||||
|
||||
while (loadDay(hdfs, day.getTime()) != null) {
|
||||
lastValidDay = day.getTime();
|
||||
day.add(Calendar.DATE, 1);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean initialLoadDailyBatchInfos(FileSystem hdfs) throws IOException {
|
||||
LOG.info("Starting to build timeslice map from scratch.");
|
||||
|
||||
final Date lastValidStatusDay = getLastValidInputDateFromNow(hdfs);
|
||||
|
||||
if (lastValidStatusDay == null) {
|
||||
LOG.warn("No data found in " + statusPath + " and scrubbed path");
|
||||
return false;
|
||||
}
|
||||
int mostRecentYear = DateUtil.getCalendar(lastValidStatusDay).get(Calendar.YEAR);
|
||||
for (int year = 2006; year <= mostRecentYear; ++year) {
|
||||
// construct path to avoid hdfs.listStatus() calls
|
||||
Calendar day = Calendar.getInstance();
|
||||
day.set(year, Calendar.JANUARY, 1, 0, 0, 0);
|
||||
day.set(Calendar.MILLISECOND, 0);
|
||||
|
||||
Calendar yearEnd = Calendar.getInstance();
|
||||
yearEnd.set(year, Calendar.DECEMBER, 31, 0, 0, 0);
|
||||
yearEnd.set(Calendar.MILLISECOND, 0);
|
||||
|
||||
if (lastValidDay != null) {
|
||||
// We're updating.
|
||||
if (lastValidDay.after(yearEnd.getTime())) {
|
||||
// This year was already loaded.
|
||||
continue;
|
||||
}
|
||||
if (lastValidDay.after(day.getTime())) {
|
||||
// Start one day after last valid date.
|
||||
day.setTime(lastValidDay);
|
||||
day.add(Calendar.DATE, 1);
|
||||
}
|
||||
}
|
||||
|
||||
for (; !day.after(yearEnd); day.add(Calendar.DATE, 1)) {
|
||||
loadDay(hdfs, day.getTime());
|
||||
}
|
||||
}
|
||||
|
||||
boolean updated = false;
|
||||
numberOfDaysWithValidScrubGenData = 0;
|
||||
|
||||
// Iterate batches in sorted order.
|
||||
for (DailyStatusBatch batch : statusBatches.values()) {
|
||||
if (!batch.isValid()) {
|
||||
break;
|
||||
}
|
||||
if (batch.getDate().before(scrubGenDay)) {
|
||||
numberOfDaysWithValidScrubGenData++;
|
||||
}
|
||||
if (firstValidDay == null) {
|
||||
firstValidDay = batch.getDate();
|
||||
}
|
||||
if (lastValidDay == null || lastValidDay.before(batch.getDate())) {
|
||||
lastValidDay = batch.getDate();
|
||||
updated = true;
|
||||
}
|
||||
}
|
||||
|
||||
LOG.info("Number of statusBatches: {}", statusBatches.size());
|
||||
return updated;
|
||||
}
|
||||
|
||||
private static String filesToString(FileStatus[] files) {
|
||||
if (files == null) {
|
||||
return "null";
|
||||
}
|
||||
StringBuilder b = new StringBuilder();
|
||||
for (FileStatus s : files) {
|
||||
b.append(s.getPath().toString()).append(", ");
|
||||
}
|
||||
return b.toString();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected DailyStatusBatch loadDay(FileSystem hdfs, Date day) throws IOException {
|
||||
Path dayPath = new Path(getStatusPathToUseForDay(day), ArchiveHDFSUtils.dateToPath(day, "/"));
|
||||
LOG.debug("Looking for batch in " + dayPath.toString());
|
||||
DailyStatusBatch result = this.statusBatches.get(day);
|
||||
if (result != null) {
|
||||
return result;
|
||||
}
|
||||
|
||||
final FileStatus[] files;
|
||||
try {
|
||||
files = hdfs.listStatus(dayPath);
|
||||
LOG.debug("Files found: " + filesToString(files));
|
||||
} catch (FileNotFoundException e) {
|
||||
LOG.debug("loadDay() called, but directory does not exist for day: " + day
|
||||
+ " in: " + dayPath);
|
||||
return null;
|
||||
}
|
||||
|
||||
if (files != null && files.length > 0) {
|
||||
for (FileStatus file : files) {
|
||||
Matcher matcher = HASH_PARTITION_PATTERN.matcher(file.getPath().getName());
|
||||
if (matcher.matches()) {
|
||||
int numHashPartitions = Integer.parseInt(matcher.group(2));
|
||||
result = new DailyStatusBatch(
|
||||
day, numHashPartitions, getStatusPathToUseForDay(day), hdfs);
|
||||
|
||||
for (int partitionID = 0; partitionID < numHashPartitions; partitionID++) {
|
||||
result.addPartition(hdfs, dayPath, partitionID);
|
||||
}
|
||||
|
||||
if (result.isValid()) {
|
||||
statusBatches.put(day, result);
|
||||
return result;
|
||||
} else {
|
||||
LOG.info("Invalid batch found for day: " + day + ", batch: " + result);
|
||||
}
|
||||
} else {
|
||||
// skip logging the intermediate count subdirectories or _SUCCESS files.
|
||||
if (!INTERMEDIATE_COUNTS_SUBDIR_NAME.equals(file.getPath().getName())
|
||||
&& !SUCCESS_FILE_NAME.equals(file.getPath().getName())) {
|
||||
LOG.warn("Path does not match hash partition pattern: " + file.getPath());
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG.warn("No data found for day: " + day + " in: " + dayPath
|
||||
+ " files null: " + (files == null));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if this directory has a valid batch for the given day.
|
||||
*/
|
||||
public boolean hasValidBatchForDay(Date day) throws IOException {
|
||||
FileSystem hdfs = null;
|
||||
try {
|
||||
hdfs = HdfsUtil.getHdfsFileSystem();
|
||||
return hasValidBatchForDay(hdfs, day);
|
||||
} finally {
|
||||
IOUtils.closeQuietly(hdfs);
|
||||
}
|
||||
}
|
||||
|
||||
private boolean hasValidBatchForDay(FileSystem fs, Date day) throws IOException {
|
||||
DailyStatusBatch batch = loadDay(fs, day);
|
||||
|
||||
return batch != null && batch.isValid();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Date getFirstValidDay() {
|
||||
return firstValidDay;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Date getLastValidDay() {
|
||||
return lastValidDay;
|
||||
}
|
||||
|
||||
private Date getLastValidInputDateFromNow(FileSystem hdfs) throws IOException {
|
||||
Calendar cal = Calendar.getInstance();
|
||||
cal.setTime(new Date()); // current date
|
||||
return getLastValidInputDate(hdfs, cal);
|
||||
}
|
||||
|
||||
/**
|
||||
* Starting from current date, probe backward till we find a valid input Date
|
||||
*/
|
||||
@VisibleForTesting
|
||||
Date getLastValidInputDate(FileSystem hdfs, Calendar cal) throws IOException {
|
||||
cal.set(Calendar.MILLISECOND, 0);
|
||||
cal.set(Calendar.HOUR_OF_DAY, 0);
|
||||
cal.set(Calendar.MINUTE, 0);
|
||||
cal.set(Calendar.SECOND, 0);
|
||||
cal.set(Calendar.MILLISECOND, 0);
|
||||
Date lastValidInputDate = cal.getTime();
|
||||
LOG.info("Probing backwards for last valid data date from " + lastValidInputDate);
|
||||
while (lastValidInputDate.after(FIRST_TWITTER_DAY)) {
|
||||
if (hasValidBatchForDay(hdfs, lastValidInputDate)) {
|
||||
LOG.info("Found latest valid data on date " + lastValidInputDate);
|
||||
LOG.info(" Used path: {}", getStatusPathToUseForDay(lastValidInputDate));
|
||||
return lastValidInputDate;
|
||||
}
|
||||
cal.add(Calendar.DATE, -1);
|
||||
lastValidInputDate = cal.getTime();
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the daily status batches are already on HDFS
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean hasStatusBatchesOnHdfs(FileSystem fs, Date lastDataDay) {
|
||||
String hdfsFileName = getHdfsStatusBatchSyncFileName(lastDataDay);
|
||||
try {
|
||||
return fs.exists(new Path(hdfsFileName));
|
||||
} catch (IOException ex) {
|
||||
LOG.error("Failed checking status batch file on HDFS: " + hdfsFileName, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load the daily status batches from HDFS by first copying the file from HDFS to local disk
|
||||
* and then reading from the local disk.
|
||||
*
|
||||
* @param day the latest day of valid statuses.
|
||||
* @return true if the loading is successful.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean loadStatusBatchesFromHdfs(FileSystem fs, Date day) {
|
||||
// set the directory state to initial state
|
||||
resetDirectory();
|
||||
|
||||
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
|
||||
String fileLocalPath = getLocalStatusBatchSyncFileName(day);
|
||||
|
||||
LOG.info("Using " + fileHdfsPath + " as the HDFS batch summary load path.");
|
||||
LOG.info("Using " + fileLocalPath + " as the local batch summary sync path.");
|
||||
|
||||
LineRecordFileReader lineReader = null;
|
||||
try {
|
||||
fs.copyToLocalFile(new Path(fileHdfsPath), new Path(fileLocalPath));
|
||||
|
||||
lineReader = new LineRecordFileReader(fileLocalPath);
|
||||
String batchLine;
|
||||
while ((batchLine = lineReader.readNext()) != null) {
|
||||
DailyStatusBatch batch = DailyStatusBatch.deserializeFromJson(batchLine);
|
||||
if (batch == null) {
|
||||
LOG.error("Invalid daily status batch constructed from line: " + batchLine);
|
||||
resetDirectory();
|
||||
return false;
|
||||
}
|
||||
Date date = batch.getDate();
|
||||
if (firstValidDay == null || firstValidDay.after(date)) {
|
||||
firstValidDay = date;
|
||||
}
|
||||
if (lastValidDay == null || lastValidDay.before(date)) {
|
||||
lastValidDay = date;
|
||||
}
|
||||
statusBatches.put(date, batch);
|
||||
}
|
||||
LOG.info("Loaded {} status batches from HDFS: {}",
|
||||
statusBatches.size(), fileHdfsPath);
|
||||
LOG.info("First entry: {}", statusBatches.firstEntry().getValue().toString());
|
||||
LOG.info("Last entry: {}", statusBatches.lastEntry().getValue().toString());
|
||||
|
||||
return true;
|
||||
} catch (IOException ex) {
|
||||
LOG.error("Failed loading time slices from HDFS: " + fileHdfsPath, ex);
|
||||
resetDirectory();
|
||||
return false;
|
||||
} finally {
|
||||
if (lineReader != null) {
|
||||
lineReader.stop();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Flush the daily status batches to local disk and then upload to HDFS.
|
||||
*/
|
||||
private boolean storeStatusBatchesToHdfs(FileSystem fs, Date day) {
|
||||
Preconditions.checkNotNull(lastValidDay);
|
||||
|
||||
if (!StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.isOfficial()) {
|
||||
LOG.info("Status batch flush version is not official, no batches will be flushed to HDFS");
|
||||
return true;
|
||||
}
|
||||
|
||||
String fileLocalPath = getLocalStatusBatchSyncFileName(day);
|
||||
|
||||
// Flush to local disk
|
||||
File outputFile = null;
|
||||
FileWriter fileWriter = null;
|
||||
try {
|
||||
LOG.info("Flushing daily status batches into: " + fileLocalPath);
|
||||
outputFile = new File(fileLocalPath);
|
||||
outputFile.getParentFile().mkdirs();
|
||||
if (!outputFile.getParentFile().exists()) {
|
||||
LOG.error("Cannot create directory: " + outputFile.getParentFile().toString());
|
||||
return false;
|
||||
}
|
||||
fileWriter = new FileWriter(outputFile, false);
|
||||
for (Date date : statusBatches.keySet()) {
|
||||
fileWriter.write(statusBatches.get(date).serializeToJson());
|
||||
fileWriter.write("\n");
|
||||
}
|
||||
fileWriter.flush();
|
||||
|
||||
// Upload the file to HDFS
|
||||
return uploadStatusBatchesToHdfs(fs, day);
|
||||
} catch (IOException e) {
|
||||
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
|
||||
LOG.error("Failed storing status batches to HDFS: " + fileHdfsPath, e);
|
||||
return false;
|
||||
} finally {
|
||||
try {
|
||||
if (fileWriter != null) {
|
||||
fileWriter.close();
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.error("Error to close fileWrite.", e);
|
||||
}
|
||||
if (outputFile != null) {
|
||||
// Delete the local file
|
||||
outputFile.delete();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Upload the status batches to HDFS.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean uploadStatusBatchesToHdfs(FileSystem fs, Date day) {
|
||||
String localFileName = getLocalStatusBatchSyncFileName(day);
|
||||
String hdfsFileName = getHdfsStatusBatchSyncFileName(day);
|
||||
|
||||
LOG.info("Using " + hdfsFileName + " as the HDFS batch summary upload path.");
|
||||
LOG.info("Using " + localFileName + " as the local batch summary sync path.");
|
||||
|
||||
try {
|
||||
Path hdfsFilePath = new Path(hdfsFileName);
|
||||
if (fs.exists(hdfsFilePath)) {
|
||||
LOG.warn("Found status batch file on HDFS: " + hdfsFileName);
|
||||
return true;
|
||||
}
|
||||
|
||||
String hdfsTempName = getHdfsStatusBatchTempSyncFileName(day);
|
||||
Path hdfsTempPath = new Path(hdfsTempName);
|
||||
if (fs.exists(hdfsTempPath)) {
|
||||
LOG.info("Found existing temporary status batch file on HDFS, removing: " + hdfsTempName);
|
||||
if (!fs.delete(hdfsTempPath, false)) {
|
||||
LOG.error("Failed to delete temporary file: " + hdfsTempName);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
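// Copy to a host-specific temp file first and then rename, so a partially written batch
// summary is never visible at the final HDFS path.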
fs.copyFromLocalFile(new Path(localFileName), hdfsTempPath);
|
||||
|
||||
if (fs.rename(hdfsTempPath, hdfsFilePath)) {
|
||||
LOG.debug("Renamed " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
||||
return true;
|
||||
} else {
|
||||
LOG.error("Failed to rename " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
||||
return false;
|
||||
}
|
||||
} catch (IOException ex) {
|
||||
LOG.error("Failed uploading status batch file to HDFS: " + hdfsFileName, ex);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isStatusBatchFlushingEnabled() {
|
||||
return EarlybirdProperty.ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED.get(false);
|
||||
}
|
||||
|
||||
private static boolean isStatusBatchLoadingEnabled() {
|
||||
return EarlybirdConfig.getBool("archive_daily_status_batch_loading_enabled", false);
|
||||
}
|
||||
|
||||
private static String getVersionFileExtension() {
|
||||
return StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension();
|
||||
}
|
||||
|
||||
String getStatusBatchSyncRootDir() {
|
||||
return EarlybirdConfig.getString("archive_daily_status_batch_sync_dir",
|
||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
String getLocalStatusBatchSyncFileName(Date day) {
|
||||
return getStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
||||
}
|
||||
|
||||
String getHdfsStatusBatchSyncRootDir() {
|
||||
return EarlybirdConfig.getString("hdfs_archive_daily_status_batch_sync_dir",
|
||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
String getHdfsStatusBatchSyncFileName(Date day) {
|
||||
return getHdfsStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
||||
}
|
||||
|
||||
private String getHdfsStatusBatchTempSyncFileName(Date day) {
|
||||
return getHdfsStatusBatchSyncRootDir() + "/" + DatabaseConfig.getLocalHostname() + "_"
|
||||
+ STATUS_BATCHES_PREFIX + "_" + DATE_FORMAT.format(day) + getVersionFileExtension();
|
||||
}
|
||||
|
||||
private String scrubGenSuffix() {
|
||||
return String.format(SCRUB_GEN_SUFFIX_PATTERN, DATE_FORMAT.format(scrubGenDay));
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the path to the directory that stores the statuses for the given day.
|
||||
*/
|
||||
public Path getStatusPathToUseForDay(Date day) {
|
||||
if (!day.before(scrubGenDay)) {
|
||||
return statusPath;
|
||||
}
|
||||
|
||||
String suffix = scrubGenSuffix();
|
||||
Preconditions.checkArgument(!suffix.isEmpty());
|
||||
Path scrubPath = new Path(buildGenPath, suffix);
|
||||
return new Path(scrubPath, STATUS_SUBDIR_NAME);
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines if the data for the specified scrub gen was fully built, by checking the number of
|
||||
* days for which data was built against the expected number of days extracted from the specified
|
||||
* scrub gen date.
|
||||
*/
|
||||
public boolean isScrubGenDataFullyBuilt(FileSystem hdfs) throws IOException {
|
||||
initialLoadDailyBatchInfos(hdfs);
|
||||
if (numberOfDaysWithValidScrubGenData == 0) {
|
||||
LOG.warn("numberOfDaysWithValidScrubGenData is 0");
|
||||
}
|
||||
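// The scrub gen data is considered fully built only when the number of days with valid data
// matches the number of days between FIRST_TWEET_DAY and the scrub gen date.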
long expectedDays = getDiffBetweenDays(scrubGenDay);
|
||||
return expectedDays == numberOfDaysWithValidScrubGenData;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
long getDiffBetweenDays(Date day) {
|
||||
long diff = day.getTime() - FIRST_TWEET_DAY.getTime();
|
||||
return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,333 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive;
|
||||
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Function;
|
||||
import com.google.common.base.Predicate;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.fs.PathFilter;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.search.common.config.Config;
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
|
||||
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
|
||||
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
|
||||
import com.twitter.search.common.util.date.DateUtil;
|
||||
import com.twitter.search.common.util.io.EmptyRecordReader;
|
||||
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
|
||||
import com.twitter.search.common.util.io.MergingSortedRecordReader;
|
||||
import com.twitter.search.common.util.io.TransformingRecordReader;
|
||||
import com.twitter.search.common.util.io.recordreader.RecordReader;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
import com.twitter.search.earlybird.document.DocumentFactory;
|
||||
import com.twitter.search.earlybird.document.TweetDocument;
|
||||
import com.twitter.search.earlybird.partition.HdfsUtil;
|
||||
|
||||
/**
|
||||
* A batch of pre-processed tweets for a single hash partition from a particular day.
|
||||
*/
|
||||
public class PartitionedBatch {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(PartitionedBatch.class);
|
||||
private static final Date START_DATE_INCLUSIVE = DateUtil.toDate(2006, 03, 21);
|
||||
private static final String STATUS_COUNT_FILE_PREFIX = "_status_count_";
|
||||
private static final Pattern STATUS_COUNT_FILE_PATTERN =
|
||||
Pattern.compile(STATUS_COUNT_FILE_PREFIX + "(\\d+)_minid_(\\d+)_maxid_(\\d+)");
|
||||
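// e.g. "_status_count_12345_minid_100_maxid_900" (hypothetical values) encodes the tweet count
// and the min/max status IDs for this partition and day.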
private static final int MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS =
|
||||
EarlybirdConfig.getInt("archive_max_out_of_order_tolerance_hours", 12);
|
||||
private static final int READER_INIT_IOEXCEPTION_RETRIES = 20;
|
||||
private static final PathFilter LZO_DATA_FILES_FILTER = file -> file.getName().endsWith(".lzo");
|
||||
private static final PathFilter TXT_DATA_FILES_FILTER = file -> file.getName().endsWith(".txt");
|
||||
|
||||
private static final Comparator<ThriftIndexingEvent> DESC_THRIFT_INDEXING_EVENT_COMPARATOR =
|
||||
(o1, o2) -> ComparisonChain.start()
|
||||
.compare(o2.getSortId(), o1.getSortId())
|
||||
.compare(o2.getUid(), o1.getUid())
|
||||
.result();
|
||||
|
||||
// Number of archive tweets skipped because they are too far out of order.
|
||||
private static final SearchCounter OUT_OF_ORDER_STATUSES_SKIPPED =
|
||||
SearchCounter.export("out_of_order_archive_statuses_skipped");
|
||||
|
||||
@VisibleForTesting
|
||||
protected static final long MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS =
|
||||
TimeUnit.HOURS.toMillis(MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS);
|
||||
|
||||
private final Date date;
|
||||
private final Path path;
|
||||
private int statusCount;
|
||||
private long minStatusID;
|
||||
private long maxStatusID;
|
||||
private final int hashPartitionID;
|
||||
private boolean hasStatusCountFile;
|
||||
private final int numHashPartitions;
|
||||
|
||||
@VisibleForTesting
|
||||
public PartitionedBatch(
|
||||
Path path,
|
||||
int hashPartitionID,
|
||||
int numHashPartitions,
|
||||
Date date) {
|
||||
this.path = path;
|
||||
this.hashPartitionID = hashPartitionID;
|
||||
this.numHashPartitions = numHashPartitions;
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Loads all the information (tweet count, etc.) for this partition and day from HDFS.
|
||||
*/
|
||||
public void load(FileSystem hdfs) throws IOException {
|
||||
FileStatus[] dailyBatchFiles = null;
|
||||
try {
|
||||
// listStatus() javadoc says it throws FileNotFoundException when path does not exist.
|
||||
// However, the actual implementations return null or an empty array instead.
|
||||
// We handle all 3 cases: null, empty array, or FileNotFoundException.
|
||||
dailyBatchFiles = hdfs.listStatus(path);
|
||||
} catch (FileNotFoundException e) {
|
||||
// don't do anything here and the day will be handled as empty.
|
||||
}
|
||||
|
||||
if (dailyBatchFiles != null && dailyBatchFiles.length > 0) {
|
||||
for (FileStatus file : dailyBatchFiles) {
|
||||
String fileName = file.getPath().getName();
|
||||
if (fileName.equals(STATUS_COUNT_FILE_PREFIX)) {
|
||||
// zero tweets in this partition - this can happen for early days in 2006
|
||||
handleEmptyPartition();
|
||||
} else {
|
||||
Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName);
|
||||
if (matcher.matches()) {
|
||||
try {
|
||||
statusCount = Integer.parseInt(matcher.group(1));
|
||||
// Only adjust the min status ID in production; in tests, adjusting it makes the tests
// harder to understand.
|
||||
minStatusID = Config.environmentIsTest() ? Long.parseLong(matcher.group(2))
|
||||
: adjustMinStatusId(Long.parseLong(matcher.group(2)), date);
|
||||
maxStatusID = Long.parseLong(matcher.group(3));
|
||||
hasStatusCountFile = true;
|
||||
} catch (NumberFormatException e) {
|
||||
// invalid file - ignore
|
||||
LOG.warn("Could not parse status count file name.", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// The partition folder does not exist. This can happen for early days of Twitter
// where some partitions are empty. Mark this partition as having a status count file; the
// validity of the parent DailyStatusBatch will still be determined by whether there was a
// _SUCCESS file in the day root.
|
||||
handleEmptyPartition();
|
||||
|
||||
if (date.after(getEarliestDenseDay())) {
|
||||
LOG.error("Unexpected empty directory {} for {}", path, date);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void handleEmptyPartition() {
|
||||
statusCount = 0;
|
||||
minStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
|
||||
maxStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
|
||||
hasStatusCountFile = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Sometimes tweets are out of order (e.g. a tweet from Sep 2012 got into a
* batch in July 2013). See SEARCH-1750 for more details.
* This adjusts the minStatusID if it is badly out of order.
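* Example (hypothetical values): for a batch dated 2013-07-01 with a 12-hour tolerance, a
* minStatusID whose snowflake timestamp falls in Sep 2012 is replaced by an ID generated for
* 2013-06-30 12:00:00, i.e. the batch date minus MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS.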
*/
|
||||
@VisibleForTesting
|
||||
protected static long adjustMinStatusId(long minStatusID, Date date) {
|
||||
long dateTime = date.getTime();
|
||||
// If the daily batch is for a day before we started using snowflake IDs, never adjust.
|
||||
if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(dateTime)) {
|
||||
return minStatusID;
|
||||
}
|
||||
|
||||
long earliestStartTime = dateTime - MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS;
|
||||
long minStatusTime = SnowflakeIdParser.getTimestampFromTweetId(minStatusID);
|
||||
if (minStatusTime < earliestStartTime) {
|
||||
long newMinId = SnowflakeIdParser.generateValidStatusId(earliestStartTime, 0);
|
||||
LOG.info("Daily batch for " + date + " has badly out of order tweet: " + minStatusID
|
||||
+ ". The minStatusID for the day this batch is adjusted to " + newMinId);
|
||||
return newMinId;
|
||||
} else {
|
||||
return minStatusID;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a reader that reads tweets from the given directory.
|
||||
*
|
||||
* @param archiveSegment Determines the timeslice ID of all read tweets.
|
||||
* @param tweetsPath The path to the directory where the tweets for this day are stored.
|
||||
* @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
|
||||
*/
|
||||
public RecordReader<TweetDocument> getTweetReaders(
|
||||
ArchiveSegment archiveSegment,
|
||||
Path tweetsPath,
|
||||
DocumentFactory<ThriftIndexingEvent> documentFactory) throws IOException {
|
||||
RecordReader<TweetDocument> tweetDocumentReader =
|
||||
new TransformingRecordReader<>(
|
||||
createTweetReader(tweetsPath), new Function<ThriftIndexingEvent, TweetDocument>() {
|
||||
@Override
|
||||
public TweetDocument apply(ThriftIndexingEvent event) {
|
||||
return new TweetDocument(
|
||||
event.getSortId(),
|
||||
archiveSegment.getTimeSliceID(),
|
||||
EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
|
||||
documentFactory.newDocument(event)
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
tweetDocumentReader.setExhaustStream(true);
|
||||
return tweetDocumentReader;
|
||||
}
|
||||
|
||||
private RecordReader<ThriftIndexingEvent> createTweetReader(Path tweetsPath) throws IOException {
|
||||
if (date.before(START_DATE_INCLUSIVE)) {
|
||||
return new EmptyRecordReader<>();
|
||||
}
|
||||
|
||||
List<RecordReader<ThriftIndexingEvent>> readers = Lists.newArrayList();
|
||||
FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
|
||||
try {
|
||||
Path dayPath = new Path(tweetsPath, ArchiveHDFSUtils.dateToPath(date, "/"));
|
||||
Path partitionPath =
|
||||
new Path(dayPath, String.format("p_%d_of_%d", hashPartitionID, numHashPartitions));
|
||||
PathFilter pathFilter =
|
||||
Config.environmentIsTest() ? TXT_DATA_FILES_FILTER : LZO_DATA_FILES_FILTER;
|
||||
FileStatus[] files = hdfs.listStatus(partitionPath, pathFilter);
|
||||
for (FileStatus fileStatus : files) {
|
||||
String fileStatusPath = fileStatus.getPath().toString().replaceAll("file:/", "/");
|
||||
RecordReader<ThriftIndexingEvent> reader = createRecordReaderWithRetries(fileStatusPath);
|
||||
readers.add(reader);
|
||||
}
|
||||
} finally {
|
||||
IOUtils.closeQuietly(hdfs);
|
||||
}
|
||||
|
||||
if (readers.isEmpty()) {
|
||||
return new EmptyRecordReader<>();
|
||||
}
|
||||
|
||||
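// Merge the per-file readers into a single stream ordered by the descending (sortId, uid)
// comparator defined above, i.e. newest tweets first.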
return new MergingSortedRecordReader<>(DESC_THRIFT_INDEXING_EVENT_COMPARATOR, readers);
|
||||
}
|
||||
|
||||
private RecordReader<ThriftIndexingEvent> createRecordReaderWithRetries(String filePath)
|
||||
throws IOException {
|
||||
Predicate<ThriftIndexingEvent> recordFilter = getRecordFilter();
|
||||
int numTries = 0;
|
||||
while (true) {
|
||||
try {
|
||||
++numTries;
|
||||
return new LzoThriftBlockFileReader<>(filePath, ThriftIndexingEvent.class, recordFilter);
|
||||
} catch (IOException e) {
|
||||
if (numTries < READER_INIT_IOEXCEPTION_RETRIES) {
|
||||
LOG.warn("Failed to open LzoThriftBlockFileReader for " + filePath + ". Will retry.", e);
|
||||
} else {
|
||||
LOG.error("Failed to open LzoThriftBlockFileReader for " + filePath
|
||||
+ " after too many retries.", e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Predicate<ThriftIndexingEvent> getRecordFilter() {
|
||||
return Config.environmentIsTest() ? null : input -> {
|
||||
if (input == null) {
|
||||
return false;
|
||||
}
|
||||
// We only guard against status IDs that are too small, because it is possible
|
||||
// for a very old tweet to get into today's batch, but not possible for a very
|
||||
// large ID (a future tweet ID that is not yet published) to get in today's
|
||||
// batch, unless tweet ID generation messed up.
|
||||
long statusId = input.getSortId();
|
||||
boolean keep = statusId >= minStatusID;
|
||||
if (!keep) {
|
||||
LOG.debug("Out of order documentId: {} minStatusID: {} Date: {} Path: {}",
|
||||
statusId, minStatusID, date, path);
|
||||
OUT_OF_ORDER_STATUSES_SKIPPED.increment();
|
||||
}
|
||||
return keep;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of statuses in this batch
|
||||
*/
|
||||
public int getStatusCount() {
|
||||
return statusCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Whether the _status_count file was found in this folder.
|
||||
*/
|
||||
public boolean hasStatusCount() {
|
||||
return hasStatusCountFile;
|
||||
}
|
||||
|
||||
public long getMinStatusID() {
|
||||
return minStatusID;
|
||||
}
|
||||
|
||||
public long getMaxStatusID() {
|
||||
return maxStatusID;
|
||||
}
|
||||
|
||||
public Date getDate() {
|
||||
return date;
|
||||
}
|
||||
|
||||
public Path getPath() {
|
||||
return path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks whether the partition is empty and whether that is disallowed
* (an empty partition can only happen before 2010).
* An empty partition means that the directory was missing when the scan happened.
*
* @return true if the partition has no documents and that is not allowed.
|
||||
*/
|
||||
public boolean isDisallowedEmptyPartition() {
|
||||
return hasStatusCountFile
|
||||
&& statusCount == 0
|
||||
&& minStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
|
||||
&& maxStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
|
||||
&& date.after(getEarliestDenseDay());
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "PartitionedBatch[hashPartitionId=" + hashPartitionID
|
||||
+ ",numHashPartitions=" + numHashPartitions
|
||||
+ ",date=" + date
|
||||
+ ",path=" + path
|
||||
+ ",hasStatusCountFile=" + hasStatusCountFile
|
||||
+ ",statusCount=" + statusCount + "]";
|
||||
}
|
||||
|
||||
private Date getEarliestDenseDay() {
|
||||
return EarlybirdConfig.getDate("archive_search_earliest_dense_day");
|
||||
}
|
||||
}
|
@ -1,64 +0,0 @@
|
||||
java_library(
|
||||
name = "segment_builder_lib",
|
||||
sources = ["**/*.java"],
|
||||
platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/google/guava",
|
||||
"3rdparty/jvm/com/google/inject:guice",
|
||||
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
|
||||
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
|
||||
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
|
||||
"3rdparty/jvm/org/slf4j:slf4j-api",
|
||||
"decider/src/main/scala",
|
||||
"finatra/inject/inject-core/src/main/scala",
|
||||
"finatra/inject/inject-server/src/main/scala/com/twitter/inject/server",
|
||||
"src/java/com/twitter/common/base",
|
||||
"src/java/com/twitter/common/quantity",
|
||||
"src/java/com/twitter/common/util:system-mocks",
|
||||
"src/java/com/twitter/common_internal/text/version",
|
||||
"src/java/com/twitter/search/common/config",
|
||||
"src/java/com/twitter/search/common/database",
|
||||
"src/java/com/twitter/search/common/metrics",
|
||||
"src/java/com/twitter/search/common/partitioning/base",
|
||||
"src/java/com/twitter/search/common/partitioning/zookeeper",
|
||||
"src/java/com/twitter/search/common/schema",
|
||||
"src/java/com/twitter/search/common/schema/base",
|
||||
"src/java/com/twitter/search/common/util:closeresourceutil",
|
||||
"src/java/com/twitter/search/common/util:gcutil",
|
||||
"src/java/com/twitter/search/common/util:kerberos",
|
||||
"src/java/com/twitter/search/common/util/date",
|
||||
"src/java/com/twitter/search/common/util/io:flushable",
|
||||
"src/java/com/twitter/search/common/util/zktrylock",
|
||||
"src/java/com/twitter/search/common/util/zookeeper",
|
||||
"src/java/com/twitter/search/earlybird:earlybird-lib",
|
||||
"src/java/com/twitter/search/earlybird/common",
|
||||
"src/java/com/twitter/search/earlybird/common/config",
|
||||
"src/java/com/twitter/search/earlybird/common/userupdates",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
||||
|
||||
# Using the hadoop_binary target automatically excludes Hadoop-related jars from the built jar
# and loads in the right jars based on the Hadoop config.
|
||||
hadoop_binary(
|
||||
name = "segment_builder_binary",
|
||||
basename = "segment_builder",
|
||||
main = "com.twitter.search.earlybird.archive.segmentbuilder.SegmentBuilderMain",
|
||||
platform = "java8",
|
||||
runtime_platform = "java8",
|
||||
tags = [
|
||||
"bazel-compatible",
|
||||
"bazel-compatible:migrated",
|
||||
"bazel-only",
|
||||
],
|
||||
dependencies = [
|
||||
":segment_builder_lib",
|
||||
"src/java/com/twitter/search/common/logging:search-log4j",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
@ -1,29 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
|
||||
import com.twitter.search.earlybird.partition.SegmentInfo;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
|
||||
public class BuiltAndFinalizedSegment extends SegmentBuilderSegment {
|
||||
public BuiltAndFinalizedSegment(
|
||||
SegmentInfo segmentInfo,
|
||||
SegmentConfig segmentConfig,
|
||||
EarlybirdSegmentFactory earlybirdSegmentFactory,
|
||||
int alreadyRetriedCount,
|
||||
SegmentSyncConfig sync) {
|
||||
|
||||
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
|
||||
}
|
||||
|
||||
@Override
|
||||
public SegmentBuilderSegment handle() throws SegmentInfoConstructionException,
|
||||
SegmentUpdaterException {
|
||||
|
||||
throw new IllegalStateException("Should not handle a BuildAndFinalizedSegment.");
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isBuilt() {
|
||||
return true;
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,101 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import com.google.common.base.Stopwatch;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.search.common.util.GCUtil;
|
||||
import com.twitter.search.common.util.zktrylock.TryLock;
|
||||
import com.twitter.search.earlybird.archive.ArchiveSegmentUpdater;
|
||||
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
|
||||
import com.twitter.search.earlybird.partition.SegmentInfo;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
|
||||
public class NotYetBuiltSegment extends SegmentBuilderSegment {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(NotYetBuiltSegment.class);
|
||||
|
||||
public NotYetBuiltSegment(
|
||||
SegmentInfo segmentInfo,
|
||||
SegmentConfig segmentConfig,
|
||||
EarlybirdSegmentFactory earlybirdSegmentFactory,
|
||||
int alreadyRetriedCount,
|
||||
SegmentSyncConfig sync) {
|
||||
|
||||
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
|
||||
}
|
||||
|
||||
/**
|
||||
* 1. Grab the ZK lock for this segment.
|
||||
* 2a. if lock fails, another host is updating; return the SOMEONE_ELSE_IS_BUILDING state.
|
||||
* 2b. if lock succeeds, check again if the updated segment exists on HDFS.
|
||||
* 3a. if so, just move on.
|
||||
* 3b. if not, update the segment.
|
||||
* In both cases, we need to check if the segment can now be marked as BUILT_AND_FINALIZED.
|
||||
*/
|
||||
@Override
|
||||
public SegmentBuilderSegment handle()
|
||||
throws SegmentUpdaterException, SegmentInfoConstructionException {
|
||||
LOG.info("Handling a not yet built segment: {}", this.getSegmentName());
|
||||
Stopwatch stopwatch = Stopwatch.createStarted();
|
||||
TryLock lock = getZooKeeperTryLock();
|
||||
|
||||
// The tryWithLock closure can only access variables from the enclosing scope that are final.
// However, we would like to pass the update's success flag back out, so here we use an
// AtomicBoolean reference instead of a Boolean.
|
||||
final AtomicBoolean successRef = new AtomicBoolean(false);
|
||||
boolean gotLock = lock.tryWithLock(() -> {
|
||||
ArchiveSegmentUpdater updater = new ArchiveSegmentUpdater(
|
||||
segmentConfig.getTryLockFactory(),
|
||||
sync,
|
||||
segmentConfig.getEarlybirdIndexConfig(),
|
||||
Clock.SYSTEM_CLOCK);
|
||||
|
||||
boolean success = updater.updateSegment(segmentInfo);
|
||||
successRef.set(success);
|
||||
});
|
||||
|
||||
if (!gotLock) {
|
||||
LOG.info("cannot acquire zookeeper lock for: " + segmentInfo);
|
||||
return new SomeoneElseIsBuildingSegment(
|
||||
segmentInfo,
|
||||
segmentConfig,
|
||||
earlybirdSegmentFactory,
|
||||
alreadyRetriedCount,
|
||||
sync);
|
||||
}
|
||||
|
||||
// 1. We want to make sure the heap is clean right after building a segment so that it's ready
// for us to start allocations for a new segment; we have had cases where we were seeing
// OOMs while building.
// 2. We also think this helps with compaction (vs. just organically running CMS), which would
// clean up the heap but may leave it in a fragmented state; running a full GC is supposed to
// compact the remaining tenured space.
|
||||
GCUtil.runGC();
|
||||
|
||||
if (successRef.get()) {
|
||||
LOG.info("Indexing segment {} took {}", segmentInfo, stopwatch);
|
||||
LOG.info("Finished building {}", segmentInfo.getSegment().getSegmentName());
|
||||
return new BuiltAndFinalizedSegment(
|
||||
segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
|
||||
} else {
|
||||
int alreadyTried = alreadyRetriedCount + 1;
|
||||
String errMsg = "failed updating segment for: " + segmentInfo
|
||||
+ " for " + alreadyTried + " times";
|
||||
LOG.error(errMsg);
|
||||
if (alreadyTried < segmentConfig.getMaxRetriesOnFailure()) {
|
||||
return new NotYetBuiltSegment(
|
||||
createNewSegmentInfo(segmentInfo),
|
||||
segmentConfig,
|
||||
earlybirdSegmentFactory,
|
||||
alreadyTried,
|
||||
sync);
|
||||
} else {
|
||||
throw new SegmentUpdaterException(errMsg);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,39 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
|
||||
import com.twitter.common.util.Clock;
|
||||
|
||||
/**
|
||||
* A class that prevents handling a given segment more than once every hdfsCheckIntervalMillis
|
||||
*/
|
||||
public class RateLimitingSegmentHandler {
|
||||
private final long hdfsCheckIntervalMillis;
|
||||
private final Clock clock;
|
||||
private final Map<String, Long> segmentNameToLastUpdatedTimeMillis = new HashMap<>();
|
||||
|
||||
RateLimitingSegmentHandler(long hdfsCheckIntervalMillis, Clock clock) {
|
||||
this.hdfsCheckIntervalMillis = hdfsCheckIntervalMillis;
|
||||
this.clock = clock;
|
||||
}
|
||||
|
||||
SegmentBuilderSegment processSegment(SegmentBuilderSegment segment)
|
||||
throws SegmentUpdaterException, SegmentInfoConstructionException {
|
||||
|
||||
String segmentName = segment.getSegmentName();
|
||||
|
||||
Long lastUpdatedMillis = segmentNameToLastUpdatedTimeMillis.get(segmentName);
|
||||
if (lastUpdatedMillis == null) {
|
||||
lastUpdatedMillis = 0L;
|
||||
}
|
||||
|
||||
long nowMillis = clock.nowMillis();
|
||||
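// If this segment was handled less than hdfsCheckIntervalMillis ago, return it unchanged
// instead of hitting HDFS again; otherwise record the new timestamp and delegate to handle().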
if (nowMillis - lastUpdatedMillis < hdfsCheckIntervalMillis) {
|
||||
return segment;
|
||||
}
|
||||
segmentNameToLastUpdatedTimeMillis.put(segmentName, nowMillis);
|
||||
|
||||
return segment.handle();
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,540 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Optional;
|
||||
import java.util.Random;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.base.Stopwatch;
|
||||
import com.google.common.collect.ComparisonChain;
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.common.util.concurrent.Uninterruptibles;
|
||||
import com.google.inject.Inject;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.common.quantity.Amount;
|
||||
import com.twitter.common.quantity.Time;
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.decider.Decider;
|
||||
import com.twitter.inject.annotations.Flag;
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.metrics.SearchLongGauge;
|
||||
import com.twitter.search.common.metrics.SearchStatsReceiver;
|
||||
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
|
||||
import com.twitter.search.common.partitioning.zookeeper.SearchZkClient;
|
||||
import com.twitter.search.common.util.Kerberos;
|
||||
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
|
||||
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
|
||||
import com.twitter.search.earlybird.archive.ArchiveSegment;
|
||||
import com.twitter.search.earlybird.archive.DailyStatusBatches;
|
||||
import com.twitter.search.earlybird.archive.ArchiveTimeSlicer;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
import com.twitter.search.earlybird.util.ScrubGenUtil;
|
||||
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
|
||||
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
|
||||
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
|
||||
import com.twitter.search.earlybird.partition.SegmentInfo;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;
|
||||
|
||||
/**
|
||||
* This class provides the core logic to build segment indices offline.
|
||||
* For each server, it coordinates via ZooKeeper to pick the next segment, builds the indices for it
* and uploads them to HDFS. A state machine is used to handle the build state transitions. There
|
||||
* are three states:
|
||||
* NOT_BUILD_YET: a segment that needs to be built
|
||||
* SOMEONE_ELSE_IS_BUILDING: another server is building the segment.
|
||||
* BUILT_AND_FINALIZED: the indices of this segment have already been built.
|
||||
*/
|
||||
public class SegmentBuilder {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilder.class);
|
||||
|
||||
private final boolean onlyRunOnce;
|
||||
private final int waitBetweenLoopsMins;
|
||||
private final int startUpBatchSize;
|
||||
private final int instance;
|
||||
private final int waitBetweenSegmentsSecs;
|
||||
private final int waitBeforeQuitMins;
|
||||
|
||||
// When multiple segment builders start simultaneously, they might overwhelm the HDFS name node
// and ZooKeeper. So we stagger start-up by letting some instances sleep before they start.
|
||||
private final long startUpSleepMins;
|
||||
|
||||
// If there are no more segments to build, wait this interval before checking again.
|
||||
private final long processWaitingInterval = TimeUnit.MINUTES.toMillis(10);
|
||||
|
||||
// The hash partitions for which segments will be built.
|
||||
private final ImmutableList<Integer> hashPartitions;
|
||||
|
||||
private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
|
||||
private final SearchIndexingMetricSet searchIndexingMetricSet =
|
||||
new SearchIndexingMetricSet(statsReceiver);
|
||||
private final EarlybirdSearcherStats searcherStats =
|
||||
new EarlybirdSearcherStats(statsReceiver);
|
||||
|
||||
private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
|
||||
|
||||
private final ZooKeeperTryLockFactory zkTryLockFactory;
|
||||
private final RateLimitingSegmentHandler segmentHandler;
|
||||
private final Clock clock;
|
||||
private final int numSegmentBuilderPartitions;
|
||||
private final int myPartitionId;
|
||||
private final SegmentConfig segmentConfig;
|
||||
private final EarlybirdSegmentFactory segmentFactory;
|
||||
private final SegmentBuilderCoordinator segmentBuilderCoordinator;
|
||||
private final SegmentSyncConfig segmentSyncConfig;
|
||||
private final Random random = new Random();
|
||||
|
||||
private static final double SLEEP_RANDOMIZATION_RATIO = .2;
|
||||
|
||||
// Stats
|
||||
// The flush version used to build segments
|
||||
private static final SearchLongGauge CURRENT_FLUSH_VERSION =
|
||||
SearchLongGauge.export("current_flush_version");
|
||||
|
||||
// Accumulated number and time in seconds spent on building segments locally
|
||||
private static SearchCounter segmentsBuiltLocally =
|
||||
SearchCounter.export("segments_built_locally");
|
||||
private static SearchCounter timeSpentOnSuccessfulBuildSecs =
|
||||
SearchCounter.export("time_spent_on_successful_build_secs");
|
||||
|
||||
// The total number of segments to be built
|
||||
private static final SearchLongGauge SEGMENTS_TO_BUILD =
|
||||
SearchLongGauge.export("segments_to_build");
|
||||
|
||||
// How many segments failed locally
|
||||
private static final SearchCounter FAILED_SEGMENTS =
|
||||
SearchCounter.export("failed_segments");
|
||||
|
||||
@Inject
|
||||
protected SegmentBuilder(@Flag("onlyRunOnce") boolean onlyRunOnceFlag,
|
||||
@Flag("waitBetweenLoopsMins") int waitBetweenLoopsMinsFlag,
|
||||
@Flag("startup_batch_size") int startUpBatchSizeFlag,
|
||||
@Flag("instance") int instanceFlag,
|
||||
@Flag("segmentZkLockExpirationHours")
|
||||
int segmentZkLockExpirationHoursFlag,
|
||||
@Flag("startupSleepMins") long startupSleepMinsFlag,
|
||||
@Flag("maxRetriesOnFailure") int maxRetriesOnFailureFlag,
|
||||
@Flag("hash_partitions") List<Integer> hashPartitionsFlag,
|
||||
@Flag("numSegmentBuilderPartitions") int numSegmentBuilderPartitionsFlag,
|
||||
@Flag("waitBetweenSegmentsSecs") int waitBetweenSegmentsSecsFlag,
|
||||
@Flag("waitBeforeQuitMins") int waitBeforeQuitMinsFlag,
|
||||
@Flag("scrubGen") String scrubGen,
|
||||
Decider decider) {
|
||||
this(onlyRunOnceFlag,
|
||||
waitBetweenLoopsMinsFlag,
|
||||
startUpBatchSizeFlag,
|
||||
instanceFlag,
|
||||
segmentZkLockExpirationHoursFlag,
|
||||
startupSleepMinsFlag,
|
||||
hashPartitionsFlag,
|
||||
maxRetriesOnFailureFlag,
|
||||
waitBetweenSegmentsSecsFlag,
|
||||
waitBeforeQuitMinsFlag,
|
||||
SearchZkClient.getSZooKeeperClient().createZooKeeperTryLockFactory(),
|
||||
new RateLimitingSegmentHandler(TimeUnit.MINUTES.toMillis(10), Clock.SYSTEM_CLOCK),
|
||||
Clock.SYSTEM_CLOCK,
|
||||
numSegmentBuilderPartitionsFlag,
|
||||
decider,
|
||||
getSyncConfig(scrubGen));
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected SegmentBuilder(boolean onlyRunOnceFlag,
|
||||
int waitBetweenLoopsMinsFlag,
|
||||
int startUpBatchSizeFlag,
|
||||
int instanceFlag,
|
||||
int segmentZkLockExpirationHoursFlag,
|
||||
long startupSleepMinsFlag,
|
||||
List<Integer> hashPartitions,
|
||||
int maxRetriesOnFailure,
|
||||
int waitBetweenSegmentsSecsFlag,
|
||||
int waitBeforeQuitMinsFlag,
|
||||
ZooKeeperTryLockFactory zooKeeperTryLockFactory,
|
||||
RateLimitingSegmentHandler segmentHandler,
|
||||
Clock clock,
|
||||
int numSegmentBuilderPartitions,
|
||||
Decider decider,
|
||||
SegmentSyncConfig syncConfig) {
|
||||
LOG.info("Creating SegmentBuilder");
|
||||
LOG.info("Penguin version in use: " + EarlybirdConfig.getPenguinVersion());
|
||||
|
||||
// Set command line flag values
|
||||
this.onlyRunOnce = onlyRunOnceFlag;
|
||||
this.waitBetweenLoopsMins = waitBetweenLoopsMinsFlag;
|
||||
this.startUpBatchSize = startUpBatchSizeFlag;
|
||||
this.instance = instanceFlag;
|
||||
this.waitBetweenSegmentsSecs = waitBetweenSegmentsSecsFlag;
|
||||
this.waitBeforeQuitMins = waitBeforeQuitMinsFlag;
|
||||
|
||||
this.segmentHandler = segmentHandler;
|
||||
this.zkTryLockFactory = zooKeeperTryLockFactory;
|
||||
this.segmentSyncConfig = syncConfig;
|
||||
this.startUpSleepMins = startupSleepMinsFlag;
|
||||
|
||||
if (!hashPartitions.isEmpty()) {
|
||||
this.hashPartitions = ImmutableList.copyOf(hashPartitions);
|
||||
} else {
|
||||
this.hashPartitions = null;
|
||||
}
|
||||
|
||||
Amount<Long, Time> segmentZKLockExpirationTime = Amount.of((long)
|
||||
segmentZkLockExpirationHoursFlag, Time.HOURS);
|
||||
|
||||
this.earlybirdIndexConfig =
|
||||
new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet,
|
||||
new CriticalExceptionHandler());
|
||||
|
||||
this.segmentConfig = new SegmentConfig(
|
||||
earlybirdIndexConfig,
|
||||
segmentZKLockExpirationTime,
|
||||
maxRetriesOnFailure,
|
||||
zkTryLockFactory);
|
||||
this.segmentFactory = new EarlybirdSegmentFactory(
|
||||
earlybirdIndexConfig,
|
||||
searchIndexingMetricSet,
|
||||
searcherStats,
|
||||
clock);
|
||||
this.segmentBuilderCoordinator = new SegmentBuilderCoordinator(
|
||||
zkTryLockFactory, syncConfig, clock);
|
||||
|
||||
this.clock = clock;
|
||||
|
||||
this.numSegmentBuilderPartitions = numSegmentBuilderPartitions;
|
||||
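// Job instances are mapped onto segment builder partitions round-robin; several instances can
// share a partition id when more instances than partitions are scheduled.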
this.myPartitionId = instance % numSegmentBuilderPartitions;
|
||||
SearchLongGauge.export("segment_builder_partition_id_" + myPartitionId).set(1);
|
||||
|
||||
CURRENT_FLUSH_VERSION.set(earlybirdIndexConfig.getSchema().getMajorVersionNumber());
|
||||
}
|
||||
|
||||
void run() {
|
||||
LOG.info("Config values: {}", EarlybirdConfig.allValuesAsString());
|
||||
|
||||
// Sleep for some time uninterruptibly before getting started so that if multiple instances are
// running, the HDFS name node and ZooKeeper won't be overwhelmed.
// Say we have 100 instances (the instance arg will have a value from 0 to 99), our
// STARTUP_BATCH_SIZE_ARG is 20 and startUpSleepMins is 3 minutes. Then the first 20 instances
// will not sleep but start immediately, instances 20 - 39 will sleep 3 minutes and then
// start to run, instances 40 - 59 will sleep 6 minutes, instances 60 - 79 will sleep
// 9 minutes before starting, and so forth.
|
||||
long sleepTime = instance / startUpBatchSize * startUpSleepMins;
|
||||
LOG.info("Instance={}, Start up batch size={}", instance, startUpBatchSize);
|
||||
LOG.info("Sleep {} minutes to void HDFS name node and ZooKeeper overwhelmed.", sleepTime);
|
||||
Uninterruptibles.sleepUninterruptibly(sleepTime, TimeUnit.MINUTES);
|
||||
|
||||
// Kinit here.
|
||||
Kerberos.kinit(
|
||||
EarlybirdConfig.getString("kerberos_user", ""),
|
||||
EarlybirdConfig.getString("kerberos_keytab_path", "")
|
||||
);
|
||||
|
||||
long waitBetweenLoopsMs = TimeUnit.MINUTES.toMillis(waitBetweenLoopsMins);
|
||||
if (onlyRunOnce) {
|
||||
LOG.info("This segment builder will run the full rebuild of all the segments");
|
||||
} else {
|
||||
LOG.info("This segment builder will incrementally check for new data and rebuilt "
|
||||
+ "current segments as needed.");
|
||||
LOG.info("The waiting interval between two new data checking is: "
|
||||
+ waitBetweenLoopsMs + " ms.");
|
||||
}
|
||||
|
||||
boolean scrubGenPresent = segmentSyncConfig.getScrubGen().isPresent();
|
||||
LOG.info("Scrub gen present: {}", scrubGenPresent);
|
||||
boolean scrubGenDataFullyBuilt = segmentBuilderCoordinator.isScrubGenDataFullyBuilt(instance);
|
||||
LOG.info("Scrub gen data fully built: {}", scrubGenDataFullyBuilt);
|
||||
|
||||
if (!scrubGenPresent || scrubGenDataFullyBuilt) {
|
||||
LOG.info("Starting segment building loop...");
|
||||
while (!Thread.currentThread().isInterrupted()) {
|
||||
try {
|
||||
indexingLoop();
|
||||
if (onlyRunOnce) {
|
||||
LOG.info("only run once is true, breaking");
|
||||
break;
|
||||
}
|
||||
clock.waitFor(waitBetweenLoopsMs);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.info("Interrupted, quitting segment builder");
|
||||
Thread.currentThread().interrupt();
|
||||
} catch (SegmentInfoConstructionException e) {
|
||||
LOG.error("Error creating new segmentInfo, quitting segment builder: ", e);
|
||||
break;
|
||||
} catch (SegmentUpdaterException e) {
|
||||
FAILED_SEGMENTS.increment();
|
||||
// Before the segment builder quits, sleep for WAIT_BEFORE_QUIT_MINS minutes so that the
|
||||
// FAILED_SEGMENTS stat can be exported.
|
||||
try {
|
||||
clock.waitFor(TimeUnit.MINUTES.toMillis(waitBeforeQuitMins));
|
||||
} catch (InterruptedException ex) {
|
||||
LOG.info("Interrupted, quitting segment builder");
|
||||
Thread.currentThread().interrupt();
|
||||
}
|
||||
LOG.error("SegmentUpdater processing segment error, quitting segment builder: ", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
LOG.info("Cannot build the segments for scrub gen yet.");
|
||||
}
|
||||
}
|
||||
|
||||
// The run loop is factored out here for unit testing.
|
||||
@VisibleForTesting
|
||||
void indexingLoop()
|
||||
throws SegmentInfoConstructionException, InterruptedException, SegmentUpdaterException {
|
||||
// This map contains all the segments to be processed; if a segment is built, it will be removed
|
||||
// from the map.
|
||||
Map<String, SegmentBuilderSegment> buildableSegmentInfoMap;
|
||||
try {
|
||||
buildableSegmentInfoMap = createSegmentInfoMap();
|
||||
printSegmentInfoMap(buildableSegmentInfoMap);
|
||||
} catch (IOException e) {
|
||||
LOG.error("Error creating segmentInfoMap: ", e);
|
||||
return;
|
||||
}
|
||||
|
||||
while (!buildableSegmentInfoMap.isEmpty()) {
|
||||
boolean hasBuiltSegment = processSegments(buildableSegmentInfoMap);
|
||||
|
||||
if (!hasBuiltSegment) {
|
||||
// If we successfully built a segment, there is no need to sleep, since building a segment
// already takes a long time.
|
||||
clock.waitFor(processWaitingInterval);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Actual shutdown.
|
||||
protected void doShutdown() {
|
||||
LOG.info("doShutdown()...");
|
||||
try {
|
||||
earlybirdIndexConfig.getResourceCloser().shutdownExecutor();
|
||||
} catch (InterruptedException e) {
|
||||
LOG.error("Interrupted during shutdown. ", e);
|
||||
}
|
||||
|
||||
LOG.info("Segment builder stopped!");
|
||||
}
|
||||
|
||||
private List<ArchiveTimeSlicer.ArchiveTimeSlice> createTimeSlices() throws IOException {
|
||||
Preconditions.checkState(segmentSyncConfig.getScrubGen().isPresent());
|
||||
Date scrubGen = ScrubGenUtil.parseScrubGenToDate(segmentSyncConfig.getScrubGen().get());
|
||||
|
||||
final DailyStatusBatches dailyStatusBatches =
|
||||
new DailyStatusBatches(zkTryLockFactory, scrubGen);
|
||||
final ArchiveTimeSlicer archiveTimeSlicer = new ArchiveTimeSlicer(
|
||||
EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, earlybirdIndexConfig);
|
||||
|
||||
Stopwatch stopwatch = Stopwatch.createStarted();
|
||||
List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = archiveTimeSlicer.getTimeSlices();
|
||||
|
||||
if (timeSlices == null) {
|
||||
LOG.error("Failed to load timeslice map after {}", stopwatch);
|
||||
return Collections.emptyList();
|
||||
}
|
||||
|
||||
LOG.info("Took {} to get timeslices", stopwatch);
|
||||
return timeSlices;
|
||||
}
|
||||
|
||||
private static class TimeSliceAndHashPartition implements Comparable<TimeSliceAndHashPartition> {
|
||||
public final ArchiveTimeSlicer.ArchiveTimeSlice timeSlice;
|
||||
public final Integer hashPartition;
|
||||
|
||||
public TimeSliceAndHashPartition(
|
||||
ArchiveTimeSlicer.ArchiveTimeSlice timeSlice,
|
||||
Integer hashPartition) {
|
||||
this.timeSlice = timeSlice;
|
||||
this.hashPartition = hashPartition;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int compareTo(TimeSliceAndHashPartition o) {
|
||||
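// Order primarily by hash partition and then by the timeslice's minimum status ID within that
// partition, so all builders see the (timeslice, partition) pairs in the same global order.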
Integer myHashPartition = this.hashPartition;
|
||||
Integer otherHashPartition = o.hashPartition;
|
||||
|
||||
long myTimeSliceId = this.timeSlice.getMinStatusID(myHashPartition);
|
||||
long otherTimeSliceId = o.timeSlice.getMinStatusID(otherHashPartition);
|
||||
|
||||
return ComparisonChain.start()
|
||||
.compare(myHashPartition, otherHashPartition)
|
||||
.compare(myTimeSliceId, otherTimeSliceId)
|
||||
.result();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* For all the timeslices, create the corresponding SegmentInfo and store in a map
|
||||
*/
|
||||
@VisibleForTesting
|
||||
Map<String, SegmentBuilderSegment> createSegmentInfoMap() throws IOException {
|
||||
final List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = createTimeSlices();
|
||||
|
||||
List<TimeSliceAndHashPartition> timeSlicePairs = createPairs(timeSlices);
|
||||
// Export how many segments should be built
|
||||
SEGMENTS_TO_BUILD.set(timeSlicePairs.size());
|
||||
LOG.info("Total number of segments to be built across all segment builders: {}",
|
||||
timeSlicePairs.size());
|
||||
|
||||
List<TimeSliceAndHashPartition> mySegments = getSegmentsForMyPartition(timeSlicePairs);
|
||||
|
||||
Map<String, SegmentBuilderSegment> segmentInfoMap = new HashMap<>();
|
||||
for (TimeSliceAndHashPartition mySegment : mySegments) {
|
||||
ArchiveSegment segment = new ArchiveSegment(mySegment.timeSlice, mySegment.hashPartition,
|
||||
EarlybirdConfig.getMaxSegmentSize());
|
||||
SegmentInfo segmentInfo = new SegmentInfo(segment, segmentFactory, segmentSyncConfig);
|
||||
|
||||
segmentInfoMap.put(segmentInfo.getSegment().getSegmentName(), new NotYetBuiltSegment(
|
||||
segmentInfo, segmentConfig, segmentFactory, 0, segmentSyncConfig));
|
||||
}
|
||||
|
||||
return segmentInfoMap;
|
||||
}
|
||||
|
||||
private List<TimeSliceAndHashPartition> createPairs(
|
||||
List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices) {
|
||||
|
||||
List<TimeSliceAndHashPartition> timeSlicePairs = new ArrayList<>();
|
||||
|
||||
for (ArchiveTimeSlicer.ArchiveTimeSlice slice : timeSlices) {
|
||||
List<Integer> localPartitions = hashPartitions;
|
||||
if (localPartitions == null) {
|
||||
localPartitions = range(slice.getNumHashPartitions());
|
||||
}
|
||||
|
||||
for (Integer partition : localPartitions) {
|
||||
timeSlicePairs.add(new TimeSliceAndHashPartition(slice, partition));
|
||||
}
|
||||
}
|
||||
return timeSlicePairs;
|
||||
}
|
||||
|
||||
private List<TimeSliceAndHashPartition> getSegmentsForMyPartition(
|
||||
List<TimeSliceAndHashPartition> timeSlicePairs) {
|
||||
|
||||
Collections.sort(timeSlicePairs);
|
||||
|
||||
List<TimeSliceAndHashPartition> myTimeSlices = new ArrayList<>();
|
||||
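// Deal the globally sorted (timeslice, hash partition) pairs out round-robin: this builder takes
// every numSegmentBuilderPartitions-th pair, starting at its own partition id.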
for (int i = myPartitionId; i < timeSlicePairs.size(); i += numSegmentBuilderPartitions) {
|
||||
myTimeSlices.add(timeSlicePairs.get(i));
|
||||
}
|
||||
|
||||
LOG.info("Getting segments to be built for partition: {}", myPartitionId);
|
||||
LOG.info("Total number of partitions: {}", numSegmentBuilderPartitions);
|
||||
LOG.info("Number of segments picked: {}", myTimeSlices.size());
|
||||
return myTimeSlices;
|
||||
}
|
||||
|
||||
/**
|
||||
* Print out the segmentInfo Map for debugging
|
||||
*/
|
||||
private void printSegmentInfoMap(Map<String, SegmentBuilderSegment> segmentInfoMap) {
|
||||
LOG.info("SegmentInfoMap: ");
|
||||
for (Map.Entry<String, SegmentBuilderSegment> entry : segmentInfoMap.entrySet()) {
|
||||
LOG.info(entry.getValue().toString());
|
||||
}
|
||||
LOG.info("Total SegmentInfoMap size: " + segmentInfoMap.size() + ". done.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Build indices or refresh state for the segments in the specified segmentInfoMap, which only
|
||||
* contains the segments that need to be built or are being built. When a segment has not been
|
||||
* it is built here. If built successfully, it will be removed from the map; otherwise, its
|
||||
* state will be updated in the map.
|
||||
*
|
||||
* Returns true iff this process has built a segment.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
boolean processSegments(Map<String, SegmentBuilderSegment> segmentInfoMap)
|
||||
throws SegmentInfoConstructionException, SegmentUpdaterException, InterruptedException {
|
||||
|
||||
boolean hasBuiltSegment = false;
|
||||
|
||||
Iterator<Map.Entry<String, SegmentBuilderSegment>> iter =
|
||||
segmentInfoMap.entrySet().iterator();
|
||||
while (iter.hasNext()) {
|
||||
Map.Entry<String, SegmentBuilderSegment> entry = iter.next();
|
||||
SegmentBuilderSegment originalSegment = entry.getValue();
|
||||
|
||||
LOG.info("About to process segment: {}", originalSegment.getSegmentName());
|
||||
long startMillis = System.currentTimeMillis();
|
||||
SegmentBuilderSegment updatedSegment = segmentHandler.processSegment(originalSegment);
|
||||
|
||||
if (updatedSegment.isBuilt()) {
|
||||
iter.remove();
|
||||
hasBuiltSegment = true;
|
||||
|
||||
if (originalSegment instanceof NotYetBuiltSegment) {
|
||||
// Record the total time spent on successfully building a segment, used to compute the
// average segment building time.
|
||||
long timeSpent = System.currentTimeMillis() - startMillis;
|
||||
segmentsBuiltLocally.increment();
|
||||
timeSpentOnSuccessfulBuildSecs.add(timeSpent / 1000);
|
||||
}
|
||||
} else {
|
||||
entry.setValue(updatedSegment);
|
||||
}
|
||||
|
||||
clock.waitFor(getSegmentSleepTime());
|
||||
}
|
||||
|
||||
return hasBuiltSegment;
|
||||
}
|
||||
|
||||
private long getSegmentSleepTime() {
|
||||
// The Hadoop name node can handle only about 200 requests/sec before it gets overloaded.
|
||||
// Updating the state of a node that has been built takes about 1 second. In the worst case
|
||||
// scenario with 800 segment builders, we end up with about 800 requests/sec. Adding a 10
|
||||
// second sleep lowers the worst case to about 80 requests/sec.
|
||||
|
||||
long sleepMillis = TimeUnit.SECONDS.toMillis(waitBetweenSegmentsSecs);
|
||||
|
||||
// Use randomization so that all segment builders don't hit it at the exact same time.
|
||||
|
||||
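// For example, with waitBetweenSegmentsSecs = 10 and SLEEP_RANDOMIZATION_RATIO = 0.2, the sleep
// is drawn uniformly from the inclusive range [8000, 12000] milliseconds.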
int lowerSleepBoundMillis = (int) (sleepMillis * (1.0 - SLEEP_RANDOMIZATION_RATIO));
|
||||
int upperSleepBoundMillis = (int) (sleepMillis * (1.0 + SLEEP_RANDOMIZATION_RATIO));
|
||||
return randRange(lowerSleepBoundMillis, upperSleepBoundMillis);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a pseudo-random number between min and max, inclusive.
|
||||
*/
|
||||
private int randRange(int min, int max) {
|
||||
return random.nextInt((max - min) + 1) + min;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns list of integers 0, 1, 2, ..., count-1.
|
||||
*/
|
||||
private static List<Integer> range(int count) {
|
||||
List<Integer> nums = new ArrayList<>(count);
|
||||
|
||||
for (int i = 0; i < count; i++) {
|
||||
nums.add(i);
|
||||
}
|
||||
|
||||
return nums;
|
||||
}
|
||||
|
||||
private static SegmentSyncConfig getSyncConfig(String scrubGen) {
|
||||
if (scrubGen == null || scrubGen.isEmpty()) {
|
||||
throw new RuntimeException(
|
||||
"Scrub gen expected, but could not get it from the arguments.");
|
||||
}
|
||||
|
||||
LOG.info("Scrub gen: " + scrubGen);
|
||||
return new SegmentSyncConfig(Optional.of(scrubGen));
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,109 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.util.Collection;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
import com.google.inject.Module;
|
||||
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.app.Flaggable;
|
||||
import com.twitter.inject.server.AbstractTwitterServer;
|
||||
import com.twitter.util.Future;
|
||||
import com.twitter.util.Time;
|
||||
|
||||
public class SegmentBuilderApp extends AbstractTwitterServer {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderApp.class);
|
||||
|
||||
public SegmentBuilderApp() {
|
||||
createFlag("onlyRunOnce",
|
||||
true,
|
||||
"whether to stop segment builder after one loop",
|
||||
Flaggable.ofBoolean());
|
||||
|
||||
createFlag("waitBetweenLoopsMins",
|
||||
60,
|
||||
"how many minutes to wait between building loops",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("startup_batch_size",
|
||||
30,
|
||||
"How many instances can start and read timeslice info from HDFS at the same time. "
|
||||
+ "If you don't know what this parameter is, please do not change this parameter.",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("instance",
|
||||
20,
|
||||
"the job instance number",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("segmentZkLockExpirationHours",
|
||||
0,
|
||||
"max hours to hold the zookeeper lock while building segment",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("startupSleepMins",
|
||||
2L,
|
||||
"sleep multiplier of startupSleepMins before job runs",
|
||||
Flaggable.ofLong());
|
||||
|
||||
createFlag("maxRetriesOnFailure",
|
||||
3,
|
||||
"how many times we should try to rebuild a segment when failure happens",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("hash_partitions",
|
||||
ImmutableList.of(),
|
||||
"comma separated hash partition ids, e.g., 0,1,3,4. "
|
||||
+ "If not specified, all the partitions will be built.",
|
||||
Flaggable.ofJavaList(Flaggable.ofInt()));
|
||||
|
||||
createFlag("numSegmentBuilderPartitions",
|
||||
100,
|
||||
"Number of partitions for dividing up all segment builder work",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("waitBetweenSegmentsSecs",
|
||||
10,
|
||||
"Time to sleep between processing segments.",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("waitBeforeQuitMins",
|
||||
2,
|
||||
"How many minutes to sleep before quitting.",
|
||||
Flaggable.ofInt());
|
||||
|
||||
createFlag("scrubGen",
|
||||
"",
|
||||
"Scrub gen for which segment builders should be run.",
|
||||
Flaggable.ofString());
|
||||
}
|
||||
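// A sketch of how this server might be launched (hypothetical flag values and flag syntax;
// the actual binary comes from the segment_builder hadoop_binary target):
//   segment_builder -onlyRunOnce=true -instance=0 -startup_batch_size=30 \
//     -numSegmentBuilderPartitions=100 -scrubGen=<scrub gen> -waitBeforeQuitMins=2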
|
||||
@Override
|
||||
public void start() {
|
||||
SegmentBuilder segmentBuilder = injector().instance(SegmentBuilder.class);
|
||||
closeOnExit((Time time) -> {
|
||||
segmentBuilder.doShutdown();
|
||||
return Future.Unit();
|
||||
});
|
||||
|
||||
LOG.info("Starting run()");
|
||||
segmentBuilder.run();
|
||||
LOG.info("run() complete");
|
||||
|
||||
// Now shutdown
|
||||
shutdown();
|
||||
}
|
||||
|
||||
protected void shutdown() {
|
||||
LOG.info("Calling close() to initiate shutdown");
|
||||
close();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Collection<Module> javaModules() {
|
||||
return ImmutableList.of(new SegmentBuilderModule());
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,200 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Date;
|
||||
import java.util.Optional;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.common.quantity.Amount;
|
||||
import com.twitter.common.quantity.Time;
|
||||
import com.twitter.common.util.Clock;
|
||||
import com.twitter.search.common.database.DatabaseConfig;
|
||||
import com.twitter.search.common.util.zktrylock.TryLock;
|
||||
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
|
||||
import com.twitter.search.earlybird.archive.DailyStatusBatches;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
|
||||
import com.twitter.search.earlybird.util.ScrubGenUtil;
|
||||
import com.twitter.search.earlybird.partition.HdfsUtil;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
import com.twitter.util.Duration;
|
||||
|
||||
/**
|
||||
* Coordinate between segment builders for scrubbing pipeline.
|
||||
* When the segment builders are running, all of them will try to find an HDFS file indicating
* whether the data is ready. If the file does not exist, only one of them will go through the
* files and see if the scrubbing pipeline has generated all data for this scrub gen.
*
* If the instance that got the lock found all the data, it still exits, because otherwise we
* would have a single segment builder instance trying to build all the segments, which is not
* what we want. But once the file exists, the next time all segment builder instances are
* scheduled, they will all find the file and start building segments.
|
||||
*/
|
||||
class SegmentBuilderCoordinator {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderCoordinator.class);
|
||||
|
||||
private static final Amount<Long, Time> ZK_LOCK_EXPIRATION_MIN = Amount.of(5L, Time.MINUTES);
|
||||
private static final String SEGMENT_BUILDER_SYNC_NODE = "scrub_gen_data_sync";
|
||||
private static final String SEGMENT_BUILDER_SYNC_ZK_PATH =
|
||||
EarlybirdProperty.ZK_APP_ROOT.get() + "/segment_builder_sync";
|
||||
private static final String DATA_FULLY_BUILT_FILE = "_data_fully_built";
|
||||
static final int FIRST_INSTANCE = 0;
|
||||
|
||||
private static final long NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS =
|
||||
Duration.fromHours(1).inMillis();
|
||||
|
||||
private final ZooKeeperTryLockFactory zkTryLockFactory;
|
||||
private final SegmentSyncConfig syncConfig;
|
||||
private final Optional<Date> scrubGenDayOpt;
|
||||
private final Optional<String> scrubGenOpt;
|
||||
private final Clock clock;
|
||||
|
||||
SegmentBuilderCoordinator(
|
||||
ZooKeeperTryLockFactory zkTryLockFactory, SegmentSyncConfig syncConfig, Clock clock) {
|
||||
this.zkTryLockFactory = zkTryLockFactory;
|
||||
this.syncConfig = syncConfig;
|
||||
this.scrubGenOpt = syncConfig.getScrubGen();
|
||||
this.scrubGenDayOpt = scrubGenOpt.map(ScrubGenUtil::parseScrubGenToDate);
|
||||
this.clock = clock;
|
||||
}
|
||||
|
||||
|
||||
public boolean isScrubGenDataFullyBuilt(int instanceNumber) {
|
||||
// Only segment builders that take a scrub gen should use isPartitioningOutputReady to coordinate
|
||||
Preconditions.checkArgument(scrubGenDayOpt.isPresent());
|
||||
|
||||
final FileSystem hdfs;
|
||||
try {
|
||||
hdfs = HdfsUtil.getHdfsFileSystem();
|
||||
} catch (IOException e) {
|
||||
LOG.error("Could not create HDFS file system.", e);
|
||||
return false;
|
||||
}
|
||||
|
||||
return isScrubGenDataFullyBuilt(
|
||||
instanceNumber,
|
||||
scrubGenDayOpt.get(),
|
||||
NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS,
|
||||
hdfs
|
||||
);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
boolean isScrubGenDataFullyBuilt(
|
||||
int instanceNumber,
|
||||
Date scrubGenDay,
|
||||
long nonFirstInstanceSleepBeforeRetryDuration,
|
||||
FileSystem hdfs) {
|
||||
// Check if the "scrub gen has been fully built" file exists.
|
||||
if (checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// If it doesn't exist, let the first instance see if the scrub gen has been fully built and
// create the file.
|
||||
if (instanceNumber == FIRST_INSTANCE) {
|
||||
// We were missing some data on HDFS for this scrub gen in the previous run,
// but we might've gotten more data in the meantime, so check again.
// Only allow instance 0 to do this, mainly for 2 reasons:
// 1) Since instances are scheduled in batches, it's possible that an instance from a later
// batch finds the fully built file in HDFS and starts processing. We would end up doing the
// work with only a subset of the instances.
// 2) If we sleep before we release the lock, it's hard to estimate how long an instance will
// be scheduled.
// To keep things deterministic, we simplify a bit and only allow instance 0 to check and write
// the "data is fully built" file to HDFS.
|
||||
try {
|
||||
checkIfScrubGenDataIsFullyBuilt(hdfs, scrubGenDay);
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to grab lock and check scrub gen data.", e);
|
||||
}
|
||||
} else {
|
||||
// For all other instances, sleep for a bit to give the first instance time to check whether the
// scrub gen has been fully built and to create the file, then check again.
|
||||
try {
|
||||
LOG.info(
|
||||
"Sleeping for {} ms before re-checking if scrub gen has been fully built file exists",
|
||||
nonFirstInstanceSleepBeforeRetryDuration);
|
||||
clock.waitFor(nonFirstInstanceSleepBeforeRetryDuration);
|
||||
return checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs);
|
||||
} catch (InterruptedException e) {
|
||||
LOG.warn("Interrupted when sleeping before re-checking if scrub gen has been fully built "
|
||||
+ "file exists", e);
|
||||
}
|
||||
}
|
||||
|
||||
// If hasSuccessFileToHdfs returns false, then we should always return false in the end.
// The next run will find the success file for this scrub gen and move forward.
|
||||
return false;
|
||||
}
|
||||
|
||||
private void checkIfScrubGenDataIsFullyBuilt(
|
||||
FileSystem hdfs, Date scrubGenDay) throws IOException {
|
||||
// Build the lock, try to acquire it, and check the data on HDFS
|
||||
TryLock lock = zkTryLockFactory.createTryLock(
|
||||
DatabaseConfig.getLocalHostname(),
|
||||
SEGMENT_BUILDER_SYNC_ZK_PATH,
|
||||
SEGMENT_BUILDER_SYNC_NODE,
|
||||
ZK_LOCK_EXPIRATION_MIN);
|
||||
Preconditions.checkState(scrubGenOpt.isPresent());
|
||||
String scrubGen = scrubGenOpt.get();
|
||||
|
||||
lock.tryWithLock(() -> {
|
||||
LOG.info(String.format(
|
||||
"Obtained ZK lock to check if data for scrub gen %s is ready.", scrubGen));
|
||||
final DailyStatusBatches directory =
|
||||
new DailyStatusBatches(zkTryLockFactory, scrubGenDay);
|
||||
if (directory.isScrubGenDataFullyBuilt(hdfs)
|
||||
&& createScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
|
||||
LOG.info(String.format("All data for scrub gen %s is ready.", scrubGen));
|
||||
} else {
|
||||
LOG.info(String.format("Data for scrub gen %s is not ready yet.", scrubGen));
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
private boolean createScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
|
||||
Path path = getScrubGenDataFullyBuiltFilePath();
|
||||
try {
|
||||
fs.mkdirs(new Path(statusReadyHDFSPath()));
|
||||
if (fs.createNewFile(path)) {
|
||||
LOG.info("Successfully created file " + path + " on HDFS.");
|
||||
return true;
|
||||
} else {
|
||||
LOG.warn("Failed to create file " + path + " on HDFS.");
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to create file on HDFS " + path.toString(), e);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private boolean checkHaveScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
|
||||
Path path = getScrubGenDataFullyBuiltFilePath();
|
||||
try {
|
||||
boolean ret = fs.exists(path);
|
||||
LOG.info("Checking if file exists showing scrubgen is fully built.");
|
||||
LOG.info("Path checked: {}, Exist check: {}", path, ret);
|
||||
return ret;
|
||||
} catch (IOException e) {
|
||||
LOG.error("Failed to check file on HDFS " + path.toString(), e);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Path getScrubGenDataFullyBuiltFilePath() {
|
||||
return new Path(statusReadyHDFSPath(), DATA_FULLY_BUILT_FILE);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
String statusReadyHDFSPath() {
|
||||
return syncConfig.getHdfsSegmentSyncRootDir() + "/segment_builder_sync";
|
||||
}
|
||||
}
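
A minimal usage sketch (not part of the original file) showing how a segment builder instance might gate its work on this coordinator; the wrapper class, the retry loop, and the one-minute back-off are assumptions for illustration, only isScrubGenDataFullyBuilt(int) comes from the class above.

// Illustrative only: a hypothetical caller that gates segment building on the coordinator.
final class ScrubGenGateExample {
  private ScrubGenGateExample() { }

  static boolean waitUntilScrubGenReady(
      SegmentBuilderCoordinator coordinator, int instanceNumber, int maxAttempts)
      throws InterruptedException {
    for (int attempt = 0; attempt < maxAttempts; attempt++) {
      // Instance 0 may check HDFS under a ZK lock and write the "fully built" marker file;
      // all other instances sleep inside the call and then re-check the marker.
      if (coordinator.isScrubGenDataFullyBuilt(instanceNumber)) {
        return true;
      }
      Thread.sleep(60_000L);  // hypothetical back-off between scheduling attempts
    }
    return false;
  }
}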
|
Binary file not shown.
@ -1,10 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
public final class SegmentBuilderMain {
|
||||
|
||||
private SegmentBuilderMain() { }
|
||||
|
||||
public static void main(String[] args) {
|
||||
new SegmentBuilderApp().main(args);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,58 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.io.File;
|
||||
|
||||
import com.google.inject.Provides;
|
||||
import com.google.inject.Singleton;
|
||||
|
||||
import com.twitter.app.Flaggable;
|
||||
import com.twitter.decider.Decider;
|
||||
import com.twitter.inject.TwitterModule;
|
||||
import com.twitter.inject.annotations.Flag;
|
||||
import com.twitter.search.common.config.LoggerConfiguration;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
import com.twitter.search.earlybird.util.EarlybirdDecider;
|
||||
|
||||
public class SegmentBuilderModule extends TwitterModule {
|
||||
|
||||
private static final String CONFIG_FILE_FLAG_NAME = "config_file";
|
||||
private static final String SEGMENT_LOG_DIR_FLAG_NAME = "segment_log_dir";
|
||||
|
||||
public SegmentBuilderModule() {
|
||||
createFlag(CONFIG_FILE_FLAG_NAME,
|
||||
new File("earlybird-search.yml"),
|
||||
"specify config file",
|
||||
Flaggable.ofFile());
|
||||
|
||||
createFlag(SEGMENT_LOG_DIR_FLAG_NAME,
|
||||
"",
|
||||
"override log dir from config file",
|
||||
Flaggable.ofString());
|
||||
}
|
||||
|
||||
/**
|
||||
* Initializes the Earlybird config and the log configuration, and returns an EarlybirdDecider
|
||||
* object, which will be injected into the SegmentBuilder instance.
|
||||
*
|
||||
* @param configFile The config file to use to initialize EarlybirdConfig
|
||||
* @param segmentLogDir If not empty, used to override the log directory from the config file
|
||||
* @return An initialized EarlybirdDecider
|
||||
*/
|
||||
@Provides
|
||||
@Singleton
|
||||
public Decider provideDecider(@Flag(CONFIG_FILE_FLAG_NAME) File configFile,
|
||||
@Flag(SEGMENT_LOG_DIR_FLAG_NAME) String segmentLogDir) {
|
||||
// By default Guice will build singletons eagerly:
|
||||
// https://github.com/google/guice/wiki/Scopes#eager-singletons
|
||||
// So in order to ensure that the EarlybirdConfig and LoggerConfiguration initializations occur
|
||||
// before the EarlybirdDecider initialization, we place them here.
|
||||
EarlybirdConfig.init(configFile.getName());
|
||||
if (!segmentLogDir.isEmpty()) {
|
||||
EarlybirdConfig.overrideLogDir(segmentLogDir);
|
||||
}
|
||||
new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), EarlybirdConfig.getLogDir())
|
||||
.configure();
|
||||
|
||||
return EarlybirdDecider.initialize();
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,100 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import com.twitter.common.quantity.Amount;
|
||||
import com.twitter.common.quantity.Time;
|
||||
import com.twitter.search.common.database.DatabaseConfig;
|
||||
import com.twitter.search.common.util.zktrylock.TryLock;
|
||||
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
|
||||
import com.twitter.search.earlybird.archive.ArchiveSegment;
|
||||
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
|
||||
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
|
||||
import com.twitter.search.earlybird.partition.SegmentInfo;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
|
||||
public abstract class SegmentBuilderSegment {
|
||||
protected final SegmentInfo segmentInfo;
|
||||
protected final SegmentConfig segmentConfig;
|
||||
protected final EarlybirdSegmentFactory earlybirdSegmentFactory;
|
||||
protected final int alreadyRetriedCount;
|
||||
protected final SegmentSyncConfig sync;
|
||||
|
||||
public SegmentBuilderSegment(SegmentInfo segmentInfo,
|
||||
SegmentConfig segmentConfig,
|
||||
EarlybirdSegmentFactory earlybirdSegmentFactory,
|
||||
int alreadyRetriedCount,
|
||||
SegmentSyncConfig segmentSyncConfig) {
|
||||
this.segmentConfig = segmentConfig;
|
||||
this.earlybirdSegmentFactory = earlybirdSegmentFactory;
|
||||
this.alreadyRetriedCount = alreadyRetriedCount;
|
||||
this.sync = segmentSyncConfig;
|
||||
Preconditions.checkState(segmentInfo.getSegment() instanceof ArchiveSegment);
|
||||
this.segmentInfo = Preconditions.checkNotNull(segmentInfo);
|
||||
}
|
||||
|
||||
public SegmentInfo getSegmentInfo() {
|
||||
return segmentInfo;
|
||||
}
|
||||
|
||||
public String getSegmentName() {
|
||||
return segmentInfo.getSegmentName();
|
||||
}
|
||||
|
||||
public int getAlreadyRetriedCount() {
|
||||
return alreadyRetriedCount;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle the segment, potentially transitioning to a new state.
|
||||
* @return The state after handling.
|
||||
*/
|
||||
public abstract SegmentBuilderSegment handle()
|
||||
throws SegmentInfoConstructionException, SegmentUpdaterException;
|
||||
|
||||
public boolean isBuilt() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "SegmentBuilderSegment{"
|
||||
+ "segmentInfo=" + segmentInfo
|
||||
+ ", state=" + this.getClass().getSimpleName()
|
||||
+ ", alreadyRetriedCount=" + alreadyRetriedCount + '}';
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a SegmentInfo, create a new one with the same time slice and partitionID but clean
|
||||
* internal state.
|
||||
*/
|
||||
protected SegmentInfo createNewSegmentInfo(SegmentInfo oldSegmentInfo)
|
||||
throws SegmentInfoConstructionException {
|
||||
Preconditions.checkArgument(oldSegmentInfo.getSegment() instanceof ArchiveSegment);
|
||||
ArchiveSegment archiveSegment = (ArchiveSegment) oldSegmentInfo.getSegment();
|
||||
|
||||
try {
|
||||
ArchiveSegment segment = new ArchiveSegment(archiveSegment.getArchiveTimeSlice(),
|
||||
archiveSegment.getHashPartitionID(), EarlybirdConfig.getMaxSegmentSize());
|
||||
|
||||
return new SegmentInfo(segment, earlybirdSegmentFactory, sync);
|
||||
} catch (IOException e) {
|
||||
throw new SegmentInfoConstructionException("Error creating new segments", e);
|
||||
}
|
||||
}
|
||||
|
||||
protected TryLock getZooKeeperTryLock() {
|
||||
ZooKeeperTryLockFactory tryLockFactory = segmentConfig.getTryLockFactory();
|
||||
String zkRootPath = sync.getZooKeeperSyncFullPath();
|
||||
String nodeName = segmentInfo.getZkNodeName();
|
||||
Amount<Long, Time> expirationTime = segmentConfig.getSegmentZKLockExpirationTime();
|
||||
|
||||
return tryLockFactory.createTryLock(
|
||||
DatabaseConfig.getLocalHostname(),
|
||||
zkRootPath,
|
||||
nodeName,
|
||||
expirationTime);
|
||||
}
|
||||
}
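
A minimal sketch (not from the original source) of how the handle()/isBuilt() state machine above might be driven by a caller; the wrapper class, the loop, and the retry cutoff are assumptions.

// Illustrative only: driving the handle()/isBuilt() state machine from a caller. This is not the
// original SegmentBuilder loop.
final class SegmentStateMachineExample {
  private SegmentStateMachineExample() { }

  static void buildOneSegment(SegmentBuilderSegment initialState, int maxRetries)
      throws SegmentInfoConstructionException, SegmentUpdaterException {
    SegmentBuilderSegment state = initialState;
    while (!state.isBuilt()) {
      if (state.getAlreadyRetriedCount() > maxRetries) {
        throw new SegmentUpdaterException(
            "Giving up on segment " + state.getSegmentName() + " after too many retries.");
      }
      // Each call may return the same state (e.g. someone else holds the ZK lock) or a new state
      // (e.g. NotYetBuiltSegment -> BuiltAndFinalizedSegment).
      state = state.handle();
    }
  }
}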
|
Binary file not shown.
@ -1,41 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import com.twitter.common.quantity.Amount;
|
||||
import com.twitter.common.quantity.Time;
|
||||
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
|
||||
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
|
||||
|
||||
public class SegmentConfig {
|
||||
private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
|
||||
private final Amount<Long, Time> segmentZKLockExpirationTime;
|
||||
private final int maxRetriesOnFailure;
|
||||
private final ZooKeeperTryLockFactory tryLockFactory;
|
||||
|
||||
public SegmentConfig(
|
||||
ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig,
|
||||
Amount<Long, Time> segmentZKLockExpirationTime,
|
||||
int maxRetriesOnFailure,
|
||||
ZooKeeperTryLockFactory tryLockFactory) {
|
||||
|
||||
this.earlybirdIndexConfig = earlybirdIndexConfig;
|
||||
this.segmentZKLockExpirationTime = segmentZKLockExpirationTime;
|
||||
this.maxRetriesOnFailure = maxRetriesOnFailure;
|
||||
this.tryLockFactory = tryLockFactory;
|
||||
}
|
||||
|
||||
public ArchiveOnDiskEarlybirdIndexConfig getEarlybirdIndexConfig() {
|
||||
return earlybirdIndexConfig;
|
||||
}
|
||||
|
||||
public Amount<Long, Time> getSegmentZKLockExpirationTime() {
|
||||
return segmentZKLockExpirationTime;
|
||||
}
|
||||
|
||||
public int getMaxRetriesOnFailure() {
|
||||
return maxRetriesOnFailure;
|
||||
}
|
||||
|
||||
public ZooKeeperTryLockFactory getTryLockFactory() {
|
||||
return tryLockFactory;
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,12 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
/**
|
||||
 * Used if exceptions are thrown while creating a new SegmentInfo during the indexing loop.
|
||||
*/
|
||||
class SegmentInfoConstructionException extends Exception {
|
||||
SegmentInfoConstructionException(String msg, IOException e) {
|
||||
super(msg, e);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,13 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
/**
|
||||
 * Used when SegmentUpdater fails processing segments.
|
||||
*/
|
||||
@VisibleForTesting
|
||||
class SegmentUpdaterException extends Exception {
|
||||
SegmentUpdaterException(String msg) {
|
||||
super(msg);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,69 +0,0 @@
|
||||
package com.twitter.search.earlybird.archive.segmentbuilder;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import com.twitter.common.base.Command;
|
||||
import com.twitter.search.common.util.zktrylock.TryLock;
|
||||
import com.twitter.search.earlybird.archive.ArchiveHDFSUtils;
|
||||
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
|
||||
import com.twitter.search.earlybird.partition.SegmentInfo;
|
||||
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
|
||||
|
||||
public class SomeoneElseIsBuildingSegment extends SegmentBuilderSegment {
|
||||
public SomeoneElseIsBuildingSegment(
|
||||
SegmentInfo segmentInfo,
|
||||
SegmentConfig segmentConfig,
|
||||
EarlybirdSegmentFactory earlybirdSegmentFactory,
|
||||
int alreadyRetriedCount,
|
||||
SegmentSyncConfig sync) {
|
||||
|
||||
super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
|
||||
}
|
||||
|
||||
/**
|
||||
 * This method refreshes the local state of a segment.
 * 1. Try to grab the ZK lock.
 * 2a. If we got the lock, the segment is not being built; mark the segment as NOT_BUILT_YET.
 * 2b. Otherwise, the segment is being built; keep the SOMEONE_ELSE_IS_BUILDING state.
|
||||
*/
|
||||
@Override
|
||||
public SegmentBuilderSegment handle()
|
||||
throws SegmentInfoConstructionException, SegmentUpdaterException {
|
||||
|
||||
TryLock lock = getZooKeeperTryLock();
|
||||
|
||||
final AtomicBoolean alreadyBuilt = new AtomicBoolean(false);
|
||||
boolean gotLock = lock.tryWithLock((Command) () -> {
|
||||
// The segment might have already been built by another instance.
|
||||
if (segmentExistsOnHdfs()) {
|
||||
alreadyBuilt.set(true);
|
||||
}
|
||||
});
|
||||
|
||||
if (!gotLock) {
|
||||
return this;
|
||||
}
|
||||
|
||||
if (alreadyBuilt.get()) {
|
||||
return new BuiltAndFinalizedSegment(
|
||||
segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
|
||||
} else {
|
||||
// When a segment fails to build, its state might not be clean, so it is necessary to
// create a new SegmentInfo with a clean state.
|
||||
SegmentInfo newSegmentInfo = createNewSegmentInfo(segmentInfo);
|
||||
return new NotYetBuiltSegment(
|
||||
newSegmentInfo,
|
||||
segmentConfig,
|
||||
earlybirdSegmentFactory,
|
||||
alreadyRetriedCount + 1,
|
||||
sync);
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
boolean segmentExistsOnHdfs() {
|
||||
return ArchiveHDFSUtils.hasSegmentIndicesOnHDFS(sync, segmentInfo);
|
||||
}
|
||||
}
|
@ -1,37 +0,0 @@
|
||||
java_library(
|
||||
sources = ["*.java"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/google/guava",
|
||||
"3rdparty/jvm/com/twitter/elephantbird:core",
|
||||
"3rdparty/jvm/commons-codec",
|
||||
"3rdparty/jvm/commons-httpclient",
|
||||
"3rdparty/jvm/geo/google:geoGoogle",
|
||||
"3rdparty/jvm/org/apache/lucene:lucene-core",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
|
||||
"decider/src/main/scala",
|
||||
"finagle/finagle-core/src/main",
|
||||
"finagle/finagle-thrift/src/main/java",
|
||||
"finagle/finagle-thrift/src/main/scala",
|
||||
"scrooge/scrooge-core/src/main/scala",
|
||||
"src/java/com/twitter/common/base",
|
||||
"src/java/com/twitter/common/optional",
|
||||
"src/java/com/twitter/search/common/decider",
|
||||
"src/java/com/twitter/search/common/logging",
|
||||
"src/java/com/twitter/search/common/metrics",
|
||||
"src/java/com/twitter/search/common/util:finagleutil",
|
||||
"src/java/com/twitter/search/common/util/earlybird",
|
||||
"src/java/com/twitter/search/common/util/thrift:thrift-utils",
|
||||
"src/java/com/twitter/search/queryparser/query:core-query-nodes",
|
||||
"src/thrift/com/twitter/context:twitter-context-scala",
|
||||
"src/thrift/com/twitter/search:earlybird-java",
|
||||
"src/thrift/com/twitter/search/common:caching-java",
|
||||
"src/thrift/com/twitter/search/common:constants-java",
|
||||
"src/thrift/com/twitter/search/common:query-java",
|
||||
"strato/src/main/scala/com/twitter/strato/opcontext",
|
||||
"twitter-context/src/main/scala",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
BIN
src/java/com/twitter/search/earlybird/common/BUILD.docx
Normal file
Binary file not shown.
@ -1,120 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import org.apache.commons.codec.binary.Base64;
|
||||
import org.apache.thrift.TException;
|
||||
import org.apache.thrift.TSerializer;
|
||||
import org.apache.thrift.protocol.TBinaryProtocol;
|
||||
import org.slf4j.Logger;
|
||||
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
|
||||
|
||||
public final class Base64RequestResponseForLogging {
|
||||
private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
|
||||
Base64RequestResponseForLogging.class);
|
||||
private static final Logger FAILED_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
|
||||
Base64RequestResponseForLogging.class.getName() + ".FailedRequests");
|
||||
private static final Logger RANDOM_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
|
||||
Base64RequestResponseForLogging.class.getName() + ".RandomRequests");
|
||||
private static final Logger SLOW_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
|
||||
Base64RequestResponseForLogging.class.getName() + ".SlowRequests");
|
||||
|
||||
private enum LogType {
|
||||
FAILED,
|
||||
RANDOM,
|
||||
SLOW,
|
||||
};
|
||||
|
||||
private final LogType logtype;
|
||||
private final String logLine;
|
||||
private final EarlybirdRequest request;
|
||||
private final EarlybirdResponse response;
|
||||
private final Base64 base64 = new Base64();
|
||||
|
||||
// TSerializer is not threadsafe, so create a new one for each request
|
||||
private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());
|
||||
|
||||
private Base64RequestResponseForLogging(
|
||||
LogType logType, String logLine, EarlybirdRequest request, EarlybirdResponse response) {
|
||||
this.logtype = logType;
|
||||
this.logLine = logLine;
|
||||
this.request = request;
|
||||
this.response = response;
|
||||
}
|
||||
|
||||
public static Base64RequestResponseForLogging randomRequest(
|
||||
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
|
||||
return new Base64RequestResponseForLogging(LogType.RANDOM, logLine, request, response);
|
||||
}
|
||||
|
||||
public static Base64RequestResponseForLogging failedRequest(
|
||||
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
|
||||
return new Base64RequestResponseForLogging(LogType.FAILED, logLine, request, response);
|
||||
}
|
||||
|
||||
public static Base64RequestResponseForLogging slowRequest(
|
||||
String logLine, EarlybirdRequest request, EarlybirdResponse response) {
|
||||
return new Base64RequestResponseForLogging(LogType.SLOW, logLine, request, response);
|
||||
}
|
||||
|
||||
private String asBase64(EarlybirdRequest clearedRequest) {
|
||||
try {
|
||||
// The purpose of this log is to make it easy to re-issue requests in formz to reproduce
// issues. If queries are re-issued as-is, they will be treated as late-arriving queries and
// dropped because clientRequestTimeMs is set to the original query time. For ease of use,
// we clear clientRequestTimeMs and log it out separately for the rare case it is needed.
|
||||
clearedRequest.unsetClientRequestTimeMs();
|
||||
return base64.encodeToString(serializer.serialize(clearedRequest));
|
||||
} catch (TException e) {
|
||||
GENERAL_LOG.error("Failed to serialize request for logging.", e);
|
||||
return "failed_to_serialize";
|
||||
}
|
||||
}
|
||||
|
||||
private String asBase64(EarlybirdResponse earlybirdResponse) {
|
||||
try {
|
||||
return base64.encodeToString(serializer.serialize(earlybirdResponse));
|
||||
} catch (TException e) {
|
||||
GENERAL_LOG.error("Failed to serialize response for logging.", e);
|
||||
return "failed_to_serialize";
|
||||
}
|
||||
}
|
||||
|
||||
private String getFormattedMessage() {
|
||||
String base64Request = asBase64(
|
||||
EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request));
|
||||
String base64Response = asBase64(response);
|
||||
return logLine + ", clientRequestTimeMs: " + request.getClientRequestTimeMs()
|
||||
+ ", " + base64Request + ", " + base64Response;
|
||||
}
|
||||
|
||||
/**
|
||||
* Logs the Base64-encoded request and response to the success or failure log.
|
||||
*/
|
||||
public void log() {
|
||||
// Do the serializing/concatting this way so it happens on the background thread for
|
||||
// async logging
|
||||
Object logObject = new Object() {
|
||||
@Override
|
||||
public String toString() {
|
||||
return getFormattedMessage();
|
||||
}
|
||||
};
|
||||
|
||||
switch (logtype) {
|
||||
case FAILED:
|
||||
FAILED_REQUEST_LOG.info("{}", logObject);
|
||||
break;
|
||||
case RANDOM:
|
||||
RANDOM_REQUEST_LOG.info("{}", logObject);
|
||||
break;
|
||||
case SLOW:
|
||||
SLOW_REQUEST_LOG.info("{}", logObject);
|
||||
break;
|
||||
default:
|
||||
// Not logging anything for other log types.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
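
A minimal sketch (not part of the original file) showing how a logged base64 payload could be decoded back into an EarlybirdRequest for re-issuing; the class and method names, the source of the base64 string, and the setClientRequestTimeMs call are assumptions, while TDeserializer simply mirrors the TSerializer used above.

// Illustrative only: decoding a logged base64 payload back into an EarlybirdRequest.
final class LoggedRequestDecoderExample {
  private LoggedRequestDecoderExample() { }

  static EarlybirdRequest decodeLoggedRequest(String base64Request) throws TException {
    TDeserializer deserializer = new TDeserializer(new TBinaryProtocol.Factory());
    EarlybirdRequest request = new EarlybirdRequest();
    deserializer.deserialize(request, Base64.decodeBase64(base64Request));
    // clientRequestTimeMs was cleared before logging; restore it to "now" so the re-issued
    // request is not dropped as late-arriving (assumes the thrift-generated setter exists).
    request.setClientRequestTimeMs(System.currentTimeMillis());
    return request;
  }
}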
|
Binary file not shown.
@ -1,55 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.search.common.metrics.SearchCustomGauge;
|
||||
|
||||
/**
|
||||
* A monitor which enforces the condition that a single thread's work is caught up, and allows
|
||||
* other threads to wait to be notified when the work is complete. An AtomicBoolean ensures the
|
||||
* current status is visible to all threads.
|
||||
*/
|
||||
public class CaughtUpMonitor {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(CaughtUpMonitor.class);
|
||||
|
||||
protected final AtomicBoolean isCaughtUp = new AtomicBoolean(false);
|
||||
|
||||
public CaughtUpMonitor(String statPrefix) {
|
||||
SearchCustomGauge.export(statPrefix + "_is_caught_up", () -> isCaughtUp() ? 1 : 0);
|
||||
}
|
||||
|
||||
public boolean isCaughtUp() {
|
||||
return isCaughtUp.get();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set caught up state, and notify waiting threads if caught up.
|
||||
*/
|
||||
public synchronized void setAndNotify(boolean caughtUp) {
|
||||
isCaughtUp.set(caughtUp);
|
||||
if (caughtUp) {
|
||||
// Readers are caught up, notify waiting threads
|
||||
notifyAll();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait using Object.wait() until caught up or until thread is interrupted.
|
||||
*/
|
||||
public synchronized void resetAndWaitUntilCaughtUp() {
|
||||
LOG.info("Waiting to catch up.");
|
||||
// Explicitly set isCaughtUp to false before waiting
|
||||
isCaughtUp.set(false);
|
||||
try {
|
||||
while (!isCaughtUp()) {
|
||||
wait();
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
LOG.error("{} was interrupted while waiting to catch up", Thread.currentThread());
|
||||
}
|
||||
LOG.info("Caught up.");
|
||||
}
|
||||
}
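
A minimal usage sketch (not from the original source) of the producer/consumer pattern this monitor supports; the thread layout, the batch loop, and the sleep interval are assumptions, only the monitor's API comes from the class above.

// Illustrative only: one waiter thread blocks until a producer reports that it has caught up.
public final class CaughtUpMonitorExample {
  private CaughtUpMonitorExample() { }

  public static void main(String[] args) throws InterruptedException {
    CaughtUpMonitor monitor = new CaughtUpMonitor("example_indexing");

    Thread waiter = new Thread(() -> {
      // Reset the flag and block until the producer reports that it has caught up again.
      monitor.resetAndWaitUntilCaughtUp();
      System.out.println("Caught up, proceeding.");
    });
    waiter.start();

    // Producer side: report "caught up" after every indexing pass. Notifying repeatedly means a
    // waiter that resets the flag later will still be woken up on the next pass.
    while (waiter.isAlive()) {
      // ... index one batch of pending events here ...
      monitor.setAndNotify(true);
      Thread.sleep(100L);
    }
    waiter.join();
  }
}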
|
BIN
src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx
Normal file
Binary file not shown.
@ -1,85 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import java.util.Optional;
|
||||
|
||||
import com.twitter.common.optional.Optionals;
|
||||
import com.twitter.search.common.util.FinagleUtil;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
import com.twitter.strato.opcontext.Attribution;
|
||||
import com.twitter.strato.opcontext.HttpEndpoint;
|
||||
|
||||
public final class ClientIdUtil {
|
||||
// Blenders should always set the EarlybirdRequest.clientId field. It should be set to the Finagle
|
||||
// client ID of the client that caused the blender to send this request to the roots. If the
|
||||
// Finagle ID of the blender's client cannot be determined, it will be set to "unknown" (see
|
||||
// com.twitter.search.common.util.FinagleUtil.UNKNOWN_CLIENT_NAME). However, other services that
|
||||
// send requests to roots might not set EarlybirdRequest.clientId.
|
||||
//
|
||||
// So an "unset" clientId means: EarlybirdRequest.clientId was null.
|
||||
// An "unknown" clientId means: the client that sent us the request
|
||||
// tried setting EarlybirdRequest.clientId, but couldn't figure out a good value for it.
|
||||
public static final String UNSET_CLIENT_ID = "unset";
|
||||
|
||||
private static final String CLIENT_ID_FOR_UNKNOWN_CLIENTS = "unknown_client_id";
|
||||
|
||||
private static final String CLIENT_ID_PREFIX = "client_id_";
|
||||
|
||||
private static final String FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN =
|
||||
"finagle_id_%s_and_client_id_%s";
|
||||
|
||||
private static final String CLIENT_ID_AND_REQUEST_TYPE = "client_id_%s_and_type_%s";
|
||||
|
||||
private ClientIdUtil() {
|
||||
}
|
||||
|
||||
/** Returns the ID of the client that initiated this request or UNSET_CLIENT_ID if not set. */
|
||||
public static String getClientIdFromRequest(EarlybirdRequest request) {
|
||||
return Optional
|
||||
.ofNullable(request.getClientId())
|
||||
.map(String::toLowerCase)
|
||||
.orElse(UNSET_CLIENT_ID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the Strato http endpoint attribution as an Optional.
|
||||
*/
|
||||
public static Optional<String> getClientIdFromHttpEndpointAttribution() {
|
||||
return Optionals
|
||||
.optional(Attribution.httpEndpoint())
|
||||
.map(HttpEndpoint::name)
|
||||
.map(String::toLowerCase);
|
||||
}
|
||||
|
||||
/** Formats the given clientId into a string that can be used for stats. */
|
||||
public static String formatClientId(String clientId) {
|
||||
return CLIENT_ID_PREFIX + clientId;
|
||||
}
|
||||
|
||||
/**
|
||||
* Formats the given Finagle clientId and the given clientId into a single string that can be used
|
||||
* for stats, or other purposes where the two IDs need to be combined.
|
||||
*/
|
||||
public static String formatFinagleClientIdAndClientId(String finagleClientId, String clientId) {
|
||||
return String.format(FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN, finagleClientId, clientId);
|
||||
}
|
||||
|
||||
/**
|
||||
* Formats the given clientId and requestType into a single string that can be used
|
||||
* for stats or other purposes.
|
||||
*/
|
||||
public static String formatClientIdAndRequestType(
|
||||
String clientId, String requestType) {
|
||||
return String.format(CLIENT_ID_AND_REQUEST_TYPE, clientId, requestType);
|
||||
}
|
||||
|
||||
/**
|
||||
 * Formats the quota client ID.
|
||||
*/
|
||||
public static String getQuotaClientId(String clientId) {
|
||||
if (FinagleUtil.UNKNOWN_CLIENT_NAME.equals(clientId) || UNSET_CLIENT_ID.equals(clientId)) {
|
||||
return CLIENT_ID_FOR_UNKNOWN_CLIENTS;
|
||||
}
|
||||
|
||||
return clientId;
|
||||
}
|
||||
}
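
A minimal sketch (not part of the original file) of building a per-client stat key from an incoming request using the helpers above; the wrapper class, the method name, and the example request type are assumptions.

// Illustrative only: composing a per-client stat key for an incoming request.
final class ClientIdStatsExample {
  private ClientIdStatsExample() { }

  static String statKeyFor(EarlybirdRequest request, String requestType) {
    // Falls back to "unset" when the client did not populate EarlybirdRequest.clientId.
    String clientId = ClientIdUtil.getClientIdFromRequest(request);
    // Produces e.g. "client_id_<client>_and_type_<requestType>".
    return ClientIdUtil.formatClientIdAndRequestType(clientId, requestType);
  }
}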
|
Binary file not shown.
@ -1,365 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import java.util.EnumMap;
|
||||
import java.util.Map;
|
||||
|
||||
import scala.Option;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.collect.Maps;
|
||||
|
||||
import com.twitter.context.TwitterContext;
|
||||
import com.twitter.context.thriftscala.Viewer;
|
||||
import com.twitter.decider.Decider;
|
||||
import com.twitter.finagle.thrift.ClientId;
|
||||
import com.twitter.finagle.thrift.ClientId$;
|
||||
import com.twitter.search.TwitterContextPermit;
|
||||
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
|
||||
import com.twitter.search.common.decider.DeciderUtil;
|
||||
import com.twitter.search.common.logging.RPCLogger;
|
||||
import com.twitter.search.common.metrics.FailureRatioCounter;
|
||||
import com.twitter.search.common.metrics.Timer;
|
||||
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
|
||||
import com.twitter.search.common.util.earlybird.ThriftSearchResultUtil;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
|
||||
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
|
||||
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
|
||||
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;
|
||||
|
||||
import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil
|
||||
.responseConsideredFailed;
|
||||
|
||||
|
||||
public class EarlybirdRequestLogger extends RPCLogger {
|
||||
protected enum ExtraFields {
|
||||
QUERY_MAX_HITS_TO_PROCESS,
|
||||
COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
|
||||
RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
|
||||
NUM_HITS_PROCESSED,
|
||||
QUERY_COST,
|
||||
CPU_TOTAL,
|
||||
QUERY_SOURCE,
|
||||
CLIENT_ID,
|
||||
FINAGLE_CLIENT_ID
|
||||
}
|
||||
|
||||
protected enum ShardOnlyExtraFields {
|
||||
NUM_SEARCHED_SEGMENTS,
|
||||
SCORING_TIME_NANOS
|
||||
}
|
||||
|
||||
protected enum RootOnlyExtraFields {
|
||||
CACHING_ALLOWED,
|
||||
DEBUG_MODE,
|
||||
CACHE_HIT,
|
||||
USER_AGENT,
|
||||
// See JIRA APPSEC-2303 for IP addresses logging
|
||||
}
|
||||
|
||||
private static final String LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY =
|
||||
"log_full_request_details_on_error";
|
||||
private static final String LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
|
||||
"log_full_request_details_random_fraction";
|
||||
private static final String LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
|
||||
"log_full_slow_request_details_random_fraction";
|
||||
private static final String SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY =
|
||||
"slow_request_latency_threshold_ms";
|
||||
|
||||
private final Decider decider;
|
||||
private final boolean enableLogUnknownClientRequests;
|
||||
|
||||
private static final Map<ThriftQuerySource, FailureRatioCounter>
|
||||
FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE = preBuildFailureRatioCounters();
|
||||
private static final FailureRatioCounter NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER =
|
||||
new FailureRatioCounter("earlybird_logger", "query_source", "not_set");
|
||||
|
||||
static EarlybirdRequestLogger buildForRoot(
|
||||
String loggerName, int latencyWarnThreshold, Decider decider) {
|
||||
|
||||
return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
|
||||
decider, true, RPCLogger.Fields.values(), ExtraFields.values(),
|
||||
RootOnlyExtraFields.values());
|
||||
}
|
||||
|
||||
static EarlybirdRequestLogger buildForShard(
|
||||
String loggerName, int latencyWarnThreshold, Decider decider) {
|
||||
|
||||
return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
|
||||
decider, false, RPCLogger.Fields.values(), ExtraFields.values(),
|
||||
ShardOnlyExtraFields.values());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider) {
|
||||
this(loggerName, latencyWarnThreshold, decider, false, RPCLogger.Fields.values(),
|
||||
ExtraFields.values(), RootOnlyExtraFields.values(), ShardOnlyExtraFields.values());
|
||||
}
|
||||
|
||||
private EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider,
|
||||
boolean enableLogUnknownClientRequests, Enum[]... fieldEnums) {
|
||||
super(loggerName, fieldEnums);
|
||||
this.decider = decider;
|
||||
this.enableLogUnknownClientRequests = enableLogUnknownClientRequests;
|
||||
setLatencyWarnThreshold(latencyWarnThreshold);
|
||||
}
|
||||
|
||||
/**
|
||||
* Logs the given earlybird request and response.
|
||||
*
|
||||
* @param request The earlybird request.
|
||||
* @param response The earlybird response.
|
||||
* @param timer The time it took to process this request.
|
||||
*/
|
||||
public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
|
||||
try {
|
||||
LogEntry entry = newLogEntry();
|
||||
|
||||
setRequestLogEntries(entry, request);
|
||||
setResponseLogEntries(entry, response);
|
||||
if (timer != null) {
|
||||
entry.setField(ExtraFields.CPU_TOTAL, Long.toString(timer.getElapsedCpuTotal()));
|
||||
}
|
||||
|
||||
boolean wasError = response != null && responseConsideredFailed(response.getResponseCode());
|
||||
|
||||
long responseTime = response != null ? response.getResponseTime() : 0L;
|
||||
|
||||
String logLine = writeLogLine(entry, responseTime, wasError);
|
||||
|
||||
// This code path is called for both pre- and post-logging.
// Prevent the same request from showing up twice by only logging on post-logging.
|
||||
if (response != null && DeciderUtil.isAvailableForRandomRecipient(
|
||||
decider, LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
|
||||
Base64RequestResponseForLogging.randomRequest(logLine, request, response).log();
|
||||
}
|
||||
|
||||
// Unknown client request logging only applies to pre-logging.
|
||||
if (enableLogUnknownClientRequests && response == null) {
|
||||
UnknownClientRequestForLogging unknownClientRequestLogger =
|
||||
UnknownClientRequestForLogging.unknownClientRequest(logLine, request);
|
||||
if (unknownClientRequestLogger != null) {
|
||||
unknownClientRequestLogger.log();
|
||||
}
|
||||
}
|
||||
|
||||
if (wasError
|
||||
&& DeciderUtil.isAvailableForRandomRecipient(
|
||||
decider, LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY)) {
|
||||
new RequestResponseForLogging(request, response).logFailedRequest();
|
||||
Base64RequestResponseForLogging.failedRequest(logLine, request, response).log();
|
||||
}
|
||||
|
||||
boolean wasSlow = response != null
|
||||
&& responseTime >= DeciderUtil.getAvailability(
|
||||
decider, SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY);
|
||||
if (wasSlow
|
||||
&& DeciderUtil.isAvailableForRandomRecipient(
|
||||
decider, LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
|
||||
Base64RequestResponseForLogging.slowRequest(logLine, request, response).log();
|
||||
}
|
||||
|
||||
FailureRatioCounter failureRatioCounter =
|
||||
FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE.get(request.getQuerySource());
|
||||
if (failureRatioCounter != null) {
|
||||
failureRatioCounter.requestFinished(!wasError);
|
||||
} else {
|
||||
NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER.requestFinished(!wasError);
|
||||
}
|
||||
|
||||
} catch (Exception e) {
|
||||
LOG.error("Exception building log entry ", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void setRequestLogEntries(LogEntry entry, EarlybirdRequest request) {
|
||||
entry.setField(Fields.CLIENT_HOST, request.getClientHost());
|
||||
entry.setField(Fields.CLIENT_REQUEST_ID, request.getClientRequestID());
|
||||
entry.setField(Fields.REQUEST_TYPE, requestTypeForLog(request));
|
||||
|
||||
if (request.isSetSearchQuery()) {
|
||||
ThriftSearchQuery searchQuery = request.getSearchQuery();
|
||||
entry.setField(Fields.QUERY, searchQuery.getSerializedQuery());
|
||||
|
||||
if (searchQuery.isSetMaxHitsToProcess()) {
|
||||
entry.setField(ExtraFields.QUERY_MAX_HITS_TO_PROCESS,
|
||||
Integer.toString(searchQuery.getMaxHitsToProcess()));
|
||||
}
|
||||
|
||||
if (searchQuery.isSetCollectorParams()
|
||||
&& searchQuery.getCollectorParams().isSetTerminationParams()
|
||||
&& searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
|
||||
entry.setField(ExtraFields.COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
|
||||
Integer.toString(searchQuery.getCollectorParams().getTerminationParams()
|
||||
.getMaxHitsToProcess()));
|
||||
}
|
||||
|
||||
if (searchQuery.isSetRelevanceOptions()
|
||||
&& searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
|
||||
entry.setField(ExtraFields.RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
|
||||
Integer.toString(searchQuery.getRelevanceOptions().getMaxHitsToProcess()));
|
||||
}
|
||||
}
|
||||
|
||||
entry.setField(Fields.NUM_REQUESTED, Integer.toString(numRequestedForLog(request)));
|
||||
|
||||
if (request.isSetQuerySource()) {
|
||||
entry.setField(ExtraFields.QUERY_SOURCE, request.getQuerySource().name());
|
||||
}
|
||||
|
||||
if (request.isSetClientId()) {
|
||||
entry.setField(ExtraFields.CLIENT_ID, request.getClientId());
|
||||
}
|
||||
|
||||
entry.setField(RootOnlyExtraFields.CACHING_ALLOWED,
|
||||
Boolean.toString(EarlybirdRequestUtil.isCachingAllowed(request)));
|
||||
|
||||
entry.setField(RootOnlyExtraFields.DEBUG_MODE, Byte.toString(request.getDebugMode()));
|
||||
|
||||
Option<ClientId> clientIdOption = ClientId$.MODULE$.current();
|
||||
if (clientIdOption.isDefined()) {
|
||||
entry.setField(ExtraFields.FINAGLE_CLIENT_ID, clientIdOption.get().name());
|
||||
}
|
||||
|
||||
setLogEntriesFromTwitterContext(entry);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
Option<Viewer> getTwitterContext() {
|
||||
return TwitterContext.acquire(TwitterContextPermit.get()).apply();
|
||||
}
|
||||
|
||||
private void setLogEntriesFromTwitterContext(LogEntry entry) {
|
||||
Option<Viewer> viewerOption = getTwitterContext();
|
||||
if (viewerOption.nonEmpty()) {
|
||||
Viewer viewer = viewerOption.get();
|
||||
|
||||
if (viewer.userAgent().nonEmpty()) {
|
||||
String userAgent = viewer.userAgent().get();
|
||||
|
||||
// We only replace the comma in the user-agent with %2C to make it easily parseable,
// especially with command line tools like cut/sed/awk.
|
||||
userAgent = userAgent.replace(",", "%2C");
|
||||
|
||||
entry.setField(RootOnlyExtraFields.USER_AGENT, userAgent);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private void setResponseLogEntries(LogEntry entry, EarlybirdResponse response) {
|
||||
if (response != null) {
|
||||
entry.setField(Fields.NUM_RETURNED, Integer.toString(numResultsForLog(response)));
|
||||
entry.setField(Fields.RESPONSE_CODE, String.valueOf(response.getResponseCode()));
|
||||
entry.setField(Fields.RESPONSE_TIME_MICROS, Long.toString(response.getResponseTimeMicros()));
|
||||
if (response.isSetSearchResults()) {
|
||||
entry.setField(ExtraFields.NUM_HITS_PROCESSED,
|
||||
Integer.toString(response.getSearchResults().getNumHitsProcessed()));
|
||||
entry.setField(ExtraFields.QUERY_COST,
|
||||
Double.toString(response.getSearchResults().getQueryCost()));
|
||||
if (response.getSearchResults().isSetScoringTimeNanos()) {
|
||||
entry.setField(ShardOnlyExtraFields.SCORING_TIME_NANOS,
|
||||
Long.toString(response.getSearchResults().getScoringTimeNanos()));
|
||||
}
|
||||
}
|
||||
if (response.isSetCacheHit()) {
|
||||
entry.setField(RootOnlyExtraFields.CACHE_HIT, String.valueOf(response.isCacheHit()));
|
||||
}
|
||||
if (response.isSetNumSearchedSegments()) {
|
||||
entry.setField(ShardOnlyExtraFields.NUM_SEARCHED_SEGMENTS,
|
||||
Integer.toString(response.getNumSearchedSegments()));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private static int numRequestedForLog(EarlybirdRequest request) {
|
||||
int num = 0;
|
||||
if (request.isSetFacetRequest() && request.getFacetRequest().isSetFacetFields()) {
|
||||
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
|
||||
num += field.getNumResults();
|
||||
}
|
||||
} else if (request.isSetTermStatisticsRequest()) {
|
||||
num = request.getTermStatisticsRequest().getTermRequestsSize();
|
||||
} else if (request.isSetSearchQuery()) {
|
||||
num = request.getSearchQuery().isSetCollectorParams()
|
||||
? request.getSearchQuery().getCollectorParams().getNumResultsToReturn() : 0;
|
||||
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
|
||||
num = Math.max(num, request.getSearchQuery().getSearchStatusIdsSize());
|
||||
}
|
||||
}
|
||||
return num;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the number of results in the given response. If the response is a term stats response,
|
||||
* then the returned value will be the number of term results. If the response is a facet
|
||||
* response, then the returned value will be the number of facet results. Otherwise, the returned
|
||||
* value will be the number of search results.
|
||||
*/
|
||||
public static int numResultsForLog(EarlybirdResponse response) {
|
||||
if (response == null) {
|
||||
return 0;
|
||||
} else if (response.isSetFacetResults()) {
|
||||
return ThriftSearchResultUtil.numFacetResults(response.getFacetResults());
|
||||
} else if (response.isSetTermStatisticsResults()) {
|
||||
return response.getTermStatisticsResults().getTermResultsSize();
|
||||
} else {
|
||||
return ThriftSearchResultUtil.numResults(response.getSearchResults());
|
||||
}
|
||||
}
|
||||
|
||||
private static String requestTypeForLog(EarlybirdRequest request) {
|
||||
StringBuilder requestType = new StringBuilder(64);
|
||||
if (request.isSetFacetRequest()) {
|
||||
requestType.append("FACETS");
|
||||
int numFields = request.getFacetRequest().getFacetFieldsSize();
|
||||
if (numFields > 0) {
|
||||
// For 1 or 2 fields, just put them in the request type. For more, just log the number.
|
||||
if (numFields <= 2) {
|
||||
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
|
||||
requestType.append(":").append(field.getFieldName().toUpperCase());
|
||||
}
|
||||
} else {
|
||||
requestType.append(":MULTI-").append(numFields);
|
||||
}
|
||||
}
|
||||
} else if (request.isSetTermStatisticsRequest()) {
|
||||
ThriftTermStatisticsRequest termStatsRequest = request.getTermStatisticsRequest();
|
||||
requestType.append("TERMSTATS-")
|
||||
.append(termStatsRequest.getTermRequestsSize());
|
||||
|
||||
ThriftHistogramSettings histoSettings = termStatsRequest.getHistogramSettings();
|
||||
if (histoSettings != null) {
|
||||
String binSizeVal = String.valueOf(TermStatisticsUtil.determineBinSize(histoSettings));
|
||||
String numBinsVal = String.valueOf(histoSettings.getNumBins());
|
||||
requestType.append(":NUMBINS-").append(numBinsVal).append(":BINSIZE-").append(binSizeVal);
|
||||
}
|
||||
} else if (request.isSetSearchQuery()) {
|
||||
requestType.append("SEARCH:");
|
||||
requestType.append(request.getSearchQuery().getRankingMode().name());
|
||||
// Denote when a from-user ID filter is present.
|
||||
if (request.getSearchQuery().isSetFromUserIDFilter64()) {
|
||||
requestType.append(":NETWORK-")
|
||||
.append(request.getSearchQuery().getFromUserIDFilter64Size());
|
||||
}
|
||||
// Denote when required status ids are present.
|
||||
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
|
||||
requestType.append(":IDS-").append(request.getSearchQuery().getSearchStatusIdsSize());
|
||||
}
|
||||
}
|
||||
return requestType.toString();
|
||||
}
|
||||
|
||||
private static Map<ThriftQuerySource, FailureRatioCounter> preBuildFailureRatioCounters() {
|
||||
Map<ThriftQuerySource, FailureRatioCounter> counterByQuerySource =
|
||||
new EnumMap<>(ThriftQuerySource.class);
|
||||
|
||||
for (ThriftQuerySource thriftQuerySource : ThriftQuerySource.values()) {
|
||||
FailureRatioCounter counter = new FailureRatioCounter("earlybird_logger", "query_source",
|
||||
thriftQuerySource.toString());
|
||||
counterByQuerySource.put(thriftQuerySource, counter);
|
||||
}
|
||||
|
||||
return Maps.immutableEnumMap(counterByQuerySource);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,37 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import com.twitter.decider.Decider;
|
||||
import com.twitter.search.common.metrics.Timer;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
|
||||
|
||||
public final class EarlybirdRequestPostLogger {
|
||||
private final EarlybirdRequestLogger logger;
|
||||
|
||||
public static EarlybirdRequestPostLogger buildForRoot(
|
||||
int latencyWarnThreshold, Decider decider) {
|
||||
|
||||
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
|
||||
EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);
|
||||
|
||||
return new EarlybirdRequestPostLogger(requestLogger);
|
||||
}
|
||||
|
||||
public static EarlybirdRequestPostLogger buildForShard(
|
||||
int latencyWarnThreshold, Decider decider) {
|
||||
|
||||
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
|
||||
EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);
|
||||
|
||||
return new EarlybirdRequestPostLogger(requestLogger);
|
||||
}
|
||||
|
||||
private EarlybirdRequestPostLogger(EarlybirdRequestLogger logger) {
|
||||
this.logger = logger;
|
||||
}
|
||||
|
||||
public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
|
||||
EarlybirdRequestUtil.updateHitsCounters(request);
|
||||
logger.logRequest(request, response, timer);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,32 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import com.twitter.decider.Decider;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
|
||||
public final class EarlybirdRequestPreLogger {
|
||||
private final EarlybirdRequestLogger logger;
|
||||
|
||||
public static EarlybirdRequestPreLogger buildForRoot(Decider decider) {
|
||||
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
|
||||
EarlybirdRequestPreLogger.class.getName(), Integer.MAX_VALUE, decider);
|
||||
|
||||
return new EarlybirdRequestPreLogger(requestLogger);
|
||||
}
|
||||
|
||||
public static EarlybirdRequestPreLogger buildForShard(
|
||||
int latencyWarnThreshold, Decider decider) {
|
||||
|
||||
EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
|
||||
EarlybirdRequestPreLogger.class.getName(), latencyWarnThreshold, decider);
|
||||
|
||||
return new EarlybirdRequestPreLogger(requestLogger);
|
||||
}
|
||||
|
||||
private EarlybirdRequestPreLogger(EarlybirdRequestLogger logger) {
|
||||
this.logger = logger;
|
||||
}
|
||||
|
||||
public void logRequest(EarlybirdRequest request) {
|
||||
logger.logRequest(request, null, null);
|
||||
}
|
||||
}
|
Binary file not shown.
@ -1,244 +0,0 @@
|
||||
package com.twitter.search.earlybird.common;
|
||||
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.metrics.SearchMovingAverage;
|
||||
import com.twitter.search.common.metrics.SearchRateCounter;
|
||||
import com.twitter.search.common.metrics.SearchTimerStats;
|
||||
import com.twitter.search.common.query.thriftjava.CollectorParams;
|
||||
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
|
||||
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
|
||||
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;
|
||||
|
||||
public final class EarlybirdRequestUtil {
|
||||
// This logger is set up to log to a separate set of log files (request_info) and use an
// async logger so as to not block the searcher thread. See search/earlybird/config/log4j.xml.
|
||||
private static final Logger LOG = LoggerFactory.getLogger(EarlybirdRequestUtil.class);
|
||||
|
||||
@VisibleForTesting
|
||||
static final SearchMovingAverage REQUESTED_NUM_RESULTS_STAT =
|
||||
SearchMovingAverage.export("requested_num_results");
|
||||
|
||||
@VisibleForTesting
|
||||
static final SearchMovingAverage REQUESTED_MAX_HITS_TO_PROCESS_STAT =
|
||||
SearchMovingAverage.export("requested_max_hits_to_process");
|
||||
|
||||
@VisibleForTesting
|
||||
static final SearchMovingAverage REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT =
|
||||
SearchMovingAverage.export("requested_collector_params_max_hits_to_process");
|
||||
|
||||
@VisibleForTesting
|
||||
static final SearchMovingAverage REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT =
|
||||
SearchMovingAverage.export("requested_relevance_options_max_hits_to_process");
|
||||
|
||||
@VisibleForTesting
|
||||
static final SearchCounter REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT =
|
||||
SearchCounter.export("requested_max_hits_to_process_are_different");
|
||||
|
||||
private static final SearchRateCounter REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT =
|
||||
SearchRateCounter.export("request_with_more_than_2k_num_result");
|
||||
private static final SearchRateCounter REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT =
|
||||
SearchRateCounter.export("request_with_more_than_4k_num_result");
|
||||
|
||||
// Stats for tracking clock skew between earlybird and the client-specified request timestamp.
|
||||
@VisibleForTesting
|
||||
public static final SearchTimerStats CLIENT_CLOCK_DIFF_ABS =
|
||||
SearchTimerStats.export("client_clock_diff_abs", TimeUnit.MILLISECONDS, false, true);
|
||||
@VisibleForTesting
|
||||
public static final SearchTimerStats CLIENT_CLOCK_DIFF_POS =
|
||||
SearchTimerStats.export("client_clock_diff_pos", TimeUnit.MILLISECONDS, false, true);
|
||||
@VisibleForTesting
|
||||
public static final SearchTimerStats CLIENT_CLOCK_DIFF_NEG =
|
||||
SearchTimerStats.export("client_clock_diff_neg", TimeUnit.MILLISECONDS, false, true);
|
||||
@VisibleForTesting
|
||||
public static final SearchRateCounter CLIENT_CLOCK_DIFF_MISSING =
|
||||
SearchRateCounter.export("client_clock_diff_missing");
|
||||
|
||||
private static final int MAX_NUM_RESULTS = 4000;
|
||||
private static final int OLD_MAX_NUM_RESULTS = 2000;
|
||||
|
||||
private EarlybirdRequestUtil() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Logs and fixes some potentially excessive values in the given request.
|
||||
*/
|
||||
public static void logAndFixExcessiveValues(EarlybirdRequest request) {
|
||||
ThriftSearchQuery searchQuery = request.getSearchQuery();
|
||||
if (searchQuery != null) {
|
||||
int maxHitsToProcess = 0;
|
||||
int numResultsToReturn = 0;
|
||||
|
||||
if (searchQuery.isSetCollectorParams()) {
|
||||
numResultsToReturn = searchQuery.getCollectorParams().getNumResultsToReturn();
|
||||
|
||||
if (searchQuery.getCollectorParams().isSetTerminationParams()) {
|
||||
maxHitsToProcess =
|
||||
searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
|
||||
}
|
||||
}
|
||||
|
||||
if (maxHitsToProcess > 50000) {
|
||||
LOG.warn("Excessive max hits in " + request.toString());
|
||||
}
|
||||
|
||||
// We used to limit the number of results to 2000. These two counters help us track whether we
// receive too many requests with a large number of results set.
|
||||
String warningMessageTemplate = "Exceed %d num result in %s";
|
||||
if (numResultsToReturn > MAX_NUM_RESULTS) {
|
||||
LOG.warn(String.format(warningMessageTemplate, MAX_NUM_RESULTS, request.toString()));
|
||||
REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT.increment();
|
||||
searchQuery.getCollectorParams().setNumResultsToReturn(MAX_NUM_RESULTS);
|
||||
} else if (numResultsToReturn > OLD_MAX_NUM_RESULTS) {
|
||||
LOG.warn(String.format(warningMessageTemplate, OLD_MAX_NUM_RESULTS, request.toString()));
|
||||
REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT.increment();
|
||||
}
|
||||
|
||||
ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions();
|
||||
if (options != null) {
|
||||
if (options.getMaxHitsToProcess() > 50000) {
|
||||
LOG.warn("Excessive max hits in " + request.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets {@code request.searchQuery.collectorParams} if they are not already set.
|
||||
*/
|
||||
public static void checkAndSetCollectorParams(EarlybirdRequest request) {
|
||||
ThriftSearchQuery searchQuery = request.getSearchQuery();
|
||||
if (searchQuery == null) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (!searchQuery.isSetCollectorParams()) {
|
||||
searchQuery.setCollectorParams(new CollectorParams());
|
||||
}
|
||||
if (!searchQuery.getCollectorParams().isSetNumResultsToReturn()) {
|
||||
searchQuery.getCollectorParams().setNumResultsToReturn(searchQuery.getNumResults());
|
||||
}
|
||||
if (!searchQuery.getCollectorParams().isSetTerminationParams()) {
|
||||
CollectorTerminationParams terminationParams = new CollectorTerminationParams();
|
||||
if (request.isSetTimeoutMs()) {
|
||||
terminationParams.setTimeoutMs(request.getTimeoutMs());
|
||||
}
|
||||
if (request.isSetMaxQueryCost()) {
|
||||
terminationParams.setMaxQueryCost(request.getMaxQueryCost());
|
||||
}
|
||||
searchQuery.getCollectorParams().setTerminationParams(terminationParams);
|
||||
}
|
||||
setMaxHitsToProcess(searchQuery);
|
||||
}
|
||||
|
||||
// Earlybirds will only look for maxHitsToProcess in CollectorParams.TerminationParams.
// The priority for setting CollectorParams.TerminationParams.maxHitsToProcess is:
// 1. CollectorParams
// 2. RelevanceOptions
// 3. ThriftSearchQuery.maxHitsToProcess
|
||||
private static void setMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
|
||||
CollectorTerminationParams terminationParams = thriftSearchQuery
|
||||
.getCollectorParams().getTerminationParams();
|
||||
if (!terminationParams.isSetMaxHitsToProcess()) {
|
||||
if (thriftSearchQuery.isSetRelevanceOptions()
|
||||
&& thriftSearchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
|
||||
terminationParams.setMaxHitsToProcess(
|
||||
thriftSearchQuery.getRelevanceOptions().getMaxHitsToProcess());
|
||||
} else {
|
||||
terminationParams.setMaxHitsToProcess(thriftSearchQuery.getMaxHitsToProcess());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a copy of the given request and unsets the binary fields to make the logged line for
|
||||
* this request look nicer.
|
||||
*/
|
||||
public static EarlybirdRequest copyAndClearUnnecessaryValuesForLogging(EarlybirdRequest request) {
|
||||
EarlybirdRequest copiedRequest = request.deepCopy();
|
||||
|
||||
if (copiedRequest.isSetSearchQuery()) {
|
||||
// These fields are very large and the binary data doesn't play well with formz
|
||||
copiedRequest.getSearchQuery().unsetTrustedFilter();
|
||||
copiedRequest.getSearchQuery().unsetDirectFollowFilter();
|
||||
}
|
||||
|
||||
return copiedRequest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates some hit-related stats based on the parameters in the given request.
|
||||
*/
|
||||
public static void updateHitsCounters(EarlybirdRequest request) {
|
||||
if ((request == null) || !request.isSetSearchQuery()) {
|
||||
return;
|
||||
}
|
||||
|
||||
ThriftSearchQuery searchQuery = request.getSearchQuery();
|
||||
|
||||
if (searchQuery.isSetNumResults()) {
|
||||
REQUESTED_NUM_RESULTS_STAT.addSample(searchQuery.getNumResults());
|
||||
}
|
||||
|
||||
if (searchQuery.isSetMaxHitsToProcess()) {
|
||||
REQUESTED_MAX_HITS_TO_PROCESS_STAT.addSample(searchQuery.getMaxHitsToProcess());
|
||||
}
|
||||
|
||||
Integer collectorParamsMaxHitsToProcess = null;
|
||||
if (searchQuery.isSetCollectorParams()
|
||||
&& searchQuery.getCollectorParams().isSetTerminationParams()
|
||||
&& searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
|
||||
collectorParamsMaxHitsToProcess =
|
||||
searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
|
||||
REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT
|
||||
.addSample(collectorParamsMaxHitsToProcess);
|
||||
}
|
||||
|
||||
Integer relevanceOptionsMaxHitsToProcess = null;
|
||||
if (searchQuery.isSetRelevanceOptions()
|
||||
&& searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
|
||||
relevanceOptionsMaxHitsToProcess = searchQuery.getRelevanceOptions().getMaxHitsToProcess();
|
||||
REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT
|
||||
.addSample(relevanceOptionsMaxHitsToProcess);
|
||||
}
|
||||
|
||||
if ((collectorParamsMaxHitsToProcess != null)
|
||||
&& (relevanceOptionsMaxHitsToProcess != null)
|
||||
&& (collectorParamsMaxHitsToProcess != relevanceOptionsMaxHitsToProcess)) {
|
||||
REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT.increment();
|
||||
}
|
||||
}
|
||||
|
||||
public static boolean isCachingAllowed(EarlybirdRequest request) {
|
||||
return !request.isSetCachingParams() || request.getCachingParams().isCache();
|
||||
}
|
||||
|
||||
/**
|
||||
* Track the clock difference between this server and its client's specified request time.
|
||||
* When there is no clock drift between machines, this will record the inflight time between this
|
||||
* server and the client.
|
||||
*
|
||||
* @param request the incoming earlybird request.
|
||||
*/
|
||||
public static void recordClientClockDiff(EarlybirdRequest request) {
|
||||
if (request.isSetClientRequestTimeMs()) {
|
||||
final long timeDiff = System.currentTimeMillis() - request.getClientRequestTimeMs();
|
||||
final long timeDiffAbs = Math.abs(timeDiff);
|
||||
if (timeDiff >= 0) {
|
||||
CLIENT_CLOCK_DIFF_POS.timerIncrement(timeDiffAbs);
|
||||
} else {
|
||||
CLIENT_CLOCK_DIFF_NEG.timerIncrement(timeDiffAbs);
|
||||
}
|
||||
CLIENT_CLOCK_DIFF_ABS.timerIncrement(timeDiffAbs);
|
||||
} else {
|
||||
CLIENT_CLOCK_DIFF_MISSING.increment();
|
||||
}
|
||||
}
|
||||
}
|
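For context, these helpers are meant to be applied to an incoming request before it is executed. A minimal sketch of that flow, assuming a hypothetical serve() entry point (the method name and its enclosing class are illustrative, not part of this file):

// Illustrative caller only; not part of EarlybirdRequestUtil.
void serve(EarlybirdRequest request) {
  EarlybirdRequestUtil.recordClientClockDiff(request);      // client vs. server clock-skew stats
  EarlybirdRequestUtil.checkAndSetCollectorParams(request); // fill in collector/termination params
  EarlybirdRequestUtil.logAndFixExcessiveValues(request);   // clamp numResultsToReturn, warn on huge maxHitsToProcess
  EarlybirdRequestUtil.updateHitsCounters(request);         // export request-shape stats
  // ... hand the normalized request to the searcher ...
}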
Binary file not shown.
@ -1,28 +0,0 @@
package com.twitter.search.earlybird.common;

import javax.inject.Inject;
import javax.inject.Singleton;

import org.apache.thrift.protocol.TProtocolFactory;

import com.twitter.finagle.Service;
import com.twitter.search.common.util.thrift.ThriftToBytesFilter;
import com.twitter.search.earlybird.thrift.EarlybirdService;

@Singleton
public class EarlybirdThriftBackend extends EarlybirdService.ServiceToClient {

  /**
   * Wraps the bytes service back into an EarlybirdService.ServiceToClient, which
   * is an EarlybirdService.ServiceIface again.
   */
  @Inject
  public EarlybirdThriftBackend(
      ThriftToBytesFilter thriftToBytesFilter,
      Service<byte[], byte[]> byteService,
      TProtocolFactory protocolFactory) {

    super(thriftToBytesFilter.andThen(byteService), protocolFactory);
  }

}
Binary file not shown.
@ -1,34 +0,0 @@
package com.twitter.search.earlybird.common;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchRateCounter;

/**
 * When incremented, a non-paging alert will be triggered. Use this to assert for bad conditions
 * that should generally never happen.
 */
public class NonPagingAssert {
  private static final Logger LOG = LoggerFactory.getLogger(NonPagingAssert.class);

  private static final String ASSERT_STAT_PREFIX = "non_paging_assert_";

  private final String name;
  private final SearchRateCounter assertCounter;

  public NonPagingAssert(String name) {
    this.name = name;
    this.assertCounter = SearchRateCounter.export(ASSERT_STAT_PREFIX + name);
  }

  public void assertFailed() {
    LOG.error("NonPagingAssert failed: {}", name);
    assertCounter.increment();
  }

  public static void assertFailed(String name) {
    NonPagingAssert nonPagingAssert = new NonPagingAssert(name);
    nonPagingAssert.assertFailed();
  }
}
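A brief usage sketch: export the assert once, then increment it whenever the invariant is violated. The stat name and the surrounding method are made up for illustration; only the NonPagingAssert API above is real.

// Illustrative only.
private static final NonPagingAssert UNEXPECTED_NULL_SEGMENT =
    new NonPagingAssert("unexpected_null_segment");

void onSegment(Object segment) {
  if (segment == null) {
    // Logs the failure and bumps non_paging_assert_unexpected_null_segment.
    UNEXPECTED_NULL_SEGMENT.assertFailed();
    return;
  }
  // ... normal handling ...
}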
Binary file not shown.
@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TSimpleJSONProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponseForLogging {
  private static final Logger LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class);

  private static final Logger FAILED_REQUEST_LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class.getName() + ".FailedRequests");

  private final EarlybirdRequest request;
  private final EarlybirdResponse response;

  public RequestResponseForLogging(EarlybirdRequest request,
      EarlybirdResponse response) {
    this.request = request;
    this.response = response;
  }

  private String serialize(EarlybirdRequest clearedRequest, EarlybirdResponse theResponse) {
    TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
    try {
      String requestJson = serializer.toString(clearedRequest);
      String responseJson = serializer.toString(theResponse);
      return "{\"request\":" + requestJson + ", \"response\":" + responseJson + "}";
    } catch (TException e) {
      LOG.error("Failed to serialize request/response for logging.", e);
      return "";
    }
  }

  /**
   * Logs the request and response stored in this instance to the failure log file.
   */
  public void logFailedRequest() {
    // Do the serializing/concatting this way so it happens on the background thread for
    // async logging.
    FAILED_REQUEST_LOG.info("{}", new Object() {
      @Override
      public String toString() {
        return serialize(
            EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request), response);
      }
    });
  }
}
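Usage is a single call on the failure path; the anonymous Object above defers JSON serialization until the async appender formats the line. A sketch, assuming the caller already holds the failed request/response pair (variable names are illustrative):

// Illustrative only.
new RequestResponseForLogging(failedRequest, failedResponse).logFailedRequest();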
Binary file not shown.
@ -1,44 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.lucene.search.Query;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponsePair {
  private final EarlybirdRequest request;
  private final EarlybirdResponse response;
  private final org.apache.lucene.search.Query luceneQuery;

  // The serialized query in its final form, after various modifications have been applied to it.
  // As a note, we have some code paths in which this can be null, but I don't really see them
  // triggered in production right now.
  private final com.twitter.search.queryparser.query.Query finalSerializedQuery;

  public RequestResponsePair(
      EarlybirdRequest request,
      com.twitter.search.queryparser.query.Query finalSerializedQuery,
      org.apache.lucene.search.Query luceneQuery,
      EarlybirdResponse response) {
    this.request = request;
    this.luceneQuery = luceneQuery;
    this.response = response;
    this.finalSerializedQuery = finalSerializedQuery;
  }

  public String getFinalSerializedQuery() {
    return finalSerializedQuery != null ? finalSerializedQuery.serialize() : "N/A";
  }

  public EarlybirdRequest getRequest() {
    return request;
  }

  public EarlybirdResponse getResponse() {
    return response;
  }

  public Query getLuceneQuery() {
    return luceneQuery;
  }
}
Binary file not shown.
@ -1,77 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;

import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

/**
 * This class logs all requests that are missing either the finagle ID or the client ID.
 */
public final class UnknownClientRequestForLogging {
  private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class);
  private static final Logger LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class.getName() + ".unknownClientRequests");

  private final String logLine;
  private final EarlybirdRequest request;
  private final String clientId;
  private final String finagleId;

  private final Base64 base64 = new Base64();
  private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());

  private UnknownClientRequestForLogging(
      String logLine,
      EarlybirdRequest request,
      String clientId,
      String finagleId) {

    this.logLine = logLine;
    this.request = request;
    this.clientId = clientId;
    this.finagleId = finagleId;
  }

  /**
   * Returns an UnknownClientRequestForLogging instance if a client ID is not set on the given
   * earlybird request. If the request has a client ID set, {@code null} is returned.
   *
   * @param logLine Additional information to propagate to the log file, when logging this request.
   * @param request The earlybird request.
   */
  public static UnknownClientRequestForLogging unknownClientRequest(
      String logLine, EarlybirdRequest request) {
    String clientId = ClientIdUtil.getClientIdFromRequest(request);
    String finagleId = FinagleUtil.getFinagleClientName();

    if (clientId.equals(ClientIdUtil.UNSET_CLIENT_ID)) {
      return new UnknownClientRequestForLogging(logLine, request, clientId, finagleId);
    } else {
      return null;
    }
  }

  private String asBase64() {
    try {
      // Need to make a deepCopy() here, because the request may still be in use (e.g. if we are
      // doing this in the pre-logger), and we should not be modifying crucial fields on the
      // EarlybirdRequest in place.
      EarlybirdRequest clearedRequest = request.deepCopy();
      clearedRequest.unsetClientRequestTimeMs();
      return base64.encodeToString(serializer.serialize(clearedRequest));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize request for logging.", e);
      return "failed_to_serialize";
    }
  }

  public void log() {
    LOG.info("{},{},{},{}", clientId, finagleId, logLine, asBase64());
  }
}
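The factory method doubles as the filter: it returns null when the request already carries a client ID. A sketch of the expected call site (the log-line text and variable names are illustrative):

// Illustrative only: log requests that arrive without a client ID.
UnknownClientRequestForLogging unknown =
    UnknownClientRequestForLogging.unknownClientRequest("pre-logger", request);
if (unknown != null) {
  unknown.log();  // writes clientId,finagleId,logLine,<base64-encoded thrift request>
}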
@ -1,21 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/commons:commons-lang3",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/jvm/org/yaml:snakeyaml",
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/text/version",
        "src/java/com/twitter/search/common/aurora",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/zookeeper",
    ],
)
BIN src/java/com/twitter/search/earlybird/common/config/BUILD.docx Normal file
Binary file not shown.
@ -1,363 +0,0 @@
|
||||
package com.twitter.search.earlybird.common.config;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import javax.annotation.Nullable;
|
||||
|
||||
import com.google.common.collect.ImmutableMap;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.common_internal.text.version.PenguinVersion;
|
||||
import com.twitter.search.common.aurora.AuroraInstanceKey;
|
||||
import com.twitter.search.common.config.Config;
|
||||
import com.twitter.search.common.config.ConfigFile;
|
||||
import com.twitter.search.common.config.ConfigurationException;
|
||||
import com.twitter.search.common.config.SearchPenguinVersionsConfig;
|
||||
|
||||
public final class EarlybirdConfig {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(EarlybirdConfig.class);
|
||||
|
||||
private static final String DEFAULT_CONFIG_FILE = "earlybird-search.yml";
|
||||
private static final String LATE_TWEET_BUFFER_KEY = "late_tweet_buffer";
|
||||
|
||||
public static final String EARLYBIRD_ZK_CONFIG_DIR = "/twitter/search/production/earlybird/";
|
||||
public static final String EARLYBIRD_CONFIG_DIR = "earlybird/config";
|
||||
|
||||
public static final String USER_SNAPSHOT_BASE_DIR = "user_snapshot_base_dir";
|
||||
|
||||
private static volatile ConfigFile earlybirdConfig = null;
|
||||
private static volatile Map<String, Object> overrideValueMap = ImmutableMap.of();
|
||||
|
||||
private static String logDirOverride = null;
|
||||
private static AuroraInstanceKey auroraInstanceKey = null;
|
||||
|
||||
private static int adminPort;
|
||||
|
||||
private EarlybirdConfig() { }
|
||||
|
||||
private static final class PenguinVersionHolder {
|
||||
private static final PenguinVersion PENGUIN_VERSION_SINGLETON =
|
||||
SearchPenguinVersionsConfig.getSingleSupportedVersion(
|
||||
EarlybirdProperty.PENGUIN_VERSION.get());
|
||||
private static final byte PENGUIN_VERSION_BYTE_VALUE =
|
||||
PENGUIN_VERSION_SINGLETON.getByteValue();
|
||||
}
|
||||
|
||||
public static byte getPenguinVersionByte() {
|
||||
return PenguinVersionHolder.PENGUIN_VERSION_BYTE_VALUE;
|
||||
}
|
||||
|
||||
public static PenguinVersion getPenguinVersion() {
|
||||
return PenguinVersionHolder.PENGUIN_VERSION_SINGLETON;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the earlybird configuration from the given file.
|
||||
*/
|
||||
public static synchronized void init(@Nullable String configFile) {
|
||||
if (earlybirdConfig == null) {
|
||||
String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile;
|
||||
earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, file);
|
||||
}
|
||||
}
|
||||
|
||||
public static synchronized void setOverrideValues(Map<String, Object> overrideValues) {
|
||||
overrideValueMap = ImmutableMap.copyOf(overrideValues);
|
||||
}
|
||||
|
||||
/**
|
||||
* Pack all values in a string that can be printed for informational purposes.
|
||||
* @return the string.
|
||||
*/
|
||||
public static String allValuesAsString() {
|
||||
Map<String, String> stringMap = earlybirdConfig.getStringMap();
|
||||
|
||||
StringBuilder stringBuilder = new StringBuilder();
|
||||
|
||||
stringBuilder.append("Config environment: " + Config.getEnvironment() + "\n\n");
|
||||
stringBuilder.append(
|
||||
String.format("Values from earlybird-search.yml (total %d):\n", stringMap.size()));
|
||||
|
||||
stringMap.forEach((key, value) -> {
|
||||
stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
|
||||
if (overrideValueMap.containsKey(key)) {
|
||||
stringBuilder.append(String.format(
|
||||
" override value: %s\n", overrideValueMap.get(key).toString()));
|
||||
}
|
||||
});
|
||||
|
||||
stringBuilder.append(String.format(
|
||||
"\n\nAll command-line overrides (total: %d):\n", overrideValueMap.size()));
|
||||
overrideValueMap.forEach((key, value) -> {
|
||||
stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
|
||||
});
|
||||
|
||||
return stringBuilder.toString();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a string. If the property is not set, a runtime
|
||||
* exception is thrown.
|
||||
*/
|
||||
public static String getString(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (String) overrideValue;
|
||||
}
|
||||
|
||||
try {
|
||||
return earlybirdConfig.getString(property);
|
||||
} catch (ConfigurationException e) {
|
||||
LOG.error("Fatal error: could not get config string " + property, e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a string.
|
||||
*/
|
||||
public static String getString(String property, String defaultValue) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (String) overrideValue;
|
||||
}
|
||||
|
||||
return earlybirdConfig.getString(property, defaultValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as an integer. If the property is not set, a runtime
|
||||
* exception is thrown.
|
||||
*/
|
||||
public static int getInt(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (int) overrideValue;
|
||||
}
|
||||
|
||||
try {
|
||||
return earlybirdConfig.getInt(property);
|
||||
} catch (ConfigurationException e) {
|
||||
LOG.error("Fatal error: could not get config int " + property, e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as an integer.
|
||||
*/
|
||||
public static int getInt(String property, int defaultValue) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (int) overrideValue;
|
||||
}
|
||||
|
||||
return earlybirdConfig.getInt(property, defaultValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a double.
|
||||
*/
|
||||
public static double getDouble(String property, double defaultValue) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (double) overrideValue;
|
||||
}
|
||||
|
||||
return earlybirdConfig.getDouble(property, defaultValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a long. If the property is not set, a runtime
|
||||
* exception is thrown.
|
||||
*/
|
||||
public static long getLong(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (long) overrideValue;
|
||||
}
|
||||
|
||||
try {
|
||||
return earlybirdConfig.getLong(property);
|
||||
} catch (ConfigurationException e) {
|
||||
LOG.error("Fatal error: could not get config long " + property, e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a long.
|
||||
*/
|
||||
public static long getLong(String property, long defaultValue) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (long) overrideValue;
|
||||
}
|
||||
|
||||
return earlybirdConfig.getLong(property, defaultValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a boolean. If the property is not set, a runtime
|
||||
* exception is thrown.
|
||||
*/
|
||||
public static boolean getBool(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (boolean) overrideValue;
|
||||
}
|
||||
|
||||
try {
|
||||
return earlybirdConfig.getBool(property);
|
||||
} catch (ConfigurationException e) {
|
||||
LOG.error("Fatal error: could not get config boolean " + property, e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a boolean.
|
||||
*/
|
||||
public static boolean getBool(String property, boolean defaultValue) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (boolean) overrideValue;
|
||||
}
|
||||
|
||||
return earlybirdConfig.getBool(property, defaultValue);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a date.
|
||||
*/
|
||||
public static Date getDate(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (Date) overrideValue;
|
||||
}
|
||||
|
||||
Date date = (Date) earlybirdConfig.getObject(property, null);
|
||||
if (date == null) {
|
||||
throw new RuntimeException("Could not get config date: " + property);
|
||||
}
|
||||
return date;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a list of strings.
|
||||
*/
|
||||
public static List<String> getListOfStrings(String property) {
|
||||
Object overrideValue = overrideValueMap.get(property);
|
||||
if (overrideValue != null) {
|
||||
return (List<String>) overrideValue;
|
||||
}
|
||||
|
||||
List<String> list = (List<String>) earlybirdConfig.getObject(property, null);
|
||||
if (list == null) {
|
||||
throw new RuntimeException("Could not get list of strings: " + property);
|
||||
}
|
||||
return list;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value of the given property as a map.
|
||||
*/
|
||||
@SuppressWarnings("unchecked")
|
||||
public static Map<String, Object> getMap(String property) {
|
||||
Map<String, Object> map = (Map<String, Object>) earlybirdConfig.getObject(property, null);
|
||||
if (map == null) {
|
||||
throw new RuntimeException("Could not find config property: " + property);
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
public static int getMaxSegmentSize() {
|
||||
return EarlybirdConfig.getInt("max_segment_size", 1 << 16);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the log properties file.
|
||||
*/
|
||||
public static String getLogPropertiesFile() {
|
||||
try {
|
||||
String filename = earlybirdConfig.getString("log_properties_filename");
|
||||
return earlybirdConfig.getConfigFilePath(filename);
|
||||
} catch (ConfigurationException e) {
|
||||
// Print here rather than use LOG - log was probably not initialized yet.
|
||||
LOG.error("Fatal error: could not get log properties file", e);
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the log directory.
|
||||
*/
|
||||
public static String getLogDir() {
|
||||
if (logDirOverride != null) {
|
||||
return logDirOverride;
|
||||
} else {
|
||||
return EarlybirdConfig.getString("log_dir");
|
||||
}
|
||||
}
|
||||
|
||||
public static void overrideLogDir(String logDir) {
|
||||
EarlybirdConfig.logDirOverride = logDir;
|
||||
}
|
||||
|
||||
public static int getThriftPort() {
|
||||
return EarlybirdProperty.THRIFT_PORT.get();
|
||||
}
|
||||
|
||||
public static int getWarmUpThriftPort() {
|
||||
return EarlybirdProperty.WARMUP_THRIFT_PORT.get();
|
||||
}
|
||||
|
||||
public static int getSearcherThreads() {
|
||||
return EarlybirdProperty.SEARCHER_THREADS.get();
|
||||
}
|
||||
|
||||
public static int getLateTweetBuffer() {
|
||||
return getInt(LATE_TWEET_BUFFER_KEY);
|
||||
}
|
||||
|
||||
public static int getAdminPort() {
|
||||
return adminPort;
|
||||
}
|
||||
|
||||
public static void setAdminPort(int adminPort) {
|
||||
EarlybirdConfig.adminPort = adminPort;
|
||||
}
|
||||
|
||||
public static boolean isRealtimeOrProtected() {
|
||||
String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get();
|
||||
return earlybirdName.contains("realtime") || earlybirdName.contains("protected");
|
||||
}
|
||||
|
||||
public static boolean consumeUserScrubGeoEvents() {
|
||||
return EarlybirdProperty.CONSUME_GEO_SCRUB_EVENTS.get();
|
||||
}
|
||||
|
||||
@Nullable
|
||||
public static AuroraInstanceKey getAuroraInstanceKey() {
|
||||
return auroraInstanceKey;
|
||||
}
|
||||
|
||||
public static void setAuroraInstanceKey(AuroraInstanceKey auroraInstanceKey) {
|
||||
EarlybirdConfig.auroraInstanceKey = auroraInstanceKey;
|
||||
}
|
||||
|
||||
public static boolean isAurora() {
|
||||
return auroraInstanceKey != null;
|
||||
}
|
||||
|
||||
public static void setForTests(String property, Object value) {
|
||||
earlybirdConfig.setForTests(DEFAULT_CONFIG_FILE, property, value);
|
||||
}
|
||||
|
||||
public static synchronized void clearForTests() {
|
||||
earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, DEFAULT_CONFIG_FILE);
|
||||
}
|
||||
}
|
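Taken together, the getters above resolve a property by checking the command-line override map first and then the YAML config, throwing only in the no-default variants. A sketch of typical startup usage; "hypothetical_flag" is made up, while max_segment_size is the key used by getMaxSegmentSize() above:

// Illustrative only.
EarlybirdConfig.init(null);                               // loads earlybird-search.yml
int maxSegmentSize = EarlybirdConfig.getMaxSegmentSize(); // override -> yml -> default of 1 << 16
boolean someFlag = EarlybirdConfig.getBool("hypothetical_flag", false);  // default avoids the RuntimeException path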
Binary file not shown.
@ -1,390 +0,0 @@
|
||||
package com.twitter.search.earlybird.common.config;
|
||||
|
||||
import java.lang.reflect.Modifier;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.function.BiFunction;
|
||||
import java.util.function.Function;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import com.google.common.collect.ImmutableList;
|
||||
|
||||
import com.twitter.app.Flag;
|
||||
import com.twitter.app.Flaggable;
|
||||
import com.twitter.app.Flags;
|
||||
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;
|
||||
|
||||
/**
|
||||
* Stateless class that represents an Earlybird property that can be specified by a command line
|
||||
* flag.
|
||||
* <p>
|
||||
* This is a regular Java class instead of enum to have a generic type.
|
||||
*
|
||||
* @param <T>
|
||||
*/
|
||||
public final class EarlybirdProperty<T> {
|
||||
|
||||
private static final class PropertyType<T> {
|
||||
|
||||
private static final PropertyType<Boolean> BOOLEAN = new PropertyType<>(
|
||||
Flaggable.ofJavaBoolean(), EarlybirdConfig::getBool, EarlybirdConfig::getBool);
|
||||
|
||||
private static final PropertyType<Integer> INT = new PropertyType<>(
|
||||
Flaggable.ofJavaInteger(), EarlybirdConfig::getInt, EarlybirdConfig::getInt);
|
||||
|
||||
private static final PropertyType<String> STRING = new PropertyType<>(
|
||||
Flaggable.ofString(), EarlybirdConfig::getString, EarlybirdConfig::getString);
|
||||
|
||||
private final Flaggable<T> flaggable;
|
||||
private final Function<String, T> getter;
|
||||
private final BiFunction<String, T, T> getterWithDefault;
|
||||
|
||||
private PropertyType(Flaggable<T> flaggable, Function<String, T> getter,
|
||||
BiFunction<String, T, T> getterWithDefault) {
|
||||
this.flaggable = flaggable;
|
||||
this.getter = getter;
|
||||
this.getterWithDefault = getterWithDefault;
|
||||
}
|
||||
}
|
||||
|
||||
public static final EarlybirdProperty<String> PENGUIN_VERSION =
|
||||
new EarlybirdProperty<>(
|
||||
"penguin_version",
|
||||
"The penguin version to index.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> THRIFT_PORT = new EarlybirdProperty<>(
|
||||
"thrift_port",
|
||||
"override thrift port from config file",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> WARMUP_THRIFT_PORT = new EarlybirdProperty<>(
|
||||
"warmup_thrift_port",
|
||||
"override warmup thrift port from config file",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> SEARCHER_THREADS = new EarlybirdProperty<>(
|
||||
"searcher_threads",
|
||||
"override number of searcher threads from config file",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> EARLYBIRD_TIER = new EarlybirdProperty<>(
|
||||
"earlybird_tier",
|
||||
"the earlybird tier (e.g. tier1), used on Aurora",
|
||||
PropertyType.STRING,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Integer> REPLICA_ID = new EarlybirdProperty<>(
|
||||
"replica_id",
|
||||
"the ID in a partition, used on Aurora",
|
||||
PropertyType.INT,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Integer> PARTITION_ID = new EarlybirdProperty<>(
|
||||
"partition_id",
|
||||
"partition ID, used on Aurora",
|
||||
PropertyType.INT,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Integer> NUM_PARTITIONS = new EarlybirdProperty<>(
|
||||
"num_partitions",
|
||||
"number of partitions, used on Aurora",
|
||||
PropertyType.INT,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Integer> NUM_INSTANCES = new EarlybirdProperty<>(
|
||||
"num_instances",
|
||||
"number of instances in the job, used on Aurora",
|
||||
PropertyType.INT,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Integer> SERVING_TIMESLICES = new EarlybirdProperty<>(
|
||||
"serving_timeslices",
|
||||
"number of time slices to serve, used on Aurora",
|
||||
PropertyType.INT,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<String> ROLE = new EarlybirdProperty<>(
|
||||
"role",
|
||||
"Role in the service path of Earlybird",
|
||||
PropertyType.STRING,
|
||||
true,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<String> EARLYBIRD_NAME = new EarlybirdProperty<>(
|
||||
"earlybird_name",
|
||||
"Name in the service path of Earlybird without hash partition suffix",
|
||||
PropertyType.STRING,
|
||||
true,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<String> ENV = new EarlybirdProperty<>(
|
||||
"env",
|
||||
"Environment in the service path of Earlybird",
|
||||
PropertyType.STRING,
|
||||
true,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<String> ZONE = new EarlybirdProperty<>(
|
||||
"zone",
|
||||
"Zone (data center) in the service path of Earlybird",
|
||||
PropertyType.STRING,
|
||||
true,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<String> DL_URI = new EarlybirdProperty<>(
|
||||
"dl_uri",
|
||||
"DistributedLog URI for default DL reader",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> USER_UPDATES_DL_URI = new EarlybirdProperty<>(
|
||||
"user_updates_dl_uri",
|
||||
"DistributedLog URI for user updates DL reader",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> ANTISOCIAL_USERUPDATES_DL_STREAM =
|
||||
new EarlybirdProperty<>(
|
||||
"antisocial_userupdates_dl_stream",
|
||||
"DL stream name for antisocial user updates without DL version suffix",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> ZK_APP_ROOT = new EarlybirdProperty<>(
|
||||
"zk_app_root",
|
||||
"SZooKeeper base root path for this application",
|
||||
PropertyType.STRING,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> SEGMENT_LOAD_FROM_HDFS_ENABLED =
|
||||
new EarlybirdProperty<>(
|
||||
"segment_load_from_hdfs_enabled",
|
||||
"Whether to load segment data from HDFS",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> SEGMENT_FLUSH_TO_HDFS_ENABLED =
|
||||
new EarlybirdProperty<>(
|
||||
"segment_flush_to_hdfs_enabled",
|
||||
"Whether to flush segment data to HDFS",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> HDFS_SEGMENT_SYNC_DIR = new EarlybirdProperty<>(
|
||||
"hdfs_segment_sync_dir",
|
||||
"HDFS directory to sync segment data",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> HDFS_SEGMENT_UPLOAD_DIR = new EarlybirdProperty<>(
|
||||
"hdfs_segment_upload_dir",
|
||||
"HDFS directory to upload segment data",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED =
|
||||
new EarlybirdProperty<>(
|
||||
"archive_daily_status_batch_flushing_enabled",
|
||||
"Whether to enable archive daily status batch flushing",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> HDFS_INDEX_SYNC_DIR = new EarlybirdProperty<>(
|
||||
"hdfs_index_sync_dir",
|
||||
"HDFS directory to sync index data",
|
||||
PropertyType.STRING,
|
||||
true);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> READ_INDEX_FROM_PROD_LOCATION =
|
||||
new EarlybirdProperty<>(
|
||||
"read_index_from_prod_location",
|
||||
"Read index from prod to speed up startup on staging / loadtest",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> USE_DECIDER_OVERLAY = new EarlybirdProperty<>(
|
||||
"use_decider_overlay",
|
||||
"Whether to use decider overlay",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> DECIDER_OVERLAY_CONFIG = new EarlybirdProperty<>(
|
||||
"decider_overlay_config",
|
||||
"Path to decider overlay config",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> MAX_CONCURRENT_SEGMENT_INDEXERS =
|
||||
new EarlybirdProperty<>(
|
||||
"max_concurrent_segment_indexers",
|
||||
"Maximum number of segments indexed concurrently",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> TF_MODELS_ENABLED =
|
||||
new EarlybirdProperty<>(
|
||||
"tf_models_enabled",
|
||||
"Whether tensorflow models should be loaded",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> TF_MODELS_CONFIG_PATH =
|
||||
new EarlybirdProperty<>(
|
||||
"tf_models_config_path",
|
||||
"The configuration path of the yaml file containing the list of tensorflow models to load.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> TF_INTER_OP_THREADS =
|
||||
new EarlybirdProperty<>(
|
||||
"tf_inter_op_threads",
|
||||
"How many tensorflow inter op threads to use. See TF documentation for more information.",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> TF_INTRA_OP_THREADS =
|
||||
new EarlybirdProperty<>(
|
||||
"tf_intra_op_threads",
|
||||
"How many tensorflow intra op threads to use. See TF documentation for more information.",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> MAX_ALLOWED_REPLICAS_NOT_IN_SERVER_SET =
|
||||
new EarlybirdProperty<>(
|
||||
"max_allowed_replicas_not_in_server_set",
|
||||
"How many replicas are allowed to be missing from the Earlybird server set.",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Boolean> CHECK_NUM_REPLICAS_IN_SERVER_SET =
|
||||
new EarlybirdProperty<>(
|
||||
"check_num_replicas_in_server_set",
|
||||
"Whether CoordinatedEarlybirdActions should check the number of alive replicas",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<Integer> MAX_QUEUE_SIZE =
|
||||
new EarlybirdProperty<>(
|
||||
"max_queue_size",
|
||||
"Maximum size of searcher worker executor queue. If <= 0 queue is unbounded.",
|
||||
PropertyType.INT,
|
||||
false);
|
||||
|
||||
public static final EarlybirdProperty<String> KAFKA_ENV =
|
||||
new EarlybirdProperty<>(
|
||||
"kafka_env",
|
||||
"The environment to use for kafka topics.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<String> KAFKA_PATH =
|
||||
new EarlybirdProperty<>(
|
||||
"kafka_path",
|
||||
"Wily path to the Search kafka cluster.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<String> TWEET_EVENTS_KAFKA_PATH =
|
||||
new EarlybirdProperty<>(
|
||||
"tweet_events_kafka_path",
|
||||
"Wily path to the tweet-events kafka cluster.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<String> USER_UPDATES_KAFKA_TOPIC =
|
||||
new EarlybirdProperty<>(
|
||||
"user_updates_topic",
|
||||
"Name of the Kafka topic that contain user updates.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<String> USER_SCRUB_GEO_KAFKA_TOPIC =
|
||||
new EarlybirdProperty<>(
|
||||
"user_scrub_geo_topic",
|
||||
"Name of the Kafka topic that contain UserScrubGeoEvents.",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<String> EARLYBIRD_SCRUB_GEN =
|
||||
new EarlybirdProperty<>(
|
||||
"earlybird_scrub_gen",
|
||||
"SCRUB_GEN TO DEPLOY",
|
||||
PropertyType.STRING,
|
||||
false);
|
||||
public static final EarlybirdProperty<Boolean> CONSUME_GEO_SCRUB_EVENTS =
|
||||
new EarlybirdProperty<>(
|
||||
"consume_geo_scrub_events",
|
||||
"Whether to consume user scrub geo events or not",
|
||||
PropertyType.BOOLEAN,
|
||||
false);
|
||||
|
||||
private static final List<EarlybirdProperty<?>> ALL_PROPERTIES =
|
||||
Arrays.stream(EarlybirdProperty.class.getDeclaredFields())
|
||||
.filter(field ->
|
||||
(field.getModifiers() & Modifier.STATIC) > 0
|
||||
&& field.getType() == EarlybirdProperty.class)
|
||||
.map(field -> {
|
||||
try {
|
||||
return (EarlybirdProperty<?>) field.get(EarlybirdProperty.class);
|
||||
} catch (Exception e) {
|
||||
throw new RuntimeException(e);
|
||||
}
|
||||
})
|
||||
.collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf));
|
||||
|
||||
public static ServiceIdentifier getServiceIdentifier() {
|
||||
return new ServiceIdentifier(
|
||||
ROLE.get(),
|
||||
EARLYBIRD_NAME.get(),
|
||||
ENV.get(),
|
||||
ZONE.get());
|
||||
}
|
||||
|
||||
private final String name;
|
||||
private final String help;
|
||||
private final PropertyType<T> type;
|
||||
private final boolean requiredOnAurora;
|
||||
private final boolean requiredOnDedicated;
|
||||
|
||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
||||
boolean requiredOnAurora) {
|
||||
this(name, help, type, requiredOnAurora, false);
|
||||
}
|
||||
|
||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
||||
boolean requiredOnAurora, boolean requiredOnDedicated) {
|
||||
this.name = name;
|
||||
this.help = help;
|
||||
this.type = type;
|
||||
this.requiredOnAurora = requiredOnAurora;
|
||||
this.requiredOnDedicated = requiredOnDedicated;
|
||||
}
|
||||
|
||||
public String name() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public boolean isRequiredOnAurora() {
|
||||
return requiredOnAurora;
|
||||
}
|
||||
|
||||
public boolean isRequiredOnDedicated() {
|
||||
return requiredOnDedicated;
|
||||
}
|
||||
|
||||
public Flag<T> createFlag(Flags flags) {
|
||||
return flags.createMandatory(name, help, null, type.flaggable);
|
||||
}
|
||||
|
||||
public T get() {
|
||||
return type.getter.apply(name);
|
||||
}
|
||||
|
||||
public T get(T defaultValue) {
    return type.getterWithDefault.apply(name, defaultValue);
|
||||
}
|
||||
|
||||
public static EarlybirdProperty[] values() {
|
||||
return ALL_PROPERTIES.toArray(new EarlybirdProperty[0]);
|
||||
}
|
||||
}
|
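Each property is declared once as a constant and read back through EarlybirdConfig, so a value pushed into the override map (for example from a parsed flag) wins over the YAML file. A sketch of how a property might be wired and read; the flags instance and the port fallback are assumptions, not part of this class:

// Illustrative only.
Flag<Integer> thriftPortFlag = EarlybirdProperty.THRIFT_PORT.createFlag(flags);
// ... after flags are parsed and their values pushed into EarlybirdConfig's override map ...
int thriftPort = EarlybirdProperty.THRIFT_PORT.get(9090);  // falls back to 9090 if unset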
@ -1,45 +0,0 @@
|
||||
java_library(
|
||||
sources = ["*.java"],
|
||||
platform = "java8",
|
||||
tags = ["bazel-compatible"],
|
||||
dependencies = [
|
||||
"3rdparty/jvm/com/google/guava",
|
||||
"3rdparty/jvm/com/google/inject:guice",
|
||||
"3rdparty/jvm/commons-io",
|
||||
"3rdparty/jvm/geo/google:geoGoogle",
|
||||
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
|
||||
"3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
|
||||
"3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
|
||||
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
|
||||
"3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
|
||||
"3rdparty/jvm/org/apache/lucene:lucene-core",
|
||||
"3rdparty/jvm/org/apache/lucene:lucene-facet",
|
||||
"3rdparty/jvm/org/apache/thrift:libthrift",
|
||||
"3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
|
||||
"3rdparty/jvm/org/slf4j:slf4j-api",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:core",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:date",
|
||||
"3rdparty/src/jvm/com/twitter/scalding:parquet",
|
||||
"decider/src/main/scala",
|
||||
"src/java/com/twitter/common/base",
|
||||
"src/java/com/twitter/common/util:system-mocks",
|
||||
"src/java/com/twitter/common_internal/hadoop",
|
||||
"src/java/com/twitter/search/common/logging",
|
||||
"src/java/com/twitter/search/common/metrics",
|
||||
"src/java/com/twitter/search/common/partitioning/snowflakeparser",
|
||||
"src/java/com/twitter/search/common/schema/earlybird",
|
||||
"src/java/com/twitter/search/common/util/hash",
|
||||
"src/java/com/twitter/search/common/util/io",
|
||||
"src/java/com/twitter/search/common/util/io:dl-reader-writer",
|
||||
"src/java/com/twitter/search/common/util/io:flushable",
|
||||
"src/java/com/twitter/search/common/util/io:record-reader-api",
|
||||
"src/java/com/twitter/search/earlybird/common/config",
|
||||
"src/scala/com/twitter/scalding_internal/error_handling",
|
||||
"src/scala/com/twitter/scalding_internal/multiformat",
|
||||
"src/scala/com/twitter/scalding_internal/source",
|
||||
"src/scala/com/twitter/search/user_table/sources",
|
||||
"src/thrift/com/twitter/search/common:indexing-java",
|
||||
"src/thrift/com/twitter/tweetypie:events-java",
|
||||
"util/util-core:scala",
|
||||
],
|
||||
)
|
Binary file not shown.
Binary file not shown.
@ -1,100 +0,0 @@
|
||||
package com.twitter.search.earlybird.common.userupdates;
|
||||
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
|
||||
import com.twitter.search.common.metrics.SearchCounter;
|
||||
import com.twitter.search.common.metrics.SearchCustomGauge;
|
||||
import com.twitter.search.common.metrics.SearchTimerStats;
|
||||
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
|
||||
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;
|
||||
|
||||
/**
|
||||
* Map of users who have taken action to delete location data from their tweets. User IDs are mapped
* to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId).
|
||||
*
|
||||
* ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast
|
||||
* while writes are done with a lock. This is ideal since many Earlybird Searcher threads could
|
||||
* be reading from the map at once, whereas we will only be adding to the map via kafka.
|
||||
*
|
||||
* This map is checked against to filter out tweets that should not be returned to geo queries.
|
||||
* See: go/realtime-geo-filtering
|
||||
*/
|
||||
public class UserScrubGeoMap {
|
||||
// The number of geo events that contain a user ID already present in the map. This count is used
|
||||
// to verify the number of users in the map against the number of events consumed from kafka.
|
||||
private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT =
|
||||
SearchCounter.export("user_scrub_geo_event_existing_user_count");
|
||||
public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT =
|
||||
SearchTimerStats.export("user_scrub_geo_event_lag",
|
||||
TimeUnit.MILLISECONDS,
|
||||
false,
|
||||
true);
|
||||
private ConcurrentHashMap<Long, Long> map;
|
||||
|
||||
public UserScrubGeoMap() {
|
||||
map = new ConcurrentHashMap<>();
|
||||
SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap);
|
||||
}
|
||||
|
||||
/**
|
||||
* Ensure that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored
|
||||
* in the map for the given user id (if any) before updating the entry for this user.
|
||||
* This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents
|
||||
* appear in the incoming Kafka stream.
|
||||
*
|
||||
* @param userScrubGeoEvent
|
||||
*/
|
||||
public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) {
|
||||
long userId = userScrubGeoEvent.getUser_id();
|
||||
long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id();
|
||||
long oldMaxTweetId = map.getOrDefault(userId, 0L);
|
||||
if (map.containsKey(userId)) {
|
||||
USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment();
|
||||
}
|
||||
map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId));
|
||||
USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId));
|
||||
}
|
||||
|
||||
/**
|
||||
* A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's
|
||||
* author.
|
||||
* If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed.
|
||||
*
|
||||
* @param tweetId
|
||||
* @param fromUserId
|
||||
* @return
|
||||
*/
|
||||
public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) {
|
||||
return tweetId <= map.getOrDefault(fromUserId, 0L);
|
||||
}
|
||||
|
||||
/**
|
||||
* The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the
|
||||
* UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp.
|
||||
* The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests.
|
||||
*
|
||||
* @param maxTweetId
|
||||
* @return
|
||||
*/
|
||||
private long computeEventLag(long maxTweetId) {
|
||||
long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId);
|
||||
return System.currentTimeMillis() - eventCreatedAtTime;
|
||||
}
|
||||
|
||||
public long getNumUsersInMap() {
|
||||
return map.size();
|
||||
}
|
||||
|
||||
public ConcurrentHashMap<Long, Long> getMap() {
|
||||
return map;
|
||||
}
|
||||
|
||||
public boolean isEmpty() {
|
||||
return map.isEmpty();
|
||||
}
|
||||
|
||||
public boolean isSet(long userId) {
|
||||
return map.containsKey(userId);
|
||||
}
|
||||
}
|
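In practice one thread applies events consumed from Kafka while searcher threads consult the map concurrently; both calls below are the methods defined above, and the event/tweet variables are illustrative:

// Illustrative only.
UserScrubGeoMap geoMap = new UserScrubGeoMap();
geoMap.indexUserScrubGeoEvent(userScrubGeoEvent);                   // applied from the Kafka stream
boolean scrubbed = geoMap.isTweetGeoScrubbed(tweetId, fromUserId);  // drop from geo results if true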
Binary file not shown.
@ -1,572 +0,0 @@
|
||||
package com.twitter.search.earlybird.common.userupdates;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.concurrent.atomic.AtomicReference;
|
||||
import java.util.function.Predicate;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import com.twitter.search.common.metrics.SearchLongGauge;
|
||||
import com.twitter.search.common.metrics.SearchRateCounter;
|
||||
import com.twitter.search.common.util.hash.GeneralLongHashFunction;
|
||||
|
||||
/**
|
||||
* Table containing metadata about users, like NSFW or Antisocial status.
|
||||
* Used for result filtering.
|
||||
*/
|
||||
public class UserTable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(UserTable.class);
|
||||
|
||||
@VisibleForTesting // Not final for testing.
|
||||
protected static long userUpdateTableMaxCapacity = 1L << 30;
|
||||
|
||||
private static final int DEFAULT_INITIAL_CAPACITY = 1024;
|
||||
private static final int BYTE_WIDTH = 8;
|
||||
|
||||
private static final String USER_TABLE_CAPACITY = "user_table_capacity";
|
||||
private static final String USER_TABLE_SIZE = "user_table_size";
|
||||
private static final String
|
||||
USER_NUM_USERS_WITH_NO_BITS_SET = "user_table_users_with_no_bits_set";
|
||||
private static final String USER_TABLE_ANTISOCIAL_USERS = "user_table_antisocial_users";
|
||||
private static final String USER_TABLE_OFFENSIVE_USERS = "user_table_offensive_users";
|
||||
private static final String USER_TABLE_NSFW_USERS = "user_table_nsfw_users";
|
||||
private static final String USER_TABLE_IS_PROTECTED_USERS = "user_table_is_protected_users";
|
||||
|
||||
/**
|
||||
* number of users filtered
|
||||
*/
|
||||
private static final SearchRateCounter USER_TABLE_USERS_FILTERED_COUNTER =
|
||||
new SearchRateCounter("user_table_users_filtered");
|
||||
|
||||
private SearchLongGauge userTableCapacity;
|
||||
private SearchLongGauge userTableSize;
|
||||
private SearchLongGauge userTableNumUsersWithNoBitsSet;
|
||||
private SearchLongGauge userTableAntisocialUsers;
|
||||
private SearchLongGauge userTableOffensiveUsers;
|
||||
private SearchLongGauge userTableNsfwUsers;
|
||||
private SearchLongGauge userTableIsProtectedUsers;
|
||||
|
||||
private final Predicate<Long> userIdFilter;
|
||||
private long lastRecordTimestamp;
|
||||
|
||||
private static final class HashTable {
|
||||
private int numUsersInTable;
|
||||
private int numUsersWithNoBitsSet;
|
||||
// size 8 array contains the number of users who have the bit set at the index (0-7) position
|
||||
// e.g. setBitCounts[0] stores the number of users who have the 0 bit set in their bytes
|
||||
private long[] setBitCounts;
|
||||
|
||||
private final long[] hash;
|
||||
private final byte[] bits;
|
||||
|
||||
private final int hashMask;
|
||||
|
||||
HashTable(int size) {
|
||||
this.hash = new long[size];
|
||||
this.bits = new byte[size];
|
||||
this.hashMask = size - 1;
|
||||
this.numUsersInTable = 0;
|
||||
this.setBitCounts = new long[BYTE_WIDTH];
|
||||
}
|
||||
|
||||
protected int hashSize() {
|
||||
return hash.length;
|
||||
}
|
||||
|
||||
// If we want to decrease the number of users in the table, we can delete as many users
|
||||
// as this table returns, by calling filterTableAndCountValidItems.
|
||||
public void setCountOfNumUsersWithNoBitsSet() {
|
||||
int count = 0;
|
||||
for (int i = 0; i < hash.length; i++) {
|
||||
if ((hash[i] > 0) && (bits[i] == 0)) {
|
||||
count++;
|
||||
}
|
||||
}
|
||||
|
||||
numUsersWithNoBitsSet = count;
|
||||
}
|
||||
|
||||
public void setSetBitCounts() {
|
||||
long[] counts = new long[BYTE_WIDTH];
|
||||
for (int i = 0; i < hash.length; i++) {
|
||||
if (hash[i] > 0) {
|
||||
int tempBits = bits[i] & 0xff;
|
||||
int curBitPos = 0;
|
||||
while (tempBits != 0) {
|
||||
if ((tempBits & 1) != 0) {
|
||||
counts[curBitPos]++;
|
||||
}
|
||||
tempBits = tempBits >>> 1;
|
||||
curBitPos++;
|
||||
}
|
||||
}
|
||||
}
|
||||
setBitCounts = counts;
|
||||
}
|
||||
}
|
||||
|
||||
public static final int ANTISOCIAL_BIT = 1;
|
||||
public static final int OFFENSIVE_BIT = 1 << 1;
|
||||
public static final int NSFW_BIT = 1 << 2;
|
||||
public static final int IS_PROTECTED_BIT = 1 << 3;
|
||||
|
||||
public long getLastRecordTimestamp() {
|
||||
return this.lastRecordTimestamp;
|
||||
}
|
||||
|
||||
public void setLastRecordTimestamp(long lastRecordTimestamp) {
|
||||
this.lastRecordTimestamp = lastRecordTimestamp;
|
||||
}
|
||||
|
||||
public void setOffensive(long userID, boolean offensive) {
|
||||
set(userID, OFFENSIVE_BIT, offensive);
|
||||
}
|
||||
|
||||
public void setAntisocial(long userID, boolean antisocial) {
|
||||
set(userID, ANTISOCIAL_BIT, antisocial);
|
||||
}
|
||||
|
||||
public void setNSFW(long userID, boolean nsfw) {
|
||||
set(userID, NSFW_BIT, nsfw);
|
||||
}
|
||||
|
||||
public void setIsProtected(long userID, boolean isProtected) {
|
||||
set(userID, IS_PROTECTED_BIT, isProtected);
|
||||
}
|
||||
|
||||
/**
|
||||
* Adds the given user update to this table.
|
||||
*/
|
||||
public boolean indexUserUpdate(UserUpdatesChecker checker, UserUpdate userUpdate) {
|
||||
if (checker.skipUserUpdate(userUpdate)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
switch (userUpdate.updateType) {
|
||||
case ANTISOCIAL:
|
||||
setAntisocial(userUpdate.twitterUserID, userUpdate.updateValue != 0);
|
||||
break;
|
||||
case NSFW:
|
||||
setNSFW(userUpdate.twitterUserID, userUpdate.updateValue != 0);
|
||||
break;
|
||||
case OFFENSIVE:
|
||||
setOffensive(userUpdate.twitterUserID, userUpdate.updateValue != 0);
|
||||
break;
|
||||
case PROTECTED:
|
||||
setIsProtected(userUpdate.twitterUserID, userUpdate.updateValue != 0);
|
||||
break;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
private final AtomicReference<HashTable> hashTable = new AtomicReference<>();
|
||||
|
||||
private int hashCode(long userID) {
|
||||
return (int) GeneralLongHashFunction.hash(userID);
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator for user IDs that have at least one of the bits set.
|
||||
*/
|
||||
public Iterator<Long> getFlaggedUserIdIterator() {
|
||||
HashTable table = hashTable.get();
|
||||
|
||||
final long[] currUserIdTable = table.hash;
|
||||
final byte[] currBitsTable = table.bits;
|
||||
return new Iterator<Long>() {
|
||||
private int index = findNext(0);
|
||||
|
||||
private int findNext(int index) {
|
||||
int startingIndex = index;
|
||||
while (startingIndex < currUserIdTable.length) {
|
||||
if (currUserIdTable[startingIndex] != 0 && currBitsTable[startingIndex] != 0) {
|
||||
break;
|
||||
}
|
||||
++startingIndex;
|
||||
}
|
||||
return startingIndex;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return index < currUserIdTable.length;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Long next() {
|
||||
Long r = currUserIdTable[index];
|
||||
index = findNext(index + 1);
|
||||
return r;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Constructs a UserTable with a given HashTable instance.
* Use <code>userIdFilter</code> as a Predicate that returns true for the elements
* that need to be kept in the table.
* Use shouldRehash to force a rehashing of the given HashTable.
|
||||
*/
|
||||
private UserTable(HashTable hashTable, Predicate<Long> userIdFilter,
|
||||
boolean shouldRehash) {
|
||||
|
||||
Preconditions.checkNotNull(userIdFilter);
|
||||
|
||||
this.hashTable.set(hashTable);
|
||||
this.userIdFilter = userIdFilter;
|
||||
|
||||
exportUserUpdatesTableStats();
|
||||
|
||||
LOG.info("User table num users: {}. Users with no bits set: {}. "
|
||||
+ "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
|
||||
this.getNumUsersInTable(),
|
||||
this.getNumUsersWithNoBitsSet(),
|
||||
this.getSetBitCount(ANTISOCIAL_BIT),
|
||||
this.getSetBitCount(OFFENSIVE_BIT),
|
||||
this.getSetBitCount(NSFW_BIT),
|
||||
this.getSetBitCount(IS_PROTECTED_BIT));
|
||||
|
||||
if (shouldRehash) {
|
||||
int filteredTableSize = filterTableAndCountValidItems();
|
||||
// Having exactly 100% usage can impact lookup. Maintain the table at under 50% usage.
|
||||
int newTableCapacity = computeDesiredHashTableCapacity(filteredTableSize * 2);
|
||||
|
||||
rehash(newTableCapacity);
|
||||
|
||||
LOG.info("User table num users after rehash: {}. Users with no bits set: {}. "
|
||||
+ "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
|
||||
this.getNumUsersInTable(),
|
||||
this.getNumUsersWithNoBitsSet(),
|
||||
this.getSetBitCount(ANTISOCIAL_BIT),
|
||||
this.getSetBitCount(OFFENSIVE_BIT),
|
||||
this.getSetBitCount(NSFW_BIT),
|
||||
this.getSetBitCount(IS_PROTECTED_BIT));
|
||||
}
|
||||
}
|
||||
|
||||
private UserTable(int initialSize, Predicate<Long> userIdFilter) {
|
||||
this(new HashTable(computeDesiredHashTableCapacity(initialSize)), userIdFilter, false);
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public UserTable(int initialSize) {
|
||||
this(initialSize, userId -> true);
|
||||
}
|
||||
|
||||
public static UserTable
|
||||
newTableWithDefaultCapacityAndPredicate(Predicate<Long> userIdFilter) {
|
||||
|
||||
return new UserTable(DEFAULT_INITIAL_CAPACITY, userIdFilter);
|
||||
}
|
||||
|
||||
public static UserTable newTableNonFilteredWithDefaultCapacity() {
|
||||
return newTableWithDefaultCapacityAndPredicate(userId -> true);
|
||||
}
|
||||
|
||||
private void exportUserUpdatesTableStats() {
|
||||
userTableSize = SearchLongGauge.export(USER_TABLE_SIZE);
|
||||
userTableCapacity = SearchLongGauge.export(USER_TABLE_CAPACITY);
|
||||
userTableNumUsersWithNoBitsSet = SearchLongGauge.export(
|
||||
USER_NUM_USERS_WITH_NO_BITS_SET
|
||||
);
|
||||
userTableAntisocialUsers = SearchLongGauge.export(USER_TABLE_ANTISOCIAL_USERS);
|
||||
userTableOffensiveUsers = SearchLongGauge.export(USER_TABLE_OFFENSIVE_USERS);
|
||||
userTableNsfwUsers = SearchLongGauge.export(USER_TABLE_NSFW_USERS);
|
||||
userTableIsProtectedUsers = SearchLongGauge.export(USER_TABLE_IS_PROTECTED_USERS);
|
||||
|
||||
LOG.info(
|
||||
"Exporting stats for user table. Starting with numUsersInTable={}, usersWithZeroBits={}, "
|
||||
+ "antisocialUsers={}, offensiveUsers={}, nsfwUsers={}, isProtectedUsers={}.",
|
||||
getNumUsersInTable(),
|
||||
getNumUsersWithNoBitsSet(),
|
||||
getSetBitCount(ANTISOCIAL_BIT),
|
||||
getSetBitCount(OFFENSIVE_BIT),
|
||||
getSetBitCount(NSFW_BIT),
|
||||
getSetBitCount(IS_PROTECTED_BIT));
|
||||
updateStats();
|
||||
}
|
||||
|
||||
private void updateStats() {
|
||||
HashTable table = this.hashTable.get();
|
||||
userTableSize.set(table.numUsersInTable);
|
||||
userTableNumUsersWithNoBitsSet.set(table.numUsersWithNoBitsSet);
|
||||
userTableCapacity.set(table.hashSize());
|
||||
userTableAntisocialUsers.set(getSetBitCount(ANTISOCIAL_BIT));
|
||||
userTableOffensiveUsers.set(getSetBitCount(OFFENSIVE_BIT));
|
||||
userTableNsfwUsers.set(getSetBitCount(NSFW_BIT));
|
||||
userTableIsProtectedUsers.set(getSetBitCount(IS_PROTECTED_BIT));
|
||||
}
|
||||
|
||||
/**
|
||||
* Computes the size of the hashtable as the first power of two greater than or equal to initialSize
|
||||
*/
|
||||
private static int computeDesiredHashTableCapacity(int initialSize) {
|
||||
long powerOfTwoSize = 2;
|
||||
while (initialSize > powerOfTwoSize) {
|
||||
powerOfTwoSize *= 2;
|
||||
}
|
||||
if (powerOfTwoSize > Integer.MAX_VALUE) {
|
||||
LOG.error("Error: powerOfTwoSize overflowed Integer.MAX_VALUE! Initial size: " + initialSize);
|
||||
powerOfTwoSize = 1 << 30; // max power of 2
|
||||
}
|
||||
|
||||
return (int) powerOfTwoSize;
|
||||
}
|
||||
|
||||
public int getNumUsersInTable() {
|
||||
return hashTable.get().numUsersInTable;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the number of users who have the bit set at the `userStateBit` position
|
||||
*/
|
||||
public long getSetBitCount(int userStateBit) {
|
||||
int bit = userStateBit;
|
||||
int bitPosition = 0;
|
||||
while (bit != 0 && (bit & 1) == 0) {
|
||||
bit = bit >>> 1;
|
||||
bitPosition++;
|
||||
}
|
||||
return hashTable.get().setBitCounts[bitPosition];
|
||||
}
|
||||
|
||||
public Predicate<Long> getUserIdFilter() {
|
||||
return userIdFilter::test;
|
||||
}
|
||||
|
||||
/**
|
||||
* Updates a user flag in this table.
|
||||
*/
|
||||
public final void set(long userID, int bit, boolean value) {
|
||||
// if userID is filtered return immediately
|
||||
if (!shouldKeepUser(userID)) {
|
||||
USER_TABLE_USERS_FILTERED_COUNTER.increment();
|
||||
return;
|
||||
}
|
||||
|
||||
HashTable table = this.hashTable.get();
|
||||
|
||||
int hashPos = findHashPosition(table, userID);
|
||||
long item = table.hash[hashPos];
|
||||
byte bits = 0;
|
||||
int bitsDiff = 0;
|
||||
|
||||
if (item != 0) {
|
||||
byte bitsOriginally = bits = table.bits[hashPos];
|
||||
if (value) {
|
||||
bits |= bit;
|
||||
} else {
|
||||
// AND'ing with the inverse map clears the desired bit, but
|
||||
// doesn't change any of the other bits
|
||||
bits &= ~bit;
|
||||
}
|
||||
|
||||
// Find the changed bits after the above operation, it is possible that no bit is changed if
|
||||
// the input 'bit' is already set/unset in the table.
|
||||
// Since bitwise operators cannot be directly applied on Byte, Byte is promoted into int to
|
||||
// apply the operators. When that happens, if the most significant bit of the Byte is set,
|
||||
// the promoted int has all significant bits set to 1. 0xff bitmask is applied here to make
|
||||
// sure only the last 8 bits are considered.
|
||||
bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);
|
||||
|
||||
if (bitsOriginally > 0 && bits == 0) {
|
||||
table.numUsersWithNoBitsSet++;
|
||||
} else if (bitsOriginally == 0 && bits > 0) {
|
||||
table.numUsersWithNoBitsSet--;
|
||||
}
|
||||
} else {
|
||||
if (!value) {
|
||||
// no need to add this user, since all bits would be false anyway
|
||||
return;
|
||||
}
|
||||
|
||||
// New user string.
|
||||
if (table.numUsersInTable + 1 >= (table.hashSize() >> 1)
|
||||
&& table.hashSize() != userUpdateTableMaxCapacity) {
|
||||
if (2L * (long) table.hashSize() < userUpdateTableMaxCapacity) {
|
||||
rehash(2 * table.hashSize());
|
||||
table = this.hashTable.get();
|
||||
} else {
|
||||
if (table.hashSize() < (int) userUpdateTableMaxCapacity) {
|
||||
rehash((int) userUpdateTableMaxCapacity);
|
||||
table = this.hashTable.get();
|
||||
LOG.warn("User update table size reached Integer.MAX_VALUE, performance will degrade.");
|
||||
}
|
||||
}
|
||||
|
||||
// Must repeat this operation with the resized hashTable.
|
||||
hashPos = findHashPosition(table, userID);
|
||||
}
|
||||
|
||||
item = userID;
|
||||
bits |= bit;
|
||||
bitsDiff = bit & 0xff;
|
||||
|
||||
table.numUsersInTable++;
|
||||
}
|
||||
|
||||
table.hash[hashPos] = item;
|
||||
table.bits[hashPos] = bits;
|
||||
|
||||
// update setBitCounts for the changed bits after applying the input 'bit'
|
||||
int curBitsDiffPos = 0;
|
||||
while (bitsDiff != 0) {
|
||||
if ((bitsDiff & 1) != 0) {
|
||||
if (value) {
|
||||
table.setBitCounts[curBitsDiffPos]++;
|
||||
} else {
|
||||
table.setBitCounts[curBitsDiffPos]--;
|
||||
}
|
||||
}
|
||||
bitsDiff = bitsDiff >>> 1;
|
||||
curBitsDiffPos++;
|
||||
}
|
||||
|
||||
updateStats();
|
||||
}
|
||||
|
||||
public final boolean isSet(long userID, int bits) {
|
||||
HashTable table = hashTable.get();
|
||||
int hashPos = findHashPosition(table, userID);
|
||||
return table.hash[hashPos] != 0 && (table.bits[hashPos] & bits) != 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns true when userIdFilter condition is being met.
|
||||
* If filter is not present returns true
|
||||
*/
|
||||
private boolean shouldKeepUser(long userID) {
|
||||
return userIdFilter.test(userID);
|
||||
}
|
||||
|
||||
private int findHashPosition(final HashTable table, final long userID) {
|
||||
int code = hashCode(userID);
|
||||
int hashPos = code & table.hashMask;
|
||||
|
||||
// Locate user in hash
|
||||
long item = table.hash[hashPos];
|
||||
|
||||
if (item != 0 && item != userID) {
|
||||
// Conflict: keep searching different locations in
|
||||
// the hash table.
|
||||
final int inc = ((code >> 8) + code) | 1;
|
||||
do {
|
||||
code += inc;
|
||||
hashPos = code & table.hashMask;
|
||||
item = table.hash[hashPos];
|
||||
} while (item != 0 && item != userID);
|
||||
}
|
||||
|
||||
return hashPos;
|
||||
}
|
||||
|
||||
/**
|
||||
* Applies the filtering predicate and returns the size of the filtered table.
|
||||
*/
|
||||
private synchronized int filterTableAndCountValidItems() {
|
||||
final HashTable oldTable = this.hashTable.get();
|
||||
int newSize = 0;
|
||||
|
||||
int clearNoItemSet = 0;
|
||||
int clearNoBitsSet = 0;
|
||||
int clearDontKeepUser = 0;
|
||||
|
||||
for (int i = 0; i < oldTable.hashSize(); i++) {
|
||||
final long item = oldTable.hash[i]; // this is the userID
|
||||
final byte bits = oldTable.bits[i];
|
||||
|
||||
boolean clearSlot = false;
|
||||
if (item == 0) {
|
||||
clearSlot = true;
|
||||
clearNoItemSet++;
|
||||
} else if (bits == 0) {
|
||||
clearSlot = true;
|
||||
clearNoBitsSet++;
|
||||
} else if (!shouldKeepUser(item)) {
|
||||
clearSlot = true;
|
||||
clearDontKeepUser++;
|
||||
}
|
||||
|
||||
if (clearSlot) {
|
||||
oldTable.hash[i] = 0;
|
||||
oldTable.bits[i] = 0;
|
||||
} else {
|
||||
newSize += 1;
|
||||
}
|
||||
}
|
||||
|
||||
oldTable.setCountOfNumUsersWithNoBitsSet();
|
||||
oldTable.setSetBitCounts();
|
||||
|
||||
LOG.info("Done filtering table: clearNoItemSet={}, clearNoBitsSet={}, clearDontKeepUser={}",
|
||||
clearNoItemSet, clearNoBitsSet, clearDontKeepUser);
|
||||
|
||||
return newSize;
|
||||
}
|
||||
|
||||
/**
|
||||
* Called when hash is too small (> 50% occupied)
|
||||
*/
|
||||
private void rehash(final int newSize) {
|
||||
final HashTable oldTable = this.hashTable.get();
|
||||
final HashTable newTable = new HashTable(newSize);
|
||||
|
||||
final int newMask = newTable.hashMask;
|
||||
final long[] newHash = newTable.hash;
|
||||
final byte[] newBits = newTable.bits;
|
||||
|
||||
for (int i = 0; i < oldTable.hashSize(); i++) {
|
||||
final long item = oldTable.hash[i];
|
||||
final byte bits = oldTable.bits[i];
|
||||
if (item != 0 && bits != 0) {
|
||||
int code = hashCode(item);
|
||||
|
||||
int hashPos = code & newMask;
|
||||
assert hashPos >= 0;
|
||||
if (newHash[hashPos] != 0) {
|
||||
final int inc = ((code >> 8) + code) | 1;
|
||||
do {
|
||||
code += inc;
|
||||
hashPos = code & newMask;
|
||||
} while (newHash[hashPos] != 0);
|
||||
}
|
||||
newHash[hashPos] = item;
|
||||
newBits[hashPos] = bits;
|
||||
newTable.numUsersInTable++;
|
||||
}
|
||||
}
|
||||
|
||||
newTable.setCountOfNumUsersWithNoBitsSet();
|
||||
newTable.setSetBitCounts();
|
||||
this.hashTable.set(newTable);
|
||||
|
||||
updateStats();
|
||||
}
|
||||
|
||||
public void setTable(UserTable newTable) {
|
||||
hashTable.set(newTable.hashTable.get());
|
||||
updateStats();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected int getHashTableCapacity() {
|
||||
return hashTable.get().hashSize();
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
protected int getNumUsersWithNoBitsSet() {
|
||||
return hashTable.get().numUsersWithNoBitsSet;
|
||||
}
|
||||
}
|
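The probing scheme in findHashPosition() and rehash() above is plain open addressing with a double-hashing increment. The standalone sketch below illustrates the same probe under the assumption of a power-of-two capacity; Long.hashCode stands in for GeneralLongHashFunction.hash, and the class and method names are illustrative, not code from this repo. Because the increment is forced to be odd and the capacity is a power of two, the probe sequence visits every slot, so the loop always terminates while the table is kept under 50% full.

// Minimal sketch of the double-hashing probe used above (assumptions noted in the lead-in).
final class ProbeSketch {
  static int findSlot(long[] slots, long key, int hashMask) {
    int code = Long.hashCode(key);              // stand-in for GeneralLongHashFunction.hash
    int pos = code & hashMask;                  // cheap modulo, capacity is a power of two
    if (slots[pos] != 0 && slots[pos] != key) {
      final int inc = ((code >> 8) + code) | 1; // odd increment => probe visits every slot
      do {
        code += inc;
        pos = code & hashMask;
      } while (slots[pos] != 0 && slots[pos] != key);
    }
    return pos;                                 // empty slot, or the slot already holding key
  }

  public static void main(String[] args) {
    long[] slots = new long[16];                // capacity must be a power of two
    int mask = slots.length - 1;
    long userId = 123456789L;
    int pos = findSlot(slots, userId, mask);
    slots[pos] = userId;                        // insert
    System.out.println(pos == findSlot(slots, userId, mask)); // lookup finds the same slot: true
  }
}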
Binary file not shown.
@ -1,263 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.hadoop.HdfsUtils;
import com.twitter.scalding.DateRange;
import com.twitter.scalding.Hours;
import com.twitter.scalding.RichDate;
import com.twitter.search.user_table.sources.MostRecentGoodSafetyUserStateSource;
import com.twitter.search.common.indexing.thriftjava.SafetyUserState;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.util.Duration;
import com.twitter.util.Time;

/**
 * Builds a user table from a user safety snapshot on HDFS.
 */
public class UserTableBuilderFromSnapshot {
  private static final Logger LOG = LoggerFactory.getLogger(UserTableBuilderFromSnapshot.class);

  private static final int MAX_DAYS_TO_CHECK = 7;
  public static final String DATA_DIR = "user_states";
  public static final String METADATA_DIR = "last_updated_ms";

  private final String snapshotBaseDir;

  private String snapshotDataPath;
  private String snapshotMetaDataPath;
  private UserTable userTable;

  private long nsfwCount;
  private long antisocialCount;
  private long isProtectedCount;

  public UserTableBuilderFromSnapshot() {
    snapshotBaseDir =
        EarlybirdConfig.getString(EarlybirdConfig.USER_SNAPSHOT_BASE_DIR, null);

    LOG.info("Configured user snapshot directory: " + snapshotBaseDir);
  }

  private static final class UserUpdate {
    public final long userId;
    @Nullable public final Boolean antisocial;
    @Nullable public final Boolean nsfw;
    @Nullable public final Boolean isProtected;

    private UserUpdate(long userId,
                       @Nullable Boolean antisocial,
                       @Nullable Boolean nsfw,
                       @Nullable Boolean isProtected) {
      this.userId = userId;
      this.antisocial = antisocial;
      this.nsfw = nsfw;
      this.isProtected = isProtected;
    }

    public static UserUpdate fromUserState(SafetyUserState safetyUserState) {
      long userId = safetyUserState.getUserID();
      @Nullable Boolean antisocial = null;
      @Nullable Boolean nsfw = null;
      @Nullable Boolean isProtected = null;

      if (safetyUserState.isIsAntisocial()) {
        antisocial = true;
      }
      if (safetyUserState.isIsNsfw()) {
        nsfw = true;
      }
      if (safetyUserState.isSetIsProtected() && safetyUserState.isIsProtected()) {
        isProtected = true;
      }

      return new UserUpdate(userId, antisocial, nsfw, isProtected);
    }
  }

  /**
   * Builds a user table from an HDFS user snapshot.
   * @return The table, or nothing if something went wrong.
   */
  public Optional<UserTable> build(Predicate<Long> userFilter) {
    userTable = UserTable.newTableWithDefaultCapacityAndPredicate(userFilter);
    nsfwCount = 0;
    antisocialCount = 0;
    isProtectedCount = 0;

    if (snapshotBaseDir == null || snapshotBaseDir.isEmpty()) {
      LOG.info("No snapshot directory. Can't build user table.");
      return Optional.empty();
    }

    LOG.info("Starting to build user table.");

    Stream<UserUpdate> stream = null;

    try {
      setSnapshotPath();

      stream = getUserUpdates();
      stream.forEach(this::insertUser);
    } catch (IOException e) {
      LOG.error("IOException while building table: {}", e.getMessage(), e);

      return Optional.empty();
    } finally {
      if (stream != null) {
        stream.close();
      }
    }

    LOG.info("Built user table with {} users, {} nsfw, {} antisocial and {} protected.",
        userTable.getNumUsersInTable(),
        nsfwCount,
        antisocialCount,
        isProtectedCount);

    try {
      userTable.setLastRecordTimestamp(readTimestampOfLastSeenUpdateFromSnapshot());
    } catch (IOException e) {
      LOG.error("IOException reading timestamp of last update: {}", e.getMessage(), e);
      return Optional.empty();
    }

    LOG.info("Setting last record timestamp to {}.", userTable.getLastRecordTimestamp());

    return Optional.of(userTable);
  }

  private void setSnapshotPath() {
    snapshotDataPath =
        new MostRecentGoodSafetyUserStateSource(
            snapshotBaseDir,
            DATA_DIR,
            METADATA_DIR,
            DateRange.apply(
                RichDate.now().$minus(Hours.apply(MAX_DAYS_TO_CHECK * 24)),
                RichDate.now())
        ).partitionHdfsPaths(new HdfsConfiguration())
            ._1()
            .head()
            .replaceAll("\\*$", "");
    snapshotMetaDataPath = snapshotDataPath.replace(DATA_DIR, METADATA_DIR);

    LOG.info("Snapshot data path: {}", snapshotDataPath);
    LOG.info("Snapshot metadata path: {}", snapshotMetaDataPath);
  }

  private Stream<UserUpdate> getUserUpdates() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    List<String> lzoFiles =
        Arrays.stream(fs.listStatus(new Path(snapshotDataPath),
            path -> path.getName().startsWith("part-")))
            .map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())
                .toString())
            .collect(Collectors.toList());

    final LzoThriftBlockFileReader<SafetyUserState> thriftReader =
        new LzoThriftBlockFileReader<>(lzoFiles, SafetyUserState.class, null);

    Iterator<UserUpdate> iter = new Iterator<UserUpdate>() {
      private SafetyUserState next;

      @Override
      public boolean hasNext() {
        if (next != null) {
          return true;
        }

        do {
          try {
            next = thriftReader.readNext();
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        } while (next == null && !thriftReader.isExhausted());
        return next != null;
      }

      @Override
      public UserUpdate next() {
        if (next != null || hasNext()) {
          UserUpdate userUpdate = UserUpdate.fromUserState(next);
          next = null;
          return userUpdate;
        }
        throw new NoSuchElementException();
      }
    };

    return StreamSupport
        .stream(
            Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL),
            false)
        .onClose(thriftReader::stop);
  }

  private long readTimestampOfLastSeenUpdateFromSnapshot() throws IOException {
    String timestampFile = snapshotMetaDataPath + "part-00000";
    BufferedReader buffer = new BufferedReader(new InputStreamReader(
        HdfsUtils.getInputStreamSupplier(timestampFile).openStream()));

    long timestampMillis = Long.parseLong(buffer.readLine());
    LOG.info("read timestamp {} from HDFS:{}", timestampMillis, timestampFile);

    Time time = Time.fromMilliseconds(timestampMillis)
        .minus(Duration.fromTimeUnit(10, TimeUnit.MINUTES));
    return time.inMilliseconds();
  }

  private void insertUser(UserUpdate userUpdate) {
    if (userUpdate == null) {
      return;
    }

    if (userUpdate.antisocial != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.ANTISOCIAL_BIT,
          userUpdate.antisocial);
      antisocialCount++;
    }

    if (userUpdate.nsfw != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.NSFW_BIT,
          userUpdate.nsfw);
      nsfwCount++;
    }

    if (userUpdate.isProtected != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.IS_PROTECTED_BIT,
          userUpdate.isProtected);
      isProtectedCount++;
    }
  }
}
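getUserUpdates() above adapts a pull-style LZO thrift reader into a java.util.stream.Stream. A minimal sketch of that adapter pattern follows, with a plain in-memory iterator standing in for the thrift reader; the class name and printed strings are illustrative only.

// Sketch: Spliterators.spliteratorUnknownSize() turns a lazy Iterator into a Stream, and
// onClose() ties resource cleanup to Stream.close(), mirroring onClose(thriftReader::stop).
import java.util.Arrays;
import java.util.Iterator;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;

final class IteratorToStreamSketch {
  public static void main(String[] args) {
    Iterator<String> iter = Arrays.asList("a", "b", "c").iterator();
    Stream<String> stream = StreamSupport.stream(
        Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL),
        false)
        .onClose(() -> System.out.println("reader stopped"));
    try (Stream<String> s = stream) {
      s.forEach(System.out::println);   // prints a, b, c; closing triggers the onClose hook
    }
  }
}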
Binary file not shown.
@ -1,38 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;

import com.twitter.search.common.indexing.thriftjava.UserUpdateType;

/**
 * Contains an update for a user.
 */
public class UserUpdate {
  public final long twitterUserID;
  public final UserUpdateType updateType;
  public final int updateValue;
  private final Date updatedAt;

  public UserUpdate(long twitterUserID,
                    UserUpdateType updateType,
                    int updateValue,
                    Date updatedAt) {

    this.twitterUserID = twitterUserID;
    this.updateType = updateType;
    this.updateValue = updateValue;
    this.updatedAt = (Date) updatedAt.clone();
  }

  @Override public String toString() {
    return "UserInfoUpdate[userID=" + twitterUserID + ",updateType=" + updateType
        + ",updateValue=" + updateValue + ",updatedAt=" + getUpdatedAt() + "]";
  }

  /**
   * Returns a copy of the updated-at date.
   */
  public Date getUpdatedAt() {
    return (Date) updatedAt.clone();
  }
}
Binary file not shown.
@ -1,70 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;
import java.util.concurrent.TimeUnit;

import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

/**
 * Contains logic for deciding whether to apply a certain user update to the {@link UserTable}.
 */
public class UserUpdatesChecker {
  private final Date antisocialStartDate;
  private final Decider decider;
  private final boolean isFullArchiveCluster;

  public UserUpdatesChecker(Clock clock, Decider decider, EarlybirdCluster cluster) {
    // How many days of antisocial users to keep. A value of -1 means keeping all user updates.
    long antisocialRecordDays =
        EarlybirdConfig.getLong("keep_recent_antisocial_user_updates_days", 30);
    this.antisocialStartDate = antisocialRecordDays > 0
        ? new Date(clock.nowMillis() - TimeUnit.DAYS.toMillis(antisocialRecordDays)) : null;
    this.decider = decider;
    this.isFullArchiveCluster = cluster == EarlybirdCluster.FULL_ARCHIVE;
  }

  /**
   * Decides whether to skip the given UserInfoUpdate.
   */
  public boolean skipUserUpdate(UserUpdate userUpdate) {
    if (userUpdate == null) { // always skip null updates
      return true;
    }

    UserUpdateType type = userUpdate.updateType;

    if (type == UserUpdateType.PROTECTED && skipProtectedUserUpdate()) {
      return true;
    }

    if (type == UserUpdateType.ANTISOCIAL && skipAntisocialUserUpdate(userUpdate)) {
      return true;
    }

    // NSFW users can continue to tweet even after they are marked as NSFW. That means
    // that the snapshot needs to have all NSFW users from the beginning of time. Hence, there is
    // no NSFW user update check here.

    // Passed all checks: do not skip this user update.
    return false;
  }

  // Antisocial/suspended users can't tweet after they are suspended. Thus, if our index stores
  // tweets from the last 10 days and a user was suspended 60 days ago, we don't need that user,
  // since there will be no tweets from them. We can save space by not storing info about those
  // users.

  // (For the archive, at rebuild time we filter out all suspended users' tweets, so for a user
  // that was suspended before a rebuild, there is no need to use space to store that the user is
  // suspended.)
  private boolean skipAntisocialUserUpdate(UserUpdate userUpdate) {
    return antisocialStartDate != null && userUpdate.getUpdatedAt().before(antisocialStartDate);
  }

  // Skip protected user updates for the realtime and protected clusters.
  private boolean skipProtectedUserUpdate() {
    return !isFullArchiveCluster;
  }
}
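A hypothetical wiring sketch follows, showing how an ingestion loop might combine this checker with UserTable.indexUserUpdate(). Only indexUserUpdate() and skipUserUpdate() are real APIs from the files above; the loop, method name, and counter are illustrative assumptions.

// Hypothetical usage sketch, not repo code.
final class UserUpdateIngestSketch {
  static int applyUpdates(UserTable table, UserUpdatesChecker checker,
                          Iterable<UserUpdate> updates) {
    int applied = 0;
    for (UserUpdate update : updates) {
      // indexUserUpdate() consults the checker first and only flips a bit when the update is kept.
      if (table.indexUserUpdate(checker, update)) {
        applied++;
      }
    }
    return applied;
  }
}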
@ -1,21 +0,0 @@
java_library(
    sources = ["**/*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/snowflakeparser",
        "src/java/com/twitter/search/common/util/date",
        "src/java/com/twitter/search/common/util/zookeeper",
        "src/java/com/twitter/search/earlybird/common/config",
    ],
)
BIN
src/java/com/twitter/search/earlybird/config/BUILD.docx
Normal file
Binary file not shown.
BIN
src/java/com/twitter/search/earlybird/config/ServingRange.docx
Normal file
BIN
src/java/com/twitter/search/earlybird/config/ServingRange.docx
Normal file
Binary file not shown.
@ -1,26 +0,0 @@
package com.twitter.search.earlybird.config;

/**
 * An interface for abstracting a tier's serving range.
 */
public interface ServingRange {
  /**
   * Returns the serving range's lowest tweet ID.
   */
  long getServingRangeSinceId();

  /**
   * Returns the serving range's highest tweet ID.
   */
  long getServingRangeMaxId();

  /**
   * Returns the serving range's earliest time, in seconds since epoch.
   */
  long getServingRangeSinceTimeSecondsFromEpoch();

  /**
   * Returns the serving range's latest time, in seconds since epoch.
   */
  long getServingRangeUntilTimeSecondsFromEpoch();
}
BIN
src/java/com/twitter/search/earlybird/config/TierConfig.docx
Normal file
Binary file not shown.
@ -1,175 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;
import java.util.Map;
import java.util.Set;

import javax.annotation.Nullable;

import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.config.ConfigFile;
import com.twitter.search.common.config.ConfigurationException;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.util.date.DateUtil;

/**
 * This class provides APIs to access the tier configurations for a cluster.
 * Each tier has a tier name, a number of partitions, and a tier start and end time.
 */
public final class TierConfig {
  private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TierConfig.class);

  private static final String DEFAULT_CONFIG_DIR = "common/config";
  public static final String DEFAULT_TIER_FILE = "earlybird-tiers.yml";

  public static final Date DEFAULT_TIER_START_DATE = DateUtil.toDate(2006, 3, 21);
  // It's convenient for DEFAULT_TIER_END_DATE to be before ~2100, because then the output of
  // FieldTermCounter.getHourValue(DEFAULT_TIER_END_DATE) can still fit into an integer.
  public static final Date DEFAULT_TIER_END_DATE = DateUtil.toDate(2099, 1, 1);

  public static final String DEFAULT_TIER_NAME = "all";
  public static final boolean DEFAULT_ENABLED = true;
  public static final TierInfo.RequestReadType DEFAULT_READ_TYPE = TierInfo.RequestReadType.LIGHT;

  private static ConfigFile tierConfigFile = null;
  private static ConfigSource tierConfigSource = null;

  public enum ConfigSource {
    LOCAL,
    ZOOKEEPER
  }

  private TierConfig() { }

  private static synchronized void init() {
    if (tierConfigFile == null) {
      tierConfigFile = new ConfigFile(DEFAULT_CONFIG_DIR, DEFAULT_TIER_FILE);
      tierConfigSource = ConfigSource.LOCAL;
      SearchLongGauge.export("tier_config_source_" + tierConfigSource.name()).set(1);
      LOG.info("Tier config file " + DEFAULT_TIER_FILE + " was successfully loaded from the bundle.");
    }
  }

  public static ConfigFile getConfigFile() {
    init();
    return tierConfigFile;
  }

  public static String getConfigFileName() {
    return getConfigFile().getConfigFileName();
  }

  /**
   * Returns all the tier names specified in the config file.
   */
  public static Set<String> getTierNames() {
    return Config.getConfig().getMapCopy(getConfigFileName()).keySet();
  }

  /**
   * Sets the value of the given tier config property to the given value.
   */
  public static void setForTests(String property, Object value) {
    Config.getConfig().setForTests(DEFAULT_TIER_FILE, property, value);
  }

  /**
   * Returns the config info for the specified tier.
   */
  public static TierInfo getTierInfo(String tierName) {
    return getTierInfo(tierName, null /* use current environment */);
  }

  /**
   * Returns the config info for the specified tier and environment.
   */
  public static TierInfo getTierInfo(String tierName, @Nullable String environment) {
    String tierConfigFileType = getConfigFileName();
    Map<String, Object> tierInfo;
    try {
      tierInfo = (Map<String, Object>) Config.getConfig()
          .getFromEnvironment(environment, tierConfigFileType, tierName);
    } catch (ConfigurationException e) {
      throw new RuntimeException(e);
    }
    if (tierInfo == null) {
      LOG.error("Cannot find tier config for "
          + tierName + " in config file: " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long partitions = (Long) tierInfo.get("number_of_partitions");
    if (partitions == null) {
      LOG.error("No number of partitions is specified for tier "
          + tierName + " in tier config file " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long numTimeslices = (Long) tierInfo.get("serving_timeslices");
    if (numTimeslices == null) {
      LOG.info("No max timeslices value is specified for tier "
          + tierName + " in tier config file " + tierConfigFileType
          + ", not setting a cap on the number of serving timeslices");
      // NOTE: we use max int32 here because it will ultimately be cast to an int, but the config
      // map expects Longs for all integral types. Using Long.MAX_VALUE leads to max serving
      // timeslices being set to -1 when it is truncated to an int.
      numTimeslices = (long) Integer.MAX_VALUE;
    }

    Date tierStartDate = (Date) tierInfo.get("data_range_start_date_inclusive");
    if (tierStartDate == null) {
      tierStartDate = DEFAULT_TIER_START_DATE;
    }
    Date tierEndDate = (Date) tierInfo.get("data_range_end_date_exclusive");
    if (tierEndDate == null) {
      tierEndDate = DEFAULT_TIER_END_DATE;
    }

    Boolean tierEnabled = (Boolean) tierInfo.get("tier_enabled");
    if (tierEnabled == null) {
      tierEnabled = DEFAULT_ENABLED;
    }

    TierInfo.RequestReadType readType =
        getRequestReadType((String) tierInfo.get("tier_read_type"), DEFAULT_READ_TYPE);
    TierInfo.RequestReadType readTypeOverride =
        getRequestReadType((String) tierInfo.get("tier_read_type_override"), readType);

    return new TierInfo(
        tierName,
        tierStartDate,
        tierEndDate,
        partitions.intValue(),
        numTimeslices.intValue(),
        tierEnabled,
        (String) tierInfo.get("serving_range_since_id_exclusive"),
        (String) tierInfo.get("serving_range_max_id_inclusive"),
        (Date) tierInfo.get("serving_range_start_date_inclusive_override"),
        (Date) tierInfo.get("serving_range_end_date_exclusive_override"),
        readType,
        readTypeOverride,
        Clock.SYSTEM_CLOCK);
  }

  public static synchronized void clear() {
    tierConfigFile = null;
    tierConfigSource = null;
  }

  protected static synchronized ConfigSource getTierConfigSource() {
    return tierConfigSource;
  }

  private static TierInfo.RequestReadType getRequestReadType(
      String readTypeEnumName, TierInfo.RequestReadType defaultReadType) {
    TierInfo.RequestReadType readType = defaultReadType;
    if (readTypeEnumName != null) {
      readType = TierInfo.RequestReadType.valueOf(readTypeEnumName.trim().toUpperCase());
      Preconditions.checkState(readType != null);
    }
    return readType;
  }
}
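The NOTE in getTierInfo() about defaulting serving_timeslices to Integer.MAX_VALUE rather than Long.MAX_VALUE comes down to int truncation; the tiny sketch below makes the difference concrete (illustrative class, not repo code).

// Sketch: truncating Long.MAX_VALUE to an int yields -1, while Integer.MAX_VALUE survives.
final class IntTruncationSketch {
  public static void main(String[] args) {
    System.out.println((int) Long.MAX_VALUE);            // -1: would look like "no timeslices"
    System.out.println((int) (long) Integer.MAX_VALUE);  // 2147483647: effectively uncapped
  }
}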
BIN
src/java/com/twitter/search/earlybird/config/TierInfo.docx
Normal file
Binary file not shown.
@ -1,180 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;

/**
 * Properties of a single tier.
 */
public class TierInfo implements ServingRange {
  // Historically, this has been used when adding a new tier: first you add it and send dark
  // traffic to it, then possibly grey traffic, and then you launch it by turning on light
  // traffic.
  public static enum RequestReadType {
    // Light read: send request, wait for results, and results are returned.
    LIGHT,
    // Dark read: send request, do not wait for results, and results are discarded.
    DARK,
    // Grey read: send request, wait for results, but discard them after they come back.
    // Same results as a dark read; similar latency as a light read.
    GREY,
  }

  private final String tierName;
  private final Date dataStartDate;
  private final Date dataEndDate;
  private final int numPartitions;
  private final int maxTimeslices;
  private final TierServingBoundaryEndPoint servingRangeSince;
  private final TierServingBoundaryEndPoint servingRangeMax;
  private final TierServingBoundaryEndPoint servingRangeSinceOverride;
  private final TierServingBoundaryEndPoint servingRangeMaxOverride;

  // These properties are only used by clients of Earlybird (e.g. roots),
  // but not by Earlybirds.
  private final boolean enabled;
  private final RequestReadType readType;
  private final RequestReadType readTypeOverride;

  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  Date servingStartDateOverride,
                  Date servingEndDateOverride,
                  RequestReadType readType,
                  RequestReadType readTypeOverride,
                  Clock clock) {
    Preconditions.checkArgument(numPartitions > 0);
    Preconditions.checkArgument(maxTimeslices > 0);
    this.tierName = tierName;
    this.dataStartDate = dataStartDate;
    this.dataEndDate = dataEndDate;
    this.numPartitions = numPartitions;
    this.maxTimeslices = maxTimeslices;
    this.enabled = enabled;
    this.readType = readType;
    this.readTypeOverride = readTypeOverride;
    this.servingRangeSince = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(sinceIdString, dataStartDate, clock);
    this.servingRangeMax = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(maxIdString, dataEndDate, clock);
    if (servingStartDateOverride != null) {
      this.servingRangeSinceOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingStartDateOverride, clock);
    } else {
      this.servingRangeSinceOverride = servingRangeSince;
    }

    if (servingEndDateOverride != null) {
      this.servingRangeMaxOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingEndDateOverride, clock);
    } else {
      this.servingRangeMaxOverride = servingRangeMax;
    }
  }

  @VisibleForTesting
  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  RequestReadType readType,
                  Clock clock) {
    // No overrides:
    //   servingRangeSinceOverride == servingRangeSince
    //   servingRangeMaxOverride == servingRangeMax
    //   readTypeOverride == readType
    this(tierName, dataStartDate, dataEndDate, numPartitions, maxTimeslices, enabled, sinceIdString,
        maxIdString, null, null, readType, readType, clock);
  }

  @Override
  public String toString() {
    return tierName;
  }

  public String getTierName() {
    return tierName;
  }

  public Date getDataStartDate() {
    return dataStartDate;
  }

  public Date getDataEndDate() {
    return dataEndDate;
  }

  public int getNumPartitions() {
    return numPartitions;
  }

  public int getMaxTimeslices() {
    return maxTimeslices;
  }

  public TierConfig.ConfigSource getSource() {
    return TierConfig.getTierConfigSource();
  }

  public boolean isEnabled() {
    return enabled;
  }

  public boolean isDarkRead() {
    return readType == RequestReadType.DARK;
  }

  public RequestReadType getReadType() {
    return readType;
  }

  public RequestReadType getReadTypeOverride() {
    return readTypeOverride;
  }

  public long getServingRangeSinceId() {
    return servingRangeSince.getBoundaryTweetId();
  }

  public long getServingRangeMaxId() {
    return servingRangeMax.getBoundaryTweetId();
  }

  long getServingRangeOverrideSinceId() {
    return servingRangeSinceOverride.getBoundaryTweetId();
  }

  long getServingRangeOverrideMaxId() {
    return servingRangeMaxOverride.getBoundaryTweetId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return servingRangeSince.getBoundaryTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return servingRangeMax.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideSinceTimeSecondsFromEpoch() {
    return servingRangeSinceOverride.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideUntilTimeSecondsFromEpoch() {
    return servingRangeMaxOverride.getBoundaryTimeSecondsFromEpoch();
  }
}
BIN
src/java/com/twitter/search/earlybird/config/TierInfoSource.docx
Normal file
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import javax.inject.Inject;

import com.twitter.search.common.util.zookeeper.ZooKeeperProxy;

public class TierInfoSource {
  private final ZooKeeperProxy zkClient;

  @Inject
  public TierInfoSource(ZooKeeperProxy sZooKeeperClient) {
    this.zkClient = sZooKeeperClient;
  }

  public List<TierInfo> getTierInformation() {
    return getTierInfoWithPrefix("tier");
  }

  public String getConfigFileType() {
    return TierConfig.getConfigFileName();
  }

  private List<TierInfo> getTierInfoWithPrefix(String tierPrefix) {
    Set<String> tierNames = TierConfig.getTierNames();
    List<TierInfo> tierInfos = new ArrayList<>();
    for (String name : tierNames) {
      if (name.startsWith(tierPrefix)) {
        TierInfo tierInfo = TierConfig.getTierInfo(name);
        tierInfos.add(tierInfo);
      }
    }
    return tierInfos;
  }
}
BIN
src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx
Normal file
Binary file not shown.
@ -1,78 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Comparator;
import java.util.SortedSet;

import com.google.common.base.Preconditions;

public final class TierInfoUtil {
  public static final Comparator<TierInfo> TIER_COMPARATOR = (t1, t2) -> {
    // Reverse sort order based on date.
    return t2.getDataStartDate().compareTo(t1.getDataStartDate());
  };

  private TierInfoUtil() {
  }

  /**
   * Checks that the serving ranges and the override serving ranges of the given tiers do not
   * overlap and do not have gaps. Dark-read tiers are ignored.
   */
  public static void checkTierServingRanges(SortedSet<TierInfo> tierInfos) {
    boolean tierServingRangesOverlap = false;
    boolean tierOverrideServingRangesOverlap = false;
    boolean tierServingRangesHaveGaps = false;
    boolean tierOverrideServingRangesHaveGaps = false;

    TierInfoWrapper previousTierInfoWrapper = null;
    TierInfoWrapper previousOverrideTierInfoWrapper = null;
    for (TierInfo tierInfo : tierInfos) {
      TierInfoWrapper tierInfoWrapper = new TierInfoWrapper(tierInfo, false);
      TierInfoWrapper overrideTierInfoWrapper = new TierInfoWrapper(tierInfo, true);

      // Check only the tiers to which we send light reads.
      if (!tierInfoWrapper.isDarkRead()) {
        if (previousTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousTierInfoWrapper, tierInfoWrapper)) {
            // In case of rebalancing, we may have an overlapping data range while
            // overriding with a good serving range.
            if (previousOverrideTierInfoWrapper == null
                || TierInfoWrapper.servingRangesOverlap(
                    previousOverrideTierInfoWrapper, overrideTierInfoWrapper)) {
              tierServingRangesOverlap = true;
            }
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousTierInfoWrapper, tierInfoWrapper)) {
            tierServingRangesHaveGaps = true;
          }
        }

        previousTierInfoWrapper = tierInfoWrapper;
      }

      if (!overrideTierInfoWrapper.isDarkRead()) {
        if (previousOverrideTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesOverlap = true;
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesHaveGaps = true;
          }
        }

        previousOverrideTierInfoWrapper = overrideTierInfoWrapper;
      }
    }

    Preconditions.checkState(!tierServingRangesOverlap,
        "Serving ranges of light-read tiers must not overlap.");
    Preconditions.checkState(!tierServingRangesHaveGaps,
        "Serving ranges of light-read tiers must not have gaps.");
    Preconditions.checkState(!tierOverrideServingRangesOverlap,
        "Override serving ranges of light-read tiers must not overlap.");
    Preconditions.checkState(!tierOverrideServingRangesHaveGaps,
        "Override serving ranges of light-read tiers must not have gaps.");
  }
}
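The overlap and gap checks above delegate to TierInfoWrapper.servingRangesOverlap and servingRangesHaveGap, which compare raw since/max tweet IDs. A small sketch with plain longs shows the boundary behavior for two adjacent tiers that share a boundary ID; the class name and values are illustrative, not repo code.

// Sketch of the same predicates on plain ID ranges.
final class RangeCheckSketch {
  static boolean overlap(long since1, long max1, long since2, long max2) {
    return max1 > since2 && max2 > since1;
  }

  static boolean gap(long since1, long max1, long since2, long max2) {
    return max1 < since2 || max2 < since1;
  }

  public static void main(String[] args) {
    // Adjacent tiers sharing the boundary ID 5999 neither overlap nor leave a gap:
    System.out.println(overlap(5000, 5999, 5999, 6999)); // false
    System.out.println(gap(5000, 5999, 5999, 6999));     // false
  }
}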
Binary file not shown.
@ -1,89 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.base.Preconditions;

/**
 * A simple wrapper around TierInfo that returns the "real" or the "overridden" values from the
 * given {@code TierInfo} instance, based on the given {@code useOverrideTierConfig} flag.
 */
public class TierInfoWrapper implements ServingRange {
  private final TierInfo tierInfo;
  private final boolean useOverrideTierConfig;

  public TierInfoWrapper(TierInfo tierInfo, boolean useOverrideTierConfig) {
    this.tierInfo = Preconditions.checkNotNull(tierInfo);
    this.useOverrideTierConfig = useOverrideTierConfig;
  }

  public String getTierName() {
    return tierInfo.getTierName();
  }

  public Date getDataStartDate() {
    return tierInfo.getDataStartDate();
  }

  public Date getDataEndDate() {
    return tierInfo.getDataEndDate();
  }

  public int getNumPartitions() {
    return tierInfo.getNumPartitions();
  }

  public int getMaxTimeslices() {
    return tierInfo.getMaxTimeslices();
  }

  public TierConfig.ConfigSource getSource() {
    return tierInfo.getSource();
  }

  public boolean isEnabled() {
    return tierInfo.isEnabled();
  }

  public boolean isDarkRead() {
    return getReadType() == TierInfo.RequestReadType.DARK;
  }

  public TierInfo.RequestReadType getReadType() {
    return useOverrideTierConfig ? tierInfo.getReadTypeOverride() : tierInfo.getReadType();
  }

  public long getServingRangeSinceId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceId()
        : tierInfo.getServingRangeSinceId();
  }

  public long getServingRangeMaxId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideMaxId()
        : tierInfo.getServingRangeMaxId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceTimeSecondsFromEpoch()
        : tierInfo.getServingRangeSinceTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideUntilTimeSecondsFromEpoch()
        : tierInfo.getServingRangeUntilTimeSecondsFromEpoch();
  }

  public static boolean servingRangesOverlap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() > tier2.getServingRangeSinceId())
        && (tier2.getServingRangeMaxId() > tier1.getServingRangeSinceId());
  }

  public static boolean servingRangesHaveGap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() < tier2.getServingRangeSinceId())
        || (tier2.getServingRangeMaxId() < tier1.getServingRangeSinceId());
  }
}
Binary file not shown.
@ -1,146 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;

/**
 * The start or end boundary of a tier's serving range.
 * This is used to add since_id and max_id operators onto search queries.
 */
public class TierServingBoundaryEndPoint {
  @VisibleForTesting
  public static final String INFERRED_FROM_DATA_RANGE = "inferred_from_data_range";
  public static final String RELATIVE_TO_CURRENT_TIME_MS = "relative_to_current_time_ms";

  // Either offsetToCurrentTimeMillis is set, or (absoluteTweetId and timeBoundarySecondsFromEpoch)
  // are set.
  @Nullable
  private final Long offsetToCurrentTimeMillis;
  @Nullable
  private final Long absoluteTweetId;
  @Nullable
  private final Long timeBoundarySecondsFromEpoch;
  private final Clock clock;

  TierServingBoundaryEndPoint(Long absoluteTweetId,
                              Long timeBoundarySecondsFromEpoch,
                              Long offsetToCurrentTimeMillis,
                              Clock clock) {
    this.offsetToCurrentTimeMillis = offsetToCurrentTimeMillis;
    this.absoluteTweetId = absoluteTweetId;
    this.timeBoundarySecondsFromEpoch = timeBoundarySecondsFromEpoch;
    this.clock = clock;
  }

  /**
   * Parses the boundary string and constructs a TierServingBoundaryEndPoint instance.
   * @param boundaryString boundary configuration string. Valid values are:
   * <li>
   *   "inferred_from_data_range" infers the serving range from the data range. This only works
   *   after Nov 2010, when Twitter switched to snowflake IDs.
   *   This is the default value.
   * </li>
   * <li>
   *   "absolute_tweet_id_and_timestamp_millis:id:timestamp" a tweet ID/timestamp is given
   *   explicitly as the serving range boundary.
   * </li>
   * <li>
   *   "relative_to_current_time_ms:offset" adds offset onto the current timestamp in millis to
   *   compute the serving range.
   * </li>
   *
   * @param boundaryDate the data boundary. This is used in conjunction with
   *                     inferred_from_data_range to determine the serving boundary.
   * @param clock Clock used to obtain the current time, when relative_to_current_time_ms is used.
   *              Tests pass in a FakeClock.
   */
  public static TierServingBoundaryEndPoint newTierServingBoundaryEndPoint(String boundaryString,
                                                                           Date boundaryDate,
                                                                           Clock clock) {
    if (boundaryString == null || boundaryString.trim().equals(INFERRED_FROM_DATA_RANGE)) {
      return inferBoundaryFromDataRange(boundaryDate, clock);
    } else if (boundaryString.trim().startsWith(RELATIVE_TO_CURRENT_TIME_MS)) {
      return getRelativeBoundary(boundaryString, clock);
    } else {
      throw new IllegalStateException("Cannot parse serving range string: " + boundaryString);
    }
  }

  private static TierServingBoundaryEndPoint inferBoundaryFromDataRange(Date boundaryDate,
                                                                        Clock clock) {
    // Infer the boundary from the data range.
    // Handle the default start and end dates, in case the dates are not specified in the config.
    if (boundaryDate.equals(TierConfig.DEFAULT_TIER_START_DATE)) {
      return new TierServingBoundaryEndPoint(
          -1L, TierConfig.DEFAULT_TIER_START_DATE.getTime() / 1000, null, clock);
    } else if (boundaryDate.equals(TierConfig.DEFAULT_TIER_END_DATE)) {
      return new TierServingBoundaryEndPoint(
          Long.MAX_VALUE, TierConfig.DEFAULT_TIER_END_DATE.getTime() / 1000, null, clock);
    } else {
      // Convert the data start / end dates into since / max IDs.
      long boundaryTimeMillis = boundaryDate.getTime();
      if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(boundaryTimeMillis)) {
        throw new IllegalStateException("Serving time range can not be determined, because "
            + boundaryDate + " is before Twitter switched to snowflake tweet IDs.");
      }
      // Earlybird since_id is inclusive and max_id is exclusive. We subtract 1 here.
      // Consider this example:
      //   full0: 5000 (inclusive) - 6000 (exclusive)
      //   full1: 6000 (inclusive) - 7000 (exclusive)
      // For tier full0, we should use max_id 5999 instead of 6000.
      // For tier full1, we should use since_id 5999 instead of 6000.
      // Hence we subtract 1 here.
      long adjustedTweetId =
          SnowflakeIdParser.generateValidStatusId(boundaryTimeMillis, 0) - 1;
      Preconditions.checkState(adjustedTweetId >= 0, "boundary tweet ID must be non-negative");
      return new TierServingBoundaryEndPoint(
          adjustedTweetId, boundaryTimeMillis / 1000, null, clock);
    }
  }

  private static TierServingBoundaryEndPoint getRelativeBoundary(String boundaryString,
                                                                 Clock clock) {
    // An offset relative to the current time is given.
    String[] parts = boundaryString.split(":");
    Preconditions.checkState(parts.length == 2);
    long offset = Long.parseLong(parts[1]);
    return new TierServingBoundaryEndPoint(null, null, offset, clock);
  }

  /**
   * Returns the tweet ID for this tier boundary. If the tier boundary was created using a tweet
   * ID, that tweet ID is returned. Otherwise, a tweet ID is derived from the time boundary.
   */
  @VisibleForTesting
  public long getBoundaryTweetId() {
    // If absoluteTweetId is available, use it.
    if (absoluteTweetId != null) {
      return absoluteTweetId;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      long boundaryTime = clock.nowMillis() + offsetToCurrentTimeMillis;
      return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0);
    }
  }

  /**
   * Returns the time boundary for this tier boundary, in seconds since epoch.
   */
  public long getBoundaryTimeSecondsFromEpoch() {
    if (timeBoundarySecondsFromEpoch != null) {
      return timeBoundarySecondsFromEpoch;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      return (clock.nowMillis() + offsetToCurrentTimeMillis) / 1000;
    }
  }
}
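inferBoundaryFromDataRange() leans on SnowflakeIdParser to turn a boundary timestamp into a tweet ID. As a hedged illustration of the arithmetic involved, the sketch below assumes the widely documented snowflake layout (milliseconds since the snowflake epoch 1288834974657L in the upper bits, 22 low bits for worker/sequence); it is a stand-in, not the repo's parser.

// Illustrative sketch of snowflake-style ID arithmetic under the assumptions above.
final class SnowflakeSketch {
  private static final long SNOWFLAKE_EPOCH_MS = 1288834974657L; // assumed snowflake epoch

  static long firstIdAtOrAfter(long timestampMs) {
    return (timestampMs - SNOWFLAKE_EPOCH_MS) << 22;   // worker/sequence bits left at 0
  }

  public static void main(String[] args) {
    long boundaryMs = 1356998400000L;                     // 2013-01-01T00:00:00Z
    long boundaryId = firstIdAtOrAfter(boundaryMs) - 1;   // the same "subtract 1" trick as above
    System.out.println(boundaryId);
  }
}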
Some files were not shown because too many files have changed in this diff.