Mirror of https://github.com/twitter/the-algorithm.git (synced 2024-11-16 08:29:21 +01:00)

commit 8948d714f6 (parent 47a8228a09)
[docx] split commit for file 4200
Signed-off-by: Ari Archer <ari.web.xyz@gmail.com>
Binary file not shown.
@@ -1,279 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;

import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;

import org.apache.commons.lang.time.FastDateFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.EarlybirdIndexConfig;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentHdfsFlusher;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentLoader;
import com.twitter.search.earlybird.partition.SegmentOptimizer;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.partition.SimpleSegmentIndexer;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;

/**
 * Given a segment, this class checks if the segment has an index built on HDFS:
 * if not, use SimpleSegmentIndexer to build an index
 * if yes, load the HDFS index, build a new index for the new status data which has dates newer
 * than the HDFS index, then append the loaded HDFS index.
 */
public class ArchiveSegmentUpdater {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentUpdater.class);

  private final SegmentSyncConfig sync;
  private final EarlybirdIndexConfig earlybirdIndexConfig;
  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
  private final SearchIndexingMetricSet searchIndexingMetricSet =
      new SearchIndexingMetricSet(statsReceiver);
  private final EarlybirdSearcherStats searcherStats =
      new EarlybirdSearcherStats(statsReceiver);
  private final SearchRateCounter indexNewSegment =
      new SearchRateCounter("index_new_segment");
  private final SearchRateCounter updateExistingSegment =
      new SearchRateCounter("update_existing_segment");
  private final SearchRateCounter skipExistingSegment =
      new SearchRateCounter("skip_existing_segment");
  private Clock clock;

  public ArchiveSegmentUpdater(ZooKeeperTryLockFactory zooKeeperTryLockFactory,
                               SegmentSyncConfig sync,
                               EarlybirdIndexConfig earlybirdIndexConfig,
                               Clock clock) {
    this.sync = sync;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.clock = clock;
  }

  private boolean canUpdateSegment(SegmentInfo segmentInfo) {
    if (!(segmentInfo.getSegment() instanceof ArchiveSegment)) {
      LOG.info("only ArchiveSegment is available for updating now: "
          + segmentInfo);
      return false;
    }

    if (!segmentInfo.isEnabled()) {
      LOG.debug("Segment is disabled: " + segmentInfo);
      return false;
    }

    if (segmentInfo.isComplete() || segmentInfo.isIndexing()
        || segmentInfo.getSyncInfo().isLoaded()) {
      LOG.debug("Cannot update already indexed segment: " + segmentInfo);
      return false;
    }

    return true;
  }

  /**
   * Given a segment, checks if the segment has an index built on HDFS:
   * if not, use SimpleSegmentIndexer to build an index
   * if yes, load the HDFS index, build a new index for the new status data which has dates newer
   * than the HDFS index, then append the loaded HDFS index.
   *
   * Returns whether the segment was successfully updated.
   */
  public boolean updateSegment(SegmentInfo segmentInfo) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);
    if (!canUpdateSegment(segmentInfo)) {
      return false;
    }

    if (segmentInfo.isIndexing()) {
      LOG.error("Segment is already being indexed: " + segmentInfo);
      return false;
    }

    final Date hdfsEndDate = ArchiveHDFSUtils.getSegmentEndDateOnHdfs(sync, segmentInfo);
    if (hdfsEndDate == null) {
      indexNewSegment.increment();
      if (!indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE)) {
        return false;
      }
    } else {
      final Date curEndDate = ((ArchiveSegment) segmentInfo.getSegment()).getDataEndDate();
      if (!hdfsEndDate.before(curEndDate)) {
        skipExistingSegment.increment();
        LOG.info("Segment is up-to-date: " + segmentInfo.getSegment().getTimeSliceID()
            + " Found flushed segment on HDFS with end date: "
            + FastDateFormat.getInstance("yyyyMMdd").format(hdfsEndDate));
        segmentInfo.setComplete(true);
        segmentInfo.getSyncInfo().setFlushed(true);
        return true;
      }

      updateExistingSegment.increment();
      LOG.info("Updating segment: " + segmentInfo.getSegment().getTimeSliceID()
          + "; new endDate will be " + FastDateFormat.getInstance("yyyyMMdd").format(curEndDate));

      if (!updateSegment(segmentInfo, hdfsEndDate)) {
        return false;
      }
    }

    boolean success = SegmentOptimizer.optimize(segmentInfo);
    if (!success) {
      // Clean up the segment dir on local disk
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Error optimizing segment: " + segmentInfo);
      return false;
    }

    // Verify segment before uploading.
    success = ArchiveSegmentVerifier.verifySegment(segmentInfo);
    if (!success) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Segment not uploaded to HDFS because it did not pass verification: " + segmentInfo);
      return false;
    }

    // upload the index to HDFS
    success = new SegmentHdfsFlusher(zkTryLockFactory, sync, false)
        .flushSegmentToDiskAndHDFS(segmentInfo);
    if (success) {
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, false, true);
    } else {
      // Clean up the segment dir on hdfs
      ArchiveHDFSUtils.deleteHdfsSegmentDir(sync, segmentInfo, true, false);
      LOG.info("Error uploading segment to HDFS: " + segmentInfo);
    }
    segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();

    return success;
  }

  /**
   * Build index for the given segmentInfo. Only those statuses passing the dateFilter are indexed.
   */
  private boolean indexSegment(final SegmentInfo segmentInfo, Predicate<Date> dateFilter) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    RecordReader<TweetDocument> documentReader = null;
    try {
      ArchiveSegment archiveSegment = (ArchiveSegment) segmentInfo.getSegment();
      DocumentFactory<ThriftIndexingEvent> documentFactory =
          earlybirdIndexConfig.createDocumentFactory();
      documentReader = archiveSegment.getStatusRecordReader(documentFactory, dateFilter);

      // Read and index the statuses
      boolean success = new SimpleSegmentIndexer(documentReader, searchIndexingMetricSet)
          .indexSegment(segmentInfo);
      if (!success) {
        // Clean up segment dir on local disk
        segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
        LOG.info("Error indexing segment: " + segmentInfo);
      }

      return success;
    } catch (IOException e) {
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Exception while indexing segment: " + segmentInfo, e);
      return false;
    } finally {
      if (documentReader != null) {
        documentReader.stop();
      }
    }
  }

  /**
   * Load the index built on HDFS for the given segmentInfo, index the new data and append the
   * HDFS index to the new indexed segment
   */
  private boolean updateSegment(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    SegmentInfo hdfsSegmentInfo = loadSegmentFromHdfs(segmentInfo, hdfsEndDate);
    if (hdfsSegmentInfo == null) {
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    boolean success = indexSegment(segmentInfo, input -> {
      // we're updating the segment - only index days after the old end date,
      // and we're sure that the previous days have already been indexed.
      return input.after(hdfsEndDate);
    });
    if (!success) {
      LOG.error("Error indexing new data: " + segmentInfo);
      return indexSegment(segmentInfo, ArchiveSegment.MATCH_ALL_DATE_PREDICATE);
    }

    // Now, append the index loaded from hdfs
    try {
      segmentInfo.getIndexSegment().append(hdfsSegmentInfo.getIndexSegment());
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      LOG.info("Deleted local segment directories with end date " + hdfsEndDate + " : "
          + segmentInfo);
    } catch (IOException e) {
      LOG.warn("Caught IOException while appending segment " + hdfsSegmentInfo.getSegmentName(), e);
      hdfsSegmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      segmentInfo.deleteLocalIndexedSegmentDirectoryImmediately();
      return false;
    }

    segmentInfo.setComplete(true);
    return true;
  }

  /**
   * Load the index built on HDFS for the given segmentInfo and end date
   */
  private SegmentInfo loadSegmentFromHdfs(final SegmentInfo segmentInfo, final Date hdfsEndDate) {
    Preconditions.checkArgument(segmentInfo.getSegment() instanceof ArchiveSegment);

    ArchiveSegment segment = new ArchiveSegment(
        segmentInfo.getTimeSliceID(),
        EarlybirdConfig.getMaxSegmentSize(),
        segmentInfo.getNumPartitions(),
        segmentInfo.getSegment().getHashPartitionID(),
        hdfsEndDate);
    EarlybirdSegmentFactory factory = new EarlybirdSegmentFactory(
        earlybirdIndexConfig,
        searchIndexingMetricSet,
        searcherStats,
        clock);

    SegmentInfo hdfsSegmentInfo;

    try {
      hdfsSegmentInfo = new SegmentInfo(segment, factory, sync);
      CriticalExceptionHandler criticalExceptionHandler =
          new CriticalExceptionHandler();

      boolean success = new SegmentLoader(sync, criticalExceptionHandler)
          .load(hdfsSegmentInfo);
      if (!success) {
        // If not successful, segmentLoader has already cleaned up the local dir.
        LOG.info("Error loading hdfs segment " + hdfsSegmentInfo
            + ", building segment from scratch.");
        hdfsSegmentInfo = null;
      }
    } catch (IOException e) {
      LOG.error("Exception while loading segment from hdfs: " + segmentInfo, e);
      hdfsSegmentInfo = null;
    }

    return hdfsSegmentInfo;
  }
}
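The class above hinges on a single date comparison: no end date on HDFS means a full build, an HDFS end date at or past the segment's current data end date means the segment can be skipped, and anything else means indexing only the newer days and then appending the flushed index. The following is a minimal, self-contained sketch of that decision, written for illustration only; the class, enum, and constants here are not part of the deleted file.

import java.util.Date;
import java.util.function.Predicate;

public final class SegmentUpdateDecisionSketch {
  enum Action { INDEX_ALL, SKIP_UP_TO_DATE, INDEX_NEW_DAYS_AND_APPEND }

  static Action decide(Date hdfsEndDate, Date currentEndDate) {
    if (hdfsEndDate == null) {
      return Action.INDEX_ALL;                        // nothing flushed yet: build from scratch
    }
    if (!hdfsEndDate.before(currentEndDate)) {
      return Action.SKIP_UP_TO_DATE;                  // flushed index already covers the data
    }
    return Action.INDEX_NEW_DAYS_AND_APPEND;          // index the delta, then append the HDFS index
  }

  // Date filter for the "index the delta" case, mirroring the lambda in updateSegment().
  static Predicate<Date> newDaysOnly(Date hdfsEndDate) {
    return day -> day.after(hdfsEndDate);
  }

  public static void main(String[] args) {
    Date hdfsEnd = new Date(1_000_000_000_000L);      // hypothetical flushed end date
    Date currentEnd = new Date(1_000_086_400_000L);   // one day later
    System.out.println(decide(hdfsEnd, currentEnd));  // INDEX_NEW_DAYS_AND_APPEND
    System.out.println(newDaysOnly(hdfsEnd).test(currentEnd)); // true
  }
}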
Binary file not shown.
@@ -1,75 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.partition.SegmentInfo;

public final class ArchiveSegmentVerifier {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveSegmentVerifier.class);

  private ArchiveSegmentVerifier() {
  }

  @VisibleForTesting
  static boolean shouldVerifySegment(SegmentInfo segmentInfo) {
    if (segmentInfo.isIndexing()) {
      LOG.warn("ArchiveSegmentVerifier got segment still indexing.");
      return false;
    }

    if (!segmentInfo.isComplete()) {
      LOG.warn("ArchiveSegmentVerifier got incomplete segment.");
      return false;
    }

    if (!segmentInfo.isOptimized()) {
      LOG.warn("ArchiveSegmentVerifier got unoptimized segment.");
      return false;
    }

    return true;
  }

  /**
   * Verifies an archive segment has a sane number of leaves.
   */
  public static boolean verifySegment(SegmentInfo segmentInfo) {
    if (!shouldVerifySegment(segmentInfo)) {
      return false;
    }
    Directory directory = segmentInfo.getIndexSegment().getLuceneDirectory();
    return verifyLuceneIndex(directory);
  }

  private static boolean verifyLuceneIndex(Directory directory) {
    try {
      DirectoryReader indexerReader = DirectoryReader.open(directory);
      List<LeafReaderContext> leaves = indexerReader.getContext().leaves();
      if (leaves.size() != 1) {
        LOG.warn("Lucene index does not have exactly one segment: " + leaves.size() + " != 1. "
            + "Lucene segments should have been merged during optimization.");
        return false;
      }

      LeafReader reader = leaves.get(0).reader();
      if (reader.numDocs() <= 0) {
        LOG.warn("Lucene index has no documents: " + reader);
        return false;
      }
      return true;
    } catch (IOException e) {
      LOG.warn("Found bad lucene index at: " + directory);
      return false;
    }
  }
}
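The verification boils down to two Lucene checks: the optimized index must consist of exactly one leaf (segment), and that leaf must contain at least one document. Below is a standalone sketch of the same checks against an arbitrary on-disk Lucene index, assuming only lucene-core on the classpath; the class name and the path argument are illustrative, not from the deleted file.

import java.io.IOException;
import java.nio.file.Paths;
import java.util.List;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public final class LuceneIndexCheck {
  // Returns true only if the index has exactly one segment (leaf) and at least one document.
  static boolean verify(Directory directory) throws IOException {
    try (DirectoryReader reader = DirectoryReader.open(directory)) {
      List<LeafReaderContext> leaves = reader.leaves();
      if (leaves.size() != 1) {
        return false; // an optimized archive segment should be fully merged
      }
      return leaves.get(0).reader().numDocs() > 0;
    }
  }

  public static void main(String[] args) throws IOException {
    try (Directory dir = FSDirectory.open(Paths.get(args[0]))) {
      System.out.println("index ok: " + verify(dir));
    }
  }
}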
Binary file not shown.
@@ -1,322 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.Comparator;
import java.util.Date;
import java.util.List;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Predicate;
import com.google.common.collect.Lists;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.config.TierConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.ThriftIndexingEventDocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;

/**
 * Responsible for taking a number of daily status batches and partitioning them into time slices
 * which will be used to build segments.
 *
 * We try to put at most N number of tweets into a time slice.
 */
public class ArchiveTimeSlicer {
  private static final Logger LOG = LoggerFactory.getLogger(ArchiveTimeSlicer.class);

  private static final Comparator<TweetDocument> ASCENDING =
      (o1, o2) -> Long.compare(o1.getTweetID(), o2.getTweetID());

  private static final Comparator<TweetDocument> DESCENDING =
      (o1, o2) -> Long.compare(o2.getTweetID(), o1.getTweetID());

  // Represents a number of daily batches which will go into a segment.
  public static final class ArchiveTimeSlice {
    private Date startDate;
    private Date endDate;
    private int statusCount;
    private final DailyStatusBatches directory;
    private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

    // This list is always ordered from the oldest day to the newest day.
    // For the on-disk archive, we reverse the days in getTweetReaders().
    private final List<DailyStatusBatch> batches = Lists.newArrayList();

    private ArchiveTimeSlice(DailyStatusBatches directory,
                             ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
      this.directory = directory;
      this.earlybirdIndexConfig = earlybirdIndexConfig;
    }

    public Date getEndDate() {
      return endDate;
    }

    public int getStatusCount() {
      return statusCount;
    }

    public int getNumHashPartitions() {
      return batches.isEmpty() ? 0 : batches.get(0).getNumHashPartitions();
    }

    /**
     * Returns a reader for reading tweets from this timeslice.
     *
     * @param archiveSegment The segment to which the timeslice belongs.
     * @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
     * @param filter A filter that determines which dates should be read.
     */
    public RecordReader<TweetDocument> getStatusReader(
        ArchiveSegment archiveSegment,
        DocumentFactory<ThriftIndexingEvent> documentFactory,
        Predicate<Date> filter) throws IOException {
      // We no longer support ThriftStatus based document factories.
      Preconditions.checkState(documentFactory instanceof ThriftIndexingEventDocumentFactory);

      final int hashPartitionID = archiveSegment.getHashPartitionID();
      List<RecordReader<TweetDocument>> readers = new ArrayList<>(batches.size());
      List<DailyStatusBatch> orderedForReading = orderBatchesForReading(batches);
      LOG.info("Creating new status reader for hashPartition: "
          + hashPartitionID + " timeslice: " + getDescription());

      for (DailyStatusBatch batch : orderedForReading) {
        if (filter.apply(batch.getDate())) {
          LOG.info("Adding reader for " + batch.getDate() + " " + getDescription());
          PartitionedBatch partitionedBatch = batch.getPartition(hashPartitionID);
          // Don't even try to create a reader if the partition is empty.
          // There does not seem to be any problem in production now, but HDFS FileSystem's javadoc
          // does indicate that listStatus() is allowed to throw a FileNotFoundException if the
          // partition does not exist. This check makes the code more robust against future
          // HDFS FileSystem implementation changes.
          if (partitionedBatch.getStatusCount() > 0) {
            RecordReader<TweetDocument> tweetReaders = partitionedBatch.getTweetReaders(
                archiveSegment,
                directory.getStatusPathToUseForDay(batch.getDate()),
                documentFactory);
            readers.add(tweetReaders);
          }
        } else {
          LOG.info("Filtered reader for " + batch.getDate() + " " + getDescription());
        }
      }

      LOG.info("Creating reader for timeslice: " + getDescription()
          + " with " + readers.size() + " readers");

      return new MergingSortedRecordReader<TweetDocument>(getMergingComparator(), readers);
    }

    private List<DailyStatusBatch> orderBatchesForReading(List<DailyStatusBatch> orderedBatches) {
      // For the index formats using stock lucene, we want the most recent days to be indexed first.
      // In the twitter in-memory optimized indexes, older tweets will be added first, and
      // optimization will reverse the documents to make most recent tweets be first.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? orderedBatches : Lists.reverse(orderedBatches);
    }

    private Comparator<TweetDocument> getMergingComparator() {
      // We always want to retrieve larger tweet ids first.
      // LIFO means that the smaller ids get inserted first --> ASCENDING order.
      // FIFO would mean that we want to first insert the larger ids --> DESCENDING order.
      return this.earlybirdIndexConfig.isUsingLIFODocumentOrdering()
          ? ASCENDING : DESCENDING;
    }

    /**
     * Returns the smallest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMinStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return 0;
      }

      for (int i = 0; i < batches.size(); i++) {
        long minStatusID = batches.get(i).getPartition(hashPartitionID).getMinStatusID();
        if (minStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return minStatusID;
        }
      }

      return 0;
    }

    /**
     * Returns the highest indexed tweet ID in this timeslice for the given partition.
     *
     * @param hashPartitionID The partition.
     */
    public long getMaxStatusID(int hashPartitionID) {
      if (batches.isEmpty()) {
        return Long.MAX_VALUE;
      }

      for (int i = batches.size() - 1; i >= 0; i--) {
        long maxStatusID = batches.get(i).getPartition(hashPartitionID).getMaxStatusID();
        if (maxStatusID != DailyStatusBatch.EMPTY_BATCH_STATUS_ID) {
          return maxStatusID;
        }
      }

      return Long.MAX_VALUE;
    }

    /**
     * Returns a string with some information for this timeslice.
     */
    public String getDescription() {
      StringBuilder builder = new StringBuilder();
      builder.append("TimeSlice[start date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(startDate));
      builder.append(", end date=");
      builder.append(DailyStatusBatches.DATE_FORMAT.format(endDate));
      builder.append(", status count=");
      builder.append(statusCount);
      builder.append(", days count=");
      builder.append(batches.size());
      builder.append("]");
      return builder.toString();
    }
  }

  private final int maxSegmentSize;
  private final DailyStatusBatches dailyStatusBatches;
  private final Date tierStartDate;
  private final Date tierEndDate;
  private final ArchiveEarlybirdIndexConfig earlybirdIndexConfig;

  private List<ArchiveTimeSlice> lastCachedTimeslices = null;

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this(maxSegmentSize, dailyStatusBatches, TierConfig.DEFAULT_TIER_START_DATE,
        TierConfig.DEFAULT_TIER_END_DATE, earlybirdIndexConfig);
  }

  public ArchiveTimeSlicer(int maxSegmentSize,
                           DailyStatusBatches dailyStatusBatches,
                           Date tierStartDate,
                           Date tierEndDate,
                           ArchiveEarlybirdIndexConfig earlybirdIndexConfig) {
    this.maxSegmentSize = maxSegmentSize;
    this.dailyStatusBatches = dailyStatusBatches;
    this.tierStartDate = tierStartDate;
    this.tierEndDate = tierEndDate;
    this.earlybirdIndexConfig = earlybirdIndexConfig;
  }

  private boolean cacheIsValid() throws IOException {
    return lastCachedTimeslices != null
        && !lastCachedTimeslices.isEmpty()
        && cacheIsValid(lastCachedTimeslices.get(lastCachedTimeslices.size() - 1).endDate);
  }

  private boolean cacheIsValid(Date lastDate) throws IOException {
    if (lastCachedTimeslices == null || lastCachedTimeslices.isEmpty()) {
      return false;
    }

    // Check if we have a daily batch newer than the last batch used for the newest timeslice.
    Calendar cal = Calendar.getInstance();
    cal.setTime(lastDate);
    cal.add(Calendar.DATE, 1);
    Date nextDate = cal.getTime();

    boolean foundBatch = dailyStatusBatches.hasValidBatchForDay(nextDate);

    LOG.info("Checking cache: Looked for valid batch for day {}. Found: {}",
        DailyStatusBatches.DATE_FORMAT.format(nextDate), foundBatch);

    return !foundBatch;
  }

  private boolean timesliceIsFull(ArchiveTimeSlice timeSlice, DailyStatusBatch batch) {
    return timeSlice.statusCount + batch.getMaxPerPartitionStatusCount() > maxSegmentSize;
  }

  private void doTimeSlicing() throws IOException {
    dailyStatusBatches.refresh();

    lastCachedTimeslices = Lists.newArrayList();
    ArchiveTimeSlice currentTimeSlice = null;

    // Iterate over each day and add it to the current timeslice, until it gets full.
    for (DailyStatusBatch batch : dailyStatusBatches.getStatusBatches()) {
      if (!batch.isValid()) {
        LOG.warn("Skipping hole: " + batch.getDate());
        continue;
      }

      if (currentTimeSlice == null || timesliceIsFull(currentTimeSlice, batch)) {
        if (currentTimeSlice != null) {
          LOG.info("Filled timeslice: " + currentTimeSlice.getDescription());
        }
        currentTimeSlice = new ArchiveTimeSlice(dailyStatusBatches, earlybirdIndexConfig);
        currentTimeSlice.startDate = batch.getDate();
        lastCachedTimeslices.add(currentTimeSlice);
      }

      currentTimeSlice.endDate = batch.getDate();
      currentTimeSlice.statusCount += batch.getMaxPerPartitionStatusCount();
      currentTimeSlice.batches.add(batch);
    }
    LOG.info("Last timeslice: {}", currentTimeSlice.getDescription());

    LOG.info("Done with time slicing. Number of timeslices: {}",
        lastCachedTimeslices.size());
  }

  /**
   * Returns all timeslices for this earlybird.
   */
  public List<ArchiveTimeSlice> getTimeSlices() throws IOException {
    if (cacheIsValid()) {
      return lastCachedTimeslices;
    }

    LOG.info("Cache is outdated. Loading new daily batches now...");

    doTimeSlicing();

    return lastCachedTimeslices != null ? Collections.unmodifiableList(lastCachedTimeslices) : null;
  }

  /**
   * Returns the timeslices that overlap the tier start/end date range, if one is specified.
   */
  public List<ArchiveTimeSlice> getTimeSlicesInTierRange() throws IOException {
    List<ArchiveTimeSlice> timeSlices = getTimeSlices();
    if (tierStartDate == TierConfig.DEFAULT_TIER_START_DATE
        && tierEndDate == TierConfig.DEFAULT_TIER_END_DATE) {
      return timeSlices;
    }

    List<ArchiveTimeSlice> filteredTimeSlice = Lists.newArrayList();
    for (ArchiveTimeSlice timeSlice : timeSlices) {
      if (timeSlice.startDate.before(tierEndDate) && !timeSlice.endDate.before(tierStartDate)) {
        filteredTimeSlice.add(timeSlice);
      }
    }

    return filteredTimeSlice;
  }

  @VisibleForTesting
  protected DailyStatusBatches getDailyStatusBatches() {
    return dailyStatusBatches;
  }
}
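doTimeSlicing() above is a greedy packing loop: walk the daily batches in date order, skip invalid days ("holes"), and start a new timeslice whenever the next day's per-partition status count would push the current slice over maxSegmentSize. A self-contained sketch of that loop follows; Day and Slice are illustrative stand-ins, not Earlybird classes.

import java.util.ArrayList;
import java.util.List;

public final class TimeSlicingSketch {
  record Day(String date, boolean valid, int maxPerPartitionStatusCount) { }

  static final class Slice {
    final List<Day> days = new ArrayList<>();
    int statusCount;
  }

  static List<Slice> slice(List<Day> days, int maxSegmentSize) {
    List<Slice> slices = new ArrayList<>();
    Slice current = null;
    for (Day day : days) {
      if (!day.valid()) {
        continue;                                    // "Skipping hole" in the original
      }
      if (current == null
          || current.statusCount + day.maxPerPartitionStatusCount() > maxSegmentSize) {
        current = new Slice();                       // previous slice is full: start a new one
        slices.add(current);
      }
      current.days.add(day);
      current.statusCount += day.maxPerPartitionStatusCount();
    }
    return slices;
  }

  public static void main(String[] args) {
    List<Day> days = List.of(
        new Day("20210101", true, 600),
        new Day("20210102", false, 0),               // hole, skipped
        new Day("20210103", true, 300),
        new Day("20210104", true, 700));
    System.out.println(slice(days, 1000).size());    // 2 slices: {0101, 0103} and {0104}
  }
}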
Binary file not shown.
@@ -1,166 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.IOException;
import java.util.Date;
import java.util.Map;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;
import com.google.gson.Gson;
import com.google.gson.JsonParseException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Represents a day's worth of statuses (tweets) for multiple hash partitions.
 *
 * Note that what this class contains is not the data, but metadata.
 *
 * A day of tweets will come from:
 *   - A scrubgen, if it has happened before the scrubgen date.
 *   - Our daily jobs pipeline, if it has happened after that.
 *
 * This class checks that the _SUCCESS file exists in the "statuses" subdirectory and extracts the
 * status count, min status id and max status id.
 */
public class DailyStatusBatch implements Comparable<DailyStatusBatch> {
  private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatch.class);

  public static final long EMPTY_BATCH_STATUS_ID = -1;
  private static final String PARTITION_FORMAT = "p_%d_of_%d";
  private static final String SUCCESS_FILE_NAME = "_SUCCESS";

  private final Map<Integer, PartitionedBatch> hashPartitionToStatuses = Maps.newHashMap();

  private final Date date;
  private final int numHashPartitions;
  private final boolean hasSuccessFiles;

  public DailyStatusBatch(Date date, int numHashPartitions, Path statusPath, FileSystem hdfs) {
    this.date = date;
    this.numHashPartitions = numHashPartitions;
    this.hasSuccessFiles = checkForSuccessFile(hdfs, date, statusPath);
  }

  public Date getDate() {
    return date;
  }

  /**
   * Check for the presence of the _SUCCESS file for the given day's path on HDFS for the statuses
   * field group.
   */
  private boolean checkForSuccessFile(FileSystem hdfs, Date inputDate, Path statusPath) {
    Path dayPath = new Path(statusPath, ArchiveHDFSUtils.dateToPath(inputDate, "/"));
    Path successFilePath = new Path(dayPath, SUCCESS_FILE_NAME);
    try {
      return hdfs.getFileStatus(successFilePath).isFile();
    } catch (IOException e) {
      LOG.error("Could not verify existence of the _SUCCESS file. Assuming it doesn't exist.", e);
    }
    return false;
  }

  /**
   * Loads the data for this day for the given partition.
   */
  public PartitionedBatch addPartition(FileSystem hdfs, Path dayPath, int hashPartitionID)
      throws IOException {
    String partitionDir = String.format(PARTITION_FORMAT, hashPartitionID, numHashPartitions);
    Path path = new Path(dayPath, partitionDir);
    PartitionedBatch batch =
        new PartitionedBatch(path, hashPartitionID, numHashPartitions, date);
    batch.load(hdfs);
    hashPartitionToStatuses.put(hashPartitionID, batch);
    return batch;
  }

  public PartitionedBatch getPartition(int hashPartitionID) {
    return hashPartitionToStatuses.get(hashPartitionID);
  }

  /**
   * Returns the greatest status count in all partitions belonging to this batch.
   */
  public int getMaxPerPartitionStatusCount() {
    int maxPerPartitionStatusCount = 0;
    for (PartitionedBatch batch : hashPartitionToStatuses.values()) {
      maxPerPartitionStatusCount = Math.max(batch.getStatusCount(), maxPerPartitionStatusCount);
    }
    return maxPerPartitionStatusCount;
  }

  public int getNumHashPartitions() {
    return numHashPartitions;
  }

  @VisibleForTesting
  boolean hasSuccessFiles() {
    return hasSuccessFiles;
  }

  /**
   * Returns true if the _status_counts files could be found in each hash partition subfolder
   * that belongs to this timeslice AND the _SUCCESS file can be found at the root folder for
   * the day.
   */
  public boolean isValid() {
    // make sure we have data for all hash partitions
    for (int i = 0; i < numHashPartitions; i++) {
      PartitionedBatch day = hashPartitionToStatuses.get(i);
      if (day == null || !day.hasStatusCount() || day.isDisallowedEmptyPartition()) {
        return false;
      }
    }
    return hasSuccessFiles;
  }

  @Override
  public String toString() {
    StringBuilder builder = new StringBuilder();
    builder.append("DailyStatusBatch[date=").append(date)
        .append(",valid=").append(isValid())
        .append(",hasSuccessFiles=").append(hasSuccessFiles)
        .append(",numHashPartitions=").append(numHashPartitions)
        .append("]:\n");
    for (int i = 0; i < numHashPartitions; i++) {
      builder.append('\t').append(hashPartitionToStatuses.get(i).toString()).append('\n');
    }
    return builder.toString();
  }

  @Override
  public int compareTo(DailyStatusBatch o) {
    return date.compareTo(o.date);
  }

  /**
   * Serialize this DailyStatusBatch to a JSON string.
   */
  public String serializeToJson() {
    return serializeToJson(new Gson());
  }

  @VisibleForTesting
  String serializeToJson(Gson gson) {
    return gson.toJson(this);
  }

  /**
   * Given a JSON string, parse its fields and construct a daily status batch.
   * @param batchStr the JSON string representation of a daily status batch.
   * @return the constructed daily status batch; if the string is of invalid format, null will be
   *         returned.
   */
  static DailyStatusBatch deserializeFromJson(String batchStr) {
    try {
      return new Gson().fromJson(batchStr, DailyStatusBatch.class);
    } catch (JsonParseException e) {
      LOG.error("Error parsing json string: " + batchStr, e);
      return null;
    }
  }
}
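serializeToJson() and deserializeFromJson() rely on Gson's default reflective mapping, with one JSON object per line in the batch summary file that DailyStatusBatches reads back. A minimal sketch of that round trip on a stand-in class follows; BatchSummary and its fields are illustrative only (the real DailyStatusBatch carries Hadoop types).

import com.google.gson.Gson;
import com.google.gson.JsonParseException;

public final class JsonRoundTripSketch {
  static final class BatchSummary {
    String date;
    int numHashPartitions;
    boolean hasSuccessFiles;
  }

  public static void main(String[] args) {
    Gson gson = new Gson();

    BatchSummary summary = new BatchSummary();
    summary.date = "20210101";
    summary.numHashPartitions = 12;
    summary.hasSuccessFiles = true;

    // One JSON object per line is the shape loadStatusBatchesFromHdfs() expects to read back.
    String line = gson.toJson(summary);
    System.out.println(line);

    try {
      BatchSummary parsed = gson.fromJson(line, BatchSummary.class);
      System.out.println(parsed.numHashPartitions);  // 12
    } catch (JsonParseException e) {
      // deserializeFromJson() returns null in this case; here we just report it.
      System.err.println("invalid line: " + line);
    }
  }
}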
Binary file not shown.
@@ -1,702 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.NavigableMap;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Maps;

import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.time.FastDateFormat;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.LineRecordFileReader;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.StatusBatchFlushVersion;

/**
 * Provides access to preprocessed statuses (tweets) to be indexed by archive search earlybirds.
 *
 * These tweets can be coming from a scrub gen or from the output of the daily jobs.
 */
public class DailyStatusBatches {
  private static final Logger LOG = LoggerFactory.getLogger(DailyStatusBatches.class);

  // Maximum time to spend on obtaining daily status batches by computing or loading from HDFS
  private static final Amount<Long, Time> MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_max_initial_load_time_minutes"),
          Time.MINUTES);
  // Time to wait before trying again when obtaining daily status batches fails
  private static final Amount<Long, Time> DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_waiting_time_minutes"),
          Time.MINUTES);
  private static final String DAILY_STATUS_BATCHES_SYNC_PATH =
      EarlybirdProperty.ZK_APP_ROOT.get() + "/daily_batches_sync";
  private static final String DAILY_BATCHES_ZK_LOCK = "daily_batches_zk_lock";
  private static final Amount<Long, Time> DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES =
      Amount.of(EarlybirdConfig.getLong("daily_status_batches_zk_lock_expiration_minutes"),
          Time.MINUTES);

  static final FastDateFormat DATE_FORMAT = FastDateFormat.getInstance("yyyyMMdd");

  // before this date, there was no twitter
  private static final Date FIRST_TWITTER_DAY = DateUtil.toDate(2006, 2, 1);

  private static final String STATUS_BATCHES_PREFIX = "status_batches";

  private final String rootDir =
      EarlybirdConfig.getString("hdfs_offline_segment_sync_dir", "top_archive_statuses");

  private final String buildGen =
      EarlybirdConfig.getString("offline_segment_build_gen", "bg_1");

  public static final String STATUS_SUBDIR_NAME = "statuses";
  public static final String LAYOUT_SUBDIR_NAME = "layouts";
  public static final String SCRUB_GEN_SUFFIX_PATTERN = "scrubbed/%s";

  private static final String INTERMEDIATE_COUNTS_SUBDIR_NAME = "counts";
  private static final String SUCCESS_FILE_NAME = "_SUCCESS";
  private static final Pattern HASH_PARTITION_PATTERN = Pattern.compile("p_(\\d+)_of_(\\d+)");
  private static final Date FIRST_TWEET_DAY = DateUtil.toDate(2006, 3, 21);

  private final Path rootPath = new Path(rootDir);
  private final Path buildGenPath = new Path(rootPath, buildGen);
  private final Path statusPath = new Path(buildGenPath, STATUS_SUBDIR_NAME);

  private final NavigableMap<Date, DailyStatusBatch> statusBatches = Maps.newTreeMap();

  private Date firstValidDay = null;
  private Date lastValidDay = null;

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final Date scrubGenDay;
  private long numberOfDaysWithValidScrubGenData;

  public DailyStatusBatches(
      ZooKeeperTryLockFactory zooKeeperTryLockFactory, Date scrubGenDay) throws IOException {
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.scrubGenDay = scrubGenDay;

    FileSystem hdfs = null;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
      verifyDirectory(hdfs);
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  @VisibleForTesting
  public Date getScrubGenDay() {
    return scrubGenDay;
  }

  public Collection<DailyStatusBatch> getStatusBatches() {
    return statusBatches.values();
  }

  /**
   * Resets the state of the directory.
   */
  private void resetDirectory() {
    statusBatches.clear();
    firstValidDay = null;
    lastValidDay = null;
  }

  /**
   * Indicates whether the directory has been initialized.
   */
  private boolean isInitialized() {
    return lastValidDay != null;
  }

  /**
   * Load the daily status batches from HDFS; return true if one or more batches could be loaded.
   **/
  private boolean refreshByLoadingHDFSStatusBatches(final FileSystem fs) throws IOException {
    // first find the latest valid end date of statuses
    final Date lastValidStatusDay = getLastValidInputDateFromNow(fs);
    if (lastValidStatusDay != null) {
      if (hasStatusBatchesOnHdfs(fs, lastValidStatusDay)) {
        if (loadStatusBatchesFromHdfs(fs, lastValidStatusDay)) {
          return true;
        }
      }
    }

    resetDirectory();
    return false;
  }

  /**
   * Checks the directory for new data and loads any newly available daily batches, computing or
   * loading all batches on the first call.
   */
  public void refresh() throws IOException {
    final FileSystem hdfs = HdfsUtil.getHdfsFileSystem();

    final Stopwatch stopwatch = Stopwatch.createStarted();
    try {
      if (!isInitialized()) {
        if (initializeDailyStatusBatches(hdfs, stopwatch)) {
          LOG.info("Successfully obtained daily status batches after {}", stopwatch);
        } else {
          String errMsg = "Failed to load or compute daily status batches after "
              + stopwatch.toString();
          LOG.error(errMsg);
          throw new IOException(errMsg);
        }
      } else {
        loadNewDailyBatches(hdfs);
      }
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  private boolean initializeDailyStatusBatches(final FileSystem hdfs,
                                               final Stopwatch stopwatch) throws IOException {
    long timeSpentOnDailyBatches = 0L;
    long maxAllowedTimeMs = MAX_TIME_ALLOWED_DAILY_STATUS_BATCHES_MINUTES.as(Time.MILLISECONDS);
    long waitingTimeMs = DAILY_STATUS_BATCHES_WAITING_TIME_MINUTES.as(Time.MILLISECONDS);
    boolean firstLoop = true;
    LOG.info("Starting to load or compute daily status batches for the first time.");
    while (timeSpentOnDailyBatches <= maxAllowedTimeMs && !Thread.currentThread().isInterrupted()) {
      if (!firstLoop) {
        try {
          LOG.info("Sleeping " + waitingTimeMs
              + " millis before trying to obtain daily batches again");
          Thread.sleep(waitingTimeMs);
        } catch (InterruptedException e) {
          LOG.warn("Interrupted while waiting to load daily batches", e);
          Thread.currentThread().interrupt();
          break;
        }
      }

      if (isStatusBatchLoadingEnabled() && refreshByLoadingHDFSStatusBatches(hdfs)) {
        LOG.info("Successfully loaded daily status batches after {}", stopwatch);
        return true;
      }

      final AtomicBoolean successRef = new AtomicBoolean(false);
      if (computeDailyBatchesWithZKLock(hdfs, successRef, stopwatch)) {
        return successRef.get();
      }

      timeSpentOnDailyBatches = stopwatch.elapsed(TimeUnit.MILLISECONDS);
      firstLoop = false;
    }

    return false;
  }

  private boolean computeDailyBatchesWithZKLock(final FileSystem hdfs,
                                                final AtomicBoolean successRef,
                                                final Stopwatch stopwatch) throws IOException {
    // Using a global lock to coordinate among earlybirds and segment builders so that only
    // one instance would hit the HDFS name node to query the daily status directories
    TryLock lock = zkTryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        DAILY_STATUS_BATCHES_SYNC_PATH,
        DAILY_BATCHES_ZK_LOCK,
        DAILY_STATUS_BATCHES_ZK_LOCK_EXPIRATION_MINUTES);

    return lock.tryWithLock(() -> {
      LOG.info("Obtained ZK lock to compute daily status batches after {}", stopwatch);
      successRef.set(initialLoadDailyBatchInfos(hdfs));
      if (successRef.get()) {
        LOG.info("Successfully computed daily status batches after {}", stopwatch);
        if (isStatusBatchFlushingEnabled()) {
          LOG.info("Starting to store daily status batches to HDFS");
          if (storeStatusBatchesToHdfs(hdfs, lastValidDay)) {
            LOG.info("Successfully stored daily status batches to HDFS");
          } else {
            LOG.warn("Failed storing daily status batches to HDFS");
          }
        }
      } else {
        LOG.info("Failed loading daily status info");
      }
    });
  }

  private void verifyDirectory(FileSystem hdfs) throws IOException {
    if (!hdfs.exists(rootPath)) {
      throw new IOException("Root dir '" + rootPath + "' does not exist.");
    }

    if (!hdfs.exists(buildGenPath)) {
      throw new IOException("Build gen dir '" + buildGenPath + "' does not exist.");
    }

    if (!hdfs.exists(statusPath)) {
      throw new IOException("Status dir '" + statusPath + "' does not exist.");
    }
  }

  private void loadNewDailyBatches(FileSystem hdfs) throws IOException {
    Preconditions.checkNotNull(lastValidDay);

    Calendar day = Calendar.getInstance();
    day.setTime(lastValidDay);
    day.add(Calendar.DATE, 1);

    while (loadDay(hdfs, day.getTime()) != null) {
      lastValidDay = day.getTime();
      day.add(Calendar.DATE, 1);
    }
  }

  private boolean initialLoadDailyBatchInfos(FileSystem hdfs) throws IOException {
    LOG.info("Starting to build timeslice map from scratch.");

    final Date lastValidStatusDay = getLastValidInputDateFromNow(hdfs);

    if (lastValidStatusDay == null) {
      LOG.warn("No data found in " + statusPath + " and scrubbed path");
      return false;
    }
    int mostRecentYear = DateUtil.getCalendar(lastValidStatusDay).get(Calendar.YEAR);
    for (int year = 2006; year <= mostRecentYear; ++year) {
      // construct path to avoid hdfs.listStatus() calls
      Calendar day = Calendar.getInstance();
      day.set(year, Calendar.JANUARY, 1, 0, 0, 0);
      day.set(Calendar.MILLISECOND, 0);

      Calendar yearEnd = Calendar.getInstance();
      yearEnd.set(year, Calendar.DECEMBER, 31, 0, 0, 0);
      yearEnd.set(Calendar.MILLISECOND, 0);

      if (lastValidDay != null) {
        // We're updating.
        if (lastValidDay.after(yearEnd.getTime())) {
          // This year was already loaded.
          continue;
        }
        if (lastValidDay.after(day.getTime())) {
          // Start one day after last valid date.
          day.setTime(lastValidDay);
          day.add(Calendar.DATE, 1);
        }
      }

      for (; !day.after(yearEnd); day.add(Calendar.DATE, 1)) {
        loadDay(hdfs, day.getTime());
      }
    }

    boolean updated = false;
    numberOfDaysWithValidScrubGenData = 0;

    // Iterate batches in sorted order.
    for (DailyStatusBatch batch : statusBatches.values()) {
      if (!batch.isValid()) {
        break;
      }
      if (batch.getDate().before(scrubGenDay)) {
        numberOfDaysWithValidScrubGenData++;
      }
      if (firstValidDay == null) {
        firstValidDay = batch.getDate();
      }
      if (lastValidDay == null || lastValidDay.before(batch.getDate())) {
        lastValidDay = batch.getDate();
        updated = true;
      }
    }

    LOG.info("Number of statusBatches: {}", statusBatches.size());
    return updated;
  }

  private static String filesToString(FileStatus[] files) {
    if (files == null) {
      return "null";
    }
    StringBuilder b = new StringBuilder();
    for (FileStatus s : files) {
      b.append(s.getPath().toString()).append(", ");
    }
    return b.toString();
  }

  @VisibleForTesting
  protected DailyStatusBatch loadDay(FileSystem hdfs, Date day) throws IOException {
    Path dayPath = new Path(getStatusPathToUseForDay(day), ArchiveHDFSUtils.dateToPath(day, "/"));
    LOG.debug("Looking for batch in " + dayPath.toString());
    DailyStatusBatch result = this.statusBatches.get(day);
    if (result != null) {
      return result;
    }

    final FileStatus[] files;
    try {
      files = hdfs.listStatus(dayPath);
      LOG.debug("Files found: " + filesToString(files));
    } catch (FileNotFoundException e) {
      LOG.debug("loadDay() called, but directory does not exist for day: " + day
          + " in: " + dayPath);
      return null;
    }

    if (files != null && files.length > 0) {
      for (FileStatus file : files) {
        Matcher matcher = HASH_PARTITION_PATTERN.matcher(file.getPath().getName());
        if (matcher.matches()) {
          int numHashPartitions = Integer.parseInt(matcher.group(2));
          result = new DailyStatusBatch(
              day, numHashPartitions, getStatusPathToUseForDay(day), hdfs);

          for (int partitionID = 0; partitionID < numHashPartitions; partitionID++) {
            result.addPartition(hdfs, dayPath, partitionID);
          }

          if (result.isValid()) {
            statusBatches.put(day, result);
            return result;
          } else {
            LOG.info("Invalid batch found for day: " + day + ", batch: " + result);
          }
        } else {
          // skip logging the intermediate count subdirectories or _SUCCESS files.
          if (!INTERMEDIATE_COUNTS_SUBDIR_NAME.equals(file.getPath().getName())
              && !SUCCESS_FILE_NAME.equals(file.getPath().getName())) {
            LOG.warn("Path does not match hash partition pattern: " + file.getPath());
          }
        }
      }
    } else {
      LOG.warn("No data found for day: " + day + " in: " + dayPath
          + " files null: " + (files == null));
    }

    return null;
  }

  /**
   * Determines if this directory has a valid batch for the given day.
   */
  public boolean hasValidBatchForDay(Date day) throws IOException {
    FileSystem hdfs = null;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
      return hasValidBatchForDay(hdfs, day);
    } finally {
      IOUtils.closeQuietly(hdfs);
    }
  }

  private boolean hasValidBatchForDay(FileSystem fs, Date day) throws IOException {
    DailyStatusBatch batch = loadDay(fs, day);

    return batch != null && batch.isValid();
  }

  @VisibleForTesting
  Date getFirstValidDay() {
    return firstValidDay;
  }

  @VisibleForTesting
  Date getLastValidDay() {
    return lastValidDay;
  }

  private Date getLastValidInputDateFromNow(FileSystem hdfs) throws IOException {
    Calendar cal = Calendar.getInstance();
    cal.setTime(new Date()); // current date
    return getLastValidInputDate(hdfs, cal);
  }

  /**
   * Starting from the current date, probe backwards until we find a valid input date.
   */
  @VisibleForTesting
  Date getLastValidInputDate(FileSystem hdfs, Calendar cal) throws IOException {
    cal.set(Calendar.MILLISECOND, 0);
    cal.set(Calendar.HOUR_OF_DAY, 0);
    cal.set(Calendar.MINUTE, 0);
    cal.set(Calendar.SECOND, 0);
    cal.set(Calendar.MILLISECOND, 0);
    Date lastValidInputDate = cal.getTime();
    LOG.info("Probing backwards for last valid data date from " + lastValidInputDate);
    while (lastValidInputDate.after(FIRST_TWITTER_DAY)) {
      if (hasValidBatchForDay(hdfs, lastValidInputDate)) {
        LOG.info("Found latest valid data on date " + lastValidInputDate);
        LOG.info("  Used path: {}", getStatusPathToUseForDay(lastValidInputDate));
        return lastValidInputDate;
      }
      cal.add(Calendar.DATE, -1);
      lastValidInputDate = cal.getTime();
    }

    return null;
  }

  /**
   * Check if the daily status batches are already on HDFS.
   */
  @VisibleForTesting
  boolean hasStatusBatchesOnHdfs(FileSystem fs, Date lastDataDay) {
    String hdfsFileName = getHdfsStatusBatchSyncFileName(lastDataDay);
    try {
      return fs.exists(new Path(hdfsFileName));
    } catch (IOException ex) {
      LOG.error("Failed checking status batch file on HDFS: " + hdfsFileName, ex);
      return false;
    }
  }

  /**
   * Load the daily status batches from HDFS by first copying the file from HDFS to local disk
   * and then reading from the local disk.
   *
   * @param day the latest day of valid statuses.
   * @return true if the loading is successful.
   */
  @VisibleForTesting
  boolean loadStatusBatchesFromHdfs(FileSystem fs, Date day) {
    // set the directory state to initial state
    resetDirectory();

    String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
    String fileLocalPath = getLocalStatusBatchSyncFileName(day);

    LOG.info("Using " + fileHdfsPath + " as the HDFS batch summary load path.");
    LOG.info("Using " + fileLocalPath + " as the local batch summary sync path.");

    LineRecordFileReader lineReader = null;
    try {
      fs.copyToLocalFile(new Path(fileHdfsPath), new Path(fileLocalPath));

      lineReader = new LineRecordFileReader(fileLocalPath);
      String batchLine;
      while ((batchLine = lineReader.readNext()) != null) {
        DailyStatusBatch batch = DailyStatusBatch.deserializeFromJson(batchLine);
        if (batch == null) {
          LOG.error("Invalid daily status batch constructed from line: " + batchLine);
          resetDirectory();
          return false;
        }
        Date date = batch.getDate();
        if (firstValidDay == null || firstValidDay.after(date)) {
          firstValidDay = date;
        }
        if (lastValidDay == null || lastValidDay.before(date)) {
          lastValidDay = date;
        }
        statusBatches.put(date, batch);
      }
      LOG.info("Loaded {} status batches from HDFS: {}",
          statusBatches.size(), fileHdfsPath);
      LOG.info("First entry: {}", statusBatches.firstEntry().getValue().toString());
      LOG.info("Last entry: {}", statusBatches.lastEntry().getValue().toString());

      return true;
    } catch (IOException ex) {
      LOG.error("Failed loading time slices from HDFS: " + fileHdfsPath, ex);
      resetDirectory();
      return false;
    } finally {
      if (lineReader != null) {
        lineReader.stop();
      }
    }
  }

  /**
   * Flush the daily status batches to local disk and then upload to HDFS.
   */
  private boolean storeStatusBatchesToHdfs(FileSystem fs, Date day) {
    Preconditions.checkNotNull(lastValidDay);

    if (!StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.isOfficial()) {
      LOG.info("Status batch flush version is not official, no batches will be flushed to HDFS");
      return true;
    }

    String fileLocalPath = getLocalStatusBatchSyncFileName(day);
|
|
||||||
|
|
||||||
// Flush to local disk
|
|
||||||
File outputFile = null;
|
|
||||||
FileWriter fileWriter = null;
|
|
||||||
try {
|
|
||||||
LOG.info("Flushing daily status batches into: " + fileLocalPath);
|
|
||||||
outputFile = new File(fileLocalPath);
|
|
||||||
outputFile.getParentFile().mkdirs();
|
|
||||||
if (!outputFile.getParentFile().exists()) {
|
|
||||||
LOG.error("Cannot create directory: " + outputFile.getParentFile().toString());
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
fileWriter = new FileWriter(outputFile, false);
|
|
||||||
for (Date date : statusBatches.keySet()) {
|
|
||||||
fileWriter.write(statusBatches.get(date).serializeToJson());
|
|
||||||
fileWriter.write("\n");
|
|
||||||
}
|
|
||||||
fileWriter.flush();
|
|
||||||
|
|
||||||
// Upload the file to HDFS
|
|
||||||
return uploadStatusBatchesToHdfs(fs, day);
|
|
||||||
} catch (IOException e) {
|
|
||||||
String fileHdfsPath = getHdfsStatusBatchSyncFileName(day);
|
|
||||||
LOG.error("Failed storing status batches to HDFS: " + fileHdfsPath, e);
|
|
||||||
return false;
|
|
||||||
} finally {
|
|
||||||
try {
|
|
||||||
if (fileWriter != null) {
|
|
||||||
fileWriter.close();
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.error("Error to close fileWrite.", e);
|
|
||||||
}
|
|
||||||
if (outputFile != null) {
|
|
||||||
// Delete the local file
|
|
||||||
outputFile.delete();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Upload the status batches to HDFS.
|
|
||||||
*/
|
|
||||||
@VisibleForTesting
|
|
||||||
boolean uploadStatusBatchesToHdfs(FileSystem fs, Date day) {
|
|
||||||
String localFileName = getLocalStatusBatchSyncFileName(day);
|
|
||||||
String hdfsFileName = getHdfsStatusBatchSyncFileName(day);
|
|
||||||
|
|
||||||
LOG.info("Using " + hdfsFileName + " as the HDFS batch summary upload path.");
|
|
||||||
LOG.info("Using " + localFileName + " as the local batch summary sync path.");
|
|
||||||
|
|
||||||
try {
|
|
||||||
Path hdfsFilePath = new Path(hdfsFileName);
|
|
||||||
if (fs.exists(hdfsFilePath)) {
|
|
||||||
LOG.warn("Found status batch file on HDFS: " + hdfsFileName);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
String hdfsTempName = getHdfsStatusBatchTempSyncFileName(day);
|
|
||||||
Path hdfsTempPath = new Path(hdfsTempName);
|
|
||||||
if (fs.exists(hdfsTempPath)) {
|
|
||||||
LOG.info("Found existing temporary status batch file on HDFS, removing: " + hdfsTempName);
|
|
||||||
if (!fs.delete(hdfsTempPath, false)) {
|
|
||||||
LOG.error("Failed to delete temporary file: " + hdfsTempName);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
fs.copyFromLocalFile(new Path(localFileName), hdfsTempPath);
|
|
||||||
|
|
||||||
if (fs.rename(hdfsTempPath, hdfsFilePath)) {
|
|
||||||
LOG.debug("Renamed " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
|
||||||
return true;
|
|
||||||
} else {
|
|
||||||
LOG.error("Failed to rename " + hdfsTempName + " on HDFS to: " + hdfsFileName);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
} catch (IOException ex) {
|
|
||||||
LOG.error("Failed uploading status batch file to HDFS: " + hdfsFileName, ex);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isStatusBatchFlushingEnabled() {
|
|
||||||
return EarlybirdProperty.ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED.get(false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static boolean isStatusBatchLoadingEnabled() {
|
|
||||||
return EarlybirdConfig.getBool("archive_daily_status_batch_loading_enabled", false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private static String getVersionFileExtension() {
|
|
||||||
return StatusBatchFlushVersion.CURRENT_FLUSH_VERSION.getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
String getStatusBatchSyncRootDir() {
|
|
||||||
return EarlybirdConfig.getString("archive_daily_status_batch_sync_dir",
|
|
||||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
String getLocalStatusBatchSyncFileName(Date day) {
|
|
||||||
return getStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
|
||||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
String getHdfsStatusBatchSyncRootDir() {
|
|
||||||
return EarlybirdConfig.getString("hdfs_archive_daily_status_batch_sync_dir",
|
|
||||||
"daily_status_batches") + "/" + scrubGenSuffix();
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
String getHdfsStatusBatchSyncFileName(Date day) {
|
|
||||||
return getHdfsStatusBatchSyncRootDir() + "/" + STATUS_BATCHES_PREFIX + "_"
|
|
||||||
+ DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String getHdfsStatusBatchTempSyncFileName(Date day) {
|
|
||||||
return getHdfsStatusBatchSyncRootDir() + "/" + DatabaseConfig.getLocalHostname() + "_"
|
|
||||||
+ STATUS_BATCHES_PREFIX + "_" + DATE_FORMAT.format(day) + getVersionFileExtension();
|
|
||||||
}
|
|
||||||
|
|
||||||
private String scrubGenSuffix() {
|
|
||||||
return String.format(SCRUB_GEN_SUFFIX_PATTERN, DATE_FORMAT.format(scrubGenDay));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the path to the directory that stores the statuses for the given day.
|
|
||||||
*/
|
|
||||||
public Path getStatusPathToUseForDay(Date day) {
|
|
||||||
if (!day.before(scrubGenDay)) {
|
|
||||||
return statusPath;
|
|
||||||
}
|
|
||||||
|
|
||||||
String suffix = scrubGenSuffix();
|
|
||||||
Preconditions.checkArgument(!suffix.isEmpty());
|
|
||||||
Path scrubPath = new Path(buildGenPath, suffix);
|
|
||||||
return new Path(scrubPath, STATUS_SUBDIR_NAME);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Determines if the data for the specified scrub gen was fully built, by checking the number of
|
|
||||||
* days for which data was built against the expected number of days extracted from the specified
|
|
||||||
* scrub gen date.
|
|
||||||
*/
|
|
||||||
public boolean isScrubGenDataFullyBuilt(FileSystem hdfs) throws IOException {
|
|
||||||
initialLoadDailyBatchInfos(hdfs);
|
|
||||||
if (numberOfDaysWithValidScrubGenData == 0) {
|
|
||||||
LOG.warn("numberOfDaysWithValidScrubGenData is 0");
|
|
||||||
}
|
|
||||||
long expectedDays = getDiffBetweenDays(scrubGenDay);
|
|
||||||
return expectedDays == numberOfDaysWithValidScrubGenData;
|
|
||||||
}
|
|
||||||
|
|
||||||
@VisibleForTesting
|
|
||||||
long getDiffBetweenDays(Date day) {
|
|
||||||
long diff = day.getTime() - FIRST_TWEET_DAY.getTime();
|
|
||||||
return TimeUnit.DAYS.convert(diff, TimeUnit.MILLISECONDS);
|
|
||||||
}
|
|
||||||
}
|
|
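The uploadStatusBatchesToHdfs method above publishes the batch summary with a copy-to-temp-then-rename pattern so that readers never observe a partially written file. Below is a minimal, hedged sketch of that pattern against the stock Hadoop FileSystem API; the class name and all paths are made-up placeholders, not values used by this code.

// Hedged sketch: copy a local summary file to a temporary HDFS path, then
// rename it into place. Paths here are illustrative placeholders only.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class AtomicHdfsUploadSketch {
  public static boolean upload(FileSystem fs, Path local, Path finalPath, Path tempPath)
      throws java.io.IOException {
    if (fs.exists(finalPath)) {
      return true; // someone else already published this file
    }
    if (fs.exists(tempPath) && !fs.delete(tempPath, false)) {
      return false; // stale temp file we could not clean up
    }
    fs.copyFromLocalFile(local, tempPath);
    // the rename is the "commit": readers only ever see a complete file
    return fs.rename(tempPath, finalPath);
  }

  public static void main(String[] args) throws Exception {
    FileSystem fs = FileSystem.get(new Configuration());
    upload(fs,
        new Path("/tmp/status_batches_local"),          // placeholder local path
        new Path("/user/example/status_batches"),       // placeholder final HDFS path
        new Path("/user/example/_tmp_status_batches")); // placeholder temp HDFS path
  }
}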
Binary file not shown.
@ -1,333 +0,0 @@
package com.twitter.search.earlybird.archive;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Function;
import com.google.common.base.Predicate;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.Lists;

import org.apache.commons.io.IOUtils;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.config.Config;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.search.common.schema.earlybird.EarlybirdThriftDocumentUtil;
import com.twitter.search.common.schema.thriftjava.ThriftIndexingEvent;
import com.twitter.search.common.util.date.DateUtil;
import com.twitter.search.common.util.io.EmptyRecordReader;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.common.util.io.MergingSortedRecordReader;
import com.twitter.search.common.util.io.TransformingRecordReader;
import com.twitter.search.common.util.io.recordreader.RecordReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.document.DocumentFactory;
import com.twitter.search.earlybird.document.TweetDocument;
import com.twitter.search.earlybird.partition.HdfsUtil;

/**
 * A batch of pre-processed tweets for a single hash partition from a particular day.
 */
public class PartitionedBatch {
  private static final Logger LOG = LoggerFactory.getLogger(PartitionedBatch.class);
  private static final Date START_DATE_INCLUSIVE = DateUtil.toDate(2006, 03, 21);
  private static final String STATUS_COUNT_FILE_PREFIX = "_status_count_";
  private static final Pattern STATUS_COUNT_FILE_PATTERN =
      Pattern.compile(STATUS_COUNT_FILE_PREFIX + "(\\d+)_minid_(\\d+)_maxid_(\\d+)");
  private static final int MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS =
      EarlybirdConfig.getInt("archive_max_out_of_order_tolerance_hours", 12);
  private static final int READER_INIT_IOEXCEPTION_RETRIES = 20;
  private static final PathFilter LZO_DATA_FILES_FILTER = file -> file.getName().endsWith(".lzo");
  private static final PathFilter TXT_DATA_FILES_FILTER = file -> file.getName().endsWith(".txt");

  private static final Comparator<ThriftIndexingEvent> DESC_THRIFT_INDEXING_EVENT_COMPARATOR =
      (o1, o2) -> ComparisonChain.start()
          .compare(o2.getSortId(), o1.getSortId())
          .compare(o2.getUid(), o1.getUid())
          .result();

  // Number of archive tweets skipped because they are too out-of-order.
  private static final SearchCounter OUT_OF_ORDER_STATUSES_SKIPPED =
      SearchCounter.export("out_of_order_archive_statuses_skipped");

  @VisibleForTesting
  protected static final long MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS =
      TimeUnit.HOURS.toMillis(MAXIMUM_OUT_OF_ORDER_TOLERANCE_HOURS);

  private final Date date;
  private final Path path;
  private int statusCount;
  private long minStatusID;
  private long maxStatusID;
  private final int hashPartitionID;
  private boolean hasStatusCountFile;
  private final int numHashPartitions;

  @VisibleForTesting
  public PartitionedBatch(
      Path path,
      int hashPartitionID,
      int numHashPartitions,
      Date date) {
    this.path = path;
    this.hashPartitionID = hashPartitionID;
    this.numHashPartitions = numHashPartitions;
    this.date = date;
  }

  /**
   * Loads all the information (tweet count, etc.) for this partition and day from HDFS.
   */
  public void load(FileSystem hdfs) throws IOException {
    FileStatus[] dailyBatchFiles = null;
    try {
      // listStatus() javadoc says it throws FileNotFoundException when path does not exist.
      // However, the actual implementations return null or an empty array instead.
      // We handle all 3 cases: null, empty array, or FileNotFoundException.
      dailyBatchFiles = hdfs.listStatus(path);
    } catch (FileNotFoundException e) {
      // don't do anything here and the day will be handled as empty.
    }

    if (dailyBatchFiles != null && dailyBatchFiles.length > 0) {
      for (FileStatus file : dailyBatchFiles) {
        String fileName = file.getPath().getName();
        if (fileName.equals(STATUS_COUNT_FILE_PREFIX)) {
          // zero tweets in this partition - this can happen for early days in 2006
          handleEmptyPartition();
        } else {
          Matcher matcher = STATUS_COUNT_FILE_PATTERN.matcher(fileName);
          if (matcher.matches()) {
            try {
              statusCount = Integer.parseInt(matcher.group(1));
              // Only adjustMinStatusId in production. For tests, this makes the tests harder to
              // understand.
              minStatusID = Config.environmentIsTest() ? Long.parseLong(matcher.group(2))
                  : adjustMinStatusId(Long.parseLong(matcher.group(2)), date);
              maxStatusID = Long.parseLong(matcher.group(3));
              hasStatusCountFile = true;
            } catch (NumberFormatException e) {
              // invalid file - ignore
              LOG.warn("Could not parse status count file name.", e);
            }
          }
        }
      }
    } else {
      // Partition folder does not exist. This case can happen for early days of twitter
      // where some partitions are empty. Set us to having a status count file; the validity of
      // the parent DailyStatusBatch will still be determined by whether there was a _SUCCESS file
      // in the day root.
      handleEmptyPartition();

      if (date.after(getEarliestDenseDay())) {
        LOG.error("Unexpected empty directory {} for {}", path, date);
      }
    }
  }

  private void handleEmptyPartition() {
    statusCount = 0;
    minStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
    maxStatusID = DailyStatusBatch.EMPTY_BATCH_STATUS_ID;
    hasStatusCountFile = true;
  }

  /**
   * Sometimes tweets are out-of-order (e.g. a tweet from Sep 2012 got into a
   * batch in July 2013). See SEARCH-1750 for more details.
   * This adjusts the minStatusID if it is badly out-of-order.
   */
  @VisibleForTesting
  protected static long adjustMinStatusId(long minStatusID, Date date) {
    long dateTime = date.getTime();
    // If the daily batch is for a day before we started using snowflake IDs, never adjust.
    if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(dateTime)) {
      return minStatusID;
    }

    long earliestStartTime = dateTime - MAXIMUM_OUT_OF_ORDER_TOLERANCE_MILLIS;
    long minStatusTime = SnowflakeIdParser.getTimestampFromTweetId(minStatusID);
    if (minStatusTime < earliestStartTime) {
      long newMinId = SnowflakeIdParser.generateValidStatusId(earliestStartTime, 0);
      LOG.info("Daily batch for " + date + " has badly out of order tweet: " + minStatusID
          + ". The minStatusID for this batch is adjusted to " + newMinId);
      return newMinId;
    } else {
      return minStatusID;
    }
  }

  /**
   * Returns a reader that reads tweets from the given directory.
   *
   * @param archiveSegment Determines the timeslice ID of all read tweets.
   * @param tweetsPath The path to the directory where the tweets for this day are stored.
   * @param documentFactory The ThriftIndexingEvent to TweetDocument converter.
   */
  public RecordReader<TweetDocument> getTweetReaders(
      ArchiveSegment archiveSegment,
      Path tweetsPath,
      DocumentFactory<ThriftIndexingEvent> documentFactory) throws IOException {
    RecordReader<TweetDocument> tweetDocumentReader =
        new TransformingRecordReader<>(
            createTweetReader(tweetsPath), new Function<ThriftIndexingEvent, TweetDocument>() {
              @Override
              public TweetDocument apply(ThriftIndexingEvent event) {
                return new TweetDocument(
                    event.getSortId(),
                    archiveSegment.getTimeSliceID(),
                    EarlybirdThriftDocumentUtil.getCreatedAtMs(event.getDocument()),
                    documentFactory.newDocument(event)
                );
              }
            });

    tweetDocumentReader.setExhaustStream(true);
    return tweetDocumentReader;
  }

  private RecordReader<ThriftIndexingEvent> createTweetReader(Path tweetsPath) throws IOException {
    if (date.before(START_DATE_INCLUSIVE)) {
      return new EmptyRecordReader<>();
    }

    List<RecordReader<ThriftIndexingEvent>> readers = Lists.newArrayList();
    FileSystem hdfs = HdfsUtil.getHdfsFileSystem();
    try {
      Path dayPath = new Path(tweetsPath, ArchiveHDFSUtils.dateToPath(date, "/"));
      Path partitionPath =
          new Path(dayPath, String.format("p_%d_of_%d", hashPartitionID, numHashPartitions));
      PathFilter pathFilter =
          Config.environmentIsTest() ? TXT_DATA_FILES_FILTER : LZO_DATA_FILES_FILTER;
      FileStatus[] files = hdfs.listStatus(partitionPath, pathFilter);
      for (FileStatus fileStatus : files) {
        String fileStatusPath = fileStatus.getPath().toString().replaceAll("file:/", "/");
        RecordReader<ThriftIndexingEvent> reader = createRecordReaderWithRetries(fileStatusPath);
        readers.add(reader);
      }
    } finally {
      IOUtils.closeQuietly(hdfs);
    }

    if (readers.isEmpty()) {
      return new EmptyRecordReader<>();
    }

    return new MergingSortedRecordReader<>(DESC_THRIFT_INDEXING_EVENT_COMPARATOR, readers);
  }

  private RecordReader<ThriftIndexingEvent> createRecordReaderWithRetries(String filePath)
      throws IOException {
    Predicate<ThriftIndexingEvent> recordFilter = getRecordFilter();
    int numTries = 0;
    while (true) {
      try {
        ++numTries;
        return new LzoThriftBlockFileReader<>(filePath, ThriftIndexingEvent.class, recordFilter);
      } catch (IOException e) {
        if (numTries < READER_INIT_IOEXCEPTION_RETRIES) {
          LOG.warn("Failed to open LzoThriftBlockFileReader for " + filePath + ". Will retry.", e);
        } else {
          LOG.error("Failed to open LzoThriftBlockFileReader for " + filePath
              + " after too many retries.", e);
          throw e;
        }
      }
    }
  }

  private Predicate<ThriftIndexingEvent> getRecordFilter() {
    return Config.environmentIsTest() ? null : input -> {
      if (input == null) {
        return false;
      }
      // We only guard against status IDs that are too small, because it is possible
      // for a very old tweet to get into today's batch, but not possible for a very
      // large ID (a future tweet ID that is not yet published) to get in today's
      // batch, unless tweet ID generation messed up.
      long statusId = input.getSortId();
      boolean keep = statusId >= minStatusID;
      if (!keep) {
        LOG.debug("Out of order documentId: {} minStatusID: {} Date: {} Path: {}",
            statusId, minStatusID, date, path);
        OUT_OF_ORDER_STATUSES_SKIPPED.increment();
      }
      return keep;
    };
  }

  /**
   * Returns the number of statuses in this batch.
   */
  public int getStatusCount() {
    return statusCount;
  }

  /**
   * Whether the _status_count file was found in this folder.
   */
  public boolean hasStatusCount() {
    return hasStatusCountFile;
  }

  public long getMinStatusID() {
    return minStatusID;
  }

  public long getMaxStatusID() {
    return maxStatusID;
  }

  public Date getDate() {
    return date;
  }

  public Path getPath() {
    return path;
  }

  /**
   * Checks whether the partition is empty and whether that is disallowed (an empty partition can
   * only happen before 2010). An empty partition means that the directory was missing when the
   * scan happened.
   *
   * @return true if the partition has no documents and that is not allowed for its date.
   */
  public boolean isDisallowedEmptyPartition() {
    return hasStatusCountFile
        && statusCount == 0
        && minStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
        && maxStatusID == DailyStatusBatch.EMPTY_BATCH_STATUS_ID
        && date.after(getEarliestDenseDay());
  }

  @Override
  public String toString() {
    return "PartitionedBatch[hashPartitionId=" + hashPartitionID
        + ",numHashPartitions=" + numHashPartitions
        + ",date=" + date
        + ",path=" + path
        + ",hasStatusCountFile=" + hasStatusCountFile
        + ",statusCount=" + statusCount + "]";
  }

  private Date getEarliestDenseDay() {
    return EarlybirdConfig.getDate("archive_search_earliest_dense_day");
  }
}
@ -1,64 +0,0 @@
java_library(
    name = "segment_builder_lib",
    sources = ["**/*.java"],
    platform = "java8",
    tags = [
        "bazel-compatible",
        "bazel-only",
    ],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "decider/src/main/scala",
        "finatra/inject/inject-core/src/main/scala",
        "finatra/inject/inject-server/src/main/scala/com/twitter/inject/server",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/quantity",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/common_internal/text/version",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/database",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/base",
        "src/java/com/twitter/search/common/partitioning/zookeeper",
        "src/java/com/twitter/search/common/schema",
        "src/java/com/twitter/search/common/schema/base",
        "src/java/com/twitter/search/common/util:closeresourceutil",
        "src/java/com/twitter/search/common/util:gcutil",
        "src/java/com/twitter/search/common/util:kerberos",
        "src/java/com/twitter/search/common/util/date",
        "src/java/com/twitter/search/common/util/io:flushable",
        "src/java/com/twitter/search/common/util/zktrylock",
        "src/java/com/twitter/search/common/util/zookeeper",
        "src/java/com/twitter/search/earlybird:earlybird-lib",
        "src/java/com/twitter/search/earlybird/common",
        "src/java/com/twitter/search/earlybird/common/config",
        "src/java/com/twitter/search/earlybird/common/userupdates",
        "util/util-core:scala",
    ],
)

# Using hadoop_binary target can automatically exclude hadoop related jars in the built jar
# and load in the right jars based on hadoop config.
hadoop_binary(
    name = "segment_builder_binary",
    basename = "segment_builder",
    main = "com.twitter.search.earlybird.archive.segmentbuilder.SegmentBuilderMain",
    platform = "java8",
    runtime_platform = "java8",
    tags = [
        "bazel-compatible",
        "bazel-compatible:migrated",
        "bazel-only",
    ],
    dependencies = [
        ":segment_builder_lib",
        "src/java/com/twitter/search/common/logging:search-log4j",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,29 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class BuiltAndFinalizedSegment extends SegmentBuilderSegment {
  public BuiltAndFinalizedSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  @Override
  public SegmentBuilderSegment handle() throws SegmentInfoConstructionException,
      SegmentUpdaterException {

    throw new IllegalStateException("Should not handle a BuiltAndFinalizedSegment.");
  }

  @Override
  public boolean isBuilt() {
    return true;
  }
}
Binary file not shown.
@ -1,101 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.base.Stopwatch;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.util.Clock;
import com.twitter.search.common.util.GCUtil;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveSegmentUpdater;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class NotYetBuiltSegment extends SegmentBuilderSegment {
  private static final Logger LOG = LoggerFactory.getLogger(NotYetBuiltSegment.class);

  public NotYetBuiltSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  /**
   * 1. Grab the ZK lock for this segment.
   * 2a. if the lock fails, another host is updating; return the SOMEONE_ELSE_IS_BUILDING state.
   * 2b. if the lock succeeds, check again if the updated segment exists on HDFS.
   * 3a. if so, just move on.
   * 3b. if not, update the segment.
   * In both cases, we need to check if the segment can now be marked as BUILT_AND_FINALIZED.
   */
  @Override
  public SegmentBuilderSegment handle()
      throws SegmentUpdaterException, SegmentInfoConstructionException {
    LOG.info("Handling a not yet built segment: {}", this.getSegmentName());
    Stopwatch stopwatch = Stopwatch.createStarted();
    TryLock lock = getZooKeeperTryLock();

    // The tryWithLock closure can only access variables from the enclosing scope that are final.
    // However, we would like to pass the update result back out, so here we use an
    // AtomicBoolean reference instead of a Boolean.
    final AtomicBoolean successRef = new AtomicBoolean(false);
    boolean gotLock = lock.tryWithLock(() -> {
      ArchiveSegmentUpdater updater = new ArchiveSegmentUpdater(
          segmentConfig.getTryLockFactory(),
          sync,
          segmentConfig.getEarlybirdIndexConfig(),
          Clock.SYSTEM_CLOCK);

      boolean success = updater.updateSegment(segmentInfo);
      successRef.set(success);
    });

    if (!gotLock) {
      LOG.info("cannot acquire zookeeper lock for: " + segmentInfo);
      return new SomeoneElseIsBuildingSegment(
          segmentInfo,
          segmentConfig,
          earlybirdSegmentFactory,
          alreadyRetriedCount,
          sync);
    }

    // 1. we want to make sure the heap is clean right after building a segment so that it's ready
    //    for us to start allocations for a new segment
    //    — I think we've had cases where we were seeing OOM's while building
    // 2. the thing that I think it helps with is compaction (vs just organically running CMS)
    //    — which would clean up the heap, but may leave it in a fragmented state
    //    — and running a Full GC is supposed to compact the remaining tenured space.
    GCUtil.runGC();

    if (successRef.get()) {
      LOG.info("Indexing segment {} took {}", segmentInfo, stopwatch);
      LOG.info("Finished building {}", segmentInfo.getSegment().getSegmentName());
      return new BuiltAndFinalizedSegment(
          segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
    } else {
      int alreadyTried = alreadyRetriedCount + 1;
      String errMsg = "failed updating segment for: " + segmentInfo
          + " for " + alreadyTried + " times";
      LOG.error(errMsg);
      if (alreadyTried < segmentConfig.getMaxRetriesOnFailure()) {
        return new NotYetBuiltSegment(
            createNewSegmentInfo(segmentInfo),
            segmentConfig,
            earlybirdSegmentFactory,
            alreadyTried,
            sync);
      } else {
        throw new SegmentUpdaterException(errMsg);
      }
    }
  }
}
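handle() above smuggles the result of updateSegment() out of the tryWithLock lambda through an AtomicBoolean, because a lambda can only read effectively final locals. The following is a small, self-contained illustration of that capture pattern; runCritical is a hypothetical stand-in for TryLock.tryWithLock, not part of the real API.

// Hedged sketch of the AtomicBoolean capture pattern used in handle() above.
import java.util.concurrent.atomic.AtomicBoolean;

public final class LambdaResultCaptureSketch {
  // Made-up stand-in for TryLock.tryWithLock: runs the action and reports
  // whether the "lock" was acquired (always true in this sketch).
  static boolean runCritical(Runnable action) {
    action.run();
    return true;
  }

  public static void main(String[] args) {
    final AtomicBoolean successRef = new AtomicBoolean(false);
    boolean gotLock = runCritical(() -> {
      boolean success = doWork();   // result computed inside the lambda
      successRef.set(success);      // passed back out via the AtomicBoolean
    });
    System.out.println("gotLock=" + gotLock + " success=" + successRef.get());
  }

  private static boolean doWork() {
    return true; // placeholder for ArchiveSegmentUpdater.updateSegment(...)
  }
}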
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.HashMap;
import java.util.Map;

import com.twitter.common.util.Clock;

/**
 * A class that prevents handling a given segment more than once every hdfsCheckIntervalMillis.
 */
public class RateLimitingSegmentHandler {
  private final long hdfsCheckIntervalMillis;
  private final Clock clock;
  private final Map<String, Long> segmentNameToLastUpdatedTimeMillis = new HashMap<>();

  RateLimitingSegmentHandler(long hdfsCheckIntervalMillis, Clock clock) {
    this.hdfsCheckIntervalMillis = hdfsCheckIntervalMillis;
    this.clock = clock;
  }

  SegmentBuilderSegment processSegment(SegmentBuilderSegment segment)
      throws SegmentUpdaterException, SegmentInfoConstructionException {

    String segmentName = segment.getSegmentName();

    Long lastUpdatedMillis = segmentNameToLastUpdatedTimeMillis.get(segmentName);
    if (lastUpdatedMillis == null) {
      lastUpdatedMillis = 0L;
    }

    long nowMillis = clock.nowMillis();
    if (nowMillis - lastUpdatedMillis < hdfsCheckIntervalMillis) {
      return segment;
    }
    segmentNameToLastUpdatedTimeMillis.put(segmentName, nowMillis);

    return segment.handle();
  }
}
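For reference, a compact sketch of the per-key interval gating that RateLimitingSegmentHandler implements above: calls for the same key inside the interval are skipped, and the first call after the interval goes through. This is an assumption-level illustration that uses plain long timestamps rather than the real Clock class.

// Hedged sketch of per-key interval gating, mirroring processSegment() above.
import java.util.HashMap;
import java.util.Map;

public final class IntervalGateSketch {
  private final long intervalMillis;
  private final Map<String, Long> lastRunMillis = new HashMap<>();

  IntervalGateSketch(long intervalMillis) {
    this.intervalMillis = intervalMillis;
  }

  /** Returns true if the caller should do the expensive work for this key now. */
  boolean shouldRun(String key, long nowMillis) {
    long last = lastRunMillis.getOrDefault(key, 0L);
    if (nowMillis - last < intervalMillis) {
      return false; // checked too recently, skip
    }
    lastRunMillis.put(key, nowMillis);
    return true;
  }

  public static void main(String[] args) {
    IntervalGateSketch gate = new IntervalGateSketch(10_000);
    System.out.println(gate.shouldRun("segment_a", 0));      // true
    System.out.println(gate.shouldRun("segment_a", 5_000));  // false, inside interval
    System.out.println(gate.shouldRun("segment_a", 12_000)); // true again
  }
}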
Binary file not shown.
@ -1,540 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Random;
import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Stopwatch;
import com.google.common.collect.ComparisonChain;
import com.google.common.collect.ImmutableList;
import com.google.common.util.concurrent.Uninterruptibles;
import com.google.inject.Inject;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchStatsReceiver;
import com.twitter.search.common.metrics.SearchStatsReceiverImpl;
import com.twitter.search.common.partitioning.zookeeper.SearchZkClient;
import com.twitter.search.common.util.Kerberos;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.archive.ArchiveTimeSlicer;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.exception.CriticalExceptionHandler;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SearchIndexingMetricSet;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.search.earlybird.stats.EarlybirdSearcherStats;

/**
 * This class provides the core logic to build segment indices offline.
 * For each server, it coordinates via zookeeper to pick the next segment, build the indices for it
 * and upload them to HDFS. A state machine is used to handle the build state transitions. There
 * are three states:
 *   NOT_BUILD_YET: a segment that needs to be built
 *   SOMEONE_ELSE_IS_BUILDING: another server is building the segment.
 *   BUILT_AND_FINALIZED: the indices of this segment have already been built.
 */
public class SegmentBuilder {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilder.class);

  private final boolean onlyRunOnce;
  private final int waitBetweenLoopsMins;
  private final int startUpBatchSize;
  private final int instance;
  private final int waitBetweenSegmentsSecs;
  private final int waitBeforeQuitMins;

  // When multiple segment builders start simultaneously, they might overwhelm the HDFS name node
  // and zookeeper. So, we let some instances sleep for a while before they start to avoid these
  // issues.
  private final long startUpSleepMins;

  // If there are no more segments to build, wait this interval before checking again.
  private final long processWaitingInterval = TimeUnit.MINUTES.toMillis(10);

  // The hash partitions for which segments will be built.
  private final ImmutableList<Integer> hashPartitions;

  private final SearchStatsReceiver statsReceiver = new SearchStatsReceiverImpl();
  private final SearchIndexingMetricSet searchIndexingMetricSet =
      new SearchIndexingMetricSet(statsReceiver);
  private final EarlybirdSearcherStats searcherStats =
      new EarlybirdSearcherStats(statsReceiver);

  private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final RateLimitingSegmentHandler segmentHandler;
  private final Clock clock;
  private final int numSegmentBuilderPartitions;
  private final int myPartitionId;
  private final SegmentConfig segmentConfig;
  private final EarlybirdSegmentFactory segmentFactory;
  private final SegmentBuilderCoordinator segmentBuilderCoordinator;
  private final SegmentSyncConfig segmentSyncConfig;
  private final Random random = new Random();

  private static final double SLEEP_RANDOMIZATION_RATIO = .2;

  // Stats
  // The flush version used to build segments
  private static final SearchLongGauge CURRENT_FLUSH_VERSION =
      SearchLongGauge.export("current_flush_version");

  // Accumulated number and time in seconds spent on building segments locally
  private static SearchCounter segmentsBuiltLocally =
      SearchCounter.export("segments_built_locally");
  private static SearchCounter timeSpentOnSuccessfulBuildSecs =
      SearchCounter.export("time_spent_on_successful_build_secs");

  // The total number of segments to be built
  private static final SearchLongGauge SEGMENTS_TO_BUILD =
      SearchLongGauge.export("segments_to_build");

  // How many segments failed locally
  private static final SearchCounter FAILED_SEGMENTS =
      SearchCounter.export("failed_segments");

  @Inject
  protected SegmentBuilder(@Flag("onlyRunOnce") boolean onlyRunOnceFlag,
                           @Flag("waitBetweenLoopsMins") int waitBetweenLoopsMinsFlag,
                           @Flag("startup_batch_size") int startUpBatchSizeFlag,
                           @Flag("instance") int instanceFlag,
                           @Flag("segmentZkLockExpirationHours")
                               int segmentZkLockExpirationHoursFlag,
                           @Flag("startupSleepMins") long startupSleepMinsFlag,
                           @Flag("maxRetriesOnFailure") int maxRetriesOnFailureFlag,
                           @Flag("hash_partitions") List<Integer> hashPartitionsFlag,
                           @Flag("numSegmentBuilderPartitions") int numSegmentBuilderPartitionsFlag,
                           @Flag("waitBetweenSegmentsSecs") int waitBetweenSegmentsSecsFlag,
                           @Flag("waitBeforeQuitMins") int waitBeforeQuitMinsFlag,
                           @Flag("scrubGen") String scrubGen,
                           Decider decider) {
    this(onlyRunOnceFlag,
        waitBetweenLoopsMinsFlag,
        startUpBatchSizeFlag,
        instanceFlag,
        segmentZkLockExpirationHoursFlag,
        startupSleepMinsFlag,
        hashPartitionsFlag,
        maxRetriesOnFailureFlag,
        waitBetweenSegmentsSecsFlag,
        waitBeforeQuitMinsFlag,
        SearchZkClient.getSZooKeeperClient().createZooKeeperTryLockFactory(),
        new RateLimitingSegmentHandler(TimeUnit.MINUTES.toMillis(10), Clock.SYSTEM_CLOCK),
        Clock.SYSTEM_CLOCK,
        numSegmentBuilderPartitionsFlag,
        decider,
        getSyncConfig(scrubGen));
  }

  @VisibleForTesting
  protected SegmentBuilder(boolean onlyRunOnceFlag,
                           int waitBetweenLoopsMinsFlag,
                           int startUpBatchSizeFlag,
                           int instanceFlag,
                           int segmentZkLockExpirationHoursFlag,
                           long startupSleepMinsFlag,
                           List<Integer> hashPartitions,
                           int maxRetriesOnFailure,
                           int waitBetweenSegmentsSecsFlag,
                           int waitBeforeQuitMinsFlag,
                           ZooKeeperTryLockFactory zooKeeperTryLockFactory,
                           RateLimitingSegmentHandler segmentHandler,
                           Clock clock,
                           int numSegmentBuilderPartitions,
                           Decider decider,
                           SegmentSyncConfig syncConfig) {
    LOG.info("Creating SegmentBuilder");
    LOG.info("Penguin version in use: " + EarlybirdConfig.getPenguinVersion());

    // Set command line flag values
    this.onlyRunOnce = onlyRunOnceFlag;
    this.waitBetweenLoopsMins = waitBetweenLoopsMinsFlag;
    this.startUpBatchSize = startUpBatchSizeFlag;
    this.instance = instanceFlag;
    this.waitBetweenSegmentsSecs = waitBetweenSegmentsSecsFlag;
    this.waitBeforeQuitMins = waitBeforeQuitMinsFlag;

    this.segmentHandler = segmentHandler;
    this.zkTryLockFactory = zooKeeperTryLockFactory;
    this.segmentSyncConfig = syncConfig;
    this.startUpSleepMins = startupSleepMinsFlag;

    if (!hashPartitions.isEmpty()) {
      this.hashPartitions = ImmutableList.copyOf(hashPartitions);
    } else {
      this.hashPartitions = null;
    }

    Amount<Long, Time> segmentZKLockExpirationTime = Amount.of((long)
        segmentZkLockExpirationHoursFlag, Time.HOURS);

    this.earlybirdIndexConfig =
        new ArchiveOnDiskEarlybirdIndexConfig(decider, searchIndexingMetricSet,
            new CriticalExceptionHandler());

    this.segmentConfig = new SegmentConfig(
        earlybirdIndexConfig,
        segmentZKLockExpirationTime,
        maxRetriesOnFailure,
        zkTryLockFactory);
    this.segmentFactory = new EarlybirdSegmentFactory(
        earlybirdIndexConfig,
        searchIndexingMetricSet,
        searcherStats,
        clock);
    this.segmentBuilderCoordinator = new SegmentBuilderCoordinator(
        zkTryLockFactory, syncConfig, clock);

    this.clock = clock;

    this.numSegmentBuilderPartitions = numSegmentBuilderPartitions;
    this.myPartitionId = instance % numSegmentBuilderPartitions;
    SearchLongGauge.export("segment_builder_partition_id_" + myPartitionId).set(1);

    CURRENT_FLUSH_VERSION.set(earlybirdIndexConfig.getSchema().getMajorVersionNumber());
  }

  void run() {
    LOG.info("Config values: {}", EarlybirdConfig.allValuesAsString());

    // Sleep some time uninterruptibly before getting started so that if multiple instances are
    // running, the HDFS name node and zookeeper won't be overwhelmed.
    // Say we have 100 instances (the instance arg will have a value from 0 - 99), our
    // STARTUP_BATCH_SIZE_ARG is 20 and startUpSleepMins is 3 mins. Then the first 20 instances
    // will not sleep, but start immediately. Instances 20 - 39 will sleep 3 mins and then
    // start to run, instances 40 - 59 will sleep 6 mins, instances 60 - 79 will
    // sleep 9 mins and then start to run, and so forth.
    long sleepTime = instance / startUpBatchSize * startUpSleepMins;
    LOG.info("Instance={}, Start up batch size={}", instance, startUpBatchSize);
    LOG.info("Sleeping {} minutes to avoid overwhelming the HDFS name node and ZooKeeper.",
        sleepTime);
    Uninterruptibles.sleepUninterruptibly(sleepTime, TimeUnit.MINUTES);

    // Kinit here.
    Kerberos.kinit(
        EarlybirdConfig.getString("kerberos_user", ""),
        EarlybirdConfig.getString("kerberos_keytab_path", "")
    );

    long waitBetweenLoopsMs = TimeUnit.MINUTES.toMillis(waitBetweenLoopsMins);
    if (onlyRunOnce) {
      LOG.info("This segment builder will run the full rebuild of all the segments");
    } else {
      LOG.info("This segment builder will incrementally check for new data and rebuild "
          + "current segments as needed.");
      LOG.info("The waiting interval between two new data checks is: "
          + waitBetweenLoopsMs + " ms.");
    }

    boolean scrubGenPresent = segmentSyncConfig.getScrubGen().isPresent();
    LOG.info("Scrub gen present: {}", scrubGenPresent);
    boolean scrubGenDataFullyBuilt = segmentBuilderCoordinator.isScrubGenDataFullyBuilt(instance);
    LOG.info("Scrub gen data fully built: {}", scrubGenDataFullyBuilt);

    if (!scrubGenPresent || scrubGenDataFullyBuilt) {
      LOG.info("Starting segment building loop...");
      while (!Thread.currentThread().isInterrupted()) {
        try {
          indexingLoop();
          if (onlyRunOnce) {
            LOG.info("only run once is true, breaking");
            break;
          }
          clock.waitFor(waitBetweenLoopsMs);
        } catch (InterruptedException e) {
          LOG.info("Interrupted, quitting segment builder");
          Thread.currentThread().interrupt();
        } catch (SegmentInfoConstructionException e) {
          LOG.error("Error creating new segmentInfo, quitting segment builder: ", e);
          break;
        } catch (SegmentUpdaterException e) {
          FAILED_SEGMENTS.increment();
          // Before the segment builder quits, sleep for WAIT_BEFORE_QUIT_MINS minutes so that the
          // FAILED_SEGMENTS stat can be exported.
          try {
            clock.waitFor(TimeUnit.MINUTES.toMillis(waitBeforeQuitMins));
          } catch (InterruptedException ex) {
            LOG.info("Interrupted, quitting segment builder");
            Thread.currentThread().interrupt();
          }
          LOG.error("SegmentUpdater processing segment error, quitting segment builder: ", e);
          break;
        }
      }
    } else {
      LOG.info("Cannot build the segments for scrub gen yet.");
    }
  }

  // The run loop is factored out here for unit testing.
  @VisibleForTesting
  void indexingLoop()
      throws SegmentInfoConstructionException, InterruptedException, SegmentUpdaterException {
    // This map contains all the segments to be processed; if a segment is built, it will be
    // removed from the map.
    Map<String, SegmentBuilderSegment> buildableSegmentInfoMap;
    try {
      buildableSegmentInfoMap = createSegmentInfoMap();
      printSegmentInfoMap(buildableSegmentInfoMap);
    } catch (IOException e) {
      LOG.error("Error creating segmentInfoMap: ", e);
      return;
    }

    while (!buildableSegmentInfoMap.isEmpty()) {
      boolean hasBuiltSegment = processSegments(buildableSegmentInfoMap);

      if (!hasBuiltSegment) {
        // If we did not build a segment, wait before checking again. If we successfully built
        // one, there is no need to sleep, since building a segment already takes a long time.
        clock.waitFor(processWaitingInterval);
      }
    }
  }

  // Actual shutdown.
  protected void doShutdown() {
    LOG.info("doShutdown()...");
    try {
      earlybirdIndexConfig.getResourceCloser().shutdownExecutor();
    } catch (InterruptedException e) {
      LOG.error("Interrupted during shutdown. ", e);
    }

    LOG.info("Segment builder stopped!");
  }

  private List<ArchiveTimeSlicer.ArchiveTimeSlice> createTimeSlices() throws IOException {
    Preconditions.checkState(segmentSyncConfig.getScrubGen().isPresent());
    Date scrubGen = ScrubGenUtil.parseScrubGenToDate(segmentSyncConfig.getScrubGen().get());

    final DailyStatusBatches dailyStatusBatches =
        new DailyStatusBatches(zkTryLockFactory, scrubGen);
    final ArchiveTimeSlicer archiveTimeSlicer = new ArchiveTimeSlicer(
        EarlybirdConfig.getMaxSegmentSize(), dailyStatusBatches, earlybirdIndexConfig);

    Stopwatch stopwatch = Stopwatch.createStarted();
    List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = archiveTimeSlicer.getTimeSlices();

    if (timeSlices == null) {
      LOG.error("Failed to load timeslice map after {}", stopwatch);
      return Collections.emptyList();
    }

    LOG.info("Took {} to get timeslices", stopwatch);
    return timeSlices;
  }

  private static class TimeSliceAndHashPartition implements Comparable<TimeSliceAndHashPartition> {
    public final ArchiveTimeSlicer.ArchiveTimeSlice timeSlice;
    public final Integer hashPartition;

    public TimeSliceAndHashPartition(
        ArchiveTimeSlicer.ArchiveTimeSlice timeSlice,
        Integer hashPartition) {
      this.timeSlice = timeSlice;
      this.hashPartition = hashPartition;
    }

    @Override
    public int compareTo(TimeSliceAndHashPartition o) {
      Integer myHashPartition = this.hashPartition;
      Integer otherHashPartition = o.hashPartition;

      long myTimeSliceId = this.timeSlice.getMinStatusID(myHashPartition);
      long otherTimeSliceId = o.timeSlice.getMinStatusID(otherHashPartition);

      return ComparisonChain.start()
          .compare(myHashPartition, otherHashPartition)
          .compare(myTimeSliceId, otherTimeSliceId)
          .result();
    }
  }

  /**
   * For all the timeslices, create the corresponding SegmentInfo and store it in a map.
   */
  @VisibleForTesting
  Map<String, SegmentBuilderSegment> createSegmentInfoMap() throws IOException {
    final List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices = createTimeSlices();

    List<TimeSliceAndHashPartition> timeSlicePairs = createPairs(timeSlices);
    // Export how many segments should be built
    SEGMENTS_TO_BUILD.set(timeSlicePairs.size());
    LOG.info("Total number of segments to be built across all segment builders: {}",
        timeSlicePairs.size());

    List<TimeSliceAndHashPartition> mySegments = getSegmentsForMyPartition(timeSlicePairs);

    Map<String, SegmentBuilderSegment> segmentInfoMap = new HashMap<>();
    for (TimeSliceAndHashPartition mySegment : mySegments) {
      ArchiveSegment segment = new ArchiveSegment(mySegment.timeSlice, mySegment.hashPartition,
          EarlybirdConfig.getMaxSegmentSize());
      SegmentInfo segmentInfo = new SegmentInfo(segment, segmentFactory, segmentSyncConfig);

      segmentInfoMap.put(segmentInfo.getSegment().getSegmentName(), new NotYetBuiltSegment(
          segmentInfo, segmentConfig, segmentFactory, 0, segmentSyncConfig));
    }

    return segmentInfoMap;
  }

  private List<TimeSliceAndHashPartition> createPairs(
      List<ArchiveTimeSlicer.ArchiveTimeSlice> timeSlices) {

    List<TimeSliceAndHashPartition> timeSlicePairs = new ArrayList<>();

    for (ArchiveTimeSlicer.ArchiveTimeSlice slice : timeSlices) {
      List<Integer> localPartitions = hashPartitions;
      if (localPartitions == null) {
        localPartitions = range(slice.getNumHashPartitions());
      }

      for (Integer partition : localPartitions) {
        timeSlicePairs.add(new TimeSliceAndHashPartition(slice, partition));
      }
    }
    return timeSlicePairs;
  }

  private List<TimeSliceAndHashPartition> getSegmentsForMyPartition(
      List<TimeSliceAndHashPartition> timeSlicePairs) {

    Collections.sort(timeSlicePairs);

    List<TimeSliceAndHashPartition> myTimeSlices = new ArrayList<>();
    for (int i = myPartitionId; i < timeSlicePairs.size(); i += numSegmentBuilderPartitions) {
      myTimeSlices.add(timeSlicePairs.get(i));
    }

    LOG.info("Getting segments to be built for partition: {}", myPartitionId);
    LOG.info("Total number of partitions: {}", numSegmentBuilderPartitions);
    LOG.info("Number of segments picked: {}", myTimeSlices.size());
    return myTimeSlices;
  }

  /**
   * Print out the segmentInfo map for debugging.
   */
  private void printSegmentInfoMap(Map<String, SegmentBuilderSegment> segmentInfoMap) {
    LOG.info("SegmentInfoMap: ");
    for (Map.Entry<String, SegmentBuilderSegment> entry : segmentInfoMap.entrySet()) {
      LOG.info(entry.getValue().toString());
    }
    LOG.info("Total SegmentInfoMap size: " + segmentInfoMap.size() + ". done.");
  }

  /**
   * Build indices or refresh state for the segments in the specified segmentInfoMap, which only
   * contains the segments that need to be built or are building. When a segment has not been
   * built, it is built here. If built successfully, it will be removed from the map; otherwise,
   * its state will be updated in the map.
   *
   * Returns true iff this process has built a segment.
   */
  @VisibleForTesting
  boolean processSegments(Map<String, SegmentBuilderSegment> segmentInfoMap)
      throws SegmentInfoConstructionException, SegmentUpdaterException, InterruptedException {

    boolean hasBuiltSegment = false;

    Iterator<Map.Entry<String, SegmentBuilderSegment>> iter =
        segmentInfoMap.entrySet().iterator();
    while (iter.hasNext()) {
      Map.Entry<String, SegmentBuilderSegment> entry = iter.next();
      SegmentBuilderSegment originalSegment = entry.getValue();

      LOG.info("About to process segment: {}", originalSegment.getSegmentName());
      long startMillis = System.currentTimeMillis();
      SegmentBuilderSegment updatedSegment = segmentHandler.processSegment(originalSegment);

      if (updatedSegment.isBuilt()) {
        iter.remove();
        hasBuiltSegment = true;

        if (originalSegment instanceof NotYetBuiltSegment) {
          // Record the total time spent on successfully building a segment, used to compute the
          // average segment building time.
          long timeSpent = System.currentTimeMillis() - startMillis;
          segmentsBuiltLocally.increment();
          timeSpentOnSuccessfulBuildSecs.add(timeSpent / 1000);
        }
      } else {
        entry.setValue(updatedSegment);
      }

      clock.waitFor(getSegmentSleepTime());
    }

    return hasBuiltSegment;
  }

  private long getSegmentSleepTime() {
    // The Hadoop name node can handle only about 200 requests/sec before it gets overloaded.
    // Updating the state of a node that has been built takes about 1 second. In the worst case
    // scenario with 800 segment builders, we end up with about 800 requests/sec. Adding a 10
    // second sleep lowers the worst case to about 80 requests/sec.

    long sleepMillis = TimeUnit.SECONDS.toMillis(waitBetweenSegmentsSecs);

    // Use randomization so that we can't get all segment builders hitting it at the exact same time

    int lowerSleepBoundMillis = (int) (sleepMillis * (1.0 - SLEEP_RANDOMIZATION_RATIO));
|
|
||||||
int upperSleepBoundMillis = (int) (sleepMillis * (1.0 + SLEEP_RANDOMIZATION_RATIO));
|
|
||||||
return randRange(lowerSleepBoundMillis, upperSleepBoundMillis);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns a pseudo-random number between min and max, inclusive.
|
|
||||||
*/
|
|
||||||
private int randRange(int min, int max) {
|
|
||||||
return random.nextInt((max - min) + 1) + min;
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns list of integers 0, 1, 2, ..., count-1.
|
|
||||||
*/
|
|
||||||
private static List<Integer> range(int count) {
|
|
||||||
List<Integer> nums = new ArrayList<>(count);
|
|
||||||
|
|
||||||
for (int i = 0; i < count; i++) {
|
|
||||||
nums.add(i);
|
|
||||||
}
|
|
||||||
|
|
||||||
return nums;
|
|
||||||
}
|
|
||||||
|
|
||||||
private static SegmentSyncConfig getSyncConfig(String scrubGen) {
|
|
||||||
if (scrubGen == null || scrubGen.isEmpty()) {
|
|
||||||
throw new RuntimeException(
|
|
||||||
"Scrub gen expected, but could not get it from the arguments.");
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG.info("Scrub gen: " + scrubGen);
|
|
||||||
return new SegmentSyncConfig(Optional.of(scrubGen));
|
|
||||||
}
|
|
||||||
}
|
|
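The stride-based selection in getSegmentsForMyPartition above is what spreads work across builder instances: after the global sort, instance k takes items k, k+N, k+2N, ... for N builder partitions, so any two instances differ by at most one segment. A minimal standalone sketch of the same idea, with hypothetical item counts and instance ids that are not part of the original class:

import java.util.ArrayList;
import java.util.List;

class StridePartitionExample {
  // Returns the sublist of work items assigned to builder `instanceId` out of `numInstances`.
  static <T> List<T> itemsForInstance(List<T> sortedItems, int instanceId, int numInstances) {
    List<T> mine = new ArrayList<>();
    for (int i = instanceId; i < sortedItems.size(); i += numInstances) {
      mine.add(sortedItems.get(i));
    }
    return mine;
  }

  public static void main(String[] args) {
    List<Integer> items = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
      items.add(i);
    }
    // With 10 items and 4 instances, instance 2 gets items 2 and 6.
    System.out.println(itemsForInstance(items, 2, 4)); // prints [2, 6]
  }
}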
Binary file not shown.
@ -1,109 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.Collection;

import com.google.common.collect.ImmutableList;
import com.google.inject.Module;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.app.Flaggable;
import com.twitter.inject.server.AbstractTwitterServer;
import com.twitter.util.Future;
import com.twitter.util.Time;

public class SegmentBuilderApp extends AbstractTwitterServer {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderApp.class);

  public SegmentBuilderApp() {
    createFlag("onlyRunOnce",
        true,
        "whether to stop segment builder after one loop",
        Flaggable.ofBoolean());

    createFlag("waitBetweenLoopsMins",
        60,
        "how many minutes to wait between building loops",
        Flaggable.ofInt());

    createFlag("startup_batch_size",
        30,
        "How many instances can start and read timeslice info from HDFS at the same time. "
            + "If you don't know what this parameter is, please do not change this parameter.",
        Flaggable.ofInt());

    createFlag("instance",
        20,
        "the job instance number",
        Flaggable.ofInt());

    createFlag("segmentZkLockExpirationHours",
        0,
        "max hours to hold the zookeeper lock while building segment",
        Flaggable.ofInt());

    createFlag("startupSleepMins",
        2L,
        "sleep multiplier of startupSleepMins before job runs",
        Flaggable.ofLong());

    createFlag("maxRetriesOnFailure",
        3,
        "how many times we should try to rebuild a segment when failure happens",
        Flaggable.ofInt());

    createFlag("hash_partitions",
        ImmutableList.of(),
        "comma separated hash partition ids, e.g., 0,1,3,4. "
            + "If not specified, all the partitions will be built.",
        Flaggable.ofJavaList(Flaggable.ofInt()));

    createFlag("numSegmentBuilderPartitions",
        100,
        "Number of partitions for dividing up all segment builder work",
        Flaggable.ofInt());

    createFlag("waitBetweenSegmentsSecs",
        10,
        "Time to sleep between processing segments.",
        Flaggable.ofInt());

    createFlag("waitBeforeQuitMins",
        2,
        "How many minutes to sleep before quitting.",
        Flaggable.ofInt());

    createFlag("scrubGen",
        "",
        "Scrub gen for which segment builders should be run.",
        Flaggable.ofString());
  }

  @Override
  public void start() {
    SegmentBuilder segmentBuilder = injector().instance(SegmentBuilder.class);
    closeOnExit((Time time) -> {
      segmentBuilder.doShutdown();
      return Future.Unit();
    });

    LOG.info("Starting run()");
    segmentBuilder.run();
    LOG.info("run() complete");

    // Now shutdown
    shutdown();
  }

  protected void shutdown() {
    LOG.info("Calling close() to initiate shutdown");
    close();
  }

  @Override
  public Collection<Module> javaModules() {
    return ImmutableList.of(new SegmentBuilderModule());
  }
}
Binary file not shown.
@ -1,200 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;
import java.util.Date;
import java.util.Optional;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.common.util.Clock;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.DailyStatusBatches;
import com.twitter.search.earlybird.common.config.EarlybirdProperty;
import com.twitter.search.earlybird.util.ScrubGenUtil;
import com.twitter.search.earlybird.partition.HdfsUtil;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;
import com.twitter.util.Duration;

/**
 * Coordinates between segment builders for the scrubbing pipeline.
 * When the segment builders are running, all of them will try to find an HDFS file indicating
 * whether the data is ready. If the file does not exist, only one of them will go through the
 * files and see if the scrubbing pipeline has generated all data for this scrub gen.
 *
 * If the instance that got the lock found all the data, it still exits, because otherwise we
 * would have one single segmentbuilder instance trying to build all segments, which is not what
 * we want. But since the file now exists, the next time all segmentbuilder instances are
 * scheduled, they will all find the file and will start building segments.
 */
class SegmentBuilderCoordinator {
  private static final Logger LOG = LoggerFactory.getLogger(SegmentBuilderCoordinator.class);

  private static final Amount<Long, Time> ZK_LOCK_EXPIRATION_MIN = Amount.of(5L, Time.MINUTES);
  private static final String SEGMENT_BUILDER_SYNC_NODE = "scrub_gen_data_sync";
  private static final String SEGMENT_BUILDER_SYNC_ZK_PATH =
      EarlybirdProperty.ZK_APP_ROOT.get() + "/segment_builder_sync";
  private static final String DATA_FULLY_BUILT_FILE = "_data_fully_built";
  static final int FIRST_INSTANCE = 0;

  private static final long NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS =
      Duration.fromHours(1).inMillis();

  private final ZooKeeperTryLockFactory zkTryLockFactory;
  private final SegmentSyncConfig syncConfig;
  private final Optional<Date> scrubGenDayOpt;
  private final Optional<String> scrubGenOpt;
  private final Clock clock;

  SegmentBuilderCoordinator(
      ZooKeeperTryLockFactory zkTryLockFactory, SegmentSyncConfig syncConfig, Clock clock) {
    this.zkTryLockFactory = zkTryLockFactory;
    this.syncConfig = syncConfig;
    this.scrubGenOpt = syncConfig.getScrubGen();
    this.scrubGenDayOpt = scrubGenOpt.map(ScrubGenUtil::parseScrubGenToDate);
    this.clock = clock;
  }

  public boolean isScrubGenDataFullyBuilt(int instanceNumber) {
    // Only segment builders that take a scrub gen should use isPartitioningOutputReady to
    // coordinate.
    Preconditions.checkArgument(scrubGenDayOpt.isPresent());

    final FileSystem hdfs;
    try {
      hdfs = HdfsUtil.getHdfsFileSystem();
    } catch (IOException e) {
      LOG.error("Could not create HDFS file system.", e);
      return false;
    }

    return isScrubGenDataFullyBuilt(
        instanceNumber,
        scrubGenDayOpt.get(),
        NON_FIRST_INSTANCE_SLEEP_BEFORE_RETRY_DURATION_MS,
        hdfs
    );
  }

  @VisibleForTesting
  boolean isScrubGenDataFullyBuilt(
      int instanceNumber,
      Date scrubGenDay,
      long nonFirstInstanceSleepBeforeRetryDuration,
      FileSystem hdfs) {
    // Check if the "scrub gen has been fully built" file exists.
    if (checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
      return true;
    }

    // If it doesn't exist, let the first instance see if the scrub gen has been fully built and
    // create the file.
    if (instanceNumber == FIRST_INSTANCE) {
      // We were missing some data on HDFS for this scrub gen in the previous run,
      // but we might've gotten more data in the meantime, so check again.
      // Only allow instance 0 to do this, mainly for 2 reasons:
      // 1) Since instances are scheduled in batches, it's possible that an instance from a later
      //    batch finds the fully built file in HDFS and starts processing. We would end up doing
      //    work with only some of the instances.
      // 2) If we sleep before we release the lock, it's hard to estimate how long an instance
      //    will be scheduled.
      // To keep things deterministic, we simplify a bit and only allow instance 0 to check and
      // write the data-is-fully-built file to HDFS.
      try {
        checkIfScrubGenDataIsFullyBuilt(hdfs, scrubGenDay);
      } catch (IOException e) {
        LOG.error("Failed to grab lock and check scrub gen data.", e);
      }
    } else {
      // For all other instances, sleep for a bit to give the first instance time to check if the
      // scrub gen has been fully built and create the file, then check again.
      try {
        LOG.info(
            "Sleeping for {} ms before re-checking if scrub gen has been fully built file exists",
            nonFirstInstanceSleepBeforeRetryDuration);
        clock.waitFor(nonFirstInstanceSleepBeforeRetryDuration);
        return checkHaveScrubGenDataFullyBuiltFileOnHdfs(hdfs);
      } catch (InterruptedException e) {
        LOG.warn("Interrupted when sleeping before re-checking if scrub gen has been fully built "
            + "file exists", e);
      }
    }

    // If hasSuccessFileToHdfs returns false, then this should always return false in the end;
    // the next run will find the success file for this scrub gen and move forward.
    return false;
  }

  private void checkIfScrubGenDataIsFullyBuilt(
      FileSystem hdfs, Date scrubGenDay) throws IOException {
    // Build the lock, try to acquire it, and check the data on HDFS
    TryLock lock = zkTryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        SEGMENT_BUILDER_SYNC_ZK_PATH,
        SEGMENT_BUILDER_SYNC_NODE,
        ZK_LOCK_EXPIRATION_MIN);
    Preconditions.checkState(scrubGenOpt.isPresent());
    String scrubGen = scrubGenOpt.get();

    lock.tryWithLock(() -> {
      LOG.info(String.format(
          "Obtained ZK lock to check if data for scrub gen %s is ready.", scrubGen));
      final DailyStatusBatches directory =
          new DailyStatusBatches(zkTryLockFactory, scrubGenDay);
      if (directory.isScrubGenDataFullyBuilt(hdfs)
          && createScrubGenDataFullyBuiltFileOnHdfs(hdfs)) {
        LOG.info(String.format("All data for scrub gen %s is ready.", scrubGen));
      } else {
        LOG.info(String.format("Data for scrub gen %s is not ready yet.", scrubGen));
      }
    });
  }

  private boolean createScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
    Path path = getScrubGenDataFullyBuiltFilePath();
    try {
      fs.mkdirs(new Path(statusReadyHDFSPath()));
      if (fs.createNewFile(path)) {
        LOG.info("Successfully created file " + path + " on HDFS.");
        return true;
      } else {
        LOG.warn("Failed to create file " + path + " on HDFS.");
      }
    } catch (IOException e) {
      LOG.error("Failed to create file on HDFS " + path.toString(), e);
    }
    return false;
  }

  private boolean checkHaveScrubGenDataFullyBuiltFileOnHdfs(FileSystem fs) {
    Path path = getScrubGenDataFullyBuiltFilePath();
    try {
      boolean ret = fs.exists(path);
      LOG.info("Checking if file exists showing scrubgen is fully built.");
      LOG.info("Path checked: {}, Exist check: {}", path, ret);
      return ret;
    } catch (IOException e) {
      LOG.error("Failed to check file on HDFS " + path.toString(), e);
      return false;
    }
  }

  @VisibleForTesting
  Path getScrubGenDataFullyBuiltFilePath() {
    return new Path(statusReadyHDFSPath(), DATA_FULLY_BUILT_FILE);
  }

  @VisibleForTesting
  String statusReadyHDFSPath() {
    return syncConfig.getHdfsSegmentSyncRootDir() + "/segment_builder_sync";
  }
}
Binary file not shown.
@ -1,10 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

public final class SegmentBuilderMain {

  private SegmentBuilderMain() { }

  public static void main(String[] args) {
    new SegmentBuilderApp().main(args);
  }
}
Binary file not shown.
@ -1,58 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.File;

import com.google.inject.Provides;
import com.google.inject.Singleton;

import com.twitter.app.Flaggable;
import com.twitter.decider.Decider;
import com.twitter.inject.TwitterModule;
import com.twitter.inject.annotations.Flag;
import com.twitter.search.common.config.LoggerConfiguration;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.util.EarlybirdDecider;

public class SegmentBuilderModule extends TwitterModule {

  private static final String CONFIG_FILE_FLAG_NAME = "config_file";
  private static final String SEGMENT_LOG_DIR_FLAG_NAME = "segment_log_dir";

  public SegmentBuilderModule() {
    createFlag(CONFIG_FILE_FLAG_NAME,
        new File("earlybird-search.yml"),
        "specify config file",
        Flaggable.ofFile());

    createFlag(SEGMENT_LOG_DIR_FLAG_NAME,
        "",
        "override log dir from config file",
        Flaggable.ofString());
  }

  /**
   * Initializes the Earlybird config and the log configuration, and returns an EarlybirdDecider
   * object, which will be injected into the SegmentBuilder instance.
   *
   * @param configFile The config file to use to initialize EarlybirdConfig
   * @param segmentLogDir If not empty, used to override the log directory from the config file
   * @return An initialized EarlybirdDecider
   */
  @Provides
  @Singleton
  public Decider provideDecider(@Flag(CONFIG_FILE_FLAG_NAME) File configFile,
                                @Flag(SEGMENT_LOG_DIR_FLAG_NAME) String segmentLogDir) {
    // By default Guice will build singletons eagerly:
    // https://github.com/google/guice/wiki/Scopes#eager-singletons
    // So in order to ensure that the EarlybirdConfig and LoggerConfiguration initializations occur
    // before the EarlybirdDecider initialization, we place them here.
    EarlybirdConfig.init(configFile.getName());
    if (!segmentLogDir.isEmpty()) {
      EarlybirdConfig.overrideLogDir(segmentLogDir);
    }
    new LoggerConfiguration(EarlybirdConfig.getLogPropertiesFile(), EarlybirdConfig.getLogDir())
        .configure();

    return EarlybirdDecider.initialize();
  }
}
Binary file not shown.
@ -1,100 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;

import com.google.common.base.Preconditions;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.database.DatabaseConfig;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveSegment;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public abstract class SegmentBuilderSegment {
  protected final SegmentInfo segmentInfo;
  protected final SegmentConfig segmentConfig;
  protected final EarlybirdSegmentFactory earlybirdSegmentFactory;
  protected final int alreadyRetriedCount;
  protected final SegmentSyncConfig sync;

  public SegmentBuilderSegment(SegmentInfo segmentInfo,
                               SegmentConfig segmentConfig,
                               EarlybirdSegmentFactory earlybirdSegmentFactory,
                               int alreadyRetriedCount,
                               SegmentSyncConfig segmentSyncConfig) {
    this.segmentConfig = segmentConfig;
    this.earlybirdSegmentFactory = earlybirdSegmentFactory;
    this.alreadyRetriedCount = alreadyRetriedCount;
    this.sync = segmentSyncConfig;
    Preconditions.checkState(segmentInfo.getSegment() instanceof ArchiveSegment);
    this.segmentInfo = Preconditions.checkNotNull(segmentInfo);
  }

  public SegmentInfo getSegmentInfo() {
    return segmentInfo;
  }

  public String getSegmentName() {
    return segmentInfo.getSegmentName();
  }

  public int getAlreadyRetriedCount() {
    return alreadyRetriedCount;
  }

  /**
   * Handle the segment, potentially transitioning to a new state.
   * @return The state after handling.
   */
  public abstract SegmentBuilderSegment handle()
      throws SegmentInfoConstructionException, SegmentUpdaterException;

  public boolean isBuilt() {
    return false;
  }

  @Override
  public String toString() {
    return "SegmentBuilderSegment{"
        + "segmentInfo=" + segmentInfo
        + ", state=" + this.getClass().getSimpleName()
        + ", alreadyRetriedCount=" + alreadyRetriedCount + '}';
  }

  /**
   * Given a SegmentInfo, create a new one with the same time slice and partitionID but clean
   * internal state.
   */
  protected SegmentInfo createNewSegmentInfo(SegmentInfo oldSegmentInfo)
      throws SegmentInfoConstructionException {
    Preconditions.checkArgument(oldSegmentInfo.getSegment() instanceof ArchiveSegment);
    ArchiveSegment archiveSegment = (ArchiveSegment) oldSegmentInfo.getSegment();

    try {
      ArchiveSegment segment = new ArchiveSegment(archiveSegment.getArchiveTimeSlice(),
          archiveSegment.getHashPartitionID(), EarlybirdConfig.getMaxSegmentSize());

      return new SegmentInfo(segment, earlybirdSegmentFactory, sync);
    } catch (IOException e) {
      throw new SegmentInfoConstructionException("Error creating new segments", e);
    }
  }

  protected TryLock getZooKeeperTryLock() {
    ZooKeeperTryLockFactory tryLockFactory = segmentConfig.getTryLockFactory();
    String zkRootPath = sync.getZooKeeperSyncFullPath();
    String nodeName = segmentInfo.getZkNodeName();
    Amount<Long, Time> expirationTime = segmentConfig.getSegmentZKLockExpirationTime();

    return tryLockFactory.createTryLock(
        DatabaseConfig.getLocalHostname(),
        zkRootPath,
        nodeName,
        expirationTime);
  }
}
Binary file not shown.
@ -1,41 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.twitter.common.quantity.Amount;
import com.twitter.common.quantity.Time;
import com.twitter.search.common.util.zktrylock.ZooKeeperTryLockFactory;
import com.twitter.search.earlybird.archive.ArchiveOnDiskEarlybirdIndexConfig;

public class SegmentConfig {
  private final ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig;
  private final Amount<Long, Time> segmentZKLockExpirationTime;
  private final int maxRetriesOnFailure;
  private final ZooKeeperTryLockFactory tryLockFactory;

  public SegmentConfig(
      ArchiveOnDiskEarlybirdIndexConfig earlybirdIndexConfig,
      Amount<Long, Time> segmentZKLockExpirationTime,
      int maxRetriesOnFailure,
      ZooKeeperTryLockFactory tryLockFactory) {

    this.earlybirdIndexConfig = earlybirdIndexConfig;
    this.segmentZKLockExpirationTime = segmentZKLockExpirationTime;
    this.maxRetriesOnFailure = maxRetriesOnFailure;
    this.tryLockFactory = tryLockFactory;
  }

  public ArchiveOnDiskEarlybirdIndexConfig getEarlybirdIndexConfig() {
    return earlybirdIndexConfig;
  }

  public Amount<Long, Time> getSegmentZKLockExpirationTime() {
    return segmentZKLockExpirationTime;
  }

  public int getMaxRetriesOnFailure() {
    return maxRetriesOnFailure;
  }

  public ZooKeeperTryLockFactory getTryLockFactory() {
    return tryLockFactory;
  }
}
Binary file not shown.
@ -1,12 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.io.IOException;

/**
 * Used if exceptions are thrown while creating a new SegmentInfo during the indexing loop.
 */
class SegmentInfoConstructionException extends Exception {
  SegmentInfoConstructionException(String msg, IOException e) {
    super(msg, e);
  }
}
Binary file not shown.
@ -1,13 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import com.google.common.annotations.VisibleForTesting;

/**
 * Used when SegmentUpdater fails processing segments.
 */
@VisibleForTesting
class SegmentUpdaterException extends Exception {
  SegmentUpdaterException(String msg) {
    super(msg);
  }
}
Binary file not shown.
@ -1,69 +0,0 @@
package com.twitter.search.earlybird.archive.segmentbuilder;

import java.util.concurrent.atomic.AtomicBoolean;

import com.google.common.annotations.VisibleForTesting;

import com.twitter.common.base.Command;
import com.twitter.search.common.util.zktrylock.TryLock;
import com.twitter.search.earlybird.archive.ArchiveHDFSUtils;
import com.twitter.search.earlybird.index.EarlybirdSegmentFactory;
import com.twitter.search.earlybird.partition.SegmentInfo;
import com.twitter.search.earlybird.partition.SegmentSyncConfig;

public class SomeoneElseIsBuildingSegment extends SegmentBuilderSegment {
  public SomeoneElseIsBuildingSegment(
      SegmentInfo segmentInfo,
      SegmentConfig segmentConfig,
      EarlybirdSegmentFactory earlybirdSegmentFactory,
      int alreadyRetriedCount,
      SegmentSyncConfig sync) {

    super(segmentInfo, segmentConfig, earlybirdSegmentFactory, alreadyRetriedCount, sync);
  }

  /**
   * This method refreshes the local state of a segment.
   * 1. Try to grab the ZK lock.
   * 2a. If we got the lock, the segment is not being built; mark the segment as NOT_BUILT_YET.
   * 2b. Otherwise, the segment is being built; keep the SOMEONE_ELSE_IS_BUILDING state.
   */
  @Override
  public SegmentBuilderSegment handle()
      throws SegmentInfoConstructionException, SegmentUpdaterException {

    TryLock lock = getZooKeeperTryLock();

    final AtomicBoolean alreadyBuilt = new AtomicBoolean(false);
    boolean gotLock = lock.tryWithLock((Command) () -> {
      // The segment might have already been built by others
      if (segmentExistsOnHdfs()) {
        alreadyBuilt.set(true);
      }
    });

    if (!gotLock) {
      return this;
    }

    if (alreadyBuilt.get()) {
      return new BuiltAndFinalizedSegment(
          segmentInfo, segmentConfig, earlybirdSegmentFactory, 0, sync);
    } else {
      // When a segment fails to build, its state might not be clean. So, it is necessary to
      // create a new SegmentInfo with a clean state.
      SegmentInfo newSegmentInfo = createNewSegmentInfo(segmentInfo);
      return new NotYetBuiltSegment(
          newSegmentInfo,
          segmentConfig,
          earlybirdSegmentFactory,
          alreadyRetriedCount + 1,
          sync);
    }
  }

  @VisibleForTesting
  boolean segmentExistsOnHdfs() {
    return ArchiveHDFSUtils.hasSegmentIndicesOnHDFS(sync, segmentInfo);
  }
}
@ -1,37 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/twitter/elephantbird:core",
        "3rdparty/jvm/commons-codec",
        "3rdparty/jvm/commons-httpclient",
        "3rdparty/jvm/geo/google:geoGoogle",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "decider/src/main/scala",
        "finagle/finagle-core/src/main",
        "finagle/finagle-thrift/src/main/java",
        "finagle/finagle-thrift/src/main/scala",
        "scrooge/scrooge-core/src/main/scala",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/optional",
        "src/java/com/twitter/search/common/decider",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util:finagleutil",
        "src/java/com/twitter/search/common/util/earlybird",
        "src/java/com/twitter/search/common/util/thrift:thrift-utils",
        "src/java/com/twitter/search/queryparser/query:core-query-nodes",
        "src/thrift/com/twitter/context:twitter-context-scala",
        "src/thrift/com/twitter/search:earlybird-java",
        "src/thrift/com/twitter/search/common:caching-java",
        "src/thrift/com/twitter/search/common:constants-java",
        "src/thrift/com/twitter/search/common:query-java",
        "strato/src/main/scala/com/twitter/strato/opcontext",
        "twitter-context/src/main/scala",
        "util/util-core:scala",
    ],
)
BIN
src/java/com/twitter/search/earlybird/common/BUILD.docx
Normal file
Binary file not shown.
@ -1,120 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class Base64RequestResponseForLogging {
  private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class);
  private static final Logger FAILED_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".FailedRequests");
  private static final Logger RANDOM_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".RandomRequests");
  private static final Logger SLOW_REQUEST_LOG = org.slf4j.LoggerFactory.getLogger(
      Base64RequestResponseForLogging.class.getName() + ".SlowRequests");

  private enum LogType {
    FAILED,
    RANDOM,
    SLOW,
  };

  private final LogType logtype;
  private final String logLine;
  private final EarlybirdRequest request;
  private final EarlybirdResponse response;
  private final Base64 base64 = new Base64();

  // TSerializer is not threadsafe, so create a new one for each request
  private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());

  private Base64RequestResponseForLogging(
      LogType logType, String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    this.logtype = logType;
    this.logLine = logLine;
    this.request = request;
    this.response = response;
  }

  public static Base64RequestResponseForLogging randomRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.RANDOM, logLine, request, response);
  }

  public static Base64RequestResponseForLogging failedRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.FAILED, logLine, request, response);
  }

  public static Base64RequestResponseForLogging slowRequest(
      String logLine, EarlybirdRequest request, EarlybirdResponse response) {
    return new Base64RequestResponseForLogging(LogType.SLOW, logLine, request, response);
  }

  private String asBase64(EarlybirdRequest clearedRequest) {
    try {
      // The purpose of this log is to make it easy to re-issue requests in formz to reproduce
      // issues. If queries are re-issued as is they will be treated as late-arriving queries and
      // dropped due to the clientRequestTimeMs being set to the original query time. For ease of
      // use purposes we clear clientRequestTimeMs and log it out separately for the rare case it
      // is needed.
      clearedRequest.unsetClientRequestTimeMs();
      return base64.encodeToString(serializer.serialize(clearedRequest));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize request for logging.", e);
      return "failed_to_serialize";
    }
  }

  private String asBase64(EarlybirdResponse earlybirdResponse) {
    try {
      return base64.encodeToString(serializer.serialize(earlybirdResponse));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize response for logging.", e);
      return "failed_to_serialize";
    }
  }

  private String getFormattedMessage() {
    String base64Request = asBase64(
        EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request));
    String base64Response = asBase64(response);
    return logLine + ", clientRequestTimeMs: " + request.getClientRequestTimeMs()
        + ", " + base64Request + ", " + base64Response;
  }

  /**
   * Logs the Base64-encoded request and response to the success or failure log.
   */
  public void log() {
    // Do the serializing/concatting this way so it happens on the background thread for
    // async logging
    Object logObject = new Object() {
      @Override
      public String toString() {
        return getFormattedMessage();
      }
    };

    switch (logtype) {
      case FAILED:
        FAILED_REQUEST_LOG.info("{}", logObject);
        break;
      case RANDOM:
        RANDOM_REQUEST_LOG.info("{}", logObject);
        break;
      case SLOW:
        SLOW_REQUEST_LOG.info("{}", logObject);
        break;
      default:
        // Not logging anything for other log types.
        break;
    }
  }
}
Binary file not shown.
@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.concurrent.atomic.AtomicBoolean;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchCustomGauge;

/**
 * A monitor which enforces the condition that a single thread's work is caught up, and allows
 * other threads to wait to be notified when the work is complete. An AtomicBoolean ensures the
 * current status is visible to all threads.
 */
public class CaughtUpMonitor {
  private static final Logger LOG = LoggerFactory.getLogger(CaughtUpMonitor.class);

  protected final AtomicBoolean isCaughtUp = new AtomicBoolean(false);

  public CaughtUpMonitor(String statPrefix) {
    SearchCustomGauge.export(statPrefix + "_is_caught_up", () -> isCaughtUp() ? 1 : 0);
  }

  public boolean isCaughtUp() {
    return isCaughtUp.get();
  }

  /**
   * Set caught up state, and notify waiting threads if caught up.
   */
  public synchronized void setAndNotify(boolean caughtUp) {
    isCaughtUp.set(caughtUp);
    if (caughtUp) {
      // Readers are caught up, notify waiting threads
      notifyAll();
    }
  }

  /**
   * Wait using Object.wait() until caught up or until the thread is interrupted.
   */
  public synchronized void resetAndWaitUntilCaughtUp() {
    LOG.info("Waiting to catch up.");
    // Explicitly set isCaughtUp to false before waiting
    isCaughtUp.set(false);
    try {
      while (!isCaughtUp()) {
        wait();
      }
    } catch (InterruptedException e) {
      LOG.error("{} was interrupted while waiting to catch up", Thread.currentThread());
    }
    LOG.info("Caught up.");
  }
}
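A brief usage sketch of the monitor above (the thread setup and the sleep are hypothetical, not from the original sources): one thread blocks in resetAndWaitUntilCaughtUp until a worker calls setAndNotify(true). Note that the waiter is expected to reset before the worker signals; the sleep below keeps that ordering simple for the demo.

public class CaughtUpMonitorExample {
  public static void main(String[] args) throws InterruptedException {
    CaughtUpMonitor monitor = new CaughtUpMonitor("indexing"); // exports indexing_is_caught_up

    // Worker thread: simulates catching up on a backlog, then notifies any waiters.
    Thread worker = new Thread(() -> {
      try {
        Thread.sleep(1000); // stand-in for real indexing work
      } catch (InterruptedException e) {
        Thread.currentThread().interrupt();
      }
      monitor.setAndNotify(true);
    }, "worker");
    worker.start();

    // Main thread blocks here until the worker reports that it has caught up.
    monitor.resetAndWaitUntilCaughtUp();
    worker.join();
  }
}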
BIN
src/java/com/twitter/search/earlybird/common/ClientIdUtil.docx
Normal file
Binary file not shown.
@ -1,85 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.Optional;

import com.twitter.common.optional.Optionals;
import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.strato.opcontext.Attribution;
import com.twitter.strato.opcontext.HttpEndpoint;

public final class ClientIdUtil {
  // Blenders should always set the EarlybirdRequest.clientId field. It should be set to the Finagle
  // client ID of the client that caused the blender to send this request to the roots. If the
  // Finagle ID of the blender's client cannot be determined, it will be set to "unknown" (see
  // com.twitter.search.common.util.FinagleUtil.UNKNOWN_CLIENT_NAME). However, other services that
  // send requests to roots might not set EarlybirdRequest.clientId.
  //
  // So an "unset" clientId means: EarlybirdRequest.clientId was null.
  // An "unknown" clientId means: the client that sent us the request
  // tried setting EarlybirdRequest.clientId, but couldn't figure out a good value for it.
  public static final String UNSET_CLIENT_ID = "unset";

  private static final String CLIENT_ID_FOR_UNKNOWN_CLIENTS = "unknown_client_id";

  private static final String CLIENT_ID_PREFIX = "client_id_";

  private static final String FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN =
      "finagle_id_%s_and_client_id_%s";

  private static final String CLIENT_ID_AND_REQUEST_TYPE = "client_id_%s_and_type_%s";

  private ClientIdUtil() {
  }

  /** Returns the ID of the client that initiated this request or UNSET_CLIENT_ID if not set. */
  public static String getClientIdFromRequest(EarlybirdRequest request) {
    return Optional
        .ofNullable(request.getClientId())
        .map(String::toLowerCase)
        .orElse(UNSET_CLIENT_ID);
  }

  /**
   * Returns the Strato http endpoint attribution as an Optional.
   */
  public static Optional<String> getClientIdFromHttpEndpointAttribution() {
    return Optionals
        .optional(Attribution.httpEndpoint())
        .map(HttpEndpoint::name)
        .map(String::toLowerCase);
  }

  /** Formats the given clientId into a string that can be used for stats. */
  public static String formatClientId(String clientId) {
    return CLIENT_ID_PREFIX + clientId;
  }

  /**
   * Formats the given Finagle clientId and the given clientId into a single string that can be used
   * for stats, or other purposes where the two IDs need to be combined.
   */
  public static String formatFinagleClientIdAndClientId(String finagleClientId, String clientId) {
    return String.format(FINAGLE_CLIENT_ID_AND_CLIENT_ID_PATTERN, finagleClientId, clientId);
  }

  /**
   * Formats the given clientId and requestType into a single string that can be used
   * for stats or other purposes.
   */
  public static String formatClientIdAndRequestType(
      String clientId, String requestType) {
    return String.format(CLIENT_ID_AND_REQUEST_TYPE, clientId, requestType);
  }

  /**
   * Format the quota client id
   */
  public static String getQuotaClientId(String clientId) {
    if (FinagleUtil.UNKNOWN_CLIENT_NAME.equals(clientId) || UNSET_CLIENT_ID.equals(clientId)) {
      return CLIENT_ID_FOR_UNKNOWN_CLIENTS;
    }

    return clientId;
  }
}
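A short sketch of the stat-key formats produced by the helpers above; the "macaw-search" and "blender" values are hypothetical callers used only for illustration.

public class ClientIdUtilExample {
  public static void main(String[] args) {
    String clientId = "macaw-search"; // hypothetical caller
    System.out.println(ClientIdUtil.formatClientId(clientId));
    // client_id_macaw-search
    System.out.println(ClientIdUtil.formatClientIdAndRequestType(clientId, "search"));
    // client_id_macaw-search_and_type_search
    System.out.println(ClientIdUtil.formatFinagleClientIdAndClientId("blender", clientId));
    // finagle_id_blender_and_client_id_macaw-search
    System.out.println(ClientIdUtil.getQuotaClientId("unset"));
    // unknown_client_id
  }
}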
Binary file not shown.
@ -1,365 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.EnumMap;
import java.util.Map;

import scala.Option;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Maps;

import com.twitter.context.TwitterContext;
import com.twitter.context.thriftscala.Viewer;
import com.twitter.decider.Decider;
import com.twitter.finagle.thrift.ClientId;
import com.twitter.finagle.thrift.ClientId$;
import com.twitter.search.TwitterContextPermit;
import com.twitter.search.common.constants.thriftjava.ThriftQuerySource;
import com.twitter.search.common.decider.DeciderUtil;
import com.twitter.search.common.logging.RPCLogger;
import com.twitter.search.common.metrics.FailureRatioCounter;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.common.util.earlybird.TermStatisticsUtil;
import com.twitter.search.common.util.earlybird.ThriftSearchResultUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;
import com.twitter.search.earlybird.thrift.ThriftFacetFieldRequest;
import com.twitter.search.earlybird.thrift.ThriftHistogramSettings;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftTermStatisticsRequest;

import static com.twitter.search.common.util.earlybird.EarlybirdResponseUtil.responseConsideredFailed;

public class EarlybirdRequestLogger extends RPCLogger {
  protected enum ExtraFields {
    QUERY_MAX_HITS_TO_PROCESS,
    COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
    RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
    NUM_HITS_PROCESSED,
    QUERY_COST,
    CPU_TOTAL,
    QUERY_SOURCE,
    CLIENT_ID,
    FINAGLE_CLIENT_ID
  }

  protected enum ShardOnlyExtraFields {
    NUM_SEARCHED_SEGMENTS,
    SCORING_TIME_NANOS
  }

  protected enum RootOnlyExtraFields {
    CACHING_ALLOWED,
    DEBUG_MODE,
    CACHE_HIT,
    USER_AGENT,
    // See JIRA APPSEC-2303 for IP addresses logging
  }

  private static final String LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY =
      "log_full_request_details_on_error";
  private static final String LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
      "log_full_request_details_random_fraction";
  private static final String LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY =
      "log_full_slow_request_details_random_fraction";
  private static final String SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY =
      "slow_request_latency_threshold_ms";

  private final Decider decider;
  private final boolean enableLogUnknownClientRequests;

  private static final Map<ThriftQuerySource, FailureRatioCounter>
      FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE = preBuildFailureRatioCounters();
  private static final FailureRatioCounter NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER =
      new FailureRatioCounter("earlybird_logger", "query_source", "not_set");

  static EarlybirdRequestLogger buildForRoot(
      String loggerName, int latencyWarnThreshold, Decider decider) {

    return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
        decider, true, RPCLogger.Fields.values(), ExtraFields.values(),
        RootOnlyExtraFields.values());
  }

  static EarlybirdRequestLogger buildForShard(
      String loggerName, int latencyWarnThreshold, Decider decider) {

    return new EarlybirdRequestLogger(loggerName, latencyWarnThreshold,
        decider, false, RPCLogger.Fields.values(), ExtraFields.values(),
        ShardOnlyExtraFields.values());
  }

  @VisibleForTesting
  EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider) {
    this(loggerName, latencyWarnThreshold, decider, false, RPCLogger.Fields.values(),
        ExtraFields.values(), RootOnlyExtraFields.values(), ShardOnlyExtraFields.values());
  }

  private EarlybirdRequestLogger(String loggerName, int latencyWarnThreshold, Decider decider,
      boolean enableLogUnknownClientRequests, Enum[]... fieldEnums) {
    super(loggerName, fieldEnums);
    this.decider = decider;
    this.enableLogUnknownClientRequests = enableLogUnknownClientRequests;
    setLatencyWarnThreshold(latencyWarnThreshold);
  }

  /**
   * Logs the given earlybird request and response.
   *
   * @param request The earlybird request.
   * @param response The earlybird response.
   * @param timer The time it took to process this request.
   */
  public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
    try {
      LogEntry entry = newLogEntry();

      setRequestLogEntries(entry, request);
      setResponseLogEntries(entry, response);
      if (timer != null) {
        entry.setField(ExtraFields.CPU_TOTAL, Long.toString(timer.getElapsedCpuTotal()));
      }

      boolean wasError = response != null && responseConsideredFailed(response.getResponseCode());

      long responseTime = response != null ? response.getResponseTime() : 0L;

      String logLine = writeLogLine(entry, responseTime, wasError);

      // This code path is called for pre/post logging.
      // Prevent the same request showing up twice by only logging on post logging.
      if (response != null && DeciderUtil.isAvailableForRandomRecipient(
          decider, LOG_FULL_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
        Base64RequestResponseForLogging.randomRequest(logLine, request, response).log();
      }

      // Unknown client request logging only applies to pre-logging.
      if (enableLogUnknownClientRequests && response == null) {
        UnknownClientRequestForLogging unknownClientRequestLogger =
            UnknownClientRequestForLogging.unknownClientRequest(logLine, request);
        if (unknownClientRequestLogger != null) {
          unknownClientRequestLogger.log();
        }
      }

      if (wasError
          && DeciderUtil.isAvailableForRandomRecipient(
              decider, LOG_FULL_REQUEST_DETAILS_ON_ERROR_DECIDER_KEY)) {
        new RequestResponseForLogging(request, response).logFailedRequest();
        Base64RequestResponseForLogging.failedRequest(logLine, request, response).log();
      }

      boolean wasSlow = response != null
          && responseTime >= DeciderUtil.getAvailability(
              decider, SLOW_REQUEST_LATENCY_THRESHOLD_MS_DECIDER_KEY);
      if (wasSlow
          && DeciderUtil.isAvailableForRandomRecipient(
              decider, LOG_FULL_SLOW_REQUEST_DETAILS_RANDOM_FRACTION_DECIDER_KEY)) {
        Base64RequestResponseForLogging.slowRequest(logLine, request, response).log();
      }

      FailureRatioCounter failureRatioCounter =
          FAILURE_RATIO_COUNTER_BY_QUERY_SOURCE.get(request.getQuerySource());
      if (failureRatioCounter != null) {
        failureRatioCounter.requestFinished(!wasError);
      } else {
        NO_QUERY_SOURCE_FAILURE_RATIO_COUNTER.requestFinished(!wasError);
      }

    } catch (Exception e) {
      LOG.error("Exception building log entry ", e);
    }
  }

  private void setRequestLogEntries(LogEntry entry, EarlybirdRequest request) {
    entry.setField(Fields.CLIENT_HOST, request.getClientHost());
    entry.setField(Fields.CLIENT_REQUEST_ID, request.getClientRequestID());
    entry.setField(Fields.REQUEST_TYPE, requestTypeForLog(request));

    if (request.isSetSearchQuery()) {
      ThriftSearchQuery searchQuery = request.getSearchQuery();
      entry.setField(Fields.QUERY, searchQuery.getSerializedQuery());

      if (searchQuery.isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.QUERY_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getMaxHitsToProcess()));
      }

      if (searchQuery.isSetCollectorParams()
          && searchQuery.getCollectorParams().isSetTerminationParams()
          && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getCollectorParams().getTerminationParams()
                .getMaxHitsToProcess()));
      }

      if (searchQuery.isSetRelevanceOptions()
          && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
        entry.setField(ExtraFields.RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS,
            Integer.toString(searchQuery.getRelevanceOptions().getMaxHitsToProcess()));
      }
    }

    entry.setField(Fields.NUM_REQUESTED, Integer.toString(numRequestedForLog(request)));

    if (request.isSetQuerySource()) {
      entry.setField(ExtraFields.QUERY_SOURCE, request.getQuerySource().name());
    }

    if (request.isSetClientId()) {
      entry.setField(ExtraFields.CLIENT_ID, request.getClientId());
    }

    entry.setField(RootOnlyExtraFields.CACHING_ALLOWED,
        Boolean.toString(EarlybirdRequestUtil.isCachingAllowed(request)));

    entry.setField(RootOnlyExtraFields.DEBUG_MODE, Byte.toString(request.getDebugMode()));

    Option<ClientId> clientIdOption = ClientId$.MODULE$.current();
    if (clientIdOption.isDefined()) {
      entry.setField(ExtraFields.FINAGLE_CLIENT_ID, clientIdOption.get().name());
    }

    setLogEntriesFromTwitterContext(entry);
  }

  @VisibleForTesting
  Option<Viewer> getTwitterContext() {
    return TwitterContext.acquire(TwitterContextPermit.get()).apply();
  }

  private void setLogEntriesFromTwitterContext(LogEntry entry) {
    Option<Viewer> viewerOption = getTwitterContext();
    if (viewerOption.nonEmpty()) {
      Viewer viewer = viewerOption.get();

      if (viewer.userAgent().nonEmpty()) {
        String userAgent = viewer.userAgent().get();

        // We only replace the comma in the user-agent with %2C to make it easily parseable,
        // especially with command line tools like cut/sed/awk.
        userAgent = userAgent.replace(",", "%2C");

        entry.setField(RootOnlyExtraFields.USER_AGENT, userAgent);
      }
    }
  }

  private void setResponseLogEntries(LogEntry entry, EarlybirdResponse response) {
    if (response != null) {
      entry.setField(Fields.NUM_RETURNED, Integer.toString(numResultsForLog(response)));
      entry.setField(Fields.RESPONSE_CODE, String.valueOf(response.getResponseCode()));
      entry.setField(Fields.RESPONSE_TIME_MICROS, Long.toString(response.getResponseTimeMicros()));
      if (response.isSetSearchResults()) {
        entry.setField(ExtraFields.NUM_HITS_PROCESSED,
            Integer.toString(response.getSearchResults().getNumHitsProcessed()));
        entry.setField(ExtraFields.QUERY_COST,
            Double.toString(response.getSearchResults().getQueryCost()));
        if (response.getSearchResults().isSetScoringTimeNanos()) {
          entry.setField(ShardOnlyExtraFields.SCORING_TIME_NANOS,
              Long.toString(response.getSearchResults().getScoringTimeNanos()));
        }
      }
      if (response.isSetCacheHit()) {
        entry.setField(RootOnlyExtraFields.CACHE_HIT, String.valueOf(response.isCacheHit()));
      }
      if (response.isSetNumSearchedSegments()) {
        entry.setField(ShardOnlyExtraFields.NUM_SEARCHED_SEGMENTS,
            Integer.toString(response.getNumSearchedSegments()));
      }
    }
  }

  private static int numRequestedForLog(EarlybirdRequest request) {
    int num = 0;
    if (request.isSetFacetRequest() && request.getFacetRequest().isSetFacetFields()) {
      for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
        num += field.getNumResults();
      }
    } else if (request.isSetTermStatisticsRequest()) {
      num = request.getTermStatisticsRequest().getTermRequestsSize();
    } else if (request.isSetSearchQuery()) {
      num = request.getSearchQuery().isSetCollectorParams()
          ? request.getSearchQuery().getCollectorParams().getNumResultsToReturn() : 0;
      if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
        num = Math.max(num, request.getSearchQuery().getSearchStatusIdsSize());
      }
    }
    return num;
  }

  /**
   * Returns the number of results in the given response. If the response is a term stats response,
   * then the returned value will be the number of term results. If the response is a facet
   * response, then the returned value will be the number of facet results. Otherwise, the returned
   * value will be the number of search results.
   */
  public static int numResultsForLog(EarlybirdResponse response) {
    if (response == null) {
      return 0;
    } else if (response.isSetFacetResults()) {
      return ThriftSearchResultUtil.numFacetResults(response.getFacetResults());
    } else if (response.isSetTermStatisticsResults()) {
      return response.getTermStatisticsResults().getTermResultsSize();
    } else {
      return ThriftSearchResultUtil.numResults(response.getSearchResults());
    }
  }

  private static String requestTypeForLog(EarlybirdRequest request) {
    StringBuilder requestType = new StringBuilder(64);
    if (request.isSetFacetRequest()) {
      requestType.append("FACETS");
      int numFields = request.getFacetRequest().getFacetFieldsSize();
if (numFields > 0) {
|
|
||||||
// For 1 or 2 fields, just put them in the request type. For more, just log the number.
|
|
||||||
if (numFields <= 2) {
|
|
||||||
for (ThriftFacetFieldRequest field : request.getFacetRequest().getFacetFields()) {
|
|
||||||
requestType.append(":").append(field.getFieldName().toUpperCase());
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
requestType.append(":MULTI-").append(numFields);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else if (request.isSetTermStatisticsRequest()) {
|
|
||||||
ThriftTermStatisticsRequest termStatsRequest = request.getTermStatisticsRequest();
|
|
||||||
requestType.append("TERMSTATS-")
|
|
||||||
.append(termStatsRequest.getTermRequestsSize());
|
|
||||||
|
|
||||||
ThriftHistogramSettings histoSettings = termStatsRequest.getHistogramSettings();
|
|
||||||
if (histoSettings != null) {
|
|
||||||
String binSizeVal = String.valueOf(TermStatisticsUtil.determineBinSize(histoSettings));
|
|
||||||
String numBinsVal = String.valueOf(histoSettings.getNumBins());
|
|
||||||
requestType.append(":NUMBINS-").append(numBinsVal).append(":BINSIZE-").append(binSizeVal);
|
|
||||||
}
|
|
||||||
} else if (request.isSetSearchQuery()) {
|
|
||||||
requestType.append("SEARCH:");
|
|
||||||
requestType.append(request.getSearchQuery().getRankingMode().name());
|
|
||||||
// Denote when a from user id is present.
|
|
||||||
if (request.getSearchQuery().isSetFromUserIDFilter64()) {
|
|
||||||
requestType.append(":NETWORK-")
|
|
||||||
.append(request.getSearchQuery().getFromUserIDFilter64Size());
|
|
||||||
}
|
|
||||||
// Denote when required status ids are present.
|
|
||||||
if (request.getSearchQuery().getSearchStatusIdsSize() > 0) {
|
|
||||||
requestType.append(":IDS-").append(request.getSearchQuery().getSearchStatusIdsSize());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return requestType.toString();
|
|
||||||
}
|
|
||||||
|
|
||||||
private static Map<ThriftQuerySource, FailureRatioCounter> preBuildFailureRatioCounters() {
|
|
||||||
Map<ThriftQuerySource, FailureRatioCounter> counterByQuerySource =
|
|
||||||
new EnumMap<>(ThriftQuerySource.class);
|
|
||||||
|
|
||||||
for (ThriftQuerySource thriftQuerySource : ThriftQuerySource.values()) {
|
|
||||||
FailureRatioCounter counter = new FailureRatioCounter("earlybird_logger", "query_source",
|
|
||||||
thriftQuerySource.toString());
|
|
||||||
counterByQuerySource.put(thriftQuerySource, counter);
|
|
||||||
}
|
|
||||||
|
|
||||||
return Maps.immutableEnumMap(counterByQuerySource);
|
|
||||||
}
|
|
||||||
}
@@ -1,37 +0,0 @@
package com.twitter.search.earlybird.common;

import com.twitter.decider.Decider;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class EarlybirdRequestPostLogger {
  private final EarlybirdRequestLogger logger;

  public static EarlybirdRequestPostLogger buildForRoot(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
        EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPostLogger(requestLogger);
  }

  public static EarlybirdRequestPostLogger buildForShard(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
        EarlybirdRequestPostLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPostLogger(requestLogger);
  }

  private EarlybirdRequestPostLogger(EarlybirdRequestLogger logger) {
    this.logger = logger;
  }

  public void logRequest(EarlybirdRequest request, EarlybirdResponse response, Timer timer) {
    EarlybirdRequestUtil.updateHitsCounters(request);
    logger.logRequest(request, response, timer);
  }
}
@@ -1,32 +0,0 @@
package com.twitter.search.earlybird.common;

import com.twitter.decider.Decider;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

public final class EarlybirdRequestPreLogger {
  private final EarlybirdRequestLogger logger;

  public static EarlybirdRequestPreLogger buildForRoot(Decider decider) {
    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForRoot(
        EarlybirdRequestPreLogger.class.getName(), Integer.MAX_VALUE, decider);

    return new EarlybirdRequestPreLogger(requestLogger);
  }

  public static EarlybirdRequestPreLogger buildForShard(
      int latencyWarnThreshold, Decider decider) {

    EarlybirdRequestLogger requestLogger = EarlybirdRequestLogger.buildForShard(
        EarlybirdRequestPreLogger.class.getName(), latencyWarnThreshold, decider);

    return new EarlybirdRequestPreLogger(requestLogger);
  }

  private EarlybirdRequestPreLogger(EarlybirdRequestLogger logger) {
    this.logger = logger;
  }

  public void logRequest(EarlybirdRequest request) {
    logger.logRequest(request, null, null);
  }
}
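
As an illustration of how the pre- and post-loggers above are typically wired together, here is a minimal sketch. It is not part of the deleted files; the surrounding class, the 2000 ms latency-warn threshold, and the placeholder handle() body are assumptions, and it presumes the Earlybird thrift classes and Decider are on the classpath.

import com.twitter.decider.Decider;
import com.twitter.search.common.metrics.Timer;
import com.twitter.search.earlybird.common.EarlybirdRequestPostLogger;
import com.twitter.search.earlybird.common.EarlybirdRequestPreLogger;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public final class RequestLoggingSketch {
  private final EarlybirdRequestPreLogger preLogger;
  private final EarlybirdRequestPostLogger postLogger;

  public RequestLoggingSketch(Decider decider) {
    // A root server logs every incoming request up front, and logs it again together with the
    // response and timing once the request has been served. The 2000 ms threshold is illustrative.
    this.preLogger = EarlybirdRequestPreLogger.buildForRoot(decider);
    this.postLogger = EarlybirdRequestPostLogger.buildForRoot(2000, decider);
  }

  public EarlybirdResponse serve(EarlybirdRequest request, Timer timer) {
    preLogger.logRequest(request);
    EarlybirdResponse response = handle(request); // application-specific handling, not shown
    postLogger.logRequest(request, response, timer);
    return response;
  }

  private EarlybirdResponse handle(EarlybirdRequest request) {
    return new EarlybirdResponse(); // placeholder response for the sketch
  }
}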
@@ -1,244 +0,0 @@
package com.twitter.search.earlybird.common;

import java.util.concurrent.TimeUnit;

import com.google.common.annotations.VisibleForTesting;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchMovingAverage;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.query.thriftjava.CollectorParams;
import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;

public final class EarlybirdRequestUtil {
  // This logger is set up to log to a separate set of log files (request_info) and use an
  // async logger so as to not block the searcher thread. See search/earlybird/config/log4j.xml
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdRequestUtil.class);

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_NUM_RESULTS_STAT =
      SearchMovingAverage.export("requested_num_results");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_max_hits_to_process");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_collector_params_max_hits_to_process");

  @VisibleForTesting
  static final SearchMovingAverage REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT =
      SearchMovingAverage.export("requested_relevance_options_max_hits_to_process");

  @VisibleForTesting
  static final SearchCounter REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT =
      SearchCounter.export("requested_max_hits_to_process_are_different");

  private static final SearchRateCounter REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT =
      SearchRateCounter.export("request_with_more_than_2k_num_result");
  private static final SearchRateCounter REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT =
      SearchRateCounter.export("request_with_more_than_4k_num_result");

  // Stats for tracking clock skew between earlybird and the client-specified request timestamp.
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_ABS =
      SearchTimerStats.export("client_clock_diff_abs", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_POS =
      SearchTimerStats.export("client_clock_diff_pos", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchTimerStats CLIENT_CLOCK_DIFF_NEG =
      SearchTimerStats.export("client_clock_diff_neg", TimeUnit.MILLISECONDS, false, true);
  @VisibleForTesting
  public static final SearchRateCounter CLIENT_CLOCK_DIFF_MISSING =
      SearchRateCounter.export("client_clock_diff_missing");

  private static final int MAX_NUM_RESULTS = 4000;
  private static final int OLD_MAX_NUM_RESULTS = 2000;

  private EarlybirdRequestUtil() {
  }

  /**
   * Logs and fixes some potentially excessive values in the given request.
   */
  public static void logAndFixExcessiveValues(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    if (searchQuery != null) {
      int maxHitsToProcess = 0;
      int numResultsToReturn = 0;

      if (searchQuery.isSetCollectorParams()) {
        numResultsToReturn = searchQuery.getCollectorParams().getNumResultsToReturn();

        if (searchQuery.getCollectorParams().isSetTerminationParams()) {
          maxHitsToProcess =
              searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
        }
      }

      if (maxHitsToProcess > 50000) {
        LOG.warn("Excessive max hits in " + request.toString());
      }

      // We used to limit number of results to 2000. These two counters help us track if we receive
      // too many requests with large number of results set.
      String warningMessageTemplate = "Exceed %d num result in %s";
      if (numResultsToReturn > MAX_NUM_RESULTS) {
        LOG.warn(String.format(warningMessageTemplate, MAX_NUM_RESULTS, request.toString()));
        REQUEST_WITH_MORE_THAN_4K_NUM_RESULTS_STAT.increment();
        searchQuery.getCollectorParams().setNumResultsToReturn(MAX_NUM_RESULTS);
      } else if (numResultsToReturn > OLD_MAX_NUM_RESULTS) {
        LOG.warn(String.format(warningMessageTemplate, OLD_MAX_NUM_RESULTS, request.toString()));
        REQUEST_WITH_MORE_THAN_2K_NUM_RESULTS_STAT.increment();
      }

      ThriftSearchRelevanceOptions options = searchQuery.getRelevanceOptions();
      if (options != null) {
        if (options.getMaxHitsToProcess() > 50000) {
          LOG.warn("Excessive max hits in " + request.toString());
        }
      }
    }
  }

  /**
   * Sets {@code request.searchQuery.collectorParams} if they are not already set.
   */
  public static void checkAndSetCollectorParams(EarlybirdRequest request) {
    ThriftSearchQuery searchQuery = request.getSearchQuery();
    if (searchQuery == null) {
      return;
    }

    if (!searchQuery.isSetCollectorParams()) {
      searchQuery.setCollectorParams(new CollectorParams());
    }
    if (!searchQuery.getCollectorParams().isSetNumResultsToReturn()) {
      searchQuery.getCollectorParams().setNumResultsToReturn(searchQuery.getNumResults());
    }
    if (!searchQuery.getCollectorParams().isSetTerminationParams()) {
      CollectorTerminationParams terminationParams = new CollectorTerminationParams();
      if (request.isSetTimeoutMs()) {
        terminationParams.setTimeoutMs(request.getTimeoutMs());
      }
      if (request.isSetMaxQueryCost()) {
        terminationParams.setMaxQueryCost(request.getMaxQueryCost());
      }
      searchQuery.getCollectorParams().setTerminationParams(terminationParams);
    }
    setMaxHitsToProcess(searchQuery);
  }

  // Earlybirds will only look for maxHitsToProcess in CollectorParameters.TerminationParameters.
  // The priority for setting CollectorParameters.TerminationParameters.maxHitsToProcess is:
  // 1. Collector parameters
  // 2. RelevanceParameters
  // 3. ThriftQuery.maxHitsToProcess
  private static void setMaxHitsToProcess(ThriftSearchQuery thriftSearchQuery) {
    CollectorTerminationParams terminationParams = thriftSearchQuery
        .getCollectorParams().getTerminationParams();
    if (!terminationParams.isSetMaxHitsToProcess()) {
      if (thriftSearchQuery.isSetRelevanceOptions()
          && thriftSearchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
        terminationParams.setMaxHitsToProcess(
            thriftSearchQuery.getRelevanceOptions().getMaxHitsToProcess());
      } else {
        terminationParams.setMaxHitsToProcess(thriftSearchQuery.getMaxHitsToProcess());
      }
    }
  }

  /**
   * Creates a copy of the given request and unsets the binary fields to make the logged line for
   * this request look nicer.
   */
  public static EarlybirdRequest copyAndClearUnnecessaryValuesForLogging(EarlybirdRequest request) {
    EarlybirdRequest copiedRequest = request.deepCopy();

    if (copiedRequest.isSetSearchQuery()) {
      // These fields are very large and the binary data doesn't play well with formz
      copiedRequest.getSearchQuery().unsetTrustedFilter();
      copiedRequest.getSearchQuery().unsetDirectFollowFilter();
    }

    return copiedRequest;
  }

  /**
   * Updates some hit-related stats based on the parameters in the given request.
   */
  public static void updateHitsCounters(EarlybirdRequest request) {
    if ((request == null) || !request.isSetSearchQuery()) {
      return;
    }

    ThriftSearchQuery searchQuery = request.getSearchQuery();

    if (searchQuery.isSetNumResults()) {
      REQUESTED_NUM_RESULTS_STAT.addSample(searchQuery.getNumResults());
    }

    if (searchQuery.isSetMaxHitsToProcess()) {
      REQUESTED_MAX_HITS_TO_PROCESS_STAT.addSample(searchQuery.getMaxHitsToProcess());
    }

    Integer collectorParamsMaxHitsToProcess = null;
    if (searchQuery.isSetCollectorParams()
        && searchQuery.getCollectorParams().isSetTerminationParams()
        && searchQuery.getCollectorParams().getTerminationParams().isSetMaxHitsToProcess()) {
      collectorParamsMaxHitsToProcess =
          searchQuery.getCollectorParams().getTerminationParams().getMaxHitsToProcess();
      REQUESTED_COLLECTOR_PARAMS_MAX_HITS_TO_PROCESS_STAT
          .addSample(collectorParamsMaxHitsToProcess);
    }

    Integer relevanceOptionsMaxHitsToProcess = null;
    if (searchQuery.isSetRelevanceOptions()
        && searchQuery.getRelevanceOptions().isSetMaxHitsToProcess()) {
      relevanceOptionsMaxHitsToProcess = searchQuery.getRelevanceOptions().getMaxHitsToProcess();
      REQUESTED_RELEVANCE_OPTIONS_MAX_HITS_TO_PROCESS_STAT
          .addSample(relevanceOptionsMaxHitsToProcess);
    }

    if ((collectorParamsMaxHitsToProcess != null)
        && (relevanceOptionsMaxHitsToProcess != null)
        && (collectorParamsMaxHitsToProcess != relevanceOptionsMaxHitsToProcess)) {
      REQUESTED_MAX_HITS_TO_PROCESS_ARE_DIFFERENT_STAT.increment();
    }
  }

  public static boolean isCachingAllowed(EarlybirdRequest request) {
    return !request.isSetCachingParams() || request.getCachingParams().isCache();
  }

  /**
   * Track the clock difference between this server and its client's specified request time.
   * When there is no clock drift between machines, this will record the inflight time between this
   * server and the client.
   *
   * @param request the incoming earlybird request.
   */
  public static void recordClientClockDiff(EarlybirdRequest request) {
    if (request.isSetClientRequestTimeMs()) {
      final long timeDiff = System.currentTimeMillis() - request.getClientRequestTimeMs();
      final long timeDiffAbs = Math.abs(timeDiff);
      if (timeDiff >= 0) {
        CLIENT_CLOCK_DIFF_POS.timerIncrement(timeDiffAbs);
      } else {
        CLIENT_CLOCK_DIFF_NEG.timerIncrement(timeDiffAbs);
      }
      CLIENT_CLOCK_DIFF_ABS.timerIncrement(timeDiffAbs);
    } else {
      CLIENT_CLOCK_DIFF_MISSING.increment();
    }
  }
}
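
The comment on setMaxHitsToProcess above describes a priority order: collector params, then relevance options, then the query-level field. As an illustration (not from the deleted file), here is a minimal sketch of that defaulting behavior. The class name and the literal values are assumed, and it presumes the usual Thrift-generated setters for these request classes.

import com.twitter.search.common.query.thriftjava.CollectorTerminationParams;
import com.twitter.search.earlybird.common.EarlybirdRequestUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.ThriftSearchQuery;
import com.twitter.search.earlybird.thrift.ThriftSearchRelevanceOptions;

public final class CollectorParamsDefaultingSketch {
  public static void main(String[] args) {
    // A query with no collector params set, only relevance options and a result count.
    ThriftSearchQuery searchQuery = new ThriftSearchQuery();
    searchQuery.setNumResults(20);
    searchQuery.setRelevanceOptions(new ThriftSearchRelevanceOptions().setMaxHitsToProcess(1000));

    EarlybirdRequest request = new EarlybirdRequest();
    request.setSearchQuery(searchQuery);
    request.setTimeoutMs(800);

    // Fills in collectorParams, numResultsToReturn, terminationParams and maxHitsToProcess.
    EarlybirdRequestUtil.checkAndSetCollectorParams(request);

    CollectorTerminationParams terminationParams =
        searchQuery.getCollectorParams().getTerminationParams();
    // maxHitsToProcess was not set on the collector params, so the relevance-options value wins.
    System.out.println(terminationParams.getMaxHitsToProcess()); // expected: 1000
  }
}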
@@ -1,28 +0,0 @@
package com.twitter.search.earlybird.common;

import javax.inject.Inject;
import javax.inject.Singleton;

import org.apache.thrift.protocol.TProtocolFactory;

import com.twitter.finagle.Service;
import com.twitter.search.common.util.thrift.ThriftToBytesFilter;
import com.twitter.search.earlybird.thrift.EarlybirdService;

@Singleton
public class EarlybirdThriftBackend extends EarlybirdService.ServiceToClient {

  /**
   * Wraps the byte service back into an EarlybirdService.ServiceToClient, which
   * is an EarlybirdService.ServiceIface again.
   */
  @Inject
  public EarlybirdThriftBackend(
      ThriftToBytesFilter thriftToBytesFilter,
      Service<byte[], byte[]> byteService,
      TProtocolFactory protocolFactory) {

    super(thriftToBytesFilter.andThen(byteService), protocolFactory);
  }
}
@@ -1,34 +0,0 @@
package com.twitter.search.earlybird.common;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchRateCounter;

/**
 * When incremented, a non-paging alert will be triggered. Use this to assert for bad conditions
 * that should generally never happen.
 */
public class NonPagingAssert {
  private static final Logger LOG = LoggerFactory.getLogger(NonPagingAssert.class);

  private static final String ASSERT_STAT_PREFIX = "non_paging_assert_";

  private final String name;
  private final SearchRateCounter assertCounter;

  public NonPagingAssert(String name) {
    this.name = name;
    this.assertCounter = SearchRateCounter.export(ASSERT_STAT_PREFIX + name);
  }

  public void assertFailed() {
    LOG.error("NonPagingAssert failed: {}", name);
    assertCounter.increment();
  }

  public static void assertFailed(String name) {
    NonPagingAssert nonPagingAssert = new NonPagingAssert(name);
    nonPagingAssert.assertFailed();
  }
}
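
A minimal usage sketch for NonPagingAssert (not from the deleted files; the assert name and the surrounding class are hypothetical): callers export one named counter up front and bump it whenever the "should never happen" condition is observed, so a non-paging alert can be attached to the exported "non_paging_assert_*" stat.

import com.twitter.search.earlybird.common.NonPagingAssert;

public final class NonPagingAssertUsageSketch {
  // Exported as "non_paging_assert_negative_tweet_id"; the name is illustrative.
  private static final NonPagingAssert NEGATIVE_TWEET_ID_ASSERT =
      new NonPagingAssert("negative_tweet_id");

  public static void recordTweetId(long tweetId) {
    if (tweetId < 0) {
      // Logs an error and increments the rate counter; no page is triggered.
      NEGATIVE_TWEET_ID_ASSERT.assertFailed();
      return;
    }
    // ... normal handling of a valid tweet ID ...
  }

  public static void main(String[] args) {
    recordTweetId(-1L);
  }
}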
@@ -1,55 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TSimpleJSONProtocol;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponseForLogging {
  private static final Logger LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class);

  private static final Logger FAILED_REQUEST_LOG = LoggerFactory.getLogger(
      RequestResponseForLogging.class.getName() + ".FailedRequests");

  private final EarlybirdRequest request;
  private final EarlybirdResponse response;

  public RequestResponseForLogging(EarlybirdRequest request,
      EarlybirdResponse response) {
    this.request = request;
    this.response = response;
  }

  private String serialize(EarlybirdRequest clearedRequest, EarlybirdResponse theResponse) {
    TSerializer serializer = new TSerializer(new TSimpleJSONProtocol.Factory());
    try {
      String requestJson = serializer.toString(clearedRequest);
      String responseJson = serializer.toString(theResponse);
      return "{\"request\":" + requestJson + ", \"response\":" + responseJson + "}";
    } catch (TException e) {
      LOG.error("Failed to serialize request/response for logging.", e);
      return "";
    }
  }

  /**
   * Logs the request and response stored in this instance to the failure log file.
   */
  public void logFailedRequest() {
    // Do the serializing/concatenating this way so it happens on the background thread for
    // async logging.
    FAILED_REQUEST_LOG.info("{}", new Object() {
      @Override
      public String toString() {
        return serialize(
            EarlybirdRequestUtil.copyAndClearUnnecessaryValuesForLogging(request), response);
      }
    });
  }
}
@@ -1,44 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.lucene.search.Query;

import com.twitter.search.earlybird.thrift.EarlybirdRequest;
import com.twitter.search.earlybird.thrift.EarlybirdResponse;

public class RequestResponsePair {
  private final EarlybirdRequest request;
  private final EarlybirdResponse response;
  private final org.apache.lucene.search.Query luceneQuery;

  // The serialized query in its final form, after various modifications have been applied to it.
  // As a note, we have some code paths in which this can be null, but I don't really see them
  // triggered in production right now.
  private final com.twitter.search.queryparser.query.Query finalSerializedQuery;

  public RequestResponsePair(
      EarlybirdRequest request,
      com.twitter.search.queryparser.query.Query finalSerializedQuery,
      org.apache.lucene.search.Query luceneQuery,
      EarlybirdResponse response) {
    this.request = request;
    this.luceneQuery = luceneQuery;
    this.response = response;
    this.finalSerializedQuery = finalSerializedQuery;
  }

  public String getFinalSerializedQuery() {
    return finalSerializedQuery != null ? finalSerializedQuery.serialize() : "N/A";
  }

  public EarlybirdRequest getRequest() {
    return request;
  }

  public EarlybirdResponse getResponse() {
    return response;
  }

  public Query getLuceneQuery() {
    return luceneQuery;
  }
}
@@ -1,77 +0,0 @@
package com.twitter.search.earlybird.common;

import org.apache.commons.codec.binary.Base64;
import org.apache.thrift.TException;
import org.apache.thrift.TSerializer;
import org.apache.thrift.protocol.TBinaryProtocol;
import org.slf4j.Logger;

import com.twitter.search.common.util.FinagleUtil;
import com.twitter.search.earlybird.thrift.EarlybirdRequest;

/**
 * This class logs all requests that are missing either the Finagle ID or the client ID.
 */
public final class UnknownClientRequestForLogging {
  private static final Logger GENERAL_LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class);
  private static final Logger LOG = org.slf4j.LoggerFactory.getLogger(
      UnknownClientRequestForLogging.class.getName() + ".unknownClientRequests");

  private final String logLine;
  private final EarlybirdRequest request;
  private final String clientId;
  private final String finagleId;

  private final Base64 base64 = new Base64();
  private final TSerializer serializer = new TSerializer(new TBinaryProtocol.Factory());

  private UnknownClientRequestForLogging(
      String logLine,
      EarlybirdRequest request,
      String clientId,
      String finagleId) {

    this.logLine = logLine;
    this.request = request;
    this.clientId = clientId;
    this.finagleId = finagleId;
  }

  /**
   * Returns an UnknownClientRequestForLogging instance if a client ID is not set on the given
   * earlybird request. If the request has a client ID set, {@code null} is returned.
   *
   * @param logLine Additional information to propagate to the log file, when logging this request.
   * @param request The earlybird request.
   */
  public static UnknownClientRequestForLogging unknownClientRequest(
      String logLine, EarlybirdRequest request) {
    String clientId = ClientIdUtil.getClientIdFromRequest(request);
    String finagleId = FinagleUtil.getFinagleClientName();

    if (clientId.equals(ClientIdUtil.UNSET_CLIENT_ID)) {
      return new UnknownClientRequestForLogging(logLine, request, clientId, finagleId);
    } else {
      return null;
    }
  }

  private String asBase64() {
    try {
      // Need to make a deepCopy() here, because the request may still be in use (e.g. if we are
      // doing this in the pre-logger), and we should not be modifying crucial fields on the
      // EarlybirdRequest in place.
      EarlybirdRequest clearedRequest = request.deepCopy();
      clearedRequest.unsetClientRequestTimeMs();
      return base64.encodeToString(serializer.serialize(clearedRequest));
    } catch (TException e) {
      GENERAL_LOG.error("Failed to serialize request for logging.", e);
      return "failed_to_serialize";
    }
  }

  public void log() {
    LOG.info("{},{},{},{}", clientId, finagleId, logLine, asBase64());
  }
}
@@ -1,21 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/org/apache/commons:commons-lang3",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/jvm/org/yaml:snakeyaml",
        "finagle-internal/mtls/src/main/scala/com/twitter/finagle/mtls/authentication",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common_internal/text/version",
        "src/java/com/twitter/search/common/aurora",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/util/zookeeper",
    ],
)
BIN src/java/com/twitter/search/earlybird/common/config/BUILD.docx (Normal file)
@@ -1,363 +0,0 @@
package com.twitter.search.earlybird.common.config;

import java.util.Date;
import java.util.List;
import java.util.Map;
import javax.annotation.Nullable;

import com.google.common.collect.ImmutableMap;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.text.version.PenguinVersion;
import com.twitter.search.common.aurora.AuroraInstanceKey;
import com.twitter.search.common.config.Config;
import com.twitter.search.common.config.ConfigFile;
import com.twitter.search.common.config.ConfigurationException;
import com.twitter.search.common.config.SearchPenguinVersionsConfig;

public final class EarlybirdConfig {
  private static final Logger LOG = LoggerFactory.getLogger(EarlybirdConfig.class);

  private static final String DEFAULT_CONFIG_FILE = "earlybird-search.yml";
  private static final String LATE_TWEET_BUFFER_KEY = "late_tweet_buffer";

  public static final String EARLYBIRD_ZK_CONFIG_DIR = "/twitter/search/production/earlybird/";
  public static final String EARLYBIRD_CONFIG_DIR = "earlybird/config";

  public static final String USER_SNAPSHOT_BASE_DIR = "user_snapshot_base_dir";

  private static volatile ConfigFile earlybirdConfig = null;
  private static volatile Map<String, Object> overrideValueMap = ImmutableMap.of();

  private static String logDirOverride = null;
  private static AuroraInstanceKey auroraInstanceKey = null;

  private static int adminPort;

  private EarlybirdConfig() { }

  private static final class PenguinVersionHolder {
    private static final PenguinVersion PENGUIN_VERSION_SINGLETON =
        SearchPenguinVersionsConfig.getSingleSupportedVersion(
            EarlybirdProperty.PENGUIN_VERSION.get());
    private static final byte PENGUIN_VERSION_BYTE_VALUE =
        PENGUIN_VERSION_SINGLETON.getByteValue();
  }

  public static byte getPenguinVersionByte() {
    return PenguinVersionHolder.PENGUIN_VERSION_BYTE_VALUE;
  }

  public static PenguinVersion getPenguinVersion() {
    return PenguinVersionHolder.PENGUIN_VERSION_SINGLETON;
  }

  /**
   * Reads the earlybird configuration from the given file.
   */
  public static synchronized void init(@Nullable String configFile) {
    if (earlybirdConfig == null) {
      String file = configFile == null ? DEFAULT_CONFIG_FILE : configFile;
      earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, file);
    }
  }

  public static synchronized void setOverrideValues(Map<String, Object> overrideValues) {
    overrideValueMap = ImmutableMap.copyOf(overrideValues);
  }

  /**
   * Packs all values into a string that can be printed for informational purposes.
   * @return the string.
   */
  public static String allValuesAsString() {
    Map<String, String> stringMap = earlybirdConfig.getStringMap();

    StringBuilder stringBuilder = new StringBuilder();

    stringBuilder.append("Config environment: " + Config.getEnvironment() + "\n\n");
    stringBuilder.append(
        String.format("Values from earlybird-search.yml (total %d):\n", stringMap.size()));

    stringMap.forEach((key, value) -> {
      stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
      if (overrideValueMap.containsKey(key)) {
        stringBuilder.append(String.format(
            " override value: %s\n", overrideValueMap.get(key).toString()));
      }
    });

    stringBuilder.append(String.format(
        "\n\nAll command-line overrides (total: %d):\n", overrideValueMap.size()));
    overrideValueMap.forEach((key, value) -> {
      stringBuilder.append(String.format(" %s: %s\n", key, value.toString()));
    });

    return stringBuilder.toString();
  }

  /**
   * Returns the value of the given property as a string. If the property is not set, a runtime
   * exception is thrown.
   */
  public static String getString(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (String) overrideValue;
    }

    try {
      return earlybirdConfig.getString(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config string " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a string.
   */
  public static String getString(String property, String defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (String) overrideValue;
    }

    return earlybirdConfig.getString(property, defaultValue);
  }

  /**
   * Returns the value of the given property as an integer. If the property is not set, a runtime
   * exception is thrown.
   */
  public static int getInt(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (int) overrideValue;
    }

    try {
      return earlybirdConfig.getInt(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config int " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as an integer.
   */
  public static int getInt(String property, int defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (int) overrideValue;
    }

    return earlybirdConfig.getInt(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a double.
   */
  public static double getDouble(String property, double defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (double) overrideValue;
    }

    return earlybirdConfig.getDouble(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a long. If the property is not set, a runtime
   * exception is thrown.
   */
  public static long getLong(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (long) overrideValue;
    }

    try {
      return earlybirdConfig.getLong(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config long " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a long.
   */
  public static long getLong(String property, long defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (long) overrideValue;
    }

    return earlybirdConfig.getLong(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a boolean. If the property is not set, a runtime
   * exception is thrown.
   */
  public static boolean getBool(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (boolean) overrideValue;
    }

    try {
      return earlybirdConfig.getBool(property);
    } catch (ConfigurationException e) {
      LOG.error("Fatal error: could not get config boolean " + property, e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the value of the given property as a boolean.
   */
  public static boolean getBool(String property, boolean defaultValue) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (boolean) overrideValue;
    }

    return earlybirdConfig.getBool(property, defaultValue);
  }

  /**
   * Returns the value of the given property as a date.
   */
  public static Date getDate(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (Date) overrideValue;
    }

    Date date = (Date) earlybirdConfig.getObject(property, null);
    if (date == null) {
      throw new RuntimeException("Could not get config date: " + property);
    }
    return date;
  }

  /**
   * Returns the value of the given property as a list of strings.
   */
  public static List<String> getListOfStrings(String property) {
    Object overrideValue = overrideValueMap.get(property);
    if (overrideValue != null) {
      return (List<String>) overrideValue;
    }

    List<String> list = (List<String>) earlybirdConfig.getObject(property, null);
    if (list == null) {
      throw new RuntimeException("Could not get list of strings: " + property);
    }
    return list;
  }

  /**
   * Returns the value of the given property as a map.
   */
  @SuppressWarnings("unchecked")
  public static Map<String, Object> getMap(String property) {
    Map<String, Object> map = (Map<String, Object>) earlybirdConfig.getObject(property, null);
    if (map == null) {
      throw new RuntimeException("Could not find config property: " + property);
    }
    return map;
  }

  public static int getMaxSegmentSize() {
    return EarlybirdConfig.getInt("max_segment_size", 1 << 16);
  }

  /**
   * Returns the log properties file.
   */
  public static String getLogPropertiesFile() {
    try {
      String filename = earlybirdConfig.getString("log_properties_filename");
      return earlybirdConfig.getConfigFilePath(filename);
    } catch (ConfigurationException e) {
      // Print here rather than use LOG - log was probably not initialized yet.
      LOG.error("Fatal error: could not get log properties file", e);
      throw new RuntimeException(e);
    }
  }

  /**
   * Returns the log directory.
   */
  public static String getLogDir() {
    if (logDirOverride != null) {
      return logDirOverride;
    } else {
      return EarlybirdConfig.getString("log_dir");
    }
  }

  public static void overrideLogDir(String logDir) {
    EarlybirdConfig.logDirOverride = logDir;
  }

  public static int getThriftPort() {
    return EarlybirdProperty.THRIFT_PORT.get();
  }

  public static int getWarmUpThriftPort() {
    return EarlybirdProperty.WARMUP_THRIFT_PORT.get();
  }

  public static int getSearcherThreads() {
    return EarlybirdProperty.SEARCHER_THREADS.get();
  }

  public static int getLateTweetBuffer() {
    return getInt(LATE_TWEET_BUFFER_KEY);
  }

  public static int getAdminPort() {
    return adminPort;
  }

  public static void setAdminPort(int adminPort) {
    EarlybirdConfig.adminPort = adminPort;
  }

  public static boolean isRealtimeOrProtected() {
    String earlybirdName = EarlybirdProperty.EARLYBIRD_NAME.get();
    return earlybirdName.contains("realtime") || earlybirdName.contains("protected");
  }

  public static boolean consumeUserScrubGeoEvents() {
    return EarlybirdProperty.CONSUME_GEO_SCRUB_EVENTS.get();
  }

  @Nullable
  public static AuroraInstanceKey getAuroraInstanceKey() {
    return auroraInstanceKey;
  }

  public static void setAuroraInstanceKey(AuroraInstanceKey auroraInstanceKey) {
    EarlybirdConfig.auroraInstanceKey = auroraInstanceKey;
  }

  public static boolean isAurora() {
    return auroraInstanceKey != null;
  }

  public static void setForTests(String property, Object value) {
    earlybirdConfig.setForTests(DEFAULT_CONFIG_FILE, property, value);
  }

  public static synchronized void clearForTests() {
    earlybirdConfig = new ConfigFile(EARLYBIRD_CONFIG_DIR, DEFAULT_CONFIG_FILE);
  }
}
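
A minimal sketch of how this config class is typically exercised (not from the deleted files; the override key and value are illustrative, and it assumes earlybird-search.yml is available under the earlybird/config directory). It shows that command-line overrides win over values read from the YAML file.

import com.google.common.collect.ImmutableMap;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

public final class EarlybirdConfigUsageSketch {
  public static void main(String[] args) {
    // Load the default config file (earlybird-search.yml) from the earlybird/config directory.
    EarlybirdConfig.init(null);

    // Overrides registered here take precedence over values read from the YAML file.
    EarlybirdConfig.setOverrideValues(
        ImmutableMap.<String, Object>of("max_segment_size", 1 << 17));

    // getMaxSegmentSize() checks the override map first, so this returns 131072 instead of
    // the 1 << 16 default.
    System.out.println("max_segment_size = " + EarlybirdConfig.getMaxSegmentSize());

    // Dump every configured value plus any overrides for debugging.
    System.out.println(EarlybirdConfig.allValuesAsString());
  }
}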
@@ -1,390 +0,0 @@
package com.twitter.search.earlybird.common.config;

import java.lang.reflect.Modifier;
import java.util.Arrays;
import java.util.List;
import java.util.function.BiFunction;
import java.util.function.Function;
import java.util.stream.Collectors;

import com.google.common.collect.ImmutableList;

import com.twitter.app.Flag;
import com.twitter.app.Flaggable;
import com.twitter.app.Flags;
import com.twitter.finagle.mtls.authentication.ServiceIdentifier;

/**
 * Stateless class that represents an Earlybird property that can be specified by a command line
 * flag.
 * <p>
 * This is a regular Java class instead of an enum in order to have a generic type.
 *
 * @param <T>
 */
public final class EarlybirdProperty<T> {

  private static final class PropertyType<T> {

    private static final PropertyType<Boolean> BOOLEAN = new PropertyType<>(
        Flaggable.ofJavaBoolean(), EarlybirdConfig::getBool, EarlybirdConfig::getBool);

    private static final PropertyType<Integer> INT = new PropertyType<>(
        Flaggable.ofJavaInteger(), EarlybirdConfig::getInt, EarlybirdConfig::getInt);

    private static final PropertyType<String> STRING = new PropertyType<>(
        Flaggable.ofString(), EarlybirdConfig::getString, EarlybirdConfig::getString);

    private final Flaggable<T> flaggable;
    private final Function<String, T> getter;
    private final BiFunction<String, T, T> getterWithDefault;

    private PropertyType(Flaggable<T> flaggable, Function<String, T> getter,
        BiFunction<String, T, T> getterWithDefault) {
      this.flaggable = flaggable;
      this.getter = getter;
      this.getterWithDefault = getterWithDefault;
    }
  }

  public static final EarlybirdProperty<String> PENGUIN_VERSION =
      new EarlybirdProperty<>(
          "penguin_version",
          "The penguin version to index.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Integer> THRIFT_PORT = new EarlybirdProperty<>(
      "thrift_port",
      "override thrift port from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<Integer> WARMUP_THRIFT_PORT = new EarlybirdProperty<>(
      "warmup_thrift_port",
      "override warmup thrift port from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<Integer> SEARCHER_THREADS = new EarlybirdProperty<>(
      "searcher_threads",
      "override number of searcher threads from config file",
      PropertyType.INT,
      false);

  public static final EarlybirdProperty<String> EARLYBIRD_TIER = new EarlybirdProperty<>(
      "earlybird_tier",
      "the earlybird tier (e.g. tier1), used on Aurora",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Integer> REPLICA_ID = new EarlybirdProperty<>(
      "replica_id",
      "the ID in a partition, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> PARTITION_ID = new EarlybirdProperty<>(
      "partition_id",
      "partition ID, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> NUM_PARTITIONS = new EarlybirdProperty<>(
      "num_partitions",
      "number of partitions, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> NUM_INSTANCES = new EarlybirdProperty<>(
      "num_instances",
      "number of instances in the job, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<Integer> SERVING_TIMESLICES = new EarlybirdProperty<>(
      "serving_timeslices",
      "number of time slices to serve, used on Aurora",
      PropertyType.INT,
      true);

  public static final EarlybirdProperty<String> ROLE = new EarlybirdProperty<>(
      "role",
      "Role in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> EARLYBIRD_NAME = new EarlybirdProperty<>(
      "earlybird_name",
      "Name in the service path of Earlybird without hash partition suffix",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> ENV = new EarlybirdProperty<>(
      "env",
      "Environment in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> ZONE = new EarlybirdProperty<>(
      "zone",
      "Zone (data center) in the service path of Earlybird",
      PropertyType.STRING,
      true,
      true);

  public static final EarlybirdProperty<String> DL_URI = new EarlybirdProperty<>(
      "dl_uri",
      "DistributedLog URI for default DL reader",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> USER_UPDATES_DL_URI = new EarlybirdProperty<>(
      "user_updates_dl_uri",
      "DistributedLog URI for user updates DL reader",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> ANTISOCIAL_USERUPDATES_DL_STREAM =
      new EarlybirdProperty<>(
          "antisocial_userupdates_dl_stream",
          "DL stream name for antisocial user updates without DL version suffix",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> ZK_APP_ROOT = new EarlybirdProperty<>(
      "zk_app_root",
      "ZooKeeper base root path for this application",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Boolean> SEGMENT_LOAD_FROM_HDFS_ENABLED =
      new EarlybirdProperty<>(
          "segment_load_from_hdfs_enabled",
          "Whether to load segment data from HDFS",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Boolean> SEGMENT_FLUSH_TO_HDFS_ENABLED =
      new EarlybirdProperty<>(
          "segment_flush_to_hdfs_enabled",
          "Whether to flush segment data to HDFS",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> HDFS_SEGMENT_SYNC_DIR = new EarlybirdProperty<>(
      "hdfs_segment_sync_dir",
      "HDFS directory to sync segment data",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<String> HDFS_SEGMENT_UPLOAD_DIR = new EarlybirdProperty<>(
      "hdfs_segment_upload_dir",
      "HDFS directory to upload segment data",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<Boolean> ARCHIVE_DAILY_STATUS_BATCH_FLUSHING_ENABLED =
      new EarlybirdProperty<>(
          "archive_daily_status_batch_flushing_enabled",
          "Whether to enable archive daily status batch flushing",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> HDFS_INDEX_SYNC_DIR = new EarlybirdProperty<>(
      "hdfs_index_sync_dir",
      "HDFS directory to sync index data",
      PropertyType.STRING,
      true);

  public static final EarlybirdProperty<Boolean> READ_INDEX_FROM_PROD_LOCATION =
      new EarlybirdProperty<>(
          "read_index_from_prod_location",
          "Read index from prod to speed up startup on staging / loadtest",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Boolean> USE_DECIDER_OVERLAY = new EarlybirdProperty<>(
      "use_decider_overlay",
      "Whether to use decider overlay",
      PropertyType.BOOLEAN,
      false);

  public static final EarlybirdProperty<String> DECIDER_OVERLAY_CONFIG = new EarlybirdProperty<>(
      "decider_overlay_config",
      "Path to decider overlay config",
      PropertyType.STRING,
      false);

  public static final EarlybirdProperty<Integer> MAX_CONCURRENT_SEGMENT_INDEXERS =
      new EarlybirdProperty<>(
          "max_concurrent_segment_indexers",
          "Maximum number of segments indexed concurrently",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Boolean> TF_MODELS_ENABLED =
      new EarlybirdProperty<>(
          "tf_models_enabled",
          "Whether tensorflow models should be loaded",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<String> TF_MODELS_CONFIG_PATH =
      new EarlybirdProperty<>(
          "tf_models_config_path",
          "The configuration path of the yaml file containing the list of tensorflow models to load.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Integer> TF_INTER_OP_THREADS =
      new EarlybirdProperty<>(
          "tf_inter_op_threads",
          "How many tensorflow inter op threads to use. See TF documentation for more information.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Integer> TF_INTRA_OP_THREADS =
      new EarlybirdProperty<>(
          "tf_intra_op_threads",
          "How many tensorflow intra op threads to use. See TF documentation for more information.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Integer> MAX_ALLOWED_REPLICAS_NOT_IN_SERVER_SET =
      new EarlybirdProperty<>(
          "max_allowed_replicas_not_in_server_set",
          "How many replicas are allowed to be missing from the Earlybird server set.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<Boolean> CHECK_NUM_REPLICAS_IN_SERVER_SET =
      new EarlybirdProperty<>(
          "check_num_replicas_in_server_set",
          "Whether CoordinatedEarlybirdActions should check the number of alive replicas",
          PropertyType.BOOLEAN,
          false);

  public static final EarlybirdProperty<Integer> MAX_QUEUE_SIZE =
      new EarlybirdProperty<>(
          "max_queue_size",
          "Maximum size of searcher worker executor queue. If <= 0 queue is unbounded.",
          PropertyType.INT,
          false);

  public static final EarlybirdProperty<String> KAFKA_ENV =
      new EarlybirdProperty<>(
          "kafka_env",
          "The environment to use for kafka topics.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> KAFKA_PATH =
      new EarlybirdProperty<>(
          "kafka_path",
          "Wily path to the Search kafka cluster.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> TWEET_EVENTS_KAFKA_PATH =
      new EarlybirdProperty<>(
          "tweet_events_kafka_path",
          "Wily path to the tweet-events kafka cluster.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> USER_UPDATES_KAFKA_TOPIC =
      new EarlybirdProperty<>(
          "user_updates_topic",
          "Name of the Kafka topic that contains user updates.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> USER_SCRUB_GEO_KAFKA_TOPIC =
      new EarlybirdProperty<>(
          "user_scrub_geo_topic",
          "Name of the Kafka topic that contains UserScrubGeoEvents.",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<String> EARLYBIRD_SCRUB_GEN =
      new EarlybirdProperty<>(
          "earlybird_scrub_gen",
          "SCRUB_GEN TO DEPLOY",
          PropertyType.STRING,
          false);

  public static final EarlybirdProperty<Boolean> CONSUME_GEO_SCRUB_EVENTS =
      new EarlybirdProperty<>(
          "consume_geo_scrub_events",
          "Whether to consume user scrub geo events or not",
          PropertyType.BOOLEAN,
          false);

  private static final List<EarlybirdProperty<?>> ALL_PROPERTIES =
      Arrays.stream(EarlybirdProperty.class.getDeclaredFields())
          .filter(field ->
              (field.getModifiers() & Modifier.STATIC) > 0
                  && field.getType() == EarlybirdProperty.class)
|
|
||||||
.map(field -> {
|
|
||||||
try {
|
|
||||||
return (EarlybirdProperty<?>) field.get(EarlybirdProperty.class);
|
|
||||||
} catch (Exception e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.collect(Collectors.collectingAndThen(Collectors.toList(), ImmutableList::copyOf));
|
|
||||||
|
|
||||||
public static ServiceIdentifier getServiceIdentifier() {
|
|
||||||
return new ServiceIdentifier(
|
|
||||||
ROLE.get(),
|
|
||||||
EARLYBIRD_NAME.get(),
|
|
||||||
ENV.get(),
|
|
||||||
ZONE.get());
|
|
||||||
}
|
|
||||||
|
|
||||||
private final String name;
|
|
||||||
private final String help;
|
|
||||||
private final PropertyType<T> type;
|
|
||||||
private final boolean requiredOnAurora;
|
|
||||||
private final boolean requiredOnDedicated;
|
|
||||||
|
|
||||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
|
||||||
boolean requiredOnAurora) {
|
|
||||||
this(name, help, type, requiredOnAurora, false);
|
|
||||||
}
|
|
||||||
|
|
||||||
private EarlybirdProperty(String name, String help, PropertyType<T> type,
|
|
||||||
boolean requiredOnAurora, boolean requiredOnDedicated) {
|
|
||||||
this.name = name;
|
|
||||||
this.help = help;
|
|
||||||
this.type = type;
|
|
||||||
this.requiredOnAurora = requiredOnAurora;
|
|
||||||
this.requiredOnDedicated = requiredOnDedicated;
|
|
||||||
}
|
|
||||||
|
|
||||||
public String name() {
|
|
||||||
return name;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isRequiredOnAurora() {
|
|
||||||
return requiredOnAurora;
|
|
||||||
}
|
|
||||||
|
|
||||||
public boolean isRequiredOnDedicated() {
|
|
||||||
return requiredOnDedicated;
|
|
||||||
}
|
|
||||||
|
|
||||||
public Flag<T> createFlag(Flags flags) {
|
|
||||||
return flags.createMandatory(name, help, null, type.flaggable);
|
|
||||||
}
|
|
||||||
|
|
||||||
public T get() {
|
|
||||||
return type.getter.apply(name);
|
|
||||||
}
|
|
||||||
|
|
||||||
public T get(T devaultValue) {
|
|
||||||
return type.getterWithDefault.apply(name, devaultValue);
|
|
||||||
}
|
|
||||||
|
|
||||||
public static EarlybirdProperty[] values() {
|
|
||||||
return ALL_PROPERTIES.toArray(new EarlybirdProperty[0]);
|
|
||||||
}
|
|
||||||
}
|
|
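A minimal usage sketch, not part of the original file: how a caller might read these properties once the flag wiring (createFlag plus the PropertyType getters) has been set up elsewhere in Earlybird. The method name below is illustrative only.

  // Hypothetical illustration of reading EarlybirdProperty values after flags are bound.
  static void logHdfsSettings() {
    boolean loadFromHdfs = EarlybirdProperty.SEGMENT_LOAD_FROM_HDFS_ENABLED.get(false);
    boolean flushToHdfs = EarlybirdProperty.SEGMENT_FLUSH_TO_HDFS_ENABLED.get(false);
    int maxQueueSize = EarlybirdProperty.MAX_QUEUE_SIZE.get(0);
    System.out.println("loadFromHdfs=" + loadFromHdfs
        + " flushToHdfs=" + flushToHdfs
        + " maxQueueSize=" + maxQueueSize);
  }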
@ -1,45 +0,0 @@
java_library(
    sources = ["*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/commons-io",
        "3rdparty/jvm/geo/google:geoGoogle",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-server",
        "3rdparty/jvm/org/apache/bookkeeper:bookkeeper-twitter-science-provider",
        "3rdparty/jvm/org/apache/hadoop:hadoop-client-default",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-common",
        "3rdparty/jvm/org/apache/lucene:lucene-analyzers-smartcn",
        "3rdparty/jvm/org/apache/lucene:lucene-core",
        "3rdparty/jvm/org/apache/lucene:lucene-facet",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "3rdparty/src/jvm/com/twitter/scalding:core",
        "3rdparty/src/jvm/com/twitter/scalding:date",
        "3rdparty/src/jvm/com/twitter/scalding:parquet",
        "decider/src/main/scala",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/common_internal/hadoop",
        "src/java/com/twitter/search/common/logging",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/snowflakeparser",
        "src/java/com/twitter/search/common/schema/earlybird",
        "src/java/com/twitter/search/common/util/hash",
        "src/java/com/twitter/search/common/util/io",
        "src/java/com/twitter/search/common/util/io:dl-reader-writer",
        "src/java/com/twitter/search/common/util/io:flushable",
        "src/java/com/twitter/search/common/util/io:record-reader-api",
        "src/java/com/twitter/search/earlybird/common/config",
        "src/scala/com/twitter/scalding_internal/error_handling",
        "src/scala/com/twitter/scalding_internal/multiformat",
        "src/scala/com/twitter/scalding_internal/source",
        "src/scala/com/twitter/search/user_table/sources",
        "src/thrift/com/twitter/search/common:indexing-java",
        "src/thrift/com/twitter/tweetypie:events-java",
        "util/util-core:scala",
    ],
)
Binary file not shown.
Binary file not shown.
@ -1,100 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.TimeUnit;

import com.twitter.search.common.metrics.SearchCounter;
import com.twitter.search.common.metrics.SearchCustomGauge;
import com.twitter.search.common.metrics.SearchTimerStats;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;
import com.twitter.tweetypie.thriftjava.UserScrubGeoEvent;

/**
 * Map of users who have actioned to delete location data from their tweets. User IDs are mapped
 * to the maxTweetId that will eventually be scrubbed from the index (userId -> maxTweetId).
 *
 * ConcurrentHashMap is thread safe without synchronizing the whole map. Reads can happen very fast
 * while writes are done with a lock. This is ideal since many Earlybird Searcher threads could
 * be reading from the map at once, whereas we will only be adding to the map via Kafka.
 *
 * This map is checked against to filter out tweets that should not be returned to geo queries.
 * See: go/realtime-geo-filtering
 */
public class UserScrubGeoMap {
  // The number of geo events that contain a user ID already present in the map. This count is used
  // to verify the number of users in the map against the number of events consumed from kafka.
  private static final SearchCounter USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT =
      SearchCounter.export("user_scrub_geo_event_existing_user_count");
  public static final SearchTimerStats USER_SCRUB_GEO_EVENT_LAG_STAT =
      SearchTimerStats.export("user_scrub_geo_event_lag",
          TimeUnit.MILLISECONDS,
          false,
          true);
  private ConcurrentHashMap<Long, Long> map;

  public UserScrubGeoMap() {
    map = new ConcurrentHashMap<>();
    SearchCustomGauge.export("num_users_in_geo_map", this::getNumUsersInMap);
  }

  /**
   * Ensures that the max_tweet_id in the userScrubGeoEvent is greater than the one already stored
   * in the map for the given user id (if any) before updating the entry for this user.
   * This will protect Earlybirds from potential issues where out of date UserScrubGeoEvents
   * appear in the incoming Kafka stream.
   *
   * @param userScrubGeoEvent the event to apply to the map
   */
  public void indexUserScrubGeoEvent(UserScrubGeoEvent userScrubGeoEvent) {
    long userId = userScrubGeoEvent.getUser_id();
    long newMaxTweetId = userScrubGeoEvent.getMax_tweet_id();
    long oldMaxTweetId = map.getOrDefault(userId, 0L);
    if (map.containsKey(userId)) {
      USER_SCRUB_GEO_EVENT_EXISTING_USER_COUNT.increment();
    }
    map.put(userId, Math.max(oldMaxTweetId, newMaxTweetId));
    USER_SCRUB_GEO_EVENT_LAG_STAT.timerIncrement(computeEventLag(newMaxTweetId));
  }

  /**
   * A tweet is geo scrubbed if it is older than the max tweet id that is scrubbed for the tweet's
   * author.
   * If there is no entry for the tweet's author in the map, then the tweet is not geo scrubbed.
   *
   * @param tweetId the tweet being checked
   * @param fromUserId the tweet's author
   * @return whether the tweet is geo scrubbed
   */
  public boolean isTweetGeoScrubbed(long tweetId, long fromUserId) {
    return tweetId <= map.getOrDefault(fromUserId, 0L);
  }

  /**
   * The lag (in milliseconds) from when a UserScrubGeoEvent is created, until it is applied to the
   * UserScrubGeoMap. Take the maxTweetId found in the current event and convert it to a timestamp.
   * The maxTweetId will give us a timestamp closest to when Tweetypie processes macaw-geo requests.
   *
   * @param maxTweetId the max tweet id found in the current event
   * @return the lag in milliseconds
   */
  private long computeEventLag(long maxTweetId) {
    long eventCreatedAtTime = SnowflakeIdParser.getTimestampFromTweetId(maxTweetId);
    return System.currentTimeMillis() - eventCreatedAtTime;
  }

  public long getNumUsersInMap() {
    return map.size();
  }

  public ConcurrentHashMap<Long, Long> getMap() {
    return map;
  }

  public boolean isEmpty() {
    return map.isEmpty();
  }

  public boolean isSet(long userId) {
    return map.containsKey(userId);
  }
}
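A hedged sketch of how this map is exercised (not from the original sources). The thrift setters on UserScrubGeoEvent are assumed to follow the usual set<FieldName> naming that mirrors the getUser_id/getMax_tweet_id accessors used above.

  UserScrubGeoMap geoMap = new UserScrubGeoMap();
  UserScrubGeoEvent event = new UserScrubGeoEvent();
  event.setUser_id(12L);           // assumed thrift setter
  event.setMax_tweet_id(5000L);    // assumed thrift setter
  geoMap.indexUserScrubGeoEvent(event);
  // Tweets by user 12 at or below the scrubbed max id are filtered from geo queries.
  boolean scrubbed = geoMap.isTweetGeoScrubbed(4999L, 12L);     // true
  boolean notScrubbed = geoMap.isTweetGeoScrubbed(5001L, 12L);  // false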
Binary file not shown.
@ -1,572 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Iterator;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Predicate;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.search.common.metrics.SearchLongGauge;
import com.twitter.search.common.metrics.SearchRateCounter;
import com.twitter.search.common.util.hash.GeneralLongHashFunction;

/**
 * Table containing metadata about users, like NSFW or Antisocial status.
 * Used for result filtering.
 */
public class UserTable {
  private static final Logger LOG = LoggerFactory.getLogger(UserTable.class);

  @VisibleForTesting // Not final for testing.
  protected static long userUpdateTableMaxCapacity = 1L << 30;

  private static final int DEFAULT_INITIAL_CAPACITY = 1024;
  private static final int BYTE_WIDTH = 8;

  private static final String USER_TABLE_CAPACITY = "user_table_capacity";
  private static final String USER_TABLE_SIZE = "user_table_size";
  private static final String
      USER_NUM_USERS_WITH_NO_BITS_SET = "user_table_users_with_no_bits_set";
  private static final String USER_TABLE_ANTISOCIAL_USERS = "user_table_antisocial_users";
  private static final String USER_TABLE_OFFENSIVE_USERS = "user_table_offensive_users";
  private static final String USER_TABLE_NSFW_USERS = "user_table_nsfw_users";
  private static final String USER_TABLE_IS_PROTECTED_USERS = "user_table_is_protected_users";

  /**
   * Number of users filtered.
   */
  private static final SearchRateCounter USER_TABLE_USERS_FILTERED_COUNTER =
      new SearchRateCounter("user_table_users_filtered");

  private SearchLongGauge userTableCapacity;
  private SearchLongGauge userTableSize;
  private SearchLongGauge userTableNumUsersWithNoBitsSet;
  private SearchLongGauge userTableAntisocialUsers;
  private SearchLongGauge userTableOffensiveUsers;
  private SearchLongGauge userTableNsfwUsers;
  private SearchLongGauge userTableIsProtectedUsers;

  private final Predicate<Long> userIdFilter;
  private long lastRecordTimestamp;

  private static final class HashTable {
    private int numUsersInTable;
    private int numUsersWithNoBitsSet;
    // size 8 array contains the number of users who have the bit set at the index (0-7) position
    // e.g. setBitCounts[0] stores the number of users who have the 0 bit set in their bytes
    private long[] setBitCounts;

    private final long[] hash;
    private final byte[] bits;

    private final int hashMask;

    HashTable(int size) {
      this.hash = new long[size];
      this.bits = new byte[size];
      this.hashMask = size - 1;
      this.numUsersInTable = 0;
      this.setBitCounts = new long[BYTE_WIDTH];
    }

    protected int hashSize() {
      return hash.length;
    }

    // If we want to decrease the number of users in the table, we can delete as many users
    // as this table returns, by calling filterTableAndCountValidItems.
    public void setCountOfNumUsersWithNoBitsSet() {
      int count = 0;
      for (int i = 0; i < hash.length; i++) {
        if ((hash[i] > 0) && (bits[i] == 0)) {
          count++;
        }
      }

      numUsersWithNoBitsSet = count;
    }

    public void setSetBitCounts() {
      long[] counts = new long[BYTE_WIDTH];
      for (int i = 0; i < hash.length; i++) {
        if (hash[i] > 0) {
          int tempBits = bits[i] & 0xff;
          int curBitPos = 0;
          while (tempBits != 0) {
            if ((tempBits & 1) != 0) {
              counts[curBitPos]++;
            }
            tempBits = tempBits >>> 1;
            curBitPos++;
          }
        }
      }
      setBitCounts = counts;
    }
  }

  public static final int ANTISOCIAL_BIT = 1;
  public static final int OFFENSIVE_BIT = 1 << 1;
  public static final int NSFW_BIT = 1 << 2;
  public static final int IS_PROTECTED_BIT = 1 << 3;

  public long getLastRecordTimestamp() {
    return this.lastRecordTimestamp;
  }

  public void setLastRecordTimestamp(long lastRecordTimestamp) {
    this.lastRecordTimestamp = lastRecordTimestamp;
  }

  public void setOffensive(long userID, boolean offensive) {
    set(userID, OFFENSIVE_BIT, offensive);
  }

  public void setAntisocial(long userID, boolean antisocial) {
    set(userID, ANTISOCIAL_BIT, antisocial);
  }

  public void setNSFW(long userID, boolean nsfw) {
    set(userID, NSFW_BIT, nsfw);
  }

  public void setIsProtected(long userID, boolean isProtected) {
    set(userID, IS_PROTECTED_BIT, isProtected);
  }

  /**
   * Adds the given user update to this table.
   */
  public boolean indexUserUpdate(UserUpdatesChecker checker, UserUpdate userUpdate) {
    if (checker.skipUserUpdate(userUpdate)) {
      return false;
    }

    switch (userUpdate.updateType) {
      case ANTISOCIAL:
        setAntisocial(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case NSFW:
        setNSFW(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case OFFENSIVE:
        setOffensive(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      case PROTECTED:
        setIsProtected(userUpdate.twitterUserID, userUpdate.updateValue != 0);
        break;
      default:
        return false;
    }

    return true;
  }

  private final AtomicReference<HashTable> hashTable = new AtomicReference<>();

  private int hashCode(long userID) {
    return (int) GeneralLongHashFunction.hash(userID);
  }

  /**
   * Returns an iterator for user IDs that have at least one of the bits set.
   */
  public Iterator<Long> getFlaggedUserIdIterator() {
    HashTable table = hashTable.get();

    final long[] currUserIdTable = table.hash;
    final byte[] currBitsTable = table.bits;
    return new Iterator<Long>() {
      private int index = findNext(0);

      private int findNext(int index) {
        int startingIndex = index;
        while (startingIndex < currUserIdTable.length) {
          if (currUserIdTable[startingIndex] != 0 && currBitsTable[startingIndex] != 0) {
            break;
          }
          ++startingIndex;
        }
        return startingIndex;
      }

      @Override
      public boolean hasNext() {
        return index < currUserIdTable.length;
      }

      @Override
      public Long next() {
        Long r = currUserIdTable[index];
        index = findNext(index + 1);
        return r;
      }

      @Override
      public void remove() {
        throw new UnsupportedOperationException();
      }
    };
  }

  /**
   * Constructs a UserTable with a given HashTable instance.
   * Use <code>userIdFilter</code> as a Predicate that returns true for the elements
   * that need to be kept in the table.
   * Use shouldRehash to force a rehashing of the given HashTable.
   */
  private UserTable(HashTable hashTable, Predicate<Long> userIdFilter,
                    boolean shouldRehash) {

    Preconditions.checkNotNull(userIdFilter);

    this.hashTable.set(hashTable);
    this.userIdFilter = userIdFilter;

    exportUserUpdatesTableStats();

    LOG.info("User table num users: {}. Users with no bits set: {}. "
        + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
        this.getNumUsersInTable(),
        this.getNumUsersWithNoBitsSet(),
        this.getSetBitCount(ANTISOCIAL_BIT),
        this.getSetBitCount(OFFENSIVE_BIT),
        this.getSetBitCount(NSFW_BIT),
        this.getSetBitCount(IS_PROTECTED_BIT));

    if (shouldRehash) {
      int filteredTableSize = filterTableAndCountValidItems();
      // Having exactly 100% usage can impact lookup. Maintain the table at under 50% usage.
      int newTableCapacity = computeDesiredHashTableCapacity(filteredTableSize * 2);

      rehash(newTableCapacity);

      LOG.info("User table num users after rehash: {}. Users with no bits set: {}. "
          + "Antisocial users: {}. Offensive users: {}. Nsfw users: {}. IsProtected users: {}.",
          this.getNumUsersInTable(),
          this.getNumUsersWithNoBitsSet(),
          this.getSetBitCount(ANTISOCIAL_BIT),
          this.getSetBitCount(OFFENSIVE_BIT),
          this.getSetBitCount(NSFW_BIT),
          this.getSetBitCount(IS_PROTECTED_BIT));
    }
  }

  private UserTable(int initialSize, Predicate<Long> userIdFilter) {
    this(new HashTable(computeDesiredHashTableCapacity(initialSize)), userIdFilter, false);
  }

  @VisibleForTesting
  public UserTable(int initialSize) {
    this(initialSize, userId -> true);
  }

  public static UserTable
      newTableWithDefaultCapacityAndPredicate(Predicate<Long> userIdFilter) {

    return new UserTable(DEFAULT_INITIAL_CAPACITY, userIdFilter);
  }

  public static UserTable newTableNonFilteredWithDefaultCapacity() {
    return newTableWithDefaultCapacityAndPredicate(userId -> true);
  }

  private void exportUserUpdatesTableStats() {
    userTableSize = SearchLongGauge.export(USER_TABLE_SIZE);
    userTableCapacity = SearchLongGauge.export(USER_TABLE_CAPACITY);
    userTableNumUsersWithNoBitsSet = SearchLongGauge.export(
        USER_NUM_USERS_WITH_NO_BITS_SET
    );
    userTableAntisocialUsers = SearchLongGauge.export(USER_TABLE_ANTISOCIAL_USERS);
    userTableOffensiveUsers = SearchLongGauge.export(USER_TABLE_OFFENSIVE_USERS);
    userTableNsfwUsers = SearchLongGauge.export(USER_TABLE_NSFW_USERS);
    userTableIsProtectedUsers = SearchLongGauge.export(USER_TABLE_IS_PROTECTED_USERS);

    LOG.info(
        "Exporting stats for user table. Starting with numUsersInTable={}, usersWithZeroBits={}, "
            + "antisocialUsers={}, offensiveUsers={}, nsfwUsers={}, isProtectedUsers={}.",
        getNumUsersInTable(),
        getNumUsersWithNoBitsSet(),
        getSetBitCount(ANTISOCIAL_BIT),
        getSetBitCount(OFFENSIVE_BIT),
        getSetBitCount(NSFW_BIT),
        getSetBitCount(IS_PROTECTED_BIT));
    updateStats();
  }

  private void updateStats() {
    HashTable table = this.hashTable.get();
    userTableSize.set(table.numUsersInTable);
    userTableNumUsersWithNoBitsSet.set(table.numUsersWithNoBitsSet);
    userTableCapacity.set(table.hashSize());
    userTableAntisocialUsers.set(getSetBitCount(ANTISOCIAL_BIT));
    userTableOffensiveUsers.set(getSetBitCount(OFFENSIVE_BIT));
    userTableNsfwUsers.set(getSetBitCount(NSFW_BIT));
    userTableIsProtectedUsers.set(getSetBitCount(IS_PROTECTED_BIT));
  }

  /**
   * Computes the size of the hash table as the first power of two greater than or equal to
   * initialSize.
   */
  private static int computeDesiredHashTableCapacity(int initialSize) {
    long powerOfTwoSize = 2;
    while (initialSize > powerOfTwoSize) {
      powerOfTwoSize *= 2;
    }
    if (powerOfTwoSize > Integer.MAX_VALUE) {
      LOG.error("Error: powerOfTwoSize overflowed Integer.MAX_VALUE! Initial size: " + initialSize);
      powerOfTwoSize = 1 << 30; // max power of 2
    }

    return (int) powerOfTwoSize;
  }

  public int getNumUsersInTable() {
    return hashTable.get().numUsersInTable;
  }

  /**
   * Gets the number of users who have the bit set at the `userStateBit` position.
   */
  public long getSetBitCount(int userStateBit) {
    int bit = userStateBit;
    int bitPosition = 0;
    while (bit != 0 && (bit & 1) == 0) {
      bit = bit >>> 1;
      bitPosition++;
    }
    return hashTable.get().setBitCounts[bitPosition];
  }

  public Predicate<Long> getUserIdFilter() {
    return userIdFilter::test;
  }

  /**
   * Updates a user flag in this table.
   */
  public final void set(long userID, int bit, boolean value) {
    // if userID is filtered return immediately
    if (!shouldKeepUser(userID)) {
      USER_TABLE_USERS_FILTERED_COUNTER.increment();
      return;
    }

    HashTable table = this.hashTable.get();

    int hashPos = findHashPosition(table, userID);
    long item = table.hash[hashPos];
    byte bits = 0;
    int bitsDiff = 0;

    if (item != 0) {
      byte bitsOriginally = bits = table.bits[hashPos];
      if (value) {
        bits |= bit;
      } else {
        // AND'ing with the inverse map clears the desired bit, but
        // doesn't change any of the other bits
        bits &= ~bit;
      }

      // Find the changed bits after the above operation, it is possible that no bit is changed if
      // the input 'bit' is already set/unset in the table.
      // Since bitwise operators cannot be directly applied on Byte, Byte is promoted into int to
      // apply the operators. When that happens, if the most significant bit of the Byte is set,
      // the promoted int has all significant bits set to 1. 0xff bitmask is applied here to make
      // sure only the last 8 bits are considered.
      bitsDiff = (bitsOriginally & 0xff) ^ (bits & 0xff);

      if (bitsOriginally > 0 && bits == 0) {
        table.numUsersWithNoBitsSet++;
      } else if (bitsOriginally == 0 && bits > 0) {
        table.numUsersWithNoBitsSet--;
      }
    } else {
      if (!value) {
        // no need to add this user, since all bits would be false anyway
        return;
      }

      // New user entry.
      if (table.numUsersInTable + 1 >= (table.hashSize() >> 1)
          && table.hashSize() != userUpdateTableMaxCapacity) {
        if (2L * (long) table.hashSize() < userUpdateTableMaxCapacity) {
          rehash(2 * table.hashSize());
          table = this.hashTable.get();
        } else {
          if (table.hashSize() < (int) userUpdateTableMaxCapacity) {
            rehash((int) userUpdateTableMaxCapacity);
            table = this.hashTable.get();
            LOG.warn("User update table size reached Integer.MAX_VALUE, performance will degrade.");
          }
        }

        // Must repeat this operation with the resized hashTable.
        hashPos = findHashPosition(table, userID);
      }

      item = userID;
      bits |= bit;
      bitsDiff = bit & 0xff;

      table.numUsersInTable++;
    }

    table.hash[hashPos] = item;
    table.bits[hashPos] = bits;

    // update setBitCounts for the changed bits after applying the input 'bit'
    int curBitsDiffPos = 0;
    while (bitsDiff != 0) {
      if ((bitsDiff & 1) != 0) {
        if (value) {
          table.setBitCounts[curBitsDiffPos]++;
        } else {
          table.setBitCounts[curBitsDiffPos]--;
        }
      }
      bitsDiff = bitsDiff >>> 1;
      curBitsDiffPos++;
    }

    updateStats();
  }

  public final boolean isSet(long userID, int bits) {
    HashTable table = hashTable.get();
    int hashPos = findHashPosition(table, userID);
    return table.hash[hashPos] != 0 && (table.bits[hashPos] & bits) != 0;
  }

  /**
   * Returns true when the userIdFilter condition is met.
   * If the filter is not present, returns true.
   */
  private boolean shouldKeepUser(long userID) {
    return userIdFilter.test(userID);
  }

  private int findHashPosition(final HashTable table, final long userID) {
    int code = hashCode(userID);
    int hashPos = code & table.hashMask;

    // Locate user in hash
    long item = table.hash[hashPos];

    if (item != 0 && item != userID) {
      // Conflict: keep searching different locations in
      // the hash table.
      final int inc = ((code >> 8) + code) | 1;
      do {
        code += inc;
        hashPos = code & table.hashMask;
        item = table.hash[hashPos];
      } while (item != 0 && item != userID);
    }

    return hashPos;
  }

  /**
   * Applies the filtering predicate and returns the size of the filtered table.
   */
  private synchronized int filterTableAndCountValidItems() {
    final HashTable oldTable = this.hashTable.get();
    int newSize = 0;

    int clearNoItemSet = 0;
    int clearNoBitsSet = 0;
    int clearDontKeepUser = 0;

    for (int i = 0; i < oldTable.hashSize(); i++) {
      final long item = oldTable.hash[i]; // this is the userID
      final byte bits = oldTable.bits[i];

      boolean clearSlot = false;
      if (item == 0) {
        clearSlot = true;
        clearNoItemSet++;
      } else if (bits == 0) {
        clearSlot = true;
        clearNoBitsSet++;
      } else if (!shouldKeepUser(item)) {
        clearSlot = true;
        clearDontKeepUser++;
      }

      if (clearSlot) {
        oldTable.hash[i] = 0;
        oldTable.bits[i] = 0;
      } else {
        newSize += 1;
      }
    }

    oldTable.setCountOfNumUsersWithNoBitsSet();
    oldTable.setSetBitCounts();

    LOG.info("Done filtering table: clearNoItemSet={}, clearNoBitsSet={}, clearDontKeepUser={}",
        clearNoItemSet, clearNoBitsSet, clearDontKeepUser);

    return newSize;
  }

  /**
   * Called when the hash table is too small (> 50% occupied).
   */
  private void rehash(final int newSize) {
    final HashTable oldTable = this.hashTable.get();
    final HashTable newTable = new HashTable(newSize);

    final int newMask = newTable.hashMask;
    final long[] newHash = newTable.hash;
    final byte[] newBits = newTable.bits;

    for (int i = 0; i < oldTable.hashSize(); i++) {
      final long item = oldTable.hash[i];
      final byte bits = oldTable.bits[i];
      if (item != 0 && bits != 0) {
        int code = hashCode(item);

        int hashPos = code & newMask;
        assert hashPos >= 0;
        if (newHash[hashPos] != 0) {
          final int inc = ((code >> 8) + code) | 1;
          do {
            code += inc;
            hashPos = code & newMask;
          } while (newHash[hashPos] != 0);
        }
        newHash[hashPos] = item;
        newBits[hashPos] = bits;
        newTable.numUsersInTable++;
      }
    }

    newTable.setCountOfNumUsersWithNoBitsSet();
    newTable.setSetBitCounts();
    this.hashTable.set(newTable);

    updateStats();
  }

  public void setTable(UserTable newTable) {
    hashTable.set(newTable.hashTable.get());
    updateStats();
  }

  @VisibleForTesting
  protected int getHashTableCapacity() {
    return hashTable.get().hashSize();
  }

  @VisibleForTesting
  protected int getNumUsersWithNoBitsSet() {
    return hashTable.get().numUsersWithNoBitsSet;
  }
}
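An illustrative sketch of the bit-flag API above (not part of the original file): the flags are OR-able bit masks, so isSet can test several states in one call.

  UserTable table = UserTable.newTableNonFilteredWithDefaultCapacity();
  table.setNSFW(42L, true);
  table.setAntisocial(43L, true);
  // Check a single flag, or any of several flags at once.
  boolean nsfw = table.isSet(42L, UserTable.NSFW_BIT);                                // true
  boolean flagged = table.isSet(43L, UserTable.NSFW_BIT | UserTable.ANTISOCIAL_BIT);  // true
  long nsfwUsers = table.getSetBitCount(UserTable.NSFW_BIT);                          // 1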
Binary file not shown.
@ -1,263 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Optional;
import java.util.Spliterator;
import java.util.Spliterators;
import java.util.concurrent.TimeUnit;
import java.util.function.Predicate;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.twitter.common_internal.hadoop.HdfsUtils;
import com.twitter.scalding.DateRange;
import com.twitter.scalding.Hours;
import com.twitter.scalding.RichDate;
import com.twitter.search.user_table.sources.MostRecentGoodSafetyUserStateSource;
import com.twitter.search.common.indexing.thriftjava.SafetyUserState;
import com.twitter.search.common.util.io.LzoThriftBlockFileReader;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;
import com.twitter.util.Duration;
import com.twitter.util.Time;

/**
 * Builds a user table from a user safety snapshot on HDFS.
 */
public class UserTableBuilderFromSnapshot {
  private static final Logger LOG = LoggerFactory.getLogger(UserTableBuilderFromSnapshot.class);

  private static final int MAX_DAYS_TO_CHECK = 7;
  public static final String DATA_DIR = "user_states";
  public static final String METADATA_DIR = "last_updated_ms";

  private final String snapshotBaseDir;

  private String snapshotDataPath;
  private String snapshotMetaDataPath;
  private UserTable userTable;

  private long nsfwCount;
  private long antisocialCount;
  private long isProtectedCount;

  public UserTableBuilderFromSnapshot() {
    snapshotBaseDir =
        EarlybirdConfig.getString(EarlybirdConfig.USER_SNAPSHOT_BASE_DIR, null);

    LOG.info("Configured user snapshot directory: " + snapshotBaseDir);
  }

  private static final class UserUpdate {
    public final long userId;
    @Nullable public final Boolean antisocial;
    @Nullable public final Boolean nsfw;
    @Nullable public final Boolean isProtected;

    private UserUpdate(long userId,
                       @Nullable Boolean antisocial,
                       @Nullable Boolean nsfw,
                       @Nullable Boolean isProtected) {
      this.userId = userId;
      this.antisocial = antisocial;
      this.nsfw = nsfw;
      this.isProtected = isProtected;
    }

    public static UserUpdate fromUserState(SafetyUserState safetyUserState) {
      long userId = safetyUserState.getUserID();
      @Nullable Boolean antisocial = null;
      @Nullable Boolean nsfw = null;
      @Nullable Boolean isProtected = null;

      if (safetyUserState.isIsAntisocial()) {
        antisocial = true;
      }
      if (safetyUserState.isIsNsfw()) {
        nsfw = true;
      }
      if (safetyUserState.isSetIsProtected() && safetyUserState.isIsProtected()) {
        isProtected = true;
      }

      return new UserUpdate(userId, antisocial, nsfw, isProtected);
    }
  }

  /**
   * Builds a user table from an HDFS user snapshot.
   * @return The table, or nothing if something went wrong.
   */
  public Optional<UserTable> build(Predicate<Long> userFilter) {
    userTable = UserTable.newTableWithDefaultCapacityAndPredicate(userFilter);
    nsfwCount = 0;
    antisocialCount = 0;
    isProtectedCount = 0;

    if (snapshotBaseDir == null || snapshotBaseDir.isEmpty()) {
      LOG.info("No snapshot directory. Can't build user table.");
      return Optional.empty();
    }

    LOG.info("Starting to build user table.");

    Stream<UserUpdate> stream = null;

    try {
      setSnapshotPath();

      stream = getUserUpdates();
      stream.forEach(this::insertUser);
    } catch (IOException e) {
      LOG.error("IOException while building table: {}", e.getMessage(), e);

      return Optional.empty();
    } finally {
      if (stream != null) {
        stream.close();
      }
    }

    LOG.info("Built user table with {} users, {} nsfw, {} antisocial and {} protected.",
        userTable.getNumUsersInTable(),
        nsfwCount,
        antisocialCount,
        isProtectedCount);

    try {
      userTable.setLastRecordTimestamp(readTimestampOfLastSeenUpdateFromSnapshot());
    } catch (IOException e) {
      LOG.error("IOException reading timestamp of last update: {}", e.getMessage(), e);
      return Optional.empty();
    }

    LOG.info("Setting last record timestamp to {}.", userTable.getLastRecordTimestamp());

    return Optional.of(userTable);
  }

  private void setSnapshotPath() {
    snapshotDataPath =
        new MostRecentGoodSafetyUserStateSource(
            snapshotBaseDir,
            DATA_DIR,
            METADATA_DIR,
            DateRange.apply(
                RichDate.now().$minus(Hours.apply(MAX_DAYS_TO_CHECK * 24)),
                RichDate.now())
        ).partitionHdfsPaths(new HdfsConfiguration())
            ._1()
            .head()
            .replaceAll("\\*$", "");
    snapshotMetaDataPath = snapshotDataPath.replace(DATA_DIR, METADATA_DIR);

    LOG.info("Snapshot data path: {}", snapshotDataPath);
    LOG.info("Snapshot metadata path: {}", snapshotMetaDataPath);
  }

  private Stream<UserUpdate> getUserUpdates() throws IOException {
    FileSystem fs = FileSystem.get(new Configuration());
    List<String> lzoFiles =
        Arrays.stream(fs.listStatus(new Path(snapshotDataPath),
            path -> path.getName().startsWith("part-")))
            .map(fileStatus -> Path.getPathWithoutSchemeAndAuthority(fileStatus.getPath())
                .toString())
            .collect(Collectors.toList());

    final LzoThriftBlockFileReader<SafetyUserState> thriftReader =
        new LzoThriftBlockFileReader<>(lzoFiles, SafetyUserState.class, null);

    Iterator<UserUpdate> iter = new Iterator<UserUpdate>() {
      private SafetyUserState next;

      @Override
      public boolean hasNext() {
        if (next != null) {
          return true;
        }

        do {
          try {
            next = thriftReader.readNext();
          } catch (IOException e) {
            throw new RuntimeException(e);
          }
        } while (next == null && !thriftReader.isExhausted());
        return next != null;
      }

      @Override
      public UserUpdate next() {
        if (next != null || hasNext()) {
          UserUpdate userUpdate = UserUpdate.fromUserState(next);
          next = null;
          return userUpdate;
        }
        throw new NoSuchElementException();
      }
    };

    return StreamSupport
        .stream(
            Spliterators.spliteratorUnknownSize(iter, Spliterator.ORDERED | Spliterator.NONNULL),
            false)
        .onClose(thriftReader::stop);
  }

  private long readTimestampOfLastSeenUpdateFromSnapshot() throws IOException {
    String timestampFile = snapshotMetaDataPath + "part-00000";
    BufferedReader buffer = new BufferedReader(new InputStreamReader(
        HdfsUtils.getInputStreamSupplier(timestampFile).openStream()));

    long timestampMillis = Long.parseLong(buffer.readLine());
    LOG.info("read timestamp {} from HDFS:{}", timestampMillis, timestampFile);

    Time time = Time.fromMilliseconds(timestampMillis)
        .minus(Duration.fromTimeUnit(10, TimeUnit.MINUTES));
    return time.inMilliseconds();
  }

  private void insertUser(UserUpdate userUpdate) {
    if (userUpdate == null) {
      return;
    }

    if (userUpdate.antisocial != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.ANTISOCIAL_BIT,
          userUpdate.antisocial);
      antisocialCount++;
    }

    if (userUpdate.nsfw != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.NSFW_BIT,
          userUpdate.nsfw);
      nsfwCount++;
    }

    if (userUpdate.isProtected != null) {
      userTable.set(
          userUpdate.userId,
          UserTable.IS_PROTECTED_BIT,
          userUpdate.isProtected);
      isProtectedCount++;
    }
  }
}
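A hedged usage sketch (not in the original sources): building the table from the configured HDFS snapshot and falling back to an empty, unfiltered table when the snapshot is unavailable.

  UserTableBuilderFromSnapshot builder = new UserTableBuilderFromSnapshot();
  UserTable userTable = builder
      .build(userId -> true)  // keep every user; a partition-specific predicate is typical
      .orElseGet(UserTable::newTableNonFilteredWithDefaultCapacity);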
Binary file not shown.
@ -1,38 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;

import com.twitter.search.common.indexing.thriftjava.UserUpdateType;

/**
 * Contains an update for a user.
 */
public class UserUpdate {
  public final long twitterUserID;
  public final UserUpdateType updateType;
  public final int updateValue;
  private final Date updatedAt;

  public UserUpdate(long twitterUserID,
                    UserUpdateType updateType,
                    int updateValue,
                    Date updatedAt) {

    this.twitterUserID = twitterUserID;
    this.updateType = updateType;
    this.updateValue = updateValue;
    this.updatedAt = (Date) updatedAt.clone();
  }

  @Override public String toString() {
    return "UserInfoUpdate[userID=" + twitterUserID + ",updateType=" + updateType
        + ",updateValue=" + updateValue + ",updatedAt=" + getUpdatedAt() + "]";
  }

  /**
   * Returns a copy of the updated-at date.
   */
  public Date getUpdatedAt() {
    return (Date) updatedAt.clone();
  }
}
Binary file not shown.
@ -1,70 +0,0 @@
package com.twitter.search.earlybird.common.userupdates;

import java.util.Date;
import java.util.concurrent.TimeUnit;

import com.twitter.common.util.Clock;
import com.twitter.decider.Decider;
import com.twitter.search.common.indexing.thriftjava.UserUpdateType;
import com.twitter.search.common.schema.earlybird.EarlybirdCluster;
import com.twitter.search.earlybird.common.config.EarlybirdConfig;

/**
 * Contains logic for deciding whether to apply a certain user update to the {@link UserTable}.
 */
public class UserUpdatesChecker {
  private final Date antisocialStartDate;
  private final Decider decider;
  private final boolean isFullArchiveCluster;

  public UserUpdatesChecker(Clock clock, Decider decider, EarlybirdCluster cluster) {
    // How many days of antisocial users to keep. A value of -1 means keeping all user updates.
    long antisocialRecordDays =
        EarlybirdConfig.getLong("keep_recent_antisocial_user_updates_days", 30);
    this.antisocialStartDate = antisocialRecordDays > 0
        ? new Date(clock.nowMillis() - TimeUnit.DAYS.toMillis(antisocialRecordDays)) : null;
    this.decider = decider;
    this.isFullArchiveCluster = cluster == EarlybirdCluster.FULL_ARCHIVE;
  }

  /**
   * Decides whether to skip the given UserInfoUpdate.
   */
  public boolean skipUserUpdate(UserUpdate userUpdate) {
    if (userUpdate == null) { // always skip null updates
      return true;
    }

    UserUpdateType type = userUpdate.updateType;

    if (type == UserUpdateType.PROTECTED && skipProtectedUserUpdate()) {
      return true;
    }

    if (type == UserUpdateType.ANTISOCIAL && skipAntisocialUserUpdate(userUpdate)) {
      return true;
    }

    // NSFW users can continue to tweet even after they are marked as NSFW. That means
    // that the snapshot needs to have all NSFW users from the beginning of time. Hence, no NSFW
    // user updates check here.

    // Passed all checks; do not skip this user update.
    return false;
  }

  // Antisocial/suspended users can't tweet after they are suspended. Thus if our index stores
  // tweets from the last 10 days, and they were suspended 60 days ago, we don't need them since
  // there will be no tweets from them. We can save space by not storing info about those users.

  // (For archive, at rebuild time we filter out all suspended users' tweets, so for a user that
  // was suspended before a rebuild, there is no need to use space to store that the user is
  // suspended.)
  private boolean skipAntisocialUserUpdate(UserUpdate userUpdate) {
    return antisocialStartDate != null && userUpdate.getUpdatedAt().before(antisocialStartDate);
  }

  // Skip protected user updates for the realtime and protected clusters.
  private boolean skipProtectedUserUpdate() {
    return !isFullArchiveCluster;
  }
}
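A sketch of how the checker gates updates into the UserTable (illustrative only; the decider and userTable variables are assumed to come from the server's wiring and are not defined here):

  UserUpdatesChecker checker =
      new UserUpdatesChecker(Clock.SYSTEM_CLOCK, decider, EarlybirdCluster.FULL_ARCHIVE);
  UserUpdate update = new UserUpdate(42L, UserUpdateType.NSFW, 1, new Date());
  // indexUserUpdate consults skipUserUpdate() before mutating the table.
  boolean applied = userTable.indexUserUpdate(checker, update);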
@ -1,21 +0,0 @@
java_library(
    sources = ["**/*.java"],
    platform = "java8",
    tags = ["bazel-compatible"],
    dependencies = [
        "3rdparty/jvm/com/google/code/findbugs:jsr305",
        "3rdparty/jvm/com/google/guava",
        "3rdparty/jvm/com/google/inject:guice",
        "3rdparty/jvm/org/apache/thrift:libthrift",
        "3rdparty/jvm/org/apache/zookeeper:zookeeper-client",
        "3rdparty/jvm/org/slf4j:slf4j-api",
        "src/java/com/twitter/common/base",
        "src/java/com/twitter/common/util:system-mocks",
        "src/java/com/twitter/search/common/config",
        "src/java/com/twitter/search/common/metrics",
        "src/java/com/twitter/search/common/partitioning/snowflakeparser",
        "src/java/com/twitter/search/common/util/date",
        "src/java/com/twitter/search/common/util/zookeeper",
        "src/java/com/twitter/search/earlybird/common/config",
    ],
)
BIN
src/java/com/twitter/search/earlybird/config/BUILD.docx
Normal file
Binary file not shown.
BIN
src/java/com/twitter/search/earlybird/config/ServingRange.docx
Normal file
Binary file not shown.
@ -1,26 +0,0 @@
package com.twitter.search.earlybird.config;

/**
 * An interface for abstracting a tier's serving range.
 */
public interface ServingRange {
  /**
   * Returns the serving range's lowest tweet ID.
   */
  long getServingRangeSinceId();

  /**
   * Returns the serving range's highest tweet ID.
   */
  long getServingRangeMaxId();

  /**
   * Returns the serving range's earliest time, in seconds since epoch.
   */
  long getServingRangeSinceTimeSecondsFromEpoch();

  /**
   * Returns the serving range's latest time, in seconds since epoch.
   */
  long getServingRangeUntilTimeSecondsFromEpoch();
}
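A minimal illustrative implementation of this interface (not from the repository), fixing a serving window by tweet IDs and timestamps:

  class FixedServingRange implements ServingRange {
    private final long sinceId;
    private final long maxId;
    private final long sinceSeconds;
    private final long untilSeconds;

    FixedServingRange(long sinceId, long maxId, long sinceSeconds, long untilSeconds) {
      this.sinceId = sinceId;
      this.maxId = maxId;
      this.sinceSeconds = sinceSeconds;
      this.untilSeconds = untilSeconds;
    }

    @Override public long getServingRangeSinceId() {
      return sinceId;
    }

    @Override public long getServingRangeMaxId() {
      return maxId;
    }

    @Override public long getServingRangeSinceTimeSecondsFromEpoch() {
      return sinceSeconds;
    }

    @Override public long getServingRangeUntilTimeSecondsFromEpoch() {
      return untilSeconds;
    }
  }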
BIN
src/java/com/twitter/search/earlybird/config/TierConfig.docx
Normal file
Binary file not shown.
@ -1,175 +0,0 @@
|
|||||||
package com.twitter.search.earlybird.config;
|
|
||||||
|
|
||||||
import java.util.Date;
|
|
||||||
import java.util.Map;
|
|
||||||
import java.util.Set;
|
|
||||||
|
|
||||||
import javax.annotation.Nullable;
|
|
||||||
|
|
||||||
import com.google.common.base.Preconditions;
|
|
||||||
|
|
||||||
import com.twitter.common.util.Clock;
|
|
||||||
import com.twitter.search.common.config.Config;
|
|
||||||
import com.twitter.search.common.config.ConfigFile;
|
|
||||||
import com.twitter.search.common.config.ConfigurationException;
|
|
||||||
import com.twitter.search.common.metrics.SearchLongGauge;
|
|
||||||
import com.twitter.search.common.util.date.DateUtil;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* This class provides APIs to access the tier configurations for a cluster.
|
|
||||||
* Each tier has tier name, number of partitions, tier start time and end time.
|
|
||||||
*/
|
|
||||||
public final class TierConfig {
|
|
||||||
private static final org.slf4j.Logger LOG = org.slf4j.LoggerFactory.getLogger(TierConfig.class);
|
|
||||||
|
|
||||||
private static final String DEFAULT_CONFIG_DIR = "common/config";
|
|
||||||
public static final String DEFAULT_TIER_FILE = "earlybird-tiers.yml";
|
|
||||||
|
|
||||||
public static final Date DEFAULT_TIER_START_DATE = DateUtil.toDate(2006, 3, 21);
|
|
||||||
// It's convenient for DEFAULT_TIER_END_DATE to be before ~2100, because then the output of
|
|
||||||
// FieldTermCounter.getHourValue(DEFAULT_TIER_END_END_DATE) can still fit into an integer.
|
|
||||||
public static final Date DEFAULT_TIER_END_DATE = DateUtil.toDate(2099, 1, 1);
|
|
||||||
|
|
||||||
public static final String DEFAULT_TIER_NAME = "all";
|
|
||||||
public static final boolean DEFAULT_ENABLED = true;
|
|
||||||
public static final TierInfo.RequestReadType DEFAULT_READ_TYPE = TierInfo.RequestReadType.LIGHT;
|
|
||||||
|
|
||||||
private static ConfigFile tierConfigFile = null;
|
|
||||||
private static ConfigSource tierConfigSource = null;
|
|
||||||
|
|
||||||
public enum ConfigSource {
|
|
||||||
LOCAL,
|
|
||||||
ZOOKEEPER
|
|
||||||
}
|
|
||||||
|
|
||||||
private TierConfig() { }
|
|
||||||
|
|
||||||
private static synchronized void init() {
|
|
||||||
if (tierConfigFile == null) {
|
|
||||||
tierConfigFile = new ConfigFile(DEFAULT_CONFIG_DIR, DEFAULT_TIER_FILE);
|
|
||||||
tierConfigSource = ConfigSource.LOCAL;
|
|
||||||
SearchLongGauge.export("tier_config_source_" + tierConfigSource.name()).set(1);
|
|
||||||
LOG.info("Tier config file " + DEFAULT_TIER_FILE + " is successfully loaded from bundle.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
public static ConfigFile getConfigFile() {
|
|
||||||
init();
|
|
||||||
return tierConfigFile;
|
|
||||||
}
|
|
||||||
|
|
||||||
public static String getConfigFileName() {
|
|
||||||
return getConfigFile().getConfigFileName();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Return all the tier names specified in the config file.
|
|
||||||
*/
|
|
||||||
public static Set<String> getTierNames() {
|
|
||||||
return Config.getConfig().getMapCopy(getConfigFileName()).keySet();
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Sets the value of the given tier config property to the given value.
|
|
||||||
*/
|
|
||||||
public static void setForTests(String property, Object value) {
|
|
||||||
Config.getConfig().setForTests(DEFAULT_TIER_FILE, property, value);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the config info for the specified tier.
|
|
||||||
*/
|
|
||||||
public static TierInfo getTierInfo(String tierName) {
|
|
||||||
return getTierInfo(tierName, null /* use current environment */);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Returns the config info for the specified tier and environment.
|
|
||||||
*/
|
|
||||||
public static TierInfo getTierInfo(String tierName, @Nullable String environment) {
|
|
||||||
String tierConfigFileType = getConfigFileName();
|
|
||||||
Map<String, Object> tierInfo;
|
|
||||||
try {
|
|
||||||
tierInfo = (Map<String, Object>) Config.getConfig()
|
|
||||||
.getFromEnvironment(environment, tierConfigFileType, tierName);
|
|
||||||
} catch (ConfigurationException e) {
|
|
||||||
throw new RuntimeException(e);
|
|
||||||
}
|
|
||||||
    if (tierInfo == null) {
      LOG.error("Cannot find tier config for "
          + tierName + " in config file: " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long partitions = (Long) tierInfo.get("number_of_partitions");
    if (partitions == null) {
      LOG.error("No number of partitions is specified for tier "
          + tierName + " in tier config file " + tierConfigFileType);
      throw new RuntimeException("Configuration error: " + tierConfigFileType);
    }

    Long numTimeslices = (Long) tierInfo.get("serving_timeslices");
    if (numTimeslices == null) {
      LOG.info("No max timeslices specified for tier "
          + tierName + " in tier config file " + tierConfigFileType
          + ", not setting a cap on the number of serving timeslices");
      // NOTE: we use max int32 here because it will ultimately be cast to an int, but the config
      // map expects Longs for all integral types. Using Long.MAX_VALUE leads to max serving
      // timeslices being set to -1 when it is truncated to an int.
      numTimeslices = (long) Integer.MAX_VALUE;
    }

    Date tierStartDate = (Date) tierInfo.get("data_range_start_date_inclusive");
    if (tierStartDate == null) {
      tierStartDate = DEFAULT_TIER_START_DATE;
    }
    Date tierEndDate = (Date) tierInfo.get("data_range_end_date_exclusive");
    if (tierEndDate == null) {
      tierEndDate = DEFAULT_TIER_END_DATE;
    }

    Boolean tierEnabled = (Boolean) tierInfo.get("tier_enabled");
    if (tierEnabled == null) {
      tierEnabled = DEFAULT_ENABLED;
    }

    TierInfo.RequestReadType readType =
        getRequestReadType((String) tierInfo.get("tier_read_type"), DEFAULT_READ_TYPE);
    TierInfo.RequestReadType readTypeOverride =
        getRequestReadType((String) tierInfo.get("tier_read_type_override"), readType);

    return new TierInfo(
        tierName,
        tierStartDate,
        tierEndDate,
        partitions.intValue(),
        numTimeslices.intValue(),
        tierEnabled,
        (String) tierInfo.get("serving_range_since_id_exclusive"),
        (String) tierInfo.get("serving_range_max_id_inclusive"),
        (Date) tierInfo.get("serving_range_start_date_inclusive_override"),
        (Date) tierInfo.get("serving_range_end_date_exclusive_override"),
        readType,
        readTypeOverride,
        Clock.SYSTEM_CLOCK);
  }

  public static synchronized void clear() {
    tierConfigFile = null;
    tierConfigSource = null;
  }

  protected static synchronized ConfigSource getTierConfigSource() {
    return tierConfigSource;
  }

  private static TierInfo.RequestReadType getRequestReadType(
      String readTypeEnumName, TierInfo.RequestReadType defaultReadType) {
    TierInfo.RequestReadType readType = defaultReadType;
    if (readTypeEnumName != null) {
      readType = TierInfo.RequestReadType.valueOf(readTypeEnumName.trim().toUpperCase());
      Preconditions.checkState(readType != null);
    }
    return readType;
  }
}
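A minimal standalone sketch (the class name is hypothetical, not part of the Earlybird sources) of the narrowing-cast behaviour the NOTE in getTierInfo relies on: truncating Long.MAX_VALUE to an int yields -1, so the uncapped case uses (long) Integer.MAX_VALUE, which survives the later intValue() conversion intact.

public final class MaxTimeslicesTruncationDemo {
  public static void main(String[] args) {
    // The low 32 bits of Long.MAX_VALUE are all ones, so the narrowing cast gives -1.
    System.out.println((int) Long.MAX_VALUE);            // -1
    // Integer.MAX_VALUE round-trips through long unchanged.
    System.out.println((int) (long) Integer.MAX_VALUE);  // 2147483647
  }
}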
BIN src/java/com/twitter/search/earlybird/config/TierInfo.docx Normal file
Binary file not shown.
@ -1,180 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;

/**
 * Properties of a single tier.
 */
public class TierInfo implements ServingRange {
  // What I'm seeing historically is that this has been used when adding a new tier. First you
  // add it and send dark traffic to it, then possibly grey and then you launch it by turning on
  // light traffic.
  public static enum RequestReadType {
    // Light read: send request, wait for results, and results are returned
    LIGHT,
    // Dark read: send request, do not wait for results, and results are discarded
    DARK,
    // Grey read: send request, wait for results, but discard after results come back.
    // Same results as dark read; similar latency as light read.
    GREY,
  }

  private final String tierName;
  private final Date dataStartDate;
  private final Date dataEndDate;
  private final int numPartitions;
  private final int maxTimeslices;
  private final TierServingBoundaryEndPoint servingRangeSince;
  private final TierServingBoundaryEndPoint servingRangeMax;
  private final TierServingBoundaryEndPoint servingRangeSinceOverride;
  private final TierServingBoundaryEndPoint servingRangeMaxOverride;

  // These two properties are only used by clients of Earlybird (e.g. roots),
  // but not by Earlybirds.
  private final boolean enabled;
  private final RequestReadType readType;
  private final RequestReadType readTypeOverride;

  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  Date servingStartDateOverride,
                  Date servingEndDateOverride,
                  RequestReadType readType,
                  RequestReadType readTypeOverride,
                  Clock clock) {
    Preconditions.checkArgument(numPartitions > 0);
    Preconditions.checkArgument(maxTimeslices > 0);
    this.tierName = tierName;
    this.dataStartDate = dataStartDate;
    this.dataEndDate = dataEndDate;
    this.numPartitions = numPartitions;
    this.maxTimeslices = maxTimeslices;
    this.enabled = enabled;
    this.readType = readType;
    this.readTypeOverride = readTypeOverride;
    this.servingRangeSince = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(sinceIdString, dataStartDate, clock);
    this.servingRangeMax = TierServingBoundaryEndPoint
        .newTierServingBoundaryEndPoint(maxIdString, dataEndDate, clock);
    if (servingStartDateOverride != null) {
      this.servingRangeSinceOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingStartDateOverride, clock);
    } else {
      this.servingRangeSinceOverride = servingRangeSince;
    }

    if (servingEndDateOverride != null) {
      this.servingRangeMaxOverride = TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
          TierServingBoundaryEndPoint.INFERRED_FROM_DATA_RANGE, servingEndDateOverride, clock);
    } else {
      this.servingRangeMaxOverride = servingRangeMax;
    }
  }

  @VisibleForTesting
  public TierInfo(String tierName,
                  Date dataStartDate,
                  Date dataEndDate,
                  int numPartitions,
                  int maxTimeslices,
                  boolean enabled,
                  String sinceIdString,
                  String maxIdString,
                  RequestReadType readType,
                  Clock clock) {
    // No overrides:
    //   servingRangeSinceOverride == servingRangeSince
    //   servingRangeMaxOverride == servingRangeMax
    //   readTypeOverride == readType
    this(tierName, dataStartDate, dataEndDate, numPartitions, maxTimeslices, enabled, sinceIdString,
        maxIdString, null, null, readType, readType, clock);
  }

  @Override
  public String toString() {
    return tierName;
  }

  public String getTierName() {
    return tierName;
  }

  public Date getDataStartDate() {
    return dataStartDate;
  }

  public Date getDataEndDate() {
    return dataEndDate;
  }

  public int getNumPartitions() {
    return numPartitions;
  }

  public int getMaxTimeslices() {
    return maxTimeslices;
  }

  public TierConfig.ConfigSource getSource() {
    return TierConfig.getTierConfigSource();
  }

  public boolean isEnabled() {
    return enabled;
  }

  public boolean isDarkRead() {
    return readType == RequestReadType.DARK;
  }

  public RequestReadType getReadType() {
    return readType;
  }

  public RequestReadType getReadTypeOverride() {
    return readTypeOverride;
  }

  public long getServingRangeSinceId() {
    return servingRangeSince.getBoundaryTweetId();
  }

  public long getServingRangeMaxId() {
    return servingRangeMax.getBoundaryTweetId();
  }

  long getServingRangeOverrideSinceId() {
    return servingRangeSinceOverride.getBoundaryTweetId();
  }

  long getServingRangeOverrideMaxId() {
    return servingRangeMaxOverride.getBoundaryTweetId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return servingRangeSince.getBoundaryTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return servingRangeMax.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideSinceTimeSecondsFromEpoch() {
    return servingRangeSinceOverride.getBoundaryTimeSecondsFromEpoch();
  }

  long getServingRangeOverrideUntilTimeSecondsFromEpoch() {
    return servingRangeMaxOverride.getBoundaryTimeSecondsFromEpoch();
  }
}
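A hedged construction sketch: the tier name, dates, and counts below are invented for illustration and do not come from any real earlybird-tiers.yml. It uses the test-only constructor above, so the serving range is inferred from the data range and there are no overrides (DateUtil and Clock.SYSTEM_CLOCK are the same helpers referenced in TierConfig).

// Illustrative values only.
TierInfo exampleTier = new TierInfo(
    "tier_example_2017",
    DateUtil.toDate(2017, 1, 1),   // data_range_start_date_inclusive
    DateUtil.toDate(2018, 1, 1),   // data_range_end_date_exclusive
    12,                            // number_of_partitions
    4,                             // serving_timeslices
    true,                          // tier_enabled
    null,                          // since id inferred from the data range
    null,                          // max id inferred from the data range
    TierInfo.RequestReadType.LIGHT,
    Clock.SYSTEM_CLOCK);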
BIN src/java/com/twitter/search/earlybird/config/TierInfoSource.docx Normal file
Binary file not shown.
@ -1,39 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

import javax.inject.Inject;

import com.twitter.search.common.util.zookeeper.ZooKeeperProxy;

public class TierInfoSource {
  private final ZooKeeperProxy zkClient;

  @Inject
  public TierInfoSource(ZooKeeperProxy sZooKeeperClient) {
    this.zkClient = sZooKeeperClient;
  }

  public List<TierInfo> getTierInformation() {
    return getTierInfoWithPrefix("tier");
  }

  public String getConfigFileType() {
    return TierConfig.getConfigFileName();
  }

  private List<TierInfo> getTierInfoWithPrefix(String tierPrefix) {
    Set<String> tierNames = TierConfig.getTierNames();
    List<TierInfo> tierInfos = new ArrayList<>();
    for (String name : tierNames) {
      if (name.startsWith(tierPrefix)) {
        TierInfo tierInfo = TierConfig.getTierInfo(name);
        tierInfos.add(tierInfo);
      }
    }
    return tierInfos;
  }

}
BIN src/java/com/twitter/search/earlybird/config/TierInfoUtil.docx Normal file
Binary file not shown.
@ -1,78 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Comparator;
import java.util.SortedSet;

import com.google.common.base.Preconditions;

public final class TierInfoUtil {
  public static final Comparator<TierInfo> TIER_COMPARATOR = (t1, t2) -> {
    // Reverse sort order based on date.
    return t2.getDataStartDate().compareTo(t1.getDataStartDate());
  };

  private TierInfoUtil() {
  }

  /**
   * Checks that the serving ranges and the override serving ranges of the given tiers do not
   * overlap, and do not have gaps. Dark read tiers are ignored.
   */
  public static void checkTierServingRanges(SortedSet<TierInfo> tierInfos) {
    boolean tierServingRangesOverlap = false;
    boolean tierOverrideServingRangesOverlap = false;
    boolean tierServingRangesHaveGaps = false;
    boolean tierOverrideServingRangesHaveGaps = false;

    TierInfoWrapper previousTierInfoWrapper = null;
    TierInfoWrapper previousOverrideTierInfoWrapper = null;
    for (TierInfo tierInfo : tierInfos) {
      TierInfoWrapper tierInfoWrapper = new TierInfoWrapper(tierInfo, false);
      TierInfoWrapper overrideTierInfoWrapper = new TierInfoWrapper(tierInfo, true);

      // Check only the tiers to which we send light reads.
      if (!tierInfoWrapper.isDarkRead()) {
        if (previousTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousTierInfoWrapper, tierInfoWrapper)) {
            // In case of rebalancing, we may have an overlapping data range while
            // overriding with a good serving range.
            if (previousOverrideTierInfoWrapper == null
                || TierInfoWrapper.servingRangesOverlap(
                    previousOverrideTierInfoWrapper, overrideTierInfoWrapper)) {
              tierServingRangesOverlap = true;
            }
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousTierInfoWrapper, tierInfoWrapper)) {
            tierServingRangesHaveGaps = true;
          }
        }

        previousTierInfoWrapper = tierInfoWrapper;
      }

      if (!overrideTierInfoWrapper.isDarkRead()) {
        if (previousOverrideTierInfoWrapper != null) {
          if (TierInfoWrapper.servingRangesOverlap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesOverlap = true;
          }
          if (TierInfoWrapper.servingRangesHaveGap(previousOverrideTierInfoWrapper,
              overrideTierInfoWrapper)) {
            tierOverrideServingRangesHaveGaps = true;
          }
        }

        previousOverrideTierInfoWrapper = overrideTierInfoWrapper;
      }
    }

    Preconditions.checkState(!tierServingRangesOverlap,
        "Serving ranges of light reads tiers must not overlap.");
    Preconditions.checkState(!tierServingRangesHaveGaps,
        "Serving ranges of light reads tiers must not have gaps.");
    Preconditions.checkState(!tierOverrideServingRangesOverlap,
        "Override serving ranges of light reads tiers must not overlap.");
    Preconditions.checkState(!tierOverrideServingRangesHaveGaps,
        "Override serving ranges of light reads tiers must not have gaps.");
  }
}
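A hedged wiring fragment (not from the repository): tierInfos is assumed to be a List<TierInfo> loaded elsewhere, for example via TierInfoSource.getTierInformation().

// TIER_COMPARATOR orders tiers newest data range first; checkTierServingRanges then
// compares consecutive light-read tiers pairwise.
SortedSet<TierInfo> sortedTiers = new TreeSet<>(TierInfoUtil.TIER_COMPARATOR);
sortedTiers.addAll(tierInfos);
// Throws IllegalStateException if light-read serving ranges overlap or leave gaps.
TierInfoUtil.checkTierServingRanges(sortedTiers);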
Binary file not shown.
@ -1,89 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import com.google.common.base.Preconditions;

/**
 * A simple wrapper around TierInfo that returns the "real" or the "overridden" values from the
 * given {@code TierInfo} instance, based on the given {@code useOverrideTierConfig} flag.
 */
public class TierInfoWrapper implements ServingRange {
  private final TierInfo tierInfo;
  private final boolean useOverrideTierConfig;

  public TierInfoWrapper(TierInfo tierInfo, boolean useOverrideTierConfig) {
    this.tierInfo = Preconditions.checkNotNull(tierInfo);
    this.useOverrideTierConfig = useOverrideTierConfig;
  }

  public String getTierName() {
    return tierInfo.getTierName();
  }

  public Date getDataStartDate() {
    return tierInfo.getDataStartDate();
  }

  public Date getDataEndDate() {
    return tierInfo.getDataEndDate();
  }

  public int getNumPartitions() {
    return tierInfo.getNumPartitions();
  }

  public int getMaxTimeslices() {
    return tierInfo.getMaxTimeslices();
  }

  public TierConfig.ConfigSource getSource() {
    return tierInfo.getSource();
  }

  public boolean isEnabled() {
    return tierInfo.isEnabled();
  }

  public boolean isDarkRead() {
    return getReadType() == TierInfo.RequestReadType.DARK;
  }

  public TierInfo.RequestReadType getReadType() {
    return useOverrideTierConfig ? tierInfo.getReadTypeOverride() : tierInfo.getReadType();
  }

  public long getServingRangeSinceId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceId()
        : tierInfo.getServingRangeSinceId();
  }

  public long getServingRangeMaxId() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideMaxId()
        : tierInfo.getServingRangeMaxId();
  }

  public long getServingRangeSinceTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideSinceTimeSecondsFromEpoch()
        : tierInfo.getServingRangeSinceTimeSecondsFromEpoch();
  }

  public long getServingRangeUntilTimeSecondsFromEpoch() {
    return useOverrideTierConfig
        ? tierInfo.getServingRangeOverrideUntilTimeSecondsFromEpoch()
        : tierInfo.getServingRangeUntilTimeSecondsFromEpoch();
  }

  public static boolean servingRangesOverlap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() > tier2.getServingRangeSinceId())
        && (tier2.getServingRangeMaxId() > tier1.getServingRangeSinceId());
  }

  public static boolean servingRangesHaveGap(TierInfoWrapper tier1, TierInfoWrapper tier2) {
    return (tier1.getServingRangeMaxId() < tier2.getServingRangeSinceId())
        || (tier2.getServingRangeMaxId() < tier1.getServingRangeSinceId());
  }
}
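A hedged, self-contained mirror of the two static checks above; the class and helper names are hypothetical and only restate the boundary arithmetic with plain IDs, where a tier serves the half-open ID range (sinceId, maxId].

final class ServingRangeMathDemo {
  // since is exclusive and max is inclusive, matching serving_range_since_id_exclusive /
  // serving_range_max_id_inclusive in the tier config.
  static boolean overlap(long since1, long max1, long since2, long max2) {
    return max1 > since2 && max2 > since1;
  }

  static boolean haveGap(long since1, long max1, long since2, long max2) {
    return max1 < since2 || max2 < since1;
  }

  public static void main(String[] args) {
    // full0 serves (5999, 6999], full1 serves (6999, 7999]: they tile with no overlap and no gap.
    System.out.println(overlap(5999, 6999, 6999, 7999)); // false
    System.out.println(haveGap(5999, 6999, 6999, 7999)); // false
  }
}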
Binary file not shown.
@ -1,146 +0,0 @@
package com.twitter.search.earlybird.config;

import java.util.Date;

import javax.annotation.Nullable;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;

import com.twitter.common.util.Clock;
import com.twitter.search.common.partitioning.snowflakeparser.SnowflakeIdParser;

/**
 * The start or end boundary of a tier's serving range.
 * This is used to add since_id and max_id operators onto search queries.
 */
public class TierServingBoundaryEndPoint {
  @VisibleForTesting
  public static final String INFERRED_FROM_DATA_RANGE = "inferred_from_data_range";
  public static final String RELATIVE_TO_CURRENT_TIME_MS = "relative_to_current_time_ms";

  // Either offsetToCurrentTimeMillis is set, or (absoluteTweetId and timeBoundarySecondsFromEpoch)
  // are set.
  @Nullable
  private final Long offsetToCurrentTimeMillis;
  @Nullable
  private final Long absoluteTweetId;
  @Nullable
  private final Long timeBoundarySecondsFromEpoch;
  private final Clock clock;

  TierServingBoundaryEndPoint(Long absoluteTweetId,
                              Long timeBoundarySecondsFromEpoch,
                              Long offsetToCurrentTimeMillis,
                              Clock clock) {
    this.offsetToCurrentTimeMillis = offsetToCurrentTimeMillis;
    this.absoluteTweetId = absoluteTweetId;
    this.timeBoundarySecondsFromEpoch = timeBoundarySecondsFromEpoch;
    this.clock = clock;
  }

  /**
   * Parses the boundary string and constructs a TierServingBoundaryEndPoint instance.
   * @param boundaryString boundary configuration string. Valid values are:
   * <li>
   * "inferred_from_data_range" infers the serving range from the data range. This only works
   * after Nov 2010, when Twitter switched to snowflake IDs.
   * This is the default value.
   * </li>
   * <li>
   * "absolute_tweet_id_and_timestamp_millis:id:timestamp" a tweet ID/timestamp is given
   * explicitly as the serving range boundary.
   * </li>
   * <li>
   * "relative_to_current_time_ms:offset" adds offset onto the current timestamp in millis to
   * compute the serving range.
   * </li>
   *
   * @param boundaryDate the data boundary. This is used in conjunction with
   *                     inferred_from_data_range to determine the serving boundary.
   * @param clock Clock used to obtain the current time, when relative_to_current_time_ms is used.
   *              Tests pass in a FakeClock.
   */
  public static TierServingBoundaryEndPoint newTierServingBoundaryEndPoint(String boundaryString,
                                                                           Date boundaryDate,
                                                                           Clock clock) {
    if (boundaryString == null || boundaryString.trim().equals(INFERRED_FROM_DATA_RANGE)) {
      return inferBoundaryFromDataRange(boundaryDate, clock);
    } else if (boundaryString.trim().startsWith(RELATIVE_TO_CURRENT_TIME_MS)) {
      return getRelativeBoundary(boundaryString, clock);
    } else {
      throw new IllegalStateException("Cannot parse serving range string: " + boundaryString);
    }
  }

  private static TierServingBoundaryEndPoint inferBoundaryFromDataRange(Date boundaryDate,
                                                                        Clock clock) {
    // Infer from the data range.
    // Handle the default start date and end date, in case the dates are not specified in the
    // config.
    if (boundaryDate.equals(TierConfig.DEFAULT_TIER_START_DATE)) {
      return new TierServingBoundaryEndPoint(
          -1L, TierConfig.DEFAULT_TIER_START_DATE.getTime() / 1000, null, clock);
    } else if (boundaryDate.equals(TierConfig.DEFAULT_TIER_END_DATE)) {
      return new TierServingBoundaryEndPoint(
          Long.MAX_VALUE, TierConfig.DEFAULT_TIER_END_DATE.getTime() / 1000, null, clock);
    } else {
      // Convert data start / end dates into since / max ID.
      long boundaryTimeMillis = boundaryDate.getTime();
      if (!SnowflakeIdParser.isUsableSnowflakeTimestamp(boundaryTimeMillis)) {
        throw new IllegalStateException("Serving time range can not be determined, because "
            + boundaryDate + " is before Twitter switched to snowflake tweet IDs.");
      }
      // Earlybird since_id is inclusive and max_id is exclusive. We subtract 1 here.
      // Consider example:
      //   full0: 5000 (inclusive) - 6000 (exclusive)
      //   full1: 6000 (inclusive) - 7000 (exclusive)
      // For tier full0, we should use max_id 5999 instead of 6000.
      // For tier full1, we should use since_id 5999 instead of 6000.
      // Hence we subtract 1 here.
      long adjustedTweetId =
          SnowflakeIdParser.generateValidStatusId(boundaryTimeMillis, 0) - 1;
      Preconditions.checkState(adjustedTweetId >= 0, "boundary tweet ID must be non-negative");
      return new TierServingBoundaryEndPoint(
          adjustedTweetId, boundaryTimeMillis / 1000, null, clock);
    }
  }

  private static TierServingBoundaryEndPoint getRelativeBoundary(String boundaryString,
                                                                 Clock clock) {
    // An offset relative to the current time is given.
    String[] parts = boundaryString.split(":");
    Preconditions.checkState(parts.length == 2);
    long offset = Long.parseLong(parts[1]);
    return new TierServingBoundaryEndPoint(null, null, offset, clock);
  }

  /**
   * Returns the tweet ID for this tier boundary. If the tier boundary was created using a tweet
   * ID, that tweet ID is returned. Otherwise, a tweet ID is derived from the time boundary.
   */
  @VisibleForTesting
  public long getBoundaryTweetId() {
    // If absoluteTweetId is available, use it.
    if (absoluteTweetId != null) {
      return absoluteTweetId;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      long boundaryTime = clock.nowMillis() + offsetToCurrentTimeMillis;
      return SnowflakeIdParser.generateValidStatusId(boundaryTime, 0);
    }
  }

  /**
   * Returns the time boundary for this tier boundary, in seconds since epoch.
   */
  public long getBoundaryTimeSecondsFromEpoch() {
    if (timeBoundarySecondsFromEpoch != null) {
      return timeBoundarySecondsFromEpoch;
    } else {
      Preconditions.checkNotNull(offsetToCurrentTimeMillis);
      return (clock.nowMillis() + offsetToCurrentTimeMillis) / 1000;
    }
  }
}
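A hedged usage fragment: the date, offset, and variable names are illustrative rather than taken from a real tier config, and DateUtil / Clock.SYSTEM_CLOCK are the helpers referenced in TierConfig.

// Boundary inferred from a data-range date (must fall after the switch to snowflake IDs).
TierServingBoundaryEndPoint fromDate =
    TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
        "inferred_from_data_range", DateUtil.toDate(2017, 1, 1), Clock.SYSTEM_CLOCK);

// Boundary 24 hours behind the current time; the offset is in milliseconds and may be negative.
TierServingBoundaryEndPoint relative =
    TierServingBoundaryEndPoint.newTierServingBoundaryEndPoint(
        "relative_to_current_time_ms:-86400000", null, Clock.SYSTEM_CLOCK);

long boundaryId = relative.getBoundaryTweetId();                 // snowflake ID ~24h old
long boundarySecs = relative.getBoundaryTimeSecondsFromEpoch();  // seconds since epoch, ~24h ago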